Graph_gff.py

import ClassSegFeat as Class

# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}


# functions to create the segments and the features

# create a segment with its first feature
def init_seg(line,feature):
    line=line.split()
    seg_id=line[3][1:]
    chr=line[0]
    start=line[1]
    stop=line[2]
    
    # add the current feature to the list of features that are on the segment
    feature_strand=line[10]
    feature_stranded=feature_strand+feature
    feat=list()
    feat.append(feature_stranded)
    
    # create the segment
    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)


# create a feature with the first segment its on
def init_feature(line):
    line=line.split()
    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
    type=line[6]
    annot=line[12]
    chr=line[4]
    start=line[7]
    stop=line[8]
    childs=list()
    
    # add the current segment to the list of segments that have the feature
    seg=line[3][1:]
    strand=line[10]
    seg_stranded=strand+seg
    segments_list=list()
    segments_list.append(seg_stranded)

    parent=get_parent(annot,feature_id)

    # create the feature
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)


# if there is a parent, get the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id):
    # if the current feature has a parent, add the current feature in the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""

    return parent


# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
    

# add a child feature to an existing feature
def add_child(feat,new_child):
    if feat in Features.keys(): # if the parent feature exists
        if new_child not in Features[feat].childs:
            Features[feat].childs.append(new_child)
    
    
# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)


# create a note for the child features that do not have annotation. 
def set_note(id):
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
    feat=Features[id]
    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
        feat.note=feat.annot.split(';')[-1]
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
        # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
        curent=feat.parent
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
                feat.note=note
                annot_found=True
            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id


# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):    
    
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    file.close()
    
    for line in lines: # for each line in the intersect file
        # get the ids for the dictionnaries' keys
        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line.split()[3][1:]
    
        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
            init_feature(line)
        else: # if it exists, add the current segment to the list of segments that have the existing feature
            strand=line.split()[10]
            add_seg(feature_id,segment_id,strand)
    
        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
            init_seg(line, feature_id)
        
        else: # if it exists, add the current feature to the list of features on the existing segment
            strand=line.split()[10]
            add_feature(segment_id,feature_id,strand)

    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
    # cant always do it before because for that i need to have all the parents in the dict Features, and all the segments in the list segments_list for each feature.
    for feat_id in Features:
        feat=Features[feat_id]
        set_note(feat.id)
        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)


# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_start(seg_id,feat_id):
    s=Segments[seg_id]
    f=Features[feat_id]
    if s.start>=f.start:
        result=1
    else:
        result=f.start-s.start+1
    return result


# get the feature's stop position on the segment
def get_stop(seg_id,feat_id):
    s=Segments[seg_id]
    f=Features[feat_id]
    if s.stop<=f.stop:
        result=s.size
    else:
        result=f.stop-s.start+1
    return result


# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')
    string_out=""
    cpt_out=0

    for segment in Segments:
        # get the list of the features on the segment
        features_seg=Segments[segment].features
        
        # go through all the segment's features, and print the gff line for each
        for feature_stranded in features_seg:
            strand=feature_stranded[0:1]
            feature=feature_stranded[1:]
            # feature[feature]

            #[strand,feature]=[feature_stranded[0:1],feature_stranded[1:]]
            segment_stranded=strand+segment
            type=Features[feature].type
            start=get_start(segment,feature)
            stop=get_stop(segment,feature)
            
            # get the rank and the total number of ocurrences for the feature
            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
            total=str(len(Features[feature].segments_list))

            # create the annotation with the rank information
            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total

            # write the gff line in the output file
            line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
            string_out+=line
            cpt_out+=1
            #file_out.write(line)

            if cpt_out==1000:
                file_out.write(string_out)
                string_out=""
                cpt_out=0
                # faire un fct print
    file_out.write(string_out)             

    file_out.close()