séparé le code en plus de fonctions pour le rendre plus lisible

4043b720 · NMarthe · 875cc898 · 4043b720
Commit 4043b720 authored 1 year ago by NMarthe
--- a/Graph_gff.py
+++ b/Graph_gff.py
@@ -6,113 +6,113 @@ global Segments
 Segments={}
 Features={}
+global print_graph_gff
+print_graph_gff=[0,"",""]
+# write in output file
+def write_line(line,force):
+    print_graph_gff[1]+=line
+    print_graph_gff[0]+=1
+    if (print_graph_gff[0]>999) | (force==True):
+        print_graph_gff[2].write(print_graph_gff[1])
+        print_graph_gff[0]=0
+        print_graph_gff[1]=""
 # functions to create the segments and the features
 # create a segment with its first feature
-def init_seg(line,feature):
+def init_seg(line,feature_id,segment_id,strand):
-    line=line.split()
+    [chr,start,stop]=[line[0],line[1],line[2]]
-    seg_id=line[3][1:]
-    chr=line[0]
-    start=line[1]
-    stop=line[2]
    # add the current feature to the list of features that are on the segment
-    feature_strand=line[10]
+    feature_stranded=strand+feature_id
-    feature_stranded=feature_strand+feature
+    feature_list=[feature_stranded]
-    feat=list()
+    # create the segment, store it in the Segments dict
-    feat.append(feature_stranded)
+    Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)
-    # create the segment
-    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
 # create a feature with the first segment its on
-def init_feature(line):
+def init_feature(line,feature_id,segment_id,strand):
-    line=line.split()
+    [type,annot,chr,start,stop,childs]=[line[6],line[12],line[4],line[7],line[8],[]]
-    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
-    type=line[6]
-    annot=line[12]
-    chr=line[4]
-    start=line[7]
-    stop=line[8]
-    childs=list()
-    # add the current segment to the list of segments that have the feature
-    seg=line[3][1:]
-    strand=line[10]
-    seg_stranded=strand+seg
-    segments_list=list()
-    segments_list.append(seg_stranded)
    parent=get_parent(annot,feature_id)
+    # add the current segment to the list of segments that have the feature
-    # create the feature
+    segment_stranded=strand+segment_id
+    segments_list=[segment_stranded]
+    # create the feature, store it in the dict Features
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
+    set_note(feature_id)
+# if there is a parent, returns the id of the parent feature and add the child to the parent
-# if there is a parent, get the id of the parent feature and add the child to the parent
 def get_parent(annot,feature_id):
-    # if the current feature has a parent, add the current feature in the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""
    return parent
 # add a feature to an existing segment
 def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
 # add a child feature to an existing feature
 def add_child(feat,new_child):
-    if feat in Features.keys(): # if the parent feature exists
+    if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists
-        if new_child not in Features[feat].childs:
+        Features[feat].childs.append(new_child)
-            Features[feat].childs.append(new_child)
 # add a segment to an existing feature
 def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)
+# add the position of the feature on its first and last segment
+def add_pos(feature):
+    feature.pos_start=get_start(feature.segments_list[0][1:],feature.id)
+    feature.pos_stop=get_stop(feature.segments_list[-1][1:],feature.id)
+def load_feature_intersect(line,feature_id,segment_id,strand):
+    if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
+        init_feature(line, feature_id, segment_id,strand)
+    else: # if it exists, add the current segment to the list of segments that have the existing feature
+        add_seg(feature_id,segment_id,strand)
+def load_segment_intersect(line,feature_id,segment_id,strand):
+    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+        init_seg(line, feature_id,segment_id,strand)
+    else: # if it exists, add the current feature to the list of features on the existing segment
+        add_feature(segment_id,feature_id,strand)
-# create a note for the child features that do not have annotation. 
-def set_note(id):
+# create a note for the child features that do not have annotation.     not clean. fct for getting parent ? 
+def set_note(feature_id):
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
-    feat=Features[id]
+    feature=Features[feature_id]
-    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
+    if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
-        feat.note=feat.annot.split(';')[-1]
+        feature.note=feature.annot.split(';')[-1]
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
        # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
-        curent=feat.parent
+        curent=feature.parent
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
-                feat.note=note
+                feature.note=note
                annot_found=True
            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id
 # create all the Segment and Feature objects in the dictionnaries Segments and Features
-def create_seg_feat(intersect_path):    
+def load_intersect(intersect_path):
+    print("loading the intersect file")
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
@@ -120,31 +120,19 @@ def create_seg_feat(intersect_path):
    file.close()
    for line in lines: # for each line in the intersect file
+        line=line.split()
        # get the ids for the dictionnaries' keys
-        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+        feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
-        segment_id=line.split()[3][1:]
+        segment_id=line[3][1:]
+        strand=line[10]
-        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
-            init_feature(line)
-        else: # if it exists, add the current segment to the list of segments that have the existing feature
-            strand=line.split()[10]
-            add_seg(feature_id,segment_id,strand)
-        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+        load_feature_intersect(line,feature_id,segment_id,strand)
-            init_seg(line, feature_id)
+        load_segment_intersect(line,feature_id,segment_id,strand)
-        else: # if it exists, add the current feature to the list of features on the existing segment
-            strand=line.split()[10]
-            add_feature(segment_id,feature_id,strand)
-    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
-    # cant always do it before because for that i need to have all the parents in the dict Features, and all the segments in the list segments_list for each feature.
-    for feat_id in Features:
-        feat=Features[feat_id]
-        set_note(feat.id)
-        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
-        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)
+    # for all the features, add the position of the feature on its first and last segment.
+    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
+    for feat_id in Features:
+        add_pos(Features[feat_id])
@@ -162,7 +150,6 @@ def get_start(seg_id,feat_id):
        result=f.start-s.start+1
    return result
 # get the feature's stop position on the segment
 def get_stop(seg_id,feat_id):
    s=Segments[seg_id]
@@ -173,49 +160,39 @@ def get_stop(seg_id,feat_id):
        result=f.stop-s.start+1
    return result
+# generates the annotation for the gff of the graph, with the rank of the segment in the feature
+def make_annot_graph(feature,segment,strand):
+    segment_stranded=strand+segment
+    # get the rank and the total number of ocurrences for the feature
+    rank=str(feature.segments_list.index(segment_stranded)+1)
+    total=str(len(feature.segments_list))
+    # create the annotation with the rank information
+    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
+    return annotation
+def create_line_graph_gff(feature_stranded,segment):
+    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
+    feature=Features[feature_id]
+    annotation=make_annot_graph(feature,segment,strand)
+    type=feature.type
+    start=get_start(segment,feature_id)
+    stop=get_stop(segment,feature_id)
+    line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+    return line
 # go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
 def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')
-    string_out=""
+    print_graph_gff[2]=file_out
-    cpt_out=0
    for segment in Segments:
-        # get the list of the features on the segment
+        features_seg=Segments[segment].features # get the list of the features on the segment
-        features_seg=Segments[segment].features
+        for feature_stranded in features_seg: # go through all the segment's features, and print the gff line for each
+            line = create_line_graph_gff(feature_stranded,segment)
-        # go through all the segment's features, and print the gff line for each
+            write_line(line,False)
-        for feature_stranded in features_seg:
+    write_line("",True)
-            strand=feature_stranded[0:1]
-            feature=feature_stranded[1:]
-            # feature[feature]
-            #[strand,feature]=[feature_stranded[0:1],feature_stranded[1:]]
-            segment_stranded=strand+segment
-            type=Features[feature].type
-            start=get_start(segment,feature)
-            stop=get_stop(segment,feature)
-            # get the rank and the total number of ocurrences for the feature
-            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
-            total=str(len(Features[feature].segments_list))
-            # create the annotation with the rank information
-            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
-            # write the gff line in the output file
-            line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
-            string_out+=line
-            cpt_out+=1
-            #file_out.write(line)
-            if cpt_out==1000:
-                file_out.write(string_out)
-                string_out=""
-                cpt_out=0
-                # faire un fct print
-    file_out.write(string_out)             
    file_out.close()
@@ -223,4 +200,3 @@ def graph_gff(file_out):