From 4043b720a327f33d3f02986f2830802634c0080e Mon Sep 17 00:00:00 2001
From: NMarthe <nina.marthe34@gmail.com>
Date: Thu, 20 Jul 2023 16:37:03 +0200
Subject: [PATCH] =?UTF-8?q?s=C3=A9par=C3=A9=20le=20code=20en=20plus=20de?=
 =?UTF-8?q?=20fonctions=20pour=20le=20rendre=20plus=20lisible?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Graph_gff.py | 204 +++++++++++++++++++++++----------------------------
 1 file changed, 90 insertions(+), 114 deletions(-)

diff --git a/Graph_gff.py b/Graph_gff.py
index 6af09f4..29ccbcd 100644
--- a/Graph_gff.py
+++ b/Graph_gff.py
@@ -6,113 +6,113 @@ global Segments
 Segments={}
 Features={}
 
+global print_graph_gff
+print_graph_gff=[0,"",""]
+
+# write in output file
+def write_line(line,force):
+    print_graph_gff[1]+=line
+    print_graph_gff[0]+=1
+    if (print_graph_gff[0]>999) | (force==True):
+        print_graph_gff[2].write(print_graph_gff[1])
+        print_graph_gff[0]=0
+        print_graph_gff[1]=""
 
 
 # functions to create the segments and the features
 
 # create a segment with its first feature
-def init_seg(line,feature):
-    line=line.split()
-    seg_id=line[3][1:]
-    chr=line[0]
-    start=line[1]
-    stop=line[2]
-    
+def init_seg(line,feature_id,segment_id,strand):
+    [chr,start,stop]=[line[0],line[1],line[2]]
     # add the current feature to the list of features that are on the segment
-    feature_strand=line[10]
-    feature_stranded=feature_strand+feature
-    feat=list()
-    feat.append(feature_stranded)
-    
-    # create the segment
-    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
-
+    feature_stranded=strand+feature_id
+    feature_list=[feature_stranded]
+    # create the segment, store it in the Segments dict
+    Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)
 
 # create a feature with the first segment its on
-def init_feature(line):
-    line=line.split()
-    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
-    type=line[6]
-    annot=line[12]
-    chr=line[4]
-    start=line[7]
-    stop=line[8]
-    childs=list()
-    
-    # add the current segment to the list of segments that have the feature
-    seg=line[3][1:]
-    strand=line[10]
-    seg_stranded=strand+seg
-    segments_list=list()
-    segments_list.append(seg_stranded)
-
+def init_feature(line,feature_id,segment_id,strand):
+    [type,annot,chr,start,stop,childs]=[line[6],line[12],line[4],line[7],line[8],[]]
     parent=get_parent(annot,feature_id)
-
-    # create the feature
+    # add the current segment to the list of segments that have the feature
+    segment_stranded=strand+segment_id
+    segments_list=[segment_stranded]
+    # create the feature, store it in the dict Features
     Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
+    set_note(feature_id)
 
-
-# if there is a parent, get the id of the parent feature and add the child to the parent
+# if there is a parent, returns the id of the parent feature and add the child to the parent
 def get_parent(annot,feature_id):
-    # if the current feature has a parent, add the current feature in the childs list of its parent
     if annot.split(";")[1].split("=")[0]=="Parent":
     # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
         parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
         add_child(parent,feature_id)
-    
     elif annot.split(";")[2].split("=")[0]=="Parent":
     # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
         parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
         add_child(parent,feature_id)
     else: parent=""
-
     return parent
 
-
 # add a feature to an existing segment
 def add_feature(seg,new_feature,strand):
     new_feature_stranded=strand+new_feature
     if new_feature_stranded not in Segments[seg].features:
         Segments[seg].features.append(new_feature_stranded)
-    
 
 # add a child feature to an existing feature
 def add_child(feat,new_child):
-    if feat in Features.keys(): # if the parent feature exists
-        if new_child not in Features[feat].childs:
-            Features[feat].childs.append(new_child)
-    
-    
+    if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists
+        Features[feat].childs.append(new_child)
+
 # add a segment to an existing feature
 def add_seg(feat,new_seg,strand):
     seg_stranded=strand+new_seg
     if seg_stranded not in Features[feat].segments_list:
         Features[feat].segments_list.append(seg_stranded)
 
+# add the position of the feature on its first and last segment
+def add_pos(feature):
+    feature.pos_start=get_start(feature.segments_list[0][1:],feature.id)
+    feature.pos_stop=get_stop(feature.segments_list[-1][1:],feature.id)
+
+def load_feature_intersect(line,feature_id,segment_id,strand):
+    if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
+        init_feature(line, feature_id, segment_id,strand)
+    else: # if it exists, add the current segment to the list of segments that have the existing feature
+        add_seg(feature_id,segment_id,strand)
+
+def load_segment_intersect(line,feature_id,segment_id,strand):
+    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+        init_seg(line, feature_id,segment_id,strand)
+    else: # if it exists, add the current feature to the list of features on the existing segment
+        add_feature(segment_id,feature_id,strand)
 
-# create a note for the child features that do not have annotation. 
-def set_note(id):
+
+# create a note for the child features that do not have annotation.     not clean. fct for getting parent ? 
+def set_note(feature_id):
     # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
     # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
-    feat=Features[id]
-    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
-        feat.note=feat.annot.split(';')[-1]
+    feature=Features[feature_id]
+    if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
+        feature.note=feature.annot.split(';')[-1]
     else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
         # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
         # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
-        curent=feat.parent
+        curent=feature.parent
         annot_found=False
         while annot_found==False:
             if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                 note=Features[curent].annot.split(';')[-1]
-                feat.note=note
+                feature.note=note
                 annot_found=True
             else: # if we didn't find the gene, we go up to the current feature's parent until we find it
                 curent=Features[Features[curent].parent].id
 
 
 # create all the Segment and Feature objects in the dictionnaries Segments and Features
-def create_seg_feat(intersect_path):    
+def load_intersect(intersect_path):
+    print("loading the intersect file")
     
     # open the file with the intersect between the segments and the gff
     file = open(intersect_path, 'r')
@@ -120,31 +120,19 @@ def create_seg_feat(intersect_path):
     file.close()
     
     for line in lines: # for each line in the intersect file
+        line=line.split()
         # get the ids for the dictionnaries' keys
-        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
-        segment_id=line.split()[3][1:]
-    
-        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
-            init_feature(line)
-        else: # if it exists, add the current segment to the list of segments that have the existing feature
-            strand=line.split()[10]
-            add_seg(feature_id,segment_id,strand)
+        feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+        segment_id=line[3][1:]
+        strand=line[10]
     
-        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
-            init_seg(line, feature_id)
-        
-        else: # if it exists, add the current feature to the list of features on the existing segment
-            strand=line.split()[10]
-            add_feature(segment_id,feature_id,strand)
-
-    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
-    # cant always do it before because for that i need to have all the parents in the dict Features, and all the segments in the list segments_list for each feature.
-    for feat_id in Features:
-        feat=Features[feat_id]
-        set_note(feat.id)
-        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
-        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)
+        load_feature_intersect(line,feature_id,segment_id,strand)
+        load_segment_intersect(line,feature_id,segment_id,strand)
 
+    # for all the features, add the position of the feature on its first and last segment.
+    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
+    for feat_id in Features:
+        add_pos(Features[feat_id])
 
 
 
@@ -162,7 +150,6 @@ def get_start(seg_id,feat_id):
         result=f.start-s.start+1
     return result
 
-
 # get the feature's stop position on the segment
 def get_stop(seg_id,feat_id):
     s=Segments[seg_id]
@@ -173,49 +160,39 @@ def get_stop(seg_id,feat_id):
         result=f.stop-s.start+1
     return result
 
+# generates the annotation for the gff of the graph, with the rank of the segment in the feature
+def make_annot_graph(feature,segment,strand):
+    segment_stranded=strand+segment
+    # get the rank and the total number of ocurrences for the feature
+    rank=str(feature.segments_list.index(segment_stranded)+1)
+    total=str(len(feature.segments_list))
+    # create the annotation with the rank information
+    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
+    return annotation
+
+def create_line_graph_gff(feature_stranded,segment):
+    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
+    feature=Features[feature_id]
+
+    annotation=make_annot_graph(feature,segment,strand)
+    type=feature.type
+    start=get_start(segment,feature_id)
+    stop=get_stop(segment,feature_id)
+    line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+    return line
 
 # go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
 def graph_gff(file_out):
     print("generation of the graph's gff")
     file_out = open(file_out, 'w')
-    string_out=""
-    cpt_out=0
+    print_graph_gff[2]=file_out
 
     for segment in Segments:
-        # get the list of the features on the segment
-        features_seg=Segments[segment].features
-        
-        # go through all the segment's features, and print the gff line for each
-        for feature_stranded in features_seg:
-            strand=feature_stranded[0:1]
-            feature=feature_stranded[1:]
-            # feature[feature]
-
-            #[strand,feature]=[feature_stranded[0:1],feature_stranded[1:]]
-            segment_stranded=strand+segment
-            type=Features[feature].type
-            start=get_start(segment,feature)
-            stop=get_stop(segment,feature)
-            
-            # get the rank and the total number of ocurrences for the feature
-            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
-            total=str(len(Features[feature].segments_list))
-
-            # create the annotation with the rank information
-            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
-
-            # write the gff line in the output file
-            line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
-            string_out+=line
-            cpt_out+=1
-            #file_out.write(line)
-
-            if cpt_out==1000:
-                file_out.write(string_out)
-                string_out=""
-                cpt_out=0
-                # faire un fct print
-    file_out.write(string_out)             
+        features_seg=Segments[segment].features # get the list of the features on the segment
+        for feature_stranded in features_seg: # go through all the segment's features, and print the gff line for each
+            line = create_line_graph_gff(feature_stranded,segment)
+            write_line(line,False)
+    write_line("",True)
 
     file_out.close()
 
@@ -223,4 +200,3 @@ def graph_gff(file_out):
 
 
 
-
-- 
GitLab