From 83f25031fe503f62a0b01504c1ca3757a52b9fa3 Mon Sep 17 00:00:00 2001
From: "nina.marthe_ird.fr" <nina.marthe@ird.fr>
Date: Wed, 15 May 2024 11:32:14 +0200
Subject: [PATCH] reorganised some functions, made them smaller, added comments

---
 Functions.py        | 223 ++++++++++++++++++++++----------------------
 Functions_output.py |  80 ++++++++--------
 2 files changed, 152 insertions(+), 151 deletions(-)

diff --git a/Functions.py b/Functions.py
index f06537e..a085ef1 100644
--- a/Functions.py
+++ b/Functions.py
@@ -416,102 +416,124 @@ def add_target_genome_paths(feature_id,target_genome_paths):
     list_seg=feature.segments_list_source
     list_first_last_segs=get_first_last_seg(list_seg)
 
-    for match in list_first_last_segs:
+    for match in list_first_last_segs: # for each walk that has the feature
         [first_seg,last_seg,walk_name]=match
-        # get the first and last segments of all the copies
-        [first_last_segs_list]=detect_gene_copies(list_seg,walk_name,feature_id)
+        # get the first and last segments of all the copies on this walk
+        first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id)
         copy_number=0
         for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
             copy_number+=1
             copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
-            feature_path=[walk_name]
-            feature_path.append(copy_id)
             feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id)
-            feature_path.append(feature_target_path)
+            feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
             feature.segments_list_target.append(feature_path)
 
             # add the feat_copy to all the segments !!!
 
-            # get the pos start of first and last, they are always present
+            # get the pos start of the first and last segments (existing because done in find_gene_copies)
             # then look for segs that are between.
             walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id)[1]
             walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id)[2]
             feat_copy=(feature_id,copy_id)
-            for seg in feature_target_path[1:-1]: # the first and last segs are already done in detect_gene_copies
+            for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
                 for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
                     if walk_start <= segment[1] <= walk_stop: 
-                        segment.append(feat_copy)
+                        segment.append(feat_copy) # annotate the segment : feature_id, copy_id
                         break
 
-            # add paths of childs
-            childs_list=get_child_list(feature_id)
-            for child_id in childs_list:
-                child=Features[child_id]
+            # add the paths of the gene's child features
+            add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths)
+     
+    if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
+        feature.segments_list_target.append(['','',[]])
 
-                # find child path in parent's path
-                [child_first_seg,child_last_seg]=['','']
-                for seg in child.segments_list_source: # DOESNT TAKE INTO ACCOUNT THE INVERSIONS !!!
-                    if seg in feature_target_path: # parent path
-                        child_first_seg=seg
-                        break
-                    elif invert_seg(seg) in feature_target_path:
-                        child_first_seg=invert_seg(seg)
-                        break
+def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths):
+    childs_list=get_child_list(feature_id)
+    for child_id in childs_list:
+        child=Features[child_id]
+
+        # find child path in parent's path
+        [child_first_seg,child_last_seg]=['','']
+        for seg in child.segments_list_source:
+            if seg in feature_target_path: # parent path
+                child_first_seg=seg
+                break
+            elif invert_seg(seg) in feature_target_path:
+                child_first_seg=invert_seg(seg)
+                break
 
-                if child_first_seg!='': # the child may be absent in this occurence !
-                    for seg in reversed(child.segments_list_source):
-                        if seg in feature_target_path: # parent path
-                            child_last_seg=seg
-                            break
-                        elif invert_seg(seg) in feature_target_path:
-                            child_last_seg=invert_seg(seg)
-                            break
-                    
-                    # get the index of these segs, to get the sous-liste
-                    child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
-                    feat_copy=(child_id,copy_id)
-
-                    feature_path=[walk_name,copy_id]
-                    feature_path.append(child_path)
-                    child.segments_list_target.append(feature_path)
-
-                    for segment_id in child_path:
-                        segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
-                        segment.append(feat_copy)
+        if child_first_seg!='': # the child feature may be absent in this occurence of the parent
+            for seg in reversed(child.segments_list_source):
+                if seg in feature_target_path: # parent path
+                    child_last_seg=seg
+                    break
+                elif invert_seg(seg) in feature_target_path:
+                    child_last_seg=invert_seg(seg)
+                    break
             
-    if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
-        feature.segments_list_target.append(['','',[]])
+            # get the path of the child feature
+            child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
+            feat_copy=(child_id,copy_id)
 
-def detect_gene_copies(list_seg_source,walk_name,feature_id):
+            feature_path=[walk_name,copy_id]
+            feature_path.append(child_path)
+            child.segments_list_target.append(feature_path)
 
-    # find all copies of all segments from the gene in the target genome (in both orientations)
+            for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
+                segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
+                segment.append(feat_copy)
+
+# find all the copies of the segments from the source list in the target genome
+def find_all_seg(list_seg_source,walk_name):
     index=0
-    list_seg_target=[] # contains list of info for each seg [seg_id,seg_strand,start_pos,index_on_source_walk]
-    list_seg_source_unstranded=[]
+    list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
+    list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
     for seg in list_seg_source:
         list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
         seg_inverted=invert_seg(seg)
         # look for all the segment copies in the target genome walk, in both orientations
-        if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]):
-            for copy in segments_on_target_genome[seg][walk_name]:
+        if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
+            for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
                 seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
                 list_seg_target.append(seg_info)
-        if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) :
+        if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
             for copy in segments_on_target_genome[seg_inverted][walk_name]:
                 seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
                 list_seg_target.append(seg_info)
         index+=1
-    
     list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
-    old_index=list_seg_target[0][3]
-    old_strand=list_seg_target[0][1]
-    copy_number=1
-    first_segs_list=[]
-    last_segs_list=[]
-    old_seg_id=list_seg_target[0][1]+list_seg_target[0][0]
+    return [list_seg_target,list_seg_source_unstranded]
+
+def find_gene_copies(list_seg_source,walk_name,feature_id):
+    # find all copies of all segments from the gene in the target genome (in both orientations)
+    [list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name)
+    # find each copy of the gene in the ordered list of segments
+    [first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id)
+
+    # join the first and last seg lists
+    first_last_segs_list=[]
+    index=0
+    for first_seg in first_segs_list:
+        last_seg=last_segs_list[index]
+        pair=(first_seg,last_seg)
+        first_last_segs_list.append(pair)
+        index+=1
+
+    return first_last_segs_list # return a list of pairs (first_seg,last_seg)
+
+# called by find_gene_copies
+def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id):
+    # prepare the variables for the first iteration of the for loop
+    [old_id,old_strand,old_start,old_index]=list_seg_target[0]
+    [first_segs_list,last_segs_list]=[[],[]]
+    old_seg_id=old_strand+old_id
     first_segs_list.append(old_seg_id)
-    old_start=list_seg_target[0][2]
-    for segment in segments_on_target_genome[old_seg_id][walk_name]: # add first seg info
+
+    copy_number=1
+    copy_id="copy_"+str(copy_number)
+    feat_copy=(feature_id,copy_id)
+
+    for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
         if segment[1]==old_start:
             copy_id="copy_"+str(copy_number)
             feat_copy=(feature_id,copy_id)
@@ -523,71 +545,48 @@ def detect_gene_copies(list_seg_source,walk_name,feature_id):
         old_index+=1
     else:
         old_index-=1
-    
-    # find each copy of the gene in the ordered list of segments
-    for seg in list_seg_target:
-        new_seg_id=seg[1]+seg[0]
-        new_index=seg[3] # index in the list_source
-        new_strand=seg[1]
-        seg_start=seg[2]
+
+    for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
+        [new_id,new_strand,new_start,new_index]=seg
+        new_seg_id=new_strand+new_id
         inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk
 
         if inversion : 
-            if not((old_strand==new_strand) and (old_index>new_index)): # if the index decreases and the strand stays the same, it is the same gene copy
+            if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
                 last_segs_list.append(old_seg_id)
-                for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
-                    if segment[1]==old_start:
-                        copy_id="copy_"+str(copy_number)
-                        feat_copy=(feature_id,copy_id)
-                        segment.append(feat_copy)
-                        break
+                add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
+
                 copy_number+=1
+                copy_id="copy_"+str(copy_number)
+                feat_copy=(feature_id,copy_id) # new feature copy, change the info
+
                 first_segs_list.append(new_seg_id)
-                for segment in segments_on_target_genome[new_seg_id][walk_name]: # add first seg info
-                    if segment[1]==seg_start:
-                        copy_id="copy_"+str(copy_number)
-                        feat_copy=(feature_id,copy_id)
-                        segment.append(feat_copy)
-                        break
+                add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
         else: 
-            if not((old_strand==new_strand) and (old_index<new_index)): # if the index increases and the strand stays the same, it is the same gene copy
+            if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
                 last_segs_list.append(old_seg_id)
-                for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
-                    if segment[1]==old_start:
-                        copy_id="copy_"+str(copy_number)
-                        feat_copy=(feature_id,copy_id)
-                        segment.append(feat_copy)
-                        break
+                add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
+
                 copy_number+=1
+                copy_id="copy_"+str(copy_number)
+                feat_copy=(feature_id,copy_id) # new feature copy, change the info
+
                 first_segs_list.append(new_seg_id)
-                for segment in segments_on_target_genome[new_seg_id][walk_name]: # add first seg info
-                    if segment[1]==seg_start:
-                        copy_id="copy_"+str(copy_number)
-                        feat_copy=(feature_id,copy_id)
-                        segment.append(feat_copy)
-                        break
-        
-        old_strand=new_strand
-        old_index=new_index
-        old_seg_id=new_seg_id
-        old_start=seg_start
+                add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
+
+        [old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]
+
+    # add the last seg info
     last_segs_list.append(old_seg_id)
-    for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
-        if segment[1]==seg_start:
-            copy_id="copy_"+str(copy_number)
-            feat_copy=(feature_id,copy_id)
-            segment.append(feat_copy)
-            break
+    add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy) # add info for the last seg of the last copy
 
-    first_last_segs_list=[]
-    index=0
-    for first_seg in first_segs_list:
-        last_seg=last_segs_list[index]
-        pair=(first_seg,last_seg)
-        first_last_segs_list.append(pair)
-        index+=1
+    return [first_segs_list,last_segs_list]
 
-    return [first_last_segs_list] # return a list of pairs (first_seg,last_seg)
+def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy):
+    for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
+        if segment[1]==target_start:
+            segment.append(feat_copy) # add seg info
+            break
 
 def sort_seg_info(seg_info):
     return seg_info[2]
diff --git a/Functions_output.py b/Functions_output.py
index 40782fe..ea7301f 100644
--- a/Functions_output.py
+++ b/Functions_output.py
@@ -25,48 +25,13 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered,
             match.append(False) # store the information that this gene copy didnt pass the filters
             return "no"
         
-        if right_size(size_on_new_genome,max_diff,feature_id):# or (feature.type!="gene"): 
+        if right_size(size_on_new_genome,max_diff,feature_id):
             line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id) # transfer the gene
             write_line(line_gff,output_target_gff,False)
 
             # transfer all the gene's childs
-            # MAKE A FUNCTION FOR THAT
-            childs_list=get_child_list(feature_id)
-            for child_id in childs_list:
-                child=Features[child_id]
-                # look for the first and last seg of the child in its parent path
-                [child_first_seg,child_last_seg]=['','']
-                for seg in child.segments_list_source: # get first seg in the parent path
-                    if seg in feature_target_path: # parent path
-                        child_first_seg=seg
-                        break
-                    elif invert_seg(seg) in feature_target_path:
-                        child_first_seg=invert_seg(seg)
-                        break
-
-                if child_first_seg!='': # the child may be absent in this occurence !
-                    for seg in reversed(child.segments_list_source): # get last seg in the parent path
-                        if seg in feature_target_path: # parent path
-                            child_last_seg=seg
-                            break
-                        elif invert_seg(seg) in feature_target_path:
-                            child_last_seg=invert_seg(seg)
-                            break
-                    if inversion: # done twice, make a function
-                        child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1
-                    else:
-                        child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1
-                    child_size_diff=abs(child_size_on_new_genome-child.size)
-                    for match in child.segments_list_target:
-                        if (match[0]==walk) and (match[1]==copy_id):
-                            seg_list=match[2]
-                            [cov,id]=get_id_cov(child_id,seg_size,seg_list)
-
-                    # transfer the child feature:
-                    line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id)
-                    write_line(line_gff,output_target_gff,False)
-                
-            
+            transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size)
+
             match.append(True) # store the information that this gene copy was transfered
             return size_diff
         
@@ -78,7 +43,44 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered,
         reason_features_not_transfered[1]+=1
         match.append(False) # store the information that this gene copy didnt pass the filters
         return "no"
-
+    
+def transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size):
+    childs_list=get_child_list(feature_id)
+    for child_id in childs_list:
+        child=Features[child_id]
+        # look for the first and last seg of the child in its parent path
+        [child_first_seg,child_last_seg]=['','']
+        for seg in child.segments_list_source: # get first seg in the parent path
+            if seg in feature_target_path: # parent path
+                child_first_seg=seg
+                break
+            elif invert_seg(seg) in feature_target_path:
+                child_first_seg=invert_seg(seg)
+                break
+
+        if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature
+            for seg in reversed(child.segments_list_source): # get last seg in the parent path
+                if seg in feature_target_path: # parent path
+                    child_last_seg=seg
+                    break
+                elif invert_seg(seg) in feature_target_path:
+                    child_last_seg=invert_seg(seg)
+                    break
+            
+            if inversion: # calculate the child feature size
+                child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1
+            else:
+                child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1
+            child_size_diff=abs(child_size_on_new_genome-child.size)
+
+            for match in child.segments_list_target: # look for the right occurence of the child feature
+                if (match[0]==walk) and (match[1]==copy_id):
+                    seg_list=match[2]
+                    [cov,id]=get_id_cov(child_id,seg_size,seg_list)
+
+            # transfer the child feature:
+            line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id)
+            write_line(line_gff,output_target_gff,False)
 
 # handles all the transfer, variation, alignment (stats?), on the target genome.
 def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args):
-- 
GitLab