From e91317893db47bdbff4c46097e40423d4d7fcad7 Mon Sep 17 00:00:00 2001
From: "nina.marthe_ird.fr" <nina.marthe@ird.fr>
Date: Wed, 12 Mar 2025 11:19:50 +0100
Subject: [PATCH] changed feature_on_target_genome occurence into a class
 FeaturePath

---
 GrAnnoT/alignment.py                | 11 +++----
 GrAnnoT/detect_copies.py            |  2 +-
 GrAnnoT/find_feature_target_path.py | 27 +++++++++++++---
 GrAnnoT/genome_transfer.py          | 29 ++++++++---------
 GrAnnoT/load_gfa.py                 | 21 +------------
 GrAnnoT/setup_transfer.py           | 48 ++++++++++++-----------------
 GrAnnoT/usefull_little_functions.py | 12 --------
 GrAnnoT/variation_details.py        | 13 ++++----
 8 files changed, 66 insertions(+), 97 deletions(-)

diff --git a/GrAnnoT/alignment.py b/GrAnnoT/alignment.py
index 5ba684b..8f92f78 100644
--- a/GrAnnoT/alignment.py
+++ b/GrAnnoT/alignment.py
@@ -6,15 +6,14 @@ from .detect_inversion import detect_feature_inversion
 
 # get the alignment between the features on the source genome and the features on the target genome 
 def print_alignment(feat,match,seg_info,file_out_aln):
-    if match[0]!='':
-        feature_target_path=match[2]
-        if len(feature_target_path)>0: # if the feature is not completely absent
+    if match.walk_name!='':
+        if len(match.path)>0: # if the feature is not completely absent
             feature_path_source_genome=Features[feat].segments_list_source
-            inversion=detect_feature_inversion(feature_path_source_genome,feature_target_path)
+            inversion=detect_feature_inversion(feature_path_source_genome,match.path)
             if inversion:
-                feature_target_path=invert_segment_list(feature_target_path)
+                match.path=invert_segment_list(match.path)
 
-            line_aln=create_line_aln(feature_path_source_genome,feature_target_path,seg_info,feat)
+            line_aln=create_line_aln(feature_path_source_genome,match.path,seg_info,feat)
             file_out_aln.write(line_aln)
 
 # creates an alignment for two segments
diff --git a/GrAnnoT/detect_copies.py b/GrAnnoT/detect_copies.py
index abf1ea6..17395b6 100644
--- a/GrAnnoT/detect_copies.py
+++ b/GrAnnoT/detect_copies.py
@@ -136,7 +136,7 @@ def get_first_last_seg(list_seg,segments_on_target_genome):
                     break
         list_first_last_segs.append([first_seg_found,last_seg_found,walk_found])
         [first_seg_found,last_seg_found,walk_found]=['','','']
-    # return all the match
+    # return all the matches
     return list_first_last_segs
 
 
diff --git a/GrAnnoT/find_feature_target_path.py b/GrAnnoT/find_feature_target_path.py
index a62b010..b2cf4f2 100644
--- a/GrAnnoT/find_feature_target_path.py
+++ b/GrAnnoT/find_feature_target_path.py
@@ -20,7 +20,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge
             copy_number+=1
             copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
             feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
-            feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
+            feature_path=FeaturePath(walk_name,copy_id,feature_target_path) # informations about the occurence of the feature
             feature.segments_list_target.append(feature_path)
 
             # add the feat_copy to all the segments
@@ -40,7 +40,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge
             add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome)
      
     if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
-        feature.segments_list_target.append(['','',[]])
+        feature.segments_list_target.append(FeaturePath('','',[]))
 
 def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome):
     childs_list=Features[feature_id].get_child_list(Features)
@@ -73,9 +73,7 @@ def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_gen
             child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
             feat_copy=(child.key,copy_id)
 
-            feature_path=[walk_name,copy_id]
-            feature_path.append(child_path)
-            child.segments_list_target.append(feature_path)
+            child.segments_list_target.append(FeaturePath(walk_name,copy_id,child_path))
 
             for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
                 segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id,segments_on_target_genome)
@@ -91,3 +89,22 @@ def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,fea
     last_index=max(first_seg_index,last_seg_index)
     feature_path_target_genome=target_genome_path[first_index:last_index+1]
     return feature_path_target_genome
+
+
+class FeaturePath:
+    def __init__(self,walk_name,copy_id,path):
+        self.walk_name=walk_name
+        self.copy_id=copy_id
+        self.path=path
+        self.first_seg=''
+        self.last_seg=''
+        if len(path)>0:
+            self.first_seg=path[0]
+            self.last_seg=path[-1]
+        self.pass_filter=None
+        self.cov=0
+        self.id=0
+
+    def add_cov_id(self,cov_id):
+        self.cov=cov_id[0]
+        self.id=cov_id[1]
\ No newline at end of file
diff --git a/GrAnnoT/genome_transfer.py b/GrAnnoT/genome_transfer.py
index 9943eb3..4ca4b09 100644
--- a/GrAnnoT/genome_transfer.py
+++ b/GrAnnoT/genome_transfer.py
@@ -7,35 +7,31 @@ from .compute_cov_id import get_id_cov
 def generate_target_gff(feature_id,seg_info,args,reason_features_not_transfered,match,file_out_gff,segments_on_target_genome):
     feature=Features[feature_id]
 
-    # get list of the segments where it is and the first and last segment of the feature on the new genome
-    [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match)
-    inversion=detect_feature_inversion(feature.segments_list_source,feature_target_path)
-    if first_seg!='': # feature present on the target genome
+    inversion=detect_feature_inversion(feature.segments_list_source,match.path)
+    if match.first_seg!='': # feature present on the target genome
 
         if inversion:
-            size_on_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id,segments_on_target_genome)-get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id,segments_on_target_genome)+1
+            size_on_new_genome=get_feature_start_on_target_genome_inv(match.last_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)-get_feature_stop_on_target_genome_inv(match.first_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)+1
         else:
-            size_on_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id,segments_on_target_genome)-get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id,segments_on_target_genome)+1
+            size_on_new_genome=get_feature_stop_on_target_genome(match.last_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)-get_feature_start_on_target_genome(match.first_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)+1
         size_diff=abs(size_on_new_genome-feature.size)
 
-        [cov,id]=match[3]
-
-        if (cov*100<args.coverage) or (id*100<args.identity):
+        if (match.cov*100<args.coverage) or (match.id*100<args.identity):
             reason_features_not_transfered[1]+=1
-            match.append(False) # store the information that this gene copy didnt pass the filters
+            match.pass_filter=False # store the information that this gene copy didnt pass the filters
             return "no"
         else:
-            line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id,segments_on_target_genome) # transfer the gene
+            line_gff=create_line_target_gff(match.first_seg,match.last_seg,feature_id,size_diff,inversion,match.walk_name,match.cov,match.id,match.copy_id,segments_on_target_genome) # transfer the gene
             file_out_gff.write(line_gff)
 
             # transfer all the gene's childs
-            transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_info,file_out_gff,segments_on_target_genome)
+            transfer_childs(feature_id,match.path,match.walk_name,match.copy_id,inversion,seg_info,file_out_gff,segments_on_target_genome)
 
-            match.append(True) # store the information that this gene copy was transfered
+            match.pass_filter=True # store the information that this gene copy was transfered
             return size_diff
     else :
         reason_features_not_transfered[0]+=1
-        match.append(False) # store the information that this gene copy didnt pass the filters
+        match.pass_filter=False # store the information that this gene copy didnt pass the filters
         return "no"
     
 
@@ -68,9 +64,8 @@ def transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_in
             child_size_diff=abs(child_size_on_new_genome-child.size)
 
             for match in child.segments_list_target: # look for the right occurence of the child feature
-                if (match[0]==walk) and (match[1]==copy_id):
-                    seg_list=match[2]
-                    [cov,id]=get_id_cov(child.key,seg_info,seg_list)
+                if (match.walk_name==walk) and (match.copy_id==copy_id):
+                    [cov,id]=get_id_cov(child.key,seg_info,match.path)
                     break
 
             if inversion:
diff --git a/GrAnnoT/load_gfa.py b/GrAnnoT/load_gfa.py
index 1b31a0f..5557848 100644
--- a/GrAnnoT/load_gfa.py
+++ b/GrAnnoT/load_gfa.py
@@ -124,24 +124,6 @@ class SegmentTarget:
         return list_seg
 
 
-
-# # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
-# def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
-#     with open(pos_seg,'r') as bed:
-#         seg_count=0
-#         file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
-#         for line in bed:
-#             line=line.split()
-#             [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
-#             # check if segment present twice on the same walk ???
-#             #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
-#             if seg not in segments_on_target_genome:
-#                 segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
-#             if file_name not in segments_on_target_genome[seg]:
-#                 segments_on_target_genome[seg][file_name]=list()
-#             segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
-#             seg_count+=1
-
 # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
 def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
     with open(pos_seg,'r') as bed:
@@ -151,9 +133,8 @@ def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add t
             line=line.split()
             [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
             # check if segment present twice on the same walk ???
-            #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
             if seg not in segments_on_target_genome:
-                segments_on_target_genome[seg]=SegmentTarget() # dict of walks->segment_info; you get the info about the segment for each walk
+                segments_on_target_genome[seg]=SegmentTarget() # list of walks -> segment_info; you get the info about the segment for each walk
 
             segments_on_target_genome[seg].add_occurence(file_name,[chrom,start,stop,strand,index])
             seg_count+=1
\ No newline at end of file
diff --git a/GrAnnoT/setup_transfer.py b/GrAnnoT/setup_transfer.py
index 3f35a01..826b24d 100644
--- a/GrAnnoT/setup_transfer.py
+++ b/GrAnnoT/setup_transfer.py
@@ -2,7 +2,6 @@ from .intersect import Features
 from .compute_cov_id import get_id_cov
 from .find_feature_target_path import add_target_genome_paths
 from .compute_transfer_stat import stats_feature_missing_segment,stats_features
-from .usefull_little_functions import get_featurePath_ends
 from .load_gfa import get_segments_sequence
 
 from .genome_transfer import generate_target_gff
@@ -41,11 +40,11 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
                 feature=Features[feat]
                 if feature.parent=="": # usually a gene
                     for match in feature.segments_list_target: # compute cov and id for all matches.
-                        if match[0]=='':
+                        if match.walk_name=='':
                             feature_target_path=[]
                         else:
-                            feature_target_path=match[2]
-                        match.append(get_id_cov(feat,seg_info,feature_target_path))
+                            feature_target_path=match.path
+                        match.add_cov_id(get_id_cov(feat,seg_info,feature_target_path))
                     for match in feature.segments_list_target:
                         # if option no_dupl, only transfer the best match
                         if args.no_duplication:
@@ -53,15 +52,11 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
                             all_match=feature.segments_list_target
                             best_match=all_match[0]
                             for candidate_match in all_match:
-                                candidate_cov=candidate_match[3][0]
-                                candidate_id=candidate_match[3][1]
-                                best_cov=best_match[3][0]
-                                best_id=best_match[3][1]
-                                if (candidate_cov+candidate_id)>(best_cov+best_id):
+                                if (candidate_match.cov+candidate_match.id)>(best_match.cov+best_match.id):
                                     best_match=candidate_match
 
                             if match!=best_match: # if current match is not best match, dont transfer
-                                match.append(False)
+                                match.pass_filter=False
                                 continue
 
                         transfer_stat=generate_target_gff(feat,seg_info,args,reason_features_not_transfered,match,file_out_gff,segments_on_target_genome) # the childs are handled in this function
@@ -76,8 +71,8 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
             list_seg=Features[feat].segments_list_source
             if Features[feat].parent=="":
                 feature_target_path=[]
-                for occurence in Features[feat].segments_list_target: # add all the occurences of the feature in the target genome
-                    feature_target_path+=occurence[2]
+                for match in Features[feat].segments_list_target: # add all the occurences of the feature in the target genome
+                    feature_target_path+=match.path
             for segment in list_seg:
                 segments_list[segment[1:]]=''
             for segment in feature_target_path:
@@ -90,19 +85,18 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
                 if feature.parent=="": # usually a gene
 
                     for match in feature.segments_list_target: # compute cov and id for all matches.
-                        if match[0]=='':
+                        if match.walk_name=='':
                             feature_target_path=[]
                         else:
-                            feature_target_path=match[2]
-                        match.append(get_id_cov(feat,seg_info,feature_target_path))
+                            feature_target_path=match.path
+                        match.add_cov_id(get_id_cov(feat,seg_info,feature_target_path))
 
                     for match in feature.segments_list_target:
-                        [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match)
                         # add the filter info in all the matches...
-                        [cov,id]=get_id_cov(feature_id,seg_info,feature_target_path)
+                        [cov,id]=get_id_cov(feature_id,seg_info,match.feature_target_path)
 
-                        if (cov*100<args.coverage) or (id*100<args.identity) or (first_seg==''): # didnt put the "right_size" filter)
-                            match.append(False) # store the information that this gene copy didnt pass the filters
+                        if (cov*100<args.coverage) or (id*100<args.identity) or (match.first_seg==''): # didnt put the "right_size" filter)
+                            match.pass_filter=False # store the information that this gene copy didnt pass the filters
                         else:
                             # if option no_dupl, only transfer the best match
                             if args.no_duplication:
@@ -110,17 +104,13 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
                                 all_match=feature.segments_list_target
                                 best_match=all_match[0]
                                 for candidate_match in all_match:
-                                    candidate_cov=candidate_match[3][0]
-                                    candidate_id=candidate_match[3][1]
-                                    best_cov=best_match[3][0]
-                                    best_id=best_match[3][1]
-                                    if (candidate_cov+candidate_id)>(best_cov+best_id):
+                                    if (candidate_match.cov+candidate_match.id)>(best_match.cov+best_match.id):
                                         best_match=candidate_match
 
                                 if match!=best_match: # if current match is not best match, dont transfer
-                                    match.append(False)
+                                    match.pass_filter=False
                                     continue
-                            match.append(True)
+                            match.pass_filter=True
 
     if args.variation:
         get_segments_sequence(segments_file,segments_list,seg_info)
@@ -144,7 +134,7 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
                 feature=Features[feat]
                 if feature.parent=="": # usually a gene
                     for match in feature.segments_list_target: # for all occurences of the gene
-                        if match[4]==True:
+                        if match.pass_filter==True:
                             print_alignment(feat,match,seg_info,file_out_aln)
 
 
@@ -161,10 +151,10 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path
 
         # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
         list_seg=Features[feat].segments_list_source
-        [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(Features[feat].segments_list_target[0])
+        match=Features[feat].segments_list_target[0]
 
         for feat in list_feature_to_transfer:
-            stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat,walk,segments_on_target_genome)
+            stats_feature_missing_segment(feature_missing_segments,match.first_seg,match.last_seg,list_seg,feat,match.walk_name,segments_on_target_genome)
 
         if args.annotation:
             absent_features=reason_features_not_transfered[0];low_cov_id=reason_features_not_transfered[1]
diff --git a/GrAnnoT/usefull_little_functions.py b/GrAnnoT/usefull_little_functions.py
index 91c33dc..7f08dbf 100644
--- a/GrAnnoT/usefull_little_functions.py
+++ b/GrAnnoT/usefull_little_functions.py
@@ -38,18 +38,6 @@ def get_feature_stop_on_target_genome_inv(stop_seg,feat_id,walk,copy_id,segments
     return seg_end_pos-feat_stop_pos+1
 
 
-# gives the first and last segments of a walk (list)
-def get_featurePath_ends(walk_info): # walk_info = [walk_name,copy_id,[list_seg]]
-    walk_name=walk_info[0]
-    if walk_name!='':
-        seg_list=walk_info[2]
-        copy_id=walk_info[1]
-        [first_seg,last_seg]=[seg_list[0],seg_list[-1]]
-    else:
-        [first_seg,last_seg,walk_name,copy_id,seg_list]=['','','','',[]]
-    return [first_seg,last_seg,walk_name,copy_id,seg_list]
-
-
 # invert the given strand
 def invert_strand(strand):
     match strand:
diff --git a/GrAnnoT/variation_details.py b/GrAnnoT/variation_details.py
index e470991..c87fe3c 100644
--- a/GrAnnoT/variation_details.py
+++ b/GrAnnoT/variation_details.py
@@ -7,13 +7,12 @@ from .detect_inversion import detect_feature_inversion
 # handle the variations analysis between the feature on the source genome and the feature on the target genome
 def generate_variations_details(feature_id,seg_info,match,file_out_var,segments_on_target_genome):
     # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
-    [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match)
 
     # do it for the gene, then for its childs.
-    if match[4]==True: # gene passed the filters
-        inversion=detect_feature_inversion(Features[feature_id].segments_list_source,feature_target_path)
-        print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome)
-        child_variation_details(feature_id,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome)
+    if match.pass_filter==True: # gene passed the filters
+        inversion=detect_feature_inversion(Features[feature_id].segments_list_source,match.path)
+        print_variations(feature_id,match.first_seg,match.last_seg,match.copy_id,match.walk_name,seg_info,match.path,inversion,file_out_var,segments_on_target_genome)
+        child_variation_details(feature_id,match.copy_id,match.walk_name,seg_info,match.path,inversion,file_out_var,segments_on_target_genome)
 
 def child_variation_details(feature_id,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome):
         # print child feature var
@@ -60,8 +59,8 @@ def get_child_last_seg(child_segments_list_source,feature_target_path):
 
 def find_child_target_path(child_segments_list_target,walk,copy_id):
     for match in child_segments_list_target: # look for the right occurence of the child feature
-        if (match[0]==walk) and (match[1]==copy_id):
-            return match[2]
+        if (match.walk_name==walk) and (match.copy_id==copy_id):
+            return match.path
 
 def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome):
 
-- 
GitLab