From 1efc9e322e6f65bf0e732adb0bd591dd341b7132 Mon Sep 17 00:00:00 2001
From: "nina.marthe_ird.fr" <nina.marthe@ird.fr>
Date: Thu, 8 Feb 2024 10:35:05 +0100
Subject: [PATCH] added indexing of genome paths and binary search

---
 Functions.py        | 112 +++++++++++++++++++++++++++++++++++++-------
 Functions_output.py |  15 +++---
 main.py             |   2 -
 3 files changed, 102 insertions(+), 27 deletions(-)

diff --git a/Functions.py b/Functions.py
index 2304df5..4cb0a8c 100644
--- a/Functions.py
+++ b/Functions.py
@@ -161,7 +161,7 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li
 #Â [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
     feature_missing_segments[5].append(feature_id)
 
-    if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely
+    if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
         feature_missing_segments[3].append(feature_id)
     elif first_seg != list_seg[0][1:]: # the first segment is missing 
         feature_missing_segments[0].append(feature_id)
@@ -245,7 +245,7 @@ def get_segments_positions_on_genome(pos_seg):
         [seg,chrom,start,stop,strand]=[line[3][1:],line[0],int(line[1])+1,int(line[2]),line[3][0:1]] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
         segments_on_target_genome[seg]=[chrom,start,stop,strand]
 
-def get_segments_sequence_and_paths(gfa):
+def get_segments_sequence_and_paths(gfa,genomes_to_index):
     file_gfa=open(gfa,'r')
     lines_gfa=file_gfa.readlines()
     file_gfa.close()
@@ -258,17 +258,19 @@ def get_segments_sequence_and_paths(gfa):
             seg_seq[seg_id]=line[2]
 
         if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
+
+            if line[3] in genomes_to_index:
             
-            path=line[6].replace(">",";>")
-            path=path.replace("<",";<").split(';')
-            list_path=[]
-            for segment in path:
-                if segment[0:1]=='>':
-                    list_path.append('+s'+segment[1:])
-                elif segment[0:1]=='<':
-                    list_path.append('-s'+segment[1:])
- 
-            paths[line[3]]=list_path
+                path=line[6].replace(">",";>")
+                path=path.replace("<",";<").split(';')
+                list_path=[]
+                for segment in path:
+                    if segment[0:1]=='>':
+                        list_path.append('>s'+segment[1:])
+                    elif segment[0:1]=='<':
+                        list_path.append('<s'+segment[1:])
+    
+                paths[line[3]]=list_path
     return [paths,seg_seq]
 
 def get_first_seg(list_seg): # get the first segment of the list that is in the target genome
@@ -279,6 +281,40 @@ def get_first_seg(list_seg): # get the first segment of the list that is in the
             break
     return first_seg_found
 
+def key_segment(pair):
+    return int(pair[0][2:])
+
+def index_paths(gfa,genomes_to_index):
+    file_gfa=open(gfa,'r')
+    lines_gfa=file_gfa.readlines()
+    file_gfa.close()
+    index_paths={}
+
+    for line in lines_gfa:
+        line=line.split()
+
+        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get all paths
+
+            if line[3] in genomes_to_index:
+
+                path=line[6].replace(">",";>")
+                path=path.replace("<",";<").split(';')
+                list_segments=[]
+                list_index=[]
+                index=0
+                for segment in path: #Â get list of segments
+                    if segment[0:1]=='>':
+                        list_segments.append('>s'+segment[1:])
+                    elif segment[0:1]=='<':
+                        list_segments.append('<s'+segment[1:])
+                    list_index.append(index)
+                    index+=1
+
+                zipped_lists=zip(list_segments,list_index)
+                sorted_pairs=sorted(zipped_lists,key=key_segment)
+                index_paths[line[3]]=list(sorted_pairs)
+
+    return index_paths
 
 
 # functions to get the detail of the variations in the features
@@ -299,19 +335,58 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
             same_strand_count+=1
     return [seg_common,same_strand_count]
 
+def path_binary_search(indexed_path,segment): #Â add the option to use the strand or not. rn the strand is not used. it will complicate the comparisons.
+    index=-1
+    if len(indexed_path)==0:
+        print("empty path")
+        return index
+    [found,lower,upper]=[False,0,len(indexed_path)]
+    check_index=(upper+lower)//2        
+    
+    while found==False:
+
+        if upper-lower<=1:
+            if int(indexed_path[check_index][0][2:])==int(segment[2:]):
+                index=indexed_path[check_index][1]
+            break
+
+        if int(segment[2:])==int(indexed_path[check_index][0][2:]):
+            index=indexed_path[check_index][1]
+            found=True
+        elif int(segment[2:])>int(indexed_path[check_index][0][2:]):
+            lower=check_index+1
+            check_index=(upper+lower)//2
+        else:
+            upper=check_index
+            check_index=(upper+lower)//2
+
+    return index
+
+def get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name):
+    # find the path in target genome
+    first_strand=segments_on_target_genome[first_seg][3]
+    first_seg_stranded=first_strand+first_seg
+    last_strand=segments_on_target_genome[last_seg][3]
+    last_seg_stranded=last_strand+last_seg
+    first_seg_index=path_binary_search(paths_index[target_genome_name],first_seg_stranded)
+    last_seg_index=path_binary_search(paths_index[target_genome_name],last_seg_stranded)
+    first_index=min(first_seg_index,last_seg_index)
+    last_index=max(first_seg_index,last_seg_index)
+    feature_path_target_genome=paths[target_genome_name][first_index:last_index+1]
+    return feature_path_target_genome
+
 def get_feature_path(paths,first_seg,last_seg,target_genome_name):
     # find the path in target genome
-    first_strand=convert_strand(segments_on_target_genome[first_seg][3])
+    first_strand=segments_on_target_genome[first_seg][3]
     first_seg_stranded=first_strand+first_seg
-    last_strand=convert_strand(segments_on_target_genome[last_seg][3])
+    last_strand=segments_on_target_genome[last_seg][3]
     last_seg_stranded=last_strand+last_seg
     index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
     index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
     first_index=min(index_first_seg,index_last_seg)
     last_index=max(index_last_seg,index_first_seg)
     list_segfeat_target_genome=paths[target_genome_name][first_index:last_index+1]
-    list_segfeat_target_genome_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_target_genome]
-    return list_segfeat_target_genome_corrected
+    return list_segfeat_target_genome
 
 def convert_strand(strand):
     match strand:
@@ -379,7 +454,7 @@ class Variation:
     #def __str__(self):
     #    return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
 
-def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
+def create_var(feature_id,first_seg,last_seg,paths,paths_index,target_genome_name):
     feature=Features[feature_id]
     start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id)-1
     stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id)
@@ -387,7 +462,8 @@ def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
     size_diff=str(size_new_genome-feature.size)
 
     #Â get feature paths on the original genome and on the target genome
-    list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
+    list_segfeat_target_genome=get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name)
+    #list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
     list_segfeat_source_genome=feature.segments_list
     [list_segfeat_source_genome,list_segfeat_target_genome,inversion]=detect_feature_inversion(list_segfeat_source_genome,list_segfeat_target_genome)
 
diff --git a/Functions_output.py b/Functions_output.py
index 2170c09..1750004 100644
--- a/Functions_output.py
+++ b/Functions_output.py
@@ -25,13 +25,14 @@ def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff):
 
 # writes the gff of target genome using the gff of the graph
 def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome_name,max_diff):
-    print("generation of the genome's gff ")
+    print("generation of the target genome's outputs")
 
     #Â create variables and open files
-    [once,var,stats]=[True,True,False]
-    clustal=True
+    [once,var,stats,clustal]=[True,True,False,True]
+    genomes_to_index=[target_genome_name]
     if var:
-        [paths,seg_seq]=get_segments_sequence_and_paths(gfa)
+        [paths,seg_seq]=get_segments_sequence_and_paths(gfa,genomes_to_index)
+        paths_index=index_paths(gfa,genomes_to_index)
         file_out_var = open(out_var, 'w')
         global output_variations
         output_variations=[0,"",file_out_var]
@@ -82,7 +83,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
 
         # outputs the detail of variations of the feature :
         if var:
-            print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name)
+            print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name)
             write_line("",output_variations,True)
 
         if stats==True:
@@ -109,10 +110,10 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
 
 # functions to get the detail of the variations in the features
 
-def print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name):
+def print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name):
 
     if (first_seg!=''): # if the feature is not completly absent        # add the else, output absent features
-        [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,target_genome_name) # removes the strands in the segment lists
+        [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,paths_index,target_genome_name) # removes the strands in the segment lists
         feature=Features[feat]
         feat_start=feature.start
         # loop to go through both paths with i and j
diff --git a/main.py b/main.py
index 05b5201..5ad9650 100755
--- a/main.py
+++ b/main.py
@@ -1,5 +1,3 @@
-#!/home/nina/annotpangenome/.venv/bin/python
-
 # created by Nina Marthe 2023 - nina.marthe@ird.fr
 # licensed under MIT
 
-- 
GitLab