added indexing of genome paths and binary search

1efc9e32 · nina.marthe_ird.fr · 79fe687d · 1efc9e32 · 1efc9e32 · 1efc9e32
Commit 1efc9e32 authored 1 year ago by nina.marthe_ird.fr
--- a/Functions.py
+++ b/Functions.py
@@ -161,7 +161,7 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li
 # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
    feature_missing_segments[5].append(feature_id)

-    if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely
+    if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
        feature_missing_segments[3].append(feature_id)
    elif first_seg != list_seg[0][1:]: # the first segment is missing 
        feature_missing_segments[0].append(feature_id)
@@ -245,7 +245,7 @@ def get_segments_positions_on_genome(pos_seg):
        [seg,chrom,start,stop,strand]=[line[3][1:],line[0],int(line[1])+1,int(line[2]),line[3][0:1]] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
        segments_on_target_genome[seg]=[chrom,start,stop,strand]

-def get_segments_sequence_and_paths(gfa):
+def get_segments_sequence_and_paths(gfa,genomes_to_index):
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    file_gfa.close()
@@ -258,17 +258,19 @@ def get_segments_sequence_and_paths(gfa):
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
+
+            if line[3] in genomes_to_index:
            
-            path=line[6].replace(">",";>")
-            path=path.replace("<",";<").split(';')
-            list_path=[]
-            for segment in path:
-                if segment[0:1]=='>':
-                    list_path.append('+s'+segment[1:])
-                elif segment[0:1]=='<':
-                    list_path.append('-s'+segment[1:])
- 
-            paths[line[3]]=list_path
+                path=line[6].replace(">",";>")
+                path=path.replace("<",";<").split(';')
+                list_path=[]
+                for segment in path:
+                    if segment[0:1]=='>':
+                        list_path.append('>s'+segment[1:])
+                    elif segment[0:1]=='<':
+                        list_path.append('<s'+segment[1:])
+    
+                paths[line[3]]=list_path
    return [paths,seg_seq]

 def get_first_seg(list_seg): # get the first segment of the list that is in the target genome
@@ -279,6 +281,40 @@ def get_first_seg(list_seg): # get the first segment of the list that is in the
            break
    return first_seg_found

+def key_segment(pair):
+    return int(pair[0][2:])
+
+def index_paths(gfa,genomes_to_index):
+    file_gfa=open(gfa,'r')
+    lines_gfa=file_gfa.readlines()
+    file_gfa.close()
+    index_paths={}
+
+    for line in lines_gfa:
+        line=line.split()
+
+        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get all paths
+
+            if line[3] in genomes_to_index:
+
+                path=line[6].replace(">",";>")
+                path=path.replace("<",";<").split(';')
+                list_segments=[]
+                list_index=[]
+                index=0
+                for segment in path: # get list of segments
+                    if segment[0:1]=='>':
+                        list_segments.append('>s'+segment[1:])
+                    elif segment[0:1]=='<':
+                        list_segments.append('<s'+segment[1:])
+                    list_index.append(index)
+                    index+=1
+
+                zipped_lists=zip(list_segments,list_index)
+                sorted_pairs=sorted(zipped_lists,key=key_segment)
+                index_paths[line[3]]=list(sorted_pairs)
+
+    return index_paths


 # functions to get the detail of the variations in the features
@@ -299,19 +335,58 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
            same_strand_count+=1
    return [seg_common,same_strand_count]

+def path_binary_search(indexed_path,segment): # add the option to use the strand or not. rn the strand is not used. it will complicate the comparisons.
+    index=-1
+    if len(indexed_path)==0:
+        print("empty path")
+        return index
+    [found,lower,upper]=[False,0,len(indexed_path)]
+    check_index=(upper+lower)//2        
+    
+    while found==False:
+
+        if upper-lower<=1:
+            if int(indexed_path[check_index][0][2:])==int(segment[2:]):
+                index=indexed_path[check_index][1]
+            break
+
+        if int(segment[2:])==int(indexed_path[check_index][0][2:]):
+            index=indexed_path[check_index][1]
+            found=True
+        elif int(segment[2:])>int(indexed_path[check_index][0][2:]):
+            lower=check_index+1
+            check_index=(upper+lower)//2
+        else:
+            upper=check_index
+            check_index=(upper+lower)//2
+
+    return index
+
+def get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name):
+    # find the path in target genome
+    first_strand=segments_on_target_genome[first_seg][3]
+    first_seg_stranded=first_strand+first_seg
+    last_strand=segments_on_target_genome[last_seg][3]
+    last_seg_stranded=last_strand+last_seg
+    first_seg_index=path_binary_search(paths_index[target_genome_name],first_seg_stranded)
+    last_seg_index=path_binary_search(paths_index[target_genome_name],last_seg_stranded)
+    first_index=min(first_seg_index,last_seg_index)
+    last_index=max(first_seg_index,last_seg_index)
+    feature_path_target_genome=paths[target_genome_name][first_index:last_index+1]
+    return feature_path_target_genome
+
 def get_feature_path(paths,first_seg,last_seg,target_genome_name):
    # find the path in target genome
-    first_strand=convert_strand(segments_on_target_genome[first_seg][3])
+    first_strand=segments_on_target_genome[first_seg][3]
    first_seg_stranded=first_strand+first_seg
-    last_strand=convert_strand(segments_on_target_genome[last_seg][3])
+    last_strand=segments_on_target_genome[last_seg][3]
    last_seg_stranded=last_strand+last_seg
    index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
    index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
    first_index=min(index_first_seg,index_last_seg)
    last_index=max(index_last_seg,index_first_seg)
    list_segfeat_target_genome=paths[target_genome_name][first_index:last_index+1]
-    list_segfeat_target_genome_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_target_genome]
-    return list_segfeat_target_genome_corrected
+    return list_segfeat_target_genome

 def convert_strand(strand):
    match strand:
@@ -379,7 +454,7 @@ class Variation:
    #def __str__(self):
    #    return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"

-def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
+def create_var(feature_id,first_seg,last_seg,paths,paths_index,target_genome_name):
    feature=Features[feature_id]
    start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id)-1
    stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id)
@@ -387,7 +462,8 @@ def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
    size_diff=str(size_new_genome-feature.size)

    # get feature paths on the original genome and on the target genome
-    list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
+    list_segfeat_target_genome=get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name)
+    #list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
    list_segfeat_source_genome=feature.segments_list
    [list_segfeat_source_genome,list_segfeat_target_genome,inversion]=detect_feature_inversion(list_segfeat_source_genome,list_segfeat_target_genome)


--- a/Functions_output.py
+++ b/Functions_output.py
@@ -25,13 +25,14 @@ def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff):

 # writes the gff of target genome using the gff of the graph
 def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome_name,max_diff):
-    print("generation of the genome's gff ")
+    print("generation of the target genome's outputs")

    # create variables and open files
-    [once,var,stats]=[True,True,False]
-    clustal=True
+    [once,var,stats,clustal]=[True,True,False,True]
+    genomes_to_index=[target_genome_name]
    if var:
-        [paths,seg_seq]=get_segments_sequence_and_paths(gfa)
+        [paths,seg_seq]=get_segments_sequence_and_paths(gfa,genomes_to_index)
+        paths_index=index_paths(gfa,genomes_to_index)
        file_out_var = open(out_var, 'w')
        global output_variations
        output_variations=[0,"",file_out_var]
@@ -82,7 +83,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome

        # outputs the detail of variations of the feature :
        if var:
-            print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name)
+            print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name)
            write_line("",output_variations,True)

        if stats==True:
@@ -109,10 +110,10 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome

 # functions to get the detail of the variations in the features

-def print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name):
+def print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name):

    if (first_seg!=''): # if the feature is not completly absent        # add the else, output absent features
-        [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,target_genome_name) # removes the strands in the segment lists
+        [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,paths_index,target_genome_name) # removes the strands in the segment lists
        feature=Features[feat]
        feat_start=feature.start
        # loop to go through both paths with i and j

--- a/main.py
+++ b/main.py
-#!/home/nina/annotpangenome/.venv/bin/python
-
 # created by Nina Marthe 2023 - nina.marthe@ird.fr
 # licensed under MIT