From 1efc9e322e6f65bf0e732adb0bd591dd341b7132 Mon Sep 17 00:00:00 2001 From: "nina.marthe_ird.fr" <nina.marthe@ird.fr> Date: Thu, 8 Feb 2024 10:35:05 +0100 Subject: [PATCH] added indexing of genome paths and binary search --- Functions.py | 112 +++++++++++++++++++++++++++++++++++++------- Functions_output.py | 15 +++--- main.py | 2 - 3 files changed, 102 insertions(+), 27 deletions(-) diff --git a/Functions.py b/Functions.py index 2304df5..4cb0a8c 100644 --- a/Functions.py +++ b/Functions.py @@ -161,7 +161,7 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li #Â [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] feature_missing_segments[5].append(feature_id) - if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely + if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely feature_missing_segments[3].append(feature_id) elif first_seg != list_seg[0][1:]: # the first segment is missing feature_missing_segments[0].append(feature_id) @@ -245,7 +245,7 @@ def get_segments_positions_on_genome(pos_seg): [seg,chrom,start,stop,strand]=[line[3][1:],line[0],int(line[1])+1,int(line[2]),line[3][0:1]] # +1 in the start to convert the bed 0-based coordinate to a 1-based system segments_on_target_genome[seg]=[chrom,start,stop,strand] -def get_segments_sequence_and_paths(gfa): +def get_segments_sequence_and_paths(gfa,genomes_to_index): file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() @@ -258,17 +258,19 @@ def get_segments_sequence_and_paths(gfa): seg_seq[seg_id]=line[2] if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome + + if line[3] in genomes_to_index: - path=line[6].replace(">",";>") - path=path.replace("<",";<").split(';') - list_path=[] - for segment in path: - if segment[0:1]=='>': - list_path.append('+s'+segment[1:]) - elif segment[0:1]=='<': - list_path.append('-s'+segment[1:]) - - paths[line[3]]=list_path + path=line[6].replace(">",";>") + path=path.replace("<",";<").split(';') + list_path=[] + for segment in path: + if segment[0:1]=='>': + list_path.append('>s'+segment[1:]) + elif segment[0:1]=='<': + list_path.append('<s'+segment[1:]) + + paths[line[3]]=list_path return [paths,seg_seq] def get_first_seg(list_seg): # get the first segment of the list that is in the target genome @@ -279,6 +281,40 @@ def get_first_seg(list_seg): # get the first segment of the list that is in the break return first_seg_found +def key_segment(pair): + return int(pair[0][2:]) + +def index_paths(gfa,genomes_to_index): + file_gfa=open(gfa,'r') + lines_gfa=file_gfa.readlines() + file_gfa.close() + index_paths={} + + for line in lines_gfa: + line=line.split() + + if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get all paths + + if line[3] in genomes_to_index: + + path=line[6].replace(">",";>") + path=path.replace("<",";<").split(';') + list_segments=[] + list_index=[] + index=0 + for segment in path: #Â get list of segments + if segment[0:1]=='>': + list_segments.append('>s'+segment[1:]) + elif segment[0:1]=='<': + list_segments.append('<s'+segment[1:]) + list_index.append(index) + index+=1 + + zipped_lists=zip(list_segments,list_index) + sorted_pairs=sorted(zipped_lists,key=key_segment) + index_paths[line[3]]=list(sorted_pairs) + + return index_paths # functions to get the detail of the variations in the features @@ -299,19 +335,58 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand): same_strand_count+=1 return [seg_common,same_strand_count] +def path_binary_search(indexed_path,segment): #Â add the option to use the strand or not. rn the strand is not used. it will complicate the comparisons. + index=-1 + if len(indexed_path)==0: + print("empty path") + return index + [found,lower,upper]=[False,0,len(indexed_path)] + check_index=(upper+lower)//2 + + while found==False: + + if upper-lower<=1: + if int(indexed_path[check_index][0][2:])==int(segment[2:]): + index=indexed_path[check_index][1] + break + + if int(segment[2:])==int(indexed_path[check_index][0][2:]): + index=indexed_path[check_index][1] + found=True + elif int(segment[2:])>int(indexed_path[check_index][0][2:]): + lower=check_index+1 + check_index=(upper+lower)//2 + else: + upper=check_index + check_index=(upper+lower)//2 + + return index + +def get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name): + # find the path in target genome + first_strand=segments_on_target_genome[first_seg][3] + first_seg_stranded=first_strand+first_seg + last_strand=segments_on_target_genome[last_seg][3] + last_seg_stranded=last_strand+last_seg + first_seg_index=path_binary_search(paths_index[target_genome_name],first_seg_stranded) + last_seg_index=path_binary_search(paths_index[target_genome_name],last_seg_stranded) + first_index=min(first_seg_index,last_seg_index) + last_index=max(first_seg_index,last_seg_index) + feature_path_target_genome=paths[target_genome_name][first_index:last_index+1] + return feature_path_target_genome + def get_feature_path(paths,first_seg,last_seg,target_genome_name): # find the path in target genome - first_strand=convert_strand(segments_on_target_genome[first_seg][3]) + first_strand=segments_on_target_genome[first_seg][3] first_seg_stranded=first_strand+first_seg - last_strand=convert_strand(segments_on_target_genome[last_seg][3]) + last_strand=segments_on_target_genome[last_seg][3] last_seg_stranded=last_strand+last_seg index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) first_index=min(index_first_seg,index_last_seg) last_index=max(index_last_seg,index_first_seg) list_segfeat_target_genome=paths[target_genome_name][first_index:last_index+1] - list_segfeat_target_genome_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_target_genome] - return list_segfeat_target_genome_corrected + return list_segfeat_target_genome def convert_strand(strand): match strand: @@ -379,7 +454,7 @@ class Variation: #def __str__(self): # return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}" -def create_var(feature_id,first_seg,last_seg,paths,target_genome_name): +def create_var(feature_id,first_seg,last_seg,paths,paths_index,target_genome_name): feature=Features[feature_id] start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id)-1 stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id) @@ -387,7 +462,8 @@ def create_var(feature_id,first_seg,last_seg,paths,target_genome_name): size_diff=str(size_new_genome-feature.size) #Â get feature paths on the original genome and on the target genome - list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name) + list_segfeat_target_genome=get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name) + #list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name) list_segfeat_source_genome=feature.segments_list [list_segfeat_source_genome,list_segfeat_target_genome,inversion]=detect_feature_inversion(list_segfeat_source_genome,list_segfeat_target_genome) diff --git a/Functions_output.py b/Functions_output.py index 2170c09..1750004 100644 --- a/Functions_output.py +++ b/Functions_output.py @@ -25,13 +25,14 @@ def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff): # writes the gff of target genome using the gff of the graph def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome_name,max_diff): - print("generation of the genome's gff ") + print("generation of the target genome's outputs") #Â create variables and open files - [once,var,stats]=[True,True,False] - clustal=True + [once,var,stats,clustal]=[True,True,False,True] + genomes_to_index=[target_genome_name] if var: - [paths,seg_seq]=get_segments_sequence_and_paths(gfa) + [paths,seg_seq]=get_segments_sequence_and_paths(gfa,genomes_to_index) + paths_index=index_paths(gfa,genomes_to_index) file_out_var = open(out_var, 'w') global output_variations output_variations=[0,"",file_out_var] @@ -82,7 +83,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome # outputs the detail of variations of the feature : if var: - print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name) + print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name) write_line("",output_variations,True) if stats==True: @@ -109,10 +110,10 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome # functions to get the detail of the variations in the features -def print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name): +def print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name): if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features - [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,target_genome_name) # removes the strands in the segment lists + [variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,paths_index,target_genome_name) # removes the strands in the segment lists feature=Features[feat] feat_start=feature.start # loop to go through both paths with i and j diff --git a/main.py b/main.py index 05b5201..5ad9650 100755 --- a/main.py +++ b/main.py @@ -1,5 +1,3 @@ -#!/home/nina/annotpangenome/.venv/bin/python - # created by Nina Marthe 2023 - nina.marthe@ird.fr # licensed under MIT -- GitLab