Skip to content
Snippets Groups Projects
Commit 1efc9e32 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

added indexing of genome paths and binary search

parent 79fe687d
No related branches found
No related tags found
No related merge requests found
......@@ -161,7 +161,7 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
feature_missing_segments[5].append(feature_id)
if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely
if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_segments[3].append(feature_id)
elif first_seg != list_seg[0][1:]: # the first segment is missing
feature_missing_segments[0].append(feature_id)
......@@ -245,7 +245,7 @@ def get_segments_positions_on_genome(pos_seg):
[seg,chrom,start,stop,strand]=[line[3][1:],line[0],int(line[1])+1,int(line[2]),line[3][0:1]] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
segments_on_target_genome[seg]=[chrom,start,stop,strand]
def get_segments_sequence_and_paths(gfa):
def get_segments_sequence_and_paths(gfa,genomes_to_index):
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
......@@ -258,17 +258,19 @@ def get_segments_sequence_and_paths(gfa):
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
if line[3] in genomes_to_index:
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
paths[line[3]]=list_path
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('>s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('<s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
def get_first_seg(list_seg): # get the first segment of the list that is in the target genome
......@@ -279,6 +281,40 @@ def get_first_seg(list_seg): # get the first segment of the list that is in the
break
return first_seg_found
def key_segment(pair):
return int(pair[0][2:])
def index_paths(gfa,genomes_to_index):
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
index_paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get all paths
if line[3] in genomes_to_index:
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_segments=[]
list_index=[]
index=0
for segment in path: # get list of segments
if segment[0:1]=='>':
list_segments.append('>s'+segment[1:])
elif segment[0:1]=='<':
list_segments.append('<s'+segment[1:])
list_index.append(index)
index+=1
zipped_lists=zip(list_segments,list_index)
sorted_pairs=sorted(zipped_lists,key=key_segment)
index_paths[line[3]]=list(sorted_pairs)
return index_paths
# functions to get the detail of the variations in the features
......@@ -299,19 +335,58 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
same_strand_count+=1
return [seg_common,same_strand_count]
def path_binary_search(indexed_path,segment): # add the option to use the strand or not. rn the strand is not used. it will complicate the comparisons.
index=-1
if len(indexed_path)==0:
print("empty path")
return index
[found,lower,upper]=[False,0,len(indexed_path)]
check_index=(upper+lower)//2
while found==False:
if upper-lower<=1:
if int(indexed_path[check_index][0][2:])==int(segment[2:]):
index=indexed_path[check_index][1]
break
if int(segment[2:])==int(indexed_path[check_index][0][2:]):
index=indexed_path[check_index][1]
found=True
elif int(segment[2:])>int(indexed_path[check_index][0][2:]):
lower=check_index+1
check_index=(upper+lower)//2
else:
upper=check_index
check_index=(upper+lower)//2
return index
def get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name):
# find the path in target genome
first_strand=segments_on_target_genome[first_seg][3]
first_seg_stranded=first_strand+first_seg
last_strand=segments_on_target_genome[last_seg][3]
last_seg_stranded=last_strand+last_seg
first_seg_index=path_binary_search(paths_index[target_genome_name],first_seg_stranded)
last_seg_index=path_binary_search(paths_index[target_genome_name],last_seg_stranded)
first_index=min(first_seg_index,last_seg_index)
last_index=max(first_seg_index,last_seg_index)
feature_path_target_genome=paths[target_genome_name][first_index:last_index+1]
return feature_path_target_genome
def get_feature_path(paths,first_seg,last_seg,target_genome_name):
# find the path in target genome
first_strand=convert_strand(segments_on_target_genome[first_seg][3])
first_strand=segments_on_target_genome[first_seg][3]
first_seg_stranded=first_strand+first_seg
last_strand=convert_strand(segments_on_target_genome[last_seg][3])
last_strand=segments_on_target_genome[last_seg][3]
last_seg_stranded=last_strand+last_seg
index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
first_index=min(index_first_seg,index_last_seg)
last_index=max(index_last_seg,index_first_seg)
list_segfeat_target_genome=paths[target_genome_name][first_index:last_index+1]
list_segfeat_target_genome_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_target_genome]
return list_segfeat_target_genome_corrected
return list_segfeat_target_genome
def convert_strand(strand):
match strand:
......@@ -379,7 +454,7 @@ class Variation:
#def __str__(self):
# return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
def create_var(feature_id,first_seg,last_seg,paths,paths_index,target_genome_name):
feature=Features[feature_id]
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id)-1
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id)
......@@ -387,7 +462,8 @@ def create_var(feature_id,first_seg,last_seg,paths,target_genome_name):
size_diff=str(size_new_genome-feature.size)
# get feature paths on the original genome and on the target genome
list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
list_segfeat_target_genome=get_feature_path_binary(paths_index,paths,first_seg,last_seg,target_genome_name)
#list_segfeat_target_genome=get_feature_path(paths,first_seg,last_seg,target_genome_name)
list_segfeat_source_genome=feature.segments_list
[list_segfeat_source_genome,list_segfeat_target_genome,inversion]=detect_feature_inversion(list_segfeat_source_genome,list_segfeat_target_genome)
......
......@@ -25,13 +25,14 @@ def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff):
# writes the gff of target genome using the gff of the graph
def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome_name,max_diff):
print("generation of the genome's gff ")
print("generation of the target genome's outputs")
# create variables and open files
[once,var,stats]=[True,True,False]
clustal=True
[once,var,stats,clustal]=[True,True,False,True]
genomes_to_index=[target_genome_name]
if var:
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
[paths,seg_seq]=get_segments_sequence_and_paths(gfa,genomes_to_index)
paths_index=index_paths(gfa,genomes_to_index)
file_out_var = open(out_var, 'w')
global output_variations
output_variations=[0,"",file_out_var]
......@@ -82,7 +83,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
# outputs the detail of variations of the feature :
if var:
print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name)
print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name)
write_line("",output_variations,True)
if stats==True:
......@@ -109,10 +110,10 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
# functions to get the detail of the variations in the features
def print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name):
def print_variations(first_seg,last_seg,feat,paths,paths_index,seg_seq,target_genome_name):
if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features
[variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,target_genome_name) # removes the strands in the segment lists
[variation,list_segfeat_source_genome,list_segfeat_target_genome]=create_var(feat,first_seg,last_seg,paths,paths_index,target_genome_name) # removes the strands in the segment lists
feature=Features[feat]
feat_start=feature.start
# loop to go through both paths with i and j
......
#!/home/nina/annotpangenome/.venv/bin/python
# created by Nina Marthe 2023 - nina.marthe@ird.fr
# licensed under MIT
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment