Skip to content
Snippets Groups Projects
Commit 9eef4242 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

moved functions into the class as methods; added a few attributes

parent d629d0a2
No related branches found
No related tags found
No related merge requests found
...@@ -11,57 +11,70 @@ def generate_variations_details(feature_id,seg_seq,match,file_out_var,segments_o ...@@ -11,57 +11,70 @@ def generate_variations_details(feature_id,seg_seq,match,file_out_var,segments_o
# do it for the gene, then for its childs. # do it for the gene, then for its childs.
if match[4]==True: # gene passed the filters if match[4]==True: # gene passed the filters
print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,file_out_var,segments_on_target_genome)
inversion=detect_feature_inversion(Features[feature_id].segments_list_source,feature_target_path) inversion=detect_feature_inversion(Features[feature_id].segments_list_source,feature_target_path)
print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,inversion,file_out_var,segments_on_target_genome)
child_variation_details(feature_id,copy_id,walk,seg_seq,feature_target_path,inversion,file_out_var,segments_on_target_genome)
def child_variation_details(feature_id,copy_id,walk,seg_seq,feature_target_path,inversion,file_out_var,segments_on_target_genome):
# print child feature var # print child feature var
childs_list=Features[feature_id].get_child_list(Features) for child_id in Features[feature_id].get_child_list(Features):
for child_id in childs_list:
child=Features[child_id] child=Features[child_id]
# look for the first and last seg of the child in its parent path # look for the first and last seg of the child in its parent path
[child_first_seg,child_last_seg]=['',''] [child_first_seg,child_last_seg]=get_child_first_last_seg(child.segments_list_source,feature_target_path,inversion)
for seg in child.segments_list_source: # get first seg in the parent path
if seg in feature_target_path: # parent path if child_first_seg!='': # else the child faeture is absent in this occurence of the parent feature
child_first_seg=seg child_target_path=find_child_target_path(child.segments_list_target,walk,copy_id)
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature
for seg in reversed(child.segments_list_source): # get last seg in the parent path
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
if inversion:
temp=child_first_seg
child_first_seg=child_last_seg
child_last_seg=temp
for match in child.segments_list_target: # look for the right occurence of the child feature
if (match[0]==walk) and (match[1]==copy_id):
child_target_path=match[2]
break
# treat the child feature: # treat the child feature:
print_variations(child_id,child_first_seg,child_last_seg,copy_id,walk,seg_seq,child_target_path,file_out_var,segments_on_target_genome) print_variations(child_id,child_first_seg,child_last_seg,copy_id,walk,seg_seq,child_target_path,inversion,file_out_var,segments_on_target_genome)
def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,file_out_var,segments_on_target_genome): def get_child_first_last_seg(child_segments_list_source,feature_target_path,inversion):
[child_first_seg,child_last_seg]=['','']
child_first_seg=get_child_first_seg(child_segments_list_source,feature_target_path)
if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature
child_last_seg=get_child_last_seg(child_segments_list_source,feature_target_path)
if inversion:
temp=child_first_seg
child_first_seg=child_last_seg
child_last_seg=temp
return [child_first_seg,child_last_seg]
def get_child_first_seg(child_segments_list_source,feature_target_path):
for seg in child_segments_list_source: # get first seg in the parent path
if seg in feature_target_path: # parent path
return seg
elif invert_seg(seg) in feature_target_path:
return invert_seg(seg)
return ''
def get_child_last_seg(child_segments_list_source,feature_target_path):
for seg in reversed(child_segments_list_source): # get last seg in the parent path
if seg in feature_target_path: # parent path
return seg
elif invert_seg(seg) in feature_target_path:
return invert_seg(seg)
return ''
def find_child_target_path(child_segments_list_target,walk,copy_id):
for match in child_segments_list_target: # look for the right occurence of the child feature
if (match[0]==walk) and (match[1]==copy_id):
return match[2]
def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,inversion,file_out_var,segments_on_target_genome):
if first_seg!='': # if the feature is not completly absent # add the else, output absent features if first_seg!='': # if the feature is not completly absent # add the else, output absent features
[variation,feature_path_source_genome,feature_path_target_genome]=create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path,segments_on_target_genome) # removes the strands in the segment lists feat_var=FeatureVariation(feature_id,file_out_var,first_seg,last_seg,walk,copy_id,feature_target_path,inversion,segments_on_target_genome)
feature_path_source_genome=feat_var.feature_path_source_genome # removes the strand on the segments list
feature_path_target_genome=feat_var.feature_path_target_genome # removes the strand on the segments list and inverts the list if necessary
feature=Features[feature_id] feature=Features[feature_id]
feat_start=feature.start
# loop to go through both paths with i and j # loop to go through both paths with i and j
[i,j,var_count]=[0,0,0] [i,j]=[0,0]
# detect and print variations ignoring the strands # detect and print variations ignoring the strands
start_feat_seg_target=feature_path_target_genome[0]
while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)): while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
...@@ -69,338 +82,308 @@ def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_ ...@@ -69,338 +82,308 @@ def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_
if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome
# if both segments are absent in the other genome, its a substitution # if both segments are absent in the other genome, its a substitution
variation.last_seg_in_target=feature_path_target_genome[j] feat_var.last_seg_in_target=feature_path_target_genome[j]
if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution if (feat_var.type=='insertion') or (feat_var.type=='deletion'): # print the current variation before treating the substitution
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line) if feat_var.type=='substitution':
reset_var(variation) feat_var.continue_var(seg_seq,i,j,0)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
else: # initiate substitution else: # initiate substitution
init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature) feat_var.init_new_var("substitution",i,j,seg_seq,feature)
i+=1;j+=1 i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion or continue substitution else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion or continue substitution
if variation.type=='deletion': # print the current variation before treating the insertion if feat_var.type=='deletion': # print the current variation before treating the insertion
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line) feat_var.last_seg_in_target=feature_path_target_genome[j]
reset_var(variation) if feat_var.type=='insertion':
var_count+=1 feat_var.continue_var(seg_seq,i,j,0)
variation.last_seg_in_target=feature_path_target_genome[j] elif feat_var.type=="substitution":
if variation.type=='insertion':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
elif variation.type=="substitution":
while feature_path_target_genome[j]!=feature_path_source_genome[i]: while feature_path_target_genome[j]!=feature_path_source_genome[i]:
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,2) feat_var.continue_var(seg_seq,i,j,2)
j+=1 j+=1
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
j-=1 j-=1
else: # intiate insertion else: # intiate insertion
init_new_var(variation,"insertion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature) feat_var.init_new_var("insertion",i,j,seg_seq,feature)
j+=1 j+=1
elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion or continue substitution elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion or continue substitution
if variation.type=='insertion': # print the current variation before treating the deletion if feat_var.type=='insertion': # print the current variation before treating the deletion
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line) if feat_var.type=='deletion':
reset_var(variation) feat_var.continue_var(seg_seq,i,j,0)
var_count+=1 elif feat_var.type=="substitution":
if variation.type=='deletion':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
elif variation.type=="substitution":
while feature_path_target_genome[j]!=feature_path_source_genome[i]: while feature_path_target_genome[j]!=feature_path_source_genome[i]:
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,1) feat_var.continue_var(seg_seq,i,j,1)
i+=1 i+=1
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
i-=1 i-=1
else: # intiate deletion else: # intiate deletion
init_new_var(variation,"deletion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature) feat_var.init_new_var("deletion",i,j,seg_seq,feature)
i+=1 i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet else : # if both segments are present in the other genome but not at the same position. weird case never found yet
# can be a substitution. check later if its not an inversion # can be a substitution. check later if its not an inversion
variation.last_seg_in_target=feature_path_target_genome[j] feat_var.last_seg_in_target=feature_path_target_genome[j]
if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution if (feat_var.type=='insertion') or (feat_var.type=='deletion'): # print the current variation before treating the substitution
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line) if feat_var.type=='substitution':
reset_var(variation) feat_var.continue_var(seg_seq,i,j,0)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
else: # initiate substitution else: # initiate substitution
init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature) feat_var.init_new_var("substitution",i,j,seg_seq,feature)
i+=1;j+=1 i+=1;j+=1
else: # segment present in both, no variation. print the running indel if there is one else: # segment present in both, no variation; print the running indel if there is one
if variation.type!='': # print the current variation if there is one if feat_var.type!='': # print the current variation if there is one
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line) feat_var.last_seg_in_target=feature_path_target_genome[j]
var_count+=1
reset_var(variation)
variation.last_seg_in_target=feature_path_target_genome[j]
i+=1;j+=1 i+=1;j+=1
if (variation.type!=''): # if there was a current variation when we reached the end, print it if (feat_var.type!=''): # if there was a current variation when we reached the end, print it
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,file_out_var,segments_on_target_genome) feat_var.print_current_var(segments_on_target_genome)
file_out_var.write(line)
var_count+=1
reset_var(variation)
if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
line=print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq) feat_var.print_last_deletion(i,feature,seg_seq)
file_out_var.write(line)
var_count+=1 if feat_var.var_count==0: # if no variation was encountered
reset_var(variation) feat_var.print_novar()
if var_count==0: # if no variation was encountered
line=print_novar(variation)
file_out_var.write(line)
# print a variation
def print_current_var(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
warning=''
if variation.type=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
return line
elif variation.type=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
return line
elif variation.type=='substitution':
warning=detect_small_inversion(variation)
[pos_old,pos_new]=get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
size_subs=f'{len(variation.ref)}/{len(variation.alt)}'
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
# print the substitutions of different size as deletion+insertion.
#if len(variation.ref) == len(variation.alt): # if the substituion is between two segment of the same size, print it
# size_subs=len(variation.ref)
# line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
#else :
# # if the segments of the substitution have a different size, print deletion then insertion at the same position.
# line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
# line+=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
return line
# check if a substitution could be an inversion inside a feature
def detect_small_inversion(variation):
[list_ref_common,list_alt_common]=[list(),list()]
list_ref_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_ref]
list_alt_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_alt]
for seg in variation.seg_ref:
if seg[1:] in list_alt_unstrand:
list_ref_common.append(seg)
for seg in variation.seg_alt:
if seg[1:] in list_ref_unstrand:
list_alt_common.append(seg)
if (len(list_ref_common)>len(list_ref_unstrand)*0.5) and (len(list_alt_common)>len(list_alt_unstrand)*0.5):
return f'\t# Suspected inversion within this substitution.'
else:
return ''
# print a deletion at the end of a feature
def print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq):
seg_del=search_segment(feature_path_source_genome[i])
pos_old=int(Segments[seg_del].get_start(feature.id))-int(feat_start)+1
del_sequence=get_sequence_list_seg(feature_path_source_genome,i,feature,seg_seq)
length=len(del_sequence)
pos_new=str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
if variation.inversion:
inversion='1'
else:
inversion='0'
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tdeletion\t{del_sequence}\t-\t{length}\t{pos_old}\t{pos_new}\n'
return line
# print a feature with no variation
def print_novar(variation):
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
return line
# get a digit for the inversion (convert bool in int)
def print_inversion(bool):
if bool==True:
return '1'
else:
return '0'
# initiate a Variation object with the information on the feature it is on
def create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path,segments_on_target_genome):
feature=Features[feature_id]
# get feature paths on the original genome and on the target genome
feature_path_target_genome=feature_target_path
feature_path_source_genome=feature.segments_list_source
inversion=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
if inversion:
feature_path_target_genome=invert_segment_list(feature_path_target_genome)
stop_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
start_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
else:
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
size_new_genome=stop_new_genome-start_new_genome+1
size_diff=str(size_new_genome-feature.size)
sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id,segments_on_target_genome)[0]
variation=Variation(feature.id,feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
return(variation,feature_path_source_genome,feature_path_target_genome)
# reset the informations of the variation, but keep the information about the feature
def reset_var(variation):
variation.type='' # make type enumerate
variation.size_var=0
variation.start_var=''
variation.start_var_index=0
variation.ref=''
variation.alt=''
# find the position of a substitution on the source and the target sequence
def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
seg_pos=search_segment(variation.start_var)
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
var_start_seg=variation.start_on_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
var_start_seg=invert_seg(var_start_seg)
end_var=get_seg_occ(var_start_seg,walk,feat,copy_id,segments_on_target_genome)[2]
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(var_start_seg,walk,feat,copy_id,segments_on_target_genome)[1]
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of an insertion on the source and the target sequence
def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
start_var_seg=variation.start_var
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
end_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[2]+len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[1]-len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of a deletion on the source and the target sequence
def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
i=variation.start_var_index
seg_pos=search_segment(variation.start_var)
if i==0:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)+Features[feat].pos_start-1
else:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)
if pos_old<0:
pos_old=0
print("error with variation position",variation.inversion,"***")
if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new=0
else:
start_var_seg=variation.last_seg_in_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[1]-1
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-start_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[2]+1
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# change the variation information, but keep the feature information (the variation is on the feature)
def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
variation.type=type
variation.start_var=feature_path_source_genome[i]
variation.start_var_index=i
if type=="substitution":
variation.start_on_target=feature_path_target_genome[j]
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="insertion":
variation.ref="-"
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
variation.seg_ref.append(feature_path_source_genome[i])
else: # else, the deletion will always start at the start of the first segment.
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
variation.alt="-"
# update the variation
def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):
if variation.type=="substitution":
if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif genome_to_continue==1: # deletion
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
elif genome_to_continue==2: # insertion
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="insertion":
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="deletion":
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
# stores information about a feature and its current variation # stores information about a feature and its current variation
class Variation: class FeatureVariation:
def __init__(self,feature_id,feature_key,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id # real id in the gff def __init__(self,feature_id,file_out_var,first_seg,last_seg,walk,copy_id,feature_target_path,inversion,segments_on_target_genome):
self.feature_key=feature_key # key to access the feature info in the dict Features feature=Features[feature_id]
self.feature_type=feature_type # get feature paths on the original genome and on the target genome
self.chr=chr feature_path_target_genome=feature_target_path
self.start_new=start_new feature_path_source_genome=feature.segments_list_source
self.stop_new=stop_new
if inversion:
feature_path_target_genome=invert_segment_list(feature_path_target_genome)
stop_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
start_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
else:
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
size_new_genome=stop_new_genome-start_new_genome+1
size_diff=str(size_new_genome-feature.size)
sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id,segments_on_target_genome)[0]
self.feature_id=feature.id # real id in the gff
self.feature_key=feature_id # key to access the feature info in the dict Features
self.feature_type=feature.type
self.chr=sequence_name
self.start_new=start_new_genome
self.stop_new=stop_new_genome
self.inversion=inversion self.inversion=inversion
self.size_diff=size_diff self.size_diff=size_diff
self.size_new=size_new self.size_new=size_new_genome
self.type='' self.type=''
self.last_seg_in_target='' self.last_seg_in_target=''
self.seg_ref=list() self.seg_ref=list()
self.seg_alt=list() self.seg_alt=list()
self.feature_path_source_genome=feature_path_source_genome
self.feature_path_target_genome=feature_path_target_genome
self.feat_start=feature.start
self.copy_id=copy_id
self.walk=walk
self.start_feat_seg_target=feature_path_target_genome[0]
self.var_count=0
self.file_out_var=file_out_var
# check if a substitution could be an inversion inside a feature
def detect_small_inversion(self):
[list_ref_common,list_alt_common]=[list(),list()]
list_ref_unstrand=[segment_stranded[1:] for segment_stranded in self.seg_ref]
list_alt_unstrand=[segment_stranded[1:] for segment_stranded in self.seg_alt]
for seg in self.seg_ref:
if seg[1:] in list_alt_unstrand:
list_ref_common.append(seg)
for seg in self.seg_alt:
if seg[1:] in list_ref_unstrand:
list_alt_common.append(seg)
if (len(list_ref_common)>len(list_ref_unstrand)*0.5) and (len(list_alt_common)>len(list_alt_unstrand)*0.5):
return f'\t# Suspected inversion within this substitution.'
else:
return ''
# print a variation
def print_current_var(self,segments_on_target_genome):
warning=''
if self.type=='insertion':
[pos_old,pos_new]=self.get_old_new_pos_insertion(segments_on_target_genome)
line=f'{self.feature_id}\t{self.feature_type}\t{self.chr}\t{self.start_new}\t{self.stop_new}\t{self.size_new}\t{FeatureVariation.print_inversion(self.inversion)}\t{self.size_diff}\tinsertion\t-\t{self.alt}\t{len(self.alt)}\t{pos_old}\t{pos_new}{warning}\n'
elif self.type=='deletion':
[pos_old,pos_new]=self.get_old_new_pos_deletion(segments_on_target_genome)
line=f'{self.feature_id}\t{self.feature_type}\t{self.chr}\t{self.start_new}\t{self.stop_new}\t{self.size_new}\t{FeatureVariation.print_inversion(self.inversion)}\t{self.size_diff}\tdeletion\t{self.ref}\t-\t{len(self.ref)}\t{pos_old}\t{pos_new}{warning}\n'
elif self.type=='substitution':
warning=self.detect_small_inversion()
[pos_old,pos_new]=self.get_old_new_pos_substitution(segments_on_target_genome)
size_subs=f'{len(self.ref)}/{len(self.alt)}'
line=f'{self.feature_id}\t{self.feature_type}\t{self.chr}\t{self.start_new}\t{self.stop_new}\t{self.size_new}\t{FeatureVariation.print_inversion(self.inversion)}\t{self.size_diff}\tsubstitution\t{self.ref}\t{self.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
self.file_out_var.write(line)
self.var_count+=1
self.reset_var()
# print a feature with no variation
def print_novar(self):
line=f'{self.feature_id}\t{self.feature_type}\t{self.chr}\t{self.start_new}\t{self.stop_new}\t{self.size_new}\t{FeatureVariation.print_inversion(self.inversion)}\t{self.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
self.file_out_var.write(line)
# print a deletion at the end of a feature
def print_last_deletion(self,i,feature,seg_seq):
seg_del=search_segment(self.feature_path_source_genome[i])
pos_old=int(Segments[seg_del].get_start(feature.id))-self.feat_start+1
pos_new=str(self.size_new+1) # the deletion is at the end of the feature on the new genome
del_sequence=FeatureVariation.get_sequence_list_seg(self.feature_path_source_genome,i,feature.pos_stop,seg_seq)
line=f'{self.feature_id}\t{self.feature_type}\t{self.chr}\t{self.start_new}\t{self.stop_new}\t{self.size_new}\t{FeatureVariation.print_inversion(self.inversion)}\t{self.size_diff}\tdeletion\t{del_sequence}\t-\t{len(del_sequence)}\t{pos_old}\t{pos_new}\n'
self.file_out_var.write(line)
self.var_count+=1
self.reset_var()
# reset the informations of the variation, but keep the information about the feature
def reset_var(self):
self.type='' # todo : make type enumerate
self.size_var=0
self.start_var=''
self.start_var_index=0
self.ref=''
self.alt=''
# change the variation information, but keep the feature information (the variation is on the feature)
def init_new_var(self,type,i,j,seg_seq,feature):
feature_path_source_genome=self.feature_path_source_genome
feature_path_target_genome=self.feature_path_target_genome
self.type=type
self.start_var=feature_path_source_genome[i]
self.start_var_index=i
if type=="substitution":
self.start_on_target=feature_path_target_genome[j]
self.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
self.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
self.seg_ref.append(feature_path_source_genome[i])
self.seg_alt.append(feature_path_target_genome[j])
elif type=="insertion":
self.ref="-"
self.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
self.seg_alt.append(feature_path_target_genome[j])
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
self.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
self.seg_ref.append(feature_path_source_genome[i])
else: # else, the deletion will always start at the start of the first segment.
self.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
self.seg_ref.append(feature_path_source_genome[i])
self.alt="-"
# update the variation
def continue_var(self,seg_seq,i,j,genome_to_continue):
feature_path_source_genome=self.feature_path_source_genome
feature_path_target_genome=self.feature_path_target_genome
if self.type=="substitution":
if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
self.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
self.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
self.seg_ref.append(feature_path_source_genome[i])
self.seg_alt.append(feature_path_target_genome[j])
elif genome_to_continue==1: # deletion
self.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
self.seg_ref.append(feature_path_source_genome[i])
elif genome_to_continue==2: # insertion
self.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
self.seg_alt.append(feature_path_target_genome[j])
elif self.type=="insertion":
self.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
self.seg_alt.append(feature_path_target_genome[j])
elif self.type=="deletion":
self.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
self.seg_ref.append(feature_path_source_genome[i])
# find the position of a substitution on the source and the target sequence
def get_old_new_pos_substitution(self,segments_on_target_genome):
feat=self.feature_key
seg_pos=search_segment(self.start_var)
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-self.feat_start)
var_start_seg=self.start_on_target
if self.inversion:
var_start_seg=invert_seg(var_start_seg)
end_var=get_seg_occ(var_start_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[2]
start_feat=get_feature_start_on_target_genome_inv(invert_seg(self.start_feat_seg_target),feat,self.walk,self.copy_id,segments_on_target_genome)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(var_start_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[1]
start_feat=get_feature_start_on_target_genome(self.start_feat_seg_target,feat,self.walk,self.copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of an insertion on the source and the target sequence
def get_old_new_pos_insertion(self,segments_on_target_genome):
feat=self.feature_key
seg_pos=search_segment(self.start_var) # start_var is the segment AFTER the insertion
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-self.feat_start)
# outputs the nucleotide sequence of a list of segments, corresponding to the end of a feature start_var_seg=self.start_var
def get_sequence_list_seg(list_seg,i,feature,seg_seq): if self.inversion:
del_sequence="" start_var_seg=invert_seg(start_var_seg)
for k in range(i,len(list_seg)): end_var=get_seg_occ(start_var_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[2]+len(self.alt) # start_var_seg is the segment AFTER the insertion
if k==len(list_seg)-1: start_feat=get_feature_start_on_target_genome_inv(invert_seg(self.start_feat_seg_target),feat,self.walk,self.copy_id,segments_on_target_genome)
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feature.pos_stop] pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(start_var_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[1]-len(self.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome(self.start_feat_seg_target,feat,self.walk,self.copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of a deletion on the source and the target sequence
def get_old_new_pos_deletion(self,segments_on_target_genome):
feat=self.feature_key
seg_pos=search_segment(self.start_var)
if self.start_var_index==0:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-self.feat_start+Features[feat].pos_start-1
else:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-self.feat_start
if pos_old<0:
pos_old=0
print("error with variation position",self.inversion,"***")
if self.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new=0
else:
start_var_seg=self.last_seg_in_target
if self.inversion:
start_var_seg=invert_seg(start_var_seg)
start_var=get_seg_occ(start_var_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[1]-1
start_feat=get_feature_start_on_target_genome_inv(invert_seg(self.start_feat_seg_target),feat,self.walk,self.copy_id,segments_on_target_genome)
pos_new=str(start_feat-start_var)
else:
start_var=get_seg_occ(start_var_seg,self.walk,feat,self.copy_id,segments_on_target_genome)[2]+1
start_feat=get_feature_start_on_target_genome(self.start_feat_seg_target,feat,self.walk,self.copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# get a digit for the inversion (convert bool in int)
@staticmethod
def print_inversion(bool):
if bool==True:
return '1'
else: else:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k]) return '0'
return del_sequence
# outputs the nucleotide sequence of a list of segments, corresponding to the end of a feature
@staticmethod
def get_sequence_list_seg(list_seg,i,feat_pos_stop,seg_seq):
del_sequence=""
for k in range(i,len(list_seg)):
if k==len(list_seg)-1:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feat_pos_stop]
else:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])
return del_sequence
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment