### output functions : target genome gff, variation details and alignments
from Graph_gff import Segments, Features, write_line
from Functions import *
### functions to generate a genome's gff from the graph's gff
# functions to get the gff with one line per feature
# outputs the feature once in the gff, from the first to the last segment present on the new genome (if the size is ok) :
def generate_target_gff(first_seg,last_seg,feature_id,max_diff,inversion,walk,seg_size,args):
    """Write the gff line of one feature on the target genome, if it is transferable.

    The feature is output once, from its first to its last segment present on
    the target genome.

    Args:
        first_seg, last_seg: ends of the feature path on the target genome
            ('' when the feature is absent from the target genome).
        feature_id: key of the feature in Features.
        max_diff: size tolerance, forwarded to right_size().
        inversion: when true, the inverted start/stop helpers are used to
            compute the size on the target genome.
        walk, seg_size, args: forwarded to the position/line helpers.

    Returns:
        "absent" if first_seg is '', "bad_size" if right_size() rejects the
        size on the target genome, otherwise the absolute size difference
        between the feature on the target genome and Features[feature_id].size.
    """
    if first_seg == '':  # feature absent from the target genome
        return "absent"

    if inversion:
        size_on_new_genome = get_feature_start_on_target_genome_inv(last_seg,feature_id,walk)-get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk)+1
    else:
        size_on_new_genome = get_feature_stop_on_target_genome(last_seg,feature_id,walk)-get_feature_start_on_target_genome(first_seg,feature_id,walk)+1
    size_diff = abs(size_on_new_genome-Features[feature_id].size)

    if not right_size(size_on_new_genome,max_diff,feature_id):
        return "bad_size"

    line_gff = create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,seg_size,args)
    # output_target_gff is the module-level buffer set up by transfer_on_target
    write_line(line_gff,output_target_gff,False)
    return size_diff

# TODO : add "coverage or identity too low" to the reasons for not transferring a feature !!
# writes the gff of the target genome using the gff of the graph
def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args):
    """Generate the requested outputs for one target genome.

    Depending on args.annotation, args.variation and args.alignment, writes the
    target genome gff (out_target_gff), the variation details (out_var) and the
    source/target alignments (out_aln) for every feature in Features.

    Args:
        segments_file: passed to get_segments_sequence() to load segment sequences.
        out_target_gff, out_var, out_aln: output file paths, opened for writing.
        target_genome: name used in the progress messages.
        target_genome_paths: passed to add_target_genome_path() for each feature.
        list_feat_absent: list extended in place with the ids of the features
            of type 'gene' that are absent or have a bad size on the target.
        seg_size, args: forwarded to generate_target_gff(); args also carries
            the output switches and max_difference.
    """
    # module-level write buffers, also read by the print_*/generate_* functions
    global output_target_gff, output_variations, output_target_aln

    print(f'generation of {target_genome} output')

    stats=False  # the statistics block below is disabled
    list_feature_to_transfer= Features.keys()
    segments_list={}

    for feat in list_feature_to_transfer:
        add_target_genome_path(feat,target_genome_paths)

    if args.annotation:
        print(f' generation of {target_genome} gff')
        bad_size_features=0
        absent_features=0
        diff_size_transfered_features=[0,0] # [count,sum], to get the average

        with open(out_target_gff,'w') as file_out:
            output_target_gff=[0,"",file_out]
            for feat in list_feature_to_transfer:
                # for each feature, get the first and last segment of the feature on the new genome
                feature_target_path=Features[feat].segments_list_target
                [first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
                inversion=detect_feature_inversion(Features[feat].segments_list_source,feature_target_path)

                transfer_stat=generate_target_gff(first_seg,last_seg,feat,args.max_difference,inversion,walk,seg_size,args) # insertions not considered !!!

                if transfer_stat=="bad_size":
                    bad_size_features+=1
                    if Features[feat].type=='gene':
                        list_feat_absent.append(feat)
                elif transfer_stat=="absent":
                    absent_features+=1
                    if Features[feat].type=='gene':
                        list_feat_absent.append(feat)
                else: # transferred : transfer_stat is the size difference
                    diff_size_transfered_features[0]+=1
                    diff_size_transfered_features[1]+=transfer_stat

            # final call with the last argument set to True, before the file is closed
            write_line("",output_target_gff,True)

    if args.variation or args.alignment : # build the dict of segments for which we may need the sequence
        for feat in list_feature_to_transfer:
            for segment in Features[feat].segments_list_source:
                segments_list[segment[1:]]='' # segment[1:] drops the first character of the segment name
            for segment in Features[feat].segments_list_target:
                segments_list[segment[1:]]=''

    if args.variation:
        print(f' generation of {target_genome} genes variation details')
        seg_seq=get_segments_sequence(segments_file,segments_list)

        with open(out_var, 'w') as file_out_var:
            output_variations=[0,"",file_out_var]
            for feat in list_feature_to_transfer:
                # for each feature, get the first and last segment of the feature on the new genome
                feature_target_path=Features[feat].segments_list_target
                [first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
                print_variations(first_seg,last_seg,feat,seg_seq,walk)
            write_line("",output_variations,True)

    if args.alignment:
        print(f' generation of {args.source_genome} genes alignment with {target_genome} genes')
        if not args.variation: # otherwise the sequences were already loaded above
            seg_seq=get_segments_sequence(segments_file,segments_list)

        with open(out_aln,'w') as file_aln:
            output_target_aln=[0,"",file_aln]
            write_line("Sequence alignment generated from feature path comparison in pangenome graph. Made with GrAnnoT v1.\n\n",output_target_aln,False)
            for feat in list_feature_to_transfer:
                feature_target_path=Features[feat].segments_list_target
                [first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
                print_alignment(first_seg,feat,seg_seq)
            write_line("",output_target_aln,True)

    if stats:
        # create objects for stats on how many segments are absent in target genome, their average length, etc
        feature_missing_segments=[[],[],[],[],[],[],[]] # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
        # the first segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing - feature_missing_all
        # at least one segment is missing (first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok
        # total number of features, with missing segments or not - feature_total
        for feat in list_feature_to_transfer:
            # the per-feature data has to be read inside the loop, for the current feature
            list_seg=Features[feat].segments_list_source
            feature_target_path=Features[feat].segments_list_target
            [first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
            stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat,walk)

        if args.annotation:
            print(len(Features)-(bad_size_features+absent_features),"out of",len(Features),"features are transfered.")
            print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
            print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
            [transfered_count,transfered_diff_sum]=diff_size_transfered_features
            # avoid a ZeroDivisionError when no feature was transferred
            average_diff=transfered_diff_sum/transfered_count if transfered_count else 0
            print("average length difference of the transfered genes : ",average_diff)
        stats_features(feature_missing_segments)

    segments_on_target_genome.clear() # empty dict

# gets the first and last segments of a feature path, and the walk found for them
def get_featurePath_ends(seg_list):
    """Return [first_seg, last_seg, walk] for a feature path.

    Args:
        seg_list: list of the segments of the feature on the target genome.

    Returns:
        The first and last elements of seg_list and the result of
        find_common_walk() on them, or ['', '', ''] when seg_list is empty
        (feature absent from the target genome).
    """
    if len(seg_list) == 0:
        return ['', '', '']

    first_seg, last_seg = seg_list[0], seg_list[-1]
    walk = find_common_walk(first_seg,last_seg)
    return [first_seg, last_seg, walk]

# function to get the alignment between the features on the source genome and the features on the target genome
def print_alignment(first_seg,feat,seg_seq):
    """Write the alignment of one feature between the source and target genomes.

    Nothing is written when first_seg is '' (feature completely absent from
    the target genome).

    Args:
        first_seg: first segment of the feature on the target genome, or ''.
        feat: key of the feature in Features.
        seg_seq: segment sequences, forwarded to create_line_aln().
    """
    if first_seg == '':  # feature completely absent
        return

    feature = Features[feat]
    feature_path_source_genome = feature.segments_list_source
    feature_path_target_genome = feature.segments_list_target

    # only invert the target path when an inversion is detected; the flag was
    # previously computed and then ignored, so the path was always inverted
    if detect_feature_inversion(feature_path_source_genome,feature_path_target_genome):
        feature_path_target_genome = invert_segment_list(feature_path_target_genome)

    line_aln = create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feat)
    # output_target_aln is the module-level buffer set up by transfer_on_target
    write_line(line_aln,output_target_aln,False)
# functions to get the details of the variations in the features
def print_variations(first_seg,last_seg,feat,seg_seq,walk):
    """Detect and write the variations of one feature between source and target paths.

    The source and target segment paths returned by create_var() are walked in
    parallel with two indices (i on the source, j on the target). Each
    difference is classified as an insertion, a deletion or a substitution;
    consecutive differences of the same kind extend the running variation,
    which is written with print_current_var() when it ends. A trailing part of
    the source path not reached at the end is written with
    print_last_deletion(); if no variation was found, print_novar() is called.

    Nothing is written when first_seg is '' (feature completely absent).
    TODO : output the absent features too.

    Args:
        first_seg, last_seg: ends of the feature path on the target genome.
        feat: key of the feature in Features.
        seg_seq: segment sequences, forwarded to the variation helpers.
        walk: forwarded to create_var() and print_current_var().
    """
    if first_seg=='': # the feature is completely absent
        return

    [variation,feature_path_source_genome,feature_path_target_genome]=create_var(feat,first_seg,last_seg,walk) # removes the strands in the segment lists
    feature=Features[feat]
    feat_start=feature.start

    # loop to go through both paths with i and j
    [i,j,var_count]=[0,0,0]

    # detect and print variations ignoring the strands
    start_feat_seg_target=feature_path_target_genome[0]

    while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
        if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
            if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
                if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent in target genome
                    # if both segments are absent in the other genome, it's a substitution
                    variation.last_seg_in_target=feature_path_target_genome[j]
                    if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution
                        print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                        reset_var(variation)
                        var_count+=1
                    if variation.type=='substitution':
                        continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
                    else: # initiate substitution (must not overwrite a substitution that was just continued)
                        init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
                    i+=1;j+=1
                else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion or continue substitution
                    if variation.type=='deletion': # print the current variation before treating the insertion
                        print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                        reset_var(variation)
                        var_count+=1
                    variation.last_seg_in_target=feature_path_target_genome[j]
                    if variation.type=='insertion':
                        continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
                    elif variation.type=="substitution":
                        # extend the substitution with the target segments up to the one matching the source segment
                        while feature_path_target_genome[j]!=feature_path_source_genome[i]:
                            continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,2)
                            j+=1
                        print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                        reset_var(variation)
                        var_count+=1
                        j-=1 # compensates the j+=1 below : j stays on the matching segment
                    else: # initiate insertion
                        init_new_var(variation,"insertion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
                    j+=1
            elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion or continue substitution
                if variation.type=='insertion': # print the current variation before treating the deletion
                    print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                    reset_var(variation)
                    var_count+=1
                if variation.type=='deletion':
                    continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
                elif variation.type=="substitution":
                    # extend the substitution with the source segments up to the one matching the target segment
                    while feature_path_target_genome[j]!=feature_path_source_genome[i]:
                        continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,1)
                        i+=1
                    print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                    reset_var(variation)
                    var_count+=1
                    i-=1 # compensates the i+=1 below : i stays on the matching segment
                else: # initiate deletion
                    init_new_var(variation,"deletion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
                i+=1
            else : # if both segments are present in the other genome but not at the same position. weird case never found yet
                # can be a substitution. check later if it's not an inversion
                variation.last_seg_in_target=feature_path_target_genome[j]
                if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution
                    print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                    reset_var(variation)
                    var_count+=1
                if variation.type=='substitution':
                    continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
                else: # initiate substitution
                    init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
                i+=1;j+=1
        else: # segment present in both, no variation. print the running variation if there is one
            if variation.type!='': # print the current variation if there is one
                print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
                var_count+=1
                reset_var(variation)
            variation.last_seg_in_target=feature_path_target_genome[j]
            i+=1;j+=1

    if (variation.type!=''): # if there was a current variation when we reached the end, print it
        print_current_var(variation,feat_start,start_feat_seg_target,feat,walk)
        var_count+=1
        reset_var(variation)

    if i<=len(feature_path_source_genome)-1: # if we didn't reach the end of the source path, the end of the feature is missing on the target genome
        print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq)
        var_count+=1
        reset_var(variation)

    if var_count==0: # if no variation was encountered
        print_novar(variation)
def print_current_var(variation,feat_start,start_feat_seg_target,feat,walk):
    """Write the line describing the running variation to the variations output.

    Handles variation.type 'insertion', 'deletion' and 'substitution'; any
    other type writes nothing. The line is tab-separated: feature id, feature
    type, chr, start/stop/size on the new genome, inversion flag, size
    difference, variation type, ref, alt, variation size, position on the
    source feature, position on the target feature, then an optional warning.

    Args:
        variation: the running variation object.
        feat_start, start_feat_seg_target, feat, walk: forwarded to the
            get_old_new_pos_* helper matching the variation type.
    """
    warning=''
    if variation.type=='insertion':
        [pos_old,pos_new]=get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk)
        [ref,alt,size_var]=['-',variation.alt,len(variation.alt)]
    elif variation.type=='deletion':
        [pos_old,pos_new]=get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk)
        [ref,alt,size_var]=[variation.ref,'-',len(variation.ref)]
    elif variation.type=='substitution':
        warning=detect_small_inversion(variation)
        [pos_old,pos_new]=get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk)
        # a substitution is always written as one line, with the size given as ref_size/alt_size
        [ref,alt,size_var]=[variation.ref,variation.alt,f'{len(variation.ref)}/{len(variation.alt)}']
    else: # no running variation of a known type : nothing to write
        return

    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\t{variation.type}\t{ref}\t{alt}\t{size_var}\t{pos_old}\t{pos_new}{warning}\n'
    # output_variations is the module-level buffer set up by transfer_on_target
    write_line(line,output_variations,False)
def detect_small_inversion(variation):
    """Return a warning suffix when a substitution looks like a small inversion.

    Segment names in variation.seg_ref and variation.seg_alt are compared
    without their first character (seg[1:]). If more than half of the
    reference segments are also in the alternative list and more than half of
    the alternative segments are also in the reference list, the warning is
    returned.

    Returns:
        '\\t# Suspected inversion within this substitution.' or ''.
    """
    # sets give constant-time membership tests instead of scanning a list each time
    ref_unstranded = {segment_stranded[1:] for segment_stranded in variation.seg_ref}
    alt_unstranded = {segment_stranded[1:] for segment_stranded in variation.seg_alt}

    # counts keep duplicates, as the sizes they are compared to do
    ref_common_count = sum(1 for seg in variation.seg_ref if seg[1:] in alt_unstranded)
    alt_common_count = sum(1 for seg in variation.seg_alt if seg[1:] in ref_unstranded)

    if (ref_common_count > len(variation.seg_ref)*0.5) and (alt_common_count > len(variation.seg_alt)*0.5):
        return '\t# Suspected inversion within this substitution.'
    return ''
def print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq):
    """Write the deletion of the end of a feature, missing on the target genome.

    Used when the end of the source path (from index i) was not reached while
    comparing the paths. The position on the target feature is set just after
    its end (variation.size_new + 1).

    Args:
        variation: variation object carrying the feature description fields.
        feature_path_source_genome: segment path of the feature on the source genome.
        i: index in the source path of the first segment not reached.
        feat_start: start of the feature on the source genome.
        feature, seg_seq: forwarded to get_sequence_list_seg().
    """
    seg_del = search_segment(feature_path_source_genome[i])
    pos_old = int(Segments[seg_del].start)-int(feat_start)+1
    del_sequence = get_sequence_list_seg(feature_path_source_genome,i,feature,seg_seq)
    length = len(del_sequence)
    pos_new = str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
    inversion = '1' if variation.inversion else '0'

    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tdeletion\t{del_sequence}\t-\t{length}\t{pos_old}\t{pos_new}\n'
    # output_variations is the module-level buffer set up by transfer_on_target
    write_line(line,output_variations,False)
def print_novar(variation):
    """Write the 'no_var' line of a feature in which no variation was found.

    The variation-specific columns (ref, alt, size, positions) are filled with '-'.
    """
    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
    # output_variations is the module-level buffer set up by transfer_on_target
    write_line(line,output_variations,False)
def print_inversion(bool):
    """Return the inversion flag as text : '1' if the argument equals True, else '0'.

    The parameter name shadows the builtin but is kept for the callers.
    The comparison is an equality with True, not a truthiness test.
    """
    return '1' if bool == True else '0'
# not used.
def get_list_segments_missing(list_seg,segments_on_target_genome):
    """Return the Segments entries of the segments absent from the target genome.

    Args:
        list_seg: segment ids to check.
        segments_on_target_genome: container of the segment ids present on the
            target genome (tested with `in`).

    Returns:
        List of Segments[segment] for each segment of list_seg that is not in
        segments_on_target_genome, in the order of list_seg.
    """
    return [Segments[segment] for segment in list_seg if segment not in segments_on_target_genome]
# takes a feature and a feature type, returns a list of child features that have the wanted type.
def get_child_list(feature,child_type):
    """Return the children of a feature that have the wanted type.

    Args:
        feature: feature object with a `childs` list of feature ids.
        child_type: wanted feature type; "" means no filtering.

    Returns:
        feature.childs itself when child_type is "", otherwise a new list of
        the child ids whose Features entry has type child_type.
    """
    # the test has to be on child_type; testing the builtin `type` was always False
    if child_type == "":
        return feature.childs
    return [child for child in feature.childs if Features[child].type == child_type]