Newer
Older

nina.marthe_ird.fr
committed
from Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment,invert_seg,search_segment
global segments_on_target_genome
segments_on_target_genome={}
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome

nina.marthe_ird.fr
committed
def get_feature_start_on_target_genome(start_seg,feat_id,walk):
seg_start_pos=segments_on_target_genome[start_seg][walk][1]
feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)
return seg_start_pos+feat_start_pos-1
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome

nina.marthe_ird.fr
committed
def get_feature_stop_on_target_genome(stop_seg,feat_id,walk):
seg_start_pos=segments_on_target_genome[stop_seg][walk][1]
feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
return seg_start_pos+feat_stop_pos-1

nina.marthe_ird.fr
committed
# get the start position of the features on the linear genome for inverted features

nina.marthe_ird.fr
committed
def get_feature_start_on_target_genome_inv(start_seg,feat_id,walk):
seg_end_pos=segments_on_target_genome[start_seg][walk][2]

nina.marthe_ird.fr
committed
feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
return seg_end_pos-feat_start_pos+1
# get the stop position of the features on the linear genome for inverted features

nina.marthe_ird.fr
committed
def get_feature_stop_on_target_genome_inv(stop_seg,feat_id,walk):
seg_end_pos=segments_on_target_genome[stop_seg][walk][2]

nina.marthe_ird.fr
committed
feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
return seg_end_pos-feat_stop_pos+1
# functions to get the gff with one line per feature
def right_size(size,max_diff,feat):
if max_diff==0:
return True
return not ((size>Features[feat].size*max_diff) or (size<Features[feat].size/max_diff))
def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id):

nina.marthe_ird.fr
committed
[chr,strand,feature]=[segments_on_target_genome[first_seg][walk][0],Features[feature_id].strand,Features[feature_id]]

nina.marthe_ird.fr
committed
annotation=f'{feature.annot};Size_diff={size_diff};coverage={cov};sequence_ID={id}' # Nb_variants={var_count};

nina.marthe_ird.fr
committed
start=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk)
stop=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk)

nina.marthe_ird.fr
committed
strand=invert_strand(strand)
else:

nina.marthe_ird.fr
committed
start=get_feature_start_on_target_genome(first_seg,feature_id,walk)
stop=get_feature_stop_on_target_genome(last_seg,feature_id,walk)

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
if start>stop:
temp=start
start=stop
stop=temp

nina.marthe_ird.fr
committed
output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'

nina.marthe_ird.fr
committed
# functions to get the alignment for the transfered genes

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
# create alignment for a feature

nina.marthe_ird.fr
committed
def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):

nina.marthe_ird.fr
committed
match type:
case "identity":

nina.marthe_ird.fr
committed
if first:
feature=Features[feature_id]

nina.marthe_ird.fr
committed
seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]

nina.marthe_ird.fr
committed
elif last:
feature=Features[feature_id]

nina.marthe_ird.fr
committed
seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
seq_aln=get_segment_sequence(seg_seq,seg_a)

nina.marthe_ird.fr
committed
line_a=seq_aln
line_b=seq_aln
len_aln=len(seq_aln)
line_c=len_aln*"*"
case "substitution":

nina.marthe_ird.fr
committed
seq_aln_a=get_segment_sequence(seg_seq,seg_a)
seq_aln_b=get_segment_sequence(seg_seq,seg_b)

nina.marthe_ird.fr
committed
len_a=len(seq_aln_a)
len_b=len(seq_aln_b)
if len_a>len_b:
diff_len=len_a-len_b
line_a=seq_aln_a
line_b=seq_aln_b+diff_len*"-"
line_c=len_a*" "
else:
diff_len=len_b-len_a
line_a=seq_aln_a+diff_len*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "insertion":

nina.marthe_ird.fr
committed
seq_aln_b=get_segment_sequence(seg_seq,seg_b)

nina.marthe_ird.fr
committed
len_b=len(seq_aln_b)
line_a=len_b*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "deletion":

nina.marthe_ird.fr
committed
if first:
feature=Features[feature_id]

nina.marthe_ird.fr
committed
seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
seq_aln_a=get_segment_sequence(seg_seq,seg_a)

nina.marthe_ird.fr
committed
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "
case "end_deletion":
seq_aln_a=""

nina.marthe_ird.fr
committed
for segment in seg_a[:-1]:

nina.marthe_ird.fr
committed
seq_aln_a+=get_segment_sequence(seg_seq,segment)

nina.marthe_ird.fr
committed
feature=Features[feature_id]

nina.marthe_ird.fr
committed
seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on

nina.marthe_ird.fr
committed
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "

nina.marthe_ird.fr
committed
return [line_a,line_b,line_c,False] # check the orientation of the segment later

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def parse_aln_lines(line_a,line_b,line_c,feature_id):
if (len(line_a)!=len(line_b)) or (len(line_b)!=len(line_c)):

nina.marthe_ird.fr
committed
print("line lengths differ in alignment")

nina.marthe_ird.fr
committed
len_to_parse=len(line_a)
len_parsed=0

nina.marthe_ird.fr
committed
aln_line=""

nina.marthe_ird.fr
committed
nb_res_a=0
nb_res_b=0
while len_parsed<len_to_parse:
len_header=len(feature_id)+11
headers=[feature_id+"_source ",feature_id+"_target ",len_header*" "]
add_a=line_a[len_parsed:len_parsed+60]
add_b=line_b[len_parsed:len_parsed+60]
add_c=line_c[len_parsed:len_parsed+60]
nb_res_a+=len(add_a)-add_a.count("-")
nb_res_b+=len(add_b)-add_b.count("-")

nina.marthe_ird.fr
committed
aln_line+=f'{headers[0]}{add_a} {nb_res_a}\n'
aln_line+=f'{headers[1]}{add_b} {nb_res_b}\n'
aln_line+=f'{headers[2]}{add_c}\n\n'

nina.marthe_ird.fr
committed
len_parsed+=60

nina.marthe_ird.fr
committed
aln_line+="\n"

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
return aln_line

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):

nina.marthe_ird.fr
committed
line_a=""
line_b=""
line_c=""
[i,j]=[0,0]

nina.marthe_ird.fr
committed
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature

nina.marthe_ird.fr
committed
while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):

nina.marthe_ird.fr
committed
if i==len(feature_path_source_genome)-1:
last=True
if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome

nina.marthe_ird.fr
committed
if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome : substitution
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1

nina.marthe_ird.fr
committed
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
[add_a,add_b,add_c,first]=segment_aln("insertion",seg_seq,"",feature_path_target_genome[j],first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c
j+=1

nina.marthe_ird.fr
committed
elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
[add_a,add_b,add_c,first]=segment_aln("deletion",seg_seq,feature_path_source_genome[i],"",first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet

nina.marthe_ird.fr
committed
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1

nina.marthe_ird.fr
committed
else: # segment present in both, no variation.
[add_a,add_b,add_c,first]=segment_aln("identity",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome

nina.marthe_ird.fr
committed
[add_a,add_b,add_c,first]=segment_aln("end_deletion",seg_seq,feature_path_source_genome[i:],"",first,feature_id,last)

nina.marthe_ird.fr
committed
line_a+=add_a;line_b+=add_b;line_c+=add_c

nina.marthe_ird.fr
committed
return parse_aln_lines(line_a,line_b,line_c,feature_id)

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
feature_missing_segments[5].append(feature_id)
if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely

nina.marthe_ird.fr
committed
elif first_seg != list_seg[0]: # the first segment is missing

nina.marthe_ird.fr
committed
elif last_seg!=list_seg[-1]: # the last segment is missing
feature_missing_segments[2].append(feature_id)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element

nina.marthe_ird.fr
committed
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
feature_missing_segments[1].append(feature_id)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:

nina.marthe_ird.fr
committed
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[4].append(feature_id)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !!
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[6].append(feature_id)
def get_annot_features(list_features):
list_annot_features=[]
for feature in list_features:
list_annot_features.append(Features[feature].note)
return list_annot_features
def count_hypput_total(list_annot_first):
total=len(list_annot_first)
count_hypput=0
for annot in list_annot_first:
if ("hypothetical" in annot) or ("putative" in annot):
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
count_hypput+=1
return [count_hypput,total]
# print stats on the transfer : number of feature that have segments in different positions missing.
def stats_features(feature_missing_segments):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
list_annot_first=get_annot_features(feature_missing_segments[0])
[hyp_put,total]=count_hypput_total(list_annot_first)
print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_middle=get_annot_features(feature_missing_segments[1])
[hyp_put,total]=count_hypput_total(list_annot_middle)
print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_last=get_annot_features(feature_missing_segments[2])
[hyp_put,total]=count_hypput_total(list_annot_last)
print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_all=get_annot_features(feature_missing_segments[3])
[hyp_put,total]=count_hypput_total(list_annot_all)
print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_total=get_annot_features(feature_missing_segments[4])
[hyp_put,total]=count_hypput_total(list_annot_total)
print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_ok=get_annot_features(feature_missing_segments[6])
[hyp_put,total]=count_hypput_total(list_annot_ok)
print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_features=get_annot_features(feature_missing_segments[5])
[hyp_put,total]=count_hypput_total(list_annot_features)
print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
# functions to generate the different gffs

nina.marthe_ird.fr
committed
def get_segments_positions_on_genome(pos_seg): # add to the dict the info about the segments.
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()

nina.marthe_ird.fr
committed
seg_count=0
file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename

nina.marthe_ird.fr
committed
[seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system

nina.marthe_ird.fr
committed
# check if segment present twice on the same walk ???
#segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
if seg not in segments_on_target_genome:
segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
segments_on_target_genome[seg][file_name]=[chrom,start,stop,strand,index]

nina.marthe_ird.fr
committed
seg_count+=1
# look for the segment on either strand of the target genome

nina.marthe_ird.fr
committed
def search_seg_on_target_genome(segment):
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:
#if inverted_segment in segments_on_target_genome:
# print(segment," found in both orientations")
return segment
elif inverted_segment in segments_on_target_genome:
#print("inverted seg found *****")
return inverted_segment
else:
return False

nina.marthe_ird.fr
committed
def get_segments_sequence(segments_file,segments_list):
file_segments=open(segments_file,'r')
lines_segments=file_segments.readlines()
file_segments.close()

nina.marthe_ird.fr
committed
for line in lines_segments:

nina.marthe_ird.fr
committed
seg_id='s'+line[1]
if seg_id in segments_list:
seg_seq[seg_id]=line[2]

nina.marthe_ird.fr
committed
return seg_seq

nina.marthe_ird.fr
committed
def get_paths(walks_file,target_genome):
file_walks=open(walks_file,'r')
lines_walks=file_walks.readlines()
file_walks.close()

nina.marthe_ird.fr
committed
for line in lines_walks:

nina.marthe_ird.fr
committed
line=line.split()
seq_name=line[1]+"_"+line[3]
if target_genome in seq_name: # get the walk of the genome

nina.marthe_ird.fr
committed
list_segments=[]
for segment in path:
if segment[0:1]=='>':
list_segments.append('>s'+segment[1:])
elif segment[0:1]=='<':
list_segments.append('<s'+segment[1:])
paths[seq_name]=list_segments

nina.marthe_ird.fr
committed
def get_first_last_seg(list_seg): # get the first segment of the list that is in the target genome
[first_seg_found,last_seg_found,walk_found]=['','','']
for segment in list_seg:

nina.marthe_ird.fr
committed
seg_found=search_seg_on_target_genome(segment)
if seg_found:
first_seg_found=seg_found

nina.marthe_ird.fr
committed
if first_seg_found!='':
for segment in reversed(list_seg):
last_seg_found=search_seg_on_target_genome(segment)
if last_seg_found:
walk=find_common_walk(first_seg_found,last_seg_found)
if walk:
walk_found=walk
break
# return first match, not the longest (for now). still a match with the first seg being the earliest possible
return [first_seg_found,last_seg_found,walk_found]
def find_common_walk(seg1,seg2):
for walk in segments_on_target_genome[seg1]:
if walk in segments_on_target_genome[seg2]:
return walk
return False
# functions to get the detail of the variations in the features
# add the path of the feature on the target genome in the object Feature
def add_target_genome_path(feature_id,target_genome_paths):

nina.marthe_ird.fr
committed
feature=Features[feature_id]
list_seg=feature.segments_list_source

nina.marthe_ird.fr
committed
[first_seg,last_seg,walk_name]=get_first_last_seg(list_seg)

nina.marthe_ird.fr
committed
feature_path=[]
if first_seg!='':

nina.marthe_ird.fr
committed
feature_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name)

nina.marthe_ird.fr
committed
feature.segments_list_target=feature_path

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def get_feature_path(target_genome_path,first_seg,last_seg,walk_name):
#first_seg=search_seg_on_target_genome(first_seg)
#last_seg=search_seg_on_target_genome(last_seg)
first_seg_index=segments_on_target_genome[first_seg][walk_name][4]
last_seg_index=segments_on_target_genome[last_seg][walk_name][4]
first_index=min(first_seg_index,last_seg_index)
last_index=max(first_seg_index,last_seg_index)
feature_path_target_genome=target_genome_path[first_index:last_index+1]
return feature_path_target_genome
def count_variations(feature_id):
feature=Features[feature_id]
target_list=feature.segments_list_target
if len(target_list)!=0:
source_list=feature.segments_list_source

nina.marthe_ird.fr
committed
inversion=detect_feature_inversion(source_list,target_list)

nina.marthe_ird.fr
committed
target_list=invert_segment_list(target_list)
target_dict=dict.fromkeys(target_list,"")
source_dict=dict.fromkeys(source_list,"") # convert list into dict to search segments in dict quicker.
var_count=0
for segment in source_dict:
if segment not in target_dict:
var_count+=1
for segment in target_dict:
if segment not in source_dict:
var_count+=1
# this counts the substitutions twice, as insertion+deletion.
return var_count

nina.marthe_ird.fr
committed
def get_id_cov(feature_id,seg_size): # seg_size has unoriented segments : s25
feature=Features[feature_id]
target_list=feature.segments_list_target # make dict instead of lists ??? also in alignment function...
source_list=feature.segments_list_source
inversion=detect_feature_inversion(source_list,target_list)
if inversion:
target_list=invert_segment_list(target_list)

nina.marthe_ird.fr
committed
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
[match,subs,inser,delet]=[0,0,0,0]
[i,j]=[0,0]
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature # for id and ins.
while (i<len(source_list)) and (j<len(target_list)):
if i==len(source_list)-1:
last=True
if source_list[i] != target_list[j]: # if there is a difference between the two paths
if target_list[j] not in source_list: # if the segment in target genome is absent in source genome
if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
j+=1
elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # segment present in both, no variation.
add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
cov=round((match+subs)/(match+subs+delet),3)
id=round(match/(match+subs+inser+delet),3)

nina.marthe_ird.fr
committed
#var_count=count_variations(feature_id)
return [cov,id]

nina.marthe_ird.fr
committed
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last):
[match,subs,inser,delet]=[0,0,0,0]
match type:
case "identity":
if first:
match+=seg_size[seg_a[1:]]-feature.pos_start+1
elif last:
match+=feature.pos_stop
else:
match+=seg_size[seg_a[1:]]
case "substitution":
if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size
if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]:
subs+=seg_size[seg_a[1:]]
inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]]
elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]:
subs+=seg_size[seg_b[1:]]
delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]]
else:
subs+=seg_size[seg_a[1:]]
case "insertion":
inser+=seg_size[seg_b[1:]]
case "deletion":
if first:
delet+=seg_size[seg_a[1:]]-feature.pos_start+1
else:
delet+=seg_size[seg_a[1:]]
case "end_deletion":
for seg in seg_a[:-1]:
delet+=seg_size[seg[1:]]
delet+=feature.pos_stop
return [match,subs,inser,delet,False] # check the orientation of the segment later

nina.marthe_ird.fr
committed
def invert_strand(strand):

nina.marthe_ird.fr
committed
return "-"

nina.marthe_ird.fr
committed
case ">":
return "<"

nina.marthe_ird.fr
committed
return ">"
def get_sequence_list_seg(list_seg,i,feature,seg_seq):
for k in range(i,len(list_seg)):
if k==len(list_seg)-1:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feature.pos_stop]
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])
return del_sequence
def get_segment_sequence(seg_seq,segment):

nina.marthe_ird.fr
committed
if segment[0]==">":
return seg_seq[segment[1:]]
else:
return reverse_complement(seg_seq[segment[1:]])
def reverse_complement(sequence):
sequence_rc=""
for char in sequence:
sequence_rc+=complement(char)
return sequence_rc[::-1]
def complement(nucl):
match nucl:
case "A":
return "T"
case "C":
return "G"
case "G":
return "C"
case "T":
return "A"
return nucl
class Variation:
def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id
self.feature_type=feature_type
self.chr=chr
self.start_new=start_new
self.stop_new=stop_new
self.inversion=inversion
self.size_diff=size_diff
self.size_new=size_new
self.type=''
self.last_seg_in_target=''
self.seg_ref=list()
self.seg_alt=list()
# initiate a Variation object with the information on the feature it is on

nina.marthe_ird.fr
committed
def create_var(feature_id,first_seg,last_seg,walk):
feature=Features[feature_id]
# get feature paths on the original genome and on the target genome

nina.marthe_ird.fr
committed
feature_path_target_genome=feature.segments_list_target
feature_path_source_genome=feature.segments_list_source

nina.marthe_ird.fr
committed
inversion=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)

nina.marthe_ird.fr
committed
feature_path_target_genome=invert_segment_list(feature_path_target_genome)

nina.marthe_ird.fr
committed
start_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk)
stop_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk)

nina.marthe_ird.fr
committed
size_new_genome=start_new_genome-stop_new_genome+1

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk)
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk)

nina.marthe_ird.fr
committed
size_new_genome=stop_new_genome-start_new_genome+1

nina.marthe_ird.fr
committed
size_diff=str(size_new_genome-feature.size)

nina.marthe_ird.fr
committed
sequence_name=segments_on_target_genome[first_seg][walk][0]
variation=Variation(feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
return(variation,feature_path_source_genome,feature_path_target_genome)
# reset the informations of the variation, but keep the information about the feature
def reset_var(variation):
variation.type='' # make type enumerate
variation.size_var=0
variation.start_var=''

nina.marthe_ird.fr
committed
variation.start_var_index=0
variation.ref=''
variation.alt=''
def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk):

nina.marthe_ird.fr
committed
seg_pos=search_segment(variation.start_var)
pos_old=str(int(Segments[seg_pos].start)-int(feat_start))
var_start_seg=variation.start_on_target
start_feat_seg_target=invert_seg(start_feat_seg_target)

nina.marthe_ird.fr
committed
var_start_seg=invert_seg(var_start_seg)

nina.marthe_ird.fr
committed
end_var=segments_on_target_genome[var_start_seg][walk][2]
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_feat-end_var)
else:

nina.marthe_ird.fr
committed
start_var=segments_on_target_genome[var_start_seg][walk][1]
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk):

nina.marthe_ird.fr
committed
seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion

nina.marthe_ird.fr
committed
pos_old=str(int(Segments[seg_pos].start)-int(feat_start))
start_var_seg=variation.start_var
start_feat_seg_target=invert_seg(start_feat_seg_target)

nina.marthe_ird.fr
committed
start_var_seg=invert_seg(start_var_seg)

nina.marthe_ird.fr
committed
end_var=segments_on_target_genome[start_var_seg][walk][2]+len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_feat-end_var)
else:

nina.marthe_ird.fr
committed
start_var=segments_on_target_genome[start_var_seg][walk][1]-len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk):

nina.marthe_ird.fr
committed
i=variation.start_var_index

nina.marthe_ird.fr
committed
seg_pos=search_segment(variation.start_var)

nina.marthe_ird.fr
committed
pos_old=int(Segments[seg_pos].start)-int(feat_start)+Features[feat].pos_start-1

nina.marthe_ird.fr
committed
pos_old=int(Segments[seg_pos].start)-int(feat_start)

nina.marthe_ird.fr
committed
if pos_old<0:
pos_old=0

nina.marthe_ird.fr
committed
print("error with variation position",variation.inversion,"***")
if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.

nina.marthe_ird.fr
committed
pos_new=0

nina.marthe_ird.fr
committed
start_var_seg=variation.last_seg_in_target
start_feat_seg_target=invert_seg(start_feat_seg_target)

nina.marthe_ird.fr
committed
start_var_seg=invert_seg(start_var_seg)

nina.marthe_ird.fr
committed
start_var=segments_on_target_genome[start_var_seg][walk][1]-1
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_feat-start_var)
else:

nina.marthe_ird.fr
committed
start_var=segments_on_target_genome[start_var_seg][walk][2]+1
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk)

nina.marthe_ird.fr
committed
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
variation.type=type

nina.marthe_ird.fr
committed
variation.start_var=feature_path_source_genome[i]

nina.marthe_ird.fr
committed
variation.start_var_index=i
if type=="substitution":

nina.marthe_ird.fr
committed
variation.start_on_target=feature_path_target_genome[j]
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="insertion":
variation.ref="-"
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
variation.seg_ref.append(feature_path_source_genome[i])
else: # else, the deletion will always start at the start of the first segment.
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
variation.alt="-"
def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):
if variation.type=="substitution":
if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif genome_to_continue==1: # deletion
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
elif genome_to_continue==2: # insertion
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="insertion":
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="deletion":
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])

nina.marthe_ird.fr
committed
# gives the list of segments from list1 that are in list2
def get_common_segments(dict1,dict2):
list_common=[]
for segment in dict1:
if segment in dict2:
list_common.append(segment)
return list_common
# check if common segments in the two dict have the same strand
def compare_strand(dict1,dict2): # dict1 and dict2 : [seg_id]->[seg_strand]
seg_common=get_common_segments(dict1,dict2)
# for each segment in common, check if the strand is the same
same_strand_count=0
for segment in seg_common:

nina.marthe_ird.fr
committed
if dict1[segment]==dict2[segment]:

nina.marthe_ird.fr
committed
return [len(seg_common),same_strand_count]

nina.marthe_ird.fr
committed
def detect_segment_order_inversion(dict1,dict2):
list_1_common=get_common_segments(dict1,dict2)
list_2_common=get_common_segments(dict2,dict1) # same segments, different orders
list_2_common_reversed=list(reversed(list_2_common))

nina.marthe_ird.fr
committed
[cpt,i]=[0,0]
while i<len(list_1_common):
if list_2_common_reversed[i]==list_1_common[i]:
return (cpt>len(list_1_common)*0.9) # if more than 90% of the segments are on the same position when the lists are reversed, there is an inversion.

nina.marthe_ird.fr
committed
def detect_orient_inversion(dict1,dict2):
[seg_common_count,same_strand_count]=compare_strand(dict1,dict2)

nina.marthe_ird.fr
committed
if same_strand_count>=seg_common_count*0.9: # if more than 90% of segments shared have the same strand, no inversion
return [False,dict1,dict2]

nina.marthe_ird.fr
committed
return [True,dict1,dict2]
# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand)
def detect_feature_inversion(list_1,list_2):

nina.marthe_ird.fr
committed
# convert list into dict with unstranded segment id as key and strand as value
strand1=[seg[0] for seg in list_1]
id1=[seg[1:] for seg in list_1]
dict1 = {id1[i]: strand1[i] for i in range(len(strand1))}
strand2=[seg[0] for seg in list_2]
id2=[seg[1:] for seg in list_2]
dict2 = {id2[i]: strand2[i] for i in range(len(strand2))}
# check if we have an inversion of the orientation of the segments

nina.marthe_ird.fr
committed
[strand_inversion,dict1,dict2]=detect_orient_inversion(dict1,dict2)
# check if we have an inversion of the order of the segments

nina.marthe_ird.fr
committed
segment_order_inversion=detect_segment_order_inversion(dict1,dict2)

nina.marthe_ird.fr
committed
# if there we have both inversions, the gene is in an inverted region
if segment_order_inversion and strand_inversion:
else :
def invert_segment_list(seg_list):
list_inverted=list()
for seg in seg_list:

nina.marthe_ird.fr
committed
list_inverted.append(invert_seg(seg))
return list(reversed(list_inverted))