Newer
Older
from .Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment,invert_seg,search_segment
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
global segments_on_target_genome
segments_on_target_genome={}
def get_seg_occ(seg,walk,feat,copy):
seg_in_walk=segments_on_target_genome[seg][walk]
for seg_occurence in seg_in_walk:
for feat_copy in seg_occurence[5:]:
if (feat_copy[0]==feat) & (feat_copy[1]==copy):
return seg_occurence # [chr,start,stop,strand,index,copies]
print("no occ found ???",seg,walk,feat,copy)
print(segments_on_target_genome[seg][walk])
print(Features[feat].segments_list_source)
print(Features[feat].segments_list_target)
exit()
# get the start position of the features on the linear target genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_feature_start_on_target_genome(start_seg,feat_id,walk,copy_id):
seg_start_pos=get_seg_occ(start_seg,walk,feat_id,copy_id)[1]
feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)
return seg_start_pos+feat_start_pos-1
# get the stop position of the features on the linear target genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_feature_stop_on_target_genome(stop_seg,feat_id,walk,copy_id):
seg_start_pos=get_seg_occ(stop_seg,walk,feat_id,copy_id)[1]
feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
return seg_start_pos+feat_stop_pos-1
# get the start position of the features on the linear target genome for inverted features
def get_feature_start_on_target_genome_inv(start_seg,feat_id,walk,copy_id):
seg_end_pos=get_seg_occ(start_seg,walk,feat_id,copy_id)[2]
feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)
return seg_end_pos-feat_start_pos+1
# get the stop position of the features on the linear target genome for inverted features
def get_feature_stop_on_target_genome_inv(stop_seg,feat_id,walk,copy_id):
seg_end_pos=get_seg_occ(stop_seg,walk,feat_id,copy_id)[2]
feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
return seg_end_pos-feat_stop_pos+1
# functions to get the gff with one line per feature
# generates the line for the gff of the target genome
def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id):
[chr,strand,feature]=[get_seg_occ(first_seg,walk,feature_id,copy_id)[0],Features[feature_id].strand,Features[feature_id]]
annotation=f'{feature.annot};Size_diff={size_diff};coverage={cov};sequence_ID={id}' # Nb_variants={var_count};
if inversion:
start=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id)
stop=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id)
strand=invert_strand(strand)
else:
start=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id)
stop=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id)
if start>stop:
temp=start
start=stop
stop=temp

nina.marthe_ird.fr
committed
if strand=='':
strand='.'
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'
return output_line
# functions to get the alignment for the transfered genes
# creates an alignment for two segments
def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):
match type:
case "identity":
if first:
feature=Features[feature_id]
seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
elif last:
feature=Features[feature_id]
seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]
else:
seq_aln=get_segment_sequence(seg_seq,seg_a)
line_a=seq_aln
line_b=seq_aln
len_aln=len(seq_aln)
line_c=len_aln*"*"
case "substitution":
seq_aln_a=get_segment_sequence(seg_seq,seg_a)
seq_aln_b=get_segment_sequence(seg_seq,seg_b)
len_a=len(seq_aln_a)
len_b=len(seq_aln_b)
if len_a>len_b:
diff_len=len_a-len_b
line_a=seq_aln_a
line_b=seq_aln_b+diff_len*"-"
line_c=len_a*" "
else:
diff_len=len_b-len_a
line_a=seq_aln_a+diff_len*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "insertion":
seq_aln_b=get_segment_sequence(seg_seq,seg_b)
len_b=len(seq_aln_b)
line_a=len_b*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "deletion":
if first:
feature=Features[feature_id]
seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
else:
seq_aln_a=get_segment_sequence(seg_seq,seg_a)
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "
case "end_deletion":
seq_aln_a=""
for segment in seg_a[:-1]:
seq_aln_a+=get_segment_sequence(seg_seq,segment)
feature=Features[feature_id]
seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "
return [line_a,line_b,line_c,False] # check the orientation of the segment later
# formats the alignment lines
def parse_aln_lines(line_a,line_b,line_c,feature_id):
if (len(line_a)!=len(line_b)) or (len(line_b)!=len(line_c)):
print("line lengths differ in alignment")
len_to_parse=len(line_a)
len_parsed=0
aln_line=""
nb_res_a=0
nb_res_b=0
while len_parsed<len_to_parse:
len_header=len(feature_id)+11
headers=[feature_id+"_source ",feature_id+"_target ",len_header*" "]
add_a=line_a[len_parsed:len_parsed+60]
add_b=line_b[len_parsed:len_parsed+60]
add_c=line_c[len_parsed:len_parsed+60]
nb_res_a+=len(add_a)-add_a.count("-")
nb_res_b+=len(add_b)-add_b.count("-")
aln_line+=f'{headers[0]}{add_a} {nb_res_a}\n'
aln_line+=f'{headers[1]}{add_b} {nb_res_b}\n'
aln_line+=f'{headers[2]}{add_c}\n\n'
len_parsed+=60
aln_line+="\n"
return aln_line
# creates the alignment for a feature
def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):
line_a=""
line_b=""
line_c=""
[i,j]=[0,0]
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature
while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
if i==len(feature_path_source_genome)-1:
last=True
if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome : substitution
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
[add_a,add_b,add_c,first]=segment_aln("insertion",seg_seq,"",feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
j+=1
elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
[add_a,add_b,add_c,first]=segment_aln("deletion",seg_seq,feature_path_source_genome[i],"",first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
else: # segment present in both, no variation.
[add_a,add_b,add_c,first]=segment_aln("identity",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
[add_a,add_b,add_c,first]=segment_aln("end_deletion",seg_seq,feature_path_source_genome[i:],"",first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
return parse_aln_lines(line_a,line_b,line_c,feature_id)
# functions to output the stats on the transfer
# stats about missing segments and feature type, not used, will change.
def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
feature_missing_segments[5].append(feature_id)
if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_segments[3].append(feature_id)
elif first_seg != list_seg[0]: # the first segment is missing
feature_missing_segments[0].append(feature_id)
elif last_seg!=list_seg[-1]: # the last segment is missing
feature_missing_segments[2].append(feature_id)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
feature_missing_segments[1].append(feature_id)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[4].append(feature_id)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !!
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[6].append(feature_id)
def get_annot_features(list_features):
list_annot_features=[]
for feature in list_features:
list_annot_features.append(Features[feature].note)
return list_annot_features
def count_hypput_total(list_annot_first):
total=len(list_annot_first)
count_hypput=0
for annot in list_annot_first:
if ("hypothetical" in annot) or ("putative" in annot):
count_hypput+=1
return [count_hypput,total]
# print stats on the transfer : number of feature that have segments in different positions missing.
def stats_features(feature_missing_segments):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
list_annot_first=get_annot_features(feature_missing_segments[0])
[hyp_put,total]=count_hypput_total(list_annot_first)
print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_middle=get_annot_features(feature_missing_segments[1])
[hyp_put,total]=count_hypput_total(list_annot_middle)
print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_last=get_annot_features(feature_missing_segments[2])
[hyp_put,total]=count_hypput_total(list_annot_last)
print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_all=get_annot_features(feature_missing_segments[3])
[hyp_put,total]=count_hypput_total(list_annot_all)
print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_total=get_annot_features(feature_missing_segments[4])
[hyp_put,total]=count_hypput_total(list_annot_total)
print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_ok=get_annot_features(feature_missing_segments[6])
[hyp_put,total]=count_hypput_total(list_annot_ok)
print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_features=get_annot_features(feature_missing_segments[5])
[hyp_put,total]=count_hypput_total(list_annot_features)
print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
# functions to generate the different gffs
# appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segmnet list)
def get_segments_positions_on_genome(pos_seg): # add to the dict the info about the segments.
with open(pos_seg,'r') as bed:
seg_count=0
file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
line=line.split()
[seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
# check if segment present twice on the same walk ???
#segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
if seg not in segments_on_target_genome:
segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
if file_name not in segments_on_target_genome[seg]:
segments_on_target_genome[seg][file_name]=list()
segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
seg_count+=1
# look for the segment on either strand of the target genome
def search_seg_on_target_genome(segment):
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:
#if inverted_segment in segments_on_target_genome:
# print(segment," found in both orientations")
return segment
elif inverted_segment in segments_on_target_genome:
#print("inverted seg found *****")
return inverted_segment
else:
return False
# look for a segment on a walk, in either orientations
def search_seg_on_walk(segment,walk): # for now just print the first found, look for several later...
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:
if walk in segments_on_target_genome[segment]:
return segment
elif inverted_segment in segments_on_target_genome:
if walk in segments_on_target_genome[inverted_segment]:
return inverted_segment
else:
return False
# generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA
def get_segments_sequence(segments_file,segments_list):
with open(segments_file,'r') as file_segments:
seg_seq={}
if seg_id in segments_list:
seg_seq[seg_id]=line[2]
return seg_seq
# generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]

nina.marthe_ird.fr
committed
def get_paths(walks_file,target_genome,haplotype):
with open(walks_file,'r') as file_walks:
paths={}

nina.marthe_ird.fr
committed
if haplotype:
seq_name=line[1]+"#"+line[2]+"#"+line[3]
else:
if target_genome in seq_name: # get the walk of the genome
path=line[6].split(',')[1:]
list_segments=[]
for segment in path:
list_segments.append(segment)

nina.marthe_ird.fr
committed
if seq_name in paths:
paths[seq_name].extend(list_segments)
else:
paths[seq_name]=list_segments
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
return paths
# get the first and last segment of the list that is in the target genome (possibly several pairs)
def get_first_last_seg(list_seg):
list_first_last_segs=[]
[first_seg_found,last_seg_found,walk_found]=['','','']
list_walks=get_walks_feature_cross(list_seg) # get all the walks where there is a segment of the feature
for walk in list_walks: # find the first and last seg for each walk
for segment in list_seg: # look for first_seg
seg_found=search_seg_on_walk(segment,walk)
if seg_found:
first_seg_found=seg_found
break
if first_seg_found!='': # look for last_seg
for segment in reversed(list_seg):
last_seg_found=search_seg_on_walk(segment,walk)
if last_seg_found:
walk_found=walk
break
list_first_last_segs.append([first_seg_found,last_seg_found,walk_found])
[first_seg_found,last_seg_found,walk_found]=['','','']
# return all the match
return list_first_last_segs
# functions to get the detail of the variations in the features
# find all the walks that contain a segment of the feature (list_seg is the walk of the feature on the source genome)
def get_walks_feature_cross(list_seg):
list_walks=list()
for segment in list_seg:
seg_found=search_seg_on_target_genome(segment)
if seg_found: # if the segment or the reverse complement is on the target genome
for walk in segments_on_target_genome[seg_found]:
if walk not in list_walks:
list_walks.append(walk)
return list_walks
# returns a list of feature's child, and their childs' childs, etc.
def get_child_list(feature_id):
feature=Features[feature_id]
list_childs=[]
for child in feature.childs:
list_childs.append(child) # add the child to the list
list_childs+=get_child_list(child) # add the child's childs to the list
return list_childs
# add the paths of the feature on the target genome in the object Feature
def add_target_genome_paths(feature_id,target_genome_paths):
feature=Features[feature_id]
list_seg=feature.segments_list_source
list_first_last_segs=get_first_last_seg(list_seg)
for match in list_first_last_segs: # for each walk that has the feature
[first_seg,last_seg,walk_name]=match
# get the first and last segments of all the copies on this walk
first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id)
copy_number=0
for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
copy_number+=1
copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id)
feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
feature.segments_list_target.append(feature_path)

nina.marthe_ird.fr
committed
# add the feat_copy to all the segments
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
# get the pos start of the first and last segments (existing because done in find_gene_copies)
# then look for segs that are between.
walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id)[1]
walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id)[2]
feat_copy=(feature_id,copy_id)
for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
if walk_start <= segment[1] <= walk_stop:
segment.append(feat_copy) # annotate the segment : feature_id, copy_id
break
# add the paths of the gene's child features
add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths)
if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
feature.segments_list_target.append(['','',[]])
def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths):
childs_list=get_child_list(feature_id)
for child_id in childs_list:
child=Features[child_id]
# find child path in parent's path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source:
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # the child feature may be absent in this occurence of the parent
for seg in reversed(child.segments_list_source):
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
# get the path of the child feature
child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
feat_copy=(child_id,copy_id)
feature_path=[walk_name,copy_id]
feature_path.append(child_path)
child.segments_list_target.append(feature_path)
for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
segment.append(feat_copy)
# find all the copies of the segments from the source list in the target genome
def find_all_seg(list_seg_source,walk_name):
index=0
list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
for seg in list_seg_source:
list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
seg_inverted=invert_seg(seg)
# look for all the segment copies in the target genome walk, in both orientations
if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
list_seg_target.append(seg_info)
if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
for copy in segments_on_target_genome[seg_inverted][walk_name]:
seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
list_seg_target.append(seg_info)
index+=1
list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
return [list_seg_target,list_seg_source_unstranded]
def find_gene_copies(list_seg_source,walk_name,feature_id):
# find all copies of all segments from the gene in the target genome (in both orientations)
[list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name)
# find each copy of the gene in the ordered list of segments
[first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id)
# join the first and last seg lists
first_last_segs_list=[]
index=0
for first_seg in first_segs_list:
last_seg=last_segs_list[index]
pair=(first_seg,last_seg)
first_last_segs_list.append(pair)
index+=1
return first_last_segs_list # return a list of pairs (first_seg,last_seg)
# called by find_gene_copies
def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id):
# prepare the variables for the first iteration of the for loop
[old_id,old_strand,old_start,old_index]=list_seg_target[0]
[first_segs_list,last_segs_list]=[[],[]]
old_seg_id=old_strand+old_id
first_segs_list.append(old_seg_id)
copy_number=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
# adjust old_index for the first iteration of the loop
first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1])
if first_inversion:
old_index+=1
else:
old_index-=1
for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
[new_id,new_strand,new_start,new_index]=seg
new_seg_id=new_strand+new_id
inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk
if inversion :
if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
else:
if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
[old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]
# add the last seg info
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy) # add info for the last seg of the last copy
return [first_segs_list,last_segs_list]
def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy):
for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
if segment[1]==target_start:
segment.append(feat_copy) # add seg info
break
def sort_seg_info(seg_info):
return seg_info[2]
# find the feature's path in target genome walk
def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,feature_id):
# look for first_seg and last_seg that has the right copy_id for this feature
first_seg_index=get_seg_occ(first_seg,walk_name,feature_id,copy_id)[4]
last_seg_index=get_seg_occ(last_seg,walk_name,feature_id,copy_id)[4]
first_index=min(first_seg_index,last_seg_index)
last_index=max(first_seg_index,last_seg_index)
feature_path_target_genome=target_genome_path[first_index:last_index+1]
return feature_path_target_genome
# count the variations between two lists
def count_variations(feature_id,target_list):
feature=Features[feature_id]
if len(target_list)!=0:
source_list=feature.segments_list_source
inversion=detect_feature_inversion(source_list,target_list)
if inversion:
target_list=invert_segment_list(target_list)
target_dict=dict.fromkeys(target_list,"")
source_dict=dict.fromkeys(source_list,"") # convert list into dict to search segments in dict quicker.
var_count=0
for segment in source_dict:
if segment not in target_dict:
var_count+=1
for segment in target_dict:
if segment not in source_dict:
var_count+=1
# this counts the substitutions twice, as insertion+deletion.
return var_count
# get the coverage and sequence id of a feature
def get_id_cov(feature_id,seg_size,target_list): # seg_size has unoriented segments : s25
feature=Features[feature_id]
source_list=feature.segments_list_source
inversion=detect_feature_inversion(source_list,target_list)
if inversion:
target_list=invert_segment_list(target_list)
[match,subs,inser,delet]=[0,0,0,0]
[i,j]=[0,0]
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature # for id and ins.
while (i<len(source_list)) and (j<len(target_list)):
if i==len(source_list)-1:
last=True
if source_list[i] != target_list[j]: # if there is a difference between the two paths
if target_list[j] not in source_list: # if the segment in target genome is absent in source genome
if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
j+=1
elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # segment present in both, no variation.
add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]

nina.marthe_ird.fr
committed
denom_cov=match+subs+delet
denom_id=denom_cov+inser
[cov,id]=[0,0]
if denom_cov>0:
cov=round((match+subs)/(match+subs+delet),3)
if denom_id>0:
id=round(match/(match+subs+inser+delet),3)
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
#var_count=count_variations(feature_id,target_list)
return [cov,id]
# computes the cov/id calculation for a segment pair
def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last):
[match,subs,inser,delet]=[0,0,0,0]
match type:
case "identity":
if first:
match+=seg_size[seg_a[1:]]-feature.pos_start+1
elif last:
match+=feature.pos_stop
else:
match+=seg_size[seg_a[1:]]
case "substitution":
if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size
if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]:
subs+=seg_size[seg_a[1:]]
inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]]
elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]:
subs+=seg_size[seg_b[1:]]
delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]]
else:
subs+=seg_size[seg_a[1:]]
case "insertion":
inser+=seg_size[seg_b[1:]]
case "deletion":
if first:
delet+=seg_size[seg_a[1:]]-feature.pos_start+1
else:
delet+=seg_size[seg_a[1:]]
case "end_deletion":
for seg in seg_a[:-1]:
delet+=seg_size[seg[1:]]
delet+=feature.pos_stop
return [match,subs,inser,delet,False] # check the orientation of the segment later
# invert the given strand
def invert_strand(strand):
match strand:
case "+":
return "-"
case "-":
return "+"
case ">":
return "<"
case "<":
return ">"
case default:
return ""
# outputs the nucleotide sequence of a list of segments, corresponding to the end of a feature
def get_sequence_list_seg(list_seg,i,feature,seg_seq):
del_sequence=""
for k in range(i,len(list_seg)):
if k==len(list_seg)-1:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feature.pos_stop]
else:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])
return del_sequence
# outputs the sequence of an oriented segment
def get_segment_sequence(seg_seq,segment):
if segment[0]==">":
return seg_seq[segment[1:]]
else:
return reverse_complement(seg_seq[segment[1:]])
# outputs the reverse complement of a sequence
def reverse_complement(sequence):
sequence_rc=""
for char in sequence:
sequence_rc+=complement(char)
return sequence_rc[::-1]
# outputs the reverse complement of a nucleotide
def complement(nucl):
match nucl:
case "A":
return "T"
case "C":
return "G"
case "G":
return "C"
case "T":
return "A"
return nucl
# stores information about a feature and its current variation
class Variation:

nina.marthe_ird.fr
committed
def __init__(self,feature_id,feature_key,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id # real id in the gff
self.feature_key=feature_key # key to access the feature info in the dict Features
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
self.feature_type=feature_type
self.chr=chr
self.start_new=start_new
self.stop_new=stop_new
self.inversion=inversion
self.size_diff=size_diff
self.size_new=size_new
self.type=''
self.last_seg_in_target=''
self.seg_ref=list()
self.seg_alt=list()
# initiate a Variation object with the information on the feature it is on
def create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path):
feature=Features[feature_id]
# get feature paths on the original genome and on the target genome
feature_path_target_genome=feature_target_path
feature_path_source_genome=feature.segments_list_source
inversion=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
if inversion:
feature_path_target_genome=invert_segment_list(feature_path_target_genome)
stop_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id)
start_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id)
else:
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id)
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id)
size_new_genome=stop_new_genome-start_new_genome+1
size_diff=str(size_new_genome-feature.size)
sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id)[0]

nina.marthe_ird.fr
committed
variation=Variation(feature.id,feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
return(variation,feature_path_source_genome,feature_path_target_genome)
# reset the informations of the variation, but keep the information about the feature
def reset_var(variation):
variation.type='' # make type enumerate
variation.size_var=0
variation.start_var=''
variation.start_var_index=0
variation.ref=''
variation.alt=''
# find the position of a substitution on the source and the target sequence
def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id):
seg_pos=search_segment(variation.start_var)

nina.marthe_ird.fr
committed
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
var_start_seg=variation.start_on_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
var_start_seg=invert_seg(var_start_seg)
end_var=get_seg_occ(var_start_seg,walk,feat,copy_id)[2]
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(var_start_seg,walk,feat,copy_id)[1]
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of an insertion on the source and the target sequence
def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion

nina.marthe_ird.fr
committed
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
start_var_seg=variation.start_var
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
end_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[2]+len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[1]-len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of a deletion on the source and the target sequence
def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
i=variation.start_var_index
seg_pos=search_segment(variation.start_var)
if i==0:

nina.marthe_ird.fr
committed
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)+Features[feat].pos_start-1

nina.marthe_ird.fr
committed
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
if pos_old<0:
pos_old=0
print("error with variation position",variation.inversion,"***")
if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new=0
else:
start_var_seg=variation.last_seg_in_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[1]-1
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_feat-start_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[2]+1
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# change the variation information, but keep the feature information (the variation is on the feature)
def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
variation.type=type
variation.start_var=feature_path_source_genome[i]
variation.start_var_index=i
if type=="substitution":
variation.start_on_target=feature_path_target_genome[j]
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="insertion":
variation.ref="-"
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
variation.seg_ref.append(feature_path_source_genome[i])
else: # else, the deletion will always start at the start of the first segment.
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
variation.alt="-"
# update the variation
def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):
if variation.type=="substitution":
if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif genome_to_continue==1: # deletion
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
elif genome_to_continue==2: # insertion
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="insertion":
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="deletion":
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
# functions to detect inversions
# gives the list of segments from dict1 that are in dict2
def get_common_segments(dict1,dict2):
list_common=[]
for segment in dict1:
if segment in dict2:
list_common.append(segment)
return list_common
# check if common segments in the two dict have the same strand
def compare_strand(dict1,dict2): # dict1 and dict2 : [seg_id]->[seg_strand]
seg_common=get_common_segments(dict1,dict2)
# for each segment in common, check if the strand is the same
same_strand_count=0
for segment in seg_common:
if dict1[segment]==dict2[segment]:
same_strand_count+=1
return [len(seg_common),same_strand_count]
# check if the two dict have their segments in the inverted order
def detect_segment_order_inversion(dict1,dict2):
list_1_common=get_common_segments(dict1,dict2)
list_2_common=get_common_segments(dict2,dict1) # same segments, different orders
list_2_common_reversed=list(reversed(list_2_common))
[cpt,i]=[0,0]
while i<len(list_1_common):
if list_2_common_reversed[i]==list_1_common[i]:
cpt+=1
i+=1
return (cpt>len(list_1_common)*0.9) # if more than 90% of the segments are on the same position when the lists are reversed, there is an inversion.
# check if the two dict have the same segments but in different orientation
def detect_orient_inversion(dict1,dict2):
[seg_common_count,same_strand_count]=compare_strand(dict1,dict2)
if same_strand_count>=seg_common_count*0.9: # if more than 90% of segments shared have the same strand, no inversion
return [False,dict1,dict2]
else:
return [True,dict1,dict2]
# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand)
def detect_feature_inversion(list_1,list_2):
# convert list into dict with unstranded segment id as key and strand as value
strand1=[seg[0] for seg in list_1]
id1=[seg[1:] for seg in list_1]
dict1 = {id1[i]: strand1[i] for i in range(len(strand1))}
strand2=[seg[0] for seg in list_2]
id2=[seg[1:] for seg in list_2]
dict2 = {id2[i]: strand2[i] for i in range(len(strand2))}
# check if we have an inversion of the orientation of the segments
[strand_inversion,dict1,dict2]=detect_orient_inversion(dict1,dict2)
# check if we have an inversion of the order of the segments
segment_order_inversion=detect_segment_order_inversion(dict1,dict2)
# if there we have both inversions, the gene is in an inverted region
if segment_order_inversion and strand_inversion:
return True
else :
return False
# invert all the segments in a list (change the orientation)
def invert_segment_list(seg_list):
list_inverted=list()
for seg in seg_list:
list_inverted.append(invert_seg(seg))
return list(reversed(list_inverted))