Newer
Older
from Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment, write_line
### functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_feature_start_on_genome(start_seg,feat_id):
seg_start_pos=segments_on_target_genome[start_seg][1]
feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)
start=int(seg_start_pos)+int(feat_start_pos)-1
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_feature_stop_on_genome(stop_seg,feat_id):
seg_start_pos=segments_on_target_genome[stop_seg][1]
feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
stop=int(seg_start_pos)+int(feat_stop_pos)-1
return stop
# functions to get the detailed gff with all the fragments of the features
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position_on_feature(start_seg,stop_seg,feature_start_segment,feature):
start_on_feature=get_segment_start_on_feature(feature_start_segment,start_seg,feature)
stop_on_feature=get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature)
position=";position="+str(start_on_feature)+"-"+str(stop_on_feature)
#position=";start_position_on_feature="+str(start_on_gene)+":stop_position_on_feature="+str(stop_on_gene)
return position
def get_segment_start_on_feature(feature_start_segment,start_seg,feature):
feature_start_pos=Segments[feature_start_segment].start+get_feature_start_on_segment(feature_start_segment,feature.id)-1
start_on_reference=Segments[start_seg].start+get_feature_start_on_segment(start_seg,feature.id)-1
start_on_feature=int(start_on_reference)-int(feature_start_pos)+1
return start_on_feature
def get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature):
start_on_new_genome=get_feature_start_on_genome(start_seg,feature.id)
stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature.id)
# length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
stop_on_feature=start_on_feature+(stop_on_new_genome-start_on_new_genome) # stop position : start+length
return stop_on_feature
# get the proportion of a part of the feature on the total length
def get_proportion_of_feature_part(start_seg,stop_seg,feature):
start_on_new_genome=get_feature_start_on_genome(start_seg,feature.id)
stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature.id)
# length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
proportion=";proportion="+str(stop_on_new_genome-start_on_new_genome+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_new_genome-start_new_genome+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file for the function gff_detail
def create_line_detail(last_seg,feature_id,start_seg,feat_start):
[stop_seg,feature,chr,strand]=[last_seg[1:],Features[feature_id],segments_on_target_genome[start_seg][0],last_seg[0:1]]
# start and stop position of the feature on the genome we transfer on
start_on_new_genome=get_feature_start_on_genome(start_seg,feature_id)
stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature_id)
proportion=get_proportion_of_feature_part(start_seg,stop_seg,feature)
position=get_position_on_feature(start_seg,stop_seg,feat_start,feature)
annotation=feature.annot+proportion+position
output_line=chr+"\tGrAnnot\t"+feature.type+"\t"+str(start_on_new_genome)+"\t"+str(stop_on_new_genome)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return output_line
# outputs each fragment of the feature in a gff
def gff_detail(list_seg,feature_id):
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc.
# at the end of the loop, prints the last part of the fragment.
[feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the part of the feature, for loop below
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
if feat_start=="":
feat_start=segment[1:]
if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=segment[1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
seg_start=""
#else: if the current segment is not in azucena but there is no start, keep looking for a start
last_seg=segment
if last_seg[1:] in segments_on_target_genome:
out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
# functions to get the gff with one line per feature
def right_size(size,max_diff,feat):
if max_diff==0:
return True
return not ((size>Features[feat].size*max_diff) | (size<Features[feat].size/max_diff))
def create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg):
[chr,strand,feature]=[segments_on_target_genome[first_seg][0],list_seg[0][0:1],Features[feature_id]]
variant_count=0
for segment in list_seg:
if segment[1:] not in segments_on_target_genome: # if a segment is absent, it is either a deletion of a substitution. insertions are not considered here.
variant_count+=1
annotation=feature.annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(variant_count)
line=chr+" GrAnnot "+feature.type+" "+str(get_feature_start_on_genome(first_seg,feature_id))+" "+str(get_feature_stop_on_genome(last_seg,feature_id))+" . "+strand+" . "+annotation+"\n"
return line
# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
if (first_seg!=''): # feature present on the target genome
size_on_new_genome=get_feature_stop_on_genome(last_seg,feature_id)-get_feature_start_on_genome(first_seg,feature_id)+1
size_diff=abs(size_on_new_genome-Features[feature_id].size)
if right_size(size_on_new_genome,max_diff,feature_id):
line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg)
write_line(line,output_gff_one,False)
return size_diff
else:
return "bad_size"
else :
return "absent"
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# functions to output the stats on the transfer
def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
feature_missing_segments[5].append(feature_id)
if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_segments[3].append(feature_id)
elif first_seg != list_seg[0][1:]: # the first segment is missing
feature_missing_segments[0].append(feature_id)
elif last_seg!=list_seg[-1][1:]: # the last segment is missing
feature_missing_segments[2].append(feature_id)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) & (feature_id not in feature_missing_segments[3]): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if segment[1:] not in segments_on_target_genome:
feature_missing_segments[1].append(feature_id)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if segment[1:] not in segments_on_target_genome:
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[4].append(feature_id)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !!
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[6].append(feature_id)
def get_annot_features(list_features):
list_annot_features=[]
for feature in list_features:
list_annot_features.append(Features[feature].note)
return list_annot_features
def count_hypput_total(list_annot_first):
total=len(list_annot_first)
count_hypput=0
for annot in list_annot_first:
if ("hypothetical" in annot) | ("putative" in annot):
count_hypput+=1
return [count_hypput,total]
# print stats on the transfer : number of feature that have segments in different positions missing.
def stats_features(feature_missing_segments):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
list_annot_first=get_annot_features(feature_missing_segments[0])
[hyp_put,total]=count_hypput_total(list_annot_first)
print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_middle=get_annot_features(feature_missing_segments[1])
[hyp_put,total]=count_hypput_total(list_annot_middle)
print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_last=get_annot_features(feature_missing_segments[2])
[hyp_put,total]=count_hypput_total(list_annot_last)
print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_all=get_annot_features(feature_missing_segments[3])
[hyp_put,total]=count_hypput_total(list_annot_all)
print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_total=get_annot_features(feature_missing_segments[4])
[hyp_put,total]=count_hypput_total(list_annot_total)
print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_ok=get_annot_features(feature_missing_segments[6])
[hyp_put,total]=count_hypput_total(list_annot_ok)
print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_features=get_annot_features(feature_missing_segments[5])
[hyp_put,total]=count_hypput_total(list_annot_features)
print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def get_segments_positions_on_genome(pos_seg):
global segments_on_target_genome
segments_on_target_genome={}
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()
for line in lines:
line=line.split()
[seg,chrom,start,stop]=[line[3][1:],line[0],line[1],line[2]]
if line[3][0:1]=='>':
strand='+'
elif line[3][0:1]=='<':
strand='-'
else:
strand=''
segments_on_target_genome[seg]=[chrom,start,stop,strand]
def get_segments_sequence_and_paths(gfa):
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
else:
list_path.append('s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
def get_first_seg(list_seg):
first_seg_found=''
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
first_seg_found=segment[1:]
break
return first_seg_found
def get_all_features_in_gff(gff):
gff=open(gff,'r')
lines=gff.readlines()
gff.close()
list_feature=[]
for line in lines:
feature_id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if feature_id not in list_feature:
list_feature.append(feature_id)
return list_feature
# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
print("generation of the genome's gff ")
# create variables and open files
[once,detail,var,stats]=[False,False,True,False]
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
if var:
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
file_out_var = open("variations.txt", 'w')
global output_variations
output_variations=[0,"",file_out_var]
if detail:
file_out_detail = open("azucena_chr10_detail.gff", 'w')
global output_detail_gff
output_detail_gff=[0,"",file_out_detail]
if once:
file_out=open(out,'w')
global output_gff_one
output_gff_one=[0,"",file_out]
bad_size_features=0
absent_features=0
diff_size_transfered_features=[0,0] # [count,sum], to get the average
get_segments_positions_on_genome(pos_seg)
list_feature_to_transfer=get_all_features_in_gff(gff) # get the list of all the features to transfer from the gff
# create objects for stats on how many segments are absent in azucena, their average length, etc
if stats==True:
feature_missing_segments=[[],[],[],[],[],[],[]] # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing - feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
for feat in list_feature_to_transfer:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
if stats==True:
stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat)
# outputs the feature once in a gff, from the first to the last segment present on the new genome
if once:
max_diff=2 # maximum difference size (n times bigger of smaller)
transfer_stat=gff_one(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
if transfer_stat=="bad_size":
elif transfer_stat=="absent":
absent_features+=1
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
write_line("",output_gff_one,True)
# outputs each fragment of the feature in a gff
if detail:
gff_detail(list_seg,feat) # insertions !
write_line("",output_detail_gff,True)
# outputs the detail of variations of the feature :
if var:
print_variations_2(first_seg,last_seg,feat,paths,seg_seq)
# close output_files
if detail:
file_out_detail.close()
if once:
file_out.close()
if var:
file_out_var.close()
# print stats
from statistics import median, mean
if stats==True:
if once:
print(len(Features)-(bad_size_features+absent_features),"out of",len(Features),"features are transfered.")
print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
print("average length difference of the transfered genes : ",diff_size_transfered_features[1]/diff_size_transfered_features[0])
stats_features(feature_missing_segments)
# functions to get the detail of the variations in the features
def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
# get the list of segments in common
seg_common=[]
for segment in list_1_unstrand:
if segment in list_2_unstrand:
seg_common.append(segment)
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
# for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
same_strand_count=0
for segment in seg_common:
index_1=list_1_unstrand.index(segment)
index_2=list_2_unstrand.index(segment)
if list_1[index_1]==list_2[index_2]:
same_strand_count+=1
return [seg_common,same_strand_count]
def print_variations(first_seg,last_seg,feat,paths,seg_seq):
if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features
var=0 # count variations, to see if there is any
feature=Features[feat]
feat_start=feature.start
# get the lengths of the feature, on the original genome and on the new one
start_new_genome=get_feature_start_on_genome(first_seg,feat)
stop_new_genome=get_feature_stop_on_genome(last_seg,feat)
size_new_genome=int(stop_new_genome)-int(start_new_genome)+1
size_diff=str(size_new_genome-feature.size)
# get feature paths on the original genome and on the target genome
list_segfeat_azu=get_feature_path(paths,first_seg,last_seg)
list_segfeat_nb=feature.segments_list
# loop to go through both paths
i=0
j=0
[last,last_taille,last_seq,last_start,last_in_azu]=['',0,'','',''] # rename last_taille
# check if there is an inversion and remove strands
[list_segfeat_nb,list_segfeat_azu,inversion]=detect_inversion(list_segfeat_nb,list_segfeat_azu)
# detect and print variations ignoring the strands
while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
# is both segments are absent in the other genome, its a substitution
last_in_azu=list_segfeat_azu[j]
# print if we had an insertion or deletion running
if last=='insertion': # last : type énuméré. # fct compare cas précedent à cas courant, si différent imprimer cas précédent.
[pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat)
# line : formated line f"{feat}\t"
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
elif last=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
last='';last_taille=0;last_seq='';last_start=''
# print the substitution # what if plusieurs substitutions à la suite ?
# substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
[pos_old,pos_new]=get_old_new_pos_substitution(feat_start,list_segfeat_nb,list_segfeat_azu,feat,i,j)
if len(seg_seq[list_segfeat_nb[i]]) == len(seg_seq[list_segfeat_azu[j]]): # if the substituion is between two segment of the same size, print it
size_subs=len(seg_seq[list_segfeat_nb[i]])
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tsubstitution\t"+seg_seq[list_segfeat_nb[i]]+"\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
else :
# if the segments of the substitution have a different size, print deletion then insertion at the same position.
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+seg_seq[list_segfeat_nb[i]]+"\t-\t"+str(len(seg_seq[list_segfeat_nb[i]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
line+=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(len(seg_seq[list_segfeat_azu[j]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
var+=1
write_line(line,output_variations,False)
var+=1;i+=1;j+=1
else: # azu segment not in nb, but nb segment in azu : insertion
if last=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1;last='';last_taille=0;last_start='';last_seq=''
last_in_azu=list_segfeat_azu[j]
if last=='insertion':
last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
else:
last='insertion'
last_seq=seg_seq[list_segfeat_azu[j]]
last_start=list_segfeat_nb[i]
j+=1
elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
if last=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1;last='';last_start='';last_taille=0;last_seq=''
if last=='deletion':
last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
else:
last='deletion'
last_start=list_segfeat_nb[i]
if i==0: # if the deletion is at the start of the feature, the deletion doesnt start at the start at the first segment :
#use pos_start, position of the feature on its first segment
last_seq=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:]
else: # else, the deletion will always start at the start of the first segment.
last_seq=seg_seq[list_segfeat_nb[i]]
i+=1
else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then
line="weird order change\n"
write_line(line,output_variations,False)
var+=1;i+=1;j+=1
else: # segment present in both. print the running indel if there is one
if last=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
elif last=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
last_in_azu=list_segfeat_azu[j]
last='';last_taille=0;last_start='';last_seq='';i+=1;j+=1
# finish printing the current indel
if last=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(last_taille)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
elif last=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
# see if the end is missing for one of the two genomes
if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
del_sequence=get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq)
length=len(del_sequence)
pos_new=str(size_new_genome+1) # the deletion is at the end of the feature on the new genome
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
var+=1
if var==0:
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tno_var\t-\t-\t-\t-\t-\n"
write_line(line,output_variations,False)
def get_feature_path(paths,first_seg,last_seg):
# find the path in azucena.
first_strand=segments_on_target_genome[first_seg][3]
first_seg_stranded=first_strand+first_seg
last_strand=segments_on_target_genome[last_seg][3]
last_seg_stranded=last_strand+last_seg
id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
first_index=min(id_first_seg,id_last_seg)
last_index=max(id_last_seg,id_first_seg)
list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][first_index:last_index+1]
return list_segfeat_azu
def get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq):
del_sequence=""
for k in range(i,len(list_segfeat_nb)):
if k==len(list_segfeat_nb):
del_sequence+=seg_seq[list_segfeat_nb[k]][0:feature.pos_stop]
else:
del_sequence+=seg_seq[list_segfeat_nb[k]]
return del_sequence
def get_old_new_pos_substitution(feat_start,list_segfeat_nb,list_segfeat_azu,feat,i,j):
pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[list_segfeat_azu[j-1]][2])+1
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
def get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat):
pos_old=str(int(Segments[last_start].start)-int(feat_start)+1)
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[last_start][1])-1
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
def get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i):
if i==0:
pos_old=int(Segments[last_start].start)-int(feat_start)+1+Features[feat].pos_start
else:
pos_old=int(Segments[last_start].start)-int(feat_start)+1
if pos_old<0:
pos_old=0
if last_in_azu=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new="1"
else:
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[last_in_azu][2])+1
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
class Variation:
def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff):
self.feature_id=feature_id
self.feature_type=feature_type
self.chr=chr
self.start_new=start_new
self.stop_new=stop_new
self.inversion=str(inversion)
self.size_diff=str(size_diff)
self.size_new=str(self.stop_new-self.start_new+1)
self.type=''
self.last_seg_in_target=''
# ajouter fct pour écrire la ligne. return line. # line : formated line f"{feat}\t"
#def __str__(self):
# return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
def create_var(feature_id,first_seg,last_seg,paths):
feature=Features[feature_id]
start_new_genome=get_feature_start_on_genome(first_seg,feature_id)
stop_new_genome=get_feature_stop_on_genome(last_seg,feature_id)
size_new_genome=int(stop_new_genome)-int(start_new_genome)+1
size_diff=str(size_new_genome-feature.size)
# get feature paths on the original genome and on the target genome
list_segfeat_azu=get_feature_path(paths,first_seg,last_seg)
list_segfeat_nb=feature.segments_list
[list_segfeat_nb,list_segfeat_azu,inversion]=detect_inversion(list_segfeat_nb,list_segfeat_azu)
variation=Variation(feature_id,feature.type,feature.chr,start_new_genome,stop_new_genome,inversion,size_diff)
return(variation,list_segfeat_nb,list_segfeat_azu)
def reset_var(variation):
variation.type='' # type énuméré.
variation.size_var=0
variation.start_var=''
variation.ref=''
variation.alt=''
def get_old_new_pos_substitution_2(feat_start,variation,list_segfeat_azu,feat,j):
#pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
pos_old=str(int(Segments[variation.start_var].start)-int(feat_start)+1)
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[variation.start_on_target][1])
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
def get_old_new_pos_insertion_2(variation,feat_start,list_segfeat_azu,feat):
pos_old=str(int(Segments[variation.start_var].start)-int(feat_start)+1)
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[variation.start_var][1])-1
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
def get_old_new_pos_deletion_2(variation,feat_start,list_segfeat_azu,feat,i):
if i==0:
pos_old=int(Segments[variation.start_var].start)-int(feat_start)+1+Features[feat].pos_start
else:
pos_old=int(Segments[variation.start_var].start)-int(feat_start)+1
if pos_old<0:
pos_old=0
if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new="1"
else:
start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat)
start_var=int(segments_on_target_genome[variation.last_seg_in_target][2])+1
pos_new=str(start_var-start_feat+1)
return [pos_old,pos_new]
def print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j):
if variation.type=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion_2(variation,feat_start,list_segfeat_azu,feat)
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tinsertion\t-\t"+variation.alt+"\t"+str(len(variation.alt))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
elif variation.type=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion_2(variation,feat_start,list_segfeat_azu,feat,i)
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tdeletion\t"+variation.ref+"\t-\t"+str(len(variation.ref))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
elif variation.type=='substitution':
[pos_old,pos_new]=get_old_new_pos_substitution_2(feat_start,variation,list_segfeat_azu,feat,j)
if len(variation.ref) == len(variation.alt): # if the substituion is between two segment of the same size, print it
size_subs=len(variation.ref)
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tsubstitution\t"+variation.ref+"\t"+variation.alt+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
else :
# if the segments of the substitution have a different size, print deletion then insertion at the same position.
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tdeletion\t"+variation.ref+"\t-\t"+str(len(variation.ref))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
line+=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tinsertion\t-\t"+variation.alt+"\t"+str(len(variation.alt))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
def print_last_deletion(variation,list_segfeat_nb,list_segfeat_azu,i,feat_start,feature,seg_seq):
pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
del_sequence=get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq)
length=len(del_sequence)
pos_new=str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
write_line(line,output_variations,False)
def print_novar(variation):
line=variation.feature_id+"\t"+variation.feature_type+"\t"+variation.chr+"\t"+str(variation.start_new)+"\t"+str(variation.stop_new)+"\t"+variation.size_new+"\t"+variation.inversion+"\t"+variation.size_diff+"\tno_var\t-\t-\t-\t-\t-\n"
write_line(line,output_variations,False)
def init_new_var(variation,type,list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature):
variation.type=type
variation.start_var=list_segfeat_nb[i]
if type=="substitution":
variation.start_on_target=list_segfeat_azu[j]
variation.ref=seg_seq[list_segfeat_nb[i]]
variation.alt=seg_seq[list_segfeat_azu[j]]
elif type=="insertion":
variation.ref="-"
variation.alt=seg_seq[list_segfeat_azu[j]]
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
variation.ref=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:]
else: # else, the deletion will always start at the start of the first segment.
variation.ref=seg_seq[list_segfeat_nb[i]]
variation.alt="-"
def continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j):
if variation.type=="substitution":
variation.ref+=seg_seq[list_segfeat_nb[i]]
variation.alt+=seg_seq[list_segfeat_azu[j]]
elif variation.type=="insertion":
variation.alt+=seg_seq[list_segfeat_azu[j]]
elif variation.type=="deletion":
variation.ref+=seg_seq[list_segfeat_nb[i]]
def get_common_segments(list1,list2):
list_output=[]
for elem in list1:
if elem in list2:
list_output.append(elem)
return list_output
def detect_segment_order_inversion(list_1,list_2):
cpt=0
i=0
list_1_common=get_common_segments(list_1,list_2)
list_2_common=get_common_segments(list_2,list_1)
list_2_common_reversed=list(reversed(list_2_common))
while i<len(list_1_common):
#if list_2_common[i]!=list_1_common[i]: pour l'option 1 qui semble fonctionner ??
# cpt+=1
#i+=1
if list_2_common_reversed[i]==list_1_common[i]: # pour l'option 3
cpt+=1
i+=1
#if cpt>len(list_1_common)/2: # if more than half of the common segments are not at the same position ?? semble fonctionner
#if list_1_common==reversed(list_2_common): fonctionne pas
if cpt>len(list_1_common)*0.9:# if more than 90% of the segments are on the same position when the lists are reversed, there is an inversion.
#print("ee")
return [True,list_1,list(reversed(list_2))]
else:
return [False,list_1,list_2]
# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand)
def detect_inversion(list_1,list_2):
# removes strand for the lists of stranded segments
list_1_unstrand=[segment_stranded[1:] for segment_stranded in list_1]
list_2_unstrand=[segment_stranded[1:] for segment_stranded in list_2]
[seg_common,same_strand_count]=compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand)
# if there is less than 10% of strand difference among the common segments (ie more than 90% of same strand), return 0, no inversion
if len(seg_common)-same_strand_count<len(seg_common)/10:
inversion=0
else:
inversion=1
# check if we have an inversion for the order of the segmetns. double inversion = no inversion.
[segment_order_inversion,list_segfeat_nb,list_segfeat_azu]=detect_segment_order_inversion(list_1_unstrand,list_2_unstrand)
if segment_order_inversion:
inversion=abs(inversion-1)
return [list_segfeat_nb,list_segfeat_azu,inversion]
def print_variations_2(first_seg,last_seg,feat,paths,seg_seq):
if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features
[variation,list_segfeat_nb,list_segfeat_azu]=create_var(feat,first_seg,last_seg,paths)
var=0 # count variations, to see if there is any
feature=Features[feat]
feat_start=feature.start
# loop to go through both paths
i=0
j=0
# detect and print variations ignoring the strands
while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
# if both segments are absent in the other genome, its a substitution
variation.last_seg_in_target=list_segfeat_azu[j]
if (variation.type=='insertion') | (variation.type=='deletion'):
# print the current variation
print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
reset_var(variation)
var+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j)
else:
# initiate substitution
init_new_var(variation,"substitution",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
i+=1;j+=1
else: # azu segment not in nb, but nb segment in azu : insertion
if (variation.type=='substitution') | (variation.type=='deletion'):
# print the current variation
print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
reset_var(variation)
var+=1
variation.last_seg_in_target=list_segfeat_azu[j]
if variation.type=='insertion':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j)
else:
# intiate insertion
init_new_var(variation,"insertion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
j+=1
elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
if (variation.type=='substitution') | (variation.type=='insertion'):
# print the current variation
print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
reset_var(variation)
var+=1
if variation.type=='deletion':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j)
else:
# intiate deletion
init_new_var(variation,"deletion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
i+=1
else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then
line="weird order change\n"
write_line(line,output_variations,False)
var+=1;i+=1;j+=1
else: # segment present in both. print the running indel if there is one
# print the current variation
if variation.type!='':
print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
var+=1
reset_var(variation)
variation.last_seg_in_target=list_segfeat_azu[j]
i+=1;j+=1
# # finish printing the current indel
# print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
# if variation.type!='':
# var+=1
# reset_var(variation)
if (variation.type!='') & (variation.type!="deletion"):
print_current_var(variation,feat_start,list_segfeat_azu,feat,i,j)
var+=1
# see if the end is missing for the first genome, ie i didn'r reach the length of the segment list for the first genome
if i<len(list_segfeat_nb)-1:
print_last_deletion(variation,list_segfeat_nb,list_segfeat_azu,i,feat_start,feature,seg_seq)
var+=1
reset_var(variation)
if var==0:
if list_segfeat_nb != list_segfeat_azu:
print("??")
print(feat,list_segfeat_nb,list_segfeat_azu)
print_novar(variation)
# not used.
def get_list_segments_missing(list_seg,segments_on_target_genome):
segments_missing=[]
for segment in list_seg:
if segment[1:] not in segments_on_target_genome:
segments_missing.append(Segments[segment[1:]])
return segments_missing
# takes a feature and a feature type, returns a list of child features that have the wanted type. currently not used.
def get_child_list(feature,child_type):
if type=="":
return feature.childs
list_childs=[]
for child in feature.childs:
if Features[child].type==child_type:
list_childs.append(child)
return list_childs