Newer
Older
from Graph_gff import Segments, Features, get_start, get_stop
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_genome, feat_id):
s_start=seg_genome[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)-1
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
s_start=seg_genome[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
stop=int(s_start)+int(f_stop)-1
return stop
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
# start position of the part of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on the genome we transfer on, to get the length of the part of the feature
start_new_genome=get_start_2(seg_start,seg_genome,feature.id)
stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id)
# start position of the part of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_new_genome-start_new_genome)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
start_new_genome=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on the genome we transfer on
stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on the genome we transfer on
proportion=";proportion="+str(stop_new_genome-start_new_genome+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_new_genome-start_new_genome+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file (gff detail)
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
feature=Features[feat]
chr=seg_genome[seg_start[1:]][0]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on the genome we transfer on
start_new_genome=get_start_2(seg_start[1:],seg_genome,feature.id)
stop_new_genome=get_stop_2(seg_stop[1:],seg_genome,feature.id)
proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
annotation=feature.annot+proportion+position
out_line=chr+"\tGrAnnot\t"+feature.type+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
# takes a list of stranded segments, returns a list of unstranded segments
def remove_strand(list_seg):
list_unstrand=list()
for seg_stranded in list_seg:
list_unstrand.append(seg_stranded[1:])
return list_unstrand
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if most of the segments in common are on the opposite strand)
def detect_inversion(list_1,list_2):
list_1_unstrand=remove_strand(list_1)
list_2_unstrand=remove_strand(list_2)
# get the list of segments in common
seg_common=list()
for seg in list_1_unstrand:
if seg in list_2_unstrand:
seg_common.append(seg)
# for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
same_strand_count=0
for seg in seg_common:
index_1=list_1_unstrand.index(seg)
index_2=list_2_unstrand.index(seg)
if list_1[index_1]==list_2[index_2]:
same_strand_count+=1
# if there is less than 10% of strand difference among the common segments (ie more than 90% of same strand), return False, no inversion
if len(seg_common)-same_strand_count<len(seg_common)/10:
return False
else:
return True
# takes a feature and a feature type, returns a list of child features that have the wanted type
def get_list_childs(feature,child_type):
list_childs=list()
for child in feature.childs:
if Features[child].type==child_type:
list_childs.append(child)
return list_childs
# print stats on the transfer : number of feature that have segments in different positions missing.
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):
# creates files with the features by category (missing first, middle, end segment, etc)
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
if Features[first].type!='gene':
feat_miss_first.write(';')
feat_miss_first.write(Features[first].note)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
if Features[middle].type!='gene':
feat_miss_middle.write(';')
feat_miss_middle.write(Features[middle].note)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
if Features[last].type!='gene':
feat_miss_last.write(';')
feat_miss_last.write(Features[last].note)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
if Features[entier].type!='gene':
feat_miss.write(';')
feat_miss.write(Features[entier].note)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
if Features[total].type!='gene':
feat_miss_total.write(';')
feat_miss_total.write(Features[total].note)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
if Features[feat].type!='gene':
feat_all.write(';')
feat_all.write(Features[feat].note)
feat_all.write("\n")
feat_all.close()
feat_ok=open('features_ok.txt','w')
feat_ok.write("\nfeatures entirely present in azucena :\n")
for ok in feature_ok :
feat_ok.write(Features[ok].annot)
if Features[ok].type!='gene':
feat_ok.write(';')
feat_ok.write(Features[ok].note)
feat_ok.write("\n")
feat_ok.close()
# prints the stats for the features (hypothetical/putative rate, by category)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
total=len(feature_missing_first)
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
total=len(feature_missing_middle)
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
total=len(feature_missing_last)
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
total=len(feature_missing_all)
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
total=len(feature_missing_total)
print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
total=len(feature_ok)
print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
total=len(list_feature)
print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
# delete the files created for the stats
command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
subprocess.run(command,shell=True)
# outputs each fragment of th feature in a gff
def gff_detail(list_seg,seg_genome,feat,file_out_detail):
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
# etc.
# at the end of the loop, prints the last part of the fragment.
feat_start="empty" # stocks the first segment of the feature
seg_start="empty" # stocks the first segment of the current part of the feature
#seg_stop="empty" # stocks the last segment of the current part of the feature
for i in range(0,len(list_seg)):
if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
if feat_start=="empty":
feat_start=list_seg[i]
if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=list_seg[i]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
seg_start="empty"
#seg_stop="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last part of the feature in the end
if list_seg[i][1:] in seg_genome:
out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
# count the number of features not transfered because they are too big or too small on the new genome
# count the number of features absent on the new genome
if (first_seg!='0') & (last_seg!='0'):
start2=get_start_2(first_seg,seg_genome,feat)
stop2=get_stop_2(last_seg,seg_genome,feat)
size_on_new_genome=int(stop2)-int(start2)+1
size_diff=abs(size_on_new_genome-Features[feat].size)
# if the gene on the new genome is not 2 times bigger or smaller than the original, write its line in the gff
if not ((size_on_new_genome>Features[feat].size*n) | (size_on_new_genome<Features[feat].size/n)) :
chr=seg_genome[first_seg][0]
strand=list_seg[0][0:1]
nb_variants=0
for segment in list_seg:
if segment[1:] not in seg_genome: # if a segment is absent, it is either a deletion of a substitution. insertions are not considered here.
nb_variants+=1
annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)
line=chr+" GrAnnot "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+" . "+strand+" . "+annotation+"\n"
file_out.write(line)
else:
bad_size+=1
else :
absent_features+=1
return list([bad_size,absent_features])
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
# outputs the detail of the variations on the new genome for the feature
def print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate):
string_print=""
cpt_print=0
# list of size difference, to get the average size difference
diff_list=list()
list_var_temp=[0,0,0,0,0]
# create type_n the index of the list of the lists in list_var. for the stats of the number of variation by type of feature.
type=Features[feat].type
match type:
case "gene":
type_n=0
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "mRNA":
type_n=1
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "intron":
type_n=2
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "exon":
type_n=3
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "three_prime_UTR":
type_n=4
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "five_prime_UTR":
type_n=5
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
case "CDS":
type_n=6
list_var[3][type_n]+=1
list_var_temp[3]+=1
list_var[4][type_n]+=Features[feat].size
list_var_temp[4]+=Features[feat].size
# get the list of the segments in a feature that are missing in the genome
segments_missing=list()
for segment in list_seg:
if segment[1:] not in seg_genome:
segments_missing.append(Segments[segment[1:]])
# if there are missing segments, print them
if (first_seg!='0') & (last_seg!='0'):
var=0 # count variations, to see if there is any
feature=Features[feat]
feat_start=feature.start
# get the lengths of the feature, on the original genome and on the new one
start_new_genome=get_start_2(first_seg,seg_genome,feat)
stop_new_genome=get_stop_2(last_seg,seg_genome,feat)
size_new_genome=int(stop_new_genome)-int(start_new_genome)+1
size_diff=str(size_new_genome-feature.size)
diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
#line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+size_diff+"\n"
#file_out2.write(line)
# find the path in azucena.
first_strand=seg_genome[first_seg][3]
first_seg_stranded=first_strand+first_seg
last_strand=seg_genome[last_seg][3]
last_seg_stranded=last_strand+last_seg
id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
list_segfeat_nb=Features[feat].segments_list
# loop to go through both paths
i=0
j=0
last=''
last_taille=0
last_seq=''
last_start=''
#pos=''
last_in_azu=''
# check if there is an inversion
inversion=detect_inversion(list_segfeat_nb,list_segfeat_azu)
if inversion:
inverted="1"
else:
inverted="0"
#if detect_inversion(list_segfeat_nb,list_segfeat_azu):
# line="inversion detected\n"
# file_out2.write(line)
# remove the strands from the lists of segments :
list_segfeat_nb=remove_strand(list_segfeat_nb)
list_segfeat_azu=remove_strand(list_segfeat_azu)
# detect and print variations ignoring the strands
while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
# is both segments are absent in the other genome, its a substitution
last_in_azu=list_segfeat_azu[j]
# print if we had an insertion or deletion running
if last=='insertion':
pos_old=str(int(Segments[last_start].start)-int(feat_start)+1)
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_start][1])-1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[1][type_n]+=1
list_var_temp[1]+=1
var+=1
elif last=='deletion':
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_in_azu][2])+1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[0][type_n]+=1
list_var_temp[0]+=1
var+=1
last='';last_taille=0;last_seq='';last_start=''
# print the substitution
# substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[list_segfeat_azu[j-1]][2])+1
pos_new=str(start_var-start_feat+1)
# start du fragment (var) - start du gène +1
# start de la var (donc fin du segment d'avant +1) = seg_genome[id_seg][2]+1
# -> trouver le segment d'avant -> j-1.
# start du gène sur azu -> get_start_2 premier_seg, seg_genome, feature_id
# feat_id = feat
# seg_genome = seg_gneome
# premier_seg sur azu -> list_segfeat_azu[0]
# seg_genome donne les positions sur azu
# seg_genome[seg]=list([ch,start,stop,strand])
if len(seg_seq[list_segfeat_nb[i]]) == len(seg_seq[list_segfeat_azu[j]]): # if the substituion is between two segment of the same size, print it
size_subs=len(seg_seq[list_segfeat_nb[i]])
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tsubstitution\t"+seg_seq[list_segfeat_nb[i]]+"\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
else :
# if the segments of the substitution have a different size, print deletion then insertion at the same position.
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+seg_seq[list_segfeat_nb[i]]+"\t-\t"+str(len(seg_seq[list_segfeat_nb[i]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
line+=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(len(seg_seq[list_segfeat_azu[j]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
var+=1
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[2][type_n]+=1
list_var_temp[2]+=1
var+=1
# go to the next segments on the paths
i+=1;j+=1
else: # azu segment not in nb, but nb segment in azu : insertion
if last=='deletion':
if i==0:
pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
else:
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_in_azu][2])+1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[0][type_n]+=1
list_var_temp[0]+=1
var+=1;last='';last_taille=0;last_start='';last_seq=''
last_in_azu=list_segfeat_azu[j]
if last=='insertion':
#last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
else:
last='insertion'
last_seq=seg_seq[list_segfeat_azu[j]]
#last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
last_start=list_segfeat_nb[i]
j+=1
elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
if last=='insertion':
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_start][1])-1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[1][type_n]+=1
list_var_temp[1]+=1
var+=1;last='';last_start='';last_taille=0;last_seq=''
if last=='deletion':
#last_taille+=Segments[list_segfeat_nb[i]].size
last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
else:
last='deletion'
last_start=list_segfeat_nb[i]
#last_taille=Segments[list_segfeat_nb[i]].size
if i==0: # if the deletion is at the start of the feature, the deletion doesnt start at the start at the first segment :
#use pos_start, position of the feature on its first segment
last_seq=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:]
else: # else, the deletion will always start at the start of the first segment.
last_seq=seg_seq[list_segfeat_nb[i]]
#if i==0:
# start=1
i+=1
else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then
line="weird order change"
string_print+=line
cpt_print+=1
#file_out2.write(line)
var+=1;i+=1;j+=1
else: # segment present in both. print the running indel if there is one
if last=='insertion':
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_start][1])-1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[1][type_n]+=1
list_var_temp[1]+=1
var+=1
elif last=='deletion':
if i==0:
pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
else:
pos_old=int(Segments[last_start].start)-int(feat_start)+1
if last_in_azu=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new="1"
else :
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_in_azu][2])+1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[0][type_n]+=1
list_var_temp[0]+=1
var+=1
last_in_azu=list_segfeat_azu[j]
last='';last_taille=0;last_start='';last_seq='';i+=1;j+=1
# finish printing the current indel
if last=='insertion':
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_start][1])-1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(last_taille)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[1][type_n]+=1
list_var_temp[1]+=1
var+=1
elif last=='deletion':
pos_old=int(Segments[last_start].start)-int(feat_start)+1
start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
start_var=int(seg_genome[last_in_azu][2])+1
pos_new=str(start_var-start_feat+1)
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[0][type_n]+=1
list_var_temp[0]+=1
var+=1
# see if the end is missing for one of the two genomes
if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
if (i<len(list_segfeat_nb)-1):
if not (j>=len(list_segfeat_azu)-1):
print('pb')
# length of the deletion = position of the end of the feature - position of the start of the deletion
#length=feature.stop-Segments[list_segfeat_nb[i]].start # pas bon, on prend le dernier segment en entier. len(del_sequence) plus sur.
pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
del_sequence=""
for k in range(i,len(list_segfeat_nb)):
if k==len(list_segfeat_nb):
del_sequence+=seg_seq[list_segfeat_nb[k]][0:feature.pos_stop]
else:
del_sequence+=seg_seq[list_segfeat_nb[k]]
length=len(del_sequence)
pos_new=str(length) # the deletion is at the end of the feature on the new genome
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
list_var[0][type_n]+=1
var+=1
elif (j<len(list_segfeat_azu)-1):
print("ins at the end ?")
# string_print+=line
# #file_out2.write("insertion at the end\n")
# list_var[1][type_n]+=1
# var+=1
if var==0:
line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(size_new_genome)+"\t"+size_diff+"\tno_var\t-\t-\t-\t-\t-\n"
string_print+=line
cpt_print+=1
#file_out2.write(line)
#else:
#line=feat+" : feature entirely absent.\n"
#file_out2.write(line)
for i in [0,1,2]:
if list_var_temp[4]!=0:
list_var_rate[i][type_n].append(list_var_temp[i]/list_var_temp[4])
# no need to return this list, it modifies the original list directly
if string_print!="":
file_out2.write(string_print)
return diff_list
# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
# create a dictionnary with the positions of the segments on the genome to transfer on
seg_genome={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
if line[3][0:1]=='>':
strand='+'
elif line[3][0:1]=='<':
strand='-'
else:
strand=''
seg=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
seg_genome[seg]=list([ch,start,stop,strand])
bed.close()
# create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=list()
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
list_path.append('-s'+segment[1:])
else:
list_path.append('s'+segment[1:])
paths[line[3]]=list_path
file_out_detail = open("azucena_chr10_detail.gff", 'w')
file_out=open(out,'w')
file_out2 = open("variations.txt", 'w')
bad_size=0
absent_features=0
# for stats by type of variation
del_gene=0;del_mrna=0;del_intron=0;del_exon=0;del_3utr=0;del_5utr=0;del_cds=0
deletions=[del_gene,del_mrna,del_intron,del_exon,del_3utr,del_5utr,del_cds]
ins_gene=0;ins_mrna=0;ins_intron=0;ins_exon=0;ins_3utr=0;ins_5utr=0;ins_cds=0
insertions=[ins_gene,ins_mrna,ins_intron,ins_exon,ins_3utr,ins_5utr,ins_cds]
sub_gene=0;sub_mrna=0;sub_intron=0;sub_exon=0;sub_3utr=0;sub_5utr=0;sub_cds=0
substitutions=[sub_gene,sub_mrna,sub_intron,sub_exon,sub_3utr,sub_5utr,sub_cds]
nb_type=[0,0,0,0,0,0,0]
length_type=[0,0,0,0,0,0,0]
list_var=[deletions,insertions,substitutions,nb_type,length_type]
deletions_rate=[list(),list(),list(),list(),list(),list(),list()]
insertions_rate=[list(),list(),list(),list(),list(),list(),list()]
substitutions_rate=[list(),list(),list(),list(),list(),list(),list()]
list_var_rate=[deletions_rate,insertions_rate,substitutions_rate]
stats=False
plot=False
# get the list of all the features to transfer from the gff
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# create objects for stats on how many segments are absent in azucena, their average length, etc
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
feature_total=list()
feature_ok=list()
for feat in list_feature:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
first_seg="0"
for segment in list_seg:
break
last_seg="0"
for segment in reversed(list_seg):
# stats on how many segments are absent in azucena, their average length, etc
# for the segments
# segment missing that is at the start of a feature - seg_first
# segment missing that is at the end of a feature - seg_last
# segment missing that is in the middle of a feature - seg_middle
# segment missing that contains the entire feature - seg_entier
# total number of genes missing - seg_total_abs
# segments not missing - seg_ok
# segments missing of not - seg_total
if (stats==True):
for segment in list_seg: # counts the segments in each category
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_genome: # the segment is missing
seg_total.append(seg_len)
if segment==list_seg[0]: # the segment is the first of the feature
if segment==list_seg[-1]: # the segment is the last of the feature
seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
else:
seg_first.append(seg_len)
elif segment==list_seg[-1]: # the segment is the last of the feature
seg_last.append(seg_len)
else:
seg_middle.append(seg_len)
seg_ok.append(seg_len)
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing ->feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
if (stats==True):
feature_total.append(feat)
if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_all.append(feat)
elif first_seg != list_seg[0][1:]: # the first segment is missing
feature_missing_first.append(feat)
elif last_seg!=list_seg[-1][1:]: # the last segment is missing
feature_missing_last.append(feat)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if segment[1:] not in seg_genome:
feature_missing_middle.append(feat)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if feat not in feature_missing_total:
feature_missing_total.append(feat)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK !!
if feat not in feature_missing_total:
feature_ok.append(feat)
# outputs each fragment of the feature in a gff
gff_detail(list_seg,seg_genome,feat,file_out_detail) # insertions !
# outputs the feature once in a gff, from the first to the last segment present on the new genome
n=2 # maximum difference size (n times bigger of smaller)
result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
bad_size+=result[0]
absent_features+=result[1]
# outputs the detail of variations for the feature :
variations_output=print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate)
diff_list+=variations_output
file_out_detail.close()
file_out.close()
file_out2.close()
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
# changer la forme de l'output des variations
#print(list_var)
#print(list_var_rate)
#print(sum(list_var_rate[1][0])/len(list_var_rate[1][0]))
if plot==True:
import seaborn as sns
import matplotlib.pyplot as plt
# set a grey background (use sns.set_theme() if seaborn version 0.11.0 or above)
sns.set(style="darkgrid")
import pandas as pd
df_deletions = pd.DataFrame(list_var_rate[0])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
df_deletions=df_deletions.T
df_deletions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']
sns.violinplot(data=df_deletions)
plt.ylim(-0.2, 0.2)
plt.show()
df_insertions = pd.DataFrame(list_var_rate[1])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
df_insertions=df_insertions.T
df_insertions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']
sns.violinplot(data=df_insertions)
plt.ylim(-0.2, 0.2)
plt.show()
df_substitutions = pd.DataFrame(list_var_rate[2])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
df_substitutions=df_substitutions.T
df_substitutions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']
sns.violinplot(data=df_substitutions)
plt.ylim(-0.2, 0.2)
plt.show()
# print stats
from statistics import median, mean
if stats==True:
print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
if len(diff_list)!=0:
print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)
# prints the stats for the segments
#print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
#print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
#print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
#print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
#print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
#print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))