Newer
Older
import ClassSegFeat as Class
# functions to create the segments and the features
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature to the list of features that are on the segment
feature_strand=line[10]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
def init_feature(line):
line=line.split()
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[6]
annot=line[12]
chr=line[4]
start=line[7]
stop=line[8]
childs=list()
# add the current segment to the list of segments that have the feature
seg=line[3][1:]
strand=line[10]
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature has a parent, add the current feature in the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
def add_child(feat,new_child):
if feat in Features.keys(): # if the parent feature exists
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# create a note for the child features that do not have annotation.
# the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
feat.note=feat.annot.split(';')[-1]
else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
# we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
annot_found=False
while annot_found==False:
if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
note=Features[curent].annot.split(';')[-1]
feat.note=note
annot_found=True
else: # if we didn't find the gene, we go back to the current feature's parent until we find it
curent=Features[Features[curent].parent].id
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the file with the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
else: # if it exists, add the current segment to the list of segments that have the existing feature
strand=line.split()[10]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # if it exists, add the current feature to the list of features on the existing segment
strand=line.split()[10]
add_feature(segment_id,feature_id,strand)
# for all the features, add the note (information on the function of the feature)
for feat_id in Features:
set_note(feat_id)
file.close()
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# write the gff line in the output file
line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
file_out.write(line)
file_out.close()
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_genome, feat_id):
s_start=seg_genome[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
s_start=seg_genome[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
stop=int(s_start)+int(f_stop)
return stop
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
# start position of the entire feature on the reference
start_gene_start=Segments[feat_start].start+get_start(feat_start,feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
seg_stop=list_seg[i-1][1:]
feature=Features[feat]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start,seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
proportion=get_proportion(seg_start,seg_stop,seg_genome,feature)
position=get_position(seg_start,seg_stop,feat_start,feature,seg_genome)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
# create a dictionnary with the positions of the segments on the genome to transfer on
seg_genome={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
bed.close()
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# create a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"):
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=list()
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1=='<']:
list_path.append('-s'+segment[1:])
else:
list_path.append('s'+segment[1:])
#couple=['s'+segment[1:],segment[0:1]]
##list_path.append(couple)
#list_path.append('s'+segment[1:])
paths[line[3]]=list_path
gff=open(gff,'r')
file_out_detail = open("azucena_chr10_detail.gff", 'w')
file_out=open(out,'w')
file_out2 = open("segments_manquants.txt", 'w')
lines=gff.readlines()
diff_list=list()
stats=False
bad_size=0
absent_features=0
# get the list of all the features to transfer from the gff
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# create objects for stats on how many segments are absent in azucena, their average length, etc
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
feature_total=list()
feature_ok=list()
#size_seg_missing=list()
# for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
first_seg="0"
for segment in list_seg:
first_seg=segment#[1:]
break
last_seg="0"
for segment in reversed(list_seg):
last_seg=segment#[1:]
# stats on how many segments are absent in azucena, their average length, etc
# for the segments
# segment missing that is at the start of a feature - seg_first
# segment missing that is at the end of a feature - seg_last
# segment missing that is in the middle of a feature - seg_middle
# segment missing that contains the entire feature - seg_entier
# total number of genes missing - seg_total_abs
# segments not missing - seg_ok
# segments missing of not - seg_total
if (stats==True):
for segment in list_seg: # counts the segments in each category
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_genome: # the segment is missing
seg_total.append(seg_len)
segments_missing.append(Segments[segment[1:]])
if segment==list_seg[0]: # the segment is the first of the feature
if segment==list_seg[-1]: # the segment is the last of the feature
seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
else:
seg_first.append(seg_len)
elif segment==list_seg[-1]: # the segment is the last of the feature
seg_last.append(seg_len)
else:
seg_middle.append(seg_len)
seg_ok.append(seg_len)
# for the features :
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing ->feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
if (stats==True):
feature_total.append(feat)
if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_all.append(feat)
elif first_seg[1:] != list_seg[0][1:]: # the first segment is missing
feature_missing_first.append(feat)
elif last_seg[1:]!=list_seg[-1][1:]: # the last segment is missing
feature_missing_last.append(feat)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
feature_missing_middle.append(feat)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if feat not in feature_missing_total:
feature_missing_total.append(feat)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK !!
if feat not in feature_missing_total:
feature_ok.append(feat)
# outputs each fragment of each feature, with its position on the new genome and its annotation :
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
# etc.
# at the end of the loop, prints the last part of the fragment.
feat_start="empty" # stocks the first segment of the feature
seg_start="empty" # stocks the first segment of the current part of the feature
#seg_stop="empty" # stocks the last segment of the current part of the feature
for i in range(0,size_list):
if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
if feat_start=="empty":
feat_start=list_seg[i][1:]
if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=list_seg[i][1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
seg_start="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_genome:
out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
# outputs each feature once, from the first to the last segment present on the new genome and its annotation : ADD THE SIZE PARAMETER
if (first_seg!='0') & (last_seg!='0'):
start2=get_start_2(first_seg[1:],seg_genome,feat)
stop2=get_stop_2(last_seg[1:],seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=abs(size_on_genome-Features[feat].size)
if not ((size_on_genome>Features[feat].size*2) | (size_on_genome<Features[feat].size/2)) :
chr=seg_genome[first_seg[1:]][0]
strand=list_seg[i-1][0:1]
start2=get_start_2(first_seg[1:],seg_genome,feat)
stop2=get_stop_2(last_seg[1:],seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=abs(size_on_genome-Features[feat].size)
nb_variants=0
for segment in list_seg:
if segment[1:] not in seg_genome:
nb_variants+=1
annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)
line=chr+" graph_gff "+Features[feat].type+" "+str(get_start_2(first_seg[1:],seg_genome,feat))+" "+str(get_stop_2(last_seg[1:],seg_genome,feat))+" . "+strand+" . "+annotation+"\n"
file_out.write(line)
else:
bad_size+=1
else :
absent_features+=1
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
if (first_seg!='0') & (last_seg!='0'):
# find the path in azucena.
#print(list.index(nom)
id_first_seg=paths["CM020642.1_Azucena_chromosome10"].index(first_seg)
id_last_seg=paths["CM020642.1_Azucena_chromosome10"].index(last_seg)
list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][1546:1634]
list_segfeat_nb=Features[feat].segments_list
debase=False
if debase==True:
# debut
if list_segfeat_nb.index(first_seg)==0:
print("le debut c bon")
else:
print("manque",list_segfeat_nb.index(first_seg)+1,"bout au début")
# milieu
for i in range(1,len(list_segfeat_nb)):
if list_segfeat_nb[i] not in list_segfeat_azu:
print("manque un bout au milieu",list_segfeat_nb[i])
# fin
if list_segfeat_nb.index(last_seg)==(len(list_segfeat_nb)):
print("la fin c bon")
else:
print("manque",len(list_segfeat_nb)-list_segfeat_nb.index(last_seg),"bout à la fin")
# boucles parcourir les deux listes.
donei=False
donej=False
i=0
j=0
last=''
last_taille=0
while ((donei==False) & (donej==False)):
#for i in range(0,len(list_segfeat_nb)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # si on a une différence entre les deux listes : un est pas chez l'autre.
if list_segfeat_azu[j] not in list_segfeat_nb: # seg azu pas chez nb
if list_segfeat_nb[i] not in list_segfeat_azu: # seg nb pas dans azu
# si aucun des deux est dans l'autre, on a une substitution.
if last=='insertion':
print('insertion de taille',last_taille)
elif last=='deletion':
print('deletion de taille',last_taille)
last=''
last_taille=0
print('substitution de',list_segfeat_nb[i][1:],'de taille',Segments[list_segfeat_nb[i][1:]].size,"par",list_segfeat_azu[j][1:],'de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1)
if i<len(list_segfeat_nb)-1:
i+=1
else:
donei=True
#print('i1')
if j<len(list_segfeat_azu)-1:
j+=1
else:
donej=True
#print('j1')
else: # seg azu pas dans nb, mais seg nb dans azu. insertion
if last=='insertion':
last_taille+=int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1
elif last=='deletion':
print('deletion de taille',last_taille)
last=''
last_taille=0
else:
last='insertion'
last_taille=int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1
#print('insertion de',list_segfeat_azu[j][1:],'de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1)
if j<len(list_segfeat_azu)-1:
j+=1
else:
donej=True
#print('j2')
elif list_segfeat_nb[i] not in list_segfeat_azu: # seg nb pas chez azu, mais seg azu dans nb. deletion
if last=='deletion':
last_taille+=Segments[list_segfeat_nb[i][1:]].size
elif last=='insertion':
print('insertion de taille',last_taille)
last=''
last_taille=0
else:
last='deletion'
last_taille=Segments[list_segfeat_nb[i][1:]].size
#print('deletion de',list_segfeat_nb[i][1:],'de taille',Segments[list_segfeat_nb[i][1:]].size)
if i<len(list_segfeat_nb)-1:
i+=1
else:
donei=True
#print('i2')
else :
print("changement d'ordre bizarre")
if i<len(list_segfeat_nb)-1:
i+=1
else:
donei=True
#print('i3')
if j<len(list_segfeat_azu)-1:
j+=1
else:
donej=True
#print('j3')
else:
#print("segment",list_segfeat_nb[i][1:],'ok')
if last=='insertion':
print('insertion de taille',last_taille)
elif last=='deletion':
print('deletion de taille',last_taille)
last=''
last_taille=0
if i<len(list_segfeat_nb)-1:
i+=1
else:
donei=True
#print('i4')
if j<len(list_segfeat_azu)-1:
j+=1
else:
donej=True
#print('j4')
# voir s'il manque la fin pour un des deux
if last=='insertion':
print('insertion de taille',last_taille)
elif last=='deletion':
print('deletion de taille',last_taille)
if (i==len(list_segfeat_nb)-1) & (donei==True):
if (j!=len(list_segfeat_azu)-1) & (donej==False): # taille à revoir
print('insertion de',list_segfeat_azu[j+1:len(list_segfeat_azu)-1][1:],'à la fin, de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1,'(le segment, pas l\'insertion)')
else:
print('deletion de',list_segfeat_nb[i:],'à la fin, de taille',Segments[list_segfeat_nb[i][1:]].size,'(le segment, pas la deletion)') # taille à revoir
# au début et à la fin, si on a une délétion, on peut que donner la taille du segment, pas la taille de la délétion.
# j'ai aucune idée de comment régler ça simplement...
# dans le segment ou la feature, garder l'info de où sur le segment ça démarre.....
# outputs the detail of the missing fragments for each feature :
# get the list of the segments in a feature that are missing in the genome
for segment in list_seg:
segments_missing.append(Segments[segment[1:]])
if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
# get the lengths of the feature, on the original genome or on the new one
start2=get_start_2(first_seg[1:],seg_genome,feat)
stop2=get_stop_2(last_seg[1:],seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=size_on_genome-Features[feat].size
diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
line=feat+" : "+str(len(segments_missing))+" variations. length difference (length in the new genome-length in the original genome) : "+str(size_diff)+"\n"
# length of the original feature : "+str(Features[feat].size)+", length of the feature on this genome : "+str(size_on_genome)+"\n"
file_out2.write(line)
# for each segment missing, see where it is on the feature (start, middle, end)
for segment in segments_missing:
if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
line="variation in the middle of the feature, of length "+str(segment.size)+"\n"
# check if its a substitution of an indel
# if its a substitution ou a deletion, print what it is replacing in the original feature
#size_seg_missing.append(segment.size)
file_out2.write("\n")
elif len(segments_missing)!=0:
line=feat+": feature entirely absent\n\n"
file_out2.write(line)
line=feat+": no variation.\n\n"
file_out2.write(line)
#import matplotlib.pyplot as plt
#plt.hist(diff_list, range=(-1000,1000))
#plt.show()
#print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
file_out_detail.close()
file_out.close()
gff.close()
if stats==True:
print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
# print stats
from statistics import median, mean
if stats==True:
# prints the stats for the segments
#print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
#print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
#print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
#print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
#print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
#print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
# creates files with the features by category (missing first, middle, end segment, etc)
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
if Features[first].type!='gene':
feat_miss_first.write(';')
feat_miss_first.write(Features[first].note)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
if Features[middle].type!='gene':
feat_miss_middle.write(';')
feat_miss_middle.write(Features[middle].note)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
if Features[last].type!='gene':
feat_miss_last.write(';')
feat_miss_last.write(Features[last].note)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
if Features[entier].type!='gene':
feat_miss.write(';')
feat_miss.write(Features[entier].note)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
if Features[total].type!='gene':
feat_miss_total.write(';')
feat_miss_total.write(Features[total].note)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
if Features[feat].type!='gene':
feat_all.write(';')
feat_all.write(Features[feat].note)
feat_all.write("\n")
feat_all.close()
feat_ok=open('features_ok.txt','w')
feat_ok.write("\nfeatures entirely present in azucena :\n")
for ok in feature_ok :
feat_ok.write(Features[ok].annot)
if Features[ok].type!='gene':
feat_ok.write(';')
feat_ok.write(Features[ok].note)
feat_ok.write("\n")
feat_ok.close()
# prints the stats for the features (hypothetical/putative rate, by category)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
total=len(feature_missing_first)
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
total=len(feature_missing_middle)
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
total=len(feature_missing_last)
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
total=len(feature_missing_all)
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
total=len(feature_missing_total)
print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
total=len(feature_ok)
print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
total=len(list_feature)
print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
subprocess.run(command,shell=True)