Newer
Older
import ClassSegFeat as Class
# functions to create the segments and the features
# create segment
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature (stranded) to the list of features that are on the segment
feature_strand=line[10]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
# create feature
def init_feature(line):
line=line.split()
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[6]
annot=line[12]
chr=line[4]
start=line[7]
stop=line[8]
childs=list()
# add the current segment (stranded) to the list of segments that have the feature
seg=line[3][1:]
strand=line[10]
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature as a parent, add the feature un the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
# add a feature (stranded) to an existing segment
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
# add a child to an existing feature
def add_child(feat,new_child):
if feat in Features.keys():
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
# add a segment (stranded) to an existing feature
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
def set_note(id):
feat=Features[id]
if feat.type=="gene":
feat.note=feat.annot.split(';')[-1]
else:
curent=feat.parent
ann=0
while ann==0:
if Features[curent].type=="gene":
fonction=Features[curent].annot.split(';')[-1]
feat.note=fonction
ann=1
else:
curent=Features[Features[curent].parent].id
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
#set_function(feature_id)
else: # add the current segment to the list of segments that have the existing feature
strand=line.split()[10]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # add the current feature to the existing segment
strand=line.split()[10]
add_feature(segment_id,feature_id,strand)
for feat_id in Features:
set_note(feat_id)
file.close()
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
#start=max(1,get_start(segment,feature))
start=get_start(segment,feature)
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# generate the gff line
ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
#print(ligne)
file_out.write(ligne)
file_out.close()
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_ac, feat_id):
s_start=seg_ac[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_ac, feat_id):
s_start=seg_ac[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
stop=int(s_start)+int(f_stop)
return stop
# get the position of a piece of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,gene_start,feature,seg_ac):
# start position of the complete gene on the reference
start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of the piece of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_ac,feature):
start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac):
seg_stop=list_seg[i-1][1:]
feature=Features[feat]
chr=seg_ac[seg_start][0]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
proportion=get_proportion(seg_start,seg_stop,seg_ac,feature)
position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
# writes the gff of azucena using the gff of the graph
# create a dictionnary with the positions of the segments on azucena
seg_ac={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
seg_ac[s_id]=list([ch,start,stop])
bed.close()
gff=open(gff,'r')
file_out = open(out, 'w')
file_out_alt=open("azucena_chr10_alt.gff",'w')
file_out2 = open("segments_manquants.txt", 'w')
lines=gff.readlines()
diff_list=list()
stats=True
# get the list of all the features to transfer
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# stats on how many segments are absent in azucena, their average length, etc
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
feature_total=list()
feature_ok=list()
#size_seg_missing=list()
# for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
segments_missing=list()
first_seg="0"
for segment in list_seg:
if segment[1:] in seg_ac:
first_seg=segment[1:]
break
last_seg="0"
for segment in reversed(list_seg):
if segment[1:] in seg_ac:
last_seg=segment[1:]
break
# stats on how many segments are absent in azucena, their average length, etc
# for the segments
# manque au début-seg_first
# manque à la fin-seg_last
# manque au milieu-seg_middle
# début milieu et fin, donc feature entier ?-seg_entier
# nb total de segments dans un gène et pas chez azucena-seg_total_abs
# présents chez azucena-seg_ok
# segments total dans une feature (présent ou pas dans azucena)-seg_total
if (stats==True):
for segment in list_seg:
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_ac:
seg_total.append(seg_len)
segments_missing.append(Segments[segment[1:]])
#if feat not in feature_missing_total:
# feature_missing_total.append(feat)
if segment==list_seg[0]:
if segment==list_seg[-1]:
seg_entier.append(seg_len) # contient les features en un seul segment, absent chez azu.
# if feat not in feature_missing_all:
# feature_missing_all.append(feat)
else:
seg_first.append(seg_len)
# if feat not in feature_missing_first:
# feature_missing_first.append(feat)
elif segment==list_seg[-1]:
seg_last.append(seg_len)
# if feat not in feature_missing_last:
# feature_missing_last.append(feat)
else:
seg_middle.append(seg_len)
#if feat not in feature_missing_middle:
# feature_missing_middle.append(feat)
else:
seg_ok.append(seg_len)
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
# for the features :
# manque le premier->feature_missing_first
# manque le dernier->feature_missing_last
# trou au milieu->feature_missing_middle
# absent totalement (0) ->feature_missing_all
# au moins un segment qui manque->feature_missing_total
# features completes chez azucena->feature_ok
# nb features total->feature_total
if (stats==True):
feature_total.append(feat)
# premier dernier ou aucun seg présent :
if (first_seg=='0') & (last_seg=='0') : # aucun des segments de la feature n'est dans azucena.
feature_missing_all.append(feat)
elif first_seg != list_seg[0][1:]: # le premier segment est manquant chez azucena
feature_missing_first.append(feat)
elif last_seg!=list_seg[-1][1:]: # le dernier segment est manquant chez azucena
feature_missing_last.append(feat)
# parcourir les segments, voir s'ils manquent au milieu
elif (len(list_seg)!=1) & (feat not in feature_missing_all): # pour pouvoir accéder à l'avant dernier élément
for segment in list_seg[1-(len(list_seg)-2)]:
if segment not in seg_ac:
feature_missing_middle.append(feat)
break
# parcourir les segments, voir s'il en manque un n'importe où
for segment in list_seg:
if segment[1:] not in seg_ac:
if feat not in feature_missing_total:
feature_missing_total.append(feat)
break
if feat not in feature_missing_total:
feature_ok.append(feat)
# outputs each fragment of each feature, with its position on the new genome and its annotation :
# loop that goes through all the segments that have the current feature
# keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
# at the end of the loop, prints the last piece of the fragment.
gene_start="empty"
seg_start="empty"
seg_stop="empty"
for i in range(0,size_list):
if list_seg[i][1:] in seg_ac:
if gene_start=="empty":
gene_start=list_seg[i][1:]
if seg_start=="empty":
seg_start=list_seg[i][1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
seg_start="empty"
seg_stop="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_ac:
out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
# outputs each feature once, from the first to the last segment present on the new genome and its annotation :
if (first_seg!='0') & (last_seg!='0'):
chr=seg_ac[first_seg][0]
strand=list_seg[i-1][0:1]
ligne=chr+" graph_gff "+Features[feat].type+" "+str(get_start_2(first_seg,seg_ac,feat))+" "+str(get_stop_2(last_seg,seg_ac,feat))+" . "+strand+" . "+Features[feat].annot+"\n"
file_out_alt.write(ligne)
# outputs the detail of the missing fragments for each feature :
for segment in list_seg:
if segment[1:] not in seg_ac:
segments_missing.append(Segments[segment[1:]])
if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
#print(first_seg,last_seg)
start2=get_start_2(first_seg,seg_ac,feat)
stop2=get_stop_2(last_seg,seg_ac,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=size_on_genome-Features[feat].size
diff_list.append(int(size_diff))
ligne=feat+" : "+str(len(segments_missing))+" variations. différence de taille (taille dans le nouveau génome-taille dans nipponbare) : "+str(size_diff)+"\n"
#taille du gene à l'origine : "+str(Features[feat].size)+", taille du gene sur ce génome : "+str(size_on_genome)+"\n"
file_out2.write(ligne)
for segment in segments_missing:
if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
ligne="premier segment du gène manquant.\n"
elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
ligne="dernier segment du gène manquant.\n"
elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
ligne="variation au milieu du gène, de taille "+str(segment.size)+"\n"
# voir si c'est substitution ou indel
# print ce qu'il y a en face dans nb
#size_seg_missing.append(segment.size)
file_out2.write(ligne)
file_out2.write("\n")
else:
ligne=feat+": aucune variation.\n\n"
file_out2.write(ligne)
#print("difference moyenne : "+str(sum(diff_list)/len(diff_list)))
#import matplotlib.pyplot as plt
#plt.hist(diff_list, range=(-1000,1000))
#plt.show()
#print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
file_out.close()
gff.close()
file_out2.close()
file_out_alt.close()
# print stats
from statistics import median, mean
if stats==True:
# prints the stats for the segments
#print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
#print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
#print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
#print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
#print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
#print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
# creates files with the features by category
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
if Features[first].type!='gene':
feat_miss_first.write(';')
feat_miss_first.write(Features[first].note)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
if Features[middle].type!='gene':
feat_miss_middle.write(';')
feat_miss_middle.write(Features[middle].note)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
if Features[last].type!='gene':
feat_miss_last.write(';')
feat_miss_last.write(Features[last].note)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
if Features[entier].type!='gene':
feat_miss.write(';')
feat_miss.write(Features[entier].note)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
if Features[total].type!='gene':
feat_miss_total.write(';')
feat_miss_total.write(Features[total].note)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
if Features[feat].type!='gene':
feat_all.write(';')
feat_all.write(Features[feat].note)
feat_all.write("\n")
feat_all.close()
feat_ok=open('features_ok.txt','w')
feat_ok.write("\nfeatures entirely present in azucena :\n")
for ok in feature_ok :
feat_ok.write(Features[ok].annot)
if Features[ok].type!='gene':
feat_ok.write(';')
feat_ok.write(Features[ok].note)
feat_ok.write("\n")
feat_ok.close()
# prints the stats for the features (hypothetical/putative rate, by category)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_missing_first.txt"))
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/len(feature_missing_first),2),"% hypothetical or putative.")
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_missing_middle.txt"))
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/len(feature_missing_middle),2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_missing_last.txt"))
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/len(feature_missing_last),2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_missing.txt"))
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/len(feature_missing_all),2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_missing_total.txt"))
print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/len(feature_missing_total),2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < features_ok.txt"))
print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/len(feature_ok),2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
total=int(subprocess.getoutput("wc -l < all_features.txt"))
print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/len(list_feature),2),"% hypothetical or putative.")
command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
subprocess.run(command,shell=True)