Newer
Older
import ClassSegFeat as Class
# functions to create the segments and the features
# create segment
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature (stranded) to the list of features that are on the segment
feature_strand=line[10]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
# create feature
def init_feature(line):
line=line.split()
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[6]
annot=line[12]
chr=line[4]
start=line[7]
stop=line[8]
childs=list()
# add the current segment (stranded) to the list of segments that have the feature
seg=line[3][1:]
strand=line[10]
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature as a parent, add the feature un the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
# add a feature (stranded) to an existing segment
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
# add a child to an existing feature
def add_child(feat,new_child):
if feat in Features.keys():
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
# add a segment (stranded) to an existing feature
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
else: # add the current segment to the list of segments that have the existing feature
strand=line.split()[10]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # add the current feature to the existing segment
strand=line.split()[10]
add_feature(segment_id,feature_id,strand)
file.close()
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
#start=max(1,get_start(segment,feature))
start=get_start(segment,feature)
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# generate the gff line
ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
#print(ligne)
file_out.write(ligne)
file_out.close()
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_ac, feat_id):
s_start=seg_ac[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_ac, feat_id):
s_start=seg_ac[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
stop=int(s_start)+int(f_stop)
return stop
# get the position of a piece of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,gene_start,feature,seg_ac):
# start position of the complete gene on the reference
start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of the piece of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_ac,feature):
start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac):
seg_stop=list_seg[i-1][1:]
feature=Features[feat]
chr=seg_ac[seg_start][0]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
proportion=get_proportion(seg_start,seg_stop,seg_ac,feature)
position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
# writes the gff of azucena using the gff of the graph
# create a dictionnary with the positions of the segments on azucena
seg_ac={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3][1:]
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
ch=line[0]
start=line[1]
stop=line[2]
seg_ac[s_id]=list([ch,start,stop])
bed.close()
gff=open(gff,'r')
file_out = open(out, 'w')
lines=gff.readlines()
# get the list of all the features to transfer
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# stats on how many segments are absent in azucena, their average length, etc
stats=False
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_total=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
# for each feature, get list of the segments where it is
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
# stats on how many segments are absent in azucena, their average length, etc
if stats==True:
for segment in list_seg:
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_ac:
seg_total.append(seg_len)
if feat not in feature_missing_total:
feature_missing_total.append(feat)
if segment==list_seg[0]:
if segment==list_seg[size_list-1]:
seg_entier.append(seg_len)
if feat not in feature_missing_all:
feature_missing_all.append(feat)
else:
seg_first.append(seg_len)
if feat not in feature_missing_first:
feature_missing_first.append(feat)
elif segment==list_seg[size_list-1]:
seg_last.append(seg_len)
if feat not in feature_missing_last:
feature_missing_last.append(feat)
else:
seg_middle.append(seg_len)
if feat not in feature_missing_middle:
feature_missing_middle.append(feat)
else:
seg_ok.append(seg_len)
gene_start="empty"
seg_start="empty"
seg_stop="empty"
# loop that goes through all the segments that have the current feature
# keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
# at the end of the loop, prints the last piece of the fragment.
for i in range(0,size_list):
if list_seg[i][1:] in seg_ac:
if gene_start=="empty":
gene_start=list_seg[i][1:]
if seg_start=="empty":
seg_start=list_seg[i][1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
seg_start="empty"
seg_stop="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_ac:
out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
file_out.close()
gff.close()
# print stats
from statistics import median, mean
if stats==True:
print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
multiple_3=0
for seg in seg_middle:
if seg%3==0:multiple_3+=1
print(multiple_3,"segments missing in the middle of a feature, of length=3k")
print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.")
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.")
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.")
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.")
print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.")
print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.")
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing a segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
feat_all.write("\n")
feat_all.close()
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
# writes the gff of azucena using the gff of the graph
def genome_gff_test(pos_seg, gff, out):
print("generation of the genome's gff ")
# create a dictionnary with the positions of the segments on azucena
seg_ac={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
seg_ac[s_id]=list([ch,start,stop])
bed.close()
gff=open(gff,'r')
file_out = open(out, 'w')
lines=gff.readlines()
# get the list of all the features to transfer
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# for each feature, get list of the segments where it is
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
gene_start="empty"
seg_start="empty"
seg_stop="empty"
# prints all gene fragments.
for i in range(0,size_list):
if (list_seg[i][1:] in seg_ac) & (1==1):
feature=Features[feat]
seg_start=list_seg[i][1:]
seg_stop=list_seg[i][1:]
chr=seg_ac[seg_start][0]
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
out_line=chr+" "+str(start_azu)+" "+str(stop_azu)+" "+seg_start+" "+seg_stop+" "+feature.id+"\n"
file_out.write(out_line)
file_out.close()
gff.close()