Newer
Older
import ClassSegFeat as Class
# functions to create the segments and the features
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature to the list of features that are on the segment
feature_strand=line[10]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
def init_feature(line):
line=line.split()
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[6]
annot=line[12]
chr=line[4]
start=line[7]
stop=line[8]
childs=list()
# add the current segment to the list of segments that have the feature
seg=line[3][1:]
strand=line[10]
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature has a parent, add the current feature in the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
def add_child(feat,new_child):
if feat in Features.keys(): # if the parent feature exists
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# create a note for the child features that do not have annotation.
# the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
feat.note=feat.annot.split(';')[-1]
else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
# we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
annot_found=False
while annot_found==False:
if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
note=Features[curent].annot.split(';')[-1]
feat.note=note
annot_found=True
else: # if we didn't find the gene, we go back to the current feature's parent until we find it
curent=Features[Features[curent].parent].id
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the file with the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
else: # if it exists, add the current segment to the list of segments that have the existing feature
strand=line.split()[10]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # if it exists, add the current feature to the list of features on the existing segment
strand=line.split()[10]
add_feature(segment_id,feature_id,strand)
# for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
for feat_id in Features:
feat=Features[feat_id]
set_note(feat.id)
feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)
file.close()
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# write the gff line in the output file
line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
file_out.write(line)
file_out.close()
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
s_start=seg_genome[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
stop=int(s_start)+int(f_stop)
return stop
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
feature=Features[feat]
chr=seg_genome[seg_start[1:]][0]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start[1:],seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop[1:],seg_genome,feature.id)
proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):
# creates files with the features by category (missing first, middle, end segment, etc)
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
if Features[first].type!='gene':
feat_miss_first.write(';')
feat_miss_first.write(Features[first].note)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
if Features[middle].type!='gene':
feat_miss_middle.write(';')
feat_miss_middle.write(Features[middle].note)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
if Features[last].type!='gene':
feat_miss_last.write(';')
feat_miss_last.write(Features[last].note)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
if Features[entier].type!='gene':
feat_miss.write(';')
feat_miss.write(Features[entier].note)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
if Features[total].type!='gene':
feat_miss_total.write(';')
feat_miss_total.write(Features[total].note)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
if Features[feat].type!='gene':
feat_all.write(';')
feat_all.write(Features[feat].note)
feat_all.write("\n")
feat_all.close()
feat_ok=open('features_ok.txt','w')
feat_ok.write("\nfeatures entirely present in azucena :\n")
for ok in feature_ok :
feat_ok.write(Features[ok].annot)
if Features[ok].type!='gene':
feat_ok.write(';')
feat_ok.write(Features[ok].note)
feat_ok.write("\n")
feat_ok.close()
# prints the stats for the features (hypothetical/putative rate, by category)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
total=len(feature_missing_first)
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
total=len(feature_missing_middle)
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
total=len(feature_missing_last)
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
total=len(feature_missing_all)
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
total=len(feature_missing_total)
print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
total=len(feature_ok)
print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
total=len(list_feature)
print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
subprocess.run(command,shell=True)
def gff_detail(list_seg,seg_genome,feat,file_out_detail):
# outputs each fragment of each feature, with its position on the new genome and its annotation :
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
# etc.
# at the end of the loop, prints the last part of the fragment.
size_list=len(list_seg)
feat_start="empty" # stocks the first segment of the feature
seg_start="empty" # stocks the first segment of the current part of the feature
#seg_stop="empty" # stocks the last segment of the current part of the feature
for i in range(0,size_list):
if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
if feat_start=="empty":
feat_start=list_seg[i]
if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=list_seg[i]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
seg_start="empty"
#seg_stop="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_genome:
out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
file_out_detail.write(out_line)
def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
# outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
bad_size=0
absent_features=0
if (first_seg!='0') & (last_seg!='0'):
start2=get_start_2(first_seg,seg_genome,feat)
stop2=get_stop_2(last_seg,seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=abs(size_on_genome-Features[feat].size)
if not ((size_on_genome>Features[feat].size*n) | (size_on_genome<Features[feat].size/n)) : # si le nouveau gene est pas 2x plus grand ou plus petit que l'original
chr=seg_genome[first_seg][0]
strand=list_seg[0][0:1]
start2=get_start_2(first_seg,seg_genome,feat)
stop2=get_stop_2(last_seg,seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=abs(size_on_genome-Features[feat].size)
nb_variants=0
for segment in list_seg:
if segment[1:] not in seg_genome:
nb_variants+=1
annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)
line=chr+" graph_gff "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+" . "+strand+" . "+annotation+"\n"
file_out.write(line)
else:
bad_size+=1
else :
absent_features+=1
return list([bad_size,absent_features])
# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
# create a dictionnary with the positions of the segments on the genome to transfer on
seg_genome={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
if line[3][0:1]=='>':
strand='+'
elif line[3][0:1]=='<':
strand='-'
else:
strand=''
seg=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
seg_genome[seg]=list([ch,start,stop,strand])
bed.close()
# create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"):
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"):
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=list()
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
list_path.append('-s'+segment[1:])
else:
list_path.append('s'+segment[1:])
#couple=['s'+segment[1:],segment[0:1]]
##list_path.append(couple)
#list_path.append('s'+segment[1:])
paths[line[3]]=list_path
file_out_detail = open("azucena_chr10_detail.gff", 'w')
file_out=open(out,'w')
file_out2 = open("segments_manquants.txt", 'w')
diff_list=list()
bad_size=0
absent_features=0
# get the list of all the features to transfer from the gff
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# create objects for stats on how many segments are absent in azucena, their average length, etc
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
feature_total=list()
feature_ok=list()
# for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
for feat in list_feature:
list_seg=Features[feat].segments_list
first_seg="0"
for segment in list_seg:
break
last_seg="0"
for segment in reversed(list_seg):
# stats on how many segments are absent in azucena, their average length, etc
# for the segments
# segment missing that is at the start of a feature - seg_first
# segment missing that is at the end of a feature - seg_last
# segment missing that is in the middle of a feature - seg_middle
# segment missing that contains the entire feature - seg_entier
# total number of genes missing - seg_total_abs
# segments not missing - seg_ok
# segments missing of not - seg_total
if (stats==True):
for segment in list_seg: # counts the segments in each category
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_genome: # the segment is missing
seg_total.append(seg_len)
if segment==list_seg[0]: # the segment is the first of the feature
if segment==list_seg[-1]: # the segment is the last of the feature
seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
else:
seg_first.append(seg_len)
elif segment==list_seg[-1]: # the segment is the last of the feature
seg_last.append(seg_len)
else:
seg_middle.append(seg_len)
seg_ok.append(seg_len)
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing ->feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
if (stats==True):
feature_total.append(feat)
if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_all.append(feat)
elif first_seg != list_seg[0][1:]: # the first segment is missing
feature_missing_first.append(feat)
elif last_seg!=list_seg[-1][1:]: # the last segment is missing
feature_missing_last.append(feat)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if segment[1:] not in seg_genome:
feature_missing_middle.append(feat)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if feat not in feature_missing_total:
feature_missing_total.append(feat)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK !!
if feat not in feature_missing_total:
feature_ok.append(feat)
# outputs each fragment of each feature, with its position on the new genome and its annotation :
gff_detail(list_seg,seg_genome,feat,file_out_detail)
# outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
n=2
result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
bad_size+=result[0]
absent_features+=result[1]
# outputs the detail of the missing fragments for each feature :
dif_list=diff_list+gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)
diff_list=dif_list
#print(feat,"ok")
#gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)
file_out_detail.close()
file_out.close()
file_out2.close()
# print stats
from statistics import median, mean
if stats==True:
print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)
# prints the stats for the segments
#print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
#print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
#print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
#print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
#print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
#print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
def gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq):
# outputs the detail of the missing fragments for each feature :
diff_list=list()
# get the list of the segments in a feature that are missing in the genome
segments_missing=list()
for segment in list_seg:
if segment[1:] not in seg_genome:
segments_missing.append(Segments[segment[1:]])
# if there are missing segments, print them
if (first_seg!='0') & (last_seg!='0'):
var=0 # count variations, to see if there is any
feature=Features[feat]
feat_start=feature.start
# get the lengths of the feature, on the original genome and on the new one
start2=get_start_2(first_seg,seg_genome,feat)
stop2=get_stop_2(last_seg,seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=size_on_genome-feature.size
diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+str(size_diff)+"\n"
file_out2.write(line)
# find the path in azucena.
first_strand=seg_genome[first_seg][3]
first_seg_stranded=first_strand+first_seg
last_strand=seg_genome[last_seg][3]
last_seg_stranded=last_strand+last_seg
id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
list_segfeat_nb=Features[feat].segments_list
# get list segments present in azucena.
# for segment in list_seg:
# boucles parcourir les deux listes.
i=0
j=0
last=''
last_taille=0
last_seq=''
last_start=''
start=0
pos=''
# check if the gene is inverted :
# if all the segments in common (no strand) have the oposite strand, change the strand (or ignore it ?).
# the list from the paths are stranded.
# first get the list of the segments in common without the strand
# then compare the strands
# if all the strands are different, print 'there is an inversion'.
# else print nothing. then ignore the strand.
# check if there is an inversion
if detect_inversion(list_segfeat_nb,list_segfeat_azu):
line="inversion detected\n"
file_out2.write(line)
# remove the strands from the lists of segments :
list_segfeat_nb=remove_strand(list_segfeat_nb)
list_segfeat_azu=remove_strand(list_segfeat_azu)
# detect and print variations ignoring the strands
while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
#for i in range(0,len(list_segfeat_nb)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
# is both segments are absent in the other genome, its a substitution
# print if we had an insertion or deletion running
if last=='insertion':
pos=int(Segments[last_start].start)-int(feat_start)+1
line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
elif last=='deletion':
if start==1:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
start=0
else:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
last=''
last_taille=0
last_seq=''
# print the substitution
# substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
if i==0:
pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
line='substitution at the start of the feature of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
else:
pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
line='substitution of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
# go to the next segments on the paths
i+=1
j+=1
else: # azu segment not in nb, but nb segment in azu : insertion
if last=='deletion':
if start==1:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
start=0
else:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
last=''
last_taille=0
last_start=''
last_seq=''
if last=='insertion':
last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
else:
last='insertion'
last_seq=seg_seq[list_segfeat_azu[j]]
last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
last_start=list_segfeat_nb[i]
j+=1
elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
if last=='insertion':
pos=int(Segments[last_start].start)-int(feat_start)+1
line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
last=''
last_start=''
last_taille=0
last_seq=''
if last=='deletion':
last_taille+=Segments[list_segfeat_nb[i]].size
last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
else:
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
last='deletion'
last_start=list_segfeat_nb[i]
last_taille=Segments[list_segfeat_nb[i]].size
last_seq=seg_seq[list_segfeat_nb[i]]
if i==0:
start=1
i+=1
else : # idk yet.
line="weird order change"
file_out2.write(line)
var+=1
i+=1
j+=1
else: # segment present in both. print the running indel if there is one
#print("segment",list_segfeat_nb[i][1:],'ok')
if last=='insertion':
pos=int(Segments[last_start].start)-int(feat_start)+1
line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
elif last=='deletion':
if start==1:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
start=0
else:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
file_out2.write(line)
var+=1
last=''
last_taille=0
last_start=''
last_seq=''
i+=1
j+=1
# finish printing the current indel
if last=='insertion':
pos=int(Segments[last_start].start)-int(feat_start)+1
line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
var+=1
elif last=='deletion':
if start==1:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
start=0
else:
pos=int(Segments[last_start].start)-int(feat_start)+1
line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
# see if the end is missing for one of the two genomes
# si i a la taille max, peut etre que j aussi est arrivé à la fin, si j+1 est trop grand, donc si j==len(list_segfeat_azu)-1
# si i et j taille max (ou i et j+1), c bon les deux sont arrivés à la fin (je crois).
# si i taille max et pas j, insertion. si j taille max et pas i, délétion.
if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
if (i<len(list_segfeat_nb)-1):
if not (j>=len(list_segfeat_azu)-1):
print('pb')
# length of the deletion = position of the end of the feature - position of the start of the deletion
length=feature.stop-Segments[list_segfeat_nb[i]].start
line="deletion at the end of length "+str(length)+"\n"
file_out2.write(line)
var+=1
elif (j<len(list_segfeat_azu)-1):
file_out2.write("insertion at the end\n")
var+=1
# au début et à la fin, si on a une délétion, on peut que donner la taille du segment, pas la taille de la délétion.
# dans le segment ou la feature, garder l'info de où commence le gène sur le segment.....
# taille de la délétion = (début segment fin + position de fin) - début last_seg
# ici last seg c list_nb[i], et segment fin c'est list_nb[-1]
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
if var==0:
line=feat+" : no variation\n"
file_out2.write(line)
else:
line=feat+" : feature entirely absent.\n"
file_out2.write(line)
file_out2.write('\n')
return diff_list
def remove_strand(list_seg):
list_unstrand=list()
for seg_stranded in list_seg:
list_unstrand.append(seg_stranded[1:])
return list_unstrand
def detect_inversion(list_1,list_2):
list_1_unstrand=remove_strand(list_1)
list_2_unstrand=remove_strand(list_2)
# get the list of segments in common
seg_common=list()
for seg in list_1_unstrand:
if seg in list_2_unstrand:
seg_common.append(seg)
# for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
same_strand_count=0
for seg in seg_common:
index_1=list_1_unstrand.index(seg)
index_2=list_2_unstrand.index(seg)
if list_1[index_1]==list_2[index_2]:
same_strand_count+=1
# if there is less than 10% of strand difference among the common segments, return False, no inversion
if len(seg_common)-same_strand_count<len(seg_common)/10:
return False
else:
return True