Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from Graph_gff import Segments, Features, write_line
from Functions import *
### functions to generate a genome's gff from the graph's gff
# functions to get the detailed gff with all the fragments of the features
# outputs each fragment of the feature in a gff
def gff_detail(list_seg,feature_id):
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc.
# at the end of the loop, prints the last part of the fragment.
[feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the part of the feature, for loop below
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
if feat_start=="":
feat_start=segment[1:]
if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=segment[1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
seg_start=""
#else: if the current segment is not in azucena but there is no start, keep looking for a start
last_seg=segment
if last_seg[1:] in segments_on_target_genome:
out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
# functions to get the gff with one line per feature
# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
if (first_seg!=''): # feature present on the target genome
size_on_new_genome=get_feature_stop_on_genome(last_seg,feature_id)-get_feature_start_on_genome(first_seg,feature_id)+1
size_diff=abs(size_on_new_genome-Features[feature_id].size)
if right_size(size_on_new_genome,max_diff,feature_id):
line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg)
write_line(line,output_gff_one,False)
return size_diff
else:
return "bad_size"
else :
return "absent"
# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
print("generation of the genome's gff ")
# create variables and open files
[once,detail,var,stats]=[True,True,True,False]
if var:
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
file_out_var = open("test_data/variations.txt", 'w')
global output_variations
output_variations=[0,"",file_out_var]
if detail:
file_out_detail = open("test_data/target_genome_detail.gff", 'w')
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
global output_detail_gff
output_detail_gff=[0,"",file_out_detail]
if once:
file_out=open(out,'w')
global output_gff_one
output_gff_one=[0,"",file_out]
bad_size_features=0
absent_features=0
diff_size_transfered_features=[0,0] # [count,sum], to get the average
if stats:
# create objects for stats on how many segments are absent in azucena, their average length, etc
feature_missing_segments=[[],[],[],[],[],[],[]] # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing - feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
get_segments_positions_on_genome(pos_seg)
list_feature_to_transfer=get_all_features_in_gff(gff) # get the list of all the features to transfer from the gff
for feat in list_feature_to_transfer:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
# outputs the feature once in a gff, from the first to the last segment present on the new genome
if once:
max_diff=2 # maximum difference size (n times bigger of smaller)
transfer_stat=gff_one(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
if transfer_stat=="bad_size":
bad_size_features+=1
elif transfer_stat=="absent":
absent_features+=1
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
write_line("",output_gff_one,True)
# outputs each fragment of the feature in a gff
if detail:
gff_detail(list_seg,feat) # insertions !
write_line("",output_detail_gff,True)
# outputs the detail of variations of the feature :
if var:
print_variations(first_seg,last_seg,feat,paths,seg_seq)
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
write_line("",output_variations,True)
if stats==True:
stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat)
# close output_files
if detail:
file_out_detail.close()
if once:
file_out.close()
if var:
file_out_var.close()
# print stats
if stats==True:
if once:
print(len(Features)-(bad_size_features+absent_features),"out of",len(Features),"features are transfered.")
print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
print("average length difference of the transfered genes : ",diff_size_transfered_features[1]/diff_size_transfered_features[0])
stats_features(feature_missing_segments)
# functions to get the detail of the variations in the features
def print_variations(first_seg,last_seg,feat,paths,seg_seq):
if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features
[variation,list_segfeat_nb,list_segfeat_azu]=create_var(feat,first_seg,last_seg,paths) # removes the strands in the segment lists
feature=Features[feat]
feat_start=feature.start
# loop to go through both paths with i and j
[i,j,var_count]=[0,0,0]
# detect and print variations ignoring the strands
while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
# if both segments are absent in the other genome, its a substitution
variation.last_seg_in_target=list_segfeat_azu[j][1:]
if (variation.type=='insertion') | (variation.type=='deletion'): # print the current variation before treating the substitution
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
init_new_var(variation,"substitution",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
i+=1;j+=1
else: # azu segment not in nb, but nb segment in azu : insertion or continue substitution
if variation.type=='deletion': # print the current variation before treating the insertion
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
variation.last_seg_in_target=list_segfeat_azu[j][1:]
if variation.type=='insertion':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
elif variation.type=="substitution":
while list_segfeat_azu[j]!=list_segfeat_nb[i]:
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,2)
j+=1
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
j-=1
init_new_var(variation,"insertion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
j+=1
elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion or continue substitution
if variation.type=='insertion': # print the current variation before treating the deletion
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
if variation.type=='deletion':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
elif variation.type=="substitution":
while list_segfeat_azu[j]!=list_segfeat_nb[i]:
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,1)
i+=1
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
i-=1
init_new_var(variation,"deletion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
# can be a substitution. check later if its not an inversion
variation.last_seg_in_target=list_segfeat_azu[j][1:]
if (variation.type=='insertion') | (variation.type=='deletion'): # print the current variation before treating the substitution
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
reset_var(variation)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
else: # initiate substitution
init_new_var(variation,"substitution",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
i+=1;j+=1
else: # segment present in both, no variation. print the running indel if there is one
if variation.type!='': # print the current variation if there is one
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
var_count+=1
reset_var(variation)
variation.last_seg_in_target=list_segfeat_azu[j][1:]
i+=1;j+=1
if (variation.type!=''): # if there was a current variation when we reached the end, print it
print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
var_count+=1
reset_var(variation)
if i<=len(list_segfeat_nb)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
print_last_deletion(variation,list_segfeat_nb,i,feat_start,feature,seg_seq)
var_count+=1
reset_var(variation)
if var_count==0: # if no variation was encountered
print_novar(variation)
def print_current_var(variation,feat_start,list_segfeat_azu,feat,i):
if variation.type=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(variation,feat_start,list_segfeat_azu,feat)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}\n'
write_line(line,output_variations,False)
elif variation.type=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(variation,feat_start,list_segfeat_azu,feat,i)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}\n'
write_line(line,output_variations,False)
elif variation.type=='substitution':
warning=detect_small_inversion(variation)
[pos_old,pos_new]=get_old_new_pos_substitution(feat_start,variation,list_segfeat_azu,feat)
if len(variation.ref) == len(variation.alt): # if the substituion is between two segment of the same size, print it
size_subs=len(variation.ref)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
else :
# if the segments of the substitution have a different size, print deletion then insertion at the same position.
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
line+=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
write_line(line,output_variations,False)
def detect_small_inversion(variation):
[list_ref_common,list_alt_common]=[list(),list()]
list_ref_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_ref]
list_alt_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_alt]
for seg in variation.seg_ref:
if seg[1:] in list_alt_unstrand:
list_ref_common.append(seg)
for seg in variation.seg_alt:
if seg[1:] in list_ref_unstrand:
list_alt_common.append(seg)
if (len(list_ref_common)>len(list_ref_unstrand)*0.5) & (len(list_alt_common)>len(list_alt_unstrand)*0.5):
return f'\t# Suspected inversion within this substitution.'
else:
return ''
def print_last_deletion(variation,list_segfeat_nb,i,feat_start,feature,seg_seq):
pos_old=int(Segments[list_segfeat_nb[i][1:]].start)-int(feat_start)+1
del_sequence=get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq)
length=len(del_sequence)
pos_new=str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{del_sequence}\t-\t{length}\t{pos_old}\t{pos_new}\n'
write_line(line,output_variations,False)
def print_novar(variation):
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
write_line(line,output_variations,False)
# not used.
def get_list_segments_missing(list_seg,segments_on_target_genome):
segments_missing=[]
for segment in list_seg:
if segment[1:] not in segments_on_target_genome:
segments_missing.append(Segments[segment[1:]])
return segments_missing
# takes a feature and a feature type, returns a list of child features that have the wanted type. currently not used.
def get_child_list(feature,child_type):
if type=="":
return feature.childs
list_childs=[]
for child in feature.childs:
if Features[child].type==child_type:
list_childs.append(child)
return list_childs