Newer
Older

nina.marthe_ird.fr
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
from .usefull_little_functions import *
from .intersect import Features,Segments,search_segment,invert_seg
from .detect_inversion import detect_feature_inversion,invert_segment_list
# functions to get the detail of the variations in the features
# handle the variations analysis between the feature on the source genome and the feature on the target genome
def generate_variations_details(feature_id,seg_seq,match,file_out_var,segments_on_target_genome):
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
[first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match)
# do it for the gene, then for its childs.
if match[4]==True: # gene passed the filters
print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,file_out_var,segments_on_target_genome)
inversion=detect_feature_inversion(Features[feature_id].segments_list_source,feature_target_path)
# print child feature var
childs_list=Features[feature_id].get_child_list(Features)
for child_id in childs_list:
child=Features[child_id]
# look for the first and last seg of the child in its parent path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source: # get first seg in the parent path
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature
for seg in reversed(child.segments_list_source): # get last seg in the parent path
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
if inversion:
temp=child_first_seg
child_first_seg=child_last_seg
child_last_seg=temp
for match in child.segments_list_target: # look for the right occurence of the child feature
if (match[0]==walk) and (match[1]==copy_id):
child_target_path=match[2]
break
# treat the child feature:
print_variations(child_id,child_first_seg,child_last_seg,copy_id,walk,seg_seq,child_target_path,file_out_var,segments_on_target_genome)
def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_seq,feature_target_path,file_out_var,segments_on_target_genome):
if first_seg!='': # if the feature is not completly absent # add the else, output absent features
[variation,feature_path_source_genome,feature_path_target_genome]=create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path,segments_on_target_genome) # removes the strands in the segment lists
feature=Features[feature_id]
feat_start=feature.start
# loop to go through both paths with i and j
[i,j,var_count]=[0,0,0]
# detect and print variations ignoring the strands
start_feat_seg_target=feature_path_target_genome[0]
while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome
# if both segments are absent in the other genome, its a substitution
variation.last_seg_in_target=feature_path_target_genome[j]
if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
else: # initiate substitution
init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion or continue substitution
if variation.type=='deletion': # print the current variation before treating the insertion
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
variation.last_seg_in_target=feature_path_target_genome[j]
if variation.type=='insertion':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
elif variation.type=="substitution":
while feature_path_target_genome[j]!=feature_path_source_genome[i]:
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,2)
j+=1
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
j-=1
else: # intiate insertion
init_new_var(variation,"insertion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
j+=1
elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion or continue substitution
if variation.type=='insertion': # print the current variation before treating the deletion
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
if variation.type=='deletion':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
elif variation.type=="substitution":
while feature_path_target_genome[j]!=feature_path_source_genome[i]:
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,1)
i+=1
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
i-=1
else: # intiate deletion
init_new_var(variation,"deletion",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
# can be a substitution. check later if its not an inversion
variation.last_seg_in_target=feature_path_target_genome[j]
if (variation.type=='insertion') or (variation.type=='deletion'): # print the current variation before treating the substitution
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
reset_var(variation)
var_count+=1
if variation.type=='substitution':
continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,0)
else: # initiate substitution
init_new_var(variation,"substitution",feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature)
i+=1;j+=1
else: # segment present in both, no variation. print the running indel if there is one
if variation.type!='': # print the current variation if there is one
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,segments_on_target_genome)
file_out_var.write(line)
var_count+=1
reset_var(variation)
variation.last_seg_in_target=feature_path_target_genome[j]
i+=1;j+=1
if (variation.type!=''): # if there was a current variation when we reached the end, print it
line=print_current_var(variation,feat_start,start_feat_seg_target,feature_id,walk,copy_id,file_out_var,segments_on_target_genome)
file_out_var.write(line)
var_count+=1
reset_var(variation)
if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
line=print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq)
file_out_var.write(line)
var_count+=1
reset_var(variation)
if var_count==0: # if no variation was encountered
line=print_novar(variation)
file_out_var.write(line)
# print a variation
def print_current_var(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
warning=''
if variation.type=='insertion':
[pos_old,pos_new]=get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
return line
elif variation.type=='deletion':
[pos_old,pos_new]=get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
return line
elif variation.type=='substitution':
warning=detect_small_inversion(variation)
[pos_old,pos_new]=get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
size_subs=f'{len(variation.ref)}/{len(variation.alt)}'
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
# print the substitutions of different size as deletion+insertion.
#if len(variation.ref) == len(variation.alt): # if the substituion is between two segment of the same size, print it
# size_subs=len(variation.ref)
# line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
#else :
# # if the segments of the substitution have a different size, print deletion then insertion at the same position.
# line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
# line+=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
return line
# check if a substitution could be an inversion inside a feature
def detect_small_inversion(variation):
[list_ref_common,list_alt_common]=[list(),list()]
list_ref_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_ref]
list_alt_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_alt]
for seg in variation.seg_ref:
if seg[1:] in list_alt_unstrand:
list_ref_common.append(seg)
for seg in variation.seg_alt:
if seg[1:] in list_ref_unstrand:
list_alt_common.append(seg)
if (len(list_ref_common)>len(list_ref_unstrand)*0.5) and (len(list_alt_common)>len(list_alt_unstrand)*0.5):
return f'\t# Suspected inversion within this substitution.'
else:
return ''
# print a deletion at the end of a feature
def print_last_deletion(variation,feature_path_source_genome,i,feat_start,feature,seg_seq):
seg_del=search_segment(feature_path_source_genome[i])
pos_old=int(Segments[seg_del].get_start(feature.id))-int(feat_start)+1
del_sequence=get_sequence_list_seg(feature_path_source_genome,i,feature,seg_seq)
length=len(del_sequence)
pos_new=str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
if variation.inversion:
inversion='1'
else:
inversion='0'
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{inversion}\t{variation.size_diff}\tdeletion\t{del_sequence}\t-\t{length}\t{pos_old}\t{pos_new}\n'
return line
# print a feature with no variation
def print_novar(variation):
line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{print_inversion(variation.inversion)}\t{variation.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
return line
# get a digit for the inversion (convert bool in int)
def print_inversion(bool):
if bool==True:
return '1'
else:
return '0'
# initiate a Variation object with the information on the feature it is on
def create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path,segments_on_target_genome):
feature=Features[feature_id]
# get feature paths on the original genome and on the target genome
feature_path_target_genome=feature_target_path
feature_path_source_genome=feature.segments_list_source
inversion=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
if inversion:
feature_path_target_genome=invert_segment_list(feature_path_target_genome)
stop_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
start_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
else:
start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id,segments_on_target_genome)
stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id,segments_on_target_genome)
size_new_genome=stop_new_genome-start_new_genome+1
size_diff=str(size_new_genome-feature.size)
sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id,segments_on_target_genome)[0]
variation=Variation(feature.id,feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
return(variation,feature_path_source_genome,feature_path_target_genome)
# reset the informations of the variation, but keep the information about the feature
def reset_var(variation):
variation.type='' # make type enumerate
variation.size_var=0
variation.start_var=''
variation.start_var_index=0
variation.ref=''
variation.alt=''
# find the position of a substitution on the source and the target sequence
def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
seg_pos=search_segment(variation.start_var)
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
var_start_seg=variation.start_on_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
var_start_seg=invert_seg(var_start_seg)
end_var=get_seg_occ(var_start_seg,walk,feat,copy_id,segments_on_target_genome)[2]
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(var_start_seg,walk,feat,copy_id,segments_on_target_genome)[1]
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of an insertion on the source and the target sequence
def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
start_var_seg=variation.start_var
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
end_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[2]+len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-end_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[1]-len(variation.alt) # start_var_seg is the segment AFTER the insertion
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# find the position of a deletion on the source and the target sequence
def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome):
i=variation.start_var_index
seg_pos=search_segment(variation.start_var)
if i==0:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)+Features[feat].pos_start-1
else:
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)
if pos_old<0:
pos_old=0
print("error with variation position",variation.inversion,"***")
if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet.
pos_new=0
else:
start_var_seg=variation.last_seg_in_target
if variation.inversion:
start_feat_seg_target=invert_seg(start_feat_seg_target)
start_var_seg=invert_seg(start_var_seg)
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[1]-1
start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_feat-start_var)
else:
start_var=get_seg_occ(start_var_seg,walk,feat,copy_id,segments_on_target_genome)[2]+1
start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id,segments_on_target_genome)
pos_new=str(start_var-start_feat)
return [pos_old,pos_new] # pos_old and pos_new are the base before the change
# change the variation information, but keep the feature information (the variation is on the feature)
def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
variation.type=type
variation.start_var=feature_path_source_genome[i]
variation.start_var_index=i
if type=="substitution":
variation.start_on_target=feature_path_target_genome[j]
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="insertion":
variation.ref="-"
variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif type=="deletion":
if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment :
#use pos_start, position of the feature on its first segment
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
variation.seg_ref.append(feature_path_source_genome[i])
else: # else, the deletion will always start at the start of the first segment.
variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
variation.alt="-"
# update the variation
def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):
if variation.type=="substitution":
if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_ref.append(feature_path_source_genome[i])
variation.seg_alt.append(feature_path_target_genome[j])
elif genome_to_continue==1: # deletion
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
elif genome_to_continue==2: # insertion
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="insertion":
variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
variation.seg_alt.append(feature_path_target_genome[j])
elif variation.type=="deletion":
variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
variation.seg_ref.append(feature_path_source_genome[i])
# stores information about a feature and its current variation
class Variation:
def __init__(self,feature_id,feature_key,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id # real id in the gff
self.feature_key=feature_key # key to access the feature info in the dict Features
self.feature_type=feature_type
self.chr=chr
self.start_new=start_new
self.stop_new=stop_new
self.inversion=inversion
self.size_diff=size_diff
self.size_new=size_new
self.type=''
self.last_seg_in_target=''
self.seg_ref=list()
self.seg_alt=list()
# outputs the nucleotide sequence of a list of segments, corresponding to the end of a feature
def get_sequence_list_seg(list_seg,i,feature,seg_seq):
del_sequence=""
for k in range(i,len(list_seg)):
if k==len(list_seg)-1:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feature.pos_stop]
else:
del_sequence+=get_segment_sequence(seg_seq,list_seg[k])
return del_sequence