Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence
target_genome_name="genome2_chr10"
pos_seg=target_genome_name+".bed"
var_file=target_genome_name+"_variations.txt"
gfa="graph.gfa"
intersect_path='intersect.bed'
load_intersect(intersect_path)
def convert_strand(strand):
match strand:
case "+":
return ">"
case "-":
return "<"
case ">":
return "+"
case "<":
return "-"
case default:
return ""
# présent dans Fuctions.py mais relou à importer ?
def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") and (line[1]!="_MINIGRAPH_"): # get the walk of the genome
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
# présent dans Fuctions.py mais relou à importer ?
def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()
segments_on_target_genome={}
for line in lines:
line=line.split()
[seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
segments_on_target_genome[seg]=[chrom,start,stop,strand]
return segments_on_target_genome
def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object
feature_sequence=""
for segment in feature.segments_list:
if (segment==feature.segments_list[0]) and (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature
feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]
if segment==feature.segments_list[0]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:]
elif segment==feature.segments_list[-1]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop]
else:
feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence
def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome
first_seg_found=''
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
first_seg_found=segment[1:]
break
return first_seg_found
def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome
first_strand=convert_strand(segments_on_target_genome[first_seg][3])
first_seg_stranded=first_strand+first_seg
last_strand=convert_strand(segments_on_target_genome[last_seg][3])
last_seg_stranded=last_strand+last_seg
index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
first_index=min(index_first_seg,index_last_seg)
last_index=max(index_last_seg,index_first_seg)
list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
return list_segfeat_azu_corrected
def get_rna(dna_sequence):
return dna_sequence.replace("T","U")
def get_aa(rna_codon): # gives the aa coded by the rna codon
match rna_codon[0:2]:
case "UU":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Phe"
else:
return "Leu"
case "UC":
return "Ser"
case "UA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Tyr"
else:
return "*"
case "UG":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Cys"
elif rna_codon[2]=="A":
return "*"
else:
return "Trp"
case "CU":
return "Leu"
case "CC":
return "Pro"
case "CA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "His"
else:
return "Gln"
case "CG":
return "Arg"
case "AU":
if rna_codon[2]=="G":
return "Met"
else:
return "Ile"
case "AC":
return "Thr"
case "AA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Asn"
else:
return "Lys"
case "AG":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Ser"
else:
return "Arg"
case "GU":
return "Val"
case "GC":
return "Ala"
case "GA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):
return "Asp"
else:
return "Glu"
case "GG":
return "Gly"
def cut_codon(sequence):
from textwrap import wrap
return wrap(sequence,3)
def traduction(sequence_arn): # translate rna
list_codons=cut_codon(sequence_arn)
prot=list()
for codon in list_codons:
if len(codon)==3:
prot.append(get_aa(codon))
#else:
# print("attempt to get the amino acid for an incomplete codon")
return prot
def get_sequence_on_genome(feature_id,segments_on_target_genome): # returns the sequence of the feature on the target genome
feature=Features[feature_id]
list_seg=feature.segments_list
first_seg=get_first_seg(list_seg,segments_on_target_genome)
last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)
path_on_target=get_feature_path(paths,first_seg,last_seg)
pos_start=feature.pos_start
pos_stop=feature.pos_stop
new_sequence=""
for segment in path_on_target:
if segment==feature.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[pos_start-1:]
elif segment==feature.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:pos_stop]
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence
def get_mrna_var(var_file):
var=open(var_file,'r')
lines=var.readlines()
var.close()
mrna_var={}
for line in lines:
line=line.split()
feature_type=line[1]
if feature_type=="mRNA":
mrna_id=line[0].replace('.','_').replace(':','_')
if mrna_id not in mrna_var.keys():
mrna_var[mrna_id]={}
elif feature_type=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')
parent_id=line[0][0:line[0].find("cds")-1]
if cds_id not in mrna_var[parent_id]:
mrna_var[parent_id][cds_id]=list()
mrna_var[parent_id][cds_id].append(line)
return mrna_var
import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one
print("\nlooking for a new start codon:")
frame_shift=0
# chercher codon start en amont du cds, dans le mrna
seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
print("sequence before the CDS in the mRNA:",sequence_amont)
if "ATG" in sequence_amont:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
stop_pos_list=[m.start() for m in re.finditer('(?=TAGorTAAorTGA)', sequence_amont)]
print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
print("checking if there is a stop codon after the start codons in the same reading frame:")
for start_pos in start_pos_list:
start_pos_frame=start_pos%3
n=len(sequence_amont)-start_pos+1
if True not in ( (stop_pos%3==start_pos_frame) and (stop_pos>start_pos) for stop_pos in stop_pos_list) :
#print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
frame_shift=(frame_shift+n)%3 # vérifier le frameshift !!
print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
else:
print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")
# chercher codon start en aval, dans le cds
if "ATG" in seq_cds:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
print("\n")
return frame_shift
def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion
print("printvar",deleted_sequence,inserted_sequence)
stop=False
deleted_aa=traduction(get_rna(deleted_sequence))
inserted_aa=traduction(get_rna(inserted_sequence))
if (len(deleted_aa)!=0) and (len(inserted_aa)!=0):
if deleted_aa!=inserted_aa:
print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))
else:
print("consequence : synonymous variation in",",".join(deleted_aa))
elif len(inserted_aa)!=0:
print("consequence : insertion of",",".join(inserted_aa))
else:
print("consequence : deletion of",",".join(deleted_aa))
if ("*" in inserted_aa):
print("stop codon apparition in the cds of the target genome")
stop=True
return stop
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features
add_feature_sequence(feature,seg_seq)
def get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift):
# get the sequence until the next variation in the next cds
# for this, go through all the cds left, and add the sequence until the first variation encountered
var_found=False
keys = list(cds_var.keys())
cds_rank = keys.index(cds_id)+1
nextCds=keys[cds_rank]
ref_sequence=""
alt_sequence=""
# look for the next variation, and get the sequence before.
while (not var_found) and (keys.index(nextCds)!=len(keys)): # go through all the cds until the last
if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
ref_sequence+=Features[nextCds].sequence
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)
cds_rank+=1
nextCds=keys[cds_rank]
print("no var in current cds")
else: # variation found
nextVar=cds_var[nextCds][0]
ref_sequence+=Features[nextCds].sequence[0:int(nextVar[12])]
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(nextVar[13])]
var_found=True
return [ref_sequence,alt_sequence]
def get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id):
ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3)
ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3)
print("oldfs",old_frameshift)
if frameshift==0: # only local change
ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon
ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3
elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3]
ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3)
ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3)
else: # last variation, go until the end of the cds
ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source)
ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target)
ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref]
alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt]
if (frameshift!=0) and (index_var==len(cds_var[cds_id])-1) and (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds
[ref_add,alt_add]=get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift)
ref_sequence+=ref_add
alt_sequence+=alt_add
ref_sequence=ref_sequence[0:3*(len(ref_sequence)//3)]
alt_sequence=alt_sequence[0:3*(len(alt_sequence)//3)]
return print_variation_change(ref_sequence,alt_sequence)
analysis=True
if analysis==True:
# analysing the variations for all the mrna :
for mrna_id in mrna_var.keys():
print("analysis of the variations in the mRNA",mrna_id,":\n")
leftover_ref=""
leftover_target=""
frameshift=0 # frameshift is how many bases to the right you need to go (from the reference reading frame) to get the alternate reading frame
readingframe=0 # readingframe is how many bases to the right you need to go (on the cds sequence) to get the first base of a codon.
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
print("\nCDS",cds_id,":")
print("readingframe",readingframe,"frameshift",frameshift)
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation
print("\nvariation",index_var+1, ":")
length_ref=len(var[9])
length_alt=len(var[10])
if type_var=="insertion":
length_ref=0
print("insertion of",var[10])
elif type_var=="deletion":
length_alt=0
print("deletion of",var[9])
else:
print("substitution of",var[9],"by",var[10])
posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3]
temp_rf=readingframe
if temp_rf==0:
temp_rf=3
sequenceCDS_target=leftover_target[temp_rf:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
sequenceCDS_source=leftover_ref[temp_rf:]+Features[cds_id].sequence
print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14])
if (index_cds==0) and (posVar[0]<3):
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
break
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
if frameshift==0:
print("recovery of the original reading frame")
if old_frameshift==0:
print("loss of the original reading frame")
stop=get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id)
if stop:
break
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds
leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:]
leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:]
readingframe=(3-(len(sequenceCDS_source)-readingframe))%3