Newer
Older

nina.marthe_ird.fr
committed
from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence,convert_strand

nina.marthe_ird.fr
committed
target_genome_name="genome2_chr10"
pos_seg=target_genome_name+".bed"
var_file=target_genome_name+"_variations.txt"
gfa="graph.gfa"

nina.marthe_ird.fr
committed
load_intersect(intersect_path)
# présent dans Fuctions.py mais relou à importer ?
def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths

nina.marthe_ird.fr
committed
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
# présent dans Fuctions.py mais relou à importer ?
def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome

nina.marthe_ird.fr
committed
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()
segments_on_target_genome={}

nina.marthe_ird.fr
committed
for line in lines:
line=line.split()
[seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
segments_on_target_genome[seg]=[chrom,start,stop,strand]
return segments_on_target_genome

nina.marthe_ird.fr
committed
def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object

nina.marthe_ird.fr
committed
feature_sequence=""
for segment in feature.segments_list:

nina.marthe_ird.fr
committed
if (segment==feature.segments_list[0]) & (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature
feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]

nina.marthe_ird.fr
committed
if segment==feature.segments_list[0]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:]

nina.marthe_ird.fr
committed
elif segment==feature.segments_list[-1]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop]

nina.marthe_ird.fr
committed
else:
feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence
def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome

nina.marthe_ird.fr
committed
first_seg_found=''
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
first_seg_found=segment[1:]
break
return first_seg_found
def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome

nina.marthe_ird.fr
committed
first_strand=convert_strand(segments_on_target_genome[first_seg][3])
first_seg_stranded=first_strand+first_seg
last_strand=convert_strand(segments_on_target_genome[last_seg][3])
last_seg_stranded=last_strand+last_seg
index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
first_index=min(index_first_seg,index_last_seg)
last_index=max(index_last_seg,index_first_seg)
list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
return list_segfeat_azu_corrected
def get_rna(dna_sequence):
return dna_sequence.replace("T","U")
def get_aa(rna_codon): # gives the aa coded by the rna codon
match rna_codon[0:2]:

nina.marthe_ird.fr
committed
case "UU":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Phe"
else:
return "Leu"
case "UC":
return "Ser"
case "UA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Tyr"
else:
return "*"
case "UG":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Cys"
elif rna_codon[2]=="A":

nina.marthe_ird.fr
committed
return "*"
else:
return "Trp"
case "CU":
return "Leu"
case "CC":
return "Pro"
case "CA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "His"
else:
return "Gln"
case "CG":
return "Arg"
case "AU":
if rna_codon[2]=="G":

nina.marthe_ird.fr
committed
return "Met"
else:
return "Ile"
case "AC":
return "Thr"
case "AA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asn"
else:
return "Lys"
case "AG":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Ser"
else:
return "Arg"
case "GU":
return "Val"
case "GC":
return "Ala"
case "GA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asp"
else:
return "Glu"
case "GG":
return "Gly"
from textwrap import wrap
def cut_codon(sequence):
return wrap(sequence,3)

nina.marthe_ird.fr
committed
def traduction(sequence_arn): # translate rna
list_codons=cut_codon(sequence_arn)

nina.marthe_ird.fr
committed
prot=list()
for codon in list_codons:
if len(codon)==3:
prot.append(get_aa(codon))
else:
print("attempt to get the amino acid for an incomplete codon")

nina.marthe_ird.fr
committed
return prot
def get_sequence_on_genome(feature,segments_on_target_genome): # returns the sequence of the feature on the target genome
list_seg=Features[feature].segments_list
first_seg=get_first_seg(list_seg,segments_on_target_genome)
last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)

nina.marthe_ird.fr
committed
path_on_target=get_feature_path(paths,first_seg,last_seg)
new_sequence=""
for segment in path_on_target:
if segment==cds.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-1:]

nina.marthe_ird.fr
committed
elif segment==cds.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]

nina.marthe_ird.fr
committed
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def get_mrna_var(var_file):
var=open(var_file,'r')
lines=var.readlines()
var.close()

nina.marthe_ird.fr
committed
mrna_var={}
for line in lines:
line=line.split()

nina.marthe_ird.fr
committed
feature_type=line[1]
if feature_type=="mRNA":
mrna_id=line[0].replace('.','_').replace(':','_')
if mrna_id not in mrna_var.keys():
mrna_var[mrna_id]={}
elif feature_type=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')

nina.marthe_ird.fr
committed
parent_id=line[0][0:line[0].find("cds")-1]
if cds_id not in mrna_var[parent_id]:
mrna_var[parent_id][cds_id]=list()
mrna_var[parent_id][cds_id].append(line)
return mrna_var
import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one
print("\nlooking for a new start codon:")
frame_shift=0
# chercher codon start en amont du cds, dans le mrna
seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
print("sequence before the CDS in the mRNA:",sequence_amont)
if "ATG" in sequence_amont:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
print("checking if there is a stop codon after the start codons in the same reading frame:")
for start_pos in start_pos_list:
start_pos_frame=start_pos%3
n=len(sequence_amont)-start_pos+1
if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
#print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !

nina.marthe_ird.fr
committed
frame_shift=(frame_shift+n)%3 # vérifier le frameshift !!
print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")
# chercher codon start en aval, dans le cds
if "ATG" in seq_cds:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion
print(deleted_sequence,inserted_sequence)

nina.marthe_ird.fr
committed
deleted_aa=traduction(get_rna(deleted_sequence))
inserted_aa=traduction(get_rna(inserted_sequence))

nina.marthe_ird.fr
committed
if (len(deleted_aa)!=0) & (len(inserted_aa)!=0):
if deleted_aa!=inserted_aa:
print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : synonymous variation in",",".join(deleted_aa))

nina.marthe_ird.fr
committed
elif len(inserted_aa)!=0:
print("consequence : insertion of",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : deletion of",",".join(deleted_aa))
print("stop codon apparition in the cds of the target genome")

nina.marthe_ird.fr
committed
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)

nina.marthe_ird.fr
committed
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features

nina.marthe_ird.fr
committed
add_feature_sequence(feature,seg_seq)

nina.marthe_ird.fr
committed
if analysis==True:
# analysing the variations for all the mrna :
for mrna_id in mrna_var.keys():
print("analysis of the variations in the mRNA",mrna_id,":\n")
leftover_ref=""
leftover_target=""

nina.marthe_ird.fr
committed
frameshift=0

nina.marthe_ird.fr
committed
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
cds=Features[cds_id]
print("\nCDS",cds_id,":")
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation
posVar=[int(var[12]),int(var[13])]
sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome)
if type_var=="insertion":
length_ref=0
else:
length_ref=len(var[9])
if type_var=="deletion":
length_alt=0
else:
length_alt=len(var[10])
print("\nvariation",index_var+1, ":")
if (index_cds==0) & (posVar[0]<3):
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if frameshift==0:
print("recovery of the original reading frame")
if old_frameshift==0:
print("loss of the original reading frame")
#else:
# print("variation not causing a frameshift")
if type_var=="insertion":
print("insertion of",var[10])
elif type_var=="deletion":
print("deletion of",var[9])
else:
print("substitution of",var[9],"by",var[10])
len_fragment_before_del=(posVar[0])%3
len_fragment_before_ins=(posVar[1])%3
if frameshift==0:
# print only the local change.
len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
stop=print_variation_change(total_del,total_ins)

nina.marthe_ird.fr
committed
if stop:
break

nina.marthe_ird.fr
committed
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
else:
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print("frameshift of",frameshift,"base(s) to the right.")
if index_var==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds while having full codons
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
# supprimer taille%3 des sequences ins et del.
# en fait non. il faut récupérer les séquences sur les seq des cds (sans l'intron dcp), prendre la taille sur ça et supprimer sur ça
del_leftover_len=len(total_total_del)%3
ins_leftover_len=len(total_total_ins)%3
if del_leftover_len!=0:
total_total_del=total_total_del[0:-del_leftover_len]
if ins_leftover_len!=0:
total_total_ins=total_total_ins[0:-ins_leftover_len]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
else: # not the last variation. translate until the next var.
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12]),int(nextVar[13])]
if nextVar[8]=="insertion":
length_ref_nextvar=0
else:
length_ref_nextvar:len(nextVar[9])
if nextVar[8]=="deletion":
length_alt_nextvar=0
else:
length_alt_nextvar=len(nextVar[10])
len_fragment_before_del_nextvar=(posNextVar[0])%3
len_fragment_before_ins_nextvar=(posNextVar[1])%3
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds
leftover_ref=cds.sequence[len(cds.sequence)-3:]
leftover_target=sequence_target[len(sequence_target)-3:]