Newer
Older

nina.marthe_ird.fr
committed
from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence,convert_strand

nina.marthe_ird.fr
committed
target_genome_name="genome2_chr10"
pos_seg=target_genome_name+".bed"
var_file=target_genome_name+"_variations.txt"
gfa="graph.gfa"

nina.marthe_ird.fr
committed
load_intersect(intersect_path)
# présent dans Fuctions.py mais relou à importer ?
def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths

nina.marthe_ird.fr
committed
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
# présent dans Fuctions.py mais relou à importer ?
def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome

nina.marthe_ird.fr
committed
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()
segments_on_target_genome={}

nina.marthe_ird.fr
committed
for line in lines:
line=line.split()
[seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
segments_on_target_genome[seg]=[chrom,start,stop,strand]
return segments_on_target_genome

nina.marthe_ird.fr
committed
def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object

nina.marthe_ird.fr
committed
feature_sequence=""
for segment in feature.segments_list:

nina.marthe_ird.fr
committed
if (segment==feature.segments_list[0]) & (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature
feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]

nina.marthe_ird.fr
committed
if segment==feature.segments_list[0]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:]

nina.marthe_ird.fr
committed
elif segment==feature.segments_list[-1]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop]

nina.marthe_ird.fr
committed
else:
feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence
def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome

nina.marthe_ird.fr
committed
first_seg_found=''
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
first_seg_found=segment[1:]
break
return first_seg_found
def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome

nina.marthe_ird.fr
committed
first_strand=convert_strand(segments_on_target_genome[first_seg][3])
first_seg_stranded=first_strand+first_seg
last_strand=convert_strand(segments_on_target_genome[last_seg][3])
last_seg_stranded=last_strand+last_seg
index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
first_index=min(index_first_seg,index_last_seg)
last_index=max(index_last_seg,index_first_seg)
list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
return list_segfeat_azu_corrected
def get_rna(dna_sequence):
return dna_sequence.replace("T","U")
def get_aa(rna_codon): # gives the aa coded by the rna codon
match rna_codon[0:2]:

nina.marthe_ird.fr
committed
case "UU":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Phe"
else:
return "Leu"
case "UC":
return "Ser"
case "UA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Tyr"
else:
return "*"
case "UG":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Cys"
elif rna_codon[2]=="A":

nina.marthe_ird.fr
committed
return "*"
else:
return "Trp"
case "CU":
return "Leu"
case "CC":
return "Pro"
case "CA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "His"
else:
return "Gln"
case "CG":
return "Arg"
case "AU":
if rna_codon[2]=="G":

nina.marthe_ird.fr
committed
return "Met"
else:
return "Ile"
case "AC":
return "Thr"
case "AA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asn"
else:
return "Lys"
case "AG":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Ser"
else:
return "Arg"
case "GU":
return "Val"
case "GC":
return "Ala"
case "GA":
if (rna_codon[2]=="U") | (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asp"
else:
return "Glu"
case "GG":
return "Gly"
def cut_codon(sequence):

nina.marthe_ird.fr
committed
from textwrap import wrap
return wrap(sequence,3)

nina.marthe_ird.fr
committed
def traduction(sequence_arn): # translate rna
list_codons=cut_codon(sequence_arn)

nina.marthe_ird.fr
committed
prot=list()
for codon in list_codons:
if len(codon)==3:
prot.append(get_aa(codon))

nina.marthe_ird.fr
committed
#else:
# print("attempt to get the amino acid for an incomplete codon")

nina.marthe_ird.fr
committed
return prot

nina.marthe_ird.fr
committed
def get_sequence_on_genome(feature_id,segments_on_target_genome): # returns the sequence of the feature on the target genome
feature=Features[feature_id]
list_seg=feature.segments_list
first_seg=get_first_seg(list_seg,segments_on_target_genome)
last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)

nina.marthe_ird.fr
committed
path_on_target=get_feature_path(paths,first_seg,last_seg)

nina.marthe_ird.fr
committed
pos_start=feature.pos_start
pos_stop=feature.pos_stop

nina.marthe_ird.fr
committed
new_sequence=""
for segment in path_on_target:

nina.marthe_ird.fr
committed
if segment==feature.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[pos_start-1:]
elif segment==feature.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:pos_stop]

nina.marthe_ird.fr
committed
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def get_mrna_var(var_file):
var=open(var_file,'r')
lines=var.readlines()
var.close()

nina.marthe_ird.fr
committed
mrna_var={}
for line in lines:
line=line.split()

nina.marthe_ird.fr
committed
feature_type=line[1]
if feature_type=="mRNA":
mrna_id=line[0].replace('.','_').replace(':','_')
if mrna_id not in mrna_var.keys():
mrna_var[mrna_id]={}
elif feature_type=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')

nina.marthe_ird.fr
committed
parent_id=line[0][0:line[0].find("cds")-1]
if cds_id not in mrna_var[parent_id]:
mrna_var[parent_id][cds_id]=list()
mrna_var[parent_id][cds_id].append(line)
return mrna_var
import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one
print("\nlooking for a new start codon:")
frame_shift=0
# chercher codon start en amont du cds, dans le mrna
seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
print("sequence before the CDS in the mRNA:",sequence_amont)
if "ATG" in sequence_amont:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
print("checking if there is a stop codon after the start codons in the same reading frame:")
for start_pos in start_pos_list:
start_pos_frame=start_pos%3
n=len(sequence_amont)-start_pos+1
if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
#print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !

nina.marthe_ird.fr
committed
frame_shift=(frame_shift+n)%3 # vérifier le frameshift !!
print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")
# chercher codon start en aval, dans le cds
if "ATG" in seq_cds:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion

nina.marthe_ird.fr
committed
print("printvar",deleted_sequence,inserted_sequence)

nina.marthe_ird.fr
committed
deleted_aa=traduction(get_rna(deleted_sequence))
inserted_aa=traduction(get_rna(inserted_sequence))

nina.marthe_ird.fr
committed
if (len(deleted_aa)!=0) & (len(inserted_aa)!=0):
if deleted_aa!=inserted_aa:
print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : synonymous variation in",",".join(deleted_aa))

nina.marthe_ird.fr
committed
elif len(inserted_aa)!=0:
print("consequence : insertion of",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : deletion of",",".join(deleted_aa))
print("stop codon apparition in the cds of the target genome")

nina.marthe_ird.fr
committed
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)

nina.marthe_ird.fr
committed
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features

nina.marthe_ird.fr
committed
add_feature_sequence(feature,seg_seq)
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,sequenceCDS_source):
# get the sequence until the next variation in the next cds
# for this, go through all the cds left, and add the sequence until the first variation encountered
var_found=False
keys = list(cds_var.keys())
cds_rank = keys.index(cds_id)+1
nextCds=keys[cds_rank]
current_readingframe=readingframe
ref_sequence=""
alt_sequence=""
# look for the next variation, and get the sequence before.
while (not var_found) & (keys.index(nextCds)!=len(keys)): # go through all the cds until the last
if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
ref_sequence+=Features[nextCds].sequence
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)
cds_rank+=1
nextCds=keys[cds_rank]
#current_readingframe=(3-(len(sequenceCDS_source)-current_readingframe))%3
print("no var in current cds")
else: # variation found
variation=cds_var[nextCds][0]
#posNextVar=[int(variation[12])+(3-current_readingframe)%3,int(variation[13])+(3-current_readingframe)%3]
# revoir le calcul des fragments avant la variation !
# cas où la var est sur la deuxième base du cds : alors il faut peut être laisser à cette var les bases avant du cds précédent ......
# et je crois qu'on a toujours pas réglé le cas où on a une sbustitution sur la base 1 et 3 d'un codon.
nextReadingframe=(3-(len(sequenceCDS_source)-readingframe))%3
len_codonFragment_beforeNextVar_ref=(int(variation[12])-nextReadingframe)%3
len_codonFragment_beforeNextVar_alt=(int(variation[13])+(3-frameshift)-nextReadingframe)%3
ref_sequence+=Features[nextCds].sequence[0:int(variation[12])-len_codonFragment_beforeNextVar_ref]
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_codonFragment_beforeNextVar_alt]
#print("pos_var",variation[12],variation[13])
#print("next readingframe, frameshift",nextReadingframe,frameshift)
#print("len frag before",len_codonFragment_beforeNextVar_ref,len_codonFragment_beforeNextVar_alt)
#print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
var_found=True
return [ref_sequence,alt_sequence]
def get_variation_change_sequences(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id):

nina.marthe_ird.fr
committed
ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3)
ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3)
if frameshift==0: # only local change
ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon
ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3
elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3]
ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3)
ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3)
else: # last variation, go until the end of the cds
ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source)
ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target)
ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref]
alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt]
if (frameshift!=0) & (index_var==len(cds_var[cds_id])-1) & (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds
[ref_add,alt_add]=get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,sequenceCDS_source)
ref_sequence+=ref_add
alt_sequence+=alt_add

nina.marthe_ird.fr
committed
return print_variation_change(ref_sequence,alt_sequence)

nina.marthe_ird.fr
committed
if analysis==True:
# analysing the variations for all the mrna :
for mrna_id in mrna_var.keys():
print("analysis of the variations in the mRNA",mrna_id,":\n")
leftover_ref=""
leftover_target=""

nina.marthe_ird.fr
committed
frameshift=0

nina.marthe_ird.fr
committed
readingframe=0

nina.marthe_ird.fr
committed
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
print("\nCDS",cds_id,":")

nina.marthe_ird.fr
committed
print("readingframe",readingframe,"frameshift",frameshift)

nina.marthe_ird.fr
committed
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation

nina.marthe_ird.fr
committed
print("\nvariation",index_var+1, ":")

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
length_ref=len(var[9])
length_alt=len(var[10])

nina.marthe_ird.fr
committed
if type_var=="insertion":
length_ref=0

nina.marthe_ird.fr
committed
print("insertion of",var[10])
elif type_var=="deletion":

nina.marthe_ird.fr
committed
length_alt=0

nina.marthe_ird.fr
committed
print("deletion of",var[9])

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
print("substitution of",var[9],"by",var[10])
posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3]
sequenceCDS_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
sequenceCDS_source=leftover_ref[readingframe:]+Features[cds_id].sequence
print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14])

nina.marthe_ird.fr
committed
if (index_cds==0) & (posVar[0]<3):
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")

nina.marthe_ird.fr
committed
# frameshift=0 -> reading frame recovered

nina.marthe_ird.fr
committed
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if frameshift==0:

nina.marthe_ird.fr
committed
print("recovery of the original reading frame")

nina.marthe_ird.fr
committed
if old_frameshift==0:
print("loss of the original reading frame")
stop=get_variation_change_sequences(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id)

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
if stop:
break

nina.marthe_ird.fr
committed
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds

nina.marthe_ird.fr
committed
leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:]
leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:]
readingframe=(3-(len(sequenceCDS_source)-readingframe))%3