Skip to content
Snippets Groups Projects
Commit e243b920 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

simplified the code for inference, made a function for it, attempted to handle multi exonic genes

parent 3b61d864
No related branches found
No related tags found
No related merge requests found
......@@ -236,6 +236,7 @@ def findOtherStart(cds,segments_on_target_genome): # look for another start codo
return frame_shift
def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion
print("printvar",deleted_sequence,inserted_sequence)
stop=False
deleted_aa=traduction(get_rna(deleted_sequence))
inserted_aa=traduction(get_rna(inserted_sequence))
......@@ -262,6 +263,66 @@ for feature in Features.values(): # add the sequence of all features
add_feature_sequence(feature,seg_seq)
def test(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id):
ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3)
ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3)
if frameshift==0: # only local change
ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon
ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3
elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3]
ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3)
ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3)
else: # last variation, go until the end of the cds
ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source)
ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target)
ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref]
alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt]
if (frameshift!=0) & (index_var==len(cds_var[cds_id])-1) & (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds
# get the sequence until the next variation in the next cds
# for this, go through all the cds left, and add the sequence until the first variation encountered
var_found=False
keys = list(cds_var.keys())
cds_rank = keys.index(cds_id)+1
nextCds=keys[cds_rank]
current_readingframe=readingframe
# # look for the next variation, and get the sequence before.
while (not var_found) & (keys.index(nextCds)!=len(keys)): # go through all the cds until the last
if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
ref_sequence+=Features[nextCds].sequence
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)
cds_rank+=1
nextCds=keys[cds_rank]
#current_readingframe=(3-(len(sequenceCDS_source)-current_readingframe))%3
print("no var in current cds")
else: # variation found
variation=cds_var[nextCds][0]
#posNextVar=[int(variation[12])+(3-current_readingframe)%3,int(variation[13])+(3-current_readingframe)%3]
# revoir le calcul des fragments avant la variation !
# cas où la var est sur la deuxième base du cds : alors il faut peut être laisser à cette var les bases avant du cds précédent ......
# et je crois qu'on a toujours pas réglé le cas où on a une sbustitution sur la base 1 et 3 d'un codon.
nextReadingframe=(3-(len(sequenceCDS_source)-readingframe))%3
len_codonFragment_beforeNextVar_ref=(int(variation[12])-nextReadingframe)%3
len_codonFragment_beforeNextVar_alt=(int(variation[13])+(3-frameshift)-nextReadingframe)%3
ref_sequence+=Features[nextCds].sequence[0:int(variation[12])-len_codonFragment_beforeNextVar_ref]
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_codonFragment_beforeNextVar_alt]
#print("pos_var",variation[12],variation[13])
#print("next readingframe, frameshift",nextReadingframe,frameshift)
#print("len frag before",len_codonFragment_beforeNextVar_ref,len_codonFragment_beforeNextVar_alt)
#print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
var_found=True
return print_variation_change(ref_sequence,alt_sequence)
analysis=True
if analysis==True:
# analysing the variations for all the mrna :
......@@ -270,28 +331,31 @@ if analysis==True:
leftover_ref=""
leftover_target=""
frameshift=0
readingframe=3
readingframe=0
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
cds=Features[cds_id]
print("\nCDS",cds_id,":")
print("readingframe",readingframe,"frameshift",frameshift)
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation
posVar=[int(var[12])+3-readingframe,int(var[13])+3-readingframe]
sequence_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_ref=leftover_ref[readingframe:]+cds.sequence
print("\nvariation",index_var+1, ":")
length_ref=len(var[9])
length_alt=len(var[10])
if type_var=="insertion":
length_ref=0
else:
length_ref=len(var[9])
if type_var=="deletion":
print("insertion of",var[10])
elif type_var=="deletion":
length_alt=0
print("deletion of",var[9])
else:
length_alt=len(var[10])
print("\nvariation",index_var+1, ":")
print("substitution of",var[9],"by",var[10])
posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3]
sequenceCDS_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
sequenceCDS_source=leftover_ref[readingframe:]+Features[cds_id].sequence
print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14])
if (index_cds==0) & (posVar[0]<3):
print("variation of the start codon, mRNA most likely wont be translated")
......@@ -302,107 +366,21 @@ if analysis==True:
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=0 -> reading frame recovered
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if frameshift==0:
print("recovery of the original reading frame")
print("recovery of the original reading frame")
if old_frameshift==0:
print("loss of the original reading frame")
#else:
# print("variation not causing a frameshift")
stop=test(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id)
if type_var=="insertion":
print("insertion of",var[10])
elif type_var=="deletion":
print("deletion of",var[9])
else:
print("substitution of",var[9],"by",var[10])
len_fragment_before_del=(posVar[0])%3
len_fragment_before_ins=(posVar[1])%3
if frameshift==0:
# print only the local change.
len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
total_del=sequence_ref[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
stop=print_variation_change(total_del,total_ins)
if stop:
break
if stop:
break
else:
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print("frameshift of",frameshift,"base(s) to the right.")
if index_var==len(cds_var[cds_id])-1: # it is the last variation of the cds. translate until the next var in the rest of the mrna
total_total_del=sequence_ref[posVar[0]-len_fragment_before_del:]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
del_leftover_len=len(total_total_del)%3
ins_leftover_len=len(total_total_ins)%3
# if it is not the last cds, add the modifications until the next variation.
# get the leftover from this cds, and check the next one for the rest.
# for this, go through all the cds left, and add the sequence until the first variation encountered.
if index_cds!=len(cds_var)-1: # if this is not the last cds
# add the sequence until the next variation
var_found=False
keys = list(cds_var.keys())
cds_rank = keys.index(cds_id)
cds_rank+=1
nextCds=keys[cds_rank]
while (not var_found) & (keys.index(nextCds)!=len(keys)): # vérifier qu'on dépasse pas
if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
total_total_del+=Features[nextCds].sequence
total_total_ins+=get_sequence_on_genome(nextCds,segments_on_target_genome)
cds_rank+=1
nextCds=keys[cds_rank]
print("no var in current cds")
else: # variation found
variation=cds_var[nextCds][0]
nextReadingframe=(3-(len(sequence_ref)-readingframe))%3
len_fragment_before_del_nextvar=(int(variation[12])-nextReadingframe)%3
len_fragment_before_ins_nextvar=(int(variation[13])+(3-frameshift)-nextReadingframe)%3
total_total_del+=Features[nextCds].sequence[0:int(variation[12])-len_fragment_before_del_nextvar]
total_total_ins+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_fragment_before_ins_nextvar]
#print("pos_var",variation[12],variation[13])
#print("next readingframe, frameshift",nextReadingframe,frameshift)
#print("len frag before",len_fragment_before_del_nextvar,len_fragment_before_ins_nextvar)
#print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
var_found=True
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
else: # not the last variation. translate until the next var.
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12]),int(nextVar[13])]
if nextVar[8]=="insertion":
length_ref_nextvar=0
else:
length_ref_nextvar:len(nextVar[9])
if nextVar[8]=="deletion":
length_alt_nextvar=0
else:
length_alt_nextvar=len(nextVar[10])
len_fragment_before_del_nextvar=(posNextVar[0])%3
len_fragment_before_ins_nextvar=(posNextVar[1])%3
total_total_del=sequence_ref[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds
leftover_ref=sequence_ref[len(sequence_ref)-3:]
leftover_target=sequence_target[len(sequence_target)-3:]
readingframe=(3-(len(sequence_ref)-readingframe))%3
leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:]
leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:]
readingframe=(3-(len(sequenceCDS_source)-readingframe))%3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment