From e243b92023d7349c2ea62ccfa6dcd1ff1efad710 Mon Sep 17 00:00:00 2001 From: NMarthe <nina.marthe@ird.fr> Date: Thu, 18 Jan 2024 15:59:45 +0100 Subject: [PATCH] simplified the code for inference, made a function for it, attempted to handle multi exonic genes --- inference.py | 188 +++++++++++++++++++++++---------------------------- 1 file changed, 83 insertions(+), 105 deletions(-) diff --git a/inference.py b/inference.py index 97532da..b2f1c31 100644 --- a/inference.py +++ b/inference.py @@ -236,6 +236,7 @@ def findOtherStart(cds,segments_on_target_genome): # look for another start codo return frame_shift def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion + print("printvar",deleted_sequence,inserted_sequence) stop=False deleted_aa=traduction(get_rna(deleted_sequence)) inserted_aa=traduction(get_rna(inserted_sequence)) @@ -262,6 +263,66 @@ for feature in Features.values(): # add the sequence of all features add_feature_sequence(feature,seg_seq) +def test(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id): + ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3) + ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3) + + if frameshift==0: # only local change + ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon + ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3 + elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation + nextVar=cds_var[cds_id][index_var+1] + posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3] + ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3) + ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3) + else: # last variation, go until the end of the cds + ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source) + ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target) + + ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref] + alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt] + + if (frameshift!=0) & (index_var==len(cds_var[cds_id])-1) & (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds + # get the sequence until the next variation in the next cds + # for this, go through all the cds left, and add the sequence until the first variation encountered + var_found=False + keys = list(cds_var.keys()) + cds_rank = keys.index(cds_id)+1 + nextCds=keys[cds_rank] + current_readingframe=readingframe + + # # look for the next variation, and get the sequence before. + while (not var_found) & (keys.index(nextCds)!=len(keys)): # go through all the cds until the last + if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence + ref_sequence+=Features[nextCds].sequence + alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome) + cds_rank+=1 + nextCds=keys[cds_rank] + #current_readingframe=(3-(len(sequenceCDS_source)-current_readingframe))%3 + print("no var in current cds") + else: # variation found + variation=cds_var[nextCds][0] + + #posNextVar=[int(variation[12])+(3-current_readingframe)%3,int(variation[13])+(3-current_readingframe)%3] + + # revoir le calcul des fragments avant la variation ! + # cas où la var est sur la deuxième base du cds : alors il faut peut être laisser à cette var les bases avant du cds précédent ...... + # et je crois qu'on a toujours pas réglé le cas où on a une sbustitution sur la base 1 et 3 d'un codon. + + nextReadingframe=(3-(len(sequenceCDS_source)-readingframe))%3 + len_codonFragment_beforeNextVar_ref=(int(variation[12])-nextReadingframe)%3 + len_codonFragment_beforeNextVar_alt=(int(variation[13])+(3-frameshift)-nextReadingframe)%3 + ref_sequence+=Features[nextCds].sequence[0:int(variation[12])-len_codonFragment_beforeNextVar_ref] + alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_codonFragment_beforeNextVar_alt] + #print("pos_var",variation[12],variation[13]) + #print("next readingframe, frameshift",nextReadingframe,frameshift) + #print("len frag before",len_codonFragment_beforeNextVar_ref,len_codonFragment_beforeNextVar_alt) + #print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome)) + var_found=True + + return print_variation_change(ref_sequence,alt_sequence) + + analysis=True if analysis==True: # analysing the variations for all the mrna : @@ -270,28 +331,31 @@ if analysis==True: leftover_ref="" leftover_target="" frameshift=0 - readingframe=3 + readingframe=0 cds_var=mrna_var[mrna_id] for index_cds,cds_id in enumerate(cds_var): - cds=Features[cds_id] print("\nCDS",cds_id,":") + print("readingframe",readingframe,"frameshift",frameshift) for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : type_var=var[8] if type_var!="no_var": # if there is a variation - posVar=[int(var[12])+3-readingframe,int(var[13])+3-readingframe] - sequence_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome) - sequence_ref=leftover_ref[readingframe:]+cds.sequence + print("\nvariation",index_var+1, ":") + length_ref=len(var[9]) + length_alt=len(var[10]) if type_var=="insertion": length_ref=0 - else: - length_ref=len(var[9]) - if type_var=="deletion": + print("insertion of",var[10]) + elif type_var=="deletion": length_alt=0 + print("deletion of",var[9]) else: - length_alt=len(var[10]) - - print("\nvariation",index_var+1, ":") + print("substitution of",var[9],"by",var[10]) + + posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3] + sequenceCDS_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome) + sequenceCDS_source=leftover_ref[readingframe:]+Features[cds_id].sequence + print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14]) if (index_cds==0) & (posVar[0]<3): print("variation of the start codon, mRNA most likely wont be translated") @@ -302,107 +366,21 @@ if analysis==True: frameshift=(frameshift+length_ref-length_alt)%3 if old_frameshift!=frameshift: print("variation causing a frameshift") - # frameshift=0 -> reading frame recovered. may need to get a base before. + # frameshift=0 -> reading frame recovered # frameshift=1 -> frameshift of 1 base to the right # frameshift=2 -> frameshift of 2 bases to the right if frameshift==0: - print("recovery of the original reading frame") + print("recovery of the original reading frame") if old_frameshift==0: print("loss of the original reading frame") - #else: - # print("variation not causing a frameshift") + stop=test(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id) - if type_var=="insertion": - print("insertion of",var[10]) - elif type_var=="deletion": - print("deletion of",var[9]) - else: - print("substitution of",var[9],"by",var[10]) - - len_fragment_before_del=(posVar[0])%3 - len_fragment_before_ins=(posVar[1])%3 - - if frameshift==0: - # print only the local change. - len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 - len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 - total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] - total_del=sequence_ref[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] - stop=print_variation_change(total_del,total_ins) - if stop: - break + if stop: + break - else: - # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. - print("frameshift of",frameshift,"base(s) to the right.") - if index_var==len(cds_var[cds_id])-1: # it is the last variation of the cds. translate until the next var in the rest of the mrna - total_total_del=sequence_ref[posVar[0]-len_fragment_before_del:] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] - del_leftover_len=len(total_total_del)%3 - ins_leftover_len=len(total_total_ins)%3 - - # if it is not the last cds, add the modifications until the next variation. - # get the leftover from this cds, and check the next one for the rest. - - # for this, go through all the cds left, and add the sequence until the first variation encountered. - - if index_cds!=len(cds_var)-1: # if this is not the last cds - # add the sequence until the next variation - var_found=False - - keys = list(cds_var.keys()) - cds_rank = keys.index(cds_id) - cds_rank+=1 - nextCds=keys[cds_rank] - - while (not var_found) & (keys.index(nextCds)!=len(keys)): # vérifier qu'on dépasse pas - if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence - total_total_del+=Features[nextCds].sequence - total_total_ins+=get_sequence_on_genome(nextCds,segments_on_target_genome) - cds_rank+=1 - nextCds=keys[cds_rank] - print("no var in current cds") - else: # variation found - variation=cds_var[nextCds][0] - nextReadingframe=(3-(len(sequence_ref)-readingframe))%3 - len_fragment_before_del_nextvar=(int(variation[12])-nextReadingframe)%3 - len_fragment_before_ins_nextvar=(int(variation[13])+(3-frameshift)-nextReadingframe)%3 - total_total_del+=Features[nextCds].sequence[0:int(variation[12])-len_fragment_before_del_nextvar] - total_total_ins+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_fragment_before_ins_nextvar] - #print("pos_var",variation[12],variation[13]) - #print("next readingframe, frameshift",nextReadingframe,frameshift) - #print("len frag before",len_fragment_before_del_nextvar,len_fragment_before_ins_nextvar) - #print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome)) - var_found=True - - - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break - else: # not the last variation. translate until the next var. - nextVar=cds_var[cds_id][index_var+1] - posNextVar=[int(nextVar[12]),int(nextVar[13])] - - if nextVar[8]=="insertion": - length_ref_nextvar=0 - else: - length_ref_nextvar:len(nextVar[9]) - if nextVar[8]=="deletion": - length_alt_nextvar=0 - else: - length_alt_nextvar=len(nextVar[10]) - - len_fragment_before_del_nextvar=(posNextVar[0])%3 - len_fragment_before_ins_nextvar=(posNextVar[1])%3 - total_total_del=sequence_ref[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break - if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds - leftover_ref=sequence_ref[len(sequence_ref)-3:] - leftover_target=sequence_target[len(sequence_target)-3:] - readingframe=(3-(len(sequence_ref)-readingframe))%3 + leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:] + leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:] + readingframe=(3-(len(sequenceCDS_source)-readingframe))%3 -- GitLab