From 2dff4cf57ec7142da28f6eb8db730d5431566236 Mon Sep 17 00:00:00 2001 From: NMarthe <nina.marthe@ird.fr> Date: Thu, 7 Dec 2023 13:35:33 +0100 Subject: [PATCH] corrected the inference when there is a frameshift. it simplfied the algorithm (less conditions) --- inference.py | 150 ++++++++++++++++++++------------------------------- 1 file changed, 58 insertions(+), 92 deletions(-) diff --git a/inference.py b/inference.py index 052175e..84d4d16 100644 --- a/inference.py +++ b/inference.py @@ -213,7 +213,7 @@ def findOtherStart(cds,segments_on_target_genome): # look for another start codo if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) : #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds") # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture ! - frame_shift=(frame_shift+n)%3 # vérifier le frame shift !! + frame_shift=(frame_shift+n)%3 # vérifier le frameshift !! print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame") else: print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame") @@ -256,7 +256,7 @@ for feature in Features.values(): # add the sequence of all features for cds_id in cds_var.keys(): cds=Features[cds_id] print("analysis of the variations in the CDS",cds_id,":\n") - frame_shift=0 + frameshift=0 for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : type_var=var[8] if type_var!="no_var": # if there is a variation @@ -279,106 +279,72 @@ for cds_id in cds_var.keys(): #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon break - if abs(length_alt-length_ref)%3 == 0: # size diff 3k -> no frame shift. - - if (posVar[0])%3==0: #size diff 3k, position 3k - print("variation between two codons not causing a frameshift") - if type_var=="insertion": - print("insertion of",var[10]) - elif type_var=="deletion": - print("deletion of",var[9]) - else: - print("substitution of",var[9],"by",var[10]) + old_frameshift=frameshift + frameshift=(frameshift+length_ref-length_alt)%3 + if old_frameshift!=frameshift: + print("variation causing a frameshift") + # frameshift=0 -> reading frame recovered. may need to get a base before. + # frameshift=1 -> frameshift of 1 base to the right + # frameshift=2 -> frameshift of 2 bases to the right + if frameshift==0: + print("recovery of the original reading frame") + if old_frameshift==0: + print("loss of the original reading frame") + #else: + # print("variation not causing a frameshift") - len_fragment_after=(3-length_ref)%3 - deleted_sequence=cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after] - inserted_sequence=sequence_target[posVar[1]:posVar[1]+length_alt+len_fragment_after] - stop=print_variation_change(deleted_sequence,inserted_sequence) - if stop: - break - else: # size diff 3k, position !=3k - print("variation in the middle of a codon not causing a frameshift") - if type_var=="insertion": - print("insertion of",var[10]) - elif type_var=="deletion": - print("deletion of",var[9]) - else: - print("substitution of",var[9],"by",var[10]) + if type_var=="insertion": + print("insertion of",var[10]) + elif type_var=="deletion": + print("deletion of",var[9]) + else: + print("substitution of",var[9],"by",var[10]) - len_fragment_before=(posVar[0])%3 - len_fragment_after=(3-(len_fragment_before+length_ref))%3 + len_fragment_before_del=(posVar[0])%3 + len_fragment_before_ins=(posVar[1])%3 - total_ins=sequence_target[posVar[1]-len_fragment_before:posVar[1]+length_alt+len_fragment_after] - total_del=cds.sequence[posVar[0]-len_fragment_before:posVar[0]+length_ref+len_fragment_after] + if frameshift==0: + # print only the local change. + len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 + len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 + total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] + total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] + stop=print_variation_change(total_del,total_ins) + if stop: + break - stop=print_variation_change(total_del,total_ins) + else: + # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. + print("frameshift of",frameshift,"base(s) to the right.") + if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds. + total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] + total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] + stop=print_variation_change(total_total_del,total_total_ins) if stop: break - # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, - # while printing the effect of the first snp we aso use the second one. + else: # not the last variation. translate until the next var. + nextVar=cds_var[cds_id][index+1] + posNextVar=[int(nextVar[12]),int(nextVar[13])] - else: # size diff !=3k - print("variation causing a frameshift") - old_frameshift=frame_shift - frame_shift=(frame_shift+length_ref-length_alt)%3 - # frameshift=0 -> reading frame recovered. may need to get a base before. - # frameshift=1 -> frame shift of 1 base to the right - # frameshift=2 -> frame shift of 2 bases to the right - - if type_var=="insertion": - print("insertion of",var[10]) - elif type_var=="deletion": - print("deletion of",var[9]) - else: - print("substitution of",var[9],"by",var[10]) - - len_fragment_before_del=(posVar[0])%3 - len_fragment_before_ins=(posVar[1])%3 - print(len_fragment_before_ins,len_fragment_before_del) - - if frame_shift==0: - # print only the local change. - len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 - len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 - total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] - total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] - print("recovery of the original reading frame") - stop=print_variation_change(total_del,total_ins) + if nextVar[8]=="insertion": + length_ref_nextvar=0 + else: + length_ref_nextvar:len(nextVar[9]) + if nextVar[8]=="deletion": + length_alt_nextvar=0 + else: + length_alt_nextvar=len(nextVar[10]) + + len_fragment_before_del_nextvar=(posNextVar[0])%3 + len_fragment_before_ins_nextvar=(posNextVar[1])%3 + total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] + total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] + stop=print_variation_change(total_total_del,total_total_ins) if stop: break - - else: - # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. - print("creating frame shift of",frame_shift,"base(s) to the right.") - if old_frameshift==0: - print("loss of the original reading frame") - if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds. - total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break - else: - nextVar=cds_var[cds_id][index+1] - posNextVar=[int(nextVar[12]),int(nextVar[13])] - - if nextVar[8]=="insertion": - length_ref_nextvar=0 - else: - length_ref_nextvar:len(nextVar[9]) - if nextVar[8]=="deletion": - length_alt_nextvar=0 - else: - length_alt_nextvar=len(nextVar[10]) - - len_fragment_before_del_nextvar=(posNextVar[0])%3 - len_fragment_before_ins_nextvar=(posNextVar[1])%3 - total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break + # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, + # while printing the effect of the first snp we aso use the second one. print("\n") -- GitLab