# TODO: also check that no stop codon is introduced or deleted.
def print_variation_change(deleted_sequence,inserted_sequence):
    """Print the consequence of replacing one nucleotide stretch by another.

    Each sequence is passed through get_rna() and then traduction(); the two
    results (presumably amino-acid sequences - confirm against traduction)
    are compared and exactly one message is printed.

    deleted_sequence: stretch that is replaced (callers pass a slice of the
        reference cds sequence).
    inserted_sequence: stretch that replaces it (callers pass a slice of the
        target sequence).
    """
    aa_before=traduction(get_rna(deleted_sequence))
    aa_after=traduction(get_rna(inserted_sequence))
    has_before=len(aa_before)!=0
    has_after=len(aa_after)!=0

    if not has_after:
        # Nothing is translated on the inserted side; this branch is also
        # the one reached when both translations are empty.
        print("conséquence : deletion de",",".join(aa_before))
        return

    if not has_before:
        # Nothing is translated on the deleted side.
        print("conséquence : insertion de",",".join(aa_after))
        return

    # Both sides translate to something: report a change or a synonymous mutation.
    if has_before and has_after and aa_before!=aa_after:
        print("conséquence : changement de",",".join(aa_before),"en",",".join(aa_after))
    else:
        print("conséquence : mutation synonyme dans",",".join(aa_before))
@@ -316,21 +317,10 @@ for cds_id in cds_var.keys(): else: print(type_var,"de",var[9],"par",var[10]) - len_fragment_after=(3-length_ref)%3 - deleted_aa=traduction(get_rna(cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after])) - inserted_aa=traduction(get_rna(sequence_target[posVar[1]:posVar[1]+length_alt+len_fragment_after])) - - if (length_ref!=0) & (length_alt!=0): - if deleted_aa!=inserted_aa: - print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa)) - else: - print("conséquence : mutation synonyme dans",",".join(deleted_aa)) - elif length_alt!=0: - print("conséquence : insertion de",",".join(inserted_aa)) - else: - print("conséquence : deletion de",",".join(deleted_aa)) - + deleted_sequence=cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after] + inserted_sequence=sequence_target[posVar[1]:posVar[1]+length_alt+len_fragment_after] + print_variation_change(deleted_sequence,inserted_sequence) else: # taille diff 3k, position !=3k print("variation au milieu d'un codon sans décalage du cadre de lecture") @@ -347,13 +337,7 @@ for cds_id in cds_var.keys(): total_ins=sequence_target[posVar[1]-len_fragment_before:posVar[1]+length_alt+len_fragment_after] total_del=cds.sequence[posVar[0]-len_fragment_before:posVar[0]+length_ref+len_fragment_after] - deleted_aa=traduction(get_rna(total_del)) - inserted_aa=traduction(get_rna(total_ins)) - if deleted_aa!=inserted_aa: - print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa)) - else: - print("conséquence : mutation synonyme dans",",".join(deleted_aa)) - + print_variation_change(total_del,total_ins) # possibilité que j'ai print en compte une variation de trop, si on a un snp sur la premiere et la derniere base d'un codon : # pour le traitement de la première j'ai également considéré la dernière ! @@ -365,11 +349,6 @@ for cds_id in cds_var.keys(): # frameshift=0 -> cadre de lecture rétabli. peut nécessiter d'aller chercher une base en amont. 
# frameshift=1 -> reading frame shifted by 1 base to the right
# frameshift=2 -> reading frame shifted by 2 bases to the right
#
# NOTE(review): this section runs inside the per-variation loop, in the branch
# handling variations whose length difference is not a multiple of 3. The
# patch text lost its indentation, so the nesting below was rebuilt from the
# syntax and the comments; re-indent it to the enclosing level when merging.

# Offset of the variation start modulo 3, on the reference cds (del) and on
# the target sequence (ins).
len_fragment_before_del=(posVar[0])%3
len_fragment_before_ins=(posVar[1])%3

if frame_shift==0:
    # Frame restored by this variation: print only the local change.
    # Each slice is padded on the right so that before+variation+after is a
    # multiple of 3.
    len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
    len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
    total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
    total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
    print_variation_change(total_del,total_ins)
    print("rétablissement du cadre de lecture originel")

else:
    # Frame still shifted: print the changes from this variation up to the
    # next one (or up to the end of the cds).
    print("cadre de lecture décalé de",frame_shift,"base(s) vers la droite.")
    if old_frameshift==0:
        print("perte du cadre de lecture originel")
    if index==len(cds_var[cds_id])-1:
        # Last variation of this cds: translate up to the end of the cds.
        total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:]
        total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
        print_variation_change(total_total_del,total_total_ins)
    else:
        nextVar=cds_var[cds_id][index+1]
        posNextVar=[int(nextVar[12]),int(nextVar[13])]

        if nextVar[8]=="insertion":
            length_ref_nextvar=0
        else:
            # Was `length_ref_nextvar:len(nextVar[9])`: a bare annotation,
            # which never binds the name. It has to be an assignment.
            length_ref_nextvar=len(nextVar[9])
        if nextVar[8]=="deletion":
            length_alt_nextvar=0
        else:
            length_alt_nextvar=len(nextVar[10])

        # Stop at the start of the slice that will be printed for the next
        # variation (its position minus its own modulo-3 offset).
        len_fragment_before_del_nextvar=(posNextVar[0])%3
        len_fragment_before_ins_nextvar=(posNextVar[1])%3
        total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
        total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
        print_variation_change(total_total_del,total_total_ins)