From 35e00a5251d2dfaef385f1b0f08b4a319e0cd568 Mon Sep 17 00:00:00 2001 From: NMarthe <nina.marthe@ird.fr> Date: Mon, 4 Dec 2023 10:41:47 +0100 Subject: [PATCH] added comments and translated the comments and the output --- inference.py | 250 ++++++++++++++++++++------------------------------- 1 file changed, 97 insertions(+), 153 deletions(-) diff --git a/inference.py b/inference.py index a6a9d0c..0458958 100644 --- a/inference.py +++ b/inference.py @@ -4,16 +4,15 @@ from Functions import get_segment_sequence,convert_strand target_genome_name="genome2_chr10" pos_seg=target_genome_name+".bed" var_file=target_genome_name+"_variations.txt" - - +gfa="graph.gfa" intersect_path='intersect.bed' load_intersect(intersect_path) -var=open(var_file,'r') -gfa="graph.gfa" -def get_segments_sequence_and_paths(gfa): + +# présent dans Fuctions.py mais relou à importer ? +def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() @@ -39,21 +38,19 @@ def get_segments_sequence_and_paths(gfa): paths[line[3]]=list_path return [paths,seg_seq] -[paths,seg_seq]=get_segments_sequence_and_paths(gfa) - -segments_on_target_genome={} - -def get_segments_positions_on_genome(pos_seg): +# présent dans Fuctions.py mais relou à importer ? +def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome bed=open(pos_seg,'r') lines=bed.readlines() # read line by line ? bed.close() + segments_on_target_genome={} for line in lines: line=line.split() [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]] segments_on_target_genome[seg]=[chrom,start,stop,strand] -get_segments_positions_on_genome(pos_seg) + return segments_on_target_genome -def add_feature_sequence(feature,seg_seq): +def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object feature_sequence="" for segment in feature.segments_list: if segment==feature.segments_list[0]: @@ -64,7 +61,7 @@ def add_feature_sequence(feature,seg_seq): feature_sequence+=get_segment_sequence(seg_seq,segment) feature.sequence=feature_sequence -def get_first_seg(list_seg,segments_on_target_genome): +def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome first_seg_found='' for segment in list_seg: if segment[1:] in segments_on_target_genome: @@ -72,8 +69,7 @@ def get_first_seg(list_seg,segments_on_target_genome): break return first_seg_found -def get_feature_path(paths,first_seg,last_seg): - # find the path in the target genome +def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome first_strand=convert_strand(segments_on_target_genome[first_seg][3]) first_seg_stranded=first_strand+first_seg last_strand=convert_strand(segments_on_target_genome[last_seg][3]) @@ -86,29 +82,27 @@ def get_feature_path(paths,first_seg,last_seg): list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] return list_segfeat_azu_corrected - def get_rna(dna_sequence): return dna_sequence.replace("T","U") -# penser à transcrire la séquence codante du gène !! -def get_aa(codon): - match codon[0:2]: +def get_aa(rna_codon): # gives the aa coded by the rna codon + match rna_codon[0:2]: case "UU": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Phe" else: return "Leu" case "UC": return "Ser" case "UA": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Tyr" else: return "*" case "UG": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Cys" - elif codon[2]=="A": + elif rna_codon[2]=="A": return "*" else: return "Trp" @@ -117,26 +111,26 @@ def get_aa(codon): case "CC": return "Pro" case "CA": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "His" else: return "Gln" case "CG": return "Arg" case "AU": - if codon[2]=="G": + if rna_codon[2]=="G": return "Met" else: return "Ile" case "AC": return "Thr" case "AA": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Asn" else: return "Lys" case "AG": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Ser" else: return "Arg" @@ -145,60 +139,25 @@ def get_aa(codon): case "GC": return "Ala" case "GA": - if (codon[2]=="U") | (codon[2]=="C"): + if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Asp" else: return "Glu" case "GG": return "Gly" +from textwrap import wrap +def cut_codon(sequence): + return wrap(sequence,3) -def traduction(sequence_arn): - list_codons=decoupe_codon(sequence_arn) +def traduction(sequence_arn): # translate rna + list_codons=cut_codon(sequence_arn) prot=list() for codon in list_codons: prot.append(get_aa(codon)) return prot - -from textwrap import wrap -def decoupe_codon(sequence): - return wrap(sequence,3) - - - - - - - - -def get_sequence_before(first_seg,seg_seq,n,paths,feat): - first_strand=convert_strand(first_seg[0]) - first_seg_stranded=first_strand+first_seg[1:] - index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) - - sequence_before=seg_seq[first_seg[1:]][0:feat.pos_start-1] # sequence left on the segment on which the cds start (can be empty) - current_index=index_first_seg-1 - while (len(sequence_before)<n) & (current_index>=0): - segment=paths[target_genome_name][current_index] - sequence_before=seg_seq[segment[1:]]+sequence_before - current_index-=1 - return sequence_before[0:99] - -def get_sequence_after(last_seg,seg_seq,n,paths,feat): - last_strand=convert_strand(last_seg[0]) - last_seg_stranded=last_strand+last_seg[1:] - index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) - - sequence_after=seg_seq[last_seg[1:]][feat.pos_stop:] # sequence left on the segment on which the cds ends (can be empty) - current_index=index_last_seg+1 - while (len(sequence_after)<n) & (current_index>len(paths[target_genome_name])): - segment=paths[target_genome_name][current_index] - sequence_after=sequence_after+seg_seq[segment[1:]] - current_index+=1 - return sequence_after[len(sequence_after)-100:] - -def get_sequence_on_genome(feature,segments_on_target_genome): +def get_sequence_on_genome(feature,segments_on_target_genome): # returns the sequence of the feature on the target genome list_seg=Features[feature].segments_list first_seg=get_first_seg(list_seg,segments_on_target_genome) last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome) @@ -214,36 +173,37 @@ def get_sequence_on_genome(feature,segments_on_target_genome): new_sequence+=get_segment_sequence(seg_seq,segment) return new_sequence +def get_cds_variations(var_file): # creates dict : cds_id -> list of variations from the var file + var=open(var_file,'r') + lines=var.readlines() + var.close() + # dict cds-var + cds_var={} + for line in lines: + line=line.split() + if line[1]=="CDS": + cds_id=line[0].replace('.','_').replace(':','_') + if cds_id not in cds_var.keys(): + cds_var[cds_id]=list() + cds_var[cds_id].append(line) + return cds_var -lines=var.readlines() -var.close() - -# dict cds-var -cds_var={} -for line in lines: - line=line.split() - if line[1]=="CDS": - cds_id=line[0].replace('.','_').replace(':','_') - if cds_id not in cds_var.keys(): - cds_var[cds_id]=list() - cds_var[cds_id].append(line) - - -def findOtherStart(cds,segments_on_target_genome): - print("\nrecherche de nouveau codon start:") +import re +def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one + print("\nlooking for a new start codon:") frame_shift=0 # chercher codon start en amont du cds, dans le mrna seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome) seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome) sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence - print("sequence en amont dans le mRNA:",sequence_amont) + print("sequence before the CDS in the mRNA:",sequence_amont) if "ATG" in sequence_amont: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)] stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)] - print("position des starts:",start_pos_list,"position des stops:",stop_pos_list,"en amont du cds") # positions (overlapping) où on trouve un atg. + print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg. # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !) - print("verification des stops après les starts:") + print("checking if there is a stop codon after the start codons in the same reading frame:") for start_pos in start_pos_list: start_pos_frame=start_pos%3 n=len(sequence_amont)-start_pos+1 @@ -251,51 +211,51 @@ def findOtherStart(cds,segments_on_target_genome): #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds") # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture ! frame_shift=(frame_shift+n)%3 # vérifier le frame shift !! - print("le start à la position",start_pos,",",n,"bases en amont du cds, n'a pas de stop en aval dans le même cadre de lecture") + print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame") else: - print("le start à la position",start_pos,",",n,"bases en amont du cds, a un stop en aval dans le même cadre de lecture") + print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame") # chercher codon start en aval, dans le cds if "ATG" in seq_cds: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)] - print("codon start candidat trouvé plus loin dans le cds, à la base",start_pos_list[0]) # print seulement le premier + print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier print("\n") return frame_shift - - -# ajouter la vérification qu'on introduit/delete pas un codon stop -def print_variation_change(deleted_sequence,inserted_sequence): +def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion stop=False deleted_aa=traduction(get_rna(deleted_sequence)) inserted_aa=traduction(get_rna(inserted_sequence)) if (len(deleted_aa)!=0) & (len(inserted_aa)!=0): if deleted_aa!=inserted_aa: - print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa)) + print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa)) else: - print("conséquence : mutation synonyme dans",",".join(deleted_aa)) + print("consequence : synonymous variation in",",".join(deleted_aa)) elif len(inserted_aa)!=0: - print("conséquence : insertion de",",".join(inserted_aa)) + print("consequence : insertion of",",".join(inserted_aa)) else: - print("conséquence : deletion de",",".join(deleted_aa)) + print("consequence : deletion of",",".join(deleted_aa)) if ("*" in inserted_aa): - print("apparition d'un codon stop dans le cds du génome cible") + print("stop codon apparition in the cds of the target genome") stop=True return stop -version="new" -import re -for feature in Features.values(): + +[paths,seg_seq]=get_segments_sequence_and_paths(gfa) +segments_on_target_genome=get_segments_positions_on_genome(pos_seg) +cds_var=get_cds_variations(var_file) +for feature in Features.values(): # add the sequence of all features add_feature_sequence(feature,seg_seq) + +# analysing the variations for all the cds : for cds_id in cds_var.keys(): cds=Features[cds_id] - print("analyse des variations dans le cds",cds_id,"\n") + print("analysis of the variations in the CDS",cds_id,"\n") frame_shift=0 - #for var in cds_var[cds_id]: - for index, var in enumerate(cds_var[cds_id]): + for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : type_var=var[8] if type_var!="no_var": # if there is a variation posVar=[int(var[12]),int(var[13])] @@ -310,18 +270,18 @@ for cds_id in cds_var.keys(): else: length_alt=len(var[10]) - print("variation",index) + print("variation",index, ":") - if abs(length_alt-length_ref)%3 == 0: # taille diff 3k -> pas de frame shift. + if abs(length_alt-length_ref)%3 == 0: # size diff 3k -> no frame shift. - if (posVar[0])%3==0: # taille diff 3k, position 3k - print("variation entre deux codons sans décalage du cadre de lecture") + if (posVar[0])%3==0: #size diff 3k, position 3k + print("variation between two codons not causing a frameshift") if type_var=="insertion": - print(type_var,"de",var[10]) + print("insertion of",var[10]) elif type_var=="deletion": - print(type_var,"de",var[9]) + print("deletion of",var[9]) else: - print(type_var,"de",var[9],"par",var[10]) + print("substitution of",var[9],"by",var[10]) len_fragment_after=(3-length_ref)%3 deleted_sequence=cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after] @@ -330,14 +290,14 @@ for cds_id in cds_var.keys(): if stop: break - else: # taille diff 3k, position !=3k - print("variation au milieu d'un codon sans décalage du cadre de lecture") + else: # size diff 3k, position !=3k + print("variation in the middle of a codon not causing a frameshift") if type_var=="insertion": - print(type_var,"de",var[10]) + print("insertion of",var[10]) elif type_var=="deletion": - print(type_var,"de",var[9]) + print("deletion of",var[9]) else: - print(type_var,"de",var[9],"par",var[10]) + print("substitution of",var[9],"by",var[10]) len_fragment_before=(posVar[0])%3 len_fragment_after=(3-(len_fragment_before+length_ref))%3 @@ -348,24 +308,24 @@ for cds_id in cds_var.keys(): stop=print_variation_change(total_del,total_ins) if stop: break - # possibilité que j'ai print en compte une variation de trop, si on a un snp sur la premiere et la derniere base d'un codon : - # pour le traitement de la première j'ai également considéré la dernière ! + # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, + # while printing the effect of the first snp we aso use the second one. - else: # taille diff !=3k - print("changement du cadre de lecture") + else: # size diff !=3k + print("frameshift variation") old_frameshift=frame_shift frame_shift=(frame_shift+length_ref-length_alt)%3 - # frameshift=0 -> cadre de lecture rétabli. peut nécessiter d'aller chercher une base en amont. - # frameshift=1 -> cadre de lecture décalé de 1 base vers la droite - # frameshift=2 -> cadre de lecture décalé de 2 bases vers la droite + # frameshift=0 -> reading frame recovered. may need to get a base before. + # frameshift=1 -> frame shift of 1 base to the right + # frameshift=2 -> frame shift of 2 bases to the right if type_var=="insertion": - print(type_var,"de",var[10]) + print("insertion of",var[10]) elif type_var=="deletion": - print(type_var,"de",var[9]) + print("deletion of",var[9]) else: - print(type_var,"de",var[9],"par",var[10]) + print("substitution of",var[9],"by",var[10]) len_fragment_before_del=(posVar[0])%3 len_fragment_before_ins=(posVar[1])%3 @@ -377,16 +337,16 @@ for cds_id in cds_var.keys(): total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] stop=print_variation_change(total_del,total_ins) - print("rétablissement du cadre de lecture originel") + print("recovery of the original reading frame") if stop: break else: - # print changes from local to next var - print("cadre de lecture décalé de",frame_shift,"base(s) vers la droite.") + # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. + print("creating frame shift of",frame_shift,"base(s) to the right.") if old_frameshift==0: - print("perte du cadre de lecture originel") - if index==len(cds_var[cds_id])-1: # on est sur la dernière variation. traduire jusqu'à la fin du cds + print("loss of the original reading frame") + if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds. total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] stop=print_variation_change(total_total_del,total_total_ins) @@ -413,25 +373,9 @@ for cds_id in cds_var.keys(): if stop: break - - - - - - - - if posVar[0]<=3: # pour l'instant on cherche pas d'autre start. - print("codon start touché, gène non fonctionnel") - #findOtherStart(cds,segments_on_target_genome) + if posVar[0]<=3: + print("start codon affected, mRNA most likely wont be translated") + #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon break - - - - # si on a plusieurs start candidats il faut choisir celui qu'on prend, ils sont pas forcément tous sur le même cadre de lecture - # comment traiter la variation sur le start ? - # après avoir détecté que la var touche le start, traiter la var ? ou l'inverse - - - print("\n") - + print("\n") \ No newline at end of file -- GitLab