from Graph_gff import Features,load_intersect from Functions import get_segment_sequence,convert_strand target_genome_name="genome2_chr10" pos_seg=target_genome_name+".bed" var_file=target_genome_name+"_variations.txt" gfa="graph.gfa" intersect_path='intersect.bed' load_intersect(intersect_path) # présent dans Fuctions.py mais relou à importer ? def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() seg_seq={} paths={} for line in lines_gfa: line=line.split() if (line[0]=="S"): # get the sequence of the segment seg_id='s'+line[1] seg_seq[seg_id]=line[2] if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome path=line[6].replace(">",";>") path=path.replace("<",";<").split(';') list_path=[] for segment in path: if segment[0:1]=='>': list_path.append('+s'+segment[1:]) elif segment[0:1]=='<': list_path.append('-s'+segment[1:]) paths[line[3]]=list_path return [paths,seg_seq] # présent dans Fuctions.py mais relou à importer ? def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome bed=open(pos_seg,'r') lines=bed.readlines() # read line by line ? bed.close() segments_on_target_genome={} for line in lines: line=line.split() [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]] segments_on_target_genome[seg]=[chrom,start,stop,strand] return segments_on_target_genome def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object feature_sequence="" for segment in feature.segments_list: if segment==feature.segments_list[0]: feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:] # revérifier les +/- 1 pour la position, avec de vraies données elif segment==feature.segments_list[-1]: feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données else: feature_sequence+=get_segment_sequence(seg_seq,segment) feature.sequence=feature_sequence def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome first_seg_found='' for segment in list_seg: if segment[1:] in segments_on_target_genome: first_seg_found=segment[1:] break return first_seg_found def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome first_strand=convert_strand(segments_on_target_genome[first_seg][3]) first_seg_stranded=first_strand+first_seg last_strand=convert_strand(segments_on_target_genome[last_seg][3]) last_seg_stranded=last_strand+last_seg index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) first_index=min(index_first_seg,index_last_seg) last_index=max(index_last_seg,index_first_seg) list_segfeat_azu=paths[target_genome_name][first_index:last_index+1] list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] return list_segfeat_azu_corrected def get_rna(dna_sequence): return dna_sequence.replace("T","U") def get_aa(rna_codon): # gives the aa coded by the rna codon match rna_codon[0:2]: case "UU": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Phe" else: return "Leu" case "UC": return "Ser" case "UA": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Tyr" else: return "*" case "UG": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Cys" elif rna_codon[2]=="A": return "*" else: return "Trp" case "CU": return "Leu" case "CC": return "Pro" case "CA": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "His" else: return "Gln" case "CG": return "Arg" case "AU": if rna_codon[2]=="G": return "Met" else: return "Ile" case "AC": return "Thr" case "AA": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Asn" else: return "Lys" case "AG": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Ser" else: return "Arg" case "GU": return "Val" case "GC": return "Ala" case "GA": if (rna_codon[2]=="U") | (rna_codon[2]=="C"): return "Asp" else: return "Glu" case "GG": return "Gly" from textwrap import wrap def cut_codon(sequence): return wrap(sequence,3) def traduction(sequence_arn): # translate rna list_codons=cut_codon(sequence_arn) prot=list() for codon in list_codons: if len(codon)==3: prot.append(get_aa(codon)) else: print("attempt to get the amino acid for an incomplete codon") return prot def get_sequence_on_genome(feature,segments_on_target_genome): # returns the sequence of the feature on the target genome list_seg=Features[feature].segments_list first_seg=get_first_seg(list_seg,segments_on_target_genome) last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome) path_on_target=get_feature_path(paths,first_seg,last_seg) new_sequence="" for segment in path_on_target: if segment==cds.segments_list[0]: new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-2:] elif segment==cds.segments_list[-1]: new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop] else: new_sequence+=get_segment_sequence(seg_seq,segment) return new_sequence def get_cds_variations(var_file): # creates dict : cds_id -> list of variations from the var file var=open(var_file,'r') lines=var.readlines() var.close() # dict cds-var cds_var={} for line in lines: line=line.split() if line[1]=="CDS": cds_id=line[0].replace('.','_').replace(':','_') if cds_id not in cds_var.keys(): cds_var[cds_id]=list() cds_var[cds_id].append(line) return cds_var import re def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one print("\nlooking for a new start codon:") frame_shift=0 # chercher codon start en amont du cds, dans le mrna seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome) seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome) sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence print("sequence before the CDS in the mRNA:",sequence_amont) if "ATG" in sequence_amont: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)] stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)] print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg. # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !) print("checking if there is a stop codon after the start codons in the same reading frame:") for start_pos in start_pos_list: start_pos_frame=start_pos%3 n=len(sequence_amont)-start_pos+1 if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) : #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds") # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture ! frame_shift=(frame_shift+n)%3 # vérifier le frame shift !! print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame") else: print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame") # chercher codon start en aval, dans le cds if "ATG" in seq_cds: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)] print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier print("\n") return frame_shift def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion stop=False deleted_aa=traduction(get_rna(deleted_sequence)) inserted_aa=traduction(get_rna(inserted_sequence)) if (len(deleted_aa)!=0) & (len(inserted_aa)!=0): if deleted_aa!=inserted_aa: print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa)) else: print("consequence : synonymous variation in",",".join(deleted_aa)) elif len(inserted_aa)!=0: print("consequence : insertion of",",".join(inserted_aa)) else: print("consequence : deletion of",",".join(deleted_aa)) if ("*" in inserted_aa): print("stop codon apparition in the cds of the target genome") stop=True return stop [paths,seg_seq]=get_segments_sequence_and_paths(gfa) segments_on_target_genome=get_segments_positions_on_genome(pos_seg) cds_var=get_cds_variations(var_file) for feature in Features.values(): # add the sequence of all features add_feature_sequence(feature,seg_seq) # analysing the variations for all the cds : for cds_id in cds_var.keys(): cds=Features[cds_id] print("analysis of the variations in the CDS",cds_id,":\n") frame_shift=0 for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : type_var=var[8] if type_var!="no_var": # if there is a variation posVar=[int(var[12]),int(var[13])] sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome) if type_var=="insertion": length_ref=0 else: length_ref=len(var[9]) if type_var=="deletion": length_alt=0 else: length_alt=len(var[10]) print("variation",index, ":") if posVar[0]<=3: print("variation of the start codon, mRNA most likely wont be translated") #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon break if abs(length_alt-length_ref)%3 == 0: # size diff 3k -> no frame shift. if (posVar[0])%3==0: #size diff 3k, position 3k print("variation between two codons not causing a frameshift") if type_var=="insertion": print("insertion of",var[10]) elif type_var=="deletion": print("deletion of",var[9]) else: print("substitution of",var[9],"by",var[10]) len_fragment_after=(3-length_ref)%3 deleted_sequence=cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after] inserted_sequence=sequence_target[posVar[1]:posVar[1]+length_alt+len_fragment_after] stop=print_variation_change(deleted_sequence,inserted_sequence) if stop: break else: # size diff 3k, position !=3k print("variation in the middle of a codon not causing a frameshift") if type_var=="insertion": print("insertion of",var[10]) elif type_var=="deletion": print("deletion of",var[9]) else: print("substitution of",var[9],"by",var[10]) len_fragment_before=(posVar[0])%3 len_fragment_after=(3-(len_fragment_before+length_ref))%3 total_ins=sequence_target[posVar[1]-len_fragment_before:posVar[1]+length_alt+len_fragment_after] total_del=cds.sequence[posVar[0]-len_fragment_before:posVar[0]+length_ref+len_fragment_after] stop=print_variation_change(total_del,total_ins) if stop: break # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, # while printing the effect of the first snp we aso use the second one. else: # size diff !=3k print("frameshift variation") old_frameshift=frame_shift frame_shift=(frame_shift+length_ref-length_alt)%3 # frameshift=0 -> reading frame recovered. may need to get a base before. # frameshift=1 -> frame shift of 1 base to the right # frameshift=2 -> frame shift of 2 bases to the right if type_var=="insertion": print("insertion of",var[10]) elif type_var=="deletion": print("deletion of",var[9]) else: print("substitution of",var[9],"by",var[10]) len_fragment_before_del=(posVar[0])%3 len_fragment_before_ins=(posVar[1])%3 if frame_shift==0: # print only the local change. len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] stop=print_variation_change(total_del,total_ins) print("recovery of the original reading frame") if stop: break else: # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. print("creating frame shift of",frame_shift,"base(s) to the right.") if old_frameshift==0: print("loss of the original reading frame") if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds. total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] stop=print_variation_change(total_total_del,total_total_ins) if stop: break else: nextVar=cds_var[cds_id][index+1] posNextVar=[int(nextVar[12]),int(nextVar[13])] if nextVar[8]=="insertion": length_ref_nextvar=0 else: length_ref_nextvar:len(nextVar[9]) if nextVar[8]=="deletion": length_alt_nextvar=0 else: length_alt_nextvar=len(nextVar[10]) len_fragment_before_del_nextvar=(posNextVar[0])%3 len_fragment_before_ins_nextvar=(posNextVar[1])%3 total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] stop=print_variation_change(total_total_del,total_total_ins) if stop: break print("\n")