from Graph_gff import Features,load_intersect from Functions import get_segment_sequence target_genome_name="genome2_chr10" pos_seg=target_genome_name+".bed" var_file=target_genome_name+"_variations.txt" gfa="graph.gfa" intersect_path='intersect.bed' load_intersect(intersect_path) def convert_strand(strand): match strand: case "+": return ">" case "-": return "<" case ">": return "+" case "<": return "-" case default: return "" # présent dans Fuctions.py mais relou à importer ? def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() seg_seq={} paths={} for line in lines_gfa: line=line.split() if (line[0]=="S"): # get the sequence of the segment seg_id='s'+line[1] seg_seq[seg_id]=line[2] if (line[0]=="W") and (line[1]!="_MINIGRAPH_"): # get the walk of the genome path=line[6].replace(">",";>") path=path.replace("<",";<").split(';') list_path=[] for segment in path: if segment[0:1]=='>': list_path.append('+s'+segment[1:]) elif segment[0:1]=='<': list_path.append('-s'+segment[1:]) paths[line[3]]=list_path return [paths,seg_seq] # présent dans Fuctions.py mais relou à importer ? def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome bed=open(pos_seg,'r') lines=bed.readlines() # read line by line ? bed.close() segments_on_target_genome={} for line in lines: line=line.split() [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]] segments_on_target_genome[seg]=[chrom,start,stop,strand] return segments_on_target_genome def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object feature_sequence="" for segment in feature.segments_list: if (segment==feature.segments_list[0]) and (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1] if segment==feature.segments_list[0]: feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:] elif segment==feature.segments_list[-1]: feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] else: feature_sequence+=get_segment_sequence(seg_seq,segment) feature.sequence=feature_sequence def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome first_seg_found='' for segment in list_seg: if segment[1:] in segments_on_target_genome: first_seg_found=segment[1:] break return first_seg_found def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome first_strand=convert_strand(segments_on_target_genome[first_seg][3]) first_seg_stranded=first_strand+first_seg last_strand=convert_strand(segments_on_target_genome[last_seg][3]) last_seg_stranded=last_strand+last_seg index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) first_index=min(index_first_seg,index_last_seg) last_index=max(index_last_seg,index_first_seg) list_segfeat_azu=paths[target_genome_name][first_index:last_index+1] list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] return list_segfeat_azu_corrected def get_rna(dna_sequence): return dna_sequence.replace("T","U") def get_aa(rna_codon): # gives the aa coded by the rna codon match rna_codon[0:2]: case "UU": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Phe" else: return "Leu" case "UC": return "Ser" case "UA": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Tyr" else: return "*" case "UG": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Cys" elif rna_codon[2]=="A": return "*" else: return "Trp" case "CU": return "Leu" case "CC": return "Pro" case "CA": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "His" else: return "Gln" case "CG": return "Arg" case "AU": if rna_codon[2]=="G": return "Met" else: return "Ile" case "AC": return "Thr" case "AA": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Asn" else: return "Lys" case "AG": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Ser" else: return "Arg" case "GU": return "Val" case "GC": return "Ala" case "GA": if (rna_codon[2]=="U") or (rna_codon[2]=="C"): return "Asp" else: return "Glu" case "GG": return "Gly" def cut_codon(sequence): from textwrap import wrap return wrap(sequence,3) def traduction(sequence_arn): # translate rna list_codons=cut_codon(sequence_arn) prot=list() for codon in list_codons: if len(codon)==3: prot.append(get_aa(codon)) #else: # print("attempt to get the amino acid for an incomplete codon") return prot def get_sequence_on_genome(feature_id,segments_on_target_genome): # returns the sequence of the feature on the target genome feature=Features[feature_id] list_seg=feature.segments_list first_seg=get_first_seg(list_seg,segments_on_target_genome) last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome) path_on_target=get_feature_path(paths,first_seg,last_seg) pos_start=feature.pos_start pos_stop=feature.pos_stop new_sequence="" for segment in path_on_target: if segment==feature.segments_list[0]: new_sequence+=get_segment_sequence(seg_seq,segment)[pos_start-1:] elif segment==feature.segments_list[-1]: new_sequence+=get_segment_sequence(seg_seq,segment)[0:pos_stop] else: new_sequence+=get_segment_sequence(seg_seq,segment) return new_sequence def get_mrna_var(var_file): var=open(var_file,'r') lines=var.readlines() var.close() mrna_var={} for line in lines: line=line.split() feature_type=line[1] if feature_type=="mRNA": mrna_id=line[0].replace('.','_').replace(':','_') if mrna_id not in mrna_var.keys(): mrna_var[mrna_id]={} elif feature_type=="CDS": cds_id=line[0].replace('.','_').replace(':','_') parent_id=line[0][0:line[0].find("cds")-1] if cds_id not in mrna_var[parent_id]: mrna_var[parent_id][cds_id]=list() mrna_var[parent_id][cds_id].append(line) return mrna_var import re def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one print("\nlooking for a new start codon:") frame_shift=0 # chercher codon start en amont du cds, dans le mrna seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome) seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome) sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence print("sequence before the CDS in the mRNA:",sequence_amont) if "ATG" in sequence_amont: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)] stop_pos_list=[m.start() for m in re.finditer('(?=TAGorTAAorTGA)', sequence_amont)] print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg. # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !) print("checking if there is a stop codon after the start codons in the same reading frame:") for start_pos in start_pos_list: start_pos_frame=start_pos%3 n=len(sequence_amont)-start_pos+1 if True not in ( (stop_pos%3==start_pos_frame) and (stop_pos>start_pos) for stop_pos in stop_pos_list) : #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds") # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture ! frame_shift=(frame_shift+n)%3 # vérifier le frameshift !! print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame") else: print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame") # chercher codon start en aval, dans le cds if "ATG" in seq_cds: start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)] print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier print("\n") return frame_shift def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion print("printvar",deleted_sequence,inserted_sequence) stop=False deleted_aa=traduction(get_rna(deleted_sequence)) inserted_aa=traduction(get_rna(inserted_sequence)) if (len(deleted_aa)!=0) and (len(inserted_aa)!=0): if deleted_aa!=inserted_aa: print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa)) else: print("consequence : synonymous variation in",",".join(deleted_aa)) elif len(inserted_aa)!=0: print("consequence : insertion of",",".join(inserted_aa)) else: print("consequence : deletion of",",".join(deleted_aa)) if ("*" in inserted_aa): print("stop codon apparition in the cds of the target genome") stop=True return stop [paths,seg_seq]=get_segments_sequence_and_paths(gfa) segments_on_target_genome=get_segments_positions_on_genome(pos_seg) mrna_var=get_mrna_var(var_file) for feature in Features.values(): # add the sequence of all features add_feature_sequence(feature,seg_seq) def get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift): # get the sequence until the next variation in the next cds # for this, go through all the cds left, and add the sequence until the first variation encountered var_found=False keys = list(cds_var.keys()) cds_rank = keys.index(cds_id)+1 nextCds=keys[cds_rank] ref_sequence="" alt_sequence="" # look for the next variation, and get the sequence before. while (not var_found) and (keys.index(nextCds)!=len(keys)): # go through all the cds until the last if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence ref_sequence+=Features[nextCds].sequence alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome) cds_rank+=1 nextCds=keys[cds_rank] print("no var in current cds") else: # variation found nextVar=cds_var[nextCds][0] ref_sequence+=Features[nextCds].sequence[0:int(nextVar[12])] alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(nextVar[13])] var_found=True return [ref_sequence,alt_sequence] def get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id): ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3) ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3) print("oldfs",old_frameshift) if frameshift==0: # only local change ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3 elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation nextVar=cds_var[cds_id][index_var+1] posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3] ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3) ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3) else: # last variation, go until the end of the cds ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source) ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target) ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref] alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt] if (frameshift!=0) and (index_var==len(cds_var[cds_id])-1) and (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds [ref_add,alt_add]=get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift) ref_sequence+=ref_add alt_sequence+=alt_add ref_sequence=ref_sequence[0:3*(len(ref_sequence)//3)] alt_sequence=alt_sequence[0:3*(len(alt_sequence)//3)] return print_variation_change(ref_sequence,alt_sequence) analysis=True if analysis==True: # analysing the variations for all the mrna : for mrna_id in mrna_var.keys(): print("analysis of the variations in the mRNA",mrna_id,":\n") leftover_ref="" leftover_target="" frameshift=0 # frameshift is how many bases to the right you need to go (from the reference reading frame) to get the alternate reading frame readingframe=0 # readingframe is how many bases to the right you need to go (on the cds sequence) to get the first base of a codon. cds_var=mrna_var[mrna_id] for index_cds,cds_id in enumerate(cds_var): print("\nCDS",cds_id,":") print("readingframe",readingframe,"frameshift",frameshift) for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : type_var=var[8] if type_var!="no_var": # if there is a variation print("\nvariation",index_var+1, ":") length_ref=len(var[9]) length_alt=len(var[10]) if type_var=="insertion": length_ref=0 print("insertion of",var[10]) elif type_var=="deletion": length_alt=0 print("deletion of",var[9]) else: print("substitution of",var[9],"by",var[10]) posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3] temp_rf=readingframe if temp_rf==0: temp_rf=3 sequenceCDS_target=leftover_target[temp_rf:]+get_sequence_on_genome(cds_id,segments_on_target_genome) sequenceCDS_source=leftover_ref[temp_rf:]+Features[cds_id].sequence print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14]) if (index_cds==0) and (posVar[0]<3): print("variation of the start codon, mRNA most likely wont be translated") #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon break old_frameshift=frameshift frameshift=(frameshift+length_ref-length_alt)%3 if old_frameshift!=frameshift: print("variation causing a frameshift") if frameshift==0: print("recovery of the original reading frame") if old_frameshift==0: print("loss of the original reading frame") stop=get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id) if stop: break if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:] leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:] readingframe=(3-(len(sequenceCDS_source)-readingframe))%3