from Graph_gff import Features,load_intersect from Functions import get_segment_sequence,convert_strand target_genome_name="genome3_chr10" intersect_path='/home/nina/annotpangenome/test_data/input_data_inf/intersect.bed' load_intersect(intersect_path) gfa="test_data/input_data_inf/graph_test.gfa" def get_segments_sequence_and_paths(gfa): file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() seg_seq={} paths={} for line in lines_gfa: line=line.split() if (line[0]=="S"): # get the sequence of the segment seg_id='s'+line[1] seg_seq[seg_id]=line[2] if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome path=line[6].replace(">",";>") path=path.replace("<",";<").split(';') list_path=[] for segment in path: if segment[0:1]=='>': list_path.append('+s'+segment[1:]) elif segment[0:1]=='<': list_path.append('-s'+segment[1:]) paths[line[3]]=list_path return [paths,seg_seq] [paths,seg_seq]=get_segments_sequence_and_paths(gfa) segments_on_target_genome={} pos_seg="test_data/input_data_inf/genome3_chr10.bed" def get_segments_positions_on_genome(pos_seg): bed=open(pos_seg,'r') lines=bed.readlines() # read line by line ? bed.close() for line in lines: line=line.split() [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]] segments_on_target_genome[seg]=[chrom,start,stop,strand] get_segments_positions_on_genome(pos_seg) def add_feature_sequence(feature,seg_seq): feature_sequence="" for segment in feature.segments_list: if segment==feature.segments_list[0]: feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:] # revérifier les +/- 1 pour la position, avec de vraies données elif segment==feature.segments_list[-1]: feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données else: feature_sequence+=get_segment_sequence(seg_seq,segment) feature.sequence=feature_sequence def get_first_seg(list_seg): first_seg_found='' for segment in list_seg: if segment[1:] in segments_on_target_genome: first_seg_found=segment[1:] break return first_seg_found def get_feature_path(paths,first_seg,last_seg): # find the path in azucena. first_strand=convert_strand(segments_on_target_genome[first_seg][3]) first_seg_stranded=first_strand+first_seg last_strand=convert_strand(segments_on_target_genome[last_seg][3]) last_seg_stranded=last_strand+last_seg index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) first_index=min(index_first_seg,index_last_seg) last_index=max(index_last_seg,index_first_seg) list_segfeat_azu=paths[target_genome_name][first_index:last_index+1] list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] return list_segfeat_azu_corrected def get_rna(dna_sequence): return dna_sequence.replace("T","U") # penser à transcrire la séquence codante du gène !! def get_aa(codon): match codon[0:2]: case "UU": if (codon[2]=="U") | (codon[2]=="C"): return "Phe" else: return "Leu" case "UC": return "Ser" case "UA": if (codon[2]=="U") | (codon[2]=="C"): return "Tyr" else: return "*" case "UG": if (codon[2]=="U") | (codon[2]=="C"): return "Cys" elif codon[2]=="A": return "*" else: return "Trp" case "CU": return "Leu" case "CC": return "Pro" case "CA": if (codon[2]=="U") | (codon[2]=="C"): return "His" else: return "Gln" case "CG": return "Arg" case "AU": if codon[2]=="G": return "Met" else: return "Ile" case "AC": return "Thr" case "AA": if (codon[2]=="U") | (codon[2]=="C"): return "Asn" else: return "Lys" case "AG": if (codon[2]=="U") | (codon[2]=="C"): return "Ser" else: return "Arg" case "GU": return "Val" case "GC": return "Ala" case "GA": if (codon[2]=="U") | (codon[2]=="C"): return "Asp" else: return "Glu" case "GG": return "Gly" def traduction(sequence_arn): list_codons=decoupe_codon(sequence_arn) prot=list() for codon in list_codons: prot.append(get_aa(codon)) return prot from textwrap import wrap def decoupe_codon(sequence): return wrap(sequence,3) var=open("test_data/variations.txt",'r') lines=var.readlines() var.close() # dict cds-var cds_var={} for line in lines: line=line.split() if line[1]=="CDS": cds_id=line[0].replace('.','_').replace(':','_') if cds_id not in cds_var.keys(): cds_var[cds_id]=list() cds_var[cds_id].append(line) def get_sequence_before(first_seg,seg_seq,n,paths,feat): first_strand=convert_strand(first_seg[0]) first_seg_stranded=first_strand+first_seg[1:] index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) sequence_before=seg_seq[first_seg[1:]][0:feat.pos_start-1] # sequence left on the segment on which the cds start (can be empty) current_index=index_first_seg-1 while (len(sequence_before)<n) & (current_index>=0): segment=paths[target_genome_name][current_index] sequence_before=seg_seq[segment[1:]]+sequence_before current_index-=1 return sequence_before[0:99] def get_sequence_after(last_seg,seg_seq,n,paths,feat): last_strand=convert_strand(last_seg[0]) last_seg_stranded=last_strand+last_seg[1:] index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) sequence_after=seg_seq[last_seg[1:]][feat.pos_stop:] # sequence left on the segment on which the cds ends (can be empty) current_index=index_last_seg+1 while (len(sequence_after)<n) & (current_index>len(paths[target_genome_name])): segment=paths[target_genome_name][current_index] sequence_after=sequence_after+seg_seq[segment[1:]] current_index+=1 return sequence_after[len(sequence_after)-100:] ''' first_strand=convert_strand(segments_on_target_genome[first_seg][3]) first_seg_stranded=first_strand+first_seg last_strand=convert_strand(segments_on_target_genome[last_seg][3]) last_seg_stranded=last_strand+last_seg index_first_seg=int(paths[target_genome_name].index(first_seg_stranded)) index_last_seg=int(paths[target_genome_name].index(last_seg_stranded)) first_index=min(index_first_seg,index_last_seg) last_index=max(index_last_seg,index_first_seg) list_segfeat_azu=paths[target_genome_name][first_index:last_index+1] list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] ''' for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to make a prot. then detail var by cds. cds=Features[cds_id] print("analysing variations in cds",cds_id) add_feature_sequence(cds,seg_seq) cds_prot=traduction(get_rna(cds.sequence)) list_seg=Features[cds_id].segments_list first_seg=get_first_seg(list_seg) last_seg=get_first_seg(reversed(list_seg)) path_on_target=get_feature_path(paths,first_seg,last_seg) new_sequence="" for segment in path_on_target: if segment==cds.segments_list[0]: new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-1:] elif segment==cds.segments_list[-1]: new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop] else: new_sequence+=get_segment_sequence(seg_seq,segment) new_prot=traduction(get_rna(new_sequence)) print("original prot = ", cds_prot) print("new prot = ", new_prot) # print new version with new start and stop codons of deleted. (before and after the gene sequence) if new_prot[0]!="Met": print("no Met at the start of the new version of the protein -> start codon loss") if "Met" in new_prot: print("Met found at position", (new_prot.index("Met")+1),"-> possible later start codon") # print cette version de la prot print("look for start codon before. if none, print 'no start codon, likely gene not active'") # récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot sequence_before=get_sequence_before(path_on_target[0],seg_seq,100,paths,cds) print(sequence_before) first_stop_index=new_prot.index("*") if "*" in new_prot else "None" if "*" not in new_prot: print("no stop codon") # récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot. elif first_stop_index+1!=len(new_prot): print("early stop codon at position",(first_stop_index+1),"instead of",len(new_prot)) else: print("stop codon found at expected position") sequence_after=get_sequence_after(path_on_target[-1],seg_seq,100,paths,cds) print(sequence_after) for var in cds_var[cds_id]: #print("\n",var) size_var=int(var[11]) type_var=var[8] pos_var=int(var[12])-1 if type_var=="insertions": sequence_var=var[10] if size_var%3==0: print("pas de décalage du cadre de lecture") trad_seq_ins=traduction(get_rna(sequence_var)) if pos_var%3==0: print("insertion entre deux codons") if "*" in trad_seq_ins: print("apparition d'un codon stop") print(f'ancienne sequence : {", ".join(cds_prot)}') print(f'nouvelle sequence : {", ".join(cds_prot[0:(pos_var//3)-1])}, *') else: print(f'ancienne sequence : {", ".join(cds_prot)}') print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}') else: print("insertion au milieu d'un codon, changement de certains acides amines") elif type_var=="deletions": sequence_var=line[9] if size_var%3==0: print("pas de décalage du cadre de lecture") trad_seq_ins=traduction(get_rna(sequence_var)) if pos_var%3==0: print("deletion de codons entiers") if "*" in trad_seq_ins: print("disparition d'un codon stop") else: print(f'ancienne sequence : {", ".join(cds_prot)}') print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}') else: print("deletion au milieu d'un codon, changement de certains acides amines")