Skip to content
Snippets Groups Projects
Commit 05f58532 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

commencé à traité les variations sur le codon start

parent 5c80b92f
No related branches found
No related tags found
No related merge requests found
from Graph_gff import Features,load_intersect from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence,convert_strand from Functions import get_segment_sequence,convert_strand
target_genome_name="genome3_chr10" target_genome_name="genome4_chr10"
intersect_path='/home/nina/annotpangenome/test_data/input_data_inf/intersect.bed' intersect_path='/home/nina/annotpangenome/test_data/input_data_inf/intersect.bed'
load_intersect(intersect_path) load_intersect(intersect_path)
...@@ -58,7 +58,7 @@ def add_feature_sequence(feature,seg_seq): ...@@ -58,7 +58,7 @@ def add_feature_sequence(feature,seg_seq):
feature_sequence+=get_segment_sequence(seg_seq,segment) feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence feature.sequence=feature_sequence
def get_first_seg(list_seg): def get_first_seg(list_seg,segments_on_target_genome):
first_seg_found='' first_seg_found=''
for segment in list_seg: for segment in list_seg:
if segment[1:] in segments_on_target_genome: if segment[1:] in segments_on_target_genome:
...@@ -164,19 +164,7 @@ def decoupe_codon(sequence): ...@@ -164,19 +164,7 @@ def decoupe_codon(sequence):
var=open("test_data/variations.txt",'r')
lines=var.readlines()
var.close()
# dict cds-var
cds_var={}
for line in lines:
line=line.split()
if line[1]=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')
if cds_id not in cds_var.keys():
cds_var[cds_id]=list()
cds_var[cds_id].append(line)
def get_sequence_before(first_seg,seg_seq,n,paths,feat): def get_sequence_before(first_seg,seg_seq,n,paths,feat):
first_strand=convert_strand(first_seg[0]) first_strand=convert_strand(first_seg[0])
...@@ -219,15 +207,11 @@ def get_sequence_after(last_seg,seg_seq,n,paths,feat): ...@@ -219,15 +207,11 @@ def get_sequence_after(last_seg,seg_seq,n,paths,feat):
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu] list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
''' '''
for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to make a prot. then detail var by cds.
cds=Features[cds_id]
print("analysing variations in cds",cds_id)
add_feature_sequence(cds,seg_seq)
cds_prot=traduction(get_rna(cds.sequence))
list_seg=Features[cds_id].segments_list def get_sequence_on_genome(feature,segments_on_target_genome):
first_seg=get_first_seg(list_seg) list_seg=Features[feature].segments_list
last_seg=get_first_seg(reversed(list_seg)) first_seg=get_first_seg(list_seg,segments_on_target_genome)
last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)
path_on_target=get_feature_path(paths,first_seg,last_seg) path_on_target=get_feature_path(paths,first_seg,last_seg)
new_sequence="" new_sequence=""
...@@ -238,76 +222,168 @@ for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to ...@@ -238,76 +222,168 @@ for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to
new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop] new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
else: else:
new_sequence+=get_segment_sequence(seg_seq,segment) new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence
new_prot=traduction(get_rna(new_sequence))
print("original prot = ", cds_prot)
print("new prot = ", new_prot) # print new version with new start and stop codons of deleted. (before and after the gene sequence)
var=open("test_data/variations.txt",'r')
lines=var.readlines()
var.close()
if new_prot[0]!="Met": # dict cds-var
print("no Met at the start of the new version of the protein -> start codon loss") cds_var={}
if "Met" in new_prot: for line in lines:
print("Met found at position", (new_prot.index("Met")+1),"-> possible later start codon") # print cette version de la prot line=line.split()
if line[1]=="CDS":
print("look for start codon before. if none, print 'no start codon, likely gene not active'") cds_id=line[0].replace('.','_').replace(':','_')
# récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot if cds_id not in cds_var.keys():
cds_var[cds_id]=list()
sequence_before=get_sequence_before(path_on_target[0],seg_seq,100,paths,cds) cds_var[cds_id].append(line)
print(sequence_before)
first_stop_index=new_prot.index("*") if "*" in new_prot else "None" for feature in Features.values():
if "*" not in new_prot: add_feature_sequence(feature,seg_seq)
print("no stop codon")
# récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot.
elif first_stop_index+1!=len(new_prot):
print("early stop codon at position",(first_stop_index+1),"instead of",len(new_prot))
else:
print("stop codon found at expected position")
sequence_after=get_sequence_after(path_on_target[-1],seg_seq,100,paths,cds)
print(sequence_after)
version="new"
import re
for cds_id in cds_var.keys():
cds=Features[cds_id]
print("analysing variations in cds",cds_id)
for var in cds_var[cds_id]: for var in cds_var[cds_id]:
#print("\n",var)
size_var=int(var[11]) size_var=int(var[11])
type_var=var[8] type_var=var[8]
pos_var=int(var[12])-1 pos_var=int(var[12])-1
if pos_var<=3:
print("codon start touché")
#chercher start (dans le mrna) avant et apres
# chercher codon start en amont du cds, dans le mrna
seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
print(sequence_amont)
if "ATG" in sequence_amont:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
print(start_pos_list,stop_pos_list) # positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
for start_pos in start_pos_list:
print(start_pos)
start_pos_frame=start_pos%3
test_list=[39,46,47]
if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
n=len(sequence_amont)-start_pos
print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
frame_shift=n%3 # mais si on a plusieurs start candidats il faut choisir celui qu'on prend !??
# chercher codon start en aval, dans le cds
if "ATG" in seq_cds:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
for pos in start_pos_list:
print("codon start candidat trouvé plus loin dans le cds, à la base",pos)
if version=="old":
for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to make a prot. then detail var by cds.
cds=Features[cds_id]
print("analysing variations in cds",cds_id)
add_feature_sequence(cds,seg_seq)
cds_prot=traduction(get_rna(cds.sequence))
list_seg=Features[cds_id].segments_list
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
path_on_target=get_feature_path(paths,first_seg,last_seg)
new_sequence=""
for segment in path_on_target:
if segment==cds.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-1:]
elif segment==cds.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
new_prot=traduction(get_rna(new_sequence))
print("original prot = ", cds_prot)
print("new prot = ", new_prot) # print new version with new start and stop codons of deleted. (before and after the gene sequence)
if new_prot[0]!="Met":
print("no Met at the start of the new version of the protein -> start codon loss")
if "Met" in new_prot:
print("Met found at position", (new_prot.index("Met")+1),"-> possible later start codon") # print cette version de la prot
print("look for start codon before. if none, print 'no start codon, likely gene not active'")
# récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot
sequence_before=get_sequence_before(path_on_target[0],seg_seq,100,paths,cds)
print(sequence_before)
first_stop_index=new_prot.index("*") if "*" in new_prot else "None"
if "*" not in new_prot:
print("no stop codon")
# récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot.
elif first_stop_index+1!=len(new_prot):
print("early stop codon at position",(first_stop_index+1),"instead of",len(new_prot))
else:
print("stop codon found at expected position")
sequence_after=get_sequence_after(path_on_target[-1],seg_seq,100,paths,cds)
print(sequence_after)
for var in cds_var[cds_id]:
#print("\n",var)
size_var=int(var[11])
type_var=var[8]
pos_var=int(var[12])-1
if type_var=="insertions": if type_var=="insertions":
sequence_var=var[10] sequence_var=var[10]
if size_var%3==0: if size_var%3==0:
print("pas de décalage du cadre de lecture") print("pas de décalage du cadre de lecture")
trad_seq_ins=traduction(get_rna(sequence_var)) trad_seq_ins=traduction(get_rna(sequence_var))
if pos_var%3==0: if pos_var%3==0:
print("insertion entre deux codons") print("insertion entre deux codons")
if "*" in trad_seq_ins: if "*" in trad_seq_ins:
print("apparition d'un codon stop") print("apparition d'un codon stop")
print(f'ancienne sequence : {", ".join(cds_prot)}') print(f'ancienne sequence : {", ".join(cds_prot)}')
print(f'nouvelle sequence : {", ".join(cds_prot[0:(pos_var//3)-1])}, *') print(f'nouvelle sequence : {", ".join(cds_prot[0:(pos_var//3)-1])}, *')
else:
print(f'ancienne sequence : {", ".join(cds_prot)}')
print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
else: else:
print(f'ancienne sequence : {", ".join(cds_prot)}') print("insertion au milieu d'un codon, changement de certains acides amines")
print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
else: elif type_var=="deletions":
print("insertion au milieu d'un codon, changement de certains acides amines") sequence_var=line[9]
if size_var%3==0:
elif type_var=="deletions": print("pas de décalage du cadre de lecture")
sequence_var=line[9] trad_seq_ins=traduction(get_rna(sequence_var))
if size_var%3==0:
print("pas de décalage du cadre de lecture") if pos_var%3==0:
trad_seq_ins=traduction(get_rna(sequence_var)) print("deletion de codons entiers")
if "*" in trad_seq_ins:
if pos_var%3==0: print("disparition d'un codon stop")
print("deletion de codons entiers") else:
if "*" in trad_seq_ins: print(f'ancienne sequence : {", ".join(cds_prot)}')
print("disparition d'un codon stop") print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
else: else:
print(f'ancienne sequence : {", ".join(cds_prot)}') print("deletion au milieu d'un codon, changement de certains acides amines")
print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
else:
print("deletion au milieu d'un codon, changement de certains acides amines")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment