Skip to content
Snippets Groups Projects
Commit 2e5f4d81 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

Merge branch 'inferences'

parents 1b5825d6 1ad1a7d9
No related branches found
No related tags found
No related merge requests found
......@@ -84,7 +84,7 @@ for line in lines :
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]-1
seg_stop=position+segments_size[seg_name]
if path[i][0:1]=='>':
orientation='+'
......
......@@ -53,10 +53,12 @@ def get_segments_positions_on_genome(pos_seg): # creates a dict with the positio
def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object
feature_sequence=""
for segment in feature.segments_list:
if (segment==feature.segments_list[0]) & (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature
feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]
if segment==feature.segments_list[0]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:] # revérifier les +/- 1 pour la position, avec de vraies données
elif segment==feature.segments_list[-1]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop-1] # revérifier les +/- 1 pour la position, avec de vraies données
else:
feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence
......@@ -171,26 +173,31 @@ def get_sequence_on_genome(feature,segments_on_target_genome): # returns the seq
if segment==cds.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-2:]
elif segment==cds.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop-1]
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence
def get_cds_variations(var_file): # creates dict : cds_id -> list of variations from the var file
def get_mrna_var(var_file):
var=open(var_file,'r')
lines=var.readlines()
var.close()
# dict cds-var
cds_var={}
mrna_var={}
for line in lines:
line=line.split()
if line[1]=="CDS":
feature_type=line[1]
if feature_type=="mRNA":
mrna_id=line[0].replace('.','_').replace(':','_')
if mrna_id not in mrna_var.keys():
mrna_var[mrna_id]={}
elif feature_type=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')
if cds_id not in cds_var.keys():
cds_var[cds_id]=list()
cds_var[cds_id].append(line)
return cds_var
parent_id=line[0][0:line[0].find("cds")-1]
if cds_id not in mrna_var[parent_id]:
mrna_var[parent_id][cds_id]=list()
mrna_var[parent_id][cds_id].append(line)
return mrna_var
import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one
......@@ -247,104 +254,121 @@ def print_variation_change(deleted_sequence,inserted_sequence): # print the cons
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)
cds_var=get_cds_variations(var_file)
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features
add_feature_sequence(feature,seg_seq)
# analysing the variations for all the cds :
for cds_id in cds_var.keys():
cds=Features[cds_id]
print("analysis of the variations in the CDS",cds_id,":\n")
frameshift=0
for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation
posVar=[int(var[12]),int(var[13])]
sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome)
if type_var=="insertion":
length_ref=0
else:
length_ref=len(var[9])
if type_var=="deletion":
length_alt=0
else:
length_alt=len(var[10])
print("variation",index+1, ":")
if posVar[0]<=3:
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
break
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if frameshift==0:
print("recovery of the original reading frame")
if old_frameshift==0:
print("loss of the original reading frame")
#else:
# print("variation not causing a frameshift")
if type_var=="insertion":
print("insertion of",var[10])
elif type_var=="deletion":
print("deletion of",var[9])
else:
print("substitution of",var[9],"by",var[10])
analysis=True
if analysis==True:
# analysing the variations for all the mrna :
for mrna_id in mrna_var.keys():
print("analysis of the variations in the mRNA",mrna_id,":\n")
leftover_ref=""
leftover_target=""
frameshift=0
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
cds=Features[cds_id]
print("\nCDS",cds_id,":")
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation
posVar=[int(var[12]),int(var[13])]
sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome)
if type_var=="insertion":
length_ref=0
else:
length_ref=len(var[9])
if type_var=="deletion":
length_alt=0
else:
length_alt=len(var[10])
print("\nvariation",index_var+1, ":")
if (index_cds==0) & (posVar[0]<3):
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
break
len_fragment_before_del=(posVar[0])%3
len_fragment_before_ins=(posVar[1])%3
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if frameshift==0:
print("recovery of the original reading frame")
if old_frameshift==0:
print("loss of the original reading frame")
#else:
# print("variation not causing a frameshift")
if type_var=="insertion":
print("insertion of",var[10])
elif type_var=="deletion":
print("deletion of",var[9])
else:
print("substitution of",var[9],"by",var[10])
if frameshift==0:
# print only the local change.
len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
stop=print_variation_change(total_del,total_ins)
if stop:
break
len_fragment_before_del=(posVar[0])%3
len_fragment_before_ins=(posVar[1])%3
else:
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print("frameshift of",frameshift,"base(s) to the right.")
if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds.
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
else: # not the last variation. translate until the next var.
nextVar=cds_var[cds_id][index+1]
posNextVar=[int(nextVar[12]),int(nextVar[13])]
if frameshift==0:
# print only the local change.
len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
stop=print_variation_change(total_del,total_ins)
if stop:
break
if nextVar[8]=="insertion":
length_ref_nextvar=0
else:
length_ref_nextvar:len(nextVar[9])
if nextVar[8]=="deletion":
length_alt_nextvar=0
else:
length_alt_nextvar=len(nextVar[10])
len_fragment_before_del_nextvar=(posNextVar[0])%3
len_fragment_before_ins_nextvar=(posNextVar[1])%3
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
# possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon,
# while printing the effect of the first snp we aso use the second one.
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print("frameshift of",frameshift,"base(s) to the right.")
if index_var==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds while having full codons
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:]
print(cds.sequence,sequence_target)
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
# supprimer taille%3 des sequences ins et del.
# en fait non. il faut récupérer les séquences sur les seq des cds (sans l'intron dcp), prendre la taille sur ça et supprimer sur ça
del_leftover_len=len(total_total_del)%3
ins_leftover_len=len(total_total_ins)%3
if del_leftover_len!=0:
total_total_del=total_total_del[0:-del_leftover_len]
if ins_leftover_len!=0:
total_total_ins=total_total_ins[0:-ins_leftover_len]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
else: # not the last variation. translate until the next var.
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12]),int(nextVar[13])]
if nextVar[8]=="insertion":
length_ref_nextvar=0
else:
length_ref_nextvar:len(nextVar[9])
if nextVar[8]=="deletion":
length_alt_nextvar=0
else:
length_alt_nextvar=len(nextVar[10])
len_fragment_before_del_nextvar=(posNextVar[0])%3
len_fragment_before_ins_nextvar=(posNextVar[1])%3
total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
stop=print_variation_change(total_total_del,total_total_ins)
if stop:
break
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds
leftover_ref=cds.sequence[len(cds.sequence)-3:]
leftover_target=sequence_target[len(sequence_target)-3:]
print("\n")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment