Newer
Older

nina.marthe_ird.fr
committed
from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
target_genome_name="genome2_chr10"
pos_seg=target_genome_name+".bed"
var_file=target_genome_name+"_variations.txt"
gfa="graph.gfa"

nina.marthe_ird.fr
committed
load_intersect(intersect_path)
def convert_strand(strand):
match strand:
case "+":
return ">"
case "-":
return "<"
case ">":
return "+"
case "<":
return "-"
case default:
return ""
# présent dans Fuctions.py mais relou à importer ?
def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths

nina.marthe_ird.fr
committed
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if (line[0]=="W") and (line[1]!="_MINIGRAPH_"): # get the walk of the genome

nina.marthe_ird.fr
committed
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
list_path=[]
for segment in path:
if segment[0:1]=='>':
list_path.append('+s'+segment[1:])
elif segment[0:1]=='<':
list_path.append('-s'+segment[1:])
paths[line[3]]=list_path
return [paths,seg_seq]
# présent dans Fuctions.py mais relou à importer ?
def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome

nina.marthe_ird.fr
committed
bed=open(pos_seg,'r')
lines=bed.readlines() # read line by line ?
bed.close()
segments_on_target_genome={}

nina.marthe_ird.fr
committed
for line in lines:
line=line.split()
[seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
segments_on_target_genome[seg]=[chrom,start,stop,strand]
return segments_on_target_genome

nina.marthe_ird.fr
committed
def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object

nina.marthe_ird.fr
committed
feature_sequence=""
for segment in feature.segments_list:
if (segment==feature.segments_list[0]) and (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature

nina.marthe_ird.fr
committed
feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]

nina.marthe_ird.fr
committed
if segment==feature.segments_list[0]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:]

nina.marthe_ird.fr
committed
elif segment==feature.segments_list[-1]:
feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop]

nina.marthe_ird.fr
committed
else:
feature_sequence+=get_segment_sequence(seg_seq,segment)
feature.sequence=feature_sequence
def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome

nina.marthe_ird.fr
committed
first_seg_found=''
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
first_seg_found=segment[1:]
break
return first_seg_found
def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome

nina.marthe_ird.fr
committed
first_strand=convert_strand(segments_on_target_genome[first_seg][3])
first_seg_stranded=first_strand+first_seg
last_strand=convert_strand(segments_on_target_genome[last_seg][3])
last_seg_stranded=last_strand+last_seg
index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
first_index=min(index_first_seg,index_last_seg)
last_index=max(index_last_seg,index_first_seg)
list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
return list_segfeat_azu_corrected
def get_rna(dna_sequence):
return dna_sequence.replace("T","U")
def get_aa(rna_codon): # gives the aa coded by the rna codon
match rna_codon[0:2]:

nina.marthe_ird.fr
committed
case "UU":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Phe"
else:
return "Leu"
case "UC":
return "Ser"
case "UA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Tyr"
else:
return "*"
case "UG":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Cys"
elif rna_codon[2]=="A":

nina.marthe_ird.fr
committed
return "*"
else:
return "Trp"
case "CU":
return "Leu"
case "CC":
return "Pro"
case "CA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "His"
else:
return "Gln"
case "CG":
return "Arg"
case "AU":
if rna_codon[2]=="G":

nina.marthe_ird.fr
committed
return "Met"
else:
return "Ile"
case "AC":
return "Thr"
case "AA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asn"
else:
return "Lys"
case "AG":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Ser"
else:
return "Arg"
case "GU":
return "Val"
case "GC":
return "Ala"
case "GA":
if (rna_codon[2]=="U") or (rna_codon[2]=="C"):

nina.marthe_ird.fr
committed
return "Asp"
else:
return "Glu"
case "GG":
return "Gly"
def cut_codon(sequence):

nina.marthe_ird.fr
committed
from textwrap import wrap
return wrap(sequence,3)

nina.marthe_ird.fr
committed
def traduction(sequence_arn): # translate rna
list_codons=cut_codon(sequence_arn)

nina.marthe_ird.fr
committed
prot=list()
for codon in list_codons:
if len(codon)==3:
prot.append(get_aa(codon))

nina.marthe_ird.fr
committed
#else:
# print("attempt to get the amino acid for an incomplete codon")

nina.marthe_ird.fr
committed
return prot

nina.marthe_ird.fr
committed
def get_sequence_on_genome(feature_id,segments_on_target_genome): # returns the sequence of the feature on the target genome
feature=Features[feature_id]
list_seg=feature.segments_list
first_seg=get_first_seg(list_seg,segments_on_target_genome)
last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)

nina.marthe_ird.fr
committed
path_on_target=get_feature_path(paths,first_seg,last_seg)

nina.marthe_ird.fr
committed
pos_start=feature.pos_start
pos_stop=feature.pos_stop

nina.marthe_ird.fr
committed
new_sequence=""
for segment in path_on_target:

nina.marthe_ird.fr
committed
if segment==feature.segments_list[0]:
new_sequence+=get_segment_sequence(seg_seq,segment)[pos_start-1:]
elif segment==feature.segments_list[-1]:
new_sequence+=get_segment_sequence(seg_seq,segment)[0:pos_stop]

nina.marthe_ird.fr
committed
else:
new_sequence+=get_segment_sequence(seg_seq,segment)
return new_sequence

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
def get_mrna_var(var_file):
var=open(var_file,'r')
lines=var.readlines()
var.close()

nina.marthe_ird.fr
committed
mrna_var={}
for line in lines:
line=line.split()

nina.marthe_ird.fr
committed
feature_type=line[1]
if feature_type=="mRNA":
mrna_id=line[0].replace('.','_').replace(':','_')
if mrna_id not in mrna_var.keys():
mrna_var[mrna_id]={}
elif feature_type=="CDS":
cds_id=line[0].replace('.','_').replace(':','_')

nina.marthe_ird.fr
committed
parent_id=line[0][0:line[0].find("cds")-1]
if cds_id not in mrna_var[parent_id]:
mrna_var[parent_id][cds_id]=list()
mrna_var[parent_id][cds_id].append(line)
return mrna_var
import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one
print("\nlooking for a new start codon:")
frame_shift=0
# chercher codon start en amont du cds, dans le mrna
seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
print("sequence before the CDS in the mRNA:",sequence_amont)
if "ATG" in sequence_amont:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
stop_pos_list=[m.start() for m in re.finditer('(?=TAGorTAAorTGA)', sequence_amont)]
print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
print("checking if there is a stop codon after the start codons in the same reading frame:")
for start_pos in start_pos_list:
start_pos_frame=start_pos%3
n=len(sequence_amont)-start_pos+1
if True not in ( (stop_pos%3==start_pos_frame) and (stop_pos>start_pos) for stop_pos in stop_pos_list) :
#print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !

nina.marthe_ird.fr
committed
frame_shift=(frame_shift+n)%3 # vérifier le frameshift !!
print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")
# chercher codon start en aval, dans le cds
if "ATG" in seq_cds:
start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion

nina.marthe_ird.fr
committed
print("printvar",deleted_sequence,inserted_sequence)

nina.marthe_ird.fr
committed
deleted_aa=traduction(get_rna(deleted_sequence))
inserted_aa=traduction(get_rna(inserted_sequence))
if (len(deleted_aa)!=0) and (len(inserted_aa)!=0):

nina.marthe_ird.fr
committed
if deleted_aa!=inserted_aa:
print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : synonymous variation in",",".join(deleted_aa))

nina.marthe_ird.fr
committed
elif len(inserted_aa)!=0:
print("consequence : insertion of",",".join(inserted_aa))

nina.marthe_ird.fr
committed
else:
print("consequence : deletion of",",".join(deleted_aa))
print("stop codon apparition in the cds of the target genome")

nina.marthe_ird.fr
committed
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)

nina.marthe_ird.fr
committed
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features

nina.marthe_ird.fr
committed
add_feature_sequence(feature,seg_seq)
def get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift):
# get the sequence until the next variation in the next cds
# for this, go through all the cds left, and add the sequence until the first variation encountered
var_found=False
keys = list(cds_var.keys())
cds_rank = keys.index(cds_id)+1
nextCds=keys[cds_rank]
ref_sequence=""
alt_sequence=""
# look for the next variation, and get the sequence before.
while (not var_found) and (keys.index(nextCds)!=len(keys)): # go through all the cds until the last
if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
ref_sequence+=Features[nextCds].sequence
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)
cds_rank+=1
nextCds=keys[cds_rank]
print("no var in current cds")
else: # variation found
nextVar=cds_var[nextCds][0]
ref_sequence+=Features[nextCds].sequence[0:int(nextVar[12])]
alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(nextVar[13])]

nina.marthe_ird.fr
committed
return [ref_sequence,alt_sequence]
def get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id):

nina.marthe_ird.fr
committed
ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3)
ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3)

nina.marthe_ird.fr
committed
if frameshift==0: # only local change
ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon
ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3
elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation
nextVar=cds_var[cds_id][index_var+1]
posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3]
ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3)
ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3)
else: # last variation, go until the end of the cds
ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source)
ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target)
ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref]
alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt]
if (frameshift!=0) and (index_var==len(cds_var[cds_id])-1) and (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds
[ref_add,alt_add]=get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,old_frameshift)
ref_sequence+=ref_add
alt_sequence+=alt_add

nina.marthe_ird.fr
committed
ref_sequence=ref_sequence[0:3*(len(ref_sequence)//3)]
alt_sequence=alt_sequence[0:3*(len(alt_sequence)//3)]

nina.marthe_ird.fr
committed
return print_variation_change(ref_sequence,alt_sequence)

nina.marthe_ird.fr
committed
if analysis==True:
# analysing the variations for all the mrna :
for mrna_id in mrna_var.keys():
print("analysis of the variations in the mRNA",mrna_id,":\n")
leftover_ref=""
leftover_target=""
frameshift=0 # frameshift is how many bases to the right you need to go (from the reference reading frame) to get the alternate reading frame
readingframe=0 # readingframe is how many bases to the right you need to go (on the cds sequence) to get the first base of a codon.

nina.marthe_ird.fr
committed
cds_var=mrna_var[mrna_id]
for index_cds,cds_id in enumerate(cds_var):
print("\nCDS",cds_id,":")

nina.marthe_ird.fr
committed
print("readingframe",readingframe,"frameshift",frameshift)

nina.marthe_ird.fr
committed
for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
type_var=var[8]
if type_var!="no_var": # if there is a variation

nina.marthe_ird.fr
committed
print("\nvariation",index_var+1, ":")

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
length_ref=len(var[9])
length_alt=len(var[10])

nina.marthe_ird.fr
committed
if type_var=="insertion":
length_ref=0

nina.marthe_ird.fr
committed
print("insertion of",var[10])
elif type_var=="deletion":

nina.marthe_ird.fr
committed
length_alt=0

nina.marthe_ird.fr
committed
print("deletion of",var[9])

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
print("substitution of",var[9],"by",var[10])
posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3]
temp_rf=readingframe
if temp_rf==0:
temp_rf=3
sequenceCDS_target=leftover_target[temp_rf:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
sequenceCDS_source=leftover_ref[temp_rf:]+Features[cds_id].sequence

nina.marthe_ird.fr
committed
print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14])

nina.marthe_ird.fr
committed
if (index_cds==0) and (posVar[0]<3):

nina.marthe_ird.fr
committed
print("variation of the start codon, mRNA most likely wont be translated")
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
old_frameshift=frameshift
frameshift=(frameshift+length_ref-length_alt)%3
if old_frameshift!=frameshift:
print("variation causing a frameshift")
if frameshift==0:

nina.marthe_ird.fr
committed
print("recovery of the original reading frame")

nina.marthe_ird.fr
committed
if old_frameshift==0:
print("loss of the original reading frame")
stop=get_variation_change_sequences(old_frameshift,frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id)

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
if stop:
break

nina.marthe_ird.fr
committed
if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds

nina.marthe_ird.fr
committed
leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:]
leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:]
readingframe=(3-(len(sequenceCDS_source)-readingframe))%3