inference.py

from Graph_gff import Features,load_intersect
from Functions import get_segment_sequence,convert_strand

target_genome_name="genome2_chr10"
pos_seg=target_genome_name+".bed"
var_file=target_genome_name+"_variations.txt"
gfa="graph.gfa"
intersect_path='intersect.bed'
load_intersect(intersect_path)


# présent dans Fuctions.py mais relou à importer ?
def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths 
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    file_gfa.close()
    seg_seq={}
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="S"): # get the sequence of the segment
            seg_id='s'+line[1]
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=[]
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1]=='<':
                    list_path.append('-s'+segment[1:])
 
            paths[line[3]]=list_path
    return [paths,seg_seq]

# présent dans Fuctions.py mais relou à importer ?
def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome
    bed=open(pos_seg,'r')
    lines=bed.readlines() # read line by line ?
    bed.close()
    segments_on_target_genome={}
    for line in lines:
        line=line.split()
        [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
        segments_on_target_genome[seg]=[chrom,start,stop,strand]
    return segments_on_target_genome

def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object
    feature_sequence=""
    for segment in feature.segments_list:
        if (segment==feature.segments_list[0]) & (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature
            feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1]
        if segment==feature.segments_list[0]:
            feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:]
        elif segment==feature.segments_list[-1]:
            feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop]
        else:
            feature_sequence+=get_segment_sequence(seg_seq,segment)
    feature.sequence=feature_sequence

def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome
    first_seg_found=''
    for segment in list_seg:
        if segment[1:] in segments_on_target_genome:
            first_seg_found=segment[1:]
            break
    return first_seg_found

def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome
    first_strand=convert_strand(segments_on_target_genome[first_seg][3])
    first_seg_stranded=first_strand+first_seg
    last_strand=convert_strand(segments_on_target_genome[last_seg][3])
    last_seg_stranded=last_strand+last_seg
    index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
    index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
    first_index=min(index_first_seg,index_last_seg)
    last_index=max(index_last_seg,index_first_seg)
    list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
    list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
    return list_segfeat_azu_corrected

def get_rna(dna_sequence):
    return dna_sequence.replace("T","U")

def get_aa(rna_codon): # gives the aa coded by the rna codon
    match rna_codon[0:2]:
        case "UU":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Phe"
            else:
                return "Leu"
        case "UC":
            return "Ser"
        case "UA":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Tyr"
            else:
                return "*"
        case "UG":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Cys"
            elif rna_codon[2]=="A":
                return "*"
            else:
                return "Trp"
        case "CU":
            return "Leu"
        case "CC":
            return "Pro"
        case "CA":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "His"
            else:
                return "Gln"
        case "CG":
            return "Arg"
        case "AU":
            if rna_codon[2]=="G":
                return "Met"
            else:
                return "Ile"
        case "AC":
            return "Thr"
        case "AA":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Asn"
            else:
                return "Lys"
        case "AG":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Ser"
            else:
                return "Arg"
        case "GU":
            return "Val"
        case "GC":
            return "Ala"
        case "GA":
            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
                return "Asp"
            else:
                return "Glu"
        case "GG":
            return "Gly"

def cut_codon(sequence):
    from textwrap import wrap
    return wrap(sequence,3)

def traduction(sequence_arn): # translate rna
    list_codons=cut_codon(sequence_arn)
    prot=list()
    for codon in list_codons:
        if len(codon)==3:
            prot.append(get_aa(codon))
        #else:
        #    print("attempt to get the amino acid for an incomplete codon")
    return prot

def get_sequence_on_genome(feature_id,segments_on_target_genome): # returns the sequence of the feature on the target genome
    feature=Features[feature_id]
    list_seg=feature.segments_list
    first_seg=get_first_seg(list_seg,segments_on_target_genome)
    last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)
    path_on_target=get_feature_path(paths,first_seg,last_seg)
    pos_start=feature.pos_start
    pos_stop=feature.pos_stop

    new_sequence=""
    for segment in path_on_target:
        if segment==feature.segments_list[0]:
            new_sequence+=get_segment_sequence(seg_seq,segment)[pos_start-1:]
        elif segment==feature.segments_list[-1]:
            new_sequence+=get_segment_sequence(seg_seq,segment)[0:pos_stop]
        else:
            new_sequence+=get_segment_sequence(seg_seq,segment)
    return new_sequence

def get_mrna_var(var_file):
    var=open(var_file,'r')
    lines=var.readlines()
    var.close()
    
    mrna_var={}
    for line in lines:
        line=line.split()
        feature_type=line[1]
        if feature_type=="mRNA":
            mrna_id=line[0].replace('.','_').replace(':','_')
            if mrna_id not in mrna_var.keys():
                mrna_var[mrna_id]={}
        elif feature_type=="CDS":
            cds_id=line[0].replace('.','_').replace(':','_')
            parent_id=line[0][0:line[0].find("cds")-1]
            if cds_id not in mrna_var[parent_id]:
                mrna_var[parent_id][cds_id]=list()
            mrna_var[parent_id][cds_id].append(line)
    return mrna_var

import re
def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one 
    print("\nlooking for a new start codon:")
    frame_shift=0
    # chercher codon start en amont du cds, dans le mrna
    seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
    seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
    sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
    print("sequence before the CDS in the mRNA:",sequence_amont)
    if "ATG" in sequence_amont:
        start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
        stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
        print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
        # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
        print("checking if there is a stop codon after the start codons in the same reading frame:")
        for start_pos in start_pos_list:
            start_pos_frame=start_pos%3
            n=len(sequence_amont)-start_pos+1
            if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
                #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
                # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
                frame_shift=(frame_shift+n)%3 # vérifier le frameshift !!
                print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
            else:
                print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")

    # chercher codon start en aval, dans le cds
    if "ATG" in seq_cds:
        start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
        print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
    print("\n")
    return frame_shift

def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion
    print("printvar",deleted_sequence,inserted_sequence)
    stop=False
    deleted_aa=traduction(get_rna(deleted_sequence))
    inserted_aa=traduction(get_rna(inserted_sequence))

    if (len(deleted_aa)!=0) & (len(inserted_aa)!=0):
        if deleted_aa!=inserted_aa:
            print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))
        else:
            print("consequence : synonymous variation in",",".join(deleted_aa))
    elif len(inserted_aa)!=0:
        print("consequence : insertion of",",".join(inserted_aa))
    else:
        print("consequence : deletion of",",".join(deleted_aa))
    if ("*" in inserted_aa):
        print("stop codon apparition in the cds of the target genome")
        stop=True
    return stop


[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
segments_on_target_genome=get_segments_positions_on_genome(pos_seg)
mrna_var=get_mrna_var(var_file)
for feature in Features.values(): # add the sequence of all features
    add_feature_sequence(feature,seg_seq)


def get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,sequenceCDS_source):
    # get the sequence until the next variation in the next cds
    # for this, go through all the cds left, and add the sequence until the first variation encountered
    var_found=False
    keys = list(cds_var.keys())
    cds_rank = keys.index(cds_id)+1
    nextCds=keys[cds_rank]
    current_readingframe=readingframe
    ref_sequence=""
    alt_sequence=""

    # look for the next variation, and get the sequence before.
    while (not var_found) & (keys.index(nextCds)!=len(keys)): # go through all the cds until the last
        if len(cds_var[nextCds])==0: # if the current cds has no variation, simply add its sequence
            ref_sequence+=Features[nextCds].sequence
            alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)
            cds_rank+=1
            nextCds=keys[cds_rank]
            #current_readingframe=(3-(len(sequenceCDS_source)-current_readingframe))%3
            print("no var in current cds")
        else: # variation found
            variation=cds_var[nextCds][0]

            #posNextVar=[int(variation[12])+(3-current_readingframe)%3,int(variation[13])+(3-current_readingframe)%3]

            # revoir le calcul des fragments avant la variation !
            # cas où la var est sur la deuxième base du cds : alors il faut peut être laisser à cette var les bases avant du cds précédent ......
            # et je crois qu'on a toujours pas réglé le cas où on a une sbustitution sur la base 1 et 3 d'un codon.

            nextReadingframe=(3-(len(sequenceCDS_source)-readingframe))%3
            len_codonFragment_beforeNextVar_ref=(int(variation[12])-nextReadingframe)%3
            len_codonFragment_beforeNextVar_alt=(int(variation[13])+(3-frameshift)-nextReadingframe)%3
            ref_sequence+=Features[nextCds].sequence[0:int(variation[12])-len_codonFragment_beforeNextVar_ref]
            alt_sequence+=get_sequence_on_genome(nextCds,segments_on_target_genome)[0:int(variation[13])-len_codonFragment_beforeNextVar_alt]
            #print("pos_var",variation[12],variation[13])
            #print("next readingframe, frameshift",nextReadingframe,frameshift)
            #print("len frag before",len_codonFragment_beforeNextVar_ref,len_codonFragment_beforeNextVar_alt)
            #print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
            var_found=True

            return [ref_sequence,alt_sequence]

def get_variation_change_sequences(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id):
    ProtChangeStartPosOnCDS_ref=posVar[0]-(posVar[0]%3)
    ProtChangeStartPosOnCDS_alt=posVar[1]-(posVar[1]%3)

    if frameshift==0: # only local change
        ProtChangeStopPosOnCDS_ref=posVar[0]+length_ref+(3-(posVar[0]+length_ref))%3 # end of var + rest of the codon
        ProtChangeStopPosOnCDS_alt=posVar[1]+length_alt+(3-(posVar[1]+length_alt))%3
    elif index_var!=len(cds_var[cds_id])-1: # not the last variation, go until the last variation
        nextVar=cds_var[cds_id][index_var+1]
        posNextVar=[int(nextVar[12])+(3-readingframe)%3,int(nextVar[13])+(3-readingframe)%3]
        ProtChangeStopPosOnCDS_ref=posNextVar[0]-(posNextVar[0]%3)
        ProtChangeStopPosOnCDS_alt=posNextVar[1]-(posNextVar[1]%3)
    else: # last variation, go until the end of the cds
        ProtChangeStopPosOnCDS_ref=len(sequenceCDS_source)
        ProtChangeStopPosOnCDS_alt=len(sequenceCDS_target)

    ref_sequence=sequenceCDS_source[ProtChangeStartPosOnCDS_ref:ProtChangeStopPosOnCDS_ref]
    alt_sequence=sequenceCDS_target[ProtChangeStartPosOnCDS_alt:ProtChangeStopPosOnCDS_alt]

    if (frameshift!=0) & (index_var==len(cds_var[cds_id])-1) & (index_cds!=len(cds_var)-1): # if it is the last variation of the cds and it is not the last cds
        [ref_add,alt_add]=get_sequence_until_nextVar(cds_var,cds_id,segments_on_target_genome,readingframe,frameshift,sequenceCDS_source)
        ref_sequence+=ref_add
        alt_sequence+=alt_add
    
    return print_variation_change(ref_sequence,alt_sequence)


analysis=True
if analysis==True:
    # analysing the variations for all the mrna :
    for mrna_id in mrna_var.keys():
        print("analysis of the variations in the mRNA",mrna_id,":\n")
        leftover_ref=""
        leftover_target=""
        frameshift=0
        readingframe=0
        cds_var=mrna_var[mrna_id]
        for index_cds,cds_id in enumerate(cds_var):
            print("\nCDS",cds_id,":")
            print("readingframe",readingframe,"frameshift",frameshift)
            for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
                type_var=var[8]
                if type_var!="no_var": # if there is a variation
                    print("\nvariation",index_var+1, ":")

                    length_ref=len(var[9])
                    length_alt=len(var[10])
                    if type_var=="insertion":
                        length_ref=0
                        print("insertion of",var[10])
                    elif type_var=="deletion":
                        length_alt=0
                        print("deletion of",var[9])
                    else:
                        print("substitution of",var[9],"by",var[10])

                    posVar=[int(var[12])+(3-readingframe)%3,int(var[13])+(3-readingframe)%3]
                    sequenceCDS_target=leftover_target[readingframe:]+get_sequence_on_genome(cds_id,segments_on_target_genome)
                    sequenceCDS_source=leftover_ref[readingframe:]+Features[cds_id].sequence
                    print(sequenceCDS_source,sequenceCDS_target,posVar,var[12:14])
                    
                    if (index_cds==0) & (posVar[0]<3):
                        print("variation of the start codon, mRNA most likely wont be translated")
                        #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
                        break

                    old_frameshift=frameshift
                    frameshift=(frameshift+length_ref-length_alt)%3
                    if old_frameshift!=frameshift:
                        print("variation causing a frameshift")
                        # frameshift=0 -> reading frame recovered
                        # frameshift=1 -> frameshift of 1 base to the right
                        # frameshift=2 -> frameshift of 2 bases to the right
                        if frameshift==0:
                            print("recovery of the original reading frame")                            
                        if old_frameshift==0:
                            print("loss of the original reading frame")

                    stop=get_variation_change_sequences(frameshift,readingframe,posVar,length_ref,length_alt,sequenceCDS_source,sequenceCDS_target,index_var,cds_var,cds_id)

                    if stop:
                        break

                    if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds
                        leftover_ref=sequenceCDS_source[len(sequenceCDS_source)-3:]
                        leftover_target=sequenceCDS_target[len(sequenceCDS_target)-3:]
                        readingframe=(3-(len(sequenceCDS_source)-readingframe))%3