Skip to content
Snippets Groups Projects
inference.py 20.9 KiB
Newer Older
  • Learn to ignore specific revisions
  • from Graph_gff import Features,load_intersect
    from Functions import get_segment_sequence,convert_strand
    
    
    intersect_path='intersect.bed'
    
    def get_segments_sequence_and_paths(gfa):
        file_gfa=open(gfa,'r')
        lines_gfa=file_gfa.readlines()
        file_gfa.close()
        seg_seq={}
        paths={}
        for line in lines_gfa:
            line=line.split()
            if (line[0]=="S"): # get the sequence of the segment
                seg_id='s'+line[1]
                seg_seq[seg_id]=line[2]
    
            if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
                
                path=line[6].replace(">",";>")
                path=path.replace("<",";<").split(';')
                list_path=[]
                for segment in path:
                    if segment[0:1]=='>':
                        list_path.append('+s'+segment[1:])
                    elif segment[0:1]=='<':
                        list_path.append('-s'+segment[1:])
     
                paths[line[3]]=list_path
        return [paths,seg_seq]
    
    [paths,seg_seq]=get_segments_sequence_and_paths(gfa)
    
    segments_on_target_genome={}
    
    def get_segments_positions_on_genome(pos_seg):
        bed=open(pos_seg,'r')
        lines=bed.readlines() # read line by line ?
        bed.close()
        for line in lines:
            line=line.split()
            [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
            segments_on_target_genome[seg]=[chrom,start,stop,strand]
    get_segments_positions_on_genome(pos_seg)
    
    def add_feature_sequence(feature,seg_seq):
        feature_sequence=""
        for segment in feature.segments_list:
            if segment==feature.segments_list[0]:
                feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-1:] # revérifier les +/- 1 pour la position, avec de vraies données
            elif segment==feature.segments_list[-1]:
                feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données
            else:
                feature_sequence+=get_segment_sequence(seg_seq,segment)
        feature.sequence=feature_sequence
    
    
    def get_first_seg(list_seg,segments_on_target_genome):
    
        first_seg_found=''
        for segment in list_seg:
            if segment[1:] in segments_on_target_genome:
                first_seg_found=segment[1:]
                break
        return first_seg_found
    
    def get_feature_path(paths,first_seg,last_seg):
    
        first_strand=convert_strand(segments_on_target_genome[first_seg][3])
        first_seg_stranded=first_strand+first_seg
        last_strand=convert_strand(segments_on_target_genome[last_seg][3])
        last_seg_stranded=last_strand+last_seg
        index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
        index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
        first_index=min(index_first_seg,index_last_seg)
        last_index=max(index_last_seg,index_first_seg)
        list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
        list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
        return list_segfeat_azu_corrected
    
    
    def get_rna(dna_sequence):
        return dna_sequence.replace("T","U")
    
    # penser à transcrire la séquence codante du gène !!
    def get_aa(codon):
        match codon[0:2]:
            case "UU":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Phe"
                else:
                    return "Leu"
            case "UC":
                return "Ser"
            case "UA":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Tyr"
                else:
                    return "*"
            case "UG":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Cys"
                elif codon[2]=="A":
                    return "*"
                else:
                    return "Trp"
            case "CU":
                return "Leu"
            case "CC":
                return "Pro"
            case "CA":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "His"
                else:
                    return "Gln"
            case "CG":
                return "Arg"
            case "AU":
                if codon[2]=="G":
                    return "Met"
                else:
                    return "Ile"
            case "AC":
                return "Thr"
            case "AA":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Asn"
                else:
                    return "Lys"
            case "AG":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Ser"
                else:
                    return "Arg"
            case "GU":
                return "Val"
            case "GC":
                return "Ala"
            case "GA":
                if (codon[2]=="U") | (codon[2]=="C"):
                    return "Asp"
                else:
                    return "Glu"
            case "GG":
                return "Gly"
    
    
    def traduction(sequence_arn):
        list_codons=decoupe_codon(sequence_arn)
        prot=list()
        for codon in list_codons:
            prot.append(get_aa(codon))
        return prot
    
    
    from textwrap import wrap
    def decoupe_codon(sequence):
        return wrap(sequence,3)
    
    
    
    
    
    
    
    
    def get_sequence_before(first_seg,seg_seq,n,paths,feat):
        first_strand=convert_strand(first_seg[0])
        first_seg_stranded=first_strand+first_seg[1:]
        index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
    
        sequence_before=seg_seq[first_seg[1:]][0:feat.pos_start-1] # sequence left on the segment on which the cds start (can be empty)
        current_index=index_first_seg-1
        while (len(sequence_before)<n) & (current_index>=0):
            segment=paths[target_genome_name][current_index]
            sequence_before=seg_seq[segment[1:]]+sequence_before
            current_index-=1
        return sequence_before[0:99]
    
    def get_sequence_after(last_seg,seg_seq,n,paths,feat):
        last_strand=convert_strand(last_seg[0])
        last_seg_stranded=last_strand+last_seg[1:]
        index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
    
        sequence_after=seg_seq[last_seg[1:]][feat.pos_stop:] # sequence left on the segment on which the cds ends (can be empty)
        current_index=index_last_seg+1
        while (len(sequence_after)<n) & (current_index>len(paths[target_genome_name])):
            segment=paths[target_genome_name][current_index]
            sequence_after=sequence_after+seg_seq[segment[1:]]
            current_index+=1
        return sequence_after[len(sequence_after)-100:]
    
    
    
        '''
        first_strand=convert_strand(segments_on_target_genome[first_seg][3])
        first_seg_stranded=first_strand+first_seg
        last_strand=convert_strand(segments_on_target_genome[last_seg][3])
        last_seg_stranded=last_strand+last_seg
        index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
        index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
        first_index=min(index_first_seg,index_last_seg)
        last_index=max(index_last_seg,index_first_seg)
        list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
        list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
        '''
    
    
    
    def get_sequence_on_genome(feature,segments_on_target_genome):
        list_seg=Features[feature].segments_list
        first_seg=get_first_seg(list_seg,segments_on_target_genome)
        last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)
    
        path_on_target=get_feature_path(paths,first_seg,last_seg)
    
        new_sequence=""
        for segment in path_on_target:
            if segment==cds.segments_list[0]:
                new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-1:]
            elif segment==cds.segments_list[-1]:
                new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
            else:
                new_sequence+=get_segment_sequence(seg_seq,segment)
    
    lines=var.readlines()
    var.close()
    
    # dict cds-var
    cds_var={}
    for line in lines:
        line=line.split()
        if line[1]=="CDS":
            cds_id=line[0].replace('.','_').replace(':','_')
            if cds_id not in cds_var.keys():
                cds_var[cds_id]=list()
            cds_var[cds_id].append(line)
    
    def findOtherStart(cds,segments_on_target_genome):
        print("\nrecherche de nouveau codon start:")
        frame_shift=0
        # chercher codon start en amont du cds, dans le mrna
        seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
        seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
        sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
        print("sequence en amont dans le mRNA:",sequence_amont)
        if "ATG" in sequence_amont:
            start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
            stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
            print("position des starts:",start_pos_list,"position des stops:",stop_pos_list,"en amont du cds") # positions (overlapping) où on trouve un atg.
            # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
            print("verification des stops après les starts:")
            for start_pos in start_pos_list:
                start_pos_frame=start_pos%3
                n=len(sequence_amont)-start_pos+1
                if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
                    #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
                    # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
                    frame_shift=(frame_shift+n)%3 # vérifier le frame shift !!
                    print("le start à la position",start_pos,",",n,"bases en amont du cds, n'a pas de stop en aval dans le même cadre de lecture")
                else:
                    print("le start à la position",start_pos,",",n,"bases en amont du cds, a un stop en aval dans le même cadre de lecture")
    
        # chercher codon start en aval, dans le cds
        if "ATG" in seq_cds:
            start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
            print("codon start candidat trouvé plus loin dans le cds, à la base",start_pos_list[0]) # print seulement le premier
        print("\n")
    
    
    
    for feature in Features.values():
        add_feature_sequence(feature,seg_seq)
    
    
    for cds_id in cds_var.keys():
        cds=Features[cds_id]
    
        print("analyse des variations dans le cds",cds_id)
    
            posVar_on_ref=int(var[12])-1
            posVar_on_alt=int(var[13])-1
            sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome)
    
            # frame var ? pos_var-1 %3
            if type_var!="no_var": # if there is a variation
    
                if "/" in var[11]: # if the variation has a different ref and alt size
    
                    size_var=var[11].split("/")
                else:
                    size_var=var[11]
    
                # how to use size_var if it can be in two different formats (list or int) ?
    
    
                print(var)
                if type_var=="insertion":
                    length_ref=0
                else:
                    length_ref=len(var[9])
                if type_var=="deletion":
                    length_alt=0
                else:
                    length_alt=len(var[10])
    
                if abs(length_alt-length_ref)%3 == 0: # taille diff 3k -> pas de frame shift.
    
    
                        print("variation entre deux codons sans décalage du cadre de lecture")
    
                        if type_var=="insertion":
                            print(type_var,"de",var[10])
                        if type_var=="deletion":
                            print(type_var,"de",var[9])
                        else:
                            print(type_var,"de",var[9],"par",var[10])
    
    
                        if length_ref%3==0: # taille 3K -> pas d'impact sur l'aa en aval
                            deleted_aa=traduction(get_rna(var[9]))
                            inserted_aa=traduction(get_rna(var[10]))
    
                            if (length_ref!=0) & (length_alt!=0):
    
                                    print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa))
    
                                    print("conséquence : mutation synonyme dans",",".join(deleted_aa))
    
                                print("conséquence : insertion de",",".join(inserted_aa))
    
                                print("conséquence : deletion de",",".join(deleted_aa))
    
                            
                        else: # taille != 3k : impact sur l'aa d'après. 
                        ## on peut peut-être fusionner ces deux cas (if et else courants), mais il faudra modifier
                        ## le calcul des derniers aa ins et del parce que ça bug quand il n'y a pas d'ins ou de del
                            
                            taille_del_entiere=(length_ref//3)*3
                            taille_ins_entiere=(length_alt//3)*3
    
    
                            deleted_aa=traduction(get_rna(var[9][0:taille_del_entiere]))
    
                            inserted_aa=traduction(get_rna(var[10][0:taille_ins_entiere]))
    
    
                            len_fragment_after=3-(length_ref%3)
                            reste_cds=cds.sequence[posVar_on_ref+length_ref:posVar_on_ref+length_ref+len_fragment_after] # sequence sur le génome d'origine
                            # pour inserter_aa, il faudrait faire la meme chose mais avec la sequence target !
                            # get_sequence_on_genome(cds_id,segments_on_target_genome)
    
    
                            reste_ins=var[10][-(length_alt%3):]
                            dernier_aa_ins=reste_ins+reste_cds
                            
                            reste_del=var[9][-(length_ref%3):]
                            dernier_aa_del=reste_del+reste_cds
    
                            deleted_aa.append(traduction(get_rna(dernier_aa_del))[0])
                            inserted_aa.append(traduction(get_rna(dernier_aa_ins))[0])
                            
                            print(dernier_aa_del,"->",dernier_aa_ins) # à supprimer. seulement le dernier aa de la délétion et de l'insertion. 
    
                                print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa))
    
                                print("conséquence : mutation synonyme dans",",".join(deleted_aa))
    
                        print("variation au milieu d'un codon sans décalage du cadre de lecture")
                        if type_var=="insertion":
                            print(type_var,"de",var[10])
                        if type_var=="deletion":
                            print(type_var,"de",var[9])
                        else:
                            print(type_var,"de",var[9],"par",var[10])
    
                        len_fragment_before=(posVar_on_ref)%3
                        len_fragment_after=(3-(len_fragment_before+length_ref))%3
    
                        total_ins=sequence_target[posVar_on_alt-len_fragment_before:posVar_on_alt+length_alt+len_fragment_after]
                        total_del=cds.sequence[posVar_on_ref-len_fragment_before:posVar_on_ref+length_ref+len_fragment_after]
    
                        deleted_aa=traduction(get_rna(total_del))
                        inserted_aa=traduction(get_rna(total_ins))
                        if deleted_aa!=inserted_aa:
                            print("conséquence : changement de",",".join(deleted_aa),"en",",".join(inserted_aa))
                        else:
                            print("conséquence : mutation synonyme dans",",".join(deleted_aa))
    
                    # possibilité que j'ai pris en compte une variation de plus, si on a un snp sur la premiere et la derniere base d'un codon : 
                    # pour le traitement de la première j'ai également considéré la dernière ! 
    
                    frame_shift=(frame_shift+abs(length_alt-length_ref))%3
                    
    
                    print("codon start touché")
    
                    findOtherStart(cds,segments_on_target_genome)
    
                # si on a plusieurs start candidats il faut choisir celui qu'on prend, ils sont pas forcément tous sur le même cadre de lecture
                # comment traiter la variation sur le start ? 
                # après avoir détecté que la var touche le start, traiter la var ? ou l'inverse
    
    
                print("\n")
    
    
    
    if version=="old":
        for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to make a prot. then detail var by cds.
            cds=Features[cds_id]
            print("analysing variations in cds",cds_id)
            add_feature_sequence(cds,seg_seq)
            cds_prot=traduction(get_rna(cds.sequence))
    
            list_seg=Features[cds_id].segments_list
            first_seg=get_first_seg(list_seg)
            last_seg=get_first_seg(reversed(list_seg))
            path_on_target=get_feature_path(paths,first_seg,last_seg)
    
            new_sequence=""
            for segment in path_on_target:
                if segment==cds.segments_list[0]:
                    new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-1:]
                elif segment==cds.segments_list[-1]:
                    new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
                else:
                    new_sequence+=get_segment_sequence(seg_seq,segment)
    
            new_prot=traduction(get_rna(new_sequence))
            print("original prot = ", cds_prot)
            print("new prot = ", new_prot) # print new version with new start and stop codons of deleted. (before and after the gene sequence)
    
    
            if new_prot[0]!="Met":
                print("no Met at the start of the new version of the protein -> start codon loss")
                if "Met" in new_prot:
                    print("Met found at position", (new_prot.index("Met")+1),"-> possible later start codon") # print cette version de la prot
    
                print("look for start codon before. if none, print 'no start codon, likely gene not active'")
                # récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot
    
            sequence_before=get_sequence_before(path_on_target[0],seg_seq,100,paths,cds)
            print(sequence_before)
    
    
            first_stop_index=new_prot.index("*") if "*" in new_prot else "None"
            if "*" not in new_prot:
                print("no stop codon")
                # récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot.
            elif first_stop_index+1!=len(new_prot):
                print("early stop codon at position",(first_stop_index+1),"instead of",len(new_prot))
            else:
                print("stop codon found at expected position")
    
            sequence_after=get_sequence_after(path_on_target[-1],seg_seq,100,paths,cds)
            print(sequence_after)
    
    
    
            for var in cds_var[cds_id]:
                #print("\n",var)
                size_var=int(var[11])
                type_var=var[8]
                pos_var=int(var[12])-1
    
    
                if type_var=="insertions":
                    sequence_var=var[10]
                    if size_var%3==0:
                        print("pas de décalage du cadre de lecture")
                        trad_seq_ins=traduction(get_rna(sequence_var))
    
                        if pos_var%3==0:
                            print("insertion entre deux codons")
                            if "*" in trad_seq_ins:
                                print("apparition d'un codon stop")
                                print(f'ancienne sequence : {", ".join(cds_prot)}')
                                print(f'nouvelle sequence : {", ".join(cds_prot[0:(pos_var//3)-1])}, *')
                            else:
                                print(f'ancienne sequence : {", ".join(cds_prot)}')
                                print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
    
                            print("insertion au milieu d'un codon, changement de certains acides amines")
    
                elif type_var=="deletions":
                    sequence_var=line[9]
                    if size_var%3==0:
                        print("pas de décalage du cadre de lecture")
                        trad_seq_ins=traduction(get_rna(sequence_var))
    
                        if pos_var%3==0:
                            print("deletion de codons entiers")
                            if "*" in trad_seq_ins:
                                print("disparition d'un codon stop")
                            else:
                                print(f'ancienne sequence : {", ".join(cds_prot)}')
                                print(f'{type_var} de {size_var//3} acides amines {", ".join(trad_seq_ins)} à la position {pos_var//3}')
    
                            print("deletion au milieu d'un codon, changement de certains acides amines")