Skip to content
Snippets Groups Projects
alignment.py 7.01 KiB
Newer Older
from .intersect import Features
from .usefull_little_functions import get_segment_sequence
from .detect_inversion import detect_feature_inversion,invert_segment_list

# functions to get the alignment for the transfered genes

# get the alignment between the features on the source genome and the features on the target genome 
def print_alignment(feat,match,seg_seq,file_out_aln):
    if match[0]!='':
        feature_target_path=match[2]
        if len(feature_target_path)>0: # if the feature is not completely absent
            feature_path_source_genome=Features[feat].segments_list_source
            inversion=detect_feature_inversion(feature_path_source_genome,feature_target_path)
            if inversion:
                feature_target_path=invert_segment_list(feature_target_path)

            line_aln=create_line_aln(feature_path_source_genome,feature_target_path,seg_seq,feat)
            file_out_aln.write(line_aln)

# creates an alignment for two segments
def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):

    match type:
        case "identity":
            if first:
                feature=Features[feature_id]
                seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
            elif last:
                feature=Features[feature_id]
                seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]
            else:
                seq_aln=get_segment_sequence(seg_seq,seg_a)
            line_a=seq_aln
            line_b=seq_aln
            len_aln=len(seq_aln)
            line_c=len_aln*"*"
        case "substitution":
            seq_aln_a=get_segment_sequence(seg_seq,seg_a)
            seq_aln_b=get_segment_sequence(seg_seq,seg_b)
            len_a=len(seq_aln_a)
            len_b=len(seq_aln_b)
            if len_a>len_b:
                diff_len=len_a-len_b
                line_a=seq_aln_a
                line_b=seq_aln_b+diff_len*"-"
                line_c=len_a*" "
            else:
                diff_len=len_b-len_a
                line_a=seq_aln_a+diff_len*"-"
                line_b=seq_aln_b
                line_c=len_b*" "
        case "insertion":
            seq_aln_b=get_segment_sequence(seg_seq,seg_b)
            len_b=len(seq_aln_b)
            line_a=len_b*"-"
            line_b=seq_aln_b
            line_c=len_b*" "
        case "deletion":
            if first:
                feature=Features[feature_id]
                seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
            else:
                seq_aln_a=get_segment_sequence(seg_seq,seg_a)
            len_a=len(seq_aln_a)
            line_a=seq_aln_a
            line_b=len_a*"-"
            line_c=len_a*" "
        case "end_deletion":
            seq_aln_a=""
            for segment in seg_a[:-1]:
                seq_aln_a+=get_segment_sequence(seg_seq,segment)
            feature=Features[feature_id]
            seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on
            len_a=len(seq_aln_a)
            line_a=seq_aln_a
            line_b=len_a*"-"
            line_c=len_a*" "

    return [line_a,line_b,line_c,False] # check the orientation of the segment later

# formats the alignment lines
def parse_aln_lines(line_a,line_b,line_c,feature_id):
    if (len(line_a)!=len(line_b)) or (len(line_b)!=len(line_c)):
        print("line lengths differ in alignment")
    len_to_parse=len(line_a)
    len_parsed=0
    aln_line=""
    nb_res_a=0
    nb_res_b=0

    while len_parsed<len_to_parse:
        len_header=len(feature_id)+11
        headers=[feature_id+"_source    ",feature_id+"_target    ",len_header*" "]

        add_a=line_a[len_parsed:len_parsed+60]
        add_b=line_b[len_parsed:len_parsed+60]
        add_c=line_c[len_parsed:len_parsed+60]
        nb_res_a+=len(add_a)-add_a.count("-")
        nb_res_b+=len(add_b)-add_b.count("-")
        aln_line+=f'{headers[0]}{add_a}    {nb_res_a}\n'
        aln_line+=f'{headers[1]}{add_b}    {nb_res_b}\n'
        aln_line+=f'{headers[2]}{add_c}\n\n'
        len_parsed+=60
    aln_line+="\n"
    
    return aln_line

# creates the alignment for a feature
def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):
    line_a=""
    line_b=""
    line_c=""
    [i,j]=[0,0]
    first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
    last=False # same for the last part of the feature

    while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
        if i==len(feature_path_source_genome)-1:
            last=True
        if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
            if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
                if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome : substitution
                    [add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
                    line_a+=add_a;line_b+=add_b;line_c+=add_c
                    i+=1;j+=1
                else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
                    [add_a,add_b,add_c,first]=segment_aln("insertion",seg_seq,"",feature_path_target_genome[j],first,feature_id,last)
                    line_a+=add_a;line_b+=add_b;line_c+=add_c
                    j+=1
            elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
                [add_a,add_b,add_c,first]=segment_aln("deletion",seg_seq,feature_path_source_genome[i],"",first,feature_id,last)
                line_a+=add_a;line_b+=add_b;line_c+=add_c
                i+=1
            else : # if both segments are present in the other genome but not at the same position. weird case never found yet
                [add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
                line_a+=add_a;line_b+=add_b;line_c+=add_c
                i+=1;j+=1

        else: # segment present in both, no variation. 
            [add_a,add_b,add_c,first]=segment_aln("identity",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
            line_a+=add_a;line_b+=add_b;line_c+=add_c
            i+=1;j+=1

    if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
        [add_a,add_b,add_c,first]=segment_aln("end_deletion",seg_seq,feature_path_source_genome[i:],"",first,feature_id,last)
        line_a+=add_a;line_b+=add_b;line_c+=add_c

    return parse_aln_lines(line_a,line_b,line_c,feature_id)