Skip to content
Snippets Groups Projects
Functions.py 13.8 KiB
Newer Older
from .usefull_little_functions import get_seg_occ
from .intersect import invert_seg, Features
from .detect_inversion import detect_feature_inversion,invert_segment_list
from .detect_copies import find_gene_copies,get_first_last_seg


# functions to output the stats on the transfer

# stats about missing segments and feature type, not used, will change.
def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk,segments_on_target_genome):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
    feature_missing_segments[5].append(feature_id)

    if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
        feature_missing_segments[3].append(feature_id)
    elif first_seg != list_seg[0]: # the first segment is missing 
        feature_missing_segments[0].append(feature_id)
    elif last_seg!=list_seg[-1]: # the last segment is missing
        feature_missing_segments[2].append(feature_id)

    # go through all the segments, check if some are missing in the middle of the feature
    elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
        for segment in list_seg[1-(len(list_seg)-2)]:
            if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
                feature_missing_segments[1].append(feature_id)
                break

    # go through the segments, to see if one is missing anywhere on the feature
    for segment in list_seg:
        if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
            if feature_id not in feature_missing_segments[4]:
                feature_missing_segments[4].append(feature_id)
                break

    # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK FOR INSERTIONS !!
    if feature_id not in feature_missing_segments[4]:
        feature_missing_segments[6].append(feature_id)

def get_annot_features(list_features):
    list_annot_features=[]
    for feature in list_features:
        list_annot_features.append(Features[feature].note)
    return list_annot_features

def count_hypput_total(list_annot_first):
    total=len(list_annot_first)
    count_hypput=0
    for annot in list_annot_first:
        if ("hypothetical" in annot) or ("putative" in annot):
            count_hypput+=1
    return [count_hypput,total]

# print stats on the transfer : number of feature that have segments in different positions missing. 
def stats_features(feature_missing_segments):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
    list_annot_first=get_annot_features(feature_missing_segments[0])
    [hyp_put,total]=count_hypput_total(list_annot_first)
    print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_middle=get_annot_features(feature_missing_segments[1])
    [hyp_put,total]=count_hypput_total(list_annot_middle)
    print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_last=get_annot_features(feature_missing_segments[2])
    [hyp_put,total]=count_hypput_total(list_annot_last)
    print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_all=get_annot_features(feature_missing_segments[3])
    [hyp_put,total]=count_hypput_total(list_annot_all)
    print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_total=get_annot_features(feature_missing_segments[4])
    [hyp_put,total]=count_hypput_total(list_annot_total)
    print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_ok=get_annot_features(feature_missing_segments[6])
    [hyp_put,total]=count_hypput_total(list_annot_ok)
    print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    list_annot_features=get_annot_features(feature_missing_segments[5])
    [hyp_put,total]=count_hypput_total(list_annot_features)
    print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")




# functions to generate the different gffs



# add the paths of the feature on the target genome in the object Feature
def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_genome):
    feature=Features[feature_id]
    list_seg=feature.segments_list_source
    list_first_last_segs=get_first_last_seg(list_seg,segments_on_target_genome)

    for match in list_first_last_segs: # for each walk that has the feature
        [first_seg,last_seg,walk_name]=match
        # get the first and last segments of all the copies on this walk
        first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id,segments_on_target_genome)
        copy_number=0
        for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
            copy_number+=1
            copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
            feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
            feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
            feature.segments_list_target.append(feature_path)


            # get the pos start of the first and last segments (existing because done in find_gene_copies)
            # then look for segs that are between.
            walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id,segments_on_target_genome)[1]
            walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id,segments_on_target_genome)[2]
            feat_copy=(feature_id,copy_id)
            for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
                for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
                    if walk_start <= segment[1] <= walk_stop: 
                        segment.append(feat_copy) # annotate the segment : feature_id, copy_id
                        break

            # add the paths of the gene's child features
            add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome)
     
    if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
        feature.segments_list_target.append(['','',[]])

def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome):
    childs_list=Features[feature_id].get_child_list(Features)
    for child_id in childs_list:
        child=Features[child_id]

        # find child path in parent's path
        [child_first_seg,child_last_seg]=['','']
        for seg in child.segments_list_source:
            if seg in feature_target_path: # parent path
                child_first_seg=seg
                break
            elif invert_seg(seg) in feature_target_path:
                child_first_seg=invert_seg(seg)
                break

        if child_first_seg!='': # the child feature may be absent in this occurence of the parent
            for seg in reversed(child.segments_list_source):
                if seg in feature_target_path: # parent path
                    child_last_seg=seg
                    break
                elif invert_seg(seg) in feature_target_path:
                    child_last_seg=invert_seg(seg)
                    break
            
            # get the path of the child feature
            child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
            feat_copy=(child_id,copy_id)

            feature_path=[walk_name,copy_id]
            feature_path.append(child_path)
            child.segments_list_target.append(feature_path)

            for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
                segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id,segments_on_target_genome)
                segment.append(feat_copy)


# find the feature's path in target genome walk
def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome):
    # look for first_seg and last_seg that has the right copy_id for this feature
    first_seg_index=get_seg_occ(first_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4]
    last_seg_index=get_seg_occ(last_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4]
    first_index=min(first_seg_index,last_seg_index)
    last_index=max(first_seg_index,last_seg_index)
    feature_path_target_genome=target_genome_path[first_index:last_index+1]
    return feature_path_target_genome

# get the coverage and sequence id of a feature
def get_id_cov(feature_id,seg_size,target_list): # seg_size has unoriented segments : s25
    feature=Features[feature_id]
    source_list=feature.segments_list_source

    inversion=detect_feature_inversion(source_list,target_list)
    if inversion:
        target_list=invert_segment_list(target_list)

    [match,subs,inser,delet]=[0,0,0,0]

    [i,j]=[0,0]
    first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
    last=False # same for the last part of the feature # for id and ins.

    while (i<len(source_list)) and (j<len(target_list)):
        if i==len(source_list)-1:
            last=True
        if source_list[i] != target_list[j]: # if there is a difference between the two paths
            if target_list[j] not in source_list: # if the segment in target genome is absent in source genome
                if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution
                    add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
                    match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                    i+=1;j+=1
                else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
                    add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last)
                    match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                    j+=1
            elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
                add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last)
                match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                i+=1
            else : # if both segments are present in the other genome but not at the same position. weird case never found yet
                add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
                match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                i+=1;j+=1
        else: # segment present in both, no variation. 
            add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last)
            match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
            i+=1;j+=1

    if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
        add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last)
        match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]

    denom_cov=match+subs+delet
    denom_id=denom_cov+inser
    [cov,id]=[0,0]
    if denom_cov>0:
        cov=round((match+subs)/(match+subs+delet),3)
    if denom_id>0:
        id=round(match/(match+subs+inser+delet),3)
    return [cov,id]

# computes the cov/id calculation for a segment pair
def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last):
    [match,subs,inser,delet]=[0,0,0,0]
    match type:
        case "identity":
            if first:
                match+=seg_size[seg_a[1:]]-feature.pos_start+1
            elif last:
                match+=feature.pos_stop
            else:
                match+=seg_size[seg_a[1:]]
        case "substitution":
            if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size
                if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]:
                    subs+=seg_size[seg_a[1:]]
                    inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]]
                elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]:
                    subs+=seg_size[seg_b[1:]]
                    delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]]
            else:
                subs+=seg_size[seg_a[1:]]

        case "insertion":
            inser+=seg_size[seg_b[1:]]
        case "deletion":
            if first:
                delet+=seg_size[seg_a[1:]]-feature.pos_start+1
            else:
                delet+=seg_size[seg_a[1:]]
        case "end_deletion":
            for seg in seg_a[:-1]:
                delet+=seg_size[seg[1:]]
            delet+=feature.pos_stop

    return [match,subs,inser,delet,False] # check the orientation of the segment later