detect_copies.py

from .intersect import invert_seg

# find all the copies of the segments from the source list in the target genome
def find_all_seg(list_seg_source,walk_name,segments_on_target_genome):
    index=0
    list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
    list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
    for seg in list_seg_source:
        list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
        seg_inverted=invert_seg(seg)
        # look for all the segment copies in the target genome walk, in both orientations
        if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
            for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
                seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
                list_seg_target.append(seg_info)
        if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
            for copy in segments_on_target_genome[seg_inverted][walk_name]:
                seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
                list_seg_target.append(seg_info)
        index+=1
    list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
    return [list_seg_target,list_seg_source_unstranded]

def find_gene_copies(list_seg_source,walk_name,feature_id,segments_on_target_genome):
    # find all copies of all segments from the gene in the target genome (in both orientations)
    [list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name,segments_on_target_genome)
    # find each copy of the gene in the ordered list of segments
    [first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome)

    # join the first and last seg lists
    first_last_segs_list=[]
    index=0
    for first_seg in first_segs_list:
        last_seg=last_segs_list[index]
        pair=(first_seg,last_seg)
        first_last_segs_list.append(pair)
        index+=1

    return first_last_segs_list # return a list of pairs (first_seg,last_seg)

# called by find_gene_copies
def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome):
    # prepare the variables for the first iteration of the for loop
    [old_id,old_strand,old_start,old_index]=list_seg_target[0]
    [first_segs_list,last_segs_list]=[[],[]]
    old_seg_id=old_strand+old_id
    first_segs_list.append(old_seg_id)

    copy_number=1
    copy_id="copy_"+str(copy_number)
    feat_copy=(feature_id,copy_id)

    for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
        if segment[1]==old_start:
            copy_id="copy_"+str(copy_number)
            feat_copy=(feature_id,copy_id)
            segment.append(feat_copy)
            break
    # adjust old_index for the first iteration of the loop
    first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1])
    if first_inversion:
        old_index+=1
    else:
        old_index-=1

    for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
        [new_id,new_strand,new_start,new_index]=seg
        new_seg_id=new_strand+new_id
        inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk

        if inversion : 
            if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
                last_segs_list.append(old_seg_id)
                add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy

                copy_number+=1
                copy_id="copy_"+str(copy_number)
                feat_copy=(feature_id,copy_id) # new feature copy, change the info

                first_segs_list.append(new_seg_id)
                add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy
        else: 
            if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
                last_segs_list.append(old_seg_id)
                add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy

                copy_number+=1
                copy_id="copy_"+str(copy_number)
                feat_copy=(feature_id,copy_id) # new feature copy, change the info

                first_segs_list.append(new_seg_id)
                add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy

        [old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]

    # add the last seg info
    last_segs_list.append(old_seg_id)
    add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the last seg of the last copy

    return [first_segs_list,last_segs_list]

def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy,segments_on_target_genome):
    for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
        if segment[1]==target_start:
            segment.append(feat_copy) # add seg info
            break

def sort_seg_info(seg_info):
    return seg_info[2]

# look for the segment on either strand of the target genome
def search_seg_on_target_genome(segment,segments_on_target_genome):
    inverted_segment=invert_seg(segment)
    if segment in segments_on_target_genome:
        #if inverted_segment in segments_on_target_genome:
        #    print(segment," found in both orientations")
        return segment
    elif inverted_segment in segments_on_target_genome:
        #print("inverted seg found *****")
        return inverted_segment
    else:
        return False

# look for a segment on a walk, in either orientations
def search_seg_on_walk(segment,walk,segments_on_target_genome): # for now just print the first found, look for several later...
    inverted_segment=invert_seg(segment)
    if segment in segments_on_target_genome:
        if walk in segments_on_target_genome[segment]:
            return segment
    elif inverted_segment in segments_on_target_genome:
        if walk in segments_on_target_genome[inverted_segment]:
            return inverted_segment
    else:
        return False


# get the first and last segment of the list that is in the target genome (possibly several pairs)
def get_first_last_seg(list_seg,segments_on_target_genome):
    list_first_last_segs=[]
    [first_seg_found,last_seg_found,walk_found]=['','','']

    list_walks=get_walks_feature_cross(list_seg,segments_on_target_genome) # get all the walks where there is a segment of the feature
    for walk in list_walks: # find the first and last seg for each walk
        for segment in list_seg: # look for first_seg
            seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome)
            if seg_found:
                first_seg_found=seg_found
                break
        if first_seg_found!='': # look for last_seg
            for segment in reversed(list_seg):
                last_seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome)
                if last_seg_found:
                    walk_found=walk
                    break
        list_first_last_segs.append([first_seg_found,last_seg_found,walk_found])
        [first_seg_found,last_seg_found,walk_found]=['','','']
    # return all the match
    return list_first_last_segs

# functions to get the detail of the variations in the features

# find all the walks that contain a segment of the feature (list_seg is the walk of the feature on the source genome)
def get_walks_feature_cross(list_seg,segments_on_target_genome):
    list_walks=list()
    for segment in list_seg:
        seg_found=search_seg_on_target_genome(segment,segments_on_target_genome)
        if seg_found: # if the segment or the reverse complement is on the target genome
            for walk in segments_on_target_genome[seg_found]:
                if walk not in list_walks:
                    list_walks.append(walk)
    return list_walks