from .intersect import invert_seg # find all the copies of the segments from the source list in the target genome def find_all_seg(list_seg_source,walk_name,segments_on_target_genome): index=0 list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk] list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>] for seg in list_seg_source: list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>] seg_inverted=invert_seg(seg) # look for all the segment copies in the target genome walk, in both orientations if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg) for copy in segments_on_target_genome[seg][walk_name]: # take all its copies seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4] list_seg_target.append(seg_info) if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted for copy in segments_on_target_genome[seg_inverted][walk_name]: seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index] list_seg_target.append(seg_info) index+=1 list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position return [list_seg_target,list_seg_source_unstranded] def find_gene_copies(list_seg_source,walk_name,feature_id,segments_on_target_genome): # find all copies of all segments from the gene in the target genome (in both orientations) [list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name,segments_on_target_genome) # find each copy of the gene in the ordered list of segments [first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome) # join the first and last seg lists first_last_segs_list=[] index=0 for first_seg in first_segs_list: last_seg=last_segs_list[index] pair=(first_seg,last_seg) first_last_segs_list.append(pair) index+=1 return first_last_segs_list # return a list of pairs (first_seg,last_seg) # called by find_gene_copies def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome): # prepare the variables for the first iteration of the for loop [old_id,old_strand,old_start,old_index]=list_seg_target[0] [first_segs_list,last_segs_list]=[[],[]] old_seg_id=old_strand+old_id first_segs_list.append(old_seg_id) copy_number=1 copy_id="copy_"+str(copy_number) feat_copy=(feature_id,copy_id) for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info if segment[1]==old_start: copy_id="copy_"+str(copy_number) feat_copy=(feature_id,copy_id) segment.append(feat_copy) break # adjust old_index for the first iteration of the loop first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1]) if first_inversion: old_index+=1 else: old_index-=1 for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies [new_id,new_strand,new_start,new_index]=seg new_seg_id=new_strand+new_id inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk if inversion : if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy) last_segs_list.append(old_seg_id) add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy copy_number+=1 copy_id="copy_"+str(copy_number) feat_copy=(feature_id,copy_id) # new feature copy, change the info first_segs_list.append(new_seg_id) add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy else: if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy) last_segs_list.append(old_seg_id) add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy copy_number+=1 copy_id="copy_"+str(copy_number) feat_copy=(feature_id,copy_id) # new feature copy, change the info first_segs_list.append(new_seg_id) add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy [old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start] # add the last seg info last_segs_list.append(old_seg_id) add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the last seg of the last copy return [first_segs_list,last_segs_list] def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy,segments_on_target_genome): for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment if segment[1]==target_start: segment.append(feat_copy) # add seg info break def sort_seg_info(seg_info): return seg_info[2] # look for the segment on either strand of the target genome def search_seg_on_target_genome(segment,segments_on_target_genome): inverted_segment=invert_seg(segment) if segment in segments_on_target_genome: #if inverted_segment in segments_on_target_genome: # print(segment," found in both orientations") return segment elif inverted_segment in segments_on_target_genome: #print("inverted seg found *****") return inverted_segment else: return False # look for a segment on a walk, in either orientations def search_seg_on_walk(segment,walk,segments_on_target_genome): # for now just print the first found, look for several later... inverted_segment=invert_seg(segment) if segment in segments_on_target_genome: if walk in segments_on_target_genome[segment]: return segment elif inverted_segment in segments_on_target_genome: if walk in segments_on_target_genome[inverted_segment]: return inverted_segment else: return False # get the first and last segment of the list that is in the target genome (possibly several pairs) def get_first_last_seg(list_seg,segments_on_target_genome): list_first_last_segs=[] [first_seg_found,last_seg_found,walk_found]=['','',''] list_walks=get_walks_feature_cross(list_seg,segments_on_target_genome) # get all the walks where there is a segment of the feature for walk in list_walks: # find the first and last seg for each walk for segment in list_seg: # look for first_seg seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome) if seg_found: first_seg_found=seg_found break if first_seg_found!='': # look for last_seg for segment in reversed(list_seg): last_seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome) if last_seg_found: walk_found=walk break list_first_last_segs.append([first_seg_found,last_seg_found,walk_found]) [first_seg_found,last_seg_found,walk_found]=['','',''] # return all the match return list_first_last_segs # functions to get the detail of the variations in the features # find all the walks that contain a segment of the feature (list_seg is the walk of the feature on the source genome) def get_walks_feature_cross(list_seg,segments_on_target_genome): list_walks=list() for segment in list_seg: seg_found=search_seg_on_target_genome(segment,segments_on_target_genome) if seg_found: # if the segment or the reverse complement is on the target genome for walk in segments_on_target_genome[seg_found]: if walk not in list_walks: list_walks.append(walk) return list_walks