From 1f552e7b47563f921a001205e437685b242918f2 Mon Sep 17 00:00:00 2001 From: "nina.marthe_ird.fr" <nina.marthe@ird.fr> Date: Fri, 12 Apr 2024 11:44:37 +0200 Subject: [PATCH] added a function that finds all the copies of a gene in a given walk --- Functions.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/Functions.py b/Functions.py index 0e977ff..f8c504e 100644 --- a/Functions.py +++ b/Functions.py @@ -398,14 +398,110 @@ def add_target_genome_paths(feature_id,target_genome_paths): for match in list_first_last_segs: [first_seg,last_seg,walk_name]=match feature_path=[walk_name] + + # first check if several copies in the walk + # [first_segs_list,last_segs_list]=detect_gene_copies(list_seg,walk_name) + # copy_number=len(first_segs_list) + # # then get first_seg+last_seg pairs to create the paths + # if copy_number==1: + # feature_path.append(get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name)) + # feature.segments_list_target.append(feature_path) + # else: #Â several copies. + # # ADAPT TO ALL PAIRS OF SEGS feature_path.append(get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name)) feature.segments_list_target.append(feature_path) + if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. feature.segments_list_target.append(['',[]]) +def detect_gene_copies(list_seg_source,walk_name): + + # find all copies of all segments from the gene in the target genome (in both orientations) + index=0 + list_seg_target=[] # contains list of info for each seg [seg_id,seg_strand,start_pos,index_on_source_walk] + list_seg_source_unstranded=[] + for seg in list_seg_source: + list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>] + seg_inverted=invert_seg(seg) + #Â look for all the segment copies in the target genome walk, in both orientations + if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): + for copy in segments_on_target_genome[seg][walk_name]: + seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4] + list_seg_target.append(seg_info) + if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : + for copy in segments_on_target_genome[seg_inverted][walk_name]: + seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index] + list_seg_target.append(seg_info) + index+=1 + + list_seg_target.sort(key=sort_seg_info) #Â order the list of segments by start position + old_index=list_seg_target[0][3] + old_strand=list_seg_target[0][1] + copy_number=1 + first_segs_list=[] + last_segs_list=[] + old_seg_id=list_seg_target[0][1]+list_seg_target[0][0] + first_segs_list.append(old_seg_id) + #Â adjust old_index for the first iteration of the loop + first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1]) + if first_inversion: + old_index+=1 + else: + old_index-=1 + + # find each copy of the gene in the ordered list of segments + for seg in list_seg_target: + new_seg_id=seg[1]+seg[0] + new_index=seg[3] #Â index in the list_source + new_strand=seg[1] + seg_start=seg[2] + inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk + + if inversion : + if (old_strand==new_strand) and (old_index>new_index): # if the index decreases and the strand stays the same, it is the same gene copy + for segment in segments_on_target_genome[new_seg_id][walk_name]: + if segment[1]==seg_start: + copy_id="copy_"+str(copy_number) + segment.append(copy_id) + seg.append(copy_id) + else: # end of the copy + copy_number+=1 + last_segs_list.append(old_seg_id) + first_segs_list.append(new_seg_id) + for segment in segments_on_target_genome[new_seg_id][walk_name]: + if segment[1]==seg_start: + copy_id="copy_"+str(copy_number) + segment.append(copy_id) + seg.append(copy_id) + else: + if (old_strand==new_strand) and (old_index<new_index): # if the index increases and the strand stays the same, it is the same gene copy + for segment in segments_on_target_genome[new_seg_id][walk_name]: + if segment[1]==seg_start: + copy_id="copy_"+str(copy_number) + segment.append(copy_id) + seg.append(copy_id) + else: # end of the copy + copy_number+=1 + last_segs_list.append(old_seg_id) + first_segs_list.append(new_seg_id) + for segment in segments_on_target_genome[new_seg_id][walk_name]: + if segment[1]==seg_start: + copy_id="copy_"+str(copy_number) + segment.append(copy_id) + seg.append(copy_id) + # if the strand changes, it is possible that it is an inversion inside the gene. treat this case later + old_strand=new_strand + old_index=new_index + old_seg_id=new_seg_id + last_segs_list.append(old_seg_id) + + return [first_segs_list,last_segs_list] # return two lists, first_segs and last_segs + +def sort_seg_info(seg_info): + return seg_info[2] + # find the feature's path in target genome walk def get_feature_path(target_genome_path,first_seg,last_seg,walk_name): - # if there is several occurences of first_seg and last_seg ??? first_seg_index=segments_on_target_genome[first_seg][walk_name][-1][4] last_seg_index=segments_on_target_genome[last_seg][walk_name][-1][4] first_index=min(first_seg_index,last_seg_index) -- GitLab