From 83f25031fe503f62a0b01504c1ca3757a52b9fa3 Mon Sep 17 00:00:00 2001 From: "nina.marthe_ird.fr" <nina.marthe@ird.fr> Date: Wed, 15 May 2024 11:32:14 +0200 Subject: [PATCH] reorganised some functions, made them smaller, added comments --- Functions.py | 223 ++++++++++++++++++++++---------------------- Functions_output.py | 80 ++++++++-------- 2 files changed, 152 insertions(+), 151 deletions(-) diff --git a/Functions.py b/Functions.py index f06537e..a085ef1 100644 --- a/Functions.py +++ b/Functions.py @@ -416,102 +416,124 @@ def add_target_genome_paths(feature_id,target_genome_paths): list_seg=feature.segments_list_source list_first_last_segs=get_first_last_seg(list_seg) - for match in list_first_last_segs: + for match in list_first_last_segs: #Â for each walk that has the feature [first_seg,last_seg,walk_name]=match - # get the first and last segments of all the copies - [first_last_segs_list]=detect_gene_copies(list_seg,walk_name,feature_id) + # get the first and last segments of all the copies on this walk + first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id) copy_number=0 for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies copy_number+=1 copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg - feature_path=[walk_name] - feature_path.append(copy_id) feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id) - feature_path.append(feature_target_path) + feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature feature.segments_list_target.append(feature_path) # add the feat_copy to all the segments !!! - #Â get the pos start of first and last, they are always present + #Â get the pos start of the first and last segments (existing because done in find_gene_copies) # then look for segs that are between. walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id)[1] walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id)[2] feat_copy=(feature_id,copy_id) - for seg in feature_target_path[1:-1]: # the first and last segs are already done in detect_gene_copies + for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies for segment in segments_on_target_genome[seg][walk_name]: #Â find the right occurence if walk_start <= segment[1] <= walk_stop: - segment.append(feat_copy) + segment.append(feat_copy) # annotate the segment : feature_id, copy_id break - # add paths of childs - childs_list=get_child_list(feature_id) - for child_id in childs_list: - child=Features[child_id] + # add the paths of the gene's child features + add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths) + + if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ? + feature.segments_list_target.append(['','',[]]) - # find child path in parent's path - [child_first_seg,child_last_seg]=['',''] - for seg in child.segments_list_source: #Â DOESNT TAKE INTO ACCOUNT THE INVERSIONS !!! - if seg in feature_target_path: #Â parent path - child_first_seg=seg - break - elif invert_seg(seg) in feature_target_path: - child_first_seg=invert_seg(seg) - break +def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths): + childs_list=get_child_list(feature_id) + for child_id in childs_list: + child=Features[child_id] + + # find child path in parent's path + [child_first_seg,child_last_seg]=['',''] + for seg in child.segments_list_source: + if seg in feature_target_path: #Â parent path + child_first_seg=seg + break + elif invert_seg(seg) in feature_target_path: + child_first_seg=invert_seg(seg) + break - if child_first_seg!='': # the child may be absent in this occurence ! - for seg in reversed(child.segments_list_source): - if seg in feature_target_path: #Â parent path - child_last_seg=seg - break - elif invert_seg(seg) in feature_target_path: - child_last_seg=invert_seg(seg) - break - - #Â get the index of these segs, to get the sous-liste - child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id) - feat_copy=(child_id,copy_id) - - feature_path=[walk_name,copy_id] - feature_path.append(child_path) - child.segments_list_target.append(feature_path) - - for segment_id in child_path: - segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id) - segment.append(feat_copy) + if child_first_seg!='': # the child feature may be absent in this occurence of the parent + for seg in reversed(child.segments_list_source): + if seg in feature_target_path: #Â parent path + child_last_seg=seg + break + elif invert_seg(seg) in feature_target_path: + child_last_seg=invert_seg(seg) + break - if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ? - feature.segments_list_target.append(['','',[]]) + #Â get the path of the child feature + child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id) + feat_copy=(child_id,copy_id) -def detect_gene_copies(list_seg_source,walk_name,feature_id): + feature_path=[walk_name,copy_id] + feature_path.append(child_path) + child.segments_list_target.append(feature_path) - # find all copies of all segments from the gene in the target genome (in both orientations) + for segment_id in child_path: # annotate the segments with info about the occurence of the child feature + segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id) + segment.append(feat_copy) + +# find all the copies of the segments from the source list in the target genome +def find_all_seg(list_seg_source,walk_name): index=0 - list_seg_target=[] # contains list of info for each seg [seg_id,seg_strand,start_pos,index_on_source_walk] - list_seg_source_unstranded=[] + list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk] + list_seg_source_unstranded=[] #Â contains the path in the source genome, but with the strand separated from the segment_id : [s24,>] for seg in list_seg_source: list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>] seg_inverted=invert_seg(seg) #Â look for all the segment copies in the target genome walk, in both orientations - if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): - for copy in segments_on_target_genome[seg][walk_name]: + if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg) + for copy in segments_on_target_genome[seg][walk_name]: # take all its copies seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4] list_seg_target.append(seg_info) - if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : + if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : #Â same but with the segment inverted for copy in segments_on_target_genome[seg_inverted][walk_name]: seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index] list_seg_target.append(seg_info) index+=1 - list_seg_target.sort(key=sort_seg_info) #Â order the list of segments by start position - old_index=list_seg_target[0][3] - old_strand=list_seg_target[0][1] - copy_number=1 - first_segs_list=[] - last_segs_list=[] - old_seg_id=list_seg_target[0][1]+list_seg_target[0][0] + return [list_seg_target,list_seg_source_unstranded] + +def find_gene_copies(list_seg_source,walk_name,feature_id): + # find all copies of all segments from the gene in the target genome (in both orientations) + [list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name) + # find each copy of the gene in the ordered list of segments + [first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id) + + # join the first and last seg lists + first_last_segs_list=[] + index=0 + for first_seg in first_segs_list: + last_seg=last_segs_list[index] + pair=(first_seg,last_seg) + first_last_segs_list.append(pair) + index+=1 + + return first_last_segs_list # return a list of pairs (first_seg,last_seg) + +#Â called by find_gene_copies +def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id): + # prepare the variables for the first iteration of the for loop + [old_id,old_strand,old_start,old_index]=list_seg_target[0] + [first_segs_list,last_segs_list]=[[],[]] + old_seg_id=old_strand+old_id first_segs_list.append(old_seg_id) - old_start=list_seg_target[0][2] - for segment in segments_on_target_genome[old_seg_id][walk_name]: #Â add first seg info + + copy_number=1 + copy_id="copy_"+str(copy_number) + feat_copy=(feature_id,copy_id) + + for segment in segments_on_target_genome[old_seg_id][walk_name]: #Â look for the first seg to add the occurence info if segment[1]==old_start: copy_id="copy_"+str(copy_number) feat_copy=(feature_id,copy_id) @@ -523,71 +545,48 @@ def detect_gene_copies(list_seg_source,walk_name,feature_id): old_index+=1 else: old_index-=1 - - # find each copy of the gene in the ordered list of segments - for seg in list_seg_target: - new_seg_id=seg[1]+seg[0] - new_index=seg[3] #Â index in the list_source - new_strand=seg[1] - seg_start=seg[2] + + for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies + [new_id,new_strand,new_start,new_index]=seg + new_seg_id=new_strand+new_id inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk if inversion : - if not((old_strand==new_strand) and (old_index>new_index)): # if the index decreases and the strand stays the same, it is the same gene copy + if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy) last_segs_list.append(old_seg_id) - for segment in segments_on_target_genome[old_seg_id][walk_name]: #Â add last seg info - if segment[1]==old_start: - copy_id="copy_"+str(copy_number) - feat_copy=(feature_id,copy_id) - segment.append(feat_copy) - break + add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) #Â add info for the last seg of the previous copy + copy_number+=1 + copy_id="copy_"+str(copy_number) + feat_copy=(feature_id,copy_id) # new feature copy, change the info + first_segs_list.append(new_seg_id) - for segment in segments_on_target_genome[new_seg_id][walk_name]: #Â add first seg info - if segment[1]==seg_start: - copy_id="copy_"+str(copy_number) - feat_copy=(feature_id,copy_id) - segment.append(feat_copy) - break + add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) #Â add info for the first seg of the new copy else: - if not((old_strand==new_strand) and (old_index<new_index)): # if the index increases and the strand stays the same, it is the same gene copy + if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy) last_segs_list.append(old_seg_id) - for segment in segments_on_target_genome[old_seg_id][walk_name]: #Â add last seg info - if segment[1]==old_start: - copy_id="copy_"+str(copy_number) - feat_copy=(feature_id,copy_id) - segment.append(feat_copy) - break + add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) #Â add info for the last seg of the previous copy + copy_number+=1 + copy_id="copy_"+str(copy_number) + feat_copy=(feature_id,copy_id) # new feature copy, change the info + first_segs_list.append(new_seg_id) - for segment in segments_on_target_genome[new_seg_id][walk_name]: #Â add first seg info - if segment[1]==seg_start: - copy_id="copy_"+str(copy_number) - feat_copy=(feature_id,copy_id) - segment.append(feat_copy) - break - - old_strand=new_strand - old_index=new_index - old_seg_id=new_seg_id - old_start=seg_start + add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) #Â add info for the first seg of the new copy + + [old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start] + + #Â add the last seg info last_segs_list.append(old_seg_id) - for segment in segments_on_target_genome[old_seg_id][walk_name]: #Â add last seg info - if segment[1]==seg_start: - copy_id="copy_"+str(copy_number) - feat_copy=(feature_id,copy_id) - segment.append(feat_copy) - break + add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy) #Â add info for the last seg of the last copy - first_last_segs_list=[] - index=0 - for first_seg in first_segs_list: - last_seg=last_segs_list[index] - pair=(first_seg,last_seg) - first_last_segs_list.append(pair) - index+=1 + return [first_segs_list,last_segs_list] - return [first_last_segs_list] # return a list of pairs (first_seg,last_seg) +def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy): + for segment in segments_on_target_genome[target_seg_id][walk_name]: #Â look for the right occurence of the segment + if segment[1]==target_start: + segment.append(feat_copy) #Â add seg info + break def sort_seg_info(seg_info): return seg_info[2] diff --git a/Functions_output.py b/Functions_output.py index 40782fe..ea7301f 100644 --- a/Functions_output.py +++ b/Functions_output.py @@ -25,48 +25,13 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered, match.append(False) # store the information that this gene copy didnt pass the filters return "no" - if right_size(size_on_new_genome,max_diff,feature_id):# or (feature.type!="gene"): + if right_size(size_on_new_genome,max_diff,feature_id): line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id) # transfer the gene write_line(line_gff,output_target_gff,False) # transfer all the gene's childs - # MAKE A FUNCTION FOR THAT - childs_list=get_child_list(feature_id) - for child_id in childs_list: - child=Features[child_id] - # look for the first and last seg of the child in its parent path - [child_first_seg,child_last_seg]=['',''] - for seg in child.segments_list_source: #Â get first seg in the parent path - if seg in feature_target_path: #Â parent path - child_first_seg=seg - break - elif invert_seg(seg) in feature_target_path: - child_first_seg=invert_seg(seg) - break - - if child_first_seg!='': # the child may be absent in this occurence ! - for seg in reversed(child.segments_list_source): #Â get last seg in the parent path - if seg in feature_target_path: #Â parent path - child_last_seg=seg - break - elif invert_seg(seg) in feature_target_path: - child_last_seg=invert_seg(seg) - break - if inversion: # done twice, make a function - child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1 - else: - child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1 - child_size_diff=abs(child_size_on_new_genome-child.size) - for match in child.segments_list_target: - if (match[0]==walk) and (match[1]==copy_id): - seg_list=match[2] - [cov,id]=get_id_cov(child_id,seg_size,seg_list) - - #Â transfer the child feature: - line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id) - write_line(line_gff,output_target_gff,False) - - + transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size) + match.append(True) # store the information that this gene copy was transfered return size_diff @@ -78,7 +43,44 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered, reason_features_not_transfered[1]+=1 match.append(False) # store the information that this gene copy didnt pass the filters return "no" - + +def transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size): + childs_list=get_child_list(feature_id) + for child_id in childs_list: + child=Features[child_id] + # look for the first and last seg of the child in its parent path + [child_first_seg,child_last_seg]=['',''] + for seg in child.segments_list_source: #Â get first seg in the parent path + if seg in feature_target_path: #Â parent path + child_first_seg=seg + break + elif invert_seg(seg) in feature_target_path: + child_first_seg=invert_seg(seg) + break + + if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature + for seg in reversed(child.segments_list_source): #Â get last seg in the parent path + if seg in feature_target_path: #Â parent path + child_last_seg=seg + break + elif invert_seg(seg) in feature_target_path: + child_last_seg=invert_seg(seg) + break + + if inversion: #Â calculate the child feature size + child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1 + else: + child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1 + child_size_diff=abs(child_size_on_new_genome-child.size) + + for match in child.segments_list_target: # look for the right occurence of the child feature + if (match[0]==walk) and (match[1]==copy_id): + seg_list=match[2] + [cov,id]=get_id_cov(child_id,seg_size,seg_list) + + #Â transfer the child feature: + line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id) + write_line(line_gff,output_target_gff,False) # handles all the transfer, variation, alignment (stats?), on the target genome. def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args): -- GitLab