From e91317893db47bdbff4c46097e40423d4d7fcad7 Mon Sep 17 00:00:00 2001 From: "nina.marthe_ird.fr" <nina.marthe@ird.fr> Date: Wed, 12 Mar 2025 11:19:50 +0100 Subject: [PATCH] changed feature_on_target_genome occurence into a class FeaturePath --- GrAnnoT/alignment.py | 11 +++---- GrAnnoT/detect_copies.py | 2 +- GrAnnoT/find_feature_target_path.py | 27 +++++++++++++--- GrAnnoT/genome_transfer.py | 29 ++++++++--------- GrAnnoT/load_gfa.py | 21 +------------ GrAnnoT/setup_transfer.py | 48 ++++++++++++----------------- GrAnnoT/usefull_little_functions.py | 12 -------- GrAnnoT/variation_details.py | 13 ++++---- 8 files changed, 66 insertions(+), 97 deletions(-) diff --git a/GrAnnoT/alignment.py b/GrAnnoT/alignment.py index 5ba684b..8f92f78 100644 --- a/GrAnnoT/alignment.py +++ b/GrAnnoT/alignment.py @@ -6,15 +6,14 @@ from .detect_inversion import detect_feature_inversion # get the alignment between the features on the source genome and the features on the target genome def print_alignment(feat,match,seg_info,file_out_aln): - if match[0]!='': - feature_target_path=match[2] - if len(feature_target_path)>0: # if the feature is not completely absent + if match.walk_name!='': + if len(match.path)>0: # if the feature is not completely absent feature_path_source_genome=Features[feat].segments_list_source - inversion=detect_feature_inversion(feature_path_source_genome,feature_target_path) + inversion=detect_feature_inversion(feature_path_source_genome,match.path) if inversion: - feature_target_path=invert_segment_list(feature_target_path) + match.path=invert_segment_list(match.path) - line_aln=create_line_aln(feature_path_source_genome,feature_target_path,seg_info,feat) + line_aln=create_line_aln(feature_path_source_genome,match.path,seg_info,feat) file_out_aln.write(line_aln) # creates an alignment for two segments diff --git a/GrAnnoT/detect_copies.py b/GrAnnoT/detect_copies.py index abf1ea6..17395b6 100644 --- a/GrAnnoT/detect_copies.py +++ b/GrAnnoT/detect_copies.py @@ -136,7 +136,7 @@ def get_first_last_seg(list_seg,segments_on_target_genome): break list_first_last_segs.append([first_seg_found,last_seg_found,walk_found]) [first_seg_found,last_seg_found,walk_found]=['','',''] - # return all the match + # return all the matches return list_first_last_segs diff --git a/GrAnnoT/find_feature_target_path.py b/GrAnnoT/find_feature_target_path.py index a62b010..b2cf4f2 100644 --- a/GrAnnoT/find_feature_target_path.py +++ b/GrAnnoT/find_feature_target_path.py @@ -20,7 +20,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge copy_number+=1 copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome) - feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature + feature_path=FeaturePath(walk_name,copy_id,feature_target_path) # informations about the occurence of the feature feature.segments_list_target.append(feature_path) # add the feat_copy to all the segments @@ -40,7 +40,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome) if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ? - feature.segments_list_target.append(['','',[]]) + feature.segments_list_target.append(FeaturePath('','',[])) def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome): childs_list=Features[feature_id].get_child_list(Features) @@ -73,9 +73,7 @@ def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_gen child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id,segments_on_target_genome) feat_copy=(child.key,copy_id) - feature_path=[walk_name,copy_id] - feature_path.append(child_path) - child.segments_list_target.append(feature_path) + child.segments_list_target.append(FeaturePath(walk_name,copy_id,child_path)) for segment_id in child_path: # annotate the segments with info about the occurence of the child feature segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id,segments_on_target_genome) @@ -91,3 +89,22 @@ def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,fea last_index=max(first_seg_index,last_seg_index) feature_path_target_genome=target_genome_path[first_index:last_index+1] return feature_path_target_genome + + +class FeaturePath: + def __init__(self,walk_name,copy_id,path): + self.walk_name=walk_name + self.copy_id=copy_id + self.path=path + self.first_seg='' + self.last_seg='' + if len(path)>0: + self.first_seg=path[0] + self.last_seg=path[-1] + self.pass_filter=None + self.cov=0 + self.id=0 + + def add_cov_id(self,cov_id): + self.cov=cov_id[0] + self.id=cov_id[1] \ No newline at end of file diff --git a/GrAnnoT/genome_transfer.py b/GrAnnoT/genome_transfer.py index 9943eb3..4ca4b09 100644 --- a/GrAnnoT/genome_transfer.py +++ b/GrAnnoT/genome_transfer.py @@ -7,35 +7,31 @@ from .compute_cov_id import get_id_cov def generate_target_gff(feature_id,seg_info,args,reason_features_not_transfered,match,file_out_gff,segments_on_target_genome): feature=Features[feature_id] - #Â get list of the segments where it is and the first and last segment of the feature on the new genome - [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match) - inversion=detect_feature_inversion(feature.segments_list_source,feature_target_path) - if first_seg!='': # feature present on the target genome + inversion=detect_feature_inversion(feature.segments_list_source,match.path) + if match.first_seg!='': # feature present on the target genome if inversion: - size_on_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id,segments_on_target_genome)-get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id,segments_on_target_genome)+1 + size_on_new_genome=get_feature_start_on_target_genome_inv(match.last_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)-get_feature_stop_on_target_genome_inv(match.first_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)+1 else: - size_on_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id,segments_on_target_genome)-get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id,segments_on_target_genome)+1 + size_on_new_genome=get_feature_stop_on_target_genome(match.last_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)-get_feature_start_on_target_genome(match.first_seg,feature_id,match.walk_name,match.copy_id,segments_on_target_genome)+1 size_diff=abs(size_on_new_genome-feature.size) - [cov,id]=match[3] - - if (cov*100<args.coverage) or (id*100<args.identity): + if (match.cov*100<args.coverage) or (match.id*100<args.identity): reason_features_not_transfered[1]+=1 - match.append(False) # store the information that this gene copy didnt pass the filters + match.pass_filter=False # store the information that this gene copy didnt pass the filters return "no" else: - line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id,segments_on_target_genome) # transfer the gene + line_gff=create_line_target_gff(match.first_seg,match.last_seg,feature_id,size_diff,inversion,match.walk_name,match.cov,match.id,match.copy_id,segments_on_target_genome) # transfer the gene file_out_gff.write(line_gff) # transfer all the gene's childs - transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_info,file_out_gff,segments_on_target_genome) + transfer_childs(feature_id,match.path,match.walk_name,match.copy_id,inversion,seg_info,file_out_gff,segments_on_target_genome) - match.append(True) # store the information that this gene copy was transfered + match.pass_filter=True # store the information that this gene copy was transfered return size_diff else : reason_features_not_transfered[0]+=1 - match.append(False) # store the information that this gene copy didnt pass the filters + match.pass_filter=False # store the information that this gene copy didnt pass the filters return "no" @@ -68,9 +64,8 @@ def transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_in child_size_diff=abs(child_size_on_new_genome-child.size) for match in child.segments_list_target: # look for the right occurence of the child feature - if (match[0]==walk) and (match[1]==copy_id): - seg_list=match[2] - [cov,id]=get_id_cov(child.key,seg_info,seg_list) + if (match.walk_name==walk) and (match.copy_id==copy_id): + [cov,id]=get_id_cov(child.key,seg_info,match.path) break if inversion: diff --git a/GrAnnoT/load_gfa.py b/GrAnnoT/load_gfa.py index 1b31a0f..5557848 100644 --- a/GrAnnoT/load_gfa.py +++ b/GrAnnoT/load_gfa.py @@ -124,24 +124,6 @@ class SegmentTarget: return list_seg - -# # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list) -# def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments. -# with open(pos_seg,'r') as bed: -# seg_count=0 -# file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename -# for line in bed: -# line=line.split() -# [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system -# # check if segment present twice on the same walk ??? -# #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name] -# if seg not in segments_on_target_genome: -# segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk -# if file_name not in segments_on_target_genome[seg]: -# segments_on_target_genome[seg][file_name]=list() -# segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index]) -# seg_count+=1 - # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list) def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments. with open(pos_seg,'r') as bed: @@ -151,9 +133,8 @@ def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add t line=line.split() [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system # check if segment present twice on the same walk ??? - #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name] if seg not in segments_on_target_genome: - segments_on_target_genome[seg]=SegmentTarget() # dict of walks->segment_info; you get the info about the segment for each walk + segments_on_target_genome[seg]=SegmentTarget() # list of walks -> segment_info; you get the info about the segment for each walk segments_on_target_genome[seg].add_occurence(file_name,[chrom,start,stop,strand,index]) seg_count+=1 \ No newline at end of file diff --git a/GrAnnoT/setup_transfer.py b/GrAnnoT/setup_transfer.py index 3f35a01..826b24d 100644 --- a/GrAnnoT/setup_transfer.py +++ b/GrAnnoT/setup_transfer.py @@ -2,7 +2,6 @@ from .intersect import Features from .compute_cov_id import get_id_cov from .find_feature_target_path import add_target_genome_paths from .compute_transfer_stat import stats_feature_missing_segment,stats_features -from .usefull_little_functions import get_featurePath_ends from .load_gfa import get_segments_sequence from .genome_transfer import generate_target_gff @@ -41,11 +40,11 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path feature=Features[feat] if feature.parent=="": # usually a gene for match in feature.segments_list_target: # compute cov and id for all matches. - if match[0]=='': + if match.walk_name=='': feature_target_path=[] else: - feature_target_path=match[2] - match.append(get_id_cov(feat,seg_info,feature_target_path)) + feature_target_path=match.path + match.add_cov_id(get_id_cov(feat,seg_info,feature_target_path)) for match in feature.segments_list_target: # if option no_dupl, only transfer the best match if args.no_duplication: @@ -53,15 +52,11 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path all_match=feature.segments_list_target best_match=all_match[0] for candidate_match in all_match: - candidate_cov=candidate_match[3][0] - candidate_id=candidate_match[3][1] - best_cov=best_match[3][0] - best_id=best_match[3][1] - if (candidate_cov+candidate_id)>(best_cov+best_id): + if (candidate_match.cov+candidate_match.id)>(best_match.cov+best_match.id): best_match=candidate_match if match!=best_match: # if current match is not best match, dont transfer - match.append(False) + match.pass_filter=False continue transfer_stat=generate_target_gff(feat,seg_info,args,reason_features_not_transfered,match,file_out_gff,segments_on_target_genome) # the childs are handled in this function @@ -76,8 +71,8 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path list_seg=Features[feat].segments_list_source if Features[feat].parent=="": feature_target_path=[] - for occurence in Features[feat].segments_list_target: # add all the occurences of the feature in the target genome - feature_target_path+=occurence[2] + for match in Features[feat].segments_list_target: # add all the occurences of the feature in the target genome + feature_target_path+=match.path for segment in list_seg: segments_list[segment[1:]]='' for segment in feature_target_path: @@ -90,19 +85,18 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path if feature.parent=="": # usually a gene for match in feature.segments_list_target: # compute cov and id for all matches. - if match[0]=='': + if match.walk_name=='': feature_target_path=[] else: - feature_target_path=match[2] - match.append(get_id_cov(feat,seg_info,feature_target_path)) + feature_target_path=match.path + match.add_cov_id(get_id_cov(feat,seg_info,feature_target_path)) for match in feature.segments_list_target: - [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match) # add the filter info in all the matches... - [cov,id]=get_id_cov(feature_id,seg_info,feature_target_path) + [cov,id]=get_id_cov(feature_id,seg_info,match.feature_target_path) - if (cov*100<args.coverage) or (id*100<args.identity) or (first_seg==''): # didnt put the "right_size" filter) - match.append(False) # store the information that this gene copy didnt pass the filters + if (cov*100<args.coverage) or (id*100<args.identity) or (match.first_seg==''): # didnt put the "right_size" filter) + match.pass_filter=False # store the information that this gene copy didnt pass the filters else: # if option no_dupl, only transfer the best match if args.no_duplication: @@ -110,17 +104,13 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path all_match=feature.segments_list_target best_match=all_match[0] for candidate_match in all_match: - candidate_cov=candidate_match[3][0] - candidate_id=candidate_match[3][1] - best_cov=best_match[3][0] - best_id=best_match[3][1] - if (candidate_cov+candidate_id)>(best_cov+best_id): + if (candidate_match.cov+candidate_match.id)>(best_match.cov+best_match.id): best_match=candidate_match if match!=best_match: # if current match is not best match, dont transfer - match.append(False) + match.pass_filter=False continue - match.append(True) + match.pass_filter=True if args.variation: get_segments_sequence(segments_file,segments_list,seg_info) @@ -144,7 +134,7 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path feature=Features[feat] if feature.parent=="": # usually a gene for match in feature.segments_list_target: # for all occurences of the gene - if match[4]==True: + if match.pass_filter==True: print_alignment(feat,match,seg_info,file_out_aln) @@ -161,10 +151,10 @@ def transfer_on_target(segments_file,genome_dir,target_genome,target_genome_path #Â for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome list_seg=Features[feat].segments_list_source - [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(Features[feat].segments_list_target[0]) + match=Features[feat].segments_list_target[0] for feat in list_feature_to_transfer: - stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat,walk,segments_on_target_genome) + stats_feature_missing_segment(feature_missing_segments,match.first_seg,match.last_seg,list_seg,feat,match.walk_name,segments_on_target_genome) if args.annotation: absent_features=reason_features_not_transfered[0];low_cov_id=reason_features_not_transfered[1] diff --git a/GrAnnoT/usefull_little_functions.py b/GrAnnoT/usefull_little_functions.py index 91c33dc..7f08dbf 100644 --- a/GrAnnoT/usefull_little_functions.py +++ b/GrAnnoT/usefull_little_functions.py @@ -38,18 +38,6 @@ def get_feature_stop_on_target_genome_inv(stop_seg,feat_id,walk,copy_id,segments return seg_end_pos-feat_stop_pos+1 -# gives the first and last segments of a walk (list) -def get_featurePath_ends(walk_info): # walk_info = [walk_name,copy_id,[list_seg]] - walk_name=walk_info[0] - if walk_name!='': - seg_list=walk_info[2] - copy_id=walk_info[1] - [first_seg,last_seg]=[seg_list[0],seg_list[-1]] - else: - [first_seg,last_seg,walk_name,copy_id,seg_list]=['','','','',[]] - return [first_seg,last_seg,walk_name,copy_id,seg_list] - - # invert the given strand def invert_strand(strand): match strand: diff --git a/GrAnnoT/variation_details.py b/GrAnnoT/variation_details.py index e470991..c87fe3c 100644 --- a/GrAnnoT/variation_details.py +++ b/GrAnnoT/variation_details.py @@ -7,13 +7,12 @@ from .detect_inversion import detect_feature_inversion # handle the variations analysis between the feature on the source genome and the feature on the target genome def generate_variations_details(feature_id,seg_info,match,file_out_var,segments_on_target_genome): #Â for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome - [first_seg,last_seg,walk,copy_id,feature_target_path]=get_featurePath_ends(match) # do it for the gene, then for its childs. - if match[4]==True: # gene passed the filters - inversion=detect_feature_inversion(Features[feature_id].segments_list_source,feature_target_path) - print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome) - child_variation_details(feature_id,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome) + if match.pass_filter==True: # gene passed the filters + inversion=detect_feature_inversion(Features[feature_id].segments_list_source,match.path) + print_variations(feature_id,match.first_seg,match.last_seg,match.copy_id,match.walk_name,seg_info,match.path,inversion,file_out_var,segments_on_target_genome) + child_variation_details(feature_id,match.copy_id,match.walk_name,seg_info,match.path,inversion,file_out_var,segments_on_target_genome) def child_variation_details(feature_id,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome): # print child feature var @@ -60,8 +59,8 @@ def get_child_last_seg(child_segments_list_source,feature_target_path): def find_child_target_path(child_segments_list_target,walk,copy_id): for match in child_segments_list_target: # look for the right occurence of the child feature - if (match[0]==walk) and (match[1]==copy_id): - return match[2] + if (match.walk_name==walk) and (match.copy_id==copy_id): + return match.path def print_variations(feature_id,first_seg,last_seg,copy_id,walk,seg_info,feature_target_path,inversion,file_out_var,segments_on_target_genome): -- GitLab