from .usefull_little_functions import get_seg_occ from .intersect import invert_seg, Features from .detect_inversion import detect_feature_inversion,invert_segment_list from .detect_copies import find_gene_copies,get_first_last_seg # functions to output the stats on the transfer # stats about missing segments and feature type, not used, will change. def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk,segments_on_target_genome): # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] feature_missing_segments[5].append(feature_id) if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely feature_missing_segments[3].append(feature_id) elif first_seg != list_seg[0]: # the first segment is missing feature_missing_segments[0].append(feature_id) elif last_seg!=list_seg[-1]: # the last segment is missing feature_missing_segments[2].append(feature_id) # go through all the segments, check if some are missing in the middle of the feature elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element for segment in list_seg[1-(len(list_seg)-2)]: if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]): feature_missing_segments[1].append(feature_id) break # go through the segments, to see if one is missing anywhere on the feature for segment in list_seg: if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]): if feature_id not in feature_missing_segments[4]: feature_missing_segments[4].append(feature_id) break # if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !! if feature_id not in feature_missing_segments[4]: feature_missing_segments[6].append(feature_id) def get_annot_features(list_features): list_annot_features=[] for feature in list_features: list_annot_features.append(Features[feature].note) return list_annot_features def count_hypput_total(list_annot_first): total=len(list_annot_first) count_hypput=0 for annot in list_annot_first: if ("hypothetical" in annot) or ("putative" in annot): count_hypput+=1 return [count_hypput,total] # print stats on the transfer : number of feature that have segments in different positions missing. def stats_features(feature_missing_segments): # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] list_annot_first=get_annot_features(feature_missing_segments[0]) [hyp_put,total]=count_hypput_total(list_annot_first) print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_middle=get_annot_features(feature_missing_segments[1]) [hyp_put,total]=count_hypput_total(list_annot_middle) print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_last=get_annot_features(feature_missing_segments[2]) [hyp_put,total]=count_hypput_total(list_annot_last) print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_all=get_annot_features(feature_missing_segments[3]) [hyp_put,total]=count_hypput_total(list_annot_all) print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_total=get_annot_features(feature_missing_segments[4]) [hyp_put,total]=count_hypput_total(list_annot_total) print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_ok=get_annot_features(feature_missing_segments[6]) [hyp_put,total]=count_hypput_total(list_annot_ok) print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_features=get_annot_features(feature_missing_segments[5]) [hyp_put,total]=count_hypput_total(list_annot_features) print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") # functions to generate the different gffs # add the paths of the feature on the target genome in the object Feature def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_genome): feature=Features[feature_id] list_seg=feature.segments_list_source list_first_last_segs=get_first_last_seg(list_seg,segments_on_target_genome) for match in list_first_last_segs: # for each walk that has the feature [first_seg,last_seg,walk_name]=match # get the first and last segments of all the copies on this walk first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id,segments_on_target_genome) copy_number=0 for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies copy_number+=1 copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome) feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature feature.segments_list_target.append(feature_path) # add the feat_copy to all the segments # get the pos start of the first and last segments (existing because done in find_gene_copies) # then look for segs that are between. walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id,segments_on_target_genome)[1] walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id,segments_on_target_genome)[2] feat_copy=(feature_id,copy_id) for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence if walk_start <= segment[1] <= walk_stop: segment.append(feat_copy) # annotate the segment : feature_id, copy_id break # add the paths of the gene's child features add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome) if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ? feature.segments_list_target.append(['','',[]]) def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome): childs_list=Features[feature_id].get_child_list(Features) for child_id in childs_list: child=Features[child_id] # find child path in parent's path [child_first_seg,child_last_seg]=['',''] for seg in child.segments_list_source: if seg in feature_target_path: # parent path child_first_seg=seg break elif invert_seg(seg) in feature_target_path: child_first_seg=invert_seg(seg) break if child_first_seg!='': # the child feature may be absent in this occurence of the parent for seg in reversed(child.segments_list_source): if seg in feature_target_path: # parent path child_last_seg=seg break elif invert_seg(seg) in feature_target_path: child_last_seg=invert_seg(seg) break # get the path of the child feature child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id,segments_on_target_genome) feat_copy=(child_id,copy_id) feature_path=[walk_name,copy_id] feature_path.append(child_path) child.segments_list_target.append(feature_path) for segment_id in child_path: # annotate the segments with info about the occurence of the child feature segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id,segments_on_target_genome) segment.append(feat_copy) # find the feature's path in target genome walk def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome): # look for first_seg and last_seg that has the right copy_id for this feature first_seg_index=get_seg_occ(first_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4] last_seg_index=get_seg_occ(last_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4] first_index=min(first_seg_index,last_seg_index) last_index=max(first_seg_index,last_seg_index) feature_path_target_genome=target_genome_path[first_index:last_index+1] return feature_path_target_genome # get the coverage and sequence id of a feature def get_id_cov(feature_id,seg_size,target_list): # seg_size has unoriented segments : s25 feature=Features[feature_id] source_list=feature.segments_list_source inversion=detect_feature_inversion(source_list,target_list) if inversion: target_list=invert_segment_list(target_list) [match,subs,inser,delet]=[0,0,0,0] [i,j]=[0,0] first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on last=False # same for the last part of the feature # for id and ins. while (i<len(source_list)) and (j<len(target_list)): if i==len(source_list)-1: last=True if source_list[i] != target_list[j]: # if there is a difference between the two paths if target_list[j] not in source_list: # if the segment in target genome is absent in source genome if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] i+=1;j+=1 else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] j+=1 elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] i+=1 else : # if both segments are present in the other genome but not at the same position. weird case never found yet add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] i+=1;j+=1 else: # segment present in both, no variation. add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] i+=1;j+=1 if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last) match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4] denom_cov=match+subs+delet denom_id=denom_cov+inser [cov,id]=[0,0] if denom_cov>0: cov=round((match+subs)/(match+subs+delet),3) if denom_id>0: id=round(match/(match+subs+inser+delet),3) return [cov,id] # computes the cov/id calculation for a segment pair def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last): [match,subs,inser,delet]=[0,0,0,0] match type: case "identity": if first: match+=seg_size[seg_a[1:]]-feature.pos_start+1 elif last: match+=feature.pos_stop else: match+=seg_size[seg_a[1:]] case "substitution": if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]: subs+=seg_size[seg_a[1:]] inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]] elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]: subs+=seg_size[seg_b[1:]] delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]] else: subs+=seg_size[seg_a[1:]] case "insertion": inser+=seg_size[seg_b[1:]] case "deletion": if first: delet+=seg_size[seg_a[1:]]-feature.pos_start+1 else: delet+=seg_size[seg_a[1:]] case "end_deletion": for seg in seg_a[:-1]: delet+=seg_size[seg[1:]] delet+=feature.pos_stop return [match,subs,inser,delet,False] # check the orientation of the segment later