Skip to content
Snippets Groups Projects
Commit 8e84c51c authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

modified the condition for the transfer. now the filters coverage and id only...

modified the condition for the transfer. now the filters coverage and id only apply to genes. if the gene passes the filters, all its child features are transfered. if the gene doesnt pass the filters, none of its child features are transfered
parent bf488d6c
No related branches found
No related tags found
No related merge requests found
from Graph_gff import Segments, Features, write_line
from Graph_gff import Segments, Features, write_line, find_parent
from Functions import *
......@@ -7,26 +7,33 @@ from Functions import *
# functions to get the gff with one line per feature
# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def generate_target_gff(first_seg,last_seg,feature_id,max_diff,inversion,walk,seg_size,args):
def generate_target_gff(feature_id,max_diff,seg_size,args,reason_features_not_transfered):
feature=Features[feature_id]
# get list of the segments where it is and the first and last segment of the feature on the new genome
feature_target_path=feature.segments_list_target
[first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
inversion=detect_feature_inversion(feature.segments_list_source,feature_target_path)
if first_seg!='': # feature present on the target genome
if inversion:
size_on_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk)-get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk)+1
else:
size_on_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk)-get_feature_start_on_target_genome(first_seg,feature_id,walk)+1
size_diff=abs(size_on_new_genome-Features[feature_id].size)
size_diff=abs(size_on_new_genome-feature.size)
[cov,id]=get_id_cov(feature_id,seg_size)
if (cov*100<args.coverage) or (id*100<args.identity):
return "cov_id"
if right_size(size_on_new_genome,max_diff,feature_id):
if ((cov*100<args.coverage) or (id*100<args.identity)) and (feature.type=="gene"):
reason_features_not_transfered[2]+=1
return "no"
if right_size(size_on_new_genome,max_diff,feature_id) or (feature.type!="gene"):
line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id)
write_line(line_gff,output_target_gff,False)
return size_diff
else:
return "bad_size"
reason_features_not_transfered[0]+=1
return "no"
else :
return "absent"
# add cov or id too low to reason to not transfer !!
reason_features_not_transfered[1]+=1
return "no"
......@@ -46,33 +53,42 @@ def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_gen
file_out=open(out_target_gff,'w')
global output_target_gff
output_target_gff=[0,"",file_out]
bad_size_features=0
absent_features=0
cov_id_bad=0
reason_features_not_transfered=[0,0,0] # bad_size, absent_features, low_cov_id
diff_size_transfered_features=[0,0] # [count,sum], to get the average
genes_done={} # key=gene_id, value=true if done and passed, and value=false if done but not passed. if not passed, dont transfer child features
for feat in list_feature_to_transfer:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
feature_target_path=Features[feat].segments_list_target
[first_seg,last_seg,walk]=get_featurePath_ends(feature_target_path)
inversion=detect_feature_inversion(Features[feat].segments_list_source,feature_target_path)
transfer_stat=generate_target_gff(first_seg,last_seg,feat,args.max_difference,inversion,walk,seg_size,args) # insertions not considered !!!
if transfer_stat=="bad_size":
bad_size_features+=1
if Features[feat].type=='gene':
list_feat_absent.append(feat)
elif transfer_stat=="absent":
absent_features+=1
if Features[feat].type=='gene':
list_feat_absent.append(feat)
elif transfer_stat=="cov_id":
cov_id_bad+=1
if Features[feat].type=='gene':
list_feat_absent.append(feat)
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
transfer=True
parent=find_parent(feat)
# do the parent gene then the child feature. dont do a child feature if the gene didnt pass.
if parent not in genes_done: # gene parent never seen, transfer it
if Features[feat].type!="gene":
transfer_stat=generate_target_gff(parent,args.max_difference,seg_size,args,reason_features_not_transfered)
if transfer_stat=="no":
genes_done[parent]=False
transfer=False
list_feat_absent.append(feat)
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
genes_done[parent]=True
else: # gene parent already transfered
if not genes_done[parent]: # the gene parent didnt pass the filters
transfer=False
elif Features[feat].type=="gene": # the current feature is the gene and has alrady been transfered
transfer=False
if transfer==True:
transfer_stat=generate_target_gff(feat,args.max_difference,seg_size,args,reason_features_not_transfered) # insertions not considered !!!
if transfer_stat=="no":
if Features[feat].type=='gene':
genes_done[feat]=False
list_feat_absent.append(feat)
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
genes_done[feat]=True
write_line("",output_target_gff,True)
file_out.close()
......@@ -144,10 +160,11 @@ def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_gen
stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat,walk)
if args.annotation:
print(len(Features)-(bad_size_features+absent_features+cov_id_bad),"out of",len(Features),"features are transfered.")
bad_size_features=reason_features_not_transfered[0];absent_features=reason_features_not_transfered[1];low_cov_id=reason_features_not_transfered[2]
print(len(Features)-(bad_size_features+absent_features+low_cov_id),"out of",len(Features),"features are transfered.")
print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
print(cov_id_bad,"out of",len(Features),"features are not transfered because their coverage or sequence identity is below threshold.")
print(low_cov_id,"out of",len(Features),"features are not transfered because their coverage or sequence identity is below threshold.")
print("average length difference of the transfered genes : ",diff_size_transfered_features[1]/diff_size_transfered_features[0])
stats_features(feature_missing_segments)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment