Skip to content
Snippets Groups Projects
Commit da5b6f63 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

removed functions to get the 'detailed' gff for the target genome. simplified the main

parent 70b6f4be
No related branches found
No related tags found
No related merge requests found
......@@ -19,56 +19,13 @@ def get_feature_stop_on_target_genome(stop_seg,feat_id):
# functions to get the detailed gff with all the fragments of the features
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position_on_feature(start_seg,stop_seg,feature_start_segment,feature):
start_on_feature=get_segment_start_on_feature(feature_start_segment,start_seg,feature)
stop_on_feature=get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature)
position=f';position={start_on_feature}-{stop_on_feature}'
return position
def get_segment_start_on_feature(feature_start_segment,start_seg,feature):
feature_start_pos=Segments[feature_start_segment].start+get_feature_start_on_segment(feature_start_segment,feature.id)-1
start_on_reference=Segments[start_seg].start+get_feature_start_on_segment(start_seg,feature.id)-1
return start_on_reference-feature_start_pos+1
def get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature):
start_on_target_genome=get_feature_start_on_target_genome(start_seg,feature.id)
stop_on_target_genome=get_feature_stop_on_target_genome(stop_seg,feature.id)
# length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
return start_on_feature+(stop_on_target_genome-start_on_target_genome) # stop position : start+length-1. length=stop-start+1
# get the proportion of a part of the feature on the total length
def get_proportion_of_feature_part(start_seg,stop_seg,feature):
start_on_new_genome=get_feature_start_on_target_genome(start_seg,feature.id)
stop_on_new_genome=get_feature_stop_on_target_genome(stop_seg,feature.id)
# length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
proportion=f';proportion={stop_on_new_genome-start_on_new_genome+1}/{feature.size}'
return proportion
# returns the gff line to write in the output file for the function gff_detail
def create_line_detail(last_seg,feature_id,start_seg,feat_start):
[stop_seg,feature,chr,strand]=[last_seg[1:],Features[feature_id],segments_on_target_genome[start_seg][0],Features[feature_id].strand]
# start and stop position of the feature on the genome we transfer on
start_on_new_genome=get_feature_start_on_target_genome(start_seg,feature_id)
stop_on_new_genome=get_feature_stop_on_target_genome(stop_seg,feature_id)
proportion=get_proportion_of_feature_part(start_seg,stop_seg,feature)
position=get_position_on_feature(start_seg,stop_seg,feat_start,feature)
annotation=feature.annot+proportion+position
output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start_on_new_genome}\t{stop_on_new_genome}\t.\t{strand}\t.\t{annotation}\n'
return output_line
# functions to get the gff with one line per feature
def right_size(size,max_diff,feat):
if max_diff==0:
return True
return not ((size>Features[feat].size*max_diff) | (size<Features[feat].size/max_diff))
def create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg):
def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,list_seg):
[chr,strand,feature]=[segments_on_target_genome[first_seg][0],Features[feature_id].strand,Features[feature_id]]
variant_count=0
......
......@@ -5,43 +5,16 @@ from Functions import *
### functions to generate a genome's gff from the graph's gff
# functions to get the detailed gff with all the fragments of the features
# outputs each fragment of the feature in a gff
def gff_detail(list_seg,feature_id):
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc.
# at the end of the loop, prints the last part of the fragment.
[feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the part of the feature, for loop below
for segment in list_seg:
if segment[1:] in segments_on_target_genome:
if feat_start=="":
feat_start=segment[1:]
if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=segment[1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in target genome)
else:
if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
seg_start=""
#else: if the current segment is not in target genome but there is no start, keep looking for a start
last_seg=segment
if last_seg[1:] in segments_on_target_genome:
out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start)
write_line(out_line,output_detail_gff,False)
# functions to get the gff with one line per feature
# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff):
if (first_seg!=''): # feature present on the target genome
size_on_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id)-get_feature_start_on_target_genome(first_seg,feature_id)+1
size_diff=abs(size_on_new_genome-Features[feature_id].size)
if right_size(size_on_new_genome,max_diff,feature_id):
line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg)
write_line(line,output_gff_one,False)
line=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,list_seg)
write_line(line,output_target_gff,False)
return size_diff
else:
return "bad_size"
......@@ -51,24 +24,20 @@ def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
# writes the gff of target genome using the gff of the graph
def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,max_diff):
def transfer_on_target(pos_seg, gfa, out_once, out_var,target_genome_name,max_diff):
print("generation of the genome's gff ")
# create variables and open files
[once,detail,var,stats]=[True,False,True,False]
[once,var,stats]=[True,True,False]
if var:
[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
file_out_var = open(out_var, 'w')
global output_variations
output_variations=[0,"",file_out_var]
if detail:
file_out_detail = open(out_detail, 'w')
global output_detail_gff
output_detail_gff=[0,"",file_out_detail]
if once:
file_out=open(out_once,'w')
global output_gff_one
output_gff_one=[0,"",file_out]
global output_target_gff
output_target_gff=[0,"",file_out]
bad_size_features=0
absent_features=0
diff_size_transfered_features=[0,0] # [count,sum], to get the average
......@@ -94,7 +63,7 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma
# outputs the feature once in a gff, from the first to the last segment present on the new genome
if once:
transfer_stat=gff_one(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
if transfer_stat=="bad_size":
bad_size_features+=1
elif transfer_stat=="absent":
......@@ -102,12 +71,7 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma
else:
diff_size_transfered_features[0]+=1
diff_size_transfered_features[1]+=transfer_stat
write_line("",output_gff_one,True)
# outputs each fragment of the feature in a gff
if detail:
gff_detail(list_seg,feat) # insertions !
write_line("",output_detail_gff,True)
write_line("",output_target_gff,True)
# outputs the detail of variations of the feature :
if var:
......@@ -118,8 +82,6 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma
stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat)
# close output_files
if detail:
file_out_detail.close()
if once:
file_out.close()
if var:
......
......@@ -7,101 +7,35 @@ from Graph_gff import *
from Functions_output import *
#from inference import *
run="command_line"
if run=="command_line":
import sys
if not(len(sys.argv)>=4) :# intersect, gfa, pos_seg (target_genome_name)
print("expected input : intersect, gfa file with walks, bed file with positions of the segments on the target genome")
#print("output : graph gff, graph gaf, target genome gff*2+variations")
sys.exit(1)
elif (sys.argv[1]=="-h") :
import sys
if not(len(sys.argv)>=4) :# intersect, gfa, pos_seg (target_genome_name)
print("expected input : intersect, gfa file with walks, bed file with positions of the segments on the target genome")
print("output : graph gff, graph gaf, target genome gff*2+variations")
sys.exit(1)
intersect=sys.argv[1]
gfa=sys.argv[2]
pos_seg=sys.argv[3]
out_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
out_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
out_once=pos_seg.split("/")[-1].split(".")[0:-1][0]+".gff"
out_detail=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_detail.gff"
out_var=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_variations.txt"
if len(sys.argv)==5:
target_genome_name=sys.argv[4]
else:
target_genome_name=pos_seg.split("/")[-1].split(".")[0:-1][0]
load_intersect(intersect)
# outputs the gff and gaf of the graph for chr10
graph_gff(out_gff)
graph_gaf(out_gaf,gfa)
# outputs the gff of a genome for the chr10
max_diff=2 # maximum size difference (n times bigger or smaller) between the gene on the source genome and the gene on the target genome for the gene to be transfered.
genome_gff(pos_seg,gfa,out_once,out_detail,out_var,target_genome_name,max_diff)
if run=="test":
intersect_path='intersect.bed'
load_intersect(intersect_path)
# outputs the gff of the graph for chr10
output_gff='graph.gff'
output_gaf='graph.gaf'
gfa="graph.gfa"
graph_gff(output_gff)
graph_gaf(output_gaf,gfa)
out_once="target_genome.gff"
out_var="variations.txt"
out_detail="target_genome_detail.gff"
pos_seg="genome2_chr10.bed"
gff=output_gff
target_genome_name="genome2_chr10"
# outputs the gff of a genome for the chr10
genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
if run=="reel":
# creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP
intersect_path='/home/nina/grannot/wd/align_genes/intersect_segments-genes_chr10.bed'
load_intersect(intersect_path)
# outputs the gff of the graph for chr10
output_gff='graph_chr10.gff'
output_gaf='graph_chr10.gaf'
gfa="data/graphs/RiceGraphChr10_cactus.gfa"
graph_gff(output_gff)
graph_gaf(output_gaf,gfa)
genome = 'ac'
if genome=='ac': # transfer from graph to azucena
pos_seg="seg_coord/AzucenaRS1_chromosome10.bed"
out_once="azucena_chr10.gff"
out_detail="azucena_detail_chr10.gff"
out_var="variations_chr10.gff"
if genome=='nb': # transfer from graph to nipponbare
pos_seg="seg_coord/IRGSP-1_0_Chr10.bed"
out_once="nb_chr10_all.gff"
out_detail="nb_chr10_all_detail.gff"
out_var="variations_chr10.gff"
gff=output_gff
target_genome_name="CM020642.1_Azucena_chromosome10"
# outputs the gff of a genome for the chr10
genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
elif (sys.argv[1]=="-h") :
print("expected input : intersect, gfa file with walks, bed file with positions of the segments on the target genome")
print("output : graph gff, graph gaf, target genome gff+variations")
sys.exit(1)
intersect=sys.argv[1]
gfa=sys.argv[2]
pos_seg=sys.argv[3]
out_graph_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
out_graph_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
out_target_gff=pos_seg.split("/")[-1].split(".")[0:-1][0]+".gff"
out_target_var=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_variations.txt"
if len(sys.argv)==5:
target_genome_name=sys.argv[4]
else:
target_genome_name=pos_seg.split("/")[-1].split(".")[0:-1][0]
load_intersect(intersect)
# outputs the gff and gaf of the graph for chr10
graph_gff(out_graph_gff)
graph_gaf(out_graph_gaf,gfa)
# outputs the gff of a genome for the chr10
max_diff=2 # maximum size difference (n times bigger or smaller) between the gene on the source genome and the gene on the target genome for the gene to be transfered.
transfer_on_target(pos_seg,gfa,out_target_gff,out_target_var,target_genome_name,max_diff)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment