removed functions to get the 'detailed' gff for the target genome. simplified the main

da5b6f63 · nina.marthe_ird.fr · 70b6f4be · da5b6f63 · da5b6f63 · da5b6f63
Commit da5b6f63 authored 1 year ago by nina.marthe_ird.fr
--- a/Functions.py
+++ b/Functions.py
@@ -19,56 +19,13 @@ def get_feature_stop_on_target_genome(stop_seg,feat_id):



-# functions to get the detailed gff with all the fragments of the features
-
-# get the position of a part of a feature on the complete feature (on the original genome)
-def get_position_on_feature(start_seg,stop_seg,feature_start_segment,feature):
-    start_on_feature=get_segment_start_on_feature(feature_start_segment,start_seg,feature)
-    stop_on_feature=get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature)
-    position=f';position={start_on_feature}-{stop_on_feature}'
-    return position
-
-def get_segment_start_on_feature(feature_start_segment,start_seg,feature):
-    feature_start_pos=Segments[feature_start_segment].start+get_feature_start_on_segment(feature_start_segment,feature.id)-1
-    start_on_reference=Segments[start_seg].start+get_feature_start_on_segment(start_seg,feature.id)-1
-    return start_on_reference-feature_start_pos+1
-
-def get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature):
-    start_on_target_genome=get_feature_start_on_target_genome(start_seg,feature.id)
-    stop_on_target_genome=get_feature_stop_on_target_genome(stop_seg,feature.id)
-    # length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
-    return start_on_feature+(stop_on_target_genome-start_on_target_genome) # stop position : start+length-1. length=stop-start+1
-
-# get the proportion of a part of the feature on the total length
-def get_proportion_of_feature_part(start_seg,stop_seg,feature):
-    start_on_new_genome=get_feature_start_on_target_genome(start_seg,feature.id)
-    stop_on_new_genome=get_feature_stop_on_target_genome(stop_seg,feature.id)
-    # length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature
-    proportion=f';proportion={stop_on_new_genome-start_on_new_genome+1}/{feature.size}'
-    return proportion
-
-# returns the gff line to write in the output file for the function gff_detail
-def create_line_detail(last_seg,feature_id,start_seg,feat_start):
-    [stop_seg,feature,chr,strand]=[last_seg[1:],Features[feature_id],segments_on_target_genome[start_seg][0],Features[feature_id].strand]
-    # start and stop position of the feature on the genome we transfer on
-    start_on_new_genome=get_feature_start_on_target_genome(start_seg,feature_id) 
-    stop_on_new_genome=get_feature_stop_on_target_genome(stop_seg,feature_id)
-    
-    proportion=get_proportion_of_feature_part(start_seg,stop_seg,feature)
-    position=get_position_on_feature(start_seg,stop_seg,feat_start,feature)
-    annotation=feature.annot+proportion+position
-    
-    output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start_on_new_genome}\t{stop_on_new_genome}\t.\t{strand}\t.\t{annotation}\n'
-    return output_line
-
-
 # functions to get the gff with one line per feature
 def right_size(size,max_diff,feat):
    if max_diff==0:
        return True
    return not ((size>Features[feat].size*max_diff) | (size<Features[feat].size/max_diff)) 

-def create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg):
+def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,list_seg):
    [chr,strand,feature]=[segments_on_target_genome[first_seg][0],Features[feature_id].strand,Features[feature_id]]

    variant_count=0

--- a/Functions_output.py
+++ b/Functions_output.py
@@ -5,43 +5,16 @@ from Functions import *
 ### functions to generate a genome's gff from the graph's gff


-# functions to get the detailed gff with all the fragments of the features
-# outputs each fragment of the feature in a gff
-def gff_detail(list_seg,feature_id):
-    # loop that goes through all the segments that have the current feature
-    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
-    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc.
-    # at the end of the loop, prints the last part of the fragment.
-    [feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the part of the feature, for loop below
-    for segment in list_seg:
-        if segment[1:] in segments_on_target_genome: 
-            if feat_start=="":
-                feat_start=segment[1:]
-            if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature
-                seg_start=segment[1:]
-            #else: if we already have a start, keep going though the segments until we find a stop (segment not in target genome)
-        else:
-            if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start
-                out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start)
-                write_line(out_line,output_detail_gff,False)
-                seg_start=""
-            #else: if the current segment is not in target genome but there is no start, keep looking for a start
-        last_seg=segment    
-    if last_seg[1:] in segments_on_target_genome:
-        out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start)
-        write_line(out_line,output_detail_gff,False)
-
-
 # functions to get the gff with one line per feature
 # outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
-def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
+def target_gff(first_seg,last_seg,feature_id,list_seg,max_diff):

    if (first_seg!=''): # feature present on the target genome
        size_on_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id)-get_feature_start_on_target_genome(first_seg,feature_id)+1
        size_diff=abs(size_on_new_genome-Features[feature_id].size)
        if right_size(size_on_new_genome,max_diff,feature_id): 
-            line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg)
-            write_line(line,output_gff_one,False)
+            line=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,list_seg)
+            write_line(line,output_target_gff,False)
            return size_diff
        else:
            return "bad_size"
@@ -51,24 +24,20 @@ def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):


 # writes the gff of target genome using the gff of the graph
-def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,max_diff):
+def transfer_on_target(pos_seg, gfa, out_once, out_var,target_genome_name,max_diff):
    print("generation of the genome's gff ")

    # create variables and open files
-    [once,detail,var,stats]=[True,False,True,False]
+    [once,var,stats]=[True,True,False]
    if var:
        [paths,seg_seq]=get_segments_sequence_and_paths(gfa)
        file_out_var = open(out_var, 'w')
        global output_variations
        output_variations=[0,"",file_out_var]
-    if detail:
-        file_out_detail = open(out_detail, 'w')
-        global output_detail_gff
-        output_detail_gff=[0,"",file_out_detail]
    if once:
        file_out=open(out_once,'w')
-        global output_gff_one
-        output_gff_one=[0,"",file_out]
+        global output_target_gff
+        output_target_gff=[0,"",file_out]
        bad_size_features=0
        absent_features=0
        diff_size_transfered_features=[0,0] # [count,sum], to get the average
@@ -94,7 +63,7 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma

        # outputs the feature once in a gff, from the first to the last segment present on the new genome
        if once:
-            transfer_stat=gff_one(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
+            transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
            if transfer_stat=="bad_size":
                bad_size_features+=1
            elif transfer_stat=="absent":
@@ -102,12 +71,7 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma
            else:
                diff_size_transfered_features[0]+=1
                diff_size_transfered_features[1]+=transfer_stat
-            write_line("",output_gff_one,True)
-
-        # outputs each fragment of the feature in a gff
-        if detail:
-            gff_detail(list_seg,feat) # insertions !
-            write_line("",output_detail_gff,True)
+            write_line("",output_target_gff,True)

        # outputs the detail of variations of the feature :
        if var:
@@ -118,8 +82,6 @@ def genome_gff(pos_seg, gfa, out_once, out_detail, out_var,target_genome_name,ma
            stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat)

    # close output_files
-    if detail:
-        file_out_detail.close()
    if once:
        file_out.close()
    if var:

--- a/main.py
+++ b/main.py
@@ -7,101 +7,35 @@ from Graph_gff import *
 from Functions_output import *
 #from inference import *

-run="command_line"

-if run=="command_line":
-
-    import sys
-    if not(len(sys.argv)>=4) :# intersect, gfa, pos_seg (target_genome_name)
-            print("expected input : intersect, gfa file  with walks, bed file with positions of the segments on the target genome")
-            #print("output : graph gff, graph gaf, target genome gff*2+variations")
-            sys.exit(1)
-    elif (sys.argv[1]=="-h") :
+import sys
+if not(len(sys.argv)>=4) :# intersect, gfa, pos_seg (target_genome_name)
        print("expected input : intersect, gfa file  with walks, bed file with positions of the segments on the target genome")
-        print("output : graph gff, graph gaf, target genome gff*2+variations")
        sys.exit(1)
-
-    intersect=sys.argv[1]
-    gfa=sys.argv[2]
-    pos_seg=sys.argv[3]
-
-    out_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
-    out_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
-    out_once=pos_seg.split("/")[-1].split(".")[0:-1][0]+".gff"
-    out_detail=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_detail.gff"
-    out_var=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_variations.txt"
-    if len(sys.argv)==5:
-        target_genome_name=sys.argv[4]
-    else:
-        target_genome_name=pos_seg.split("/")[-1].split(".")[0:-1][0]
-
-    load_intersect(intersect)
-
-    # outputs the gff and gaf of the graph for chr10
-    graph_gff(out_gff)
-    graph_gaf(out_gaf,gfa)
-
-    # outputs the gff of a genome for the chr10
-    max_diff=2 # maximum size difference (n times bigger or smaller) between the gene on the source genome and the gene on the target genome for the gene to be transfered.
-    genome_gff(pos_seg,gfa,out_once,out_detail,out_var,target_genome_name,max_diff)
-
-
-
-
-if run=="test":
-    intersect_path='intersect.bed'
-    load_intersect(intersect_path)
-
-    # outputs the gff of the graph for chr10
-    output_gff='graph.gff'
-    output_gaf='graph.gaf'
-    gfa="graph.gfa"
-    graph_gff(output_gff)
-    graph_gaf(output_gaf,gfa)
-
-    out_once="target_genome.gff"
-    out_var="variations.txt"
-    out_detail="target_genome_detail.gff"
-    pos_seg="genome2_chr10.bed"
-
-    gff=output_gff
-
-    target_genome_name="genome2_chr10"
-
-    # outputs the gff of a genome for the chr10
-    genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
-
-if run=="reel":
-
-    # creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP
-    intersect_path='/home/nina/grannot/wd/align_genes/intersect_segments-genes_chr10.bed'
-    load_intersect(intersect_path)
-
-    # outputs the gff of the graph for chr10
-    output_gff='graph_chr10.gff'
-    output_gaf='graph_chr10.gaf'
-    gfa="data/graphs/RiceGraphChr10_cactus.gfa"
-    graph_gff(output_gff)
-    graph_gaf(output_gaf,gfa)
-
-
-    genome = 'ac'
-    if genome=='ac': # transfer from graph to azucena
-        pos_seg="seg_coord/AzucenaRS1_chromosome10.bed"
-        out_once="azucena_chr10.gff"
-        out_detail="azucena_detail_chr10.gff"
-        out_var="variations_chr10.gff"
-
-    if genome=='nb': # transfer from graph to nipponbare
-        pos_seg="seg_coord/IRGSP-1_0_Chr10.bed"
-        out_once="nb_chr10_all.gff"
-        out_detail="nb_chr10_all_detail.gff"
-        out_var="variations_chr10.gff"
-
-    gff=output_gff
-    target_genome_name="CM020642.1_Azucena_chromosome10"
-
-    # outputs the gff of a genome for the chr10
-    genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
-
-
+elif (sys.argv[1]=="-h") :
+    print("expected input : intersect, gfa file  with walks, bed file with positions of the segments on the target genome")
+    print("output : graph gff, graph gaf, target genome gff+variations")
+    sys.exit(1)
+
+intersect=sys.argv[1]
+gfa=sys.argv[2]
+pos_seg=sys.argv[3]
+
+out_graph_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
+out_graph_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
+out_target_gff=pos_seg.split("/")[-1].split(".")[0:-1][0]+".gff"
+out_target_var=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_variations.txt"
+if len(sys.argv)==5:
+    target_genome_name=sys.argv[4]
+else:
+    target_genome_name=pos_seg.split("/")[-1].split(".")[0:-1][0]
+
+load_intersect(intersect)
+
+# outputs the gff and gaf of the graph for chr10
+graph_gff(out_graph_gff)
+graph_gaf(out_graph_gaf,gfa)
+
+# outputs the gff of a genome for the chr10
+max_diff=2 # maximum size difference (n times bigger or smaller) between the gene on the source genome and the gene on the target genome for the gene to be transfered.
+transfer_on_target(pos_seg,gfa,out_target_gff,out_target_var,target_genome_name,max_diff)
\ No newline at end of file