reduced memory usage by only storing the sequences of the segments in the genes

8fd17172 · nina.marthe_ird.fr · 2ce45f90 · 8fd17172 · 8fd17172 · 8fd17172
Commit 8fd17172 authored 1 year ago by nina.marthe_ird.fr
--- a/ClassSegFeat.py
+++ b/ClassSegFeat.py
@@ -33,7 +33,8 @@ class Feature:
        self.annot=annot
        self.childs=childs # liste of child features (exons, cds, etc)
        self.parent=parent
-        self.segments_list=seg # list of oriented segments on which the feature is (>s1/<s1, depending on the path of the gene in the graph)
+        self.segments_list_source=seg # list of oriented segments on which the feature is (>s1/<s1, depending on the path of the gene in the graph)
+        self.segments_list_target=[]

        self.note=""


--- a/Functions.py
+++ b/Functions.py
@@ -247,21 +247,29 @@ def get_segments_positions_on_genome(pos_seg):
        segments_on_target_genome[seg]=[chrom,start,stop,strand,index]
        seg_count+=1

-def get_segments_sequence_and_paths(gfa,genomes_to_index):
+def get_segments_sequence(gfa,segments_list):
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    file_gfa.close()
    seg_seq={}
-    paths={}
    for line in lines_gfa:
        line=line.split()
-        if (line[0]=="S"): # get the sequence of the segment
+        if line[0]=="S": # get the sequence of the segment
            seg_id='s'+line[1]
-            seg_seq[seg_id]=line[2]
+            if seg_id in segments_list:
+                seg_seq[seg_id]=line[2]
+    return seg_seq

+def get_paths(gfa,target_genomes):
+    file_gfa=open(gfa,'r')
+    lines_gfa=file_gfa.readlines()
+    file_gfa.close()
+    paths={}
+    for line in lines_gfa:
+        line=line.split()
        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome

-            if line[3] in genomes_to_index:
+            if line[3] in target_genomes:
            
                path=line[6].replace(">",";>")
                path=path.replace("<",";<").split(';')
@@ -273,8 +281,7 @@ def get_segments_sequence_and_paths(gfa,genomes_to_index):
                        list_segments.append('<s'+segment[1:])

                paths[line[3]]=list_segments
-
-    return [paths,seg_seq]
+    return paths

 def get_first_seg(list_seg): # get the first segment of the list that is in the target genome
    first_seg_found=''
@@ -303,6 +310,16 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
            same_strand_count+=1
    return [seg_common,same_strand_count]

+def add_target_genome_path(feature_id,target_genome_path):
+    feature=Features[feature_id]
+    list_seg=feature.segments_list_source
+    first_seg=get_first_seg(list_seg)
+    last_seg=get_first_seg(reversed(list_seg))
+    feature_path=[]
+    if first_seg!='':
+        feature_path=get_feature_path(target_genome_path,first_seg,last_seg)
+    feature.segments_list_target=feature_path
+
 def get_feature_path(target_genome_path,first_seg,last_seg):
    # find the path in target genome
    first_seg_index=segments_on_target_genome[first_seg][4]
@@ -386,8 +403,8 @@ def create_var(feature_id,first_seg,last_seg,target_genome_path):
    size_diff=str(size_new_genome-feature.size)

    # get feature paths on the original genome and on the target genome
-    feature_path_target_genome=get_feature_path(target_genome_path,first_seg,last_seg)
-    feature_path_source_genome=feature.segments_list
+    feature_path_target_genome=feature.segments_list_target
+    feature_path_source_genome=feature.segments_list_source
    [feature_path_source_genome,feature_path_target_genome,inversion]=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
    variation=Variation(feature_id,feature.type,feature.chr,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)


--- a/Functions_output.py
+++ b/Functions_output.py
@@ -30,8 +30,14 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
    get_segments_positions_on_genome(pos_seg)
    list_feature_to_transfer= Features.keys()

+    if var:
+        paths=get_paths(gfa,target_genomes)
+
    for genome in target_genomes:
        print(f'generation of {genome} output')
+        segments_list={}
+        if var:
+            target_genome_path=paths[genome]

        if once:
            file_out=open(out_once,'w')
@@ -43,11 +49,18 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome

            for feat in list_feature_to_transfer:
                # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
-                list_seg=Features[feat].segments_list
+                list_seg=Features[feat].segments_list_source
                first_seg=get_first_seg(list_seg)
                last_seg=get_first_seg(reversed(list_seg))

-                transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
+                if var: # append dict of segments for which we may need the sequence
+                    add_target_genome_path(feat,target_genome_path)
+                    for segment in list_seg:
+                        segments_list[segment[1:]]=''
+                    for segment in Features[feat].segments_list_target:
+                        segments_list[segment[1:]]=''
+
+                transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertions not considered !!!
                if transfer_stat=="bad_size":
                    bad_size_features+=1
                elif transfer_stat=="absent":
@@ -60,8 +73,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
            file_out.close()

        if var:
-            [paths,seg_seq]=get_segments_sequence_and_paths(gfa,target_genomes)
-            target_genome_path=paths[genome]
+            seg_seq=get_segments_sequence(gfa,segments_list)
            file_out_var = open(out_var, 'w')
            global output_variations
            output_variations=[0,"",file_out_var]
@@ -74,7 +86,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome

            for feat in list_feature_to_transfer:
                # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
-                list_seg=Features[feat].segments_list
+                list_seg=Features[feat].segments_list_source
                first_seg=get_first_seg(list_seg)
                last_seg=get_first_seg(reversed(list_seg))

@@ -97,7 +109,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
            # total number of features, with missing segments or not - feature_total

            # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
-            list_seg=Features[feat].segments_list
+            list_seg=Features[feat].segments_list_source
            first_seg=get_first_seg(list_seg)
            last_seg=get_first_seg(reversed(list_seg))

@@ -118,7 +130,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome

 def print_variations(first_seg,last_seg,feat,target_genome_path,seg_seq):

-    if (first_seg!=''): # if the feature is not completly absent        # add the else, output absent features
+    if first_seg!='': # if the feature is not completly absent        # add the else, output absent features
        [variation,feature_path_source_genome,feature_path_target_genome]=create_var(feat,first_seg,last_seg,target_genome_path) # removes the strands in the segment lists
        feature=Features[feat]
        feat_start=feature.start

--- a/Graph_gff.py
+++ b/Graph_gff.py
@@ -68,13 +68,13 @@ def add_child(feat,new_child):

 # add a segment to an existing feature
 def add_seg(feat_id,new_seg_oriented):
-    if new_seg_oriented not in Features[feat_id].segments_list:
-        Features[feat_id].segments_list.append(new_seg_oriented)
+    if new_seg_oriented not in Features[feat_id].segments_list_source:
+        Features[feat_id].segments_list_source.append(new_seg_oriented)

 # add the position of the feature on its first and last segment
 def add_pos(feature):
-    feature.pos_start=get_feature_start_on_segment(feature.segments_list[0][1:],feature.id)
-    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list[-1][1:],feature.id)
+    feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0][1:],feature.id)
+    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1][1:],feature.id)

 def load_feature_intersect(line,feature_id,seg_oriented):
    if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
@@ -167,15 +167,15 @@ def get_feature_stop_on_segment(seg_id,feat_id):
 # generates the annotation for the gff of the graph, with the rank of the segment in the feature
 def make_annot_graph(feature,segment_id):

-    if ">"+segment_id in feature.segments_list:
+    if ">"+segment_id in feature.segments_list_source:
        segment_oriented=">"+segment_id
    else:
        segment_oriented="<"+segment_id
        # what if the segment is present twice in the list, with the two orientation ??

    # get the rank and the total number of ocurrences for the feature
-    rank=str(feature.segments_list.index(segment_oriented)+1)
-    total=str(len(feature.segments_list))
+    rank=str(feature.segments_list_source.index(segment_oriented)+1)
+    total=str(len(feature.segments_list_source))
    # create the annotation with the rank information
    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
    return annotation
@@ -214,7 +214,7 @@ def graph_gaf(graph_gaf_file,gfa_file):
    seg_len=get_segments_length(gfa_file)
    for feature_id in Features:
        feature=Features[feature_id]
-        feature_segments=feature.segments_list
+        feature_segments=feature.segments_list_source
        feature_path=""
        path_length=0
        for segment in feature_segments: