Skip to content
Snippets Groups Projects
Commit 8fd17172 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

reduced memory usage by only storing the sequences of the segments in the genes

parent 2ce45f90
No related branches found
No related tags found
No related merge requests found
......@@ -33,7 +33,8 @@ class Feature:
self.annot=annot
self.childs=childs # liste of child features (exons, cds, etc)
self.parent=parent
self.segments_list=seg # list of oriented segments on which the feature is (>s1/<s1, depending on the path of the gene in the graph)
self.segments_list_source=seg # list of oriented segments on which the feature is (>s1/<s1, depending on the path of the gene in the graph)
self.segments_list_target=[]
self.note=""
......
......@@ -247,21 +247,29 @@ def get_segments_positions_on_genome(pos_seg):
segments_on_target_genome[seg]=[chrom,start,stop,strand,index]
seg_count+=1
def get_segments_sequence_and_paths(gfa,genomes_to_index):
def get_segments_sequence(gfa,segments_list):
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
seg_seq={}
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="S"): # get the sequence of the segment
if line[0]=="S": # get the sequence of the segment
seg_id='s'+line[1]
seg_seq[seg_id]=line[2]
if seg_id in segments_list:
seg_seq[seg_id]=line[2]
return seg_seq
def get_paths(gfa,target_genomes):
file_gfa=open(gfa,'r')
lines_gfa=file_gfa.readlines()
file_gfa.close()
paths={}
for line in lines_gfa:
line=line.split()
if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
if line[3] in genomes_to_index:
if line[3] in target_genomes:
path=line[6].replace(">",";>")
path=path.replace("<",";<").split(';')
......@@ -273,8 +281,7 @@ def get_segments_sequence_and_paths(gfa,genomes_to_index):
list_segments.append('<s'+segment[1:])
paths[line[3]]=list_segments
return [paths,seg_seq]
return paths
def get_first_seg(list_seg): # get the first segment of the list that is in the target genome
first_seg_found=''
......@@ -303,6 +310,16 @@ def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
same_strand_count+=1
return [seg_common,same_strand_count]
def add_target_genome_path(feature_id,target_genome_path):
feature=Features[feature_id]
list_seg=feature.segments_list_source
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
feature_path=[]
if first_seg!='':
feature_path=get_feature_path(target_genome_path,first_seg,last_seg)
feature.segments_list_target=feature_path
def get_feature_path(target_genome_path,first_seg,last_seg):
# find the path in target genome
first_seg_index=segments_on_target_genome[first_seg][4]
......@@ -386,8 +403,8 @@ def create_var(feature_id,first_seg,last_seg,target_genome_path):
size_diff=str(size_new_genome-feature.size)
# get feature paths on the original genome and on the target genome
feature_path_target_genome=get_feature_path(target_genome_path,first_seg,last_seg)
feature_path_source_genome=feature.segments_list
feature_path_target_genome=feature.segments_list_target
feature_path_source_genome=feature.segments_list_source
[feature_path_source_genome,feature_path_target_genome,inversion]=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
variation=Variation(feature_id,feature.type,feature.chr,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
......
......@@ -30,8 +30,14 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
get_segments_positions_on_genome(pos_seg)
list_feature_to_transfer= Features.keys()
if var:
paths=get_paths(gfa,target_genomes)
for genome in target_genomes:
print(f'generation of {genome} output')
segments_list={}
if var:
target_genome_path=paths[genome]
if once:
file_out=open(out_once,'w')
......@@ -43,11 +49,18 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
for feat in list_feature_to_transfer:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
list_seg=Features[feat].segments_list_source
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
if var: # append dict of segments for which we may need the sequence
add_target_genome_path(feat,target_genome_path)
for segment in list_seg:
segments_list[segment[1:]]=''
for segment in Features[feat].segments_list_target:
segments_list[segment[1:]]=''
transfer_stat=target_gff(first_seg,last_seg,feat,list_seg,max_diff) # insertions not considered !!!
if transfer_stat=="bad_size":
bad_size_features+=1
elif transfer_stat=="absent":
......@@ -60,8 +73,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
file_out.close()
if var:
[paths,seg_seq]=get_segments_sequence_and_paths(gfa,target_genomes)
target_genome_path=paths[genome]
seg_seq=get_segments_sequence(gfa,segments_list)
file_out_var = open(out_var, 'w')
global output_variations
output_variations=[0,"",file_out_var]
......@@ -74,7 +86,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
for feat in list_feature_to_transfer:
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
list_seg=Features[feat].segments_list_source
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
......@@ -97,7 +109,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
# total number of features, with missing segments or not - feature_total
# for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
list_seg=Features[feat].segments_list
list_seg=Features[feat].segments_list_source
first_seg=get_first_seg(list_seg)
last_seg=get_first_seg(reversed(list_seg))
......@@ -118,7 +130,7 @@ def transfer_on_target(pos_seg, gfa, out_once, out_var,out_clustal,target_genome
def print_variations(first_seg,last_seg,feat,target_genome_path,seg_seq):
if (first_seg!=''): # if the feature is not completly absent # add the else, output absent features
if first_seg!='': # if the feature is not completly absent # add the else, output absent features
[variation,feature_path_source_genome,feature_path_target_genome]=create_var(feat,first_seg,last_seg,target_genome_path) # removes the strands in the segment lists
feature=Features[feat]
feat_start=feature.start
......
......@@ -68,13 +68,13 @@ def add_child(feat,new_child):
# add a segment to an existing feature
def add_seg(feat_id,new_seg_oriented):
if new_seg_oriented not in Features[feat_id].segments_list:
Features[feat_id].segments_list.append(new_seg_oriented)
if new_seg_oriented not in Features[feat_id].segments_list_source:
Features[feat_id].segments_list_source.append(new_seg_oriented)
# add the position of the feature on its first and last segment
def add_pos(feature):
feature.pos_start=get_feature_start_on_segment(feature.segments_list[0][1:],feature.id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list[-1][1:],feature.id)
feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0][1:],feature.id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1][1:],feature.id)
def load_feature_intersect(line,feature_id,seg_oriented):
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
......@@ -167,15 +167,15 @@ def get_feature_stop_on_segment(seg_id,feat_id):
# generates the annotation for the gff of the graph, with the rank of the segment in the feature
def make_annot_graph(feature,segment_id):
if ">"+segment_id in feature.segments_list:
if ">"+segment_id in feature.segments_list_source:
segment_oriented=">"+segment_id
else:
segment_oriented="<"+segment_id
# what if the segment is present twice in the list, with the two orientation ??
# get the rank and the total number of ocurrences for the feature
rank=str(feature.segments_list.index(segment_oriented)+1)
total=str(len(feature.segments_list))
rank=str(feature.segments_list_source.index(segment_oriented)+1)
total=str(len(feature.segments_list_source))
# create the annotation with the rank information
annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
return annotation
......@@ -214,7 +214,7 @@ def graph_gaf(graph_gaf_file,gfa_file):
seg_len=get_segments_length(gfa_file)
for feature_id in Features:
feature=Features[feature_id]
feature_segments=feature.segments_list
feature_segments=feature.segments_list_source
feature_path=""
path_length=0
for segment in feature_segments:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment