Skip to content
Snippets Groups Projects
Commit fd4eb1fb authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

converted segments_on_target_genome to use a new class SegmentTarget;...

converted segments_on_target_genome to use a new class SegmentTarget; clarifies how the dictionnary is used
parent fbb944d9
No related branches found
No related tags found
No related merge requests found
......@@ -17,13 +17,13 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
if (segment not in segments_on_target_genome) or (not segments_on_target_genome[segment].find_filename(walk)):
feature_missing_segments[1].append(feature_id)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
if (segment not in segments_on_target_genome) or (not segments_on_target_genome[segment].find_filename(walk)):
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[4].append(feature_id)
break
......
......@@ -27,12 +27,12 @@ def find_all_seg(list_seg_source,walk_name,segments_on_target_genome):
list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
seg_inverted=invert_seg(seg)
# look for all the segment copies in the target genome walk, in both orientations
if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
if (seg in segments_on_target_genome) and (segments_on_target_genome[seg].find_filename(walk_name)): # if the segment is in the target genome on the right walk (chr,ctg)
for copy in segments_on_target_genome[seg].get_all_seg(walk_name): # take all its copies
seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
list_seg_target.append(seg_info)
if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
for copy in segments_on_target_genome[seg_inverted][walk_name]:
if (seg_inverted in segments_on_target_genome) and (segments_on_target_genome[seg_inverted].find_filename(walk_name)) : # same but with the segment inverted
for copy in segments_on_target_genome[seg_inverted].get_all_seg(walk_name):
seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
list_seg_target.append(seg_info)
index+=1
......@@ -53,7 +53,7 @@ def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_i
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
for segment in segments_on_target_genome[old_seg_id].get_all_seg(walk_name): # look for the first seg to add the occurence info
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
......@@ -104,7 +104,7 @@ def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_i
def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy,segments_on_target_genome):
for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
for segment in segments_on_target_genome[target_seg_id].get_all_seg(walk_name): # look for the right occurence of the segment
if segment[1]==target_start:
segment.append(feat_copy) # add seg info
break
......@@ -146,7 +146,7 @@ def get_walks_feature_cross(list_seg,segments_on_target_genome):
for segment in list_seg:
seg_found=search_seg_on_target_genome(segment,segments_on_target_genome)
if seg_found: # if the segment or the reverse complement is on the target genome
for walk in segments_on_target_genome[seg_found]:
for walk in segments_on_target_genome[seg_found].get_all_filenames():
if walk not in list_walks:
list_walks.append(walk)
return list_walks
......@@ -156,10 +156,10 @@ def get_walks_feature_cross(list_seg,segments_on_target_genome):
def search_seg_on_walk(segment,walk,segments_on_target_genome): # for now just print the first found, look for several later...
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:
if walk in segments_on_target_genome[segment]:
if segments_on_target_genome[segment].find_filename(walk):
return segment
elif inverted_segment in segments_on_target_genome:
if walk in segments_on_target_genome[inverted_segment]:
if segments_on_target_genome[inverted_segment].find_filename(walk):
return inverted_segment
else:
return False
......
......@@ -31,7 +31,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge
walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id,segments_on_target_genome)[2]
feat_copy=(feature_id,copy_id)
for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
for segment in segments_on_target_genome[seg].get_all_seg(walk_name): # find the right occurence
if walk_start <= segment[1] <= walk_stop:
segment.append(feat_copy) # annotate the segment : feature_id, copy_id
break
......
......@@ -82,6 +82,66 @@ def get_paths(walks_file,target_genome,haplotype):
paths[seq_name]=list_segments
return paths
class SegmentTarget:
def __init__(self):
self.filename_list=[]
self.info_list=[]
def add_occurence(self,filename,info):
filename_info=self.find_filename_info(filename)
if filename_info is None: # if filename not yet encountered
pair=[filename,len(self.info_list)]
self.filename_list.append(pair)
self.info_list.append(info)
else:
filename_info.append(len(self.info_list))
self.info_list.append(info)
def get_all_filenames(self):
filenames=[]
for elem in self.filename_list:
filenames.append(elem[0])
return filenames
def find_filename_info(self,filename):
for elem in self.filename_list:
if elem[0]==filename:
return elem
def find_filename(self,filename):
for elem in self.filename_list:
if elem[0]==filename:
return True
return False
def get_all_seg(self,filename): # rename get_all_occ
filename_info=self.find_filename_info(filename)
list_seg=[]
if filename_info is not None: # if this segment is in the file
for index in filename_info[1:]:
list_seg.append(self.info_list[index])
return list_seg
# # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
# def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
# with open(pos_seg,'r') as bed:
# seg_count=0
# file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
# for line in bed:
# line=line.split()
# [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
# # check if segment present twice on the same walk ???
# #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
# if seg not in segments_on_target_genome:
# segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
# if file_name not in segments_on_target_genome[seg]:
# segments_on_target_genome[seg][file_name]=list()
# segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
# seg_count+=1
# appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
with open(pos_seg,'r') as bed:
......@@ -93,9 +153,7 @@ def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add t
# check if segment present twice on the same walk ???
#segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
if seg not in segments_on_target_genome:
segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
if file_name not in segments_on_target_genome[seg]:
segments_on_target_genome[seg][file_name]=list()
segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
seg_count+=1
segments_on_target_genome[seg]=SegmentTarget() # dict of walks->segment_info; you get the info about the segment for each walk
segments_on_target_genome[seg].add_occurence(file_name,[chrom,start,stop,strand,index])
seg_count+=1
\ No newline at end of file
......@@ -2,13 +2,13 @@ from .intersect import Features,get_feature_start_on_segment,get_feature_stop_on
# find the occurence of the segment that corresponds to the feature's copy
def get_seg_occ(seg,walk,feat,copy,segments_on_target_genome):
seg_in_walk=segments_on_target_genome[seg][walk]
seg_in_walk=segments_on_target_genome[seg].get_all_seg(walk)
for seg_occurence in seg_in_walk:
for feat_copy in seg_occurence[5:]:
if (feat_copy[0]==feat) & (feat_copy[1]==copy):
return seg_occurence # [chr,start,stop,strand,index,copies]
print("no occ found ???",seg,walk,feat,copy)
print(segments_on_target_genome[seg][walk])
print(segments_on_target_genome[seg].get_all_filenames())
print(Features[feat].segments_list_source)
print(Features[feat].segments_list_target)
exit()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment