converted segments_on_target_genome to use a new class SegmentTarget;...

converted segments_on_target_genome to use a new class SegmentTarget; clarifies how the dictionnary is used

converted segments_on_target_genome to use a new class SegmentTarget;...
converted segments_on_target_genome to use a new class SegmentTarget; clarifies how the dictionnary is used
fd4eb1fb · nina.marthe_ird.fr · fbb944d9 · fd4eb1fb · fd4eb1fb · fd4eb1fb
Commit fd4eb1fb authored 4 days ago by nina.marthe_ird.fr
--- a/GrAnnoT/compute_transfer_stat.py
+++ b/GrAnnoT/compute_transfer_stat.py
@@ -17,13 +17,13 @@ def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,li
    # go through all the segments, check if some are missing in the middle of the feature
    elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
        for segment in list_seg[1-(len(list_seg)-2)]:
-            if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
+            if (segment not in segments_on_target_genome) or (not segments_on_target_genome[segment].find_filename(walk)):
                feature_missing_segments[1].append(feature_id)
                break

    # go through the segments, to see if one is missing anywhere on the feature
    for segment in list_seg:
-        if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
+        if (segment not in segments_on_target_genome) or (not segments_on_target_genome[segment].find_filename(walk)):
            if feature_id not in feature_missing_segments[4]:
                feature_missing_segments[4].append(feature_id)
                break

--- a/GrAnnoT/detect_copies.py
+++ b/GrAnnoT/detect_copies.py
@@ -27,12 +27,12 @@ def find_all_seg(list_seg_source,walk_name,segments_on_target_genome):
        list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
        seg_inverted=invert_seg(seg)
        # look for all the segment copies in the target genome walk, in both orientations
-        if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
-            for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
+        if (seg in segments_on_target_genome) and (segments_on_target_genome[seg].find_filename(walk_name)): # if the segment is in the target genome on the right walk (chr,ctg)
+            for copy in segments_on_target_genome[seg].get_all_seg(walk_name): # take all its copies
                seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
                list_seg_target.append(seg_info)
-        if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
-            for copy in segments_on_target_genome[seg_inverted][walk_name]:
+        if (seg_inverted in segments_on_target_genome) and (segments_on_target_genome[seg_inverted].find_filename(walk_name)) : # same but with the segment inverted
+            for copy in segments_on_target_genome[seg_inverted].get_all_seg(walk_name):
                seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
                list_seg_target.append(seg_info)
        index+=1
@@ -53,7 +53,7 @@ def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_i
    copy_id="copy_"+str(copy_number)
    feat_copy=(feature_id,copy_id)

-    for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
+    for segment in segments_on_target_genome[old_seg_id].get_all_seg(walk_name): # look for the first seg to add the occurence info
        if segment[1]==old_start:
            copy_id="copy_"+str(copy_number)
            feat_copy=(feature_id,copy_id)
@@ -104,7 +104,7 @@ def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_i


 def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy,segments_on_target_genome):
-    for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
+    for segment in segments_on_target_genome[target_seg_id].get_all_seg(walk_name): # look for the right occurence of the segment
        if segment[1]==target_start:
            segment.append(feat_copy) # add seg info
            break
@@ -146,7 +146,7 @@ def get_walks_feature_cross(list_seg,segments_on_target_genome):
    for segment in list_seg:
        seg_found=search_seg_on_target_genome(segment,segments_on_target_genome)
        if seg_found: # if the segment or the reverse complement is on the target genome
-            for walk in segments_on_target_genome[seg_found]:
+            for walk in segments_on_target_genome[seg_found].get_all_filenames():
                if walk not in list_walks:
                    list_walks.append(walk)
    return list_walks
@@ -156,10 +156,10 @@ def get_walks_feature_cross(list_seg,segments_on_target_genome):
 def search_seg_on_walk(segment,walk,segments_on_target_genome): # for now just print the first found, look for several later...
    inverted_segment=invert_seg(segment)
    if segment in segments_on_target_genome:
-        if walk in segments_on_target_genome[segment]:
+        if segments_on_target_genome[segment].find_filename(walk):
            return segment
    elif inverted_segment in segments_on_target_genome:
-        if walk in segments_on_target_genome[inverted_segment]:
+        if segments_on_target_genome[inverted_segment].find_filename(walk):
            return inverted_segment
    else:
        return False

--- a/GrAnnoT/find_feature_target_path.py
+++ b/GrAnnoT/find_feature_target_path.py
@@ -31,7 +31,7 @@ def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_ge
            walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id,segments_on_target_genome)[2]
            feat_copy=(feature_id,copy_id)
            for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
-                for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
+                for segment in segments_on_target_genome[seg].get_all_seg(walk_name): # find the right occurence
                    if walk_start <= segment[1] <= walk_stop: 
                        segment.append(feat_copy) # annotate the segment : feature_id, copy_id
                        break

--- a/GrAnnoT/load_gfa.py
+++ b/GrAnnoT/load_gfa.py
@@ -82,6 +82,66 @@ def get_paths(walks_file,target_genome,haplotype):
                    paths[seq_name]=list_segments
    return paths

+
+class SegmentTarget:
+    def __init__(self):
+        self.filename_list=[]
+        self.info_list=[]
+    
+    def add_occurence(self,filename,info):
+        filename_info=self.find_filename_info(filename)
+        if filename_info is None: # if filename not yet encountered
+            pair=[filename,len(self.info_list)]
+            self.filename_list.append(pair)
+            self.info_list.append(info)
+        else:
+            filename_info.append(len(self.info_list))
+            self.info_list.append(info)
+
+    def get_all_filenames(self):
+        filenames=[]
+        for elem in self.filename_list:
+            filenames.append(elem[0])
+        return filenames
+
+    def find_filename_info(self,filename):
+        for elem in self.filename_list:
+            if elem[0]==filename:
+                return elem
+            
+    def find_filename(self,filename):
+        for elem in self.filename_list:
+            if elem[0]==filename:
+                return True
+        return False
+
+    def get_all_seg(self,filename): # rename get_all_occ
+        filename_info=self.find_filename_info(filename)
+        list_seg=[]
+        if filename_info is not None: # if this segment is in the file
+            for index in filename_info[1:]:
+                list_seg.append(self.info_list[index])
+        return list_seg
+
+
+
+# # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
+# def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
+#     with open(pos_seg,'r') as bed:
+#         seg_count=0
+#         file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
+#         for line in bed:
+#             line=line.split()
+#             [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
+#             # check if segment present twice on the same walk ???
+#             #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
+#             if seg not in segments_on_target_genome:
+#                 segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
+#             if file_name not in segments_on_target_genome[seg]:
+#                 segments_on_target_genome[seg][file_name]=list()
+#             segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
+#             seg_count+=1
+
 # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
 def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
    with open(pos_seg,'r') as bed:
@@ -93,9 +153,7 @@ def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add t
            # check if segment present twice on the same walk ???
            #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
            if seg not in segments_on_target_genome:
-                segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
-            if file_name not in segments_on_target_genome[seg]:
-                segments_on_target_genome[seg][file_name]=list()
-            segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
-            seg_count+=1
+                segments_on_target_genome[seg]=SegmentTarget() # dict of walks->segment_info; you get the info about the segment for each walk

+            segments_on_target_genome[seg].add_occurence(file_name,[chrom,start,stop,strand,index])
+            seg_count+=1
\ No newline at end of file
--- a/GrAnnoT/usefull_little_functions.py
+++ b/GrAnnoT/usefull_little_functions.py
@@ -2,13 +2,13 @@ from .intersect import Features,get_feature_start_on_segment,get_feature_stop_on

 # find the occurence of the segment that corresponds to the feature's copy
 def get_seg_occ(seg,walk,feat,copy,segments_on_target_genome):
-    seg_in_walk=segments_on_target_genome[seg][walk]
+    seg_in_walk=segments_on_target_genome[seg].get_all_seg(walk)
    for seg_occurence in seg_in_walk:
        for feat_copy in seg_occurence[5:]:
            if (feat_copy[0]==feat) & (feat_copy[1]==copy):
                return seg_occurence # [chr,start,stop,strand,index,copies]
    print("no occ found ???",seg,walk,feat,copy)
-    print(segments_on_target_genome[seg][walk])
+    print(segments_on_target_genome[seg].get_all_filenames())
    print(Features[feat].segments_list_source)
    print(Features[feat].segments_list_target)
    exit()