modified how the feature ids are used to handle features fragmented on several lines in the gff

8d0291e1 · nina.marthe_ird.fr · 22c4a0b8 · 8d0291e1 · 8d0291e1 · 8d0291e1
Commit 8d0291e1 authored 6 months ago by nina.marthe_ird.fr
--- a/GrAnnoT/ClassSegFeat.py
+++ b/GrAnnoT/ClassSegFeat.py
@@ -56,7 +56,7 @@ class Segment:


 class Feature:
-    def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg_list,strand,complete):
+    def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg_list,strand,complete,discontinuous):
        self.id=id
        self.type=type
        
@@ -81,10 +81,15 @@ class Feature:

        self.complete=complete # boolean, is the feature initialised with all the info or not

+        self.discontinuous=discontinuous # boolean, is the feature discontinuous in the gff (split in several lines, like a fragmented cds for instance)
+        self.first='' # boolean, is it the first part of the feature encountered
+        self.first_part='' # if first=False, string, the id of the first part encountered
+        self.other_parts_list=[] # if first=True, list ot the id of the other parts encountered
+
        
    def __str__(self):
        if self.parent=="":
-            return f"id={self.id}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
+            return f"id={self.name}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
        else:
-            return f"id={self.id}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
+            return f"id={self.name}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
 
\ No newline at end of file
--- a/GrAnnoT/Functions.py
+++ b/GrAnnoT/Functions.py
@@ -63,6 +63,9 @@ def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,wal
        start=stop
        stop=temp

+    if strand=='':
+        strand='.'
+
    output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'
    return output_line

@@ -756,8 +759,9 @@ def complement(nucl):

 # stores information about a feature and its current variation
 class Variation:
-    def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
-        self.feature_id=feature_id
+    def __init__(self,feature_id,feature_key,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
+        self.feature_id=feature_id # real id in the gff
+        self.feature_key=feature_key # key to access the feature info in the dict Features
        self.feature_type=feature_type
        self.chr=chr
        self.start_new=start_new
@@ -789,7 +793,7 @@ def create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path):
    size_diff=str(size_new_genome-feature.size)
    sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id)[0]

-    variation=Variation(feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
+    variation=Variation(feature.id,feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
    return(variation,feature_path_source_genome,feature_path_target_genome)

 # reset the informations of the variation, but keep the information about the feature
@@ -804,7 +808,7 @@ def reset_var(variation):
 # find the position of a substitution on the source and the target sequence
 def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id):
    seg_pos=search_segment(variation.start_var)
-    pos_old=str(int(Segments[seg_pos].get_start(feat))-int(feat_start))
+    pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))

    var_start_seg=variation.start_on_target
    if variation.inversion:
@@ -822,7 +826,7 @@ def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat
 # find the position of an insertion on the source and the target sequence
 def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
    seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion
-    pos_old=str(int(Segments[seg_pos].get_start(feat))-int(feat_start))
+    pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))

    start_var_seg=variation.start_var
    if variation.inversion:
@@ -842,9 +846,9 @@ def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,wal
    i=variation.start_var_index
    seg_pos=search_segment(variation.start_var)
    if i==0:
-        pos_old=int(Segments[seg_pos].get_start(feat))-int(feat_start)+Features[feat].pos_start-1
+        pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)+Features[feat].pos_start-1
    else:
-        pos_old=int(Segments[seg_pos].get_start(feat))-int(feat_start)
+        pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)
        if pos_old<0:
            pos_old=0
            print("error with variation position",variation.inversion,"***")

--- a/GrAnnoT/Graph_gff.py
+++ b/GrAnnoT/Graph_gff.py
 from .ClassSegFeat import *
 from tqdm import tqdm
+import random
+import string

 # create global variables to manipulate them in the functions below and to pass them to Functions.py
 global Features
@@ -46,7 +48,7 @@ def init_feature(line,feature_id,segment_oriented):
    # add the current segment to the list of segments that have the feature
    segments_list=[segment_oriented]
    # create the feature, store it in the dict Features
-    Features[feature_id]=Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True)
+    Features[feature_id]=Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True,False)

 # if there is a parent, returns the id of the parent feature and add the child to the parent
 def get_parent(annot,feature_id): # check how the parent info is stored in the gff format in general (this is a particular case...) todo
@@ -73,7 +75,7 @@ def add_child(feat,new_child):
    if feat in Features: # if the feature exists, add new_child
        Features[feat].childs.append(new_child)
    else: # create the feature "empty", only with child
-        Features[feat]=Feature(feat,"","",0,0,"",[new_child],"",[],"",False)
+        Features[feat]=Feature(feat,"","",0,0,"",[new_child],"",[],"",False,False)

 # add a segment to an existing feature
 def add_seg(feat_id,new_seg_oriented):
@@ -81,16 +83,46 @@ def add_seg(feat_id,new_seg_oriented):
        Features[feat_id].segments_list_source.append(new_seg_oriented)

 # add the position of the feature on its first and last segment
-def add_pos(feature):
-    feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature.id)
-    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature.id)
+def add_pos(feature_id):
+    feature=Features[feature_id]
+    feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature_id)
+    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature_id)

 # handle the Feature object creation
 def load_feature_intersect(line,feature_id,seg_oriented):
    if (feature_id not in Features) or (not Features[feature_id].complete): # if the feature doesn't exist or is not complete, create it or complete it
        init_feature(line,feature_id,seg_oriented)
    else: # if it exists, add the current segment to the list of segments that have the existing feature
-        add_seg(feature_id,seg_oriented)
+
+        # if the feature existing starts and stops at the same positions as the current feature, add_seg
+        current_start=int(line[7])
+        current_stop=int(line[8])
+        feature=Features[feature_id]
+        if (current_start==feature.start) and (current_stop==feature.stop):
+            add_seg(feature_id,seg_oriented)
+        else: # same feature_id, but not same part of the feature -> need to store them in two separate objects
+            add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop)
+
+def add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop):
+    added=False
+    for other_part_id in feature.other_parts_list:
+        other_part=Features[other_part_id]
+        if (other_part.start==current_start) and (other_part.stop==current_stop): # look for the current part of the feature to add the segment
+            add_seg(other_part_id,seg_oriented)
+            added=True
+    if added==False : # if the right part of the feature was not found, create a new part for the feature
+        new_part_name=feature.id+''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(7))
+        feature.other_parts_list.append(new_part_name)
+        [chr,start,stop]=[line[4],int(line[7]),int(line[8])]
+        segments_list=[seg_oriented]
+        Features[new_part_name]=Feature(feature.id,feature.type,chr,start,stop,feature.annot,feature.childs,feature.parent,segments_list,feature.strand,True,True)
+        new_part_feature=Features[new_part_name]
+        new_part_feature.first=False
+        new_part_feature.first_part=feature.id
+        feature.other_parts_list.append(new_part_name)
+        feature.discontinuous=True
+        if feature.parent in Features:
+            add_child(feature.parent,new_part_name)

 # handle the Segment object creation
 def load_segment_intersect(line,segment_id,feat_id,strand):
@@ -137,7 +169,9 @@ def load_intersect(intersect_path,verbose):
    # for all the features, add the position of the feature on its first and last segment, and the note.
    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
    for feat_id in Features:
-        add_pos(Features[feat_id])
+        if Features[feat_id].complete==False:
+            print('***',feat_id,Features[feat_id])
+        add_pos(feat_id)
        set_note(feat_id)
    # order the dictionnary
    order_dict(Features)
@@ -156,7 +190,11 @@ def add_feature_dict(feature_id,old_dict_features,new_dict_features):
        # check that the parent is present
        if feature.parent!='': # recursively find the parent (most of the time a gene)
            add_feature_dict(feature.parent,old_dict_features,new_dict_features)
-        # once the parent is present, add the feature
+        # if the feature is discontinuous, check that the other parts are present :
+        if feature.discontinuous and feature.first:
+            for part_id in feature.other_parts_list:
+                add_feature_dict(feature.parent,old_dict_features,new_dict_features)
+        # once the parent is present and the other parts are added, add the feature
        new_dict_features[feature_id]=feature

 # find the gene parent of a feature
@@ -181,40 +219,49 @@ def find_parent(feature_id): # add a check if there is no parent !
 # get the feature's start position on the segment (among segments on the source genome)
 def get_feature_start_on_segment(seg_id,feat_id):
    s=Segments[search_segment(seg_id)]
-    f=Features[feat_id]
-    if s.get_start(feat_id)>=f.start:
+    f=Features[find_part_segment_source(s.id,feat_id)]
+    if s.get_start(f.id)>=f.start:
        result=1
    else:
-        result=f.start-s.get_start(feat_id)+1
+        result=f.start-s.get_start(f.id)+1
    return result

 # get the feature's stop position on the segment (among segments on the source genome)
 def get_feature_stop_on_segment(seg_id,feat_id):
    s=Segments[search_segment(seg_id)]
-    f=Features[feat_id]
-    if s.get_stop(feat_id)<=f.stop:
-        result=s.get_size(feat_id)
+    f=Features[find_part_segment_source(s.id,feat_id)]
+    if s.get_stop(f.id)<=f.stop:
+        result=s.get_size(f.id)
    else:
-        result=f.stop-s.get_start(feat_id)+1
+        result=f.stop-s.get_start(f.id)+1
    return result

 # generates the annotation for the gff of the graph, with the rank of the segment in the feature
-def make_annot_graph(feature,segment):
-
+def make_annot_graph(feature_id,segment):
    # get the rank and the total number of ocurrences for the feature
+    feature=Features[find_part_segment_source(segment,feature_id)]
    rank=str(feature.segments_list_source.index(segment)+1)
    total=str(len(feature.segments_list_source))
    # create the annotation with the rank information
    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
    return annotation

+def find_part_segment_source(segment,feature_id):
+    first_feature=Features[feature_id]
+    if segment in first_feature.segments_list_source:
+        return feature_id
+    else:
+        list_parts=first_feature.other_parts_list
+        for part in list_parts:
+            if segment in Features[part].segments_list_source:
+                return part
+
 # create the line for the gff of the graph
 def create_line_graph_gff(feature_stranded,segment):
    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
-    feature=Features[feature_id]

-    annotation=make_annot_graph(feature,segment)
-    type=feature.type
+    annotation=make_annot_graph(feature_id,segment)
+    type=Features[feature_id].type
    start=get_feature_start_on_segment(segment,feature_id)
    stop=get_feature_stop_on_segment(segment,feature_id)
    line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"