Skip to content
Snippets Groups Projects
Commit 8d0291e1 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

modified how the feature ids are used to handle features fragmented on several lines in the gff

parent 22c4a0b8
No related branches found
No related tags found
No related merge requests found
......@@ -56,7 +56,7 @@ class Segment:
class Feature:
def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg_list,strand,complete):
def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg_list,strand,complete,discontinuous):
self.id=id
self.type=type
......@@ -81,10 +81,15 @@ class Feature:
self.complete=complete # boolean, is the feature initialised with all the info or not
self.discontinuous=discontinuous # boolean, is the feature discontinuous in the gff (split in several lines, like a fragmented cds for instance)
self.first='' # boolean, is it the first part of the feature encountered
self.first_part='' # if first=False, string, the id of the first part encountered
self.other_parts_list=[] # if first=True, list ot the id of the other parts encountered
def __str__(self):
if self.parent=="":
return f"id={self.id}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
return f"id={self.name}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
else:
return f"id={self.id}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
return f"id={self.name}, type={self.type}, segments={self.segments_list_source}, position on the original genome={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
\ No newline at end of file
......@@ -63,6 +63,9 @@ def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,wal
start=stop
stop=temp
if strand=='':
strand='.'
output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'
return output_line
......@@ -756,8 +759,9 @@ def complement(nucl):
# stores information about a feature and its current variation
class Variation:
def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id
def __init__(self,feature_id,feature_key,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
self.feature_id=feature_id # real id in the gff
self.feature_key=feature_key # key to access the feature info in the dict Features
self.feature_type=feature_type
self.chr=chr
self.start_new=start_new
......@@ -789,7 +793,7 @@ def create_var(feature_id,first_seg,last_seg,walk,copy_id,feature_target_path):
size_diff=str(size_new_genome-feature.size)
sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id)[0]
variation=Variation(feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
variation=Variation(feature.id,feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
return(variation,feature_path_source_genome,feature_path_target_genome)
# reset the informations of the variation, but keep the information about the feature
......@@ -804,7 +808,7 @@ def reset_var(variation):
# find the position of a substitution on the source and the target sequence
def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id):
seg_pos=search_segment(variation.start_var)
pos_old=str(int(Segments[seg_pos].get_start(feat))-int(feat_start))
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
var_start_seg=variation.start_on_target
if variation.inversion:
......@@ -822,7 +826,7 @@ def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat
# find the position of an insertion on the source and the target sequence
def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion
pos_old=str(int(Segments[seg_pos].get_start(feat))-int(feat_start))
pos_old=str(int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start))
start_var_seg=variation.start_var
if variation.inversion:
......@@ -842,9 +846,9 @@ def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,wal
i=variation.start_var_index
seg_pos=search_segment(variation.start_var)
if i==0:
pos_old=int(Segments[seg_pos].get_start(feat))-int(feat_start)+Features[feat].pos_start-1
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)+Features[feat].pos_start-1
else:
pos_old=int(Segments[seg_pos].get_start(feat))-int(feat_start)
pos_old=int(Segments[seg_pos].get_start(Features[feat].id))-int(feat_start)
if pos_old<0:
pos_old=0
print("error with variation position",variation.inversion,"***")
......
from .ClassSegFeat import *
from tqdm import tqdm
import random
import string
# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
......@@ -46,7 +48,7 @@ def init_feature(line,feature_id,segment_oriented):
# add the current segment to the list of segments that have the feature
segments_list=[segment_oriented]
# create the feature, store it in the dict Features
Features[feature_id]=Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True)
Features[feature_id]=Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True,False)
# if there is a parent, returns the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id): # check how the parent info is stored in the gff format in general (this is a particular case...) todo
......@@ -73,7 +75,7 @@ def add_child(feat,new_child):
if feat in Features: # if the feature exists, add new_child
Features[feat].childs.append(new_child)
else: # create the feature "empty", only with child
Features[feat]=Feature(feat,"","",0,0,"",[new_child],"",[],"",False)
Features[feat]=Feature(feat,"","",0,0,"",[new_child],"",[],"",False,False)
# add a segment to an existing feature
def add_seg(feat_id,new_seg_oriented):
......@@ -81,16 +83,46 @@ def add_seg(feat_id,new_seg_oriented):
Features[feat_id].segments_list_source.append(new_seg_oriented)
# add the position of the feature on its first and last segment
def add_pos(feature):
feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature.id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature.id)
def add_pos(feature_id):
feature=Features[feature_id]
feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature_id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature_id)
# handle the Feature object creation
def load_feature_intersect(line,feature_id,seg_oriented):
if (feature_id not in Features) or (not Features[feature_id].complete): # if the feature doesn't exist or is not complete, create it or complete it
init_feature(line,feature_id,seg_oriented)
else: # if it exists, add the current segment to the list of segments that have the existing feature
add_seg(feature_id,seg_oriented)
# if the feature existing starts and stops at the same positions as the current feature, add_seg
current_start=int(line[7])
current_stop=int(line[8])
feature=Features[feature_id]
if (current_start==feature.start) and (current_stop==feature.stop):
add_seg(feature_id,seg_oriented)
else: # same feature_id, but not same part of the feature -> need to store them in two separate objects
add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop)
def add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop):
added=False
for other_part_id in feature.other_parts_list:
other_part=Features[other_part_id]
if (other_part.start==current_start) and (other_part.stop==current_stop): # look for the current part of the feature to add the segment
add_seg(other_part_id,seg_oriented)
added=True
if added==False : # if the right part of the feature was not found, create a new part for the feature
new_part_name=feature.id+''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(7))
feature.other_parts_list.append(new_part_name)
[chr,start,stop]=[line[4],int(line[7]),int(line[8])]
segments_list=[seg_oriented]
Features[new_part_name]=Feature(feature.id,feature.type,chr,start,stop,feature.annot,feature.childs,feature.parent,segments_list,feature.strand,True,True)
new_part_feature=Features[new_part_name]
new_part_feature.first=False
new_part_feature.first_part=feature.id
feature.other_parts_list.append(new_part_name)
feature.discontinuous=True
if feature.parent in Features:
add_child(feature.parent,new_part_name)
# handle the Segment object creation
def load_segment_intersect(line,segment_id,feat_id,strand):
......@@ -137,7 +169,9 @@ def load_intersect(intersect_path,verbose):
# for all the features, add the position of the feature on its first and last segment, and the note.
# cant do it before because for that i need to have all the segments in the list segments_list for each feature.
for feat_id in Features:
add_pos(Features[feat_id])
if Features[feat_id].complete==False:
print('***',feat_id,Features[feat_id])
add_pos(feat_id)
set_note(feat_id)
# order the dictionnary
order_dict(Features)
......@@ -156,7 +190,11 @@ def add_feature_dict(feature_id,old_dict_features,new_dict_features):
# check that the parent is present
if feature.parent!='': # recursively find the parent (most of the time a gene)
add_feature_dict(feature.parent,old_dict_features,new_dict_features)
# once the parent is present, add the feature
# if the feature is discontinuous, check that the other parts are present :
if feature.discontinuous and feature.first:
for part_id in feature.other_parts_list:
add_feature_dict(feature.parent,old_dict_features,new_dict_features)
# once the parent is present and the other parts are added, add the feature
new_dict_features[feature_id]=feature
# find the gene parent of a feature
......@@ -181,40 +219,49 @@ def find_parent(feature_id): # add a check if there is no parent !
# get the feature's start position on the segment (among segments on the source genome)
def get_feature_start_on_segment(seg_id,feat_id):
s=Segments[search_segment(seg_id)]
f=Features[feat_id]
if s.get_start(feat_id)>=f.start:
f=Features[find_part_segment_source(s.id,feat_id)]
if s.get_start(f.id)>=f.start:
result=1
else:
result=f.start-s.get_start(feat_id)+1
result=f.start-s.get_start(f.id)+1
return result
# get the feature's stop position on the segment (among segments on the source genome)
def get_feature_stop_on_segment(seg_id,feat_id):
s=Segments[search_segment(seg_id)]
f=Features[feat_id]
if s.get_stop(feat_id)<=f.stop:
result=s.get_size(feat_id)
f=Features[find_part_segment_source(s.id,feat_id)]
if s.get_stop(f.id)<=f.stop:
result=s.get_size(f.id)
else:
result=f.stop-s.get_start(feat_id)+1
result=f.stop-s.get_start(f.id)+1
return result
# generates the annotation for the gff of the graph, with the rank of the segment in the feature
def make_annot_graph(feature,segment):
def make_annot_graph(feature_id,segment):
# get the rank and the total number of ocurrences for the feature
feature=Features[find_part_segment_source(segment,feature_id)]
rank=str(feature.segments_list_source.index(segment)+1)
total=str(len(feature.segments_list_source))
# create the annotation with the rank information
annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
return annotation
def find_part_segment_source(segment,feature_id):
first_feature=Features[feature_id]
if segment in first_feature.segments_list_source:
return feature_id
else:
list_parts=first_feature.other_parts_list
for part in list_parts:
if segment in Features[part].segments_list_source:
return part
# create the line for the gff of the graph
def create_line_graph_gff(feature_stranded,segment):
[strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
feature=Features[feature_id]
annotation=make_annot_graph(feature,segment)
type=feature.type
annotation=make_annot_graph(feature_id,segment)
type=Features[feature_id].type
start=get_feature_start_on_segment(segment,feature_id)
stop=get_feature_stop_on_segment(segment,feature_id)
line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment