Newer
Older
import ClassSegFeat as Class
# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}
global output_graph_gff
output_graph_gff=[0,"",""]
# write in output file
def write_line(line,output,force):
output[1]+=line
output[0]+=1
if (output[0]>999) | (force==True):
output[2].write(output[1])
output[0]=0
output[1]=""
# functions to create the segments and the features
# create a segment with its first feature
def init_seg(line,feature_id,segment_id,strand):
[chr,start,stop]=[line[0],line[1],line[2]]
# add the current feature to the list of features that are on the segment
feature_stranded=strand+feature_id
feature_list=[feature_stranded]
# create the segment, store it in the Segments dict
Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)
# create a feature with the first segment its on
def init_feature(line,feature_id,segment_id,strand):
[type,annot,chr,start,stop,childs]=[line[6],line[12],line[4],line[7],line[8],[]]
parent=get_parent(annot,feature_id)
# add the current segment to the list of segments that have the feature
segment_stranded=strand+segment_id
segments_list=[segment_stranded]
# create the feature, store it in the dict Features
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
set_note(feature_id)
# if there is a parent, returns the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id):
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
return parent
# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
# add a child feature to an existing feature
def add_child(feat,new_child):
if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists
Features[feat].childs.append(new_child)
# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# add the position of the feature on its first and last segment
def add_pos(feature):
feature.pos_start=get_feature_start_on_segment(feature.segments_list[0][1:],feature.id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list[-1][1:],feature.id)
def load_feature_intersect(line,feature_id,segment_id,strand):
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line, feature_id, segment_id,strand)
else: # if it exists, add the current segment to the list of segments that have the existing feature
add_seg(feature_id,segment_id,strand)
def load_segment_intersect(line,feature_id,segment_id,strand):
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id,segment_id,strand)
else: # if it exists, add the current feature to the list of features on the existing segment
add_feature(segment_id,feature_id,strand)
# create a note for the child features that do not have annotation. not clean. fct for getting parent ?
def set_note(feature_id):
# the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
# in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent).
feature=Features[feature_id]
if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
feature.note=feature.annot.split(';')[-1]
else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
# we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
# this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
curent=feature.parent
annot_found=False
while annot_found==False:
if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
note=Features[curent].annot.split(';')[-1]
feature.note=note
annot_found=True
else: # if we didn't find the gene, we go up to the current feature's parent until we find it
curent=Features[Features[curent].parent].id
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def load_intersect(intersect_path):
print("loading the intersect file")
# open the file with the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
file.close()
for line in lines: # for each line in the intersect file
line=line.split()
# get the ids for the dictionnaries' keys
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line[3][1:]
strand=line[10]
load_feature_intersect(line,feature_id,segment_id,strand)
load_segment_intersect(line,feature_id,segment_id,strand)
# for all the features, add the position of the feature on its first and last segment.
# cant do it before because for that i need to have all the segments in the list segments_list for each feature.
for feat_id in Features:
add_pos(Features[feat_id])
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_feature_start_on_segment(seg_id,feat_id):
s=Segments[seg_id]
f=Features[feat_id]
if s.start>=f.start:
result=1
else:
result=f.start-s.start+1
return result
# get the feature's stop position on the segment
def get_feature_stop_on_segment(seg_id,feat_id):
s=Segments[seg_id]
f=Features[feat_id]
if s.stop<=f.stop:
result=s.size
else:
result=f.stop-s.start+1
return result
# generates the annotation for the gff of the graph, with the rank of the segment in the feature
def make_annot_graph(feature,segment,strand):
segment_stranded=strand+segment
# get the rank and the total number of ocurrences for the feature
rank=str(feature.segments_list.index(segment_stranded)+1)
total=str(len(feature.segments_list))
# create the annotation with the rank information
annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
return annotation
def create_line_graph_gff(feature_stranded,segment):
[strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
feature=Features[feature_id]
annotation=make_annot_graph(feature,segment,strand)
type=feature.type
start=get_feature_start_on_segment(segment,feature_id)
stop=get_feature_stop_on_segment(segment,feature_id)
line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return line
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
output_graph_gff[2]=file_out
for segment in Segments:
features_seg=Segments[segment].features # get the list of the features on the segment
for feature_stranded in features_seg: # go through all the segment's features, and print the gff line for each
line = create_line_graph_gff(feature_stranded,segment)
write_line(line,output_graph_gff,False)
write_line("",output_graph_gff,True)