Newer
Older
import ClassSegFeat as Class
# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}
global output_graph_gff
output_graph_gff=[0,"",""]
NMarthe
committed
global output_graph_gaf
output_graph_gaf=[0,"",""]
# write in output file
def write_line(line,output,force):
# output [count_line;string_lines;output_file]
output[1]+=line
output[0]+=1

nina.marthe_ird.fr
committed
if (output[0]>999) or (force==True):
output[2].write(output[1])
output[0]=0
output[1]=""

nina.marthe_ird.fr
committed
def invert_seg(seg):
if seg[0]==">":
inv_seg="<"+seg[1:]
elif seg[0]=="<":
inv_seg=">"+seg[1:]
else:
print(seg," not invertable")
return seg
return inv_seg
def search_segment(segment):
if segment not in Segments:
if invert_seg(segment) in Segments:
return invert_seg(segment)
return False
return segment
# functions to create the segments and the features
# create a segment with its first feature
def init_seg(line,segment_id,feature_stranded):

nina.marthe_ird.fr
committed
[chr,start,stop]=[line[0],int(line[1])+1,int(line[2])] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
# add the current feature to the list of features that are on the segment
feature_list=[feature_stranded]
# create the segment, store it in the Segments dict
Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)
# create a feature with the first segment its on
NMarthe
committed
def init_feature(line,feature_id,segment_oriented):

nina.marthe_ird.fr
committed
[type,annot,chr,start,stop,strand]=[line[6],line[12],line[4],int(line[7]),int(line[8]),line[10]]
if feature_id in Features: # if the feature has been created to store the child, get the child and rewrite it
childs=Features[feature_id].childs
else:
childs=[]
parent=get_parent(annot,feature_id)
# add the current segment to the list of segments that have the feature
NMarthe
committed
segments_list=[segment_oriented]
# create the feature, store it in the dict Features

nina.marthe_ird.fr
committed
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True)
# if there is a parent, returns the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id):
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
return parent
# add a feature to an existing segment
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
# add a child feature to an existing feature
def add_child(feat,new_child):

nina.marthe_ird.fr
committed
if feat in Features: # if the feature exists, add new_child
Features[feat].childs.append(new_child)

nina.marthe_ird.fr
committed
else: # create the feature "empty", only with child
Features[feat]=Class.Feature(feat,"","",0,0,"",[new_child],"",[],"",False)
# add a segment to an existing feature
NMarthe
committed
def add_seg(feat_id,new_seg_oriented):

nina.marthe_ird.fr
committed
if new_seg_oriented not in Features[feat_id].segments_list_source:
Features[feat_id].segments_list_source.append(new_seg_oriented)
# add the position of the feature on its first and last segment
def add_pos(feature):

nina.marthe_ird.fr
committed
feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature.id)
feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature.id)
NMarthe
committed
def load_feature_intersect(line,feature_id,seg_oriented):

nina.marthe_ird.fr
committed
if (feature_id not in Features) or (not Features[feature_id].complete): # if the feature doesn't exist or is not complete, create it or complete it
init_feature(line,feature_id,seg_oriented)
else: # if it exists, add the current segment to the list of segments that have the existing feature
NMarthe
committed
add_seg(feature_id,seg_oriented)
def load_segment_intersect(line,segment_id,feature_stranded):
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line,segment_id,feature_stranded)
else: # if it exists, add the current feature to the list of features on the existing segment

nina.marthe_ird.fr
committed
# create a note for the child features that do not have annotation. not clean. fct for getting parent gene ?
def set_note(feature_id):
# the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
# in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent).
feature=Features[feature_id]
parent_feat=find_parent(feature_id)
note=Features[parent_feat].annot.split(';')[-1]
feature.note=note
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def load_intersect(intersect_path):
# open the file with the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
file.close()
for line in lines: # for each line in the intersect file
line=line.split()
# get the ids for the dictionnaries' keys
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")

nina.marthe_ird.fr
committed
segment_id=line[3]
strand=line[10]
NMarthe
committed
segment_oriented=line[3]
NMarthe
committed
load_feature_intersect(line,feature_id,segment_oriented)
load_segment_intersect(line,segment_id,feature_stranded)
# for all the features, add the position of the feature on its first and last segment.
# cant do it before because for that i need to have all the segments in the list segments_list for each feature.
for feat_id in Features:
add_pos(Features[feat_id])
set_note(feat_id)
def find_parent(feature_id):
feature=Features[feature_id]
if feature.type=="gene":
return feature.id
else:
parent_found=False
while parent_found==False:
if Features[current].type=="gene":
return current
else: # if we didn't find the gene, we go up to the current feature's parent until we find it
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_feature_start_on_segment(seg_id,feat_id):

nina.marthe_ird.fr
committed
s=Segments[search_segment(seg_id)]
f=Features[feat_id]
if s.start>=f.start:
result=1
else:
result=f.start-s.start+1
return result
# get the feature's stop position on the segment
def get_feature_stop_on_segment(seg_id,feat_id):

nina.marthe_ird.fr
committed
s=Segments[search_segment(seg_id)]
f=Features[feat_id]
if s.stop<=f.stop:
result=s.size
else:
result=f.stop-s.start+1
return result
# generates the annotation for the gff of the graph, with the rank of the segment in the feature

nina.marthe_ird.fr
committed
def make_annot_graph(feature,segment):
NMarthe
committed
# get the rank and the total number of ocurrences for the feature

nina.marthe_ird.fr
committed
rank=str(feature.segments_list_source.index(segment)+1)

nina.marthe_ird.fr
committed
total=str(len(feature.segments_list_source))
# create the annotation with the rank information
annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
return annotation

nina.marthe_ird.fr
committed
def create_line_graph_gff(feature_stranded,segment):
[strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
feature=Features[feature_id]

nina.marthe_ird.fr
committed
annotation=make_annot_graph(feature,segment)
type=feature.type

nina.marthe_ird.fr
committed
start=get_feature_start_on_segment(segment,feature_id)
stop=get_feature_stop_on_segment(segment,feature_id)
line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
print("generation of the graph's gff")
graph_gff_file = open(graph_gff_file, 'w')
output_graph_gff[2]=graph_gff_file

nina.marthe_ird.fr
committed
for segment_stranded in Segments:
features_on_seg=Segments[segment_stranded].features # get the list of the features on the segment
for feature_stranded in features_on_seg: # go through all the segment's features, and print the gff line for each

nina.marthe_ird.fr
committed
line = create_line_graph_gff(feature_stranded,segment_stranded)
write_line(line,output_graph_gff,False)
write_line("",output_graph_gff,True)
def graph_gaf(graph_gaf_file,segments_file):
NMarthe
committed
print("generation of the graph's gaf")
graph_gaf_file = open(graph_gaf_file, 'w')
output_graph_gaf[2]=graph_gaf_file
seg_len=get_segments_length(segments_file)
NMarthe
committed
for feature_id in Features:
feature=Features[feature_id]

nina.marthe_ird.fr
committed
feature_segments=feature.segments_list_source
NMarthe
committed
feature_path=""
path_length=0
for segment in feature_segments:
feature_path+=segment[0]+segment[2:]
path_length+=seg_len[segment[1:]]
strand=feature.strand
size=feature.size

nina.marthe_ird.fr
committed
start_on_path=feature.pos_start-1 # -1 to get the position 0-based bed-like
NMarthe
committed
stop_on_path=start_on_path+size
NMarthe
committed
line=f'{feature.id}\t{size}\t0\t{size}\t{strand}\t{feature_path}\t{path_length}\t{start_on_path}\t{stop_on_path}\t{size}\t{size}\t255\n'
write_line(line,output_graph_gaf,False)
write_line("",output_graph_gaf,True)
NMarthe
committed
graph_gaf_file.close()

nina.marthe_ird.fr
committed
def get_segments_length(segments):
file_segments=open(segments,'r')
lines_segments=file_segments.readlines()
file_segments.close()
NMarthe
committed
seg_len={}

nina.marthe_ird.fr
committed
for line in lines_segments:
NMarthe
committed
line=line.split()

nina.marthe_ird.fr
committed
seg_id='s'+line[1]
seg_len[seg_id]=len(line[2])
NMarthe
committed
return seg_len