Graph_gff.py

import ClassSegFeat as Class

# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}

global output_graph_gff
output_graph_gff=[0,"",""]

# write in output file
def write_line(line,output,force):
    output[1]+=line
    output[0]+=1
    if (output[0]>999) | (force==True):
        output[2].write(output[1])
        output[0]=0
        output[1]=""


# functions to create the segments and the features

# create a segment with its first feature
def init_seg(line,feature_id,segment_id,strand):
    [chr,start,stop]=[line[0],line[1],line[2]]
    # add the current feature to the list of features that are on the segment
    feature_stranded=strand+feature_id
    feature_list=[feature_stranded]
    # create the segment, store it in the Segments dict
    Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)

# create a feature with the first segment its on
def init_feature(line,feature_id,segment_id,strand):
    [type,annot,chr,start,stop,childs]=[line[6],line[12],line[4],line[7],line[8],[]]
    parent=get_parent(annot,feature_id)
    # add the current segment to the list of segments that have the feature
    segment_stranded=strand+segment_id
    segments_list=[segment_stranded]
    # create the feature, store it in the dict Features
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
    set_note(feature_id)

# if there is a parent, returns the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id):
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""
    return parent

# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)

# add a child feature to an existing feature
def add_child(feat,new_child):
    if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists
        Features[feat].childs.append(new_child)

# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)

# add the position of the feature on its first and last segment
def add_pos(feature):
    feature.pos_start=get_feature_start_on_segment(feature.segments_list[0][1:],feature.id)
    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list[-1][1:],feature.id)

def load_feature_intersect(line,feature_id,segment_id,strand):
    if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
        init_feature(line, feature_id, segment_id,strand)
    else: # if it exists, add the current segment to the list of segments that have the existing feature
        add_seg(feature_id,segment_id,strand)

def load_segment_intersect(line,feature_id,segment_id,strand):
    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
        init_seg(line, feature_id,segment_id,strand)
    else: # if it exists, add the current feature to the list of features on the existing segment
        add_feature(segment_id,feature_id,strand)


# create a note for the child features that do not have annotation.     not clean. fct for getting parent ? 
def set_note(feature_id):
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
    feature=Features[feature_id]
    if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
        feature.note=feature.annot.split(';')[-1]
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
        # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
        curent=feature.parent
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
                feature.note=note
                annot_found=True
            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id


# create all the Segment and Feature objects in the dictionnaries Segments and Features
def load_intersect(intersect_path):
    print("loading the intersect file")
    
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    file.close()
    
    for line in lines: # for each line in the intersect file
        line=line.split()
        # get the ids for the dictionnaries' keys
        feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line[3][1:]
        strand=line[10]
    
        load_feature_intersect(line,feature_id,segment_id,strand)
        load_segment_intersect(line,feature_id,segment_id,strand)

    # for all the features, add the position of the feature on its first and last segment.
    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
    for feat_id in Features:
        add_pos(Features[feat_id])


# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_feature_start_on_segment(seg_id,feat_id):
    s=Segments[seg_id]
    f=Features[feat_id]
    if s.start>=f.start:
        result=1
    else:
        result=f.start-s.start+1
    return result

# get the feature's stop position on the segment
def get_feature_stop_on_segment(seg_id,feat_id):
    s=Segments[seg_id]
    f=Features[feat_id]
    if s.stop<=f.stop:
        result=s.size
    else:
        result=f.stop-s.start+1
    return result

# generates the annotation for the gff of the graph, with the rank of the segment in the feature
def make_annot_graph(feature,segment,strand):
    segment_stranded=strand+segment
    # get the rank and the total number of ocurrences for the feature
    rank=str(feature.segments_list.index(segment_stranded)+1)
    total=str(len(feature.segments_list))
    # create the annotation with the rank information
    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
    return annotation

def create_line_graph_gff(feature_stranded,segment):
    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
    feature=Features[feature_id]

    annotation=make_annot_graph(feature,segment,strand)
    type=feature.type
    start=get_feature_start_on_segment(segment,feature_id)
    stop=get_feature_stop_on_segment(segment,feature_id)
    line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return line

# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')
    output_graph_gff[2]=file_out

    for segment in Segments:
        features_seg=Segments[segment].features # get the list of the features on the segment
        for feature_stranded in features_seg: # go through all the segment's features, and print the gff line for each
            line = create_line_graph_gff(feature_stranded,segment)
            write_line(line,output_graph_gff,False)
    write_line("",output_graph_gff,True)

    file_out.close()