Skip to content
Snippets Groups Projects
Graph_gff.py 12 KiB
Newer Older
import ClassSegFeat as Class

# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}

global output_graph_gff
output_graph_gff=[0,"",""]
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# function to write in output file every 1000 lines
def write_line(line,output,force):
    # output [count_line;string_lines;output_file]
    output[1]+=line
    output[0]+=1
        output[2].write(output[1])
        output[0]=0
        output[1]=""
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# get the inverted segment
def invert_seg(seg):
    if seg[0]==">":
        inv_seg="<"+seg[1:]
    elif seg[0]=="<":
        inv_seg=">"+seg[1:]
    else:
        print(seg," not invertable")
        return seg
    return inv_seg

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# look for a segment in the dict Segments, in one orientation or the other
def search_segment(segment):
    if segment not in Segments:
        if invert_seg(segment) in Segments:
            return invert_seg(segment)
        return False
    return segment


# functions to create the segments and the features

# create a segment with its first feature
def init_seg(line,segment_id,feature_stranded):
    [chr,start,stop]=[line[0],int(line[1])+1,int(line[2])] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
    # add the current feature to the list of features that are on the segment
    feature_list=[feature_stranded]
    # create the segment, store it in the Segments dict
    Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)

# create a feature with the first segment its on
def init_feature(line,feature_id,segment_oriented):
    [type,annot,chr,start,stop,strand]=[line[6],line[12],line[4],int(line[7]),int(line[8]),line[10]]
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
    if feature_id in Features: # if the feature has already been created (in order to store the child), get the child and rewrite it
        childs=Features[feature_id].childs
    else:
        childs=[]
    # add the current segment to the list of segments that have the feature
    # create the feature, store it in the dict Features
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True)
# if there is a parent, returns the id of the parent feature and add the child to the parent
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
def get_parent(annot,feature_id): # check how the parent info is stored in the gff format in general (this is a particular case...)
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""
    return parent

# add a feature to an existing segment
def add_feature(seg,new_feature_stranded):
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)

# add a child feature to an existing feature
def add_child(feat,new_child):
    if feat in Features: # if the feature exists, add new_child
        Features[feat].childs.append(new_child)
    else: # create the feature "empty", only with child
        Features[feat]=Class.Feature(feat,"","",0,0,"",[new_child],"",[],"",False)
    if new_seg_oriented not in Features[feat_id].segments_list_source:
        Features[feat_id].segments_list_source.append(new_seg_oriented)
# add the position of the feature on its first and last segment
def add_pos(feature):
    feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature.id)
    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature.id)
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# handle the Feature object creation
def load_feature_intersect(line,feature_id,seg_oriented):
    if (feature_id not in Features) or (not Features[feature_id].complete): # if the feature doesn't exist or is not complete, create it or complete it
        init_feature(line,feature_id,seg_oriented)
    else: # if it exists, add the current segment to the list of segments that have the existing feature
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# handle the Segment object creation
def load_segment_intersect(line,segment_id,feature_stranded):
    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
        init_seg(line,segment_id,feature_stranded)
    else: # if it exists, add the current feature to the list of features on the existing segment
        add_feature(segment_id,feature_stranded)
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# create a note for the child features that do not have annotation.
def set_note(feature_id): # not universal, depends on the format of the input gff...
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
    feature=Features[feature_id]
    parent_feat=find_parent(feature_id)
    note=Features[parent_feat].annot.split(';')[-1]
    feature.note=note


# create all the Segment and Feature objects in the dictionnaries Segments and Features
def load_intersect(intersect_path):
    print("loading the intersect file")
    
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    file.close()
    
    for line in lines: # for each line in the intersect file
        # get the ids for the dictionnaries' keys
        feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        feature_stranded=strand+feature_id
        load_feature_intersect(line,feature_id,segment_oriented)
        load_segment_intersect(line,segment_id,feature_stranded)
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
    # for all the features, add the position of the feature on its first and last segment, and the note.
    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
    for feat_id in Features:
        add_pos(Features[feat_id])
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
    # order the dictionnary
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# orders the features so that each features is always preceded by its parent feature (gene -> mrna -> exons, cds, utr). useful for the transfer later.
def order_dict(features_dict):
    copy_Features=dict(features_dict) # stores the features in a copy
    features_dict.clear()# empty dict, to refill it with the features in the right orcer
    for feature_id in copy_Features:
        add_feature_dict(feature_id,copy_Features,features_dict)

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# add a feature to the ordered dictionnary
def add_feature_dict(feature_id,old_dict_features,new_dict_features):
    if feature_id not in new_dict_features:
        feature=old_dict_features[feature_id]
        # check that the parent is present
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
        if feature.type!="gene": # recursively find the parent
            add_feature_dict(feature.parent,old_dict_features,new_dict_features)
        # once the parent is present, add the feature
        new_dict_features[feature_id]=feature

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# find the gene parent of a feature
def find_parent(feature_id): # add a check if there is no parent !
    feature=Features[feature_id]
    if feature.type=="gene":
        return feature.id
    else:
        current=feature.parent
        parent_found=False
        while parent_found==False:
            if Features[current].type=="gene":
                return current
            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
                current=Features[current].parent
# functions to generate the graph's gff from the segments and features created with create_seg_feat

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# get the feature's start position on the segment (among segments on the source genome)
def get_feature_start_on_segment(seg_id,feat_id):
    f=Features[feat_id]
    if s.start>=f.start:
        result=1
    else:
        result=f.start-s.start+1
    return result

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# get the feature's stop position on the segment (among segments on the source genome)
def get_feature_stop_on_segment(seg_id,feat_id):
    f=Features[feat_id]
    if s.stop<=f.stop:
        result=s.size
    else:
        result=f.stop-s.start+1
    return result

# generates the annotation for the gff of the graph, with the rank of the segment in the feature
    # get the rank and the total number of ocurrences for the feature
    rank=str(feature.segments_list_source.index(segment)+1)
    # create the annotation with the rank information
    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
    return annotation

nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# create the line for the gff of the graph
def create_line_graph_gff(feature_stranded,segment):
    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
    feature=Features[feature_id]

    start=get_feature_start_on_segment(segment,feature_id)
    stop=get_feature_stop_on_segment(segment,feature_id)
    line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# go through all the segments in Segments and prints the gff of the graph, with one line per segment/feature intersection
def graph_gff(graph_gff_file):
    graph_gff_file = open(graph_gff_file, 'w')
    output_graph_gff[2]=graph_gff_file
    for segment_stranded in Segments:
        features_on_seg=Segments[segment_stranded].features # get the list of the features on the segment
        for feature_stranded in features_on_seg: # go through all the segment's features, and print the gff line for each
            line = create_line_graph_gff(feature_stranded,segment_stranded)
            write_line(line,output_graph_gff,False)
    write_line("",output_graph_gff,True)
    graph_gff_file.close()
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# go through all the features in Features and print the gaf of the graph, with one line per feature
def graph_gaf(graph_gaf_file,segments_file):
    print("generation of the graph's gaf")
    graph_gaf_file = open(graph_gaf_file, 'w')
    output_graph_gaf[2]=graph_gaf_file

    seg_len=get_segments_length(segments_file)
    for feature_id in Features:
        feature=Features[feature_id]
        feature_segments=feature.segments_list_source
        feature_path=""
        path_length=0
        for segment in feature_segments:
            feature_path+=segment[0]+segment[2:]
            path_length+=seg_len[segment[1:]]
        strand=feature.strand
        size=feature.size
        start_on_path=feature.pos_start-1 # -1 to get the position 0-based bed-like
        line=f'{feature.id}\t{size}\t0\t{size}\t{strand}\t{feature_path}\t{path_length}\t{start_on_path}\t{stop_on_path}\t{size}\t{size}\t255\n'
        write_line(line,output_graph_gaf,False)
    write_line("",output_graph_gaf,True)
nina.marthe_ird.fr's avatar
nina.marthe_ird.fr committed
# creates a dictionnary with the length of the segments : s5->8
def get_segments_length(segments):
    file_segments=open(segments,'r')
    lines_segments=file_segments.readlines()
    file_segments.close()