intersect.py

from .ClassSegFeat import Segment,Feature
from tqdm import tqdm
import random
import string
import subprocess
import sys

def run_intersect(args):
    print("Computing the intersection between the annotations and the graph segments")

    seg_coord_path=args.segment_coordinates_path

    if args.haplotype:
        if args.source_haplotype=='.':
            files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
            # select haplotype
            source_haplotypes=[]
            for file in files:
                file_haplotype=file.name.split("#")[1]
                source_haplotypes.append(file_haplotype)
            first_source_haplo=min(source_haplotypes)
            args.source_haplotype=first_source_haplo
            haplo=f"#{first_source_haplo}#"
            print('\033[35m'+f'Warning : No source haplotype was given with \'-sh\'. Haplotype {first_source_haplo} will be used.'+'\033[0m')
            files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome haplotype in seg_coord
            message=f'No bed files corresponding to the source genome haplotype {first_source_haplo} found in {seg_coord_path}'
        else:
            haplo = f"#{args.source_haplotype}#"
            files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome in seg_coord
            message=f'No bed files corresponding to the source genome haplotype found in {seg_coord_path}'
        if len(files)==0:
            sys.exit(message)
        if args.verbose:
            print(f'     Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}')
    else:
        files=list(seg_coord_path.glob(f"{args.source_genome}#*.bed")) # get all bed files from the source genome in seg_coord
        message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m'
        if len(files)==0:
            sys.exit(message)
        if args.verbose:
            print(f'     Found {len(files)} paths corresponding to the source genome')

    intersect_path=args.outdir.joinpath("intersect")
    command=f"echo -n '' > {intersect_path}" # empty the file "intersect"
    subprocess.run(command,shell=True,timeout=None)
    for file in files:
        if args.verbose:
            print(f'          Building the intersection for the path {file.stem}')
        command=f"bedtools intersect -nonamecheck -wo -a {file.as_posix()} -b {args.gff.as_posix()}>>{intersect_path}"
        subprocess.run(command,shell=True,timeout=None)
    intersect_lines =  int(subprocess.Popen(f"wc -l {intersect_path}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()[0])
    if intersect_lines==0:
        print('\33[31m'+"Error : No lines in the intersect. Please check that the sequence id in the GFF file (1st field) matches the sequence id in the GFA file (4th field of the W lines or 3rd part of the second field of the P lines)."+'\033[0m')
        exit()


# create global variables to manipulate them in the functions below and to pass them to Functions.py
global Features
global Segments
Segments={}
Features={}

# functions to create the segments and the features

# get the feature's start position on the segment (among segments on the source genome)
def get_feature_start_on_segment(seg_id,feat_id):
    s=Segments[search_segment(seg_id,feat_id)]
    f=Features[find_part_segment_source(s.id,feat_id)]
    if s.get_start(f.id)>=f.start:
        result=1
    else:
        result=f.start-s.get_start(f.id)+1
    return result

# get the feature's stop position on the segment (among segments on the source genome)
def get_feature_stop_on_segment(seg_id,feat_id):
    s=Segments[search_segment(seg_id,feat_id)]
    f=Features[find_part_segment_source(s.id,feat_id)]
    if s.get_stop(f.id)<=f.stop:
        result=s.get_size(f.id)
    else:
        result=f.stop-s.get_start(f.id)+1
    return result

# get the inverted segment
def invert_seg(seg):
    if seg[0]==">":
        inv_seg="<"+seg[1:]
    elif seg[0]=="<":
        inv_seg=">"+seg[1:]
    else:
        print(seg," not invertable")
        return seg
    return inv_seg

# look for a segment in the dict Segments, in one orientation or the other
def search_segment(segment,feat_id):
    list_seg_source=Features[feat_id].segments_list_source

    if segment in Segments:
        if segment in list_seg_source:
            return segment
        elif invert_seg(segment) in Segments:
            if invert_seg(segment) in list_seg_source:
                return invert_seg(segment)
    elif invert_seg(segment) in Segments:
        if invert_seg(segment) in list_seg_source:
            return invert_seg(segment)

# find on what part of the source feature the segment is; sometimes the features are fragmented.
def find_part_segment_source(segment,feature_id):
    first_feature=Features[feature_id]
    if segment in first_feature.segments_list_source:
        return feature_id
    else:
        list_parts=first_feature.other_parts_list
        for part in list_parts:
            if segment in Features[part].segments_list_source:
                return part

# create a segment with its first feature
def init_seg(line,segment_id,feat_id,strand):
    [chr,start,stop]=[line[0],int(line[1])+1,int(line[2])] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
    # create the segment, store it in the Segments dict
    Segments[segment_id]=Segment(segment_id,feat_id,chr,start,stop,strand)

# create a feature with the first segment its on
def init_feature(line,feature_id,segment_oriented):
    [type,annot,chr,start,stop,strand]=[line[6],line[12],line[4],int(line[7]),int(line[8]),line[10]]
    if feature_id in Features: # if the feature has already been created (in order to store the child), get the child and rewrite it
        childs=Features[feature_id].childs
    else:
        childs=[]
    parent=get_parent(annot,feature_id)
    # add the current segment to the list of segments that have the feature
    segments_list=[segment_oriented]
    # create the feature, store it in the dict Features
    Features[feature_id]=Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand,True,False)

# if there is a parent, returns the id of the parent feature and add the child to the parent
def get_parent(annot,feature_id): # check how the parent info is stored in the gff format in general (this is a particular case...) todo
    if (len(annot.split(";"))>1) and (annot.split(";")[1].split("=")[0]=="Parent"):
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    elif (len(annot.split(";"))>2) and (annot.split(";")[2].split("=")[0]=="Parent"):
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""
    return parent

# add a feature to an existing segment
def add_feature(line,seg,new_feat_id,new_strand):
    segment=Segments[seg]
    if not segment.find_feat(new_feat_id)[0]:
        [chr,start,stop]=[line[0],int(line[1])+1,int(line[2])] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
        segment.add_feature(new_feat_id,chr,start,stop,new_strand)

# add a child feature to an existing feature
def add_child(feat,new_child):
    if feat in Features: # if the feature exists, add new_child
        Features[feat].childs.append(new_child)
    else: # create the feature "empty", only with child
        Features[feat]=Feature(feat,"","",0,0,"",[new_child],"",[],"",False,False)

# add a segment to an existing feature
def add_seg(feat_id,new_seg_oriented):
    # if new_seg_oriented in Features[feat_id].segments_list_source:
    #     print("seg already in feat, duplicate segment ! feature:",feat_id,"segment:",new_seg_oriented)
    #     print("list for now :",Features[feat_id].segments_list_source)
    Features[feat_id].segments_list_source.append(new_seg_oriented)

# add the position of the feature on its first and last segment
def add_pos(feature_id):
    feature=Features[feature_id]
    feature.pos_start=get_feature_start_on_segment(feature.segments_list_source[0],feature_id)
    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list_source[-1],feature_id)

# handle the Feature object creation
def load_feature_intersect(line,feature_id,seg_oriented):
    if (feature_id not in Features) or (not Features[feature_id].complete): # if the feature doesn't exist or is not complete, create it or complete it
        init_feature(line,feature_id,seg_oriented)
    else: # if it exists, add the current segment to the list of segments that have the existing feature

        # if the feature existing starts and stops at the same positions as the current feature, add_seg
        current_start=int(line[7])
        current_stop=int(line[8])
        feature=Features[feature_id]
        if (current_start==feature.start) and (current_stop==feature.stop):
            add_seg(feature_id,seg_oriented)
        else: # same feature_id, but not same part of the feature -> need to store them in two separate objects
            add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop)

def add_segment_to_discontinuous_feature(line,feature,seg_oriented,current_start,current_stop):
    added=False
    for other_part_id in feature.other_parts_list:
        other_part=Features[other_part_id]
        if (other_part.start==current_start) and (other_part.stop==current_stop): # look for the current part of the feature to add the segment
            add_seg(other_part_id,seg_oriented)
            added=True
            break
    if added==False : # if the right part of the feature was not found, create a new part for the feature
        new_part_name=feature.id+''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(7))
        [chr,start,stop]=[line[4],int(line[7]),int(line[8])]
        segments_list=[seg_oriented]
        Features[new_part_name]=Feature(feature.id,feature.type,chr,start,stop,feature.annot,feature.childs,feature.parent,segments_list,feature.strand,True,True)
        new_part_feature=Features[new_part_name]
        new_part_feature.first=False
        new_part_feature.first_part=feature.id
        feature.other_parts_list.append(new_part_name)
        feature.discontinuous=True
        if feature.parent in Features:
            add_child(feature.parent,new_part_name)

# handle the Segment object creation
def load_segment_intersect(line,segment_id,feat_id,strand):
    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
        init_seg(line,segment_id,feat_id,strand)
    else: # if it exists, add the current feature to the list of features on the existing segment
        add_feature(line,segment_id,feat_id,strand)


# create a note for the child features that do not have annotation.
def set_note(feature_id): # not universal, depends on the format of the input gff...
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
    feature=Features[feature_id]
    note=""
    parent_feat=find_parent(feature_id)
    if parent_feat!='':
        annot=Features[parent_feat].annot.split(';')
        if (len(annot)>0) and (annot[-1].split('=')[0]=="Note"):
            note=annot[-1].split('=')[1]
    feature.note=note


# create all the Segment and Feature objects in the dictionnaries Segments and Features
def load_intersect(intersect_path,verbose):
    if not verbose:
        print("Loading the intersection information")
    # open the file with the intersect between the segments and the gff
    total_lines=0
    if verbose:
        total_lines = len(["" for line in open(intersect_path,"r")])
    with open(intersect_path,"r") as intersect_file:  
        for line in tqdm(intersect_file, desc="Loading the intersection information", total=total_lines, unit=" line",disable=not verbose):
            line=line.split()
            # get the ids for the dictionnaries' keys
            feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
            segment_id=line[3]
            strand=line[10]
            segment_oriented=line[3]
        
            load_feature_intersect(line,feature_id,segment_oriented)
            load_segment_intersect(line,segment_id,feature_id,strand)

    # for all the features, add the position of the feature on its first and last segment, and the note.
    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
    for feat_id in Features:
        if Features[feat_id].complete==False:
            print('***',feat_id,Features[feat_id])
        add_pos(feat_id)
        set_note(feat_id)
    # order the dictionnary
    order_dict(Features)

# orders the features so that each features is always preceded by its parent feature (gene -> mrna -> exons, cds, utr). useful for the transfer later.
def order_dict(features_dict):
    copy_Features=dict(features_dict) # stores the features in a copy
    features_dict.clear()# empty dict, to refill it with the features in the right orcer
    for feature_id in copy_Features:
        add_feature_dict(feature_id,copy_Features,features_dict)

# add a feature to the ordered dictionnary
def add_feature_dict(feature_id,old_dict_features,new_dict_features):
    if feature_id not in new_dict_features:
        feature=old_dict_features[feature_id]
        # check that the parent is present
        if feature.parent!='': # recursively find the parent (most of the time a gene)
            add_feature_dict(feature.parent,old_dict_features,new_dict_features)
        # if the feature is discontinuous, check that the other parts are present :
        if feature.discontinuous and feature.first:
            for part_id in feature.other_parts_list:
                add_feature_dict(feature.parent,old_dict_features,new_dict_features)
        # once the parent is present and the other parts are added, add the feature
        new_dict_features[feature_id]=feature

# find the gene parent of a feature
def find_parent(feature_id): # add a check if there is no parent !
    feature=Features[feature_id]
    if feature.parent=='': # no parent, usually a gene
        return feature.id
    else:
        current=feature.parent
        parent_found=False
        while parent_found==False:
            if Features[current].parent=='': # current doesnt have a parent
                return current
            else: # if we didn't find the parent, we go up to the current feature's parent until we find it
                current=Features[current].parent