main.py

# created by Nina Marthe 2023 - nina.marthe@ird.fr
# licensed under MIT

import os
import subprocess
from .Graph_gff import *
from .Functions_output import *
from .argparser import *
#from inference import *
from pathlib import Path


def main():

    args=arg()
    read_args(args)

    # intersect is in the output directory
    intersect=Path(args.outdir.joinpath("intersect")).resolve()
    gfa=args.graph
    load_intersect(intersect.as_posix(),args.verbose)

    segments=args.segment_coordinates_path.joinpath("segments.txt")

    # outputs the gff and gaf of the graph
    if args.graph_gff or args.graph_gaf:
        print("\n")
    if args.graph_gff:
        out_graph_gff=args.outdir.joinpath(gfa.stem).as_posix()+".gff" # what if there is several suffixes and the last one isn't '.gfa' ?
        graph_gff(out_graph_gff,args.verbose)
    if args.graph_gaf:
        seg_size=get_segments_length(segments,False)
        out_graph_gaf=args.outdir.joinpath(gfa.stem).as_posix()+".gaf"
        graph_gaf(out_graph_gaf,seg_size,args.verbose)


    # if a transfer on a target genome is asked # what about pav ??
    if args.annotation or args.variation or args.alignment:
        if args.verbose:
            print('\n')

        # get the target genomes if there is none specified
        if len(args.target)==0:
            walk_path=args.segment_coordinates_path.joinpath("walks.txt")
            total_lines = len(["" for line in open(walk_path,"r")])
            with open(walk_path,'r') as walks:
                for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
                    genome_name=line.split()[1]
                    if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
                        args.target.append(genome_name)
                if args.verbose:
                    print(f"     Genomes found : {args.target}")

        # get haplotypes for each target genome
        if args.haplotype:
            target_list=[]
            for genome in args.target:
                bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
                for haplotype in bed_files:
                    haplotype_split=haplotype.name.split("#")
                    target_list.append(haplotype_split[0]+"#"+haplotype_split[1])
            args.target=target_list

        # if a pav matrix is asked, create dictionnary to store the features, and for each feature the value for all the target genomes.
        pav_dict={}
        if args.pav_matrix:
            pav_line=len(args.target)*[1]
            pav_dict["gene_id"]=list(args.target)
            for feat in Features.keys():
                if Features[feat].type=='gene':
                    pav_dict[feat]=pav_line.copy()

        # build a dictionnary with the segment sizes to compute the coverage and id
        if not args.graph_gaf:
            seg_size=get_segments_length(segments,args.verbose)

        genome_index=0
        for target_genome in args.target:
            print(f'\n{target_genome} transfer :')
            # create directory to store output files
            genome_dir=Path(args.outdir.joinpath(target_genome)).resolve()
            genome_dir.mkdir(exist_ok=True)

            # get list of files in seg_coord
            segment_coord_files=list(args.segment_coordinates_path.glob(f"*{target_genome}*.bed"))

            # create dictionnaries with paths and segments positions.
            print(f'     Loading the walks for the genome {target_genome}')
            walks_path=args.segment_coordinates_path.joinpath("walks.txt").as_posix()
            target_genome_paths=get_paths(walks_path,target_genome,args.haplotype)

            if not args.verbose:
                print("     Loading the segments coordinates")
            for file in segment_coord_files:
                genome_name=get_genome_name(args.target,file.name)
                if genome_name==target_genome :
                    if args.verbose:
                        print(f'     Loading the segments coordinates for the path {file.stem}')
                    get_segments_positions_on_genome(file.as_posix())

            # create output files names
            out_target_gff=genome_dir.joinpath(f'{target_genome}.gff')
            out_target_var=genome_dir.joinpath(f'{target_genome}_var.txt')
            out_target_aln=genome_dir.joinpath(f'{target_genome}_aln.txt')

            list_feat_absent=[]
            # do the annotation transfer (or var/aln)
            transfer_on_target(segments,out_target_gff,out_target_var,out_target_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args)

            # if pav matrix is asked, add the information of this transfer on the matrix
            if args.pav_matrix:
                for feat in list_feat_absent:
                    pav_dict[feat][genome_index]=0
                genome_index+=1

        # print the pav matrix
        if args.pav_matrix:
            print('\nGeneration of the presence-absence matrix for the transfered genes')
            pav_output=''
            for line in pav_dict:
                pav_output+=line
                for field in pav_dict[line]:
                    pav_output+="\t"+str(field)
                pav_output+="\n"

            out_pav=args.outdir.joinpath("PAV_matrix.txt")
            with open(out_pav,'w') as file_out_pav:
                file_out_pav.write(pav_output)