main.py

# created by Nina Marthe 2023 - nina.marthe@ird.fr
# licensed under MIT

import os
import subprocess
from Graph_gff import *
from Functions_output import *
from argparser import *
#from inference import *


args=arg()
read_args(args)

# intersect is in the current directory
intersect="intersect"
gfa=args.graph.name
load_intersect(intersect)

segments=args.segment_coordinates_path+"/segments.txt"

# outputs the gff and gaf of the graph
if args.graph_gff:
    out_graph_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
    graph_gff(out_graph_gff)
if args.graph_gaf:
    out_graph_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
    graph_gaf(out_graph_gaf,segments)

# get list of files in seg_coord
segment_coord_files=os.listdir(args.segment_coordinates_path)

# if a transfer on a target genome is asked
if args.annotation or args.variation or args.alignment:

    # get the target genomes if there is none specified
    if len(args.target)==0:
        walks=open(args.segment_coordinates_path+"/walks.txt",'r')
        walk_lines=walks.readlines()
        for line in walk_lines:
            genome_name=line.split()[1]
            if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
                args.target.append(genome_name)

    pav_dict={}
    # if a pav matrix is asked, create dictionnary to store the features, and for each feature the value for all the target genomes.
    if args.pav_matrix:
        pav_line=len(args.target)*[1]
        pav_dict["gene_id"]=list(args.target)
        for feat in Features.keys():
            if Features[feat].type=='gene':
                pav_dict[feat]=pav_line.copy()

    seg_size={}
    # if the annotation transfer on target genome is asked, build a dictionnary with the segment sizes
    if args.annotation:
        segments_file = open(segments,'r')
        lines=segments_file.readlines()
        segments_file.close()

        for line in lines:
            line=line.split()
            seg_id='s'+line[1]
            segment_size=len(line[2])
            seg_size[seg_id]=segment_size

    genome_index=0
    for target_genome in args.target:
        print(f'\n{target_genome} transfer :')
        # create directory to store output files
        command="mkdir "+target_genome
        subprocess.run(command,shell=True,timeout=None)

        # create dictionnaries with paths and segments positions.
        for file in segment_coord_files:
            genome_name=get_genome_name(args.target,file)
            if genome_name==target_genome :
                print(f'    loading the information from the file {file}')
                file_path=args.segment_coordinates_path+file
                get_segments_positions_on_genome(file_path)
        print(f'    loading the walks for the genome {target_genome}')
        walks_path=args.segment_coordinates_path+"/walks.txt"
        target_genome_paths=get_paths(walks_path,target_genome)

        # create output files names
        out_target_gff=target_genome+"/"+target_genome+".gff"
        out_target_var=target_genome+"/"+target_genome+"_var.txt"
        out_aln=target_genome+"/"+target_genome+"_aln.txt"

        list_feat_absent=[]
        # do the annotation transfer (or var/aln)
        transfer_on_target(segments,out_target_gff,out_target_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args)

        # if pav matrix is asked, add the information of this transfer on the matrix
        if args.pav_matrix:
            for feat in list_feat_absent:
                pav_dict[feat][genome_index]=0
            genome_index+=1

    # print the pav matrix
    if args.pav_matrix:
        print('\ngeneration of the presence-absence matrix for the transfered genes')
        pav_output=''
        for line in pav_dict:
            pav_output+=line
            for field in pav_dict[line]:
                pav_output+="\t"+str(field)
            pav_output+="\n"

        out_pav="PAV_matrix.txt"
        file_out_pav = open(out_pav, 'w')
        file_out_pav.write(pav_output)