setup_transfer.py

from tqdm import tqdm
import subprocess
import sys

# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
def get_genome_name(target_genomes,file_name):
    for genome in target_genomes:
        if genome in file_name:
            return genome
    return ""

# create a dictionnary with the length of the segments : s5->8
def get_segments_length(segments,verbose):
    seg_len={}
    total_lines=0
    if verbose:
        total_lines=len(["" for line in open(segments,"r")])
    with open(segments,'r') as segments_file:
        for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
            line=line.split()
            seg_id=line[1]
            seg_len[seg_id]=len(line[2])
        return seg_len

def get_target_genomes(args):
    # get the target genomes if there is none specified
    if len(args.target)==0:
        walk_path=args.segment_coordinates_path.joinpath("walks.txt")
        total_lines = len(["" for line in open(walk_path,"r")])
        with open(walk_path,'r') as walks:
            for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
                genome_name=line.split()[1]
                if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
                    args.target.append(genome_name)
            if args.verbose:
                print(f"     Genomes found : {args.target}")

    # get haplotypes for each target genome
    if args.haplotype:
        target_set=set()
        for genome in args.target:
            bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
            for haplotype in bed_files:
                haplotype_split=haplotype.name.split("#")
                target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
        args.target=list(target_set)

def run_intersect(args):
    print("Computing the intersection between the annotations and the graph segments")

    seg_coord_path=args.segment_coordinates_path

    if args.haplotype:
        if args.source_haplotype=='.':
            files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
            # select haplotype
            source_haplotypes=[]
            for file in files:
                file_haplotype=file.name.split("#")[1]
                source_haplotypes.append(file_haplotype)
            first_source_haplo=min(source_haplotypes)
            args.source_haplotype=first_source_haplo
            haplo=f"#{first_source_haplo}#"
            print('\033[35m'+f'Warning : No source haplotype was given with \'-sh\'. Haplotype {first_source_haplo} will be used.'+'\033[0m')
            files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome haplotype in seg_coord
            message=f'No bed files corresponding to the source genome haplotype {first_source_haplo} found in {seg_coord_path}'
        else:
            haplo = f"#{args.source_haplotype}#"
            files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome in seg_coord
            message=f'No bed files corresponding to the source genome haplotype found in {seg_coord_path}'
        if len(files)==0:
            sys.exit(message)
        if args.verbose:
            print(f'     Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}')
    else:
        files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
        message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m'
        if len(files)==0:
            sys.exit(message)
        if args.verbose:
            print(f'     Found {len(files)} paths corresponding to the source genome')

    intersect_path=args.outdir.joinpath("intersect")
    command=f"echo -n '' > {intersect_path}" # empty the file "intersect"
    subprocess.run(command,shell=True,timeout=None)
    for file in files:
        if args.verbose:
            print(f'          Building the intersection for the path {file.stem}')
        command=f"bedtools intersect -nonamecheck -wo -a {file.as_posix()} -b {args.gff.as_posix()}>>{intersect_path}"
        subprocess.run(command,shell=True,timeout=None)
    intersect_lines =  int(subprocess.Popen(f"wc -l {intersect_path}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()[0])
    if intersect_lines==0:
        print('\33[31m'+"Error : No lines in the intersect. Please check that the sequence id in the GFF file (1st field) matches the sequence id in the GFA file (4th field of the W lines or 3rd part of the second field of the P lines)."+'\033[0m')
        exit()