import subprocess import sys from concurrent.futures import ThreadPoolExecutor from tqdm import tqdm from pathlib import Path # creates a dictionnary with the length of the segments def get_seg_len(gfa,seg_coord_path,verbose): # get the lines that start with "S" if verbose: print(" Extracting the S-lines from the GFA") command=f"grep ^S {gfa} > {seg_coord_path.joinpath('segments.txt')}" subprocess.run(command,shell=True,timeout=None) # build a dictionnary with the segment sizes segments_size={} seg_path=seg_coord_path.joinpath("segments.txt") total_lines=0 if verbose: total_lines = len(["" for line in open(seg_path,"r")]) with open(seg_path,'r') as seg_file: for line in tqdm(seg_file, desc=" Computing the lengths of the segments",total=total_lines,unit=" segment",disable=not verbose): line=line.split() seg_id=line[1] seg_size=len(line[2]) segments_size[seg_id]=seg_size return segments_size # check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list) def check_walk_name(walk_names,name): name_found=False for walk in walk_names: if walk in name: name_found=True break return name_found def get_walks_paths(gfa,seg_coord_path,verbose): if verbose: print(" Extracting the W-lines or P-lines from the GFA") # get the lines that start with "W" command=f"grep ^W {gfa} | sed 's/>/,>/g' | sed 's/</,</g' > {seg_coord_path.joinpath('walks.txt')}" subprocess.run(command,shell=True,timeout=None) w_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('walks.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode() command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}" subprocess.run(command,shell=True,timeout=None) p_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode() if (p_lines[0]=='0') and (w_lines[0]=='0'): command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}" subprocess.run(command,shell=True,timeout=None) sys.exit("No walks or paths detected in the GFA file") elif p_lines[0]!='0': path_path=seg_coord_path.joinpath("paths.txt") total_lines=0 if verbose: total_lines = len(["" for line in open(path_path,"r")]) with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'a') as walk_file: for line in tqdm(path_file, desc=" Converting paths in walks", total=total_lines, unit=" path", disable=not verbose): path_line=line.split() name=path_line[1].split("#") # assembly#hapl#chr [asm,hap,seq]=name # seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful. if ',' in path_line[2]: path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ; else: path=path_line[2].split(';') # error message if no , or ; but if there is only one segment there is no separator...... # expect no overlap ! line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t' for seg in path: if seg[-1]=="-": strand="<" else: strand=">" seg_stranded=strand+seg[:-1] line+=',' line+=seg_stranded line+="\n" walk_file.write(line) command=f"rm {seg_coord_path.joinpath('paths.txt')}" subprocess.run(command,shell=True,timeout=None) walk_path=seg_coord_path.joinpath("walks.txt") return walk_path # outputs the coordinates of the segments on the walks in bed files def seg_coord(gfa,walk_names,seg_coord_path,verbose,transfer,haplotype): segments_size=get_seg_len(gfa,seg_coord_path,verbose) # get the walks or paths walk_path=get_walks_paths(gfa,seg_coord_path,verbose) file_names=list() total_lines = len(["" for line in open(walk_path,"r")]) # on these lines, get the name of the genome to name the output bed file with open(walk_path,"r") as walk_file: for line in tqdm(walk_file, desc=" Computing the segments coordinates", total=total_lines, unit=" line", disable=not verbose): generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfer,haplotype) # MAX_THREAD=8 # argparse # with open(walk_path,'r') as walk_file, ThreadPoolExecutor(max_workers=MAX_THREAD) as executor: # futures = [executor.submit(generate_bed,line,walk_names,segments_size,file_names) for line in walk_file] # with tqdm(total=len(futures), desc="Building files", unit="genome", disable=False) as progress_bar: # for future in futures: # future.result() # progress_bar.update(1) # problem with the opening of file in line 96 (executor.submit etc). todo : find another way to paralelize (on several files for instance) def generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfer,haplotype): line=line.split() if haplotype: name=line[1]+"#"+line[2]+"#"+line[3] else: name=line[1]+'#'+line[3] if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name) and transfer): # len=1 if there is only the source genome. dont parse all the walks if no transfer (only graph annotation) path_start=int(line[4]) seq_name=line[3] file_name=seg_coord_path.joinpath(name).as_posix()+'.bed' # if we are writing in the file for the first time, overwrite it. else, append it # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file. if file_name not in file_names: file_names.append(file_name) open_mode="w" else : open_mode="a" path=line[6].split(',') position=path_start with open(file_name,open_mode) as output_bed: for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file seg_start=position seg_name=path[i] seg_stop=position+segments_size[seg_name[1:]] out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+seg_name+'\n' output_bed.write(out_line) position=seg_stop