Newer
Older

nina.marthe_ird.fr
committed
import sys
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pathlib import Path
# creates a dictionnary with the length of the segments
def get_seg_len(gfa,seg_coord_path,verbose):
if verbose:
print(" Extracting the S-lines from the GFA")
command=f"grep ^S {gfa} > {seg_coord_path.joinpath('segments.txt')}"
subprocess.run(command,shell=True,timeout=None)
# build a dictionnary with the segment sizes
segments_size={}
seg_path=seg_coord_path.joinpath("segments.txt")
total_lines=0
if verbose:
total_lines = len(["" for line in open(seg_path,"r")])
with open(seg_path,'r') as seg_file:
for line in tqdm(seg_file, desc=" Computing the lengths of the segments",total=total_lines,unit=" segment",disable=not verbose):
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
# check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list)
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found
def get_walks_paths(gfa,seg_coord_path,verbose):
if verbose:
print(" Extracting the W-lines or P-lines from the GFA")

nina.marthe_ird.fr
committed
# get the lines that start with "W"
command=f"grep ^W {gfa} | sed 's/>/,>/g' | sed 's/</,</g' > {seg_coord_path.joinpath('walks.txt')}"

nina.marthe_ird.fr
committed
subprocess.run(command,shell=True,timeout=None)
w_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('walks.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if (p_lines[0]=='0') and (w_lines[0]=='0'):
command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
sys.exit("No walks or paths detected in the GFA file")
elif p_lines[0]!='0':
path_path=seg_coord_path.joinpath("paths.txt")
total_lines=0
if verbose:
total_lines = len(["" for line in open(path_path,"r")])
with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'a') as walk_file:
for line in tqdm(path_file, desc=" Converting paths in walks", total=total_lines, unit=" path", disable=not verbose):
path_line=line.split()
name=path_line[1].split("#") # assembly#hapl#chr
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"

nina.marthe_ird.fr
committed
else:

nina.marthe_ird.fr
committed
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command=f"rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)

nina.marthe_ird.fr
committed
walk_path=seg_coord_path.joinpath("walks.txt")

nina.marthe_ird.fr
committed
return walk_path
# outputs the coordinates of the segments on the walks in bed files

nina.marthe_ird.fr
committed
def seg_coord(gfa,walk_names,seg_coord_path,verbose,transfer,haplotype):
segments_size=get_seg_len(gfa,seg_coord_path,verbose)

nina.marthe_ird.fr
committed
# get the walks or paths
walk_path=get_walks_paths(gfa,seg_coord_path,verbose)
total_lines = len(["" for line in open(walk_path,"r")])
# on these lines, get the name of the genome to name the output bed file
with open(walk_path,"r") as walk_file:
for line in tqdm(walk_file, desc=" Computing the segments coordinates", total=total_lines, unit=" line", disable=not verbose):

nina.marthe_ird.fr
committed
generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfer,haplotype)
# MAX_THREAD=8 # argparse
# with open(walk_path,'r') as walk_file, ThreadPoolExecutor(max_workers=MAX_THREAD) as executor:
# futures = [executor.submit(generate_bed,line,walk_names,segments_size,file_names) for line in walk_file]
# with tqdm(total=len(futures), desc="Building files", unit="genome", disable=False) as progress_bar:
# for future in futures:
# future.result()
# progress_bar.update(1)
# problem with the opening of file in line 96 (executor.submit etc). todo : find another way to paralelize (on several files for instance)

nina.marthe_ird.fr
committed
def generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfer,haplotype):
line=line.split()

nina.marthe_ird.fr
committed
if haplotype:
name=line[1]+"#"+line[2]+"#"+line[3]
else:

nina.marthe_ird.fr
committed
name=line[1]+'#'+line[3]

nina.marthe_ird.fr
committed
if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name) and transfer): # len=1 if there is only the source genome. dont parse all the walks if no transfer (only graph annotation)
path_start=int(line[4])
seq_name=line[3]
file_name=seg_coord_path.joinpath(name).as_posix()+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
open_mode="w"
else :
open_mode="a"
path=line[6].split(',')
position=path_start
with open(file_name,open_mode) as output_bed:
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
seg_start=position
seg_name=path[i]
seg_stop=position+segments_size[seg_name[1:]]
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+seg_name+'\n'
output_bed.write(out_line)
position=seg_stop