Newer
Older

nina.marthe_ird.fr
committed
import sys
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# creates a dictionnary with the length of the segments
def get_seg_len(gfa):
print(" Computing the length of the segments")
# get the lines that start with "S"
command="grep ^S "+gfa+" > seg_coord/segments.txt"
subprocess.run(command,shell=True,timeout=None)
# build a dictionnary with the segment sizes
segments_size={}
with open('seg_coord/segments.txt','r') as seg_file:

nina.marthe_ird.fr
committed
for line in seg_file:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
# check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list)
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found

nina.marthe_ird.fr
committed
def get_walks_paths(gfa):
print(" Parsing the walks or paths")

nina.marthe_ird.fr
committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
w_lines = subprocess.Popen("wc -l seg_coord/walks.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if w_lines[0]=='0':
command="grep ^P "+gfa+" > seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen("wc -l seg_coord/paths.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if p_lines[0]=='0':
command="rm seg_coord/walks.txt && rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
sys.exit("No walks or paths detected in the GFA file")
else:
with open("seg_coord/paths.txt","r") as path_file, open("seg_coord/walks.txt",'w') as walk_file:
for path_line in path_file:
path_line=path_line.split()
name=path_line[1].split("#") # assembly#hapl#chr # todo
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"
else:
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command="rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
walk_path="seg_coord/walks.txt"
return walk_path
# outputs the coordinates of the segments on the walks in bed files
def seg_coord(gfa,walk_names):
# create directory to store output files
#command="mkdir seg_coord"
#subprocess.run(command,shell=True)

nina.marthe_ird.fr
committed
# get the walks or paths
walk_path=get_walks_paths(gfa)
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
total_lines = len(["" for line in open(walk_path,"r")])
# on these lines, get the name of the genome to name the output bed file
with open(walk_path,"r") as walk_file:
#for line in walk_file:
for line in tqdm(walk_file, desc=" Reading walks from the GFA file", total=total_lines, unit="ligne", disable=False):
generate_bed(line,walk_names,segments_size,file_names)
# MAX_THREAD=8 # argparse
# with open(walk_path,'r') as walk_file, ThreadPoolExecutor(max_workers=MAX_THREAD) as executor:
# futures = [executor.submit(generate_bed,line,walk_names,segments_size,file_names) for line in walk_file]
# with tqdm(total=len(futures), desc="Building files", unit="genome", disable=False) as progress_bar:
# for future in futures:
# future.result()
# progress_bar.update(1)
# problem with the opening of file in line 96 (executor.submit etc). todo : find another way to paralelize (on several files for instance)
def generate_bed(line,walk_names,segments_size,file_names):
line=line.split()
name=line[1]+"_"+line[3]
if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name)): # len=1 if there is only the source genome.
path_start=int(line[4])
seq_name=line[3]
file_name="seg_coord/"+name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
open_mode="w"
else :
open_mode="a"
path=line[6].split(',')
position=path_start
with open(file_name,open_mode) as output_bed:
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
output_bed.write(out_line)
position=seg_stop