Newer
Older

nina.marthe_ird.fr
committed
import sys
# creates a dictionnary with the length of the segments
def get_seg_len(gfa):
# get the lines that start with "S"
command="grep ^S "+gfa+" > seg_coord/segments.txt"
subprocess.run(command,shell=True,timeout=None)
# build a dictionnary with the segment sizes
segments_size={}
with open('seg_coord/segments.txt','r') as seg_file:

nina.marthe_ird.fr
committed
for line in seg_file:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
# check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list)
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found

nina.marthe_ird.fr
committed
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_walks_paths(gfa):
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
w_lines = subprocess.Popen("wc -l seg_coord/walks.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if w_lines[0]=='0':
command="grep ^P "+gfa+" > seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen("wc -l seg_coord/paths.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if p_lines[0]=='0':
command="rm seg_coord/walks.txt && rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
sys.exit("No walks or paths detected in the GFA file")
else:
with open("seg_coord/paths.txt","r") as path_file, open("seg_coord/walks.txt",'w') as walk_file:
for path_line in path_file:
path_line=path_line.split()
name=path_line[1].split("#") # assembly#hapl#chr # todo
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"
else:
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command="rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
walk_path="seg_coord/walks.txt"
return walk_path
# outputs the coordinates of the segments on the walks in bed files
def seg_coord(gfa,walk_names):
# create directory to store output files
command="mkdir seg_coord"
subprocess.run(command,shell=True)
segments_size=get_seg_len(gfa)

nina.marthe_ird.fr
committed
# get the walks or paths
walk_path=get_walks_paths(gfa)
# on these lines, get the name of the genome to name the output bed file
file_names=list()

nina.marthe_ird.fr
committed
with open(walk_path,'r') as walk_file:
for line in walk_file:
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
line=line.split()
name=line[1]+"_"+line[3]
if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name)): # len=1 if there is only the source genome.
path_start=int(line[4])
seq_name=line[3]
file_name="seg_coord/"+name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
open_mode="w"
else :
open_mode="a"
path=line[6].split(',')
position=path_start
with open(file_name,open_mode) as output_bed:
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
output_bed.write(out_line)

nina.marthe_ird.fr
committed
position=seg_stop