Skip to content
Snippets Groups Projects
Commit 728439c2 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

added the conversion of path lines in walk format to be read by grannot. the...

added the conversion of path lines in walk format to be read by grannot. the path name must be #assembly_name#haplotype#sequence_name, for instance IRGSP#1#Chr1
parent 4e16261a
No related branches found
No related tags found
No related merge requests found
import subprocess
import sys
# creates a dictionnary with the length of the segments
def get_seg_len(gfa):
......@@ -9,13 +10,11 @@ def get_seg_len(gfa):
# build a dictionnary with the segment sizes
segments_size={}
with open('seg_coord/segments.txt','r') as seg_file:
line=seg_file.readline()
while line:
for line in seg_file:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
line=seg_file.readline()
return segments_size
# check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list)
......@@ -27,6 +26,51 @@ def check_walk_name(walk_names,name):
break
return name_found
def get_walks_paths(gfa):
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
w_lines = subprocess.Popen("wc -l seg_coord/walks.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if w_lines[0]=='0':
command="grep ^P "+gfa+" > seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen("wc -l seg_coord/paths.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if p_lines[0]=='0':
command="rm seg_coord/walks.txt && rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
sys.exit("No walks or paths detected in the GFA file")
else:
with open("seg_coord/paths.txt","r") as path_file, open("seg_coord/walks.txt",'w') as walk_file:
for path_line in path_file:
path_line=path_line.split()
name=path_line[1].split("#") # assembly#hapl#chr # todo
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"
else:
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command="rm seg_coord/paths.txt"
subprocess.run(command,shell=True,timeout=None)
walk_path="seg_coord/walks.txt"
return walk_path
# outputs the coordinates of the segments on the walks in bed files
def seg_coord(gfa,walk_names):
......@@ -36,15 +80,14 @@ def seg_coord(gfa,walk_names):
segments_size=get_seg_len(gfa)
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
# get the walks or paths
walk_path=get_walks_paths(gfa)
# on these lines, get the name of the genome to name the output bed file
file_names=list()
with open('seg_coord/walks.txt','r') as walk_file:
line=walk_file.readline()
while line:
with open(walk_path,'r') as walk_file:
for line in walk_file:
line=line.split()
name=line[1]+"_"+line[3]
......@@ -75,5 +118,4 @@ def seg_coord(gfa,walk_names):
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
output_bed.write(out_line)
position=seg_stop
line=walk_file.readline()
\ No newline at end of file
position=seg_stop
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment