Skip to content
Snippets Groups Projects
Commit dd08d372 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

improved how paths and walks are detected and how bed files are attributed to...

improved how paths and walks are detected and how bed files are attributed to genomes; fixes bug when a genome name was included in another genome name
parent e822f3ab
No related branches found
No related tags found
No related merge requests found
......@@ -43,45 +43,45 @@ def get_walks_paths(gfa,seg_coord_path,verbose):
command=f"grep ^W {gfa} | sed 's/>/,>/g' | sed 's/</,</g' > {seg_coord_path.joinpath('walks.txt')}"
subprocess.run(command,shell=True,timeout=None)
w_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('walks.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if w_lines[0]=='0':
command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}"
command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if (p_lines[0]=='0') and (w_lines[0]=='0'):
command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
p_lines = subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
if p_lines[0]=='0':
command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
sys.exit("No walks or paths detected in the GFA file")
else:
path_path=seg_coord_path.joinpath("paths.txt")
total_lines=0
if verbose:
total_lines = len(["" for line in open(path_path,"r")])
with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'w') as walk_file:
for line in tqdm(path_file, desc=" Converting paths in walks", total=total_lines, unit=" path", disable=not verbose):
path_line=line.split()
name=path_line[1].split("#") # assembly#hapl#chr # todo
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
sys.exit("No walks or paths detected in the GFA file")
elif p_lines[0]!='0':
path_path=seg_coord_path.joinpath("paths.txt")
total_lines=0
if verbose:
total_lines = len(["" for line in open(path_path,"r")])
with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'a') as walk_file:
for line in tqdm(path_file, desc=" Converting paths in walks", total=total_lines, unit=" path", disable=not verbose):
path_line=line.split()
name=path_line[1].split("#") # assembly#hapl#chr
[asm,hap,seq]=name
# seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
if ',' in path_line[2]:
path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"
else:
path=path_line[2].split(';')
# error message if no , or ; but if there is only one segment there is no separator......
# expect no overlap !
line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
for seg in path:
if seg[-1]=="-":
strand="<"
else:
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command=f"rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
strand=">"
seg_stranded=strand+seg[:-1]
line+=','
line+=seg_stranded
line+="\n"
walk_file.write(line)
command=f"rm {seg_coord_path.joinpath('paths.txt')}"
subprocess.run(command,shell=True,timeout=None)
walk_path=seg_coord_path.joinpath("walks.txt")
return walk_path
......@@ -116,7 +116,7 @@ def generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfe
if haplotype:
name=line[1]+"#"+line[2]+"#"+line[3]
else:
name=line[1]+'_'+line[3]
name=line[1]+'#'+line[3]
if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name) and transfer): # len=1 if there is only the source genome. dont parse all the walks if no transfer (only graph annotation)
path_start=int(line[4])
......
......@@ -33,7 +33,7 @@ def run_intersect(args):
if args.verbose:
print(f' Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}')
else:
files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
files=list(seg_coord_path.glob(f"{args.source_genome}#*.bed")) # get all bed files from the source genome in seg_coord
message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m'
if len(files)==0:
sys.exit(message)
......
......@@ -2,8 +2,9 @@ from tqdm import tqdm
# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
def get_genome_name(target_genomes,file_name):
genome_name=file_name.split("#")[0]
for genome in target_genomes:
if genome in file_name:
if genome == genome_name:
return genome
return ""
......@@ -63,8 +64,8 @@ def get_paths(walks_file,target_genome,haplotype):
if haplotype:
seq_name=line[1]+"#"+line[2]+"#"+line[3]
else:
seq_name=line[1]+'_'+line[3]
if target_genome in seq_name: # get the walk of the genome
seq_name=line[1]+'#'+line[3]
if target_genome == line[1]: # get the walk of the genome
path=line[6].split(',')[1:]
list_segments=[]
for segment in path:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment