Skip to content
Snippets Groups Projects
Commit 65ae93b3 authored by christine.tranchant_ird.fr's avatar christine.tranchant_ird.fr
Browse files

Integrating module load and singu

parent 267daf0a
No related branches found
No related tags found
1 merge request!1Draft: Resolve "Vecscreen"
......@@ -9,12 +9,15 @@ rule vecscreen:
univec_file = f"{frangipane_obj.path_univec}"
output:
fasta_file = f"{frangipane_obj.path_vecscreen}/{{sample_name}}-{{ref_name}}-vecscreen.fa"
singularity:
tools_config['SINGULARITY']['TOOLS']
envmodules:
tools_config["ENVMODULE"]["NCBI-TOOLS"]
log:
error = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.e",
output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o"
params:
length = frangipane_obj.get_config_value('CONTIGS','minLength'),
module = tools_config["ENVMODULE"]["NCBI-TOOLS"]
message:
f"""--------------------
-Running {{rule}}
......@@ -26,80 +29,5 @@ rule vecscreen:
| - Filtering on sequences longer than : {{params.length}}
--------------------
"""
run:
from Bio import SeqIO
# Load NCBI-TOOLS module
cmd_load=f"module load {params.module}"
nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
with open(os.path.join(output.fasta_file), "w") as o:
for seq in SeqIO.parse(input.fasta_file, "fasta"):
# Vecscreen only works with fasta file, not sequence
tmp_file = output.fasta_file + ".tmp"
with open(tmp_file, "w") as tmp:
tmp.write('>' + str(seq.description) + '\n')
tmp.write(gs.format_60(str(seq.seq)) + '\n')
cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
print(f"########## ERROR running Vecscreen command: {e}")
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if 'No hits found' in process.stdout:
o.write('>' + str(seq.description) + '\n')
o.write(gs.format_60(str(seq.seq)) + '\n')
nb_seq += 1
nb_seq_no_hits += 1
else:
# print(process.stdout)
# print("Hits : >" + str(seq.description))
nb_seq_suspicious += 1
bounds = []
for line in process.stdout.split("\n"):
if len(line) > 0:
if line[0].isnumeric():
for value in line.split('\t'):
bounds.append(int(value))
min_seq = 0
max_seq = len(seq)
min_bounds = min(bounds)
max_bounds = max(bounds)
if min_bounds == min_seq + 1:
min_seq = max_bounds
new_length = max_seq - max_bounds
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
elif max_bounds == max_seq:
max_seq = min_bounds
new_length = max_seq
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
else:
print("Warning (seq : " + str(
seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
cmd = f'rm {tmp_file}'
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
# text = f"""
# ### VecScreen
# <hr>
# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
# * SUSPICIOUS : {nb_seq_suspicious}
# * KEPT : {nb_seq}
# """
\ No newline at end of file
shell:
f"{frangipane_obj.snakemake_scripts}/vecscreen.py"
\ No newline at end of file
from Bio import SeqIO
nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
with open(os.path.join(output.fasta_file), "w") as o:
for seq in SeqIO.parse(snakemake.input.fasta_file, "fasta"):
# Vecscreen only works with fasta file, not sequence
tmp_file = snakemake.output.fasta_file + ".tmp"
with open(tmp_file, "w") as tmp:
tmp.write('>' + str(seq.description) + '\n')
tmp.write(gs.format_60(str(seq.seq)) + '\n')
print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
print(f"########## ERROR running Vecscreen command: {e}")
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if 'No hits found' in process.stdout:
o.write('>' + str(seq.description) + '\n')
o.write(gs.format_60(str(seq.seq)) + '\n')
nb_seq += 1
nb_seq_no_hits += 1
else:
# print(process.stdout)
# print("Hits : >" + str(seq.description))
nb_seq_suspicious += 1
bounds = []
for line in process.stdout.split("\n"):
if len(line) > 0:
if line[0].isnumeric():
for value in line.split('\t'):
bounds.append(int(value))
min_seq = 0
max_seq = len(seq)
min_bounds = min(bounds)
max_bounds = max(bounds)
if min_bounds == min_seq + 1:
min_seq = max_bounds
new_length = max_seq - max_bounds
if new_length >= snakemake.params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
elif max_bounds == max_seq:
max_seq = min_bounds
new_length = max_seq
if new_length >= snakemake.params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
else:
print("Warning (seq : " + str(
seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
cmd = f'rm {tmp_file}'
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
# text = f"""
# ### VecScreen
# <hr>
# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
# * SUSPICIOUS : {nb_seq_suspicious}
# * KEPT : {nb_seq}
# """
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment