Skip to content
Snippets Groups Projects
Commit ddb6d7da authored by christine.tranchant_ird.fr's avatar christine.tranchant_ird.fr
Browse files

patch vecscreen

parent 61a668fd
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@
SINGULARITY:
TOOLS : 'INSTALL_PATH/containers/frangiPANe.sif'
VECS : 'INSTALL_PATH/containers/ncbi-tools++-25.2.0.sif'
# Is and exemple of tools path
ENVMODULE:
......
......@@ -14,7 +14,7 @@ rule vecscreen:
output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o"
params:
length = frangipane_obj.get_config_value('CONTIGS','minLength'),
module = tools_config["ENVMODULE"]["NCBI-TOOLS"]
singu = tools_config['SINGULARITY']['VECS']
message:
f"""--------------------
-Running {{rule}}
......@@ -26,81 +26,9 @@ rule vecscreen:
| - Filtering on sequences longer than : {{params.length}}
--------------------
"""
run:
from Bio import SeqIO
# Load NCBI-TOOLS module
#cmd_load=f"module load {params.module}"
nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
with open(os.path.join(output.fasta_file), "w") as o:
for seq in SeqIO.parse(input.fasta_file, "fasta"):
# Vecscreen only works with fasta file, not sequence
tmp_file = output.fasta_file + ".tmp"
with open(tmp_file, "w") as tmp:
tmp.write('>' + str(seq.description) + '\n')
tmp.write(gs.format_60(str(seq.seq)) + '\n')
cmd = f'vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
#cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
#print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
print(f"########## ERROR running Vecscreen command: {e}")
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if 'No hits found' in process.stdout:
o.write('>' + str(seq.description) + '\n')
o.write(gs.format_60(str(seq.seq)) + '\n')
nb_seq += 1
nb_seq_no_hits += 1
else:
# print(process.stdout)
# print("Hits : >" + str(seq.description))
nb_seq_suspicious += 1
bounds = []
for line in process.stdout.split("\n"):
if len(line) > 0:
if line[0].isnumeric():
for value in line.split('\t'):
bounds.append(int(value))
min_seq = 0
max_seq = len(seq)
min_bounds = min(bounds)
max_bounds = max(bounds)
if min_bounds == min_seq + 1:
min_seq = max_bounds
new_length = max_seq - max_bounds
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
elif max_bounds == max_seq:
max_seq = min_bounds
new_length = max_seq
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
else:
print("Warning (seq : " + str(
seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
cmd = f'rm {tmp_file}'
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
# text = f"""
# ### VecScreen
# <hr>
# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
# * SUSPICIOUS : {nb_seq_suspicious}
# * KEPT : {nb_seq}
# """
\ No newline at end of file
#singularity:
# tools_config['SINGULARITY']['VECS']
shell:
"""
python3 {frangipane_obj.snakemake_scripts}/vecscreen_parser.py {input.fasta_file} {input.univec_file} {output.fasta_file} {params.length} {params.singu}
"""
#!/usr/bin/env python
import os
import sys
from Bio import SeqIO
fasta_input = sys.argv[1]
univec_file = sys.argv[2]
output_file = sys.argv[3]
min_length = int(sys.argv[4])
singu=sys.argv[5]
nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
with open(output_file, "w") as o:
for seq in SeqIO.parse(fasta_input, "fasta"):
tmp_file = output_file + ".tmp"
with open(tmp_file, "w") as tmp:
tmp.write('>' + str(seq.description) + '\n')
tmp.write(str(seq.seq) + '\n')
cmd = f"singularity exec {singu} vecscreen -db {univec_file} -query {tmp_file} -outfmt 1 -text_output"
process = os.popen(cmd).read()
if 'No hits found' in process:
o.write('>' + str(seq.description) + '\n')
o.write(str(seq.seq) + '\n')
nb_seq_no_hits += 1
else:
nb_seq_suspicious += 1
# Gestion des contaminations partiellement traitée
os.remove(tmp_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment