From ddb6d7dae1fd6b90a21a2d52b7d1e192c3d25215 Mon Sep 17 00:00:00 2001 From: "christine.tranchant_ird.fr" <christine.tranchant@ird.fr> Date: Tue, 17 Dec 2024 14:25:27 +0100 Subject: [PATCH] patch vecscreen --- frangiPANe/install_files/tools_path.yaml | 1 + frangiPANe/snakefiles/vecscreen.smk | 86 ++----------------- .../snakemake_scripts/vecscreen_parser.py | 32 +++++++ 3 files changed, 40 insertions(+), 79 deletions(-) create mode 100755 frangiPANe/snakemake_scripts/vecscreen_parser.py diff --git a/frangiPANe/install_files/tools_path.yaml b/frangiPANe/install_files/tools_path.yaml index 31beb68..1dffc67 100644 --- a/frangiPANe/install_files/tools_path.yaml +++ b/frangiPANe/install_files/tools_path.yaml @@ -5,6 +5,7 @@ SINGULARITY: TOOLS : 'INSTALL_PATH/containers/frangiPANe.sif' + VECS : 'INSTALL_PATH/containers/ncbi-tools++-25.2.0.sif' # Is and exemple of tools path ENVMODULE: diff --git a/frangiPANe/snakefiles/vecscreen.smk b/frangiPANe/snakefiles/vecscreen.smk index 0f9dd9e..c0a6783 100644 --- a/frangiPANe/snakefiles/vecscreen.smk +++ b/frangiPANe/snakefiles/vecscreen.smk @@ -14,7 +14,7 @@ rule vecscreen: output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o" params: length = frangipane_obj.get_config_value('CONTIGS','minLength'), - module = tools_config["ENVMODULE"]["NCBI-TOOLS"] + singu = tools_config['SINGULARITY']['VECS'] message: f"""-------------------- -Running {{rule}} @@ -26,81 +26,9 @@ rule vecscreen: | - Filtering on sequences longer than : {{params.length}} -------------------- """ - run: - from Bio import SeqIO - - # Load NCBI-TOOLS module - #cmd_load=f"module load {params.module}" - - nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0 - - with open(os.path.join(output.fasta_file), "w") as o: - for seq in SeqIO.parse(input.fasta_file, "fasta"): - # Vecscreen only works with fasta file, not sequence - tmp_file = output.fasta_file + ".tmp" - - with open(tmp_file, "w") as tmp: - tmp.write('>' + str(seq.description) + '\n') - tmp.write(gs.format_60(str(seq.seq)) + '\n') - - cmd = f'vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output' - #cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output' - #print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}") - - try: - subprocess.run(cmd, shell=True, check=True) - except subprocess.CalledProcessError as e: - print(f"########## ERROR running Vecscreen command: {e}") - - process = subprocess.run(cmd, shell=True, capture_output=True, text=True) - - if 'No hits found' in process.stdout: - o.write('>' + str(seq.description) + '\n') - o.write(gs.format_60(str(seq.seq)) + '\n') - nb_seq += 1 - nb_seq_no_hits += 1 - else: - # print(process.stdout) - # print("Hits : >" + str(seq.description)) - nb_seq_suspicious += 1 - bounds = [] - for line in process.stdout.split("\n"): - if len(line) > 0: - if line[0].isnumeric(): - for value in line.split('\t'): - bounds.append(int(value)) - min_seq = 0 - max_seq = len(seq) - min_bounds = min(bounds) - max_bounds = max(bounds) - if min_bounds == min_seq + 1: - min_seq = max_bounds - new_length = max_seq - max_bounds - if new_length >= params.length: - header_fields = str(seq.description).split() - o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join( - header_fields[2:]) + '\n') - o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n') - nb_seq += 1 - elif max_bounds == max_seq: - max_seq = min_bounds - new_length = max_seq - if new_length >= params.length: - header_fields = str(seq.description).split() - o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join( - header_fields[2:]) + '\n') - o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n') - nb_seq += 1 - else: - print("Warning (seq : " + str( - seq.description) + ") : the detected contamination appears to be in the middle of the contig !") - cmd = f'rm {tmp_file}' - process = subprocess.run(cmd, shell=True, capture_output=True, text=True) - -# text = f""" -# ### VecScreen -# <hr> -# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value) -# * SUSPICIOUS : {nb_seq_suspicious} -# * KEPT : {nb_seq} -# """ \ No newline at end of file + #singularity: + # tools_config['SINGULARITY']['VECS'] + shell: + """ + python3 {frangipane_obj.snakemake_scripts}/vecscreen_parser.py {input.fasta_file} {input.univec_file} {output.fasta_file} {params.length} {params.singu} + """ diff --git a/frangiPANe/snakemake_scripts/vecscreen_parser.py b/frangiPANe/snakemake_scripts/vecscreen_parser.py new file mode 100755 index 0000000..2d22802 --- /dev/null +++ b/frangiPANe/snakemake_scripts/vecscreen_parser.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import os +import sys +from Bio import SeqIO + +fasta_input = sys.argv[1] +univec_file = sys.argv[2] +output_file = sys.argv[3] +min_length = int(sys.argv[4]) +singu=sys.argv[5] + +nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0 + +with open(output_file, "w") as o: + for seq in SeqIO.parse(fasta_input, "fasta"): + tmp_file = output_file + ".tmp" + with open(tmp_file, "w") as tmp: + tmp.write('>' + str(seq.description) + '\n') + tmp.write(str(seq.seq) + '\n') + + cmd = f"singularity exec {singu} vecscreen -db {univec_file} -query {tmp_file} -outfmt 1 -text_output" + process = os.popen(cmd).read() + + if 'No hits found' in process: + o.write('>' + str(seq.description) + '\n') + o.write(str(seq.seq) + '\n') + nb_seq_no_hits += 1 + else: + nb_seq_suspicious += 1 + # Gestion des contaminations partiellement traitée + os.remove(tmp_file) + -- GitLab