From ddb6d7dae1fd6b90a21a2d52b7d1e192c3d25215 Mon Sep 17 00:00:00 2001
From: "christine.tranchant_ird.fr" <christine.tranchant@ird.fr>
Date: Tue, 17 Dec 2024 14:25:27 +0100
Subject: [PATCH] patch vecscreen

---
 frangiPANe/install_files/tools_path.yaml      |  1 +
 frangiPANe/snakefiles/vecscreen.smk           | 86 ++-----------------
 .../snakemake_scripts/vecscreen_parser.py     | 32 +++++++
 3 files changed, 40 insertions(+), 79 deletions(-)
 create mode 100755 frangiPANe/snakemake_scripts/vecscreen_parser.py

diff --git a/frangiPANe/install_files/tools_path.yaml b/frangiPANe/install_files/tools_path.yaml
index 31beb68..1dffc67 100644
--- a/frangiPANe/install_files/tools_path.yaml
+++ b/frangiPANe/install_files/tools_path.yaml
@@ -5,6 +5,7 @@
 
 SINGULARITY:
     TOOLS : 'INSTALL_PATH/containers/frangiPANe.sif'
+    VECS : 'INSTALL_PATH/containers/ncbi-tools++-25.2.0.sif'
 
 # Is and exemple of tools path
 ENVMODULE:
diff --git a/frangiPANe/snakefiles/vecscreen.smk b/frangiPANe/snakefiles/vecscreen.smk
index 0f9dd9e..c0a6783 100644
--- a/frangiPANe/snakefiles/vecscreen.smk
+++ b/frangiPANe/snakefiles/vecscreen.smk
@@ -14,7 +14,7 @@ rule vecscreen:
         output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o"
     params:
         length = frangipane_obj.get_config_value('CONTIGS','minLength'),
-        module =  tools_config["ENVMODULE"]["NCBI-TOOLS"]
+        singu =  tools_config['SINGULARITY']['VECS']
     message:
         f"""--------------------
 -Running {{rule}}
@@ -26,81 +26,9 @@ rule vecscreen:
 |   - Filtering on sequences longer than  : {{params.length}}
 --------------------
 """
-    run:
-        from Bio import SeqIO
-
-        # Load NCBI-TOOLS module
-        #cmd_load=f"module load {params.module}"
-
-        nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
-
-        with open(os.path.join(output.fasta_file), "w") as o:
-            for seq in SeqIO.parse(input.fasta_file, "fasta"):
-                # Vecscreen only works with fasta file, not sequence
-                tmp_file = output.fasta_file + ".tmp"
-
-                with open(tmp_file, "w") as tmp:
-                    tmp.write('>' + str(seq.description) + '\n')
-                    tmp.write(gs.format_60(str(seq.seq)) + '\n')
-
-                cmd = f'vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
-                #cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
-                #print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
-
-                try:
-                    subprocess.run(cmd, shell=True, check=True)
-                except subprocess.CalledProcessError as e:
-                    print(f"########## ERROR running Vecscreen command: {e}")
-
-                process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-                if 'No hits found' in process.stdout:
-                    o.write('>' + str(seq.description) + '\n')
-                    o.write(gs.format_60(str(seq.seq)) + '\n')
-                    nb_seq += 1
-                    nb_seq_no_hits += 1
-                else:
-                    # print(process.stdout)
-                    # print("Hits : >" + str(seq.description))
-                    nb_seq_suspicious += 1
-                    bounds = []
-                    for line in process.stdout.split("\n"):
-                        if len(line) > 0:
-                            if line[0].isnumeric():
-                                for value in line.split('\t'):
-                                    bounds.append(int(value))
-                    min_seq = 0
-                    max_seq = len(seq)
-                    min_bounds = min(bounds)
-                    max_bounds = max(bounds)
-                    if min_bounds == min_seq + 1:
-                        min_seq = max_bounds
-                        new_length = max_seq - max_bounds
-                        if new_length >= params.length:
-                            header_fields = str(seq.description).split()
-                            o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
-                                header_fields[2:]) + '\n')
-                            o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
-                            nb_seq += 1
-                    elif max_bounds == max_seq:
-                        max_seq = min_bounds
-                        new_length = max_seq
-                        if new_length >= params.length:
-                            header_fields = str(seq.description).split()
-                            o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
-                                header_fields[2:]) + '\n')
-                            o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
-                            nb_seq += 1
-                    else:
-                        print("Warning (seq : " + str(
-                            seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
-                cmd = f'rm {tmp_file}'
-                process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-#   text = f"""
-#   ### VecScreen
-#   <hr>
-#   * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
-#   * SUSPICIOUS : {nb_seq_suspicious}
-#   * KEPT : {nb_seq}
-#   """
\ No newline at end of file
+    #singularity:
+    #    tools_config['SINGULARITY']['VECS']
+    shell:
+        """
+         python3 {frangipane_obj.snakemake_scripts}/vecscreen_parser.py {input.fasta_file} {input.univec_file} {output.fasta_file} {params.length} {params.singu}
+        """
diff --git a/frangiPANe/snakemake_scripts/vecscreen_parser.py b/frangiPANe/snakemake_scripts/vecscreen_parser.py
new file mode 100755
index 0000000..2d22802
--- /dev/null
+++ b/frangiPANe/snakemake_scripts/vecscreen_parser.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+import os
+import sys
+from Bio import SeqIO
+
+fasta_input = sys.argv[1]
+univec_file = sys.argv[2]
+output_file = sys.argv[3]
+min_length = int(sys.argv[4])
+singu=sys.argv[5]
+
+nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
+
+with open(output_file, "w") as o:
+    for seq in SeqIO.parse(fasta_input, "fasta"):
+        tmp_file = output_file + ".tmp"
+        with open(tmp_file, "w") as tmp:
+            tmp.write('>' + str(seq.description) + '\n')
+            tmp.write(str(seq.seq) + '\n')
+
+        cmd = f"singularity exec {singu} vecscreen -db {univec_file} -query {tmp_file} -outfmt 1 -text_output"
+        process = os.popen(cmd).read()
+
+        if 'No hits found' in process:
+            o.write('>' + str(seq.description) + '\n')
+            o.write(str(seq.seq) + '\n')
+            nb_seq_no_hits += 1
+        else:
+            nb_seq_suspicious += 1
+            # Gestion des contaminations partiellement traitée
+        os.remove(tmp_file)
+
-- 
GitLab