Integrating module load and singu

65ae93b3 · christine.tranchant_ird.fr · 267daf0a · 65ae93b3 · 65ae93b3
Commit 65ae93b3 authored 1 year ago by christine.tranchant_ird.fr
--- a/frangiPANe/snakefiles/vecscreen.smk
+++ b/frangiPANe/snakefiles/vecscreen.smk
@@ -9,12 +9,15 @@ rule vecscreen:
        univec_file = f"{frangipane_obj.path_univec}"
    output:
        fasta_file = f"{frangipane_obj.path_vecscreen}/{{sample_name}}-{{ref_name}}-vecscreen.fa"
+    singularity:
+        tools_config['SINGULARITY']['TOOLS']
+    envmodules:
+        tools_config["ENVMODULE"]["NCBI-TOOLS"]
    log:
        error =  f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.e",
        output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o"
    params:
        length = frangipane_obj.get_config_value('CONTIGS','minLength'),
-        module =  tools_config["ENVMODULE"]["NCBI-TOOLS"]
    message:
        f"""--------------------
 -Running {{rule}}
@@ -26,80 +29,5 @@ rule vecscreen:
 |   - Filtering on sequences longer than  : {{params.length}}
 --------------------
 """
-    run:
-        from Bio import SeqIO
-
-        # Load NCBI-TOOLS module
-        cmd_load=f"module load {params.module}"
-
-        nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
-
-        with open(os.path.join(output.fasta_file), "w") as o:
-            for seq in SeqIO.parse(input.fasta_file, "fasta"):
-                # Vecscreen only works with fasta file, not sequence
-                tmp_file = output.fasta_file + ".tmp"
-
-                with open(tmp_file, "w") as tmp:
-                    tmp.write('>' + str(seq.description) + '\n')
-                    tmp.write(gs.format_60(str(seq.seq)) + '\n')
-
-                cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
-                print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
-
-                try:
-                    subprocess.run(cmd, shell=True, check=True)
-                except subprocess.CalledProcessError as e:
-                    print(f"########## ERROR running Vecscreen command: {e}")
-
-                process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-                if 'No hits found' in process.stdout:
-                    o.write('>' + str(seq.description) + '\n')
-                    o.write(gs.format_60(str(seq.seq)) + '\n')
-                    nb_seq += 1
-                    nb_seq_no_hits += 1
-                else:
-                    # print(process.stdout)
-                    # print("Hits : >" + str(seq.description))
-                    nb_seq_suspicious += 1
-                    bounds = []
-                    for line in process.stdout.split("\n"):
-                        if len(line) > 0:
-                            if line[0].isnumeric():
-                                for value in line.split('\t'):
-                                    bounds.append(int(value))
-                    min_seq = 0
-                    max_seq = len(seq)
-                    min_bounds = min(bounds)
-                    max_bounds = max(bounds)
-                    if min_bounds == min_seq + 1:
-                        min_seq = max_bounds
-                        new_length = max_seq - max_bounds
-                        if new_length >= params.length:
-                            header_fields = str(seq.description).split()
-                            o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
-                                header_fields[2:]) + '\n')
-                            o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
-                            nb_seq += 1
-                    elif max_bounds == max_seq:
-                        max_seq = min_bounds
-                        new_length = max_seq
-                        if new_length >= params.length:
-                            header_fields = str(seq.description).split()
-                            o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
-                                header_fields[2:]) + '\n')
-                            o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
-                            nb_seq += 1
-                    else:
-                        print("Warning (seq : " + str(
-                            seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
-                cmd = f'rm {tmp_file}'
-                process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-#   text = f"""
-#   ### VecScreen
-#   <hr>
-#   * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
-#   * SUSPICIOUS : {nb_seq_suspicious}
-#   * KEPT : {nb_seq}
-#   """
\ No newline at end of file
+    shell:
+        f"{frangipane_obj.snakemake_scripts}/vecscreen.py"
\ No newline at end of file
--- a/frangiPANe/snakemake_scripts/vecscreen.py
+++ b/frangiPANe/snakemake_scripts/vecscreen.py
+from Bio import SeqIO
+
+nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
+
+with open(os.path.join(output.fasta_file), "w") as o:
+
+        for seq in SeqIO.parse(snakemake.input.fasta_file, "fasta"):
+            # Vecscreen only works with fasta file, not sequence
+            tmp_file = snakemake.output.fasta_file + ".tmp"
+
+            with open(tmp_file, "w") as tmp:
+                tmp.write('>' + str(seq.description) + '\n')
+                tmp.write(gs.format_60(str(seq.seq)) + '\n')
+
+            print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
+
+            try:
+                subprocess.run(cmd, shell=True, check=True)
+            except subprocess.CalledProcessError as e:
+                print(f"########## ERROR running Vecscreen command: {e}")
+
+            process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+            if 'No hits found' in process.stdout:
+                o.write('>' + str(seq.description) + '\n')
+                o.write(gs.format_60(str(seq.seq)) + '\n')
+                nb_seq += 1
+                nb_seq_no_hits += 1
+            else:
+                # print(process.stdout)
+                # print("Hits : >" + str(seq.description))
+                nb_seq_suspicious += 1
+                bounds = []
+                for line in process.stdout.split("\n"):
+                    if len(line) > 0:
+                        if line[0].isnumeric():
+                            for value in line.split('\t'):
+                                bounds.append(int(value))
+                min_seq = 0
+                max_seq = len(seq)
+                min_bounds = min(bounds)
+                max_bounds = max(bounds)
+                if min_bounds == min_seq + 1:
+                    min_seq = max_bounds
+                    new_length = max_seq - max_bounds
+                    if new_length >= snakemake.params.length:
+                        header_fields = str(seq.description).split()
+                        o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
+                            header_fields[2:]) + '\n')
+                        o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
+                        nb_seq += 1
+                elif max_bounds == max_seq:
+                    max_seq = min_bounds
+                    new_length = max_seq
+                    if new_length >= snakemake.params.length:
+                        header_fields = str(seq.description).split()
+                        o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
+                            header_fields[2:]) + '\n')
+                        o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
+                        nb_seq += 1
+                else:
+                    print("Warning (seq : " + str(
+                        seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
+            cmd = f'rm {tmp_file}'
+            process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+#   text = f"""
+#   ### VecScreen
+#   <hr>
+#   * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
+#   * SUSPICIOUS : {nb_seq_suspicious}
+#   * KEPT : {nb_seq}
+#   """
\ No newline at end of file