From a13f7f5df6b3ac131a9c3d6ce8e65d79f74d0694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20FERNANDEZ=20NU=C3=91EZ?= <nicolas.fernandez@ird.fr> Date: Mon, 13 Jan 2025 17:42:48 +0100 Subject: [PATCH] Feat: return to the modular rules.smk --- .gitlab-ci.yml => .gitlab-ci.yaml | 0 .snakemake-workflow-catalog.yml | 4 + {resources/data_test => .test}/.gitkeep | 0 ...-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz | Bin ...-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz | Bin CHANGELOG.md | 0 Run_GeVarLi.sh | 4 +- config/README.md | 0 {configuration => config}/config.yaml | 34 +- {configuration => config}/fastq-screen.conf | 0 .../multiqc/default.yaml | 0 workflow/Snakefile | 1354 +---------------- .../{rules/gevarli.smk => Snakefile_SAVE} | 107 +- workflow/report/workflow.rst | 0 workflow/rules/consensus_calling.smk | 33 + workflow/rules/coverage_stats.smk | 221 +++ workflow/rules/indexing_genomes.smk | 161 +- workflow/rules/lineages_calling.smk | 74 + workflow/rules/mask_lowcov.smk | 86 ++ workflow/rules/primer_cleaping.smk | 45 + ...ality_control.smk => quality_controls.smk} | 69 +- workflow/rules/reads_cleaning.smk | 91 ++ workflow/rules/reads_mapping.smk | 120 ++ workflow/rules/remove_duplicates.smk | 160 ++ workflow/rules/variant_calling.smk | 142 ++ workflow/schemas/config.schema.yaml | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 154 bytes .../__pycache__/functions.cpython-312.pyc | Bin 0 -> 4247 bytes workflow/scripts/functions.py | 127 ++ 29 files changed, 1275 insertions(+), 1557 deletions(-) rename .gitlab-ci.yml => .gitlab-ci.yaml (100%) create mode 100644 .snakemake-workflow-catalog.yml rename {resources/data_test => .test}/.gitkeep (100%) rename {resources/data_test => .test}/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz (100%) rename {resources/data_test => .test}/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz (100%) create mode 100644 CHANGELOG.md create mode 100644 config/README.md rename {configuration => config}/config.yaml (90%) rename {configuration => config}/fastq-screen.conf (100%) rename {configuration => config}/multiqc/default.yaml (100%) rename workflow/{rules/gevarli.smk => Snakefile_SAVE} (95%) mode change 100755 => 100644 create mode 100644 workflow/report/workflow.rst create mode 100644 workflow/rules/consensus_calling.smk create mode 100644 workflow/rules/coverage_stats.smk mode change 100755 => 100644 workflow/rules/indexing_genomes.smk create mode 100644 workflow/rules/lineages_calling.smk create mode 100644 workflow/rules/mask_lowcov.smk create mode 100644 workflow/rules/primer_cleaping.smk rename workflow/rules/{quality_control.smk => quality_controls.smk} (63%) mode change 100755 => 100644 create mode 100644 workflow/rules/reads_cleaning.smk create mode 100644 workflow/rules/reads_mapping.smk create mode 100644 workflow/rules/remove_duplicates.smk create mode 100644 workflow/rules/variant_calling.smk create mode 100644 workflow/schemas/config.schema.yaml create mode 100644 workflow/scripts/__pycache__/__init__.cpython-312.pyc create mode 100644 workflow/scripts/__pycache__/functions.cpython-312.pyc create mode 100644 workflow/scripts/functions.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yaml similarity index 100% rename from .gitlab-ci.yml rename to .gitlab-ci.yaml diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml new file mode 100644 index 0000000..fc26f44 --- /dev/null +++ b/.snakemake-workflow-catalog.yml @@ -0,0 +1,4 @@ +usage: + software-stack-deployment: + conda: true + 
report: true diff --git a/resources/data_test/.gitkeep b/.test/.gitkeep similarity index 100% rename from resources/data_test/.gitkeep rename to .test/.gitkeep diff --git a/resources/data_test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz b/.test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz similarity index 100% rename from resources/data_test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz rename to .test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R1.fastq.gz diff --git a/resources/data_test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz b/.test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz similarity index 100% rename from resources/data_test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz rename to .test/SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads_R2.fastq.gz diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/Run_GeVarLi.sh b/Run_GeVarLi.sh index 26189c0..62d83eb 100755 --- a/Run_GeVarLi.sh +++ b/Run_GeVarLi.sh @@ -216,8 +216,8 @@ fastq=$(expr $(ls -l ${workdir}/resources/reads/*.fastq.gz 2> /dev/null | wc -l) if [[ "${fastq}" == "0" ]] # If no sample, then # start GeVarLi with at least 1 sample echo -e "${red}¡${nc} No FASTQ files detected in ${ylo}resources/reads/${nc} ${red}!${nc} -${red}${sample_test}${nc} in ${ylo}resources/data_test/${nc} FASTQ files were be used as sample example" - cp ${workdir}/resources/data_test/${sample_test}_R*.fastq.gz ${workdir}/resources/reads/ # use data_test fastq +${red}${sample_test}${nc} FASTQ files will be used as test samples" + cp ${workdir}/.test/${sample_test}_R*.fastq.gz ${workdir}/resources/reads/ # use .test/fastq files fastq="2" fi samples=$(expr ${fastq} \/ 2) # {fastq.gz count} / 2 = samples count (paired-end) diff --git a/config/README.md b/config/README.md new file mode 100644 index 0000000..e69de29 diff --git a/configuration/config.yaml b/config/config.yaml similarity index 90% rename from configuration/config.yaml rename to config/config.yaml index 1f00aaf..82f8d10 100755 --- a/configuration/config.yaml +++ b/config/config.yaml @@ -249,20 +249,20 @@ resources: # -'' # Custom (set your own) envs: # Conda environement yaml files: - bamclipper: 'envs/bamclipper_v.1.0.0.yaml' # Bamclipper ver. 1.0.0 - bcftools: 'envs/bcftools_v.1.20.yaml' # Bcftools ver. 1.20 - bedtools: 'envs/bedtools_v.2.31.1.yaml' # Bedtools ver. 2.31.1 - bowtie2: 'envs/bowtie2_v.2.5.4.yaml' # Bowtie2 ver. 2.5.4 - bwa: 'envs/bwa_v.0.7.18.yaml' # BWA ver. 0.7.18 - cutadapt: 'envs/cutadapt_v.4.9.yaml' # Cutadapt ver. 4.9 - fastq_screen: 'envs/fastq-screen_v.0.15.3.yaml' # Fastq-Screen ver. 0.15.3 (with BWA ver. 0.7.18) - fastqc: 'envs/fastqc_v.0.12.1.yaml' # FastQC ver. 0.12.1 - gawk: 'envs/gawk_v.5.3.0.yaml' # Awk (GNU) ver. 5.3.0 - ivar: 'envs/ivar_v.1.4.3.yaml' # iVar ver. 1.4.3 (with Samtools ver. 1.20) - minimap2: 'envs/minimap2_v.2.28.yaml' # Minimap2 ver. 2.28 - multiqc: 'envs/multiqc_v.1.23.yaml' # MultiQC ver. 1.23 (with Pandoc ver. 3.3) - nextclade: 'envs/nextclade_v.3.9.1.yaml' # Nextclade ver. 3.9.1 - pangolin: 'envs/pangolin_v.4.3.yaml' # Pangolin ver. 4.3 - samtools: 'envs/samtools_v.1.20.yaml' # Samtools ver. 1.20 - sickle_trim: 'envs/sickle-trim_v.1.33.yaml' # Sickle-trim ver. 1.33 - tsv2vcf: 'envs/tsv2vcf_v.1.1.yaml' # tsv2vcf ver. 1.1 (bcftools biopython numpy) + bamclipper: '../envs/bamclipper_v.1.0.0.yaml' # Bamclipper ver. 
1.0.0 + bcftools: '../envs/bcftools_v.1.20.yaml' # Bcftools ver. 1.20 + bedtools: '../envs/bedtools_v.2.31.1.yaml' # Bedtools ver. 2.31.1 + bowtie2: '../envs/bowtie2_v.2.5.4.yaml' # Bowtie2 ver. 2.5.4 + bwa: '../envs/bwa_v.0.7.18.yaml' # BWA ver. 0.7.18 + cutadapt: '../envs/cutadapt_v.4.9.yaml' # Cutadapt ver. 4.9 + fastq_screen: '../envs/fastq-screen_v.0.15.3.yaml' # Fastq-Screen ver. 0.15.3 (with BWA ver. 0.7.18) + fastqc: '../envs/fastqc_v.0.12.1.yaml' # FastQC ver. 0.12.1 + gawk: '../envs/gawk_v.5.3.0.yaml' # Awk (GNU) ver. 5.3.0 + ivar: '../envs/ivar_v.1.4.3.yaml' # iVar ver. 1.4.3 (with Samtools ver. 1.20) + minimap2: '../envs/minimap2_v.2.28.yaml' # Minimap2 ver. 2.28 + multiqc: '../envs/multiqc_v.1.23.yaml' # MultiQC ver. 1.23 (with Pandoc ver. 3.3) + nextclade: '../envs/nextclade_v.3.9.1.yaml' # Nextclade ver. 3.9.1 + pangolin: '../envs/pangolin_v.4.3.yaml' # Pangolin ver. 4.3 + samtools: '../envs/samtools_v.1.20.yaml' # Samtools ver. 1.20 + sickle_trim: '../envs/sickle-trim_v.1.33.yaml' # Sickle-trim ver. 1.33 + tsv2vcf: '../envs/tsv2vcf_v.1.1.yaml' # tsv2vcf ver. 1.1 (bcftools biopython numpy) diff --git a/configuration/fastq-screen.conf b/config/fastq-screen.conf similarity index 100% rename from configuration/fastq-screen.conf rename to config/fastq-screen.conf diff --git a/configuration/multiqc/default.yaml b/config/multiqc/default.yaml similarity index 100% rename from configuration/multiqc/default.yaml rename to config/multiqc/default.yaml diff --git a/workflow/Snakefile b/workflow/Snakefile index 4eff7a1..1fa20a2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,6 +15,7 @@ # Date ___________________ 2021.10.12 # Latest modifications ___ 2025.01.08 (Prepare for Snakedeploy) # Use ____________________ snakemake -s Snakemake --use-conda -j +############################################################################### ############################################################################### ### CONFIGURATION ### @@ -26,132 +27,37 @@ configfile: "configuration/config.yaml" ### FUNCTIONS ### ################# -def get_memory_per_thread(wildcards): - memory_per_thread = int(RAM) // int(CPUS) - if memory_per_thread < 1: - memory_per_thread = 1 - return memory_per_thread - -def get_quality_input(wildcards): - quality_list = [] - if "quality" in MODULES: - quality_list = "results/00_Quality_Control/multiqc/" - return quality_list - -def get_trimming_input(wildcards): - trimming_list = [] - if "trimming" in MODULES: - trimming_list = expand("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz", - sample = SAMPLE) - return trimming_list - -def stash_or_trash(path): - if "keeptrim" in MODULES: - return path - else: - return temp(path) - -def get_cleapping_input(wildcards): - cleapping_list = [] - if "cleapping" in MODULES: - cleapping_list = "" - return cleapping_list +# Imports +import sys +import glob +import pandas +import snakemake.utils as snakutils + +# Make the workflow directory importable, then load the shared helpers +sys.path.append(workflow.basedir) +import scripts.functions as functions +from scripts.functions import stash_or_trash +from scripts.functions import get_bam_input +from scripts.functions import get_bai_input -def get_bam_input(wildcards): - markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" - if "cleapping" in MODULES: - markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam" - return markdup -def get_bai_input(wildcards): - index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" - if 
"cleapping" in MODULES: - index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai" - return index +functions.CPUS = config["resources"]["cpus"] # Threads (maximum) +functions.RAM = config["resources"]["ram"] # Memory (RAM) in Gb (maximum) -def get_flagstat_input(wildcards): - flagstat_list = [] - if "covstats" in MODULES: - flagstat_list = expand( - "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - ext = ["txt", "tsv", "json"] - ) - return flagstat_list +functions.MODULES = config["modules"] # Modules -def get_covstats_input(wildcards): - covstats_list = [] - if "covstats" in MODULES: - covstats_list = expand( - "results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - min_cov = MIN_COV - ) - return covstats_list +functions.SAMPLE, = glob_wildcards("resources/symlinks/{sample}_R1.fastq.gz") # Samples -def get_consensus_input(wildcards): - consensus_list = [] - if "consensus" in MODULES: - consensus_list = expand( - "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - min_cov = MIN_COV, - caller = CALLER - ) - return consensus_list +functions.REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format +functions.ALIGNER = config["consensus"]["aligner"] # Aligner ('minimap2', 'bwa' or 'bowtie2') +functions.MIN_COV = config["consensus"]["min_cov"] # Minimum coverage, mask lower regions with 'N' +functions.CALLER = config["consensus"]["caller"] # Variant Caller ('lofreq' or 'ivar') +#functions.IUPAC = config["consensus"]["iupac"] # Output variants in the form of IUPAC ambiguity codes -def get_vcf_input(wildcards): - vcf_list = [] - if "consensus" in MODULES: - vcf_list = expand( - "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_variant-filt.vcf", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - min_cov = MIN_COV, - caller = CALLER - ) - return vcf_list +# Snakemake version +snakutils.min_version("8.27.0") -def get_pangolin_input(wildcards): - pangolin_list = [] - if "pangolin" in MODULES: - pangolin_list = expand( - "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - min_cov = MIN_COV, - caller = CALLER - ) - return pangolin_list - -def get_nextclade_input(wildcards): - nextclade_list = [] - if "nextclade" in MODULES: - nextclade_list = expand( - "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv", - reference = REFERENCE, - sample = SAMPLE, - aligner = ALIGNER, - min_cov = MIN_COV, - caller = CALLER - ) - return nextclade_list - -def get_gisaid_input(wildcards): - gisaid_list = [] - if "gisaid" in MODULES: - gisaid_list = expand( - "" - ) - return gisaid_list +# Schemas +#snakutils.validate(config, schema="schemas/config.schema.yaml") ############################################################################### ### WILDCARDS ### @@ -166,7 +72,7 @@ SAMPLE, = glob_wildcards("resources/symlinks/{sample}_R1.fastq.gz") CPUS = config["resources"]["cpus"] # Threads (maximum) RAM = config["resources"]["ram"] # Memory (RAM) in Gb (maximum) -MEM_GB = get_memory_per_thread # Memory per thread in GB (maximum) +MEM_GB = functions.get_memory_per_thread # Memory per 
thread in GB (maximum) TMP_DIR = config["resources"]["tmp_dir"] # Temporary directory ############################################################################### @@ -195,8 +101,6 @@ NEXTCLADE = config["envs"]["nextclade"] # Nextclade conda environment ### PARAMETERS ### ################## -MODULES = config["modules"] # Modules - SUBSET = config["fastq_screen"]["subset"] # Fastq-Screen --subset FQC_CONFIG = config["fastq_screen"]["config"] # Fastq-Screen --conf #MQC_PATH = config["multiqc"]["path"] # MultiQC --conf @@ -211,7 +115,7 @@ SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # MM2 split i BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm BT2_ALGO = config["bowtie2"]["algorithm"] # BT2 indexing algorithm -REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format +#REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format MIN_COV = config["consensus"]["min_cov"] # Minimum coverage, mask lower regions with 'N' IUPAC = config["consensus"]["iupac"] # Output variants in the form of IUPAC ambiguity codes ALIGNER = config["consensus"]["aligner"] # Aligner ('minimap2', 'bwa' or 'bowtie2') @@ -260,1189 +164,35 @@ NEXT_PATH = config["nextclade"]["path"] # Path to nextclade dataset NEXT_DATASET = config["nextclade"]["dataset"] # Nextclade dataset ############################################################################### -### RULES ### -############# -rule all: - input: - multiqc = get_quality_input, - trimming = get_trimming_input, - consensus = get_consensus_input, - flagstat = get_flagstat_input, - covstats = get_covstats_input, - vcf = get_vcf_input, - pangolin = get_pangolin_input, - nextclade = get_nextclade_input, - gisaid = get_gisaid_input - - -############################################################################### -################################## LINEAGES ################################### -############################################################################### - -############################################################################### -rule nextclade_lineage: - # Aim: nextclade lineage assignation - # Use: nextclade [QUERY.fasta] -t [THREADS] --outfile [NAME.csv] - message: - """ - ~ Nextclade ∞ Assign Lineage ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. 
cov.: _______ {wildcards.min_cov}X - Variant caller: __ {wildcards.caller} - """ - conda: - NEXTCLADE - resources: - cpus = CPUS - params: - path = NEXT_PATH, - dataset = NEXT_DATASET - input: - consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" - output: - lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv", - alignment = directory("results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-all/") - log: - "results/10_Reports/tools-log/nextclade/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log" - shell: - "nextclade " # Nextclade, assign queries sequences to clades and reports potential quality issues - "run " # Run analyzis - "--jobs {resources.cpus} " # -j: Number of CPU threads used by the algorithm (default: all available threads) - "--input-dataset {params.path}{params.dataset} " # -raq: Path to a directory containing a dataset (root-seq, tree and qc-config required) - "--output-tsv {output.lineage} " # -t: Path to output TSV results file - "--output-all {output.alignment} " # -O: Produce all of the output files into this directory, using default basename - "{input.consensus} " # Path to a .fasta file with input sequences - "&> {log}" # Log redirection - -############################################################################### -rule pangolin_lineage: - # Aim: lineage mapping - # Use: pangolin [QUERY.fasta] -t [THREADS] --outfile [NAME.csv] - message: - """ - ~ Pangolin ∞ Assign Lineage ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - Variant caller: __ {wildcards.caller} - """ - conda: - PANGOLIN - resources: - cpus = CPUS, - tmp_dir = TMP_DIR - input: - consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" - output: - lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv" - log: - "results/10_Reports/tools-log/pangolin/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log" - shell: - "pangolin " # Pangolinn, Phylogenetic Assignment of Named Global Outbreak LINeages - "{input.consensus} " # Query fasta file of sequences to analyse - "--threads {resources.cpus} " # -t: Number of threads - "--tempdir {resources.tmp_dir} " # Specify where you want the temp stuff to go (default: $TMPDIR) - "--outfile {output.lineage} " # Optional output file name (default: lineage_report.csv) - "&> {log}" # Log redirection - -############################################################################### -################################## CONSENSUS ################################## -############################################################################### - -############################################################################### -rule sed_rename_headers: - # Aim: rename all fasta header with sample name - # Use: sed 's/[OLD]/[NEW]/' [IN] > [OUT] - message: - """ - ~ Sed ∞ Rename Fasta Header ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. 
cov.: _______ {wildcards.min_cov}X - Variant caller: __ {wildcards.caller} - """ - conda: - GAWK - input: - cons_tmp = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta.tmp" - output: - consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" - log: - "results/10_Reports/tools-log/sed/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_fasta-header.log" - shell: - "sed " # Sed, a Stream EDitor used to perform basic text transformations on an input stream - "'s/^>.*$/>{wildcards.sample}_{wildcards.aligner}_{wildcards.min_cov}X_{wildcards.caller}/' " - "{input.cons_tmp} " # Input file - "1> {output.consensus} " # Output file - "2> {log}" # Log redirection - -############################################################################### -########################### VARIANTS CALLING - IVAR ########################### -############################################################################### - -############################################################################### -rule convert_tsv2vcf: - message: - """ - ~ iVar ∞ Convert TSV to VCF file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - Variant caller: __ iVar - """ - conda: - TSV2VCF - input: - tsv = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" - output: - vcf = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-filt.vcf" - log: - "results/10_Reports/tools-log/tsv2vcf/{reference}/{sample}_{aligner}_{min_cov}X_ivar_tsv2vcf.log" - shell: - "python3 " # Python 3 - "workflow/scripts/ivar_variants_to_vcf.py " # Script (from viralrecon) - "{input.tsv} " # TSV input - "{output.vcf} " # VCF output - "&> {log}" # Log redirection - -############################################################################### -rule ivar_consensus: - # Aim: - # Use: - message: - """ - ~ iVar ∞ Call Consensus ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - Variant caller: __ iVar - """ - conda: - IVAR - params: - min_depth = IVAR_MIN_DEPTH, - min_freq = IVAR_MIN_FREQ, - min_insert = IVAR_MIN_INSERT, - max_depth = IVAR_MAX_DEPTH, - min_bq = IVAR_MIN_BQ, - min_qual = IVAR_MIN_QUAL, - baq = IVAR_MAP_QUAL - input: - mark_dup = get_bam_input, - variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" - output: - prefix = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus"), - cons_tmp = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.fasta.tmp"), - qual_txt = "results/05_Consensus/{reference}/ivar_consensus-quality/{sample}_{aligner}_{min_cov}X_ivar_consensus.qual.txt", - log: - "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.log" - shell: - "samtools mpileup " # Samtools mpileup, tools for alignments in the SAM format with command multi-way pileup - "--verbosity 0 " # Set level of verbosity [INT] - "-a " # -a: output all positions (including zero depth) - "-a " # -a -a / -aa: output absolutely all positions, including unused ref. 
sequences - "--count-orphans " # -A: do not discard anomalous read pairs - "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage [INT] (default: 8000) - "{params.baq} " # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality) - "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than [INT] (default: 13) - #"--reference {input.masked_ref} " # Reference sequence FASTA FILE - "{input.mark_dup} " # Markdup BAM input - "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Remove this stdout - "| " ### PIPE to iVar - "ivar consensus " # iVar, with command 'consensus': Call consensus from aligned BAM file - "-p {output.prefix} " # -p: prefix - "-q {params.min_qual} " # -q: Minimum quality score threshold to count base [INT] (Default: 20) - "-t {params.min_freq} " # -t: Minimum frequency threshold to call variants [FLOAT] (Default: 0.03) - "-c {params.min_insert} " # -c: Minimum insertion frequency threshold to call consensus [FLOAT] (Default: 0.8) - "-m {params.min_depth} " # -m: Minimum read depth to call variants [INT] (Default: 0) - "-n N " # -n: Character to print in regions with less than minimum coverage (Default: N) - #"-i {wildcards.sample} " # -i: Name of fasta header (default: Consensus_<prefix>_threshold_<min_freq>_quality_<min_qual>_<min_insert> - "&> {log} " # Log redirection - "&& mv {output.prefix}.fa {output.cons_tmp} " # copy consensus.fa (temp) to consensus.fasta.tmp (tmp) - "&& mv {output.prefix}.qual.txt {output.qual_txt} " # cppty consensus.qual.txt (tmp) to ivar_consensus-quality/ directory - "&& touch {output.prefix}" # Touch done - -############################################################################### -rule ivar_variant_calling: - # Aim: SNVs and Indels calling - # Use: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file] - # Note: samtools mpileup output must be piped into ivar variants - message: - """ - ~ iVar ∞ Call Variants ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - Variant caller: __ iVar - """ - conda: - IVAR - params: - min_depth = IVAR_MIN_DEPTH, - min_freq = IVAR_MIN_FREQ, - min_insert = IVAR_MIN_INSERT, - max_depth = IVAR_MAX_DEPTH, - min_bq = IVAR_MIN_BQ, - min_qual = IVAR_MIN_QUAL, - baq = IVAR_MAP_QUAL - input: - masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta", - mark_dup = get_bam_input - output: - variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" - log: - "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.log" - shell: - "samtools mpileup " # Samtools mpileup, tools for alignments in the SAM format with command multi-way pileup - "--verbosity 0 " # Set level of verbosity [INT] - "-a " # -a: output all positions (including zero depth) - "-a " # -a -a / -aa: output absolutely all positions, including unused ref. 
sequences - "--count-orphans " # -A: do not discard anomalous read pairs - "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage (default: 8000) [INT] - "{params.baq} " # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality) - "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than (default: 13) [INT] - "--reference {input.masked_ref} " # Reference sequence FASTA FILE - "{input.mark_dup} " # Markdup BAM input - "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Remove this stdout - "| " ### pipe to iVar - "ivar variants " # iVar, with command 'variants': Call variants from aligned BAM file - "-p {output.variant_call} " # -p: prefix - "-q {params.min_qual} " # -q: Minimum quality score threshold to count base (Default: 20) [INT] - "-t {params.min_freq} " # -t: Minimum frequency threshold to call variants (Default: 0.03) [FLOAT] - "-m {params.min_depth} " # -m: Minimum read depth to call variants (Default: 0) [INT] - "-r {input.masked_ref} " # -r: Reference file used for alignment (translate the nuc. sequences and identify intra host single nuc. variants) - #"-g " # -g: A GFF file in the GFF3 format can be supplied to specify coordinates of open reading frames (ORFs) - "&> {log}" # Log redirection - - -############################################################################### -############################ MASKING LOW COVERAGE ############################# -############################################################################### - -############################################################################### -rule bedtools_masking: - # Aim: masking low coverage regions - # Use: bedtools maskfasta [OPTIONS] -fi [REFERENCE.fasta] -bed [RANGE.bed] -fo [MASKEDREF.fasta] - message: - """ - ~ BedTools ∞ Mask Low Coverage Regions ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - """ - conda: - BEDTOOLS - input: - reference = "resources/genomes/{reference}.fasta", - low_cov_mask = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed" - output: - masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta" - log: - "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_masking.log" - shell: - "bedtools maskfasta " # Bedtools maskfasta, mask a fasta file based on feature coordinates - "-fi {input.reference} " # Input FASTA file - "-bed {input.low_cov_mask} " # BED/GFF/VCF file of ranges to mask in -fi - "-fo {output.masked_ref} " # Output masked FASTA file - "&> {log}" # Log redirection - -############################################################################### -rule bedtools_merged_mask: - # Aim: merging overlaps - # Use: bedtools merge [OPTIONS] -i [FILTERED.bed] -g [GENOME.fasta] - message: - """ - ~ BedTools ∞ Merge Overlaps ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. 
cov.: _______ {wildcards.min_cov}X - """ - conda: - BEDTOOLS - input: - min_cov_filt = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed" - output: - low_cov_mask = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed") - log: - "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_merging.log" - shell: - "bedtools merge " # Bedtools merge, merges overlapping BED/GFF/VCF entries into a single interval - "-i {input.min_cov_filt} " # -i: BED/GFF/VCF input to merge - "1> {output.low_cov_mask} " # merged output - "2> {log}" # Log redirection - -############################################################################### -rule awk_min_covfilt: - # Aim: minimum coverage filtration - # Use: awk '$4 < [MIN_COV]' [BEDGRAPH.bed] 1> [FILTERED.bed] - message: - """ - ~ Awk ∞ Minimum Coverage Filtration ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - Min. cov.: _______ {wildcards.min_cov}X - """ - conda: - GAWK - input: - genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" - output: - min_cov_filt = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed") - log: - "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_min-cov-filt.log" - shell: - "awk " # Awk, a program that you can use to select particular records in a file and perform operations upon them - "'$4 < {wildcards.min_cov}' " # Minimum coverage for masking regions in consensus sequence - "{input.genome_cov} " # BedGraph coverage input - "1> {output.min_cov_filt} " # Minimum coverage filtered bed output - "2> {log} " # Log redirection - -############################################################################### -################################## STATISTICS ################################# -############################################################################### - -############################################################################### -rule awk_coverage_statistics: - # Aim: computing genomme coverage stats - # Use: awk {FORMULA} END {{print [RESULTS.tsv] [BEDGRAPH.bed] - message: - """ - ~ Awk ∞ Compute Genome Coverage Statistics from BED file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - GAWK - input: - cutadapt = "results/10_Reports/tools-log/cutadapt/{sample}.log", - sickle = "results/10_Reports/tools-log/sickle-trim/{sample}.log", - samtools = "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log", - flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.json", - histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt", - genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" - output: - cov_stats = "results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv" - log: - "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.log" - shell: - r""" rawReads=$(grep -o -E """ # Get raw reads - r""" 'Total read pairs processed:.+' {input.cutadapt} """ # - r""" | sed -E 's/Total read pairs processed:\ +//' """ # - r""" | sed 's/,//g') ; """ # - # - r""" cutadaptPF=$(grep -o -E """ # Get cutadapt Passing Filtes reads - r""" 'Pairs written \(passing filters\):.+' {input.cutadapt} """ # - r""" | sed 
-E 's/Pairs written \(passing filters\):\ +//' """ # - r""" | sed 's/,//g') ; """ # - # - r""" sicklePF=$(grep -o -E """ # Get sickle Passing Filtes reads - r""" 'FastQ paired records kept:.+' {input.sickle} """ # - r""" | sed -E 's/FastQ paired records kept:\ +//') ; """ # - # - r""" totalDuplicate=$(grep -o -E """ # Get total duplicated reads - r""" 'DUPLICATE TOTAL:.+' {input.samtools} """ # - r""" | sed -E 's/DUPLICATE TOTAL:\ +//') ; """ # - # - r""" estimatedLibrarySize=$(grep -o -E """ # Get estimated library size - r""" 'ESTIMATED_LIBRARY_SIZE:.+' {input.samtools} """ # - r""" | sed -E 's/ESTIMATED_LIBRARY_SIZE:\ +//') ; """ # - # - r""" samtoolsPF=$(grep -o -E """ # Get samtool Passing Filter reads - r""" 'WRITTEN: .+' {input.samtools} """ # - r""" | sed -E 's/WRITTEN:\ +//') ; """ # - # - r""" mappedReads=$(grep -o -E -m 1 """ # Get mapped reads - r""" '"mapped": .+' {input.flagstat} """ # - r""" | sed -E 's/"mapped":\ +//' """ # - r""" | sed 's/,//g') ; """ # - # - r""" mappedPercentReads=$(grep -o -E -m 1 """ # Get mapped precent reads - r""" '"mapped %": .+' {input.flagstat} """ # - r""" | sed -E 's/"mapped %":\ +//' """ # - r""" | sed 's/,//g') ; """ # - # - r""" covPercentAt1X=$(grep -o -E """ # Get coverage percent @1X - r""" 'Percent covered:.+' {input.histogram} """ # - r""" | sed -E 's/Percent covered:\ +//') ; """ # - # - r""" awk """ # Awk, a program to select particular records in a file and perform operations upon them - r""" -v rawReads="${{rawReads}}" """ # Define external variable - r""" -v cutadaptPF="${{cutadaptPF}}" """ # Define external variable - r""" -v sicklePF="${{sicklePF}}" """ # Define external variable - r""" -v totalDuplicate="${{totalDuplicate}}" """ # Define external variable - r""" -v estimatedLibrarySize="${{estimatedLibrarySize}}" """ # Define external variable - r""" -v samtoolsPF="${{samtoolsPF}}" """ # Define external variable - r""" -v mappedReads="${{mappedReads}}" """ # Define external variable - r""" -v mappedPercentReads="${{mappedPercentReads}}" """ # Define external variable - r""" -v covPercentAt1X="${{covPercentAt1X}}" """ # Define external variable - r""" '$4 >= {wildcards.min_cov} {{supMin_Cov+=$3-$2}} ; """ # Genome size (>= min_cov @X) - r""" {{genomeSize+=$3-$2}} ; """ # Genome size (total) - r""" {{totalBases+=($3-$2)*$4}} ; """ # Total bases @1X - r""" {{totalBasesSq+=(($3-$2)*$4)**2}} """ # Total bases square @1X - r""" END """ # END - r""" {{print """ # Print - r""" "sample_id", "\t", """ # header: Sample ID - r""" "raw_paired_reads", "\t", """ # header: Raw paired reads - r""" "cutadapt_pairs_PF", "\t", """ # header: Cutadapt Passing Filters - r""" "sickle_reads_PF", "\t", """ # header: Sickle-trim Passing Filters - r""" "duplicated_reads", "\t", """ # header: - r""" "duplicated_percent_%","\t", """ # header: - r""" "estimated_library_size*", "\t", """ # header: - r""" "samtools_pairs_PF", "\t", """ # header: - r""" "mapped_with", "\t", """ # header: aligner - r""" "mapped_on", "\t", """ # header: reference - r""" "mapped_reads", "\t", """ # header: - r""" "mapped_percent_%", "\t", """ # header: - r""" "mean_depth", "\t", """ # header: mean depth - r""" "standard_deviation", "\t", """ # header: standard deviation - r""" "cov_percent_%_@1X", "\t", """ # header: coverage percentage @1X - r""" "cov_percent_%" "\t", """ # header: coverage percentage - r""" "@_min_cov" """ # header: @_[min_cov]_X - r""" ORS """ # \n newline - r""" "{wildcards.sample}", "\t", """ # value: Sample ID - r""" rawReads, "\t", """ # value: Raw sequences 
- r""" cutadaptPF, "\t", """ # value: Cutadapt Passing Filter - r""" sicklePF, "\t", """ # value: Sickle Passing Filter - r""" totalDuplicate, "\t", """ # value: - r""" int(((totalDuplicate)/(rawReads*2))*100), "%", "\t", """ # value: (divided by 2 to estimated pairs) - r""" estimatedLibrarySize, "\t", """ # value: - r""" samtoolsPF, "\t", """ # value: - r""" "{wildcards.aligner}", "\t", """ # value: aligner - r""" "{wildcards.reference}", "\t", """ # value: reference - r""" mappedReads, "\t", """ # value: - r""" mappedPercentReads, "%", "\t", """ # value: - r""" int(totalBases/genomeSize), "\t", """ # value: mean depth - r""" int(sqrt((totalBasesSq/genomeSize)-(totalBases/genomeSize)**2)), "\t", """ # Standard deviation value - r""" covPercentAt1X, "\t", """ # value - r""" supMin_Cov/genomeSize*100, "%", "\t", """ # Coverage percent (@ min_cov X) value - r""" "@{wildcards.min_cov}X" """ # @ min_cov X value - r""" }}' """ # Close print - r""" {input.genome_cov} """ # BedGraph coverage input - r""" 1> {output.cov_stats} """ # Mean depth output - r""" 2> {log}""" # Log redirection - -############################################################################### -rule bedtools_genome_coverage: - # Aim: computing genome coverage sequencing - # Use: bedtools genomecov [OPTIONS] -ibam [MARK-DUP.bam] 1> [BEDGRAPH.bed] - message: - """ - ~ BedTools ∞ Compute Genome Coverage ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - BEDTOOLS - input: - mark_dup = get_bam_input, - index = get_bai_input - output: - genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" - log: - "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_genome-cov.log" - shell: - "bedtools genomecov " # Bedtools genomecov, compute the coverage of a feature file among a genome - "-bga " # Report depth in BedGraph format, regions with zero coverage are also reported - "-ibam {input.mark_dup} " # The input file is in BAM format, must be sorted by position - "1> {output.genome_cov} " # BedGraph output - "2> {log} " # Log redirection - -############################################################################### -rule samtools_coverage_histogram: - # Aim: alignment depth and percent coverage histogram - # Use: samtools coverage --histogram [INPUT.bam] - message: - """ - ~ SamTools ∞ Calcul Depth and Coverage from BAM file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS - #bins = BINS, - #depth = DEPTH - input: - mark_dup = get_bam_input - output: - histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt" - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_coverage-histogram.log" - shell: - "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage - "--histogram " # -m: show histogram instead of tabular output - "--verbosity 4 " # Set level of verbosity [INT] (default: 3) - "--n-bins 149 " # -w: number of bins in histogram (default: terminal width - 40) (todo: {params.bins}) - "--depth 0 " # -d maximum allowed coverage depth [INT] (default: 1000000 ; 0 removing any depth limit) (todo: {params.depth}) - "--output {output.histogram} " # write output to FILE (default: stdout) - "{input.mark_dup} ; " # Mark_dup bam input - "echo >> 
{output.histogram} ; " # Newline - "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage - "--verbosity 4 " # Set level of verbosity [INT] (default: 3) - "{input.mark_dup} " # Mark_dup bam input - ">> {output.histogram} " # write output to FILE (default: stdout) - "2> {log}" # Log redirection - -############################################################################### -rule samtools_flagstat_ext: - # Aim: simple stats - # Use: samtools flagstat -@ [THREADS] [INPUT.bam] - message: - """ - ~ SamTools ∞ Calcul simple Stats from BAM file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS - input: - mark_dup = get_bam_input - output: - flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}" - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_flagstat-{ext}.log" - shell: - "samtools flagstat " # Samtools flagstat, tools for alignments in the SAM format with command to simple stat - "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) - "--verbosity 4 " # Set level of verbosity [INT] (default: 3) - "--output-fmt {wildcards.ext} " # -O Specify output format (none, tsv and json) - "{input.mark_dup} " # Mark_dup bam input - "1> {output.flagstat} " # Mark_dup index output - "2> {log}" # Log redirection - -############################################################################### -############################## PRIMER CLEAPING ############################### -############################################################################### - -############################################################################### -rule bamclipper_amplicon_primers: - # Aim: soft-clip amplicon PCR primers from BAM alignments - # Use: bamclipper.sh -n [THREADS] -b [MARKDUP.bam] -p [PRIMER.bed] -u [UPSTREAM] -d [DOWNSTREAM] - message: - """ - ~ BAMClipper ∞ soft-clipping amplicon PCR primers from BAM alignments ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - BAMCLIPPER - resources: - cpus = CPUS - params: - path = CLIPPATH, - primers = PRIMERS, - upstream = UPSTREAM, - downstream = DOWNSTREAM - input: - markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam", - index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" - output: - bamclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam", - baiclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai" - log: - "results/10_Reports/tools-log/bamclipper/{reference}/{sample}_{aligner}_primers-clip.log" - shell: - "bamclipper.sh " # BAMClipper, remove primer sequences from BAM alignments of PCR amplicons by soft-clipping - "-b {input.markdup} " # Indexed BAM alignment file - "-p {params.path}{params.primers}.bedpe " # BEDPE of primer pair locations - "-n {resources.cpus} " # Number of threads (default: 1) - "-u {params.upstream} " # Number of nuc. upstream for assigning alignments to primers (default: 5) - "-d {params.downstream} " # Number of nuc. 
downstream for assigning alignments to primers (default: 5) - #"-o results/02_Mapping/ " # Path to write output (if BamClipper v.1.1.2) (todo) - "&> {log} " # Log redirection - "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam {output.bamclip} " # because BamClipper v.1 default output system... - "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam.bai {output.baiclip}" # because BamClipper v.1 default output system... - -############################################################################### -############################## REMOVE DUPLICATS ############################### -############################################################################### - -############################################################################### -rule samtools_index_markdup: - # Aim: indexing marked as duplicate BAM file - # Use: samtools index -@ [THREADS] -b [MARK-DUP.bam] [INDEX.bai] - message: - """ - ~ SamTools ∞ Index 'Marked as Duplicate' BAM file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS - input: - mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" - output: - index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup-index.log" - shell: - "samtools index " # Samtools index, tools for alignments in the SAM format with command to index alignment - "-@ {resources.cpus} " # --threads: Number of additional threads to use (default: 1)(NB, --threads form dose'nt work) - "-b " # -b: Generate BAI-format index for BAM files (default) - "{input.mark_dup} " # Mark_dup bam input - "{output.index} " # Mark_dup index output - "&> {log}" # Log redirection - -############################################################################### -rule samtools_markdup: - # Aim: marking duplicate alignments - # Use: samtools markdup -@ [THREADS] -r -s -O BAM [SORTED.bam] [MARK-DUP.bam] - message: - """ - ~ SamTools ∞ Mark Duplicate Alignments ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS - input: - sorted = "results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam" - output: - mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log" - shell: - "samtools markdup " # Samtools markdup, tools for alignments in the SAM format with command mark duplicates - "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) - "-r " # -r: Remove duplicate reads - "-s " # -s: Report stats - "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) - "{input.sorted} " # Sorted bam input - "{output.mark_dup} " # Mark_dup bam output - "&> {log}" # Log redirection - -############################################################################### -rule samtools_sorting: - # Aim: sorting - # Use: samtools sort -@ [THREADS] -m [MEM_GB] -T [TMP_DIR] -O BAM -o [SORTED.bam] [FIX-MATE.bam] - message: - """ - ~ SamTools ∞ Sort BAM file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS, - mem_gb = MEM_GB, - tmp_dir 
= TMP_DIR - input: - fix_mate = "results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam" - output: - sorted = temp("results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam") - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sorted.log" - shell: - "samtools sort " # Samtools sort, tools for alignments in the SAM format with command to sort alignment file - "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) - "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) - "-T {resources.tmp_dir} " # -T: Write temporary files to PREFIX.nnnn.bam - "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) - "-o {output.sorted} " # Sorted bam output - "{input.fix_mate} " # Fixmate bam input - "&> {log}" # Log redirection - -############################################################################### -rule samtools_fixmate: - # Aim: filling in mate coordinates - # Use: samtools fixmate -@ [THREADS] -m -O BAM [SORT-BY-NAMES.bam] [FIX-MATE.bam] - message: - """ - ~ SamTools ∞ Fil Mate Coordinates ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS - input: - sort_by_names = "results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam" - output: - fix_mate = temp("results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam") - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_fix-mate.log" - shell: - "samtools fixmate " # Samtools fixmate, tools for alignments in the SAM format with command to fix mate information - "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) - "-m " # -m: Add mate score tag - "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) - "{input.sort_by_names} " # Sort_by_names bam input - "{output.fix_mate} " # Fix_mate bam output - "&> {log}" # Log redirection - -############################################################################### -rule samtools_sortbynames: - # Aim: sorting by names - # Use: samtools sort -t [THREADS] -m [MEM_GB] -n -O BAM -o [SORT-BY-NAMES.bam] [MAPPED.sam] - message: - """ - ~ SamTools ∞ Sort by Names BAM file ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ {wildcards.aligner} - """ - conda: - SAMTOOLS - resources: - cpus = CPUS, - mem_gb = MEM_GB - input: - mapped = "results/02_Mapping/{reference}/{sample}_{aligner}-mapped.sam" - output: - sort_by_names = temp("results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam") - log: - "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sort-by-names.log" - shell: - "samtools sort " # Samtools sort, tools for alignments in the SAM format with command to sort alignment file - "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) - "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) - "-n " # -n: Sort by read name (not compatible with samtools index command) - "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) - "-o {output.sort_by_names} " # -o: Write final output to FILE rather than standard output - "{input.mapped} " # Mapped reads input - "&> {log}" # Log redirection - - 
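The samtools sort-by-names / fixmate / sort / markdup / index chain above moves into workflow/rules/remove_duplicates.smk (160 lines in the diffstat), whose contents this patch does not show. As a minimal sketch, assuming the rule bodies move over unchanged and that SAMTOOLS and CPUS still resolve from config/config.yaml, the relocated markdup rule would read:

    # workflow/rules/remove_duplicates.smk (sketch, not the committed file)
    rule samtools_markdup:
        # Aim: marking duplicate alignments
        # Use: samtools markdup -@ [THREADS] -r -s -O BAM [SORTED.bam] [MARK-DUP.bam]
        conda:
            SAMTOOLS
        resources:
            cpus = CPUS
        input:
            sorted = "results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam"
        output:
            mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam"
        log:
            "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log"
        shell:
            "samtools markdup "            # Samtools markdup, mark duplicate alignments
            "--threads {resources.cpus} "  # -@: Number of additional threads (default: 1)
            "-r "                          # -r: Remove duplicate reads
            "-s "                          # -s: Report stats
            "--output-fmt BAM "            # -O: Output format (here, BAM)
            "{input.sorted} "              # Sorted BAM input
            "{output.mark_dup} "           # Mark-dup BAM output
            "&> {log}"                     # Log redirection

Note that -r removes duplicates outright rather than only flagging them, so the -s stats written to the log are the only record of what was dropped.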
-############################################################################### -################################### MAPPING ################################### -############################################################################### - -############################################################################### -rule minimap2_mapping: - # Aim: reads mapping against reference sequence - # Use: minimap2 - message: - """ - ~ Minimap2 ∞ Map Reads ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ MiniMap2 - """ - conda: - MINIMAP2 - resources: - cpus = CPUS - params: - preset = MM2_PRESET - #length = LENGTH - input: - mm2_indexes = "resources/indexes/minimap2/{reference}.mmi", - fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", - rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" - output: - mapped = temp("results/02_Mapping/{reference}/{sample}_minimap2-mapped.sam") - log: - "results/10_Reports/tools-log/minimap2/{sample}_{reference}.log" - shell: - "minimap2 " # Minimap2, a versatile sequence alignment program - "-x {params.preset} " # -x: presets (always applied before other options) - "-t {resources.cpus} " # -t: Number of threads (default: 3) - "-a " # -a: output in the SAM format (PAF by default) - #"-F {params.length} " # -F: max fragment length, effective with -x sr mode (default: 800) - "{input.mm2_indexes} " # Reference index filename prefix.mmi - # (-k, -w, -I and -H can't be changed during mapping) - #"resources/genomes/{wildcards.reference}.fasta " # Reference genome fasta format (for custom -kwIH) - #"-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] - #"-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] - #"-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] - #"{params.homopolymer} " # -H: use homopolymer-compressed k-mer (preferrable for PacBio) - "{input.fwd_reads} " # Forward input reads - "{input.rev_reads} " # Reverse input reads - "1> {output.mapped} " # SAM output - "2> {log}" # Log redirection - -############################################################################### -rule bwa_mapping: - # Aim: reads mapping against reference sequence - # Use: bwa mem -t [THREADS] -x [REFERENCE] [FWD_R1.fq] [REV_R2.fq] 1> [MAPPED.sam] - message: - """ - ~ BWA-MEM ∞ Map Reads ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ BWA - """ - conda: - BWA - resources: - cpus = CPUS - input: - bwa_indexes = "resources/indexes/bwa/{reference}", - fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", - rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" - output: - mapped = temp("results/02_Mapping/{reference}/{sample}_bwa-mapped.sam") - log: - "results/10_Reports/tools-log/bwa/{sample}_{reference}.log" - shell: - "bwa mem " # BWA-MEM algorithm, performs local alignment - "-t {resources.cpus} " # -t: Number of threads (default: 12) - "-v 1 " # -v: Verbosity level: 1=error, 2=warning, 3=message, 4+=debugging - "{input.bwa_indexes} " # Reference index filename prefix - "{input.fwd_reads} " # Forward input reads - "{input.rev_reads} " # Reverse input reads - "1> {output.mapped} " # SAM output - "2> {log}" # Log redirection - -############################################################################### -rule bowtie2_mapping: - # Aim: reads mapping against reference 
sequence - # Use: bowtie2 -p [THREADS] -x [REFERENCE] -1 [FWD_R1.fq] -2 [REV_R2.fq] -S [MAPPED.sam] - message: - """ - ~ Bowtie2 ∞ Map Reads ~ - Sample: __________ {wildcards.sample} - Reference: _______ {wildcards.reference} - Aligner: _________ Bowtie2 - """ - conda: - BOWTIE2 - resources: - cpus = CPUS - params: - sensitivity = BT2_SENSITIVITY - input: - bt2_indexes = "resources/indexes/bowtie2/{reference}", - fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", - rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" - output: - mapped = temp("results/02_Mapping/{reference}/{sample}_bowtie2-mapped.sam") - log: - "results/10_Reports/tools-log/bowtie2/{sample}_{reference}.log" - shell: - "bowtie2 " # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences - "--threads {resources.cpus} " # -p: Number of alignment threads to launch (default: 1) - "--reorder " # Keep the original read order (if multi-processor option -p is used) - "-x {input.bt2_indexes} " # -x: Reference index filename prefix (minus trailing .X.bt2) - "{params.sensitivity} " # Preset (default: "--sensitive") - # sensitive: same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15] - "-q " # -q: Query input files are FASTQ .fq/.fastq (default) - "-1 {input.fwd_reads} " # Forward input reads - "-2 {input.rev_reads} " # Reverse input reads - "1> {output.mapped} " # -S: File for SAM output (default: stdout) - "2> {log}" # Log redirection - -############################################################################### -################################## INDEXING ################################### -############################################################################### - -############################################################################### -rule minimap2_genome_indexing: - # Aim: index sequences in the FASTA format - # Use: minimap2 [OPTIONS] -d [INDEX.mmi] <query.fasta> - message: - """ - ~ Minimap2 ∞ Index Genome ~ - Reference: _______ {wildcards.reference} - """ - conda: - MINIMAP2 - params: - kmer_size = KMER_SIZE, - minimizer_size = MINIMIZER_SIZE, - split_size = SPLIT_SIZE - #homopolymer = HOMOPOLYMER - input: - fasta = "resources/genomes/{reference}.fasta" - output: - mm2_indexes = multiext("resources/indexes/minimap2/{reference}", - ".mmi") - log: - "results/10_Reports/tools-log/minimap2-indexes/{reference}.log" - shell: - "minimap2 " # Minimap2, index sequences - "-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] - "-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] - "-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] - #"{params.homopolymer} " # use homopolymer-compressed k-mer (preferrable for PacBio) - "-d {output.mm2_indexes} " # -d: dump index to FILE [] - "{input.fasta} " # Reference sequences in the FASTA format - "&> {log}" # Log redirection - -############################################################################### -rule bwa_genome_indexing: - # Aim: index sequences in the FASTA format - # Use: bwa index -a [ALGO] -p [PREFIX] <genome.fasta> - message: - """ - ~ BWA-SW ∞ Index Genome ~ - Reference: _______ {wildcards.reference} - """ - conda: - BWA - params: - algorithm = BWA_ALGO - input: - fasta = "resources/genomes/{reference}.fasta" - output: - prefix = "resources/indexes/bwa/{reference}", - bwa_indexes = multiext("resources/indexes/bwa/{reference}", - ".amb", ".ann", ".bwt", ".pac", ".sa") - 
log: - "results/10_Reports/tools-log/bwa-indexes/{reference}.log" - shell: - "bwa index " # BWA-SW algorithm, index sequences - "{params.algorithm} " # -a: Algorithm for constructing BWT index (default: auto) - "-p {output.prefix} " # -p: Prefix of the output database - "{input.fasta} " # Reference sequences in the FASTA format - "&> {log} " # Log redirection - "&& touch {output.prefix}" # Touch done - -############################################################################### -rule bowtie2_genome_indexing: - # Aim: index sequences in the FASTA format - # Use: bowtie2-build [OPTIONS] <reference_in> <bt2_index_base> - message: - """ - ~ Bowtie2-build ∞ Index Genome ~ - Reference: _______ {wildcards.reference} - """ - conda: - BOWTIE2 - resources: - cpus = CPUS - params: - algorithm = BT2_ALGO - input: - fasta = "resources/genomes/{reference}.fasta" - output: - prefix = "resources/indexes/bowtie2/{reference}", - bt2_indexes = multiext("resources/indexes/bowtie2/{reference}", - ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", - ".rev.1.bt2", ".rev.2.bt2") - log: - "results/10_Reports/tools-log/bowtie2-indexes/{reference}.log" - shell: - "bowtie2-build " # Bowtie2-build, index sequences - "--quiet " # -q: quiet - "--threads {resources.cpus} " # Number of threads - "{params.algorithm} " # Force (or no by default) generated index to be 'large', - # even if ref has fewer than 4 billion nucleotides - "-f " # Reference files are FASTA (default) - "{input.fasta} " # Reference sequences files (comma-separated list) in the FASTA format - "{output.prefix} " # Write bt2 data to files with this dir/basename - "&> {log} " # Log redirection - "&& touch {output.prefix}" # Touch done - - -############################################################################### -################################## TRIMMING ################################### -############################################################################### - -############################################################################### -rule sickle_trim_quality: - # Aim: windowed adaptive trimming tool for FASTQ files using quality - # Use: sickle [COMMAND] [OPTIONS] - message: - """ - ~ Sickle-trim ∞ Trim Low Quality Sequences ~ - Sample: __________ {wildcards.sample} - """ - conda: - SICKLE_TRIM - params: - command = SIC_COMMAND, - encoding = SIC_ENCODING, - quality = SIC_QUALITY, - length = SIC_LENGTH - input: - fwd_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz", - rev_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz" - output: - fwd_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz"), - rev_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz"), - single = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz") - log: - "results/10_Reports/tools-log/sickle-trim/{sample}.log" - shell: - "sickle " # Sickle, a windowed adaptive trimming tool for FASTQ files using quality - "{params.command} " # Paired-end or single-end sequence trimming - "-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8 - "-q {params.quality} " # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20) - "-l {params.length} " # --length-threshold: Threshold to keep a read based on length after trimming (default: 20) - "-f {input.fwd_reads} " # --pe-file1: Input paired-end forward fastq file - "-r 
{input.rev_reads} " # --pe-file2: Input paired-end reverse fastq file - "-g " # --gzip-output: Output gzipped files - "-o {output.fwd_reads} " # --output-pe1: Output trimmed forward fastq file - "-p {output.rev_reads} " # --output-pe2: Output trimmed reverse fastq file (must use -s option) - "-s {output.single} " # --output-single: Output trimmed singles fastq file - "&> {log} " # Log redirection - -############################################################################### -rule cutadapt_adapters_removing: - # Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads - # Use: cutadapt [OPTIONS] -a/-A [ADAPTER] -o [OUT_FWD.fastq.gz] -p [OUT_REV.fastq.gz] [IN_FWD.fastq.gz] [IN_REV.fastq.gz] - # Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed - message: - """ - ~ Cutadapt ∞ Remove Adapters ~ - Sample: __________ {wildcards.sample} - """ - conda: - CUTADAPT - resources: - cpus = CPUS - params: - length = CUT_LENGTH, - truseq = CUT_TRUSEQ, - nextera = CUT_NEXTERA, - small = CUT_SMALL, - cut = CUT_CLIPPING - input: - fwd_reads = "resources/symlinks/{sample}_R1.fastq.gz", - rev_reads = "resources/symlinks/{sample}_R2.fastq.gz" - output: - fwd_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz"), - rev_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz") - log: - "results/10_Reports/tools-log/cutadapt/{sample}.log" - shell: - "cutadapt " # Cutadapt, finds and removes unwanted sequence from your HT-seq reads - "--cores {resources.cpus} " # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1) - "--cut {params.cut} " # -u: Remove 'n' first bases (5') from forward R1 (hard-clipping, default: 0) - "-U {params.cut} " # -U: Remove 'n' first bases (5') from reverse R2 (hard-clipping, default: 0) - "--trim-n " # --trim-n: Trim N's on ends (3') of reads - "--minimum-length {params.length} " # -m: Discard reads shorter than length - "--adapter {params.truseq} " # -a: Sequence of an adapter ligated to the 3' end of the first read - "-A {params.truseq} " # -A: 3' adapter to be removed from second read in a pair - "--adapter {params.nextera} " # -a: Sequence of an adapter ligated to the 3' end of the first read - "-A {params.nextera} " # -A: 3' adapter to be removed from second read in a pair - "--adapter {params.small} " # -a: Sequence of an adapter ligated to the 3' end of the first read - "-A {params.small} " # -A: 3' adapter to be removed from second read in a pair - "--output {output.fwd_reads} " # -o: Write trimmed reads to FILE - "--paired-output {output.rev_reads} " # -p: Write second read in a pair to FILE - "{input.fwd_reads} " # Input forward reads R1.fastq - "{input.rev_reads} " # Input reverse reads R2.fastq - "&> {log} " # Log redirection +### LOAD RULES ### +################## -############################################################################### -############################### QUALITY CONTROL ############################### -############################################################################### +include: "rules/quality_controls.smk" +include: "rules/reads_cleaning.smk" +include: "rules/indexing_genomes.smk" +include: "rules/reads_mapping.smk" +include: "rules/primer_cleaping.smk" +include: "rules/remove_duplicates.smk" +include: "rules/coverage_stats.smk" +include: "rules/mask_lowcov.smk" +include: "rules/variant_calling.smk" +include: 
"rules/consensus_calling.smk" +include: "rules/lineages_calling.smk" ############################################################################### -rule multiqc_reports_aggregation: - # Aim: aggregates bioinformatics analyses results into a single report - # Use: multiqc [OPTIONS] --output [MULTIQC/] [FASTQC/] [MULTIQC/] - priority: 999 # Explicit high priority - message: - """ - ~ MultiQC ∞ Aggregat HTML Qualities Reports ~ - """ - conda: - MULTIQC - params: - #config = MQC_CONFIG, - #tag = TAG - input: - fastqc = expand("results/00_Quality_Control/fastqc/{fastq}/", - fastq = FASTQ), - fastq_screen = expand("results/00_Quality_Control/fastq-screen/{fastq}/", - fastq = FASTQ) - output: - multiqc = directory("results/00_Quality_Control/multiqc/") - log: - "results/10_Reports/tools-log/multiqc.log" - shell: - "multiqc " # Multiqc, searches in given directories for analysis & compiles a HTML report - "--quiet " # -q: Only show log warning - "--no-ansi " # Disable coloured log - #"--config {params.config} " # Specific config file to load - #"--tag {params.tag} " # Use only modules which tagged with this keyword - #"--pdf " # Creates PDF report with 'simple' template (require xelatex) - "--export " # Export plots as static images in addition to the report - "--outdir {output.multiqc} " # -o: Create report in the specified output directory - "{input.fastqc} " # Input FastQC files - "{input.fastq_screen} " # Input Fastq-Screen - "&> {log}" # Log redirection +### TARGET RULES ### +#################### -############################################################################### -rule fastqscreen_contamination_checking: - # Aim: screen if the composition of the library matches with what you expect - # Use fastq_screen [OPTIONS] --outdir [DIR/] [FASTQ.GZ] - message: - """ - ~ Fasts-Screen ∞ Screen Contamination ~ - Fastq: __________ {wildcards.fastq} - """ - conda: - FASTQ_SCREEN - resources: - cpus = CPUS - params: - config = FQC_CONFIG, - subset = SUBSET +rule all: input: - fastq = "resources/symlinks/{fastq}.fastq.gz" - output: - fastq_screen = directory("results/00_Quality_Control/fastq-screen/{fastq}/") - log: - "results/10_Reports/tools-log/fastq-screen/{fastq}.log" - shell: - "fastq_screen " # FastqScreen, what did you expect ? 
- "-q " # --quiet: Only show log warning - "--threads {resources.cpus} " # --threads: Specifies across how many aligner will be allowed to run - "--aligner 'bwa' " # -a: choose aligner 'bowtie', 'bowtie2', 'bwa' - "--conf {params.config} " # path to configuration file - "--subset {params.subset} " # Don't use the whole sequence file, but create a subset of specified size - "--outdir {output.fastq_screen} " # Output directory - "{input.fastq} " # Input file.fastq - "&> {log}" # Log redirection + multiqc = functions.get_quality_input, + trimming = functions.get_trimming_input, + consensus = functions.get_consensus_input, + flagstat = functions.get_flagstat_input, + covstats = functions.get_covstats_input, + vcf = functions.get_vcf_input, + pangolin = functions.get_pangolin_input, + nextclade = functions.get_nextclade_input, + gisaid = functions.get_gisaid_input -############################################################################### -rule fastqc_quality_control: - # Aim: reads sequence files and produces a quality control report - # Use: fastqc [OPTIONS] --output [DIR/] [FASTQ.GZ] - message: - """ - ~ FastQC ∞ Quality Control ~ - Fastq: __________ {wildcards.fastq} - """ - conda: - FASTQC - resources: - cpus = CPUS - input: - fastq = "resources/symlinks/{fastq}.fastq.gz" - output: - fastqc = directory("results/00_Quality_Control/fastqc/{fastq}/") - log: - "results/10_Reports/tools-log/fastqc/{fastq}.log" - shell: - "mkdir -p {output.fastqc} " # (*) this directory must exist as the program will not create it - "2> /dev/null && " # in silence and then... - "fastqc " # FastQC, a high throughput sequence QC analysis tool - "--quiet " # -q: Supress all progress messages on stdout and only report errors - "--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously - "--outdir {output.fastqc} " # -o: Create all output files in the specified output directory (*) - "{input.fastq} " # Input file.fastq - "&> {log}" # Log redirection -############################################################################### diff --git a/workflow/rules/gevarli.smk b/workflow/Snakefile_SAVE old mode 100755 new mode 100644 similarity index 95% rename from workflow/rules/gevarli.smk rename to workflow/Snakefile_SAVE index ba9ad8c..fed2347 --- a/workflow/rules/gevarli.smk +++ b/workflow/Snakefile_SAVE @@ -8,18 +8,21 @@ ### ### ###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### # Name ___________________ gevarli.smk -# Version ________________ v.2024.08 +# Version ________________ v.2025.01 # Author _________________ Nicolas Fernandez # Affiliation ____________ IRD_U233_TransVIHMI # Aim ____________________ Snakefile with GeVarLi rules # Date ___________________ 2021.10.12 -# Latest modifications ___ 2024.08.05 ('Noarch' conda environment yaml files) -# Use ____________________ snakemake -s gevarli.smk --use-conda +# Latest modifications ___ 2025.01.08 (Prepare for Snakedeploy) +# Use ____________________ snakemake -s Snakemake --use-conda -j ############################################################################### ### CONFIGURATION ### ##################### +from snakemake.utils import min_version +min_version("8.27.0") + configfile: "configuration/config.yaml" ############################################################################### @@ -46,7 +49,7 @@ def get_trimming_input(wildcards): return trimming_list def stash_or_trash(path): - if "trimming" in MODULES: + if "keeptrim" in MODULES: return path else: return temp(path) @@ -157,8 
@@ -157,8 +160,8 @@ def get_gisaid_input(wildcards):
 ### WILDCARDS ###
 #################
 
-FASTQ, = glob_wildcards("resources/reads/{fastq}.fastq.gz")
-SAMPLE, = glob_wildcards("resources/reads/{sample}_R1.fastq.gz")
+FASTQ, = glob_wildcards("resources/symlinks/{fastq}.fastq.gz")
+SAMPLE, = glob_wildcards("resources/symlinks/{sample}_R1.fastq.gz")
 
 ###############################################################################
 ### RESOURCES ###
@@ -173,23 +176,23 @@ TMP_DIR = config["resources"]["tmp_dir"] # Temporary directory
 ### ENVIRONMENTS ###
 ####################
 
-MULTIQC = config["conda"]["yaml"]["multiqc"] # Multi-QC conda env
-FASTQ_SCREEN = config["conda"]["yaml"]["fastq_screen"] # Fastq-Screen conda env
-FASTQC= config["conda"]["yaml"]["fastqc"] # FastQC conda env
-CUTADAPT = config["conda"]["yaml"]["cutadapt"] # Cutadapt conda environment
-SICKLE_TRIM = config["conda"]["yaml"]["sickle_trim"] # Sickle-Trim conda environment
-MINIMAP2 = config["conda"]["yaml"]["minimap2"] # BWA conda environment
-BWA = config["conda"]["yaml"]["bwa"] # BWA conda environment
-BOWTIE2 = config["conda"]["yaml"]["bowtie2"] # Bowtie2 conda environment
-SAMTOOLS = config["conda"]["yaml"]["samtools"] # SamTools conda environment
-BEDTOOLS = config["conda"]["yaml"]["bedtools"] # BedTools conda environment
-BAMCLIPPER = config["conda"]["yaml"]["bamclipper"] # BAMClipper
-GAWK = config["conda"]["yaml"]["gawk"] # Awk (GNU) conda environment
-IVAR = config["conda"]["yaml"]["ivar"] # iVar conda environment
-TSV2VCF = config["conda"]["yaml"]["tsv2vcf"] # tsv2vcf conda environment
-BCFTOOLS = config["conda"]["yaml"]["bcftools"] # BcfTools conda environment
-PANGOLIN = config["conda"]["yaml"]["pangolin"] # Pangolin conda environment
-NEXTCLADE = config["conda"]["yaml"]["nextclade"] # Nextclade conda environment
+MULTIQC = config["envs"]["multiqc"] # Multi-QC conda env
+FASTQ_SCREEN = config["envs"]["fastq_screen"] # Fastq-Screen conda env
+FASTQC = config["envs"]["fastqc"] # FastQC conda env
+CUTADAPT = config["envs"]["cutadapt"] # Cutadapt conda environment
+SICKLE_TRIM = config["envs"]["sickle_trim"] # Sickle-Trim conda environment
+MINIMAP2 = config["envs"]["minimap2"] # Minimap2 conda environment
+BWA = config["envs"]["bwa"] # BWA conda environment
+BOWTIE2 = config["envs"]["bowtie2"] # Bowtie2 conda environment
+SAMTOOLS = config["envs"]["samtools"] # SamTools conda environment
+BEDTOOLS = config["envs"]["bedtools"] # BedTools conda environment
+BAMCLIPPER = config["envs"]["bamclipper"] # BAMClipper conda environment
+GAWK = config["envs"]["gawk"] # Awk (GNU) conda environment
+IVAR = config["envs"]["ivar"] # iVar conda environment
+TSV2VCF = config["envs"]["tsv2vcf"] # tsv2vcf conda environment
+BCFTOOLS = config["envs"]["bcftools"] # BcfTools conda environment
+PANGOLIN = config["envs"]["pangolin"] # Pangolin conda environment
+NEXTCLADE = config["envs"]["nextclade"] # Nextclade conda environment
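The environment lookups collapse here from the nested config["conda"]["yaml"][tool] table to a flat config["envs"][tool] table. A sketch of the access pattern the new layout expects, assuming a config.yaml that carries an 'envs' mapping (keys and paths illustrative):

import yaml  # PyYAML, the parser Snakemake itself uses for configfile

with open("config/config.yaml") as handle:
    config = yaml.safe_load(handle)

# One flat lookup per tool, e.g. envs: {bwa: envs/bwa.yaml, multiqc: envs/multiqc.yaml}
BWA = config["envs"]["bwa"]
MULTIQC = config["envs"]["multiqc"]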
 
 ###############################################################################
 ### PARAMETERS ###
 ##################
@@ -207,11 +210,11 @@ KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # MM2 k-mer s
 MINIMIZER_SIZE = config["minimap2"]["algorithm"]["minimizer_size"] # MM2 minimizer window size
 SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # MM2 split index
 #HOMOPOLYMER = config["minimap2"]["algorithm"]["homopolymer"] # MM2 if PacBio
+
 BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm
 BT2_ALGO = config["bowtie2"]["algorithm"] # BT2 indexing algorithm
 REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format
-REF_PATH = config["consensus"]["path"] # Path to genomes references
 MIN_COV = config["consensus"]["min_cov"] # Minimum coverage, mask lower regions with 'N'
 IUPAC = config["consensus"]["iupac"] # Output variants in the form of IUPAC ambiguity codes
 ALIGNER = config["consensus"]["aligner"] # Aligner ('minimap2', 'bwa' or 'bowtie2')
@@ -228,7 +231,6 @@ SIC_ENCODING = config["sickle_trim"]["encoding"] # Sickle-trim --qual-type
 SIC_QUALITY = config["sickle_trim"]["quality"] # Sickle-trim --qual-threshold
 SIC_LENGTH = config["sickle_trim"]["length"] # Sickle-trim --length-threshold
 
-MM2_PATH = config["minimap2"]["path"] # Minimpa2 path to indexes
 MM2_PRESET = config["minimap2"]["preset"] # Minimap2 preset
 #MM2_LENGTH = config["minimap2"]["length"] # Minimap2 length
 #MM2_KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # Minimap2 k-mer size
@@ -236,10 +238,8 @@ MM2_PRESET = config["minimap2"]["preset"] # Minimap2 preset
 #MM2_SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # Minimap2 split index
 #MM2_HOMOPOLYMER = config["minimap2"]["algorithm"]["homopolymer"] # Minimap2 for PacBio
 
-BWA_PATH = config["bwa"]["path"] # BWA path to indexes
 BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm
 
-BT2_PATH = config["bowtie2"]["path"] # Bowtie2 path to indexes
 BT2_ALGO = config["bowtie2"]["algorithm"] # Bowtie2 indexing algorithm
 BT2_SENSITIVITY = config["bowtie2"]["sensitivity"] # Bowtie2 sensitivity preset
@@ -456,6 +456,7 @@ rule ivar_consensus:
         "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than [INT] (default: 13)
         #"--reference {input.masked_ref} " # Reference sequence FASTA FILE
         "{input.mark_dup} " # Markdup BAM input
+        "2>&1 | grep -v -F '[mpileup] 1 samples in 1 input files' " # -F: drop the mpileup notice, matched literally
         "| " ### PIPE to iVar
         "ivar consensus " # iVar, with command 'consensus': Call consensus from aligned BAM file
         "-p {output.prefix} " # -p: prefix
@@ -512,6 +513,7 @@ rule ivar_variant_calling:
         "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than (default: 13) [INT]
         "--reference {input.masked_ref} " # Reference sequence FASTA FILE
         "{input.mark_dup} " # Markdup BAM input
+        "2>&1 | grep -v -F '[mpileup] 1 samples in 1 input files' " # -F: drop the mpileup notice, matched literally
         "| " ### pipe to iVar
         "ivar variants " # iVar, with command 'variants': Call variants from aligned BAM file
         "-p {output.variant_call} " # -p: prefix
@@ -541,20 +543,19 @@ rule bedtools_masking:
         """
     conda:
         BEDTOOLS
-    params:
-        path = REF_PATH
     input:
+        reference = "resources/genomes/{reference}.fasta",
         low_cov_mask = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed"
     output:
         masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta"
     log:
         "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_masking.log"
    shell:
-        "bedtools maskfasta " # Bedtools maskfasta, mask a fasta file based on feature coordinates
-        "-fi {params.path}{wildcards.reference}.fasta " # Input FASTA file
-        "-bed {input.low_cov_mask} " # BED/GFF/VCF file of ranges to mask in -fi
-        "-fo {output.masked_ref} " # Output masked FASTA file
-        "&> {log}" # Log redirection
+        "bedtools maskfasta " # Bedtools maskfasta, mask a fasta file based on feature coordinates
+        "-fi {input.reference} " # Input FASTA file
+        "-bed {input.low_cov_mask} " # BED/GFF/VCF file of ranges to mask in -fi
+        "-fo {output.masked_ref} " # Output masked FASTA file
+        "&> {log}" # Log redirection
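A note on the mpileup filter added to the two iVar rules above: in a POSIX regex, [mpileup] is a bracket expression matching a single character, so the unescaped pattern never matches the literal notice; the lines therefore pass -F to grep so the string is matched verbatim. The same pitfall demonstrated in Python (strings illustrative):

import re

notice = "[mpileup] 1 samples in 1 input files"
# Used as a regex, "[mpileup]" is a character class, so the raw pattern misses:
assert re.search(notice, notice) is None
# Escaping (the analogue of grep -F) restores literal matching:
assert re.search(re.escape(notice), notice) is not None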
############################################################################### rule bedtools_merged_mask: @@ -874,7 +875,7 @@ rule bamclipper_amplicon_primers: "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam.bai {output.baiclip}" # because BamClipper v.1 default output system... ############################################################################### -############################## REMOVE DUPLICATS ############################### +############################# REMOVE DUPLICATES ############################### ############################################################################### ############################################################################### @@ -1056,7 +1057,7 @@ rule minimap2_mapping: preset = MM2_PRESET #length = LENGTH input: - mm2_indexes = "resources/indexes/minimap2/{reference}.mmi" + mm2_indexes = "resources/indexes/minimap2/{reference}.mmi", fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" output: @@ -1097,7 +1098,7 @@ rule bwa_mapping: resources: cpus = CPUS input: - bwa_indexes = "resources/indexes/bwa/{reference}" + bwa_indexes = "resources/indexes/bwa/{reference}", fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" output: @@ -1143,7 +1144,7 @@ rule bowtie2_mapping: "bowtie2 " # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences "--threads {resources.cpus} " # -p: Number of alignment threads to launch (default: 1) "--reorder " # Keep the original read order (if multi-processor option -p is used) - "-x {input.indexes} " # -x: Reference index filename prefix (minus trailing .X.bt2) + "-x {input.bt2_indexes} " # -x: Reference index filename prefix (minus trailing .X.bt2) "{params.sensitivity} " # Preset (default: "--sensitive") # sensitive: same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15] "-q " # -q: Query input files are FASTQ .fq/.fastq (default) @@ -1176,7 +1177,7 @@ rule minimap2_genome_indexing: fasta = "resources/genomes/{reference}.fasta" output: mm2_indexes = multiext("resources/indexes/minimap2/{reference}", - ".mmi") + ".mmi") log: "results/10_Reports/tools-log/minimap2-indexes/{reference}.log" shell: @@ -1185,7 +1186,7 @@ rule minimap2_genome_indexing: "-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] "-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] #"{params.homopolymer} " # use homopolymer-compressed k-mer (preferrable for PacBio) - "-d {output.indexes} " # -d: dump index to FILE [] + "-d {output.mm2_indexes} " # -d: dump index to FILE [] "{input.fasta} " # Reference sequences in the FASTA format "&> {log}" # Log redirection @@ -1205,14 +1206,14 @@ rule bwa_genome_indexing: input: fasta = "resources/genomes/{reference}.fasta" output: - prefix = temp("resources/indexes/bwa/{reference}"), - indexes = multiext("resources/indexes/bwa/{ref_seq}", - ".amb", ".ann", ".bwt", ".pac", ".sa") + prefix = "resources/indexes/bwa/{reference}", + bwa_indexes = multiext("resources/indexes/bwa/{reference}", + ".amb", ".ann", ".bwt", ".pac", ".sa") log: "results/10_Reports/tools-log/bwa-indexes/{reference}.log" shell: "bwa index " # BWA-SW algorithm, index sequences - "{params.algorithm} " # -a: Algorithm for constructing BWT index (default: auto) + "{params.algorithm} " # -a: Algorithm for 
constructing BWT index (default: auto) "-p {output.prefix} " # -p: Prefix of the output database "{input.fasta} " # Reference sequences in the FASTA format "&> {log} " # Log redirection @@ -1236,10 +1237,10 @@ rule bowtie2_genome_indexing: input: fasta = "resources/genomes/{reference}.fasta" output: - prefix = temp("resources/indexes/bowtie2/{reference}"), - indexes = multiext("resources/indexes/bowtie2/{ref_seq}", - ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", - ".rev.1.bt2", ".rev.2.bt2") + prefix = "resources/indexes/bowtie2/{reference}", + bt2_indexes = multiext("resources/indexes/bowtie2/{reference}", + ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", + ".rev.1.bt2", ".rev.2.bt2") log: "results/10_Reports/tools-log/bowtie2-indexes/{reference}.log" shell: @@ -1319,10 +1320,8 @@ rule cutadapt_adapters_removing: small = CUT_SMALL, cut = CUT_CLIPPING input: - fwd_reads = "resources/reads/{sample}_R1.fastq.gz", - rev_reads = "resources/reads/{sample}_R2.fastq.gz" - #fwd_reads = "/users/illumina/local/data/run_1/FATSQ/{sample}_R1.fastq.gz", - #rev_reads = "/users/illumina/local/data/run_1/FATSQ/{sample}_R2.fastq.gz" + fwd_reads = "resources/symlinks/{sample}_R1.fastq.gz", + rev_reads = "resources/symlinks/{sample}_R2.fastq.gz" output: fwd_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz"), rev_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz") @@ -1404,7 +1403,7 @@ rule fastqscreen_contamination_checking: config = FQC_CONFIG, subset = SUBSET input: - fastq = "resources/reads/{fastq}.fastq.gz" + fastq = "resources/symlinks/{fastq}.fastq.gz" output: fastq_screen = directory("results/00_Quality_Control/fastq-screen/{fastq}/") log: @@ -1434,7 +1433,7 @@ rule fastqc_quality_control: resources: cpus = CPUS input: - fastq = "resources/reads/{fastq}.fastq.gz" + fastq = "resources/symlinks/{fastq}.fastq.gz" output: fastqc = directory("results/00_Quality_Control/fastqc/{fastq}/") log: diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst new file mode 100644 index 0000000..e69de29 diff --git a/workflow/rules/consensus_calling.smk b/workflow/rules/consensus_calling.smk new file mode 100644 index 0000000..e5c22c7 --- /dev/null +++ b/workflow/rules/consensus_calling.smk @@ -0,0 +1,33 @@ +############################################################################### +################################## CONSENSUS ################################## +############################################################################### + +############################################################################### +rule sed_rename_headers: + # Aim: rename all fasta header with sample name + # Use: sed 's/[OLD]/[NEW]/' [IN] > [OUT] + message: + """ + ~ Sed ∞ Rename Fasta Header ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ {wildcards.caller}
+        """
+    conda:
+        GAWK
+    input:
+        cons_tmp = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta.tmp"
+    output:
+        consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta"
+    log:
+        "results/10_Reports/tools-log/sed/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_fasta-header.log"
+    shell:
+        "sed " # Sed, a Stream EDitor used to perform basic text transformations on an input stream
+        "'s/^>.*$/>{wildcards.sample}_{wildcards.aligner}_{wildcards.min_cov}X_{wildcards.caller}/' "
+        "{input.cons_tmp} " # Input file
+        "1> {output.consensus} " # Output file
+        "2> {log}" # Log redirection
+
+###############################################################################
diff --git a/workflow/rules/coverage_stats.smk b/workflow/rules/coverage_stats.smk
new file mode 100644
index 0000000..74bee1b
--- /dev/null
+++ b/workflow/rules/coverage_stats.smk
@@ -0,0 +1,221 @@
+###############################################################################
+################################## STATISTICS #################################
+###############################################################################
+
+###############################################################################
+rule awk_coverage_statistics:
+    # Aim: computing genome coverage stats
+    # Use: awk '{FORMULA}' [BEDGRAPH.bed] 1> [RESULTS.tsv]
+    message:
+        """
+        ~ Awk ∞ Compute Genome Coverage Statistics from BED file ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        """
+    conda:
+        GAWK
+    input:
+        cutadapt = "results/10_Reports/tools-log/cutadapt/{sample}.log",
+        sickle = "results/10_Reports/tools-log/sickle-trim/{sample}.log",
+        samtools = "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log",
+        flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.json",
+        histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt",
+        genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed"
+    output:
+        cov_stats = "results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv"
+    log:
+        "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.log"
+    shell:
+        r""" rawReads=$(grep -o -E """ # Get raw reads
+        r""" 'Total read pairs processed:.+' {input.cutadapt} """ #
+        r""" | sed -E 's/Total read pairs processed:\ +//' """ #
+        r""" | sed 's/,//g') ; """ #
+        #
+        r""" cutadaptPF=$(grep -o -E """ # Get cutadapt Passing Filters reads
+        r""" 'Pairs written \(passing filters\):.+' {input.cutadapt} """ #
+        r""" | sed -E 's/Pairs written \(passing filters\):\ +//' """ #
+        r""" | sed 's/,//g') ; """ #
+        #
+        r""" sicklePF=$(grep -o -E """ # Get sickle Passing Filters reads
+        r""" 'FastQ paired records kept:.+' {input.sickle} """ #
+        r""" | sed -E 's/FastQ paired records kept:\ +//') ; """ #
+        #
+        r""" totalDuplicate=$(grep -o -E """ # Get total duplicated reads
+        r""" 'DUPLICATE TOTAL:.+' {input.samtools} """ #
+        r""" | sed -E 's/DUPLICATE TOTAL:\ +//') ; """ #
+        #
+        r""" estimatedLibrarySize=$(grep -o -E """ # Get estimated library size
+        r""" 'ESTIMATED_LIBRARY_SIZE:.+' {input.samtools} """ #
+        r""" | sed -E 's/ESTIMATED_LIBRARY_SIZE:\ +//') ; """ #
+        #
+        r""" samtoolsPF=$(grep -o -E """ # Get samtools Passing Filters reads
+        r""" 'WRITTEN: .+' {input.samtools} """ #
+        r""" | sed -E 's/WRITTEN:\ +//') ; """ #
+        #
+        r""" mappedReads=$(grep -o -E -m 1 """ # Get mapped reads
+        r""" '"mapped": .+' {input.flagstat} """ #
+        r""" | sed -E 's/"mapped":\ +//' """ #
+        r""" | sed 's/,//g') ; """ #
+        #
+        r""" mappedPercentReads=$(grep -o -E -m 1 """ # Get mapped percent reads
+        r""" '"mapped %": .+' {input.flagstat} """ #
+        r""" | sed -E 's/"mapped %":\ +//' """ #
+        r""" | sed 's/,//g') ; """ #
+        #
+        r""" covPercentAt1X=$(grep -o -E """ # Get coverage percent @1X
+        r""" 'Percent covered:.+' {input.histogram} """ #
+        r""" | sed -E 's/Percent covered:\ +//') ; """ #
+        #
+        r""" awk """ # Awk, a program to select particular records in a file and perform operations upon them
+        r""" -v rawReads="${{rawReads}}" """ # Define external variable
+        r""" -v cutadaptPF="${{cutadaptPF}}" """ # Define external variable
+        r""" -v sicklePF="${{sicklePF}}" """ # Define external variable
+        r""" -v totalDuplicate="${{totalDuplicate}}" """ # Define external variable
+        r""" -v estimatedLibrarySize="${{estimatedLibrarySize}}" """ # Define external variable
+        r""" -v samtoolsPF="${{samtoolsPF}}" """ # Define external variable
+        r""" -v mappedReads="${{mappedReads}}" """ # Define external variable
+        r""" -v mappedPercentReads="${{mappedPercentReads}}" """ # Define external variable
+        r""" -v covPercentAt1X="${{covPercentAt1X}}" """ # Define external variable
+        r""" '$4 >= {wildcards.min_cov} {{supMin_Cov+=$3-$2}} ; """ # Genome size (>= min_cov @X)
+        r""" {{genomeSize+=$3-$2}} ; """ # Genome size (total)
+        r""" {{totalBases+=($3-$2)*$4}} ; """ # Total bases @1X
+        r""" {{totalBasesSq+=(($3-$2)*$4)**2}} """ # Total bases square @1X
+        r""" END """ # END
+        r""" {{print """ # Print
+        r""" "sample_id", "\t", """ # header: Sample ID
+        r""" "raw_paired_reads", "\t", """ # header: Raw paired reads
+        r""" "cutadapt_pairs_PF", "\t", """ # header: Cutadapt Passing Filters
+        r""" "sickle_reads_PF", "\t", """ # header: Sickle-trim Passing Filters
+        r""" "duplicated_reads", "\t", """ # header: duplicated reads
+        r""" "duplicated_percent_%","\t", """ # header: duplicated percent
+        r""" "estimated_library_size*", "\t", """ # header: estimated library size
+        r""" "samtools_pairs_PF", "\t", """ # header: SamTools Passing Filters
+        r""" "mapped_with", "\t", """ # header: aligner
+        r""" "mapped_on", "\t", """ # header: reference
+        r""" "mapped_reads", "\t", """ # header: mapped reads
+        r""" "mapped_percent_%", "\t", """ # header: mapped percent
+        r""" "mean_depth", "\t", """ # header: mean depth
+        r""" "standard_deviation", "\t", """ # header: standard deviation
+        r""" "cov_percent_%_@1X", "\t", """ # header: coverage percentage @1X
+        r""" "cov_percent_%" "\t", """ # header: coverage percentage
+        r""" "@_min_cov" """ # header: @_[min_cov]_X
+        r""" ORS """ # \n newline
+        r""" "{wildcards.sample}", "\t", """ # value: Sample ID
+        r""" rawReads, "\t", """ # value: Raw sequences
+        r""" cutadaptPF, "\t", """ # value: Cutadapt Passing Filter
+        r""" sicklePF, "\t", """ # value: Sickle Passing Filter
+        r""" totalDuplicate, "\t", """ # value: total duplicates
+        r""" int(((totalDuplicate)/(rawReads*2))*100), "%", "\t", """ # value: duplicated percent (pairs counted as 2 reads)
+        r""" estimatedLibrarySize, "\t", """ # value: estimated library size
+        r""" samtoolsPF, "\t", """ # value: SamTools Passing Filter
+        r""" "{wildcards.aligner}", "\t", """ # value: aligner
+        r""" "{wildcards.reference}", "\t", """ # value: reference
+        r""" mappedReads, "\t", """ # value: mapped reads
+        r""" mappedPercentReads, "%", "\t", """ # value: mapped percent
+        r""" int(totalBases/genomeSize), "\t", """ # value: mean depth
+        r""" int(sqrt((totalBasesSq/genomeSize)-(totalBases/genomeSize)**2)), "\t", """ # Standard deviation value
+        r""" covPercentAt1X, "\t", """ # value
+        r""" supMin_Cov/genomeSize*100, "%", "\t", """ # Coverage percent (@ min_cov X) value
+        r""" "@{wildcards.min_cov}X" """ # @ min_cov X value
+        r""" }}' """ # Close print
+        r""" {input.genome_cov} """ # BedGraph coverage input
+        r""" 1> {output.cov_stats} """ # Coverage stats output
+        r""" 2> {log}""" # Log redirection
+
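The rule above shells out to grep/sed/awk to merge per-tool log metrics with BedGraph depth arithmetic. A rough Python equivalent of the depth part, mirroring the rule's own accumulators (including its standard-deviation expression) and the flagstat JSON lookup; file names and the min_cov value are illustrative:

import json
from math import sqrt

min_cov = 30  # hypothetical: config["consensus"]["min_cov"] in the workflow
genome_size = total_bases = total_bases_sq = covered = 0

with open("sample_bwa_genome-cov.bed") as bedgraph:  # illustrative name
    for line in bedgraph:
        chrom, start, end, depth = line.split()
        span, depth = int(end) - int(start), int(depth)
        genome_size += span
        total_bases += span * depth
        total_bases_sq += (span * depth) ** 2  # mirrors the awk accumulator
        if depth >= min_cov:
            covered += span

mean_depth = total_bases / genome_size
std_dev = sqrt(total_bases_sq / genome_size - mean_depth ** 2)  # the rule's formula
cov_percent = covered / genome_size * 100

with open("sample_bwa_flagstat.json") as handle:  # illustrative name
    mapped = json.load(handle)["QC-passed reads"]["mapped"]  # samtools -O json layout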
+ r""" covPercentAt1X, "\t", """ # value + r""" supMin_Cov/genomeSize*100, "%", "\t", """ # Coverage percent (@ min_cov X) value + r""" "@{wildcards.min_cov}X" """ # @ min_cov X value + r""" }}' """ # Close print + r""" {input.genome_cov} """ # BedGraph coverage input + r""" 1> {output.cov_stats} """ # Mean depth output + r""" 2> {log}""" # Log redirection + +############################################################################### +rule bedtools_genome_coverage: + # Aim: computing genome coverage sequencing + # Use: bedtools genomecov [OPTIONS] -ibam [MARK-DUP.bam] 1> [BEDGRAPH.bed] + message: + """ + ~ BedTools ∞ Compute Genome Coverage ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + BEDTOOLS + input: + mark_dup = get_bam_input, + index = get_bai_input + output: + genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" + log: + "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_genome-cov.log" + shell: + "bedtools genomecov " # Bedtools genomecov, compute the coverage of a feature file among a genome + "-bga " # Report depth in BedGraph format, regions with zero coverage are also reported + "-ibam {input.mark_dup} " # The input file is in BAM format, must be sorted by position + "1> {output.genome_cov} " # BedGraph output + "2> {log} " # Log redirection + +############################################################################### +rule samtools_coverage_histogram: + # Aim: alignment depth and percent coverage histogram + # Use: samtools coverage --histogram [INPUT.bam] + message: + """ + ~ SamTools ∞ Calcul Depth and Coverage from BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + #bins = BINS, + #depth = DEPTH + input: + mark_dup = get_bam_input + output: + histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_coverage-histogram.log" + shell: + "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage + "--histogram " # -m: show histogram instead of tabular output + "--verbosity 4 " # Set level of verbosity [INT] (default: 3) + "--n-bins 149 " # -w: number of bins in histogram (default: terminal width - 40) (todo: {params.bins}) + "--depth 0 " # -d maximum allowed coverage depth [INT] (default: 1000000 ; 0 removing any depth limit) (todo: {params.depth}) + "--output {output.histogram} " # write output to FILE (default: stdout) + "{input.mark_dup} ; " # Mark_dup bam input + "echo >> {output.histogram} ; " # Newline + "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage + "--verbosity 4 " # Set level of verbosity [INT] (default: 3) + "{input.mark_dup} " # Mark_dup bam input + ">> {output.histogram} " # write output to FILE (default: stdout) + "2> {log}" # Log redirection + +############################################################################### +rule samtools_flagstat_ext: + # Aim: simple stats + # Use: samtools flagstat -@ [THREADS] [INPUT.bam] + message: + """ + ~ SamTools ∞ Calcul simple Stats from BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ 
+        """
+    conda:
+        SAMTOOLS
+    resources:
+        cpus = CPUS
+    input:
+        mark_dup = get_bam_input
+    output:
+        flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}"
+    log:
+        "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_flagstat-{ext}.log"
+    shell:
+        "samtools flagstat " # Samtools flagstat, compute simple alignment statistics
+        "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1)
+        "--verbosity 4 " # Set level of verbosity [INT] (default: 3)
+        "--output-fmt {wildcards.ext} " # -O: Specify output format (none, tsv and json)
+        "{input.mark_dup} " # Mark_dup bam input
+        "1> {output.flagstat} " # Flagstat output
+        "2> {log}" # Log redirection
+
+###############################################################################
diff --git a/workflow/rules/indexing_genomes.smk b/workflow/rules/indexing_genomes.smk
old mode 100755
new mode 100644
index b25bf1a..7d238bb
--- a/workflow/rules/indexing_genomes.smk
+++ b/workflow/rules/indexing_genomes.smk
@@ -1,81 +1,36 @@
-###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I####
-### ###
-### /\ ______ ___ ____ _ _ __ ____ __ ____ ______ /\ ###
-### || \ \ \ \ / __( ___( \/ )__\ ( _ ( ) (_ _) / / / / || ###
-### || > > > > ( (_-.)__) \ /(__)\ ) /)(__ _)(_ < < < < || ###
-### || /_/_/_/ \___(____) \(__)(__(_)\_(____(____) \_\_\_\ || ###
-### \/ \/ ###
-### ###
-###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I####
-# Name ___________________ indexing_genomes.smk
-# Version ________________ v.2023.08
-# Author _________________ Nicolas Fernandez
-# Affiliation ____________ IRD_U233_TransVIHMI
-# Aim ____________________ Snakefile with indexing genomes rules
-# Date ___________________ 2022.09.28
-# Latest modifications ___ 2024.08.05 ('Noarch' conda environment yaml files)
-# Use ____________________ snakemake -s indexing_genomes.smk --use-conda
-
-###############################################################################
-### CONFIGURATION ###
-#####################
-
-configfile: "configuration/config.yaml"
-
 ###############################################################################
-### FUNCTIONS ###
-#################
-
-
-###############################################################################
-### WILDCARDS ###
-#################
-
-REF_SEQ, = glob_wildcards("resources/genomes/{ref_seq}.fasta")
-
-###############################################################################
-### RESOURCES ###
-#################
-
-CPUS = config["resources"]["cpus"] # Threads (maximum)
-
-###############################################################################
-### ENVIRONMENTS ###
-####################
-
-MINIMAP2 = config["conda"]["yaml"]["minimap2"] # MM2 conda env
-BWA = config["conda"]["yaml"]["bwa"] # BWA conda env
-BOWTIE2 = config["conda"]["yaml"]["bowtie2"] # BT2 conda env
-
+################################## INDEXING ###################################
 ###############################################################################
-### PARAMETERS ###
-##################
-
-KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # MM2 k-mer size
-MINIMIZER_SIZE = config["minimap2"]["algorithm"]["minimizer_size"] # MM2 minimizer window size
-SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # MM2 split index
-#HOMOPOLYMER = config["minimap2"]["algorithm"]["homopolymer"] # MM2 if PacBio
-
-BWA_ALGO = config["bwa"]["algorithm"] # BWA 
indexing algorithm -BT2_ALGO = config["bowtie2"]["algorithm"] # BT2 indexing algorithm ############################################################################### -### RULES ### -############# - -rule all: +rule bwa_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: bwa index -a [ALGO] -p [PREFIX] <genome.fasta> + message: + """ + ~ BWA-SW ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + BWA + params: + algorithm = BWA_ALGO input: - mm2_indexes = expand("resources/indexes/minimap2/{ref_seq}.{ext}", - ref_seq = REF_SEQ, - ext = ["mmi"]), - bwa_indexes = expand("resources/indexes/bwa/{ref_seq}.{ext}", - ref_seq = REF_SEQ, - ext = ["amb", "ann", "bwt", "pac", "sa"]), - bt2_indexes = expand("resources/indexes/bowtie2/{ref_seq}.{ext}", - ref_seq = REF_SEQ, - ext = ["1.bt2", "2.bt2", "3.bt2", "4.bt2", - "rev.1.bt2", "rev.2.bt2"]) - + fasta = "resources/genomes/{reference}.fasta" + output: + prefix = "resources/indexes/bwa/{reference}", + bwa_indexes = multiext("resources/indexes/bwa/{reference}", + ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "results/10_Reports/tools-log/bwa-indexes/{reference}.log" + shell: + "bwa index " # BWA-SW algorithm, index sequences + "{params.algorithm} " # -a: Algorithm for constructing BWT index (default: auto) + "-p {output.prefix} " # -p: Prefix of the output database + "{input.fasta} " # Reference sequences in the FASTA format + "&> {log} " # Log redirection + "&& touch {output.prefix}" # Touch done + ############################################################################### rule minimap2_genome_indexing: # Aim: index sequences in the FASTA format @@ -83,61 +38,32 @@ rule minimap2_genome_indexing: message: """ ~ Minimap2 ∞ Index Genome ~ - Reference: _______ {wildcards.ref_seq} + Reference: _______ {wildcards.reference} """ conda: MINIMAP2 params: kmer_size = KMER_SIZE, minimizer_size = MINIMIZER_SIZE, - split_size = SPLIT_SIZE, + split_size = SPLIT_SIZE #homopolymer = HOMOPOLYMER input: - fasta = "resources/genomes/{ref_seq}.fasta" + fasta = "resources/genomes/{reference}.fasta" output: - indexes = multiext("resources/indexes/minimap2/{ref_seq}", - ".mmi") + mm2_indexes = multiext("resources/indexes/minimap2/{reference}", + ".mmi") log: - "results/10_Reports/tools-log/minimap2-indexes/{ref_seq}.log" + "results/10_Reports/tools-log/minimap2-indexes/{reference}.log" shell: "minimap2 " # Minimap2, index sequences "-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] "-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] "-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] #"{params.homopolymer} " # use homopolymer-compressed k-mer (preferrable for PacBio) - "-d {output.indexes} " # -d: dump index to FILE [] + "-d {output.mm2_indexes} " # -d: dump index to FILE [] "{input.fasta} " # Reference sequences in the FASTA format "&> {log}" # Log redirection -############################################################################### -rule bwa_genome_indexing: - # Aim: index sequences in the FASTA format - # Use: bwa index -a [ALGO] -p [PREFIX] <genome.fasta> - message: - """ - ~ BWA-SW ∞ Index Genome ~ - Reference: _______ {wildcards.ref_seq} - """ - conda: - BWA - params: - algorithm = BWA_ALGO - input: - fasta = "resources/genomes/{ref_seq}.fasta" - output: - prefix = temp("resources/indexes/bwa/{ref_seq}"), - indexes = multiext("resources/indexes/bwa/{ref_seq}", - ".amb", ".ann", ".bwt", ".pac", ".sa") - log: - 
"results/10_Reports/tools-log/bwa-indexes/{ref_seq}.log" - shell: - "bwa index " # BWA-SW algorithm, index sequences - "{params.algorithm} " # -a: Algorithm for constructing BWT index (default: auto) - "-p {output.prefix} " # -p: Prefix of the output database - "{input.fasta} " # Reference sequences in the FASTA format - "&> {log} " # Log redirection - "&& touch {output.prefix}" # Touch done - ############################################################################### rule bowtie2_genome_indexing: # Aim: index sequences in the FASTA format @@ -145,7 +71,7 @@ rule bowtie2_genome_indexing: message: """ ~ Bowtie2-build ∞ Index Genome ~ - Reference: _______ {wildcards.ref_seq} + Reference: _______ {wildcards.reference} """ conda: BOWTIE2 @@ -154,19 +80,20 @@ rule bowtie2_genome_indexing: params: algorithm = BT2_ALGO input: - fasta = "resources/genomes/{ref_seq}.fasta" + fasta = "resources/genomes/{reference}.fasta" output: - prefix = temp("resources/indexes/bowtie2/{ref_seq}"), - indexes = multiext("resources/indexes/bowtie2/{ref_seq}", - ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", - ".rev.1.bt2", ".rev.2.bt2") + prefix = "resources/indexes/bowtie2/{reference}", + bt2_indexes = multiext("resources/indexes/bowtie2/{reference}", + ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", + ".rev.1.bt2", ".rev.2.bt2") log: - "results/10_Reports/tools-log/bowtie2-indexes/{ref_seq}.log" + "results/10_Reports/tools-log/bowtie2-indexes/{reference}.log" shell: "bowtie2-build " # Bowtie2-build, index sequences "--quiet " # -q: quiet "--threads {resources.cpus} " # Number of threads - "{params.algorithm} " # Force (or no by default) generated index to be 'large', even if ref has fewer than 4 billion nucleotides + "{params.algorithm} " # Force (or no by default) generated index to be 'large', + # even if ref has fewer than 4 billion nucleotides "-f " # Reference files are FASTA (default) "{input.fasta} " # Reference sequences files (comma-separated list) in the FASTA format "{output.prefix} " # Write bt2 data to files with this dir/basename diff --git a/workflow/rules/lineages_calling.smk b/workflow/rules/lineages_calling.smk new file mode 100644 index 0000000..86b0be2 --- /dev/null +++ b/workflow/rules/lineages_calling.smk @@ -0,0 +1,74 @@ +############################################################################### +################################## LINEAGES ################################### +############################################################################### + +############################################################################### +rule nextclade_lineage: + # Aim: nextclade lineage assignation + # Use: nextclade [QUERY.fasta] -t [THREADS] --outfile [NAME.csv] + message: + """ + ~ Nextclade ∞ Assign Lineage ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ {wildcards.caller}
+        """
+    conda:
+        NEXTCLADE
+    resources:
+        cpus = CPUS
+    params:
+        path = NEXT_PATH,
+        dataset = NEXT_DATASET
+    input:
+        consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta"
+    output:
+        lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv",
+        alignment = directory("results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-all/")
+    log:
+        "results/10_Reports/tools-log/nextclade/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log"
+    shell:
+        "nextclade " # Nextclade, assign queries sequences to clades and reports potential quality issues
+        "run " # Run analysis
+        "--jobs {resources.cpus} " # -j: Number of CPU threads used by the algorithm (default: all available threads)
+        "--input-dataset {params.path}{params.dataset} " # Path to a directory containing a dataset (root-seq, tree and qc-config required)
+        "--output-tsv {output.lineage} " # -t: Path to output TSV results file
+        "--output-all {output.alignment} " # -O: Produce all of the output files into this directory, using default basename
+        "{input.consensus} " # Path to a .fasta file with input sequences
+        "&> {log}" # Log redirection
+
+###############################################################################
+rule pangolin_lineage:
+    # Aim: lineage mapping
+    # Use: pangolin [QUERY.fasta] -t [THREADS] --outfile [NAME.csv]
+    message:
+        """
+        ~ Pangolin ∞ Assign Lineage ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ {wildcards.caller}
+        """
+    conda:
+        PANGOLIN
+    resources:
+        cpus = CPUS,
+        tmp_dir = TMP_DIR
+    input:
+        consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta"
+    output:
+        lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv"
+    log:
+        "results/10_Reports/tools-log/pangolin/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log"
+    shell:
+        "pangolin " # Pangolin, Phylogenetic Assignment of Named Global Outbreak LINeages
+        "{input.consensus} " # Query fasta file of sequences to analyse
+        "--threads {resources.cpus} " # -t: Number of threads
+        "--tempdir {resources.tmp_dir} " # Specify where you want the temp stuff to go (default: $TMPDIR)
+        "--outfile {output.lineage} " # Optional output file name (default: lineage_report.csv)
+        "&> {log}" # Log redirection
+
+###############################################################################
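Both callers emit one tabular report per sample (Pangolin a CSV, Nextclade a TSV). A sketch of pulling the headline calls out of them with the csv module; the file names are illustrative and the column names ("lineage", "clade") are the usual defaults, to be checked against the versions pinned in the conda environments:

import csv

def first_record(path, delimiter):
    # Read the first data row of a delimited report as a dict
    with open(path, newline="") as handle:
        return next(csv.DictReader(handle, delimiter=delimiter))

pango = first_record("sample_pangolin-report.csv", ",")
nextclade = first_record("sample_nextclade-report.tsv", "\t")
print(pango.get("lineage"), nextclade.get("clade"))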
diff --git a/workflow/rules/mask_lowcov.smk b/workflow/rules/mask_lowcov.smk
new file mode 100644
index 0000000..90be218
--- /dev/null
+++ b/workflow/rules/mask_lowcov.smk
@@ -0,0 +1,86 @@
+###############################################################################
+############################ MASKING LOW COVERAGE #############################
+###############################################################################
+
+###############################################################################
+rule bedtools_masking:
+    # Aim: masking low coverage regions
+    # Use: bedtools maskfasta [OPTIONS] -fi [REFERENCE.fasta] -bed [RANGE.bed] -fo [MASKEDREF.fasta]
+    message:
+        """
+        ~ BedTools ∞ Mask Low Coverage Regions ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        """
+    conda:
+        BEDTOOLS
+    input:
+        reference = "resources/genomes/{reference}.fasta",
+        low_cov_mask = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed"
+    output:
+        masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta"
+    log:
+        "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_masking.log"
+    shell:
+        "bedtools maskfasta " # Bedtools maskfasta, mask a fasta file based on feature coordinates
+        "-fi {input.reference} " # Input FASTA file
+        "-bed {input.low_cov_mask} " # BED/GFF/VCF file of ranges to mask in -fi
+        "-fo {output.masked_ref} " # Output masked FASTA file
+        "&> {log}" # Log redirection
+
+###############################################################################
+rule bedtools_merged_mask:
+    # Aim: merging overlaps
+    # Use: bedtools merge [OPTIONS] -i [FILTERED.bed] -g [GENOME.fasta]
+    message:
+        """
+        ~ BedTools ∞ Merge Overlaps ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        """
+    conda:
+        BEDTOOLS
+    input:
+        min_cov_filt = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed"
+    output:
+        low_cov_mask = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed")
+    log:
+        "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_merging.log"
+    shell:
+        "bedtools merge " # Bedtools merge, merges overlapping BED/GFF/VCF entries into a single interval
+        "-i {input.min_cov_filt} " # -i: BED/GFF/VCF input to merge
+        "1> {output.low_cov_mask} " # merged output
+        "2> {log}" # Log redirection
+
+###############################################################################
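The three rules of this file form a chain: awk_min_covfilt (next rule) extracts BedGraph intervals below min_cov, bedtools merge collapses the overlaps, and maskfasta overwrites those ranges with N. A pure-Python sketch of the same filter-merge-mask idea, assuming a sorted BedGraph and a single-record reference FASTA (file names and min_cov illustrative):

min_cov = 30  # hypothetical: config["consensus"]["min_cov"]

# Filter intervals below min_cov and merge touching/overlapping ones
merged = []
with open("sample_bwa_genome-cov.bed") as bedgraph:  # illustrative name
    for line in bedgraph:
        chrom, start, end, depth = line.split()
        if int(depth) < min_cov:
            start, end = int(start), int(end)
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])

# Mask the merged ranges with 'N' (single-record FASTA for brevity)
with open("reference.fasta") as handle:  # illustrative name
    header, *seq_lines = handle.read().splitlines()
sequence = list("".join(seq_lines))
for start, end in merged:
    sequence[start:end] = "N" * (end - start)
print(header, "".join(sequence), sep="\n")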
+rule awk_min_covfilt:
+    # Aim: minimum coverage filtration
+    # Use: awk '$4 < [MIN_COV]' [BEDGRAPH.bed] 1> [FILTERED.bed]
+    message:
+        """
+        ~ Awk ∞ Minimum Coverage Filtration ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        """
+    conda:
+        GAWK
+    input:
+        genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed"
+    output:
+        min_cov_filt = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed")
+    log:
+        "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_min-cov-filt.log"
+    shell:
+        "awk " # Awk, a program that you can use to select particular records in a file and perform operations upon them
+        "'$4 < {wildcards.min_cov}' " # Minimum coverage for masking regions in consensus sequence
+        "{input.genome_cov} " # BedGraph coverage input
+        "1> {output.min_cov_filt} " # Minimum coverage filtered bed output
+        "2> {log} " # Log redirection
+
+###############################################################################
diff --git a/workflow/rules/primer_cleaping.smk b/workflow/rules/primer_cleaping.smk
new file mode 100644
index 0000000..676cacd
--- /dev/null
+++ b/workflow/rules/primer_cleaping.smk
@@ -0,0 +1,45 @@
+###############################################################################
+############################## PRIMER CLIPPING ###############################
+###############################################################################
+
+###############################################################################
+rule bamclipper_amplicon_primers:
+    # Aim: soft-clip amplicon PCR primers from BAM alignments
+    # Use: bamclipper.sh -n [THREADS] -b [MARKDUP.bam] -p [PRIMER.bed] -u [UPSTREAM] -d [DOWNSTREAM]
+    message:
+        """
+        ~ BAMClipper ∞ Soft-clip Amplicon PCR Primers from BAM Alignments ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        """
+    conda:
+        BAMCLIPPER
+    resources:
+        cpus = CPUS
+    params:
+        path = CLIPPATH,
+        primers = PRIMERS,
+        upstream = UPSTREAM,
+        downstream = DOWNSTREAM
+    input:
+        markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam",
+        index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai"
+    output:
+        bamclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam",
+        baiclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai"
+    log:
+        "results/10_Reports/tools-log/bamclipper/{reference}/{sample}_{aligner}_primers-clip.log"
+    shell:
+        "bamclipper.sh " # BAMClipper, remove primer sequences from BAM alignments of PCR amplicons by soft-clipping
+        "-b {input.markdup} " # Indexed BAM alignment file
+        "-p {params.path}{params.primers}.bedpe " # BEDPE of primer pair locations
+        "-n {resources.cpus} " # Number of threads (default: 1)
+        "-u {params.upstream} " # Number of nuc. upstream for assigning alignments to primers (default: 5)
+        "-d {params.downstream} " # Number of nuc. downstream for assigning alignments to primers (default: 5)
+        #"-o results/02_Mapping/ " # Path to write output (if BamClipper v.1.1.2) (todo)
+        "&> {log} " # Log redirection
+        "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam {output.bamclip} " # because BamClipper v.1 default output system...
+        "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam.bai {output.baiclip}" # because BamClipper v.1 default output system...
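BAMClipper v.1 drops its *.primerclipped.bam next to the current working directory, hence the mv workaround above. An alternative sketch, running the tool from the destination directory with absolute input paths so the outputs land where the rule expects them (paths and thread count illustrative; flags as documented in the rule):

import subprocess
from pathlib import Path

bam = Path("results/02_Mapping/SARS-CoV-2/sample_bwa_mark-dup.bam").resolve()  # illustrative
primers = Path("resources/primers/primers.bedpe").resolve()                    # illustrative

subprocess.run(
    ["bamclipper.sh", "-b", str(bam), "-p", str(primers),
     "-n", "4", "-u", "5", "-d", "5"],
    cwd=bam.parent,  # BAMClipper v.1 writes into the working directory
    check=True,
)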
+ +############################################################################### diff --git a/workflow/rules/quality_control.smk b/workflow/rules/quality_controls.smk old mode 100755 new mode 100644 similarity index 63% rename from workflow/rules/quality_control.smk rename to workflow/rules/quality_controls.smk index fea98c1..1abfda8 --- a/workflow/rules/quality_control.smk +++ b/workflow/rules/quality_controls.smk @@ -1,73 +1,12 @@ -###I###RA###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### -### ### -### /\ ______ ___ ____ _ _ __ ____ __ ____ ______ /\ ### -### || \ \ \ \ / __( ___( \/ )__\ ( _ ( ) (_ _) / / / / || ### -### || > > > > ( (_-.)__) \ /(__)\ ) /)(__ _)(_ < < < < || ### -### || /_/_/_/ \___(____) \(__)(__(_)\_(____(____) \_\_\_\ || ### -### \/ \/ ### -### ### -###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### -# Name ___________________ quality_control.smk -# Version ________________ v.2023.06 -# Author _________________ Nicolas Fernandez -# Affiliation ____________ IRD_U233_TransVIHMI -# Aim ____________________ Snakefile with quality control rules -# Date ___________________ 2021.09.28 -# Latest modifications ___ 2024.02.08 (add multiqc plot graphs export) -# Run ____________________ snakemake -s quality_control.smk --use-conda - -############################################################################### -### CONFIGURATION ### -##################### - -configfile: "configuration/config.yaml" - -############################################################################### -### FUNCTIONS ### -################# - ############################################################################### -### WILDCARDS ### -################# - -FASTQ, = glob_wildcards("resources/reads/{fastq}.fastq.gz") - +############################### QUALITY CONTROL ############################### ############################################################################### -### RESOURCES ### -################# - -OS = config["os"] # Operating system -CPUS = config["resources"]["cpus"] # Threads (maximum) - -############################################################################### -### ENVIRONMENTS ### -#################### - -MULTIQC = config["conda"][OS]["multiqc"] # Multi-QC conda env -FASTQ_SCREEN = config["conda"][OS]["fastq_screen"] # Fastq-Screen conda env -FASTQC= config["conda"][OS]["fastqc"] # FastQC conda env - -############################################################################### -### PARAMETERS ### -################## - -SUBSET = config["fastq_screen"]["subset"] # Fastq-Screen --subset -FQC_CONFIG = config["fastq_screen"]["config"] # Fastq-Screen --conf -#MQC_CONFIG = config["multiqc"]["config"] # MultiQC --conf -#TAG = config["multiqc"]["tag"] # MultiQC --tag - -############################################################################### -### RULES ### -############# - -rule all: - input: - multiqc = "results/00_Quality_Control/multiqc/" ############################################################################### rule multiqc_reports_aggregation: # Aim: aggregates bioinformatics analyses results into a single report # Use: multiqc [OPTIONS] --output [MULTIQC/] [FASTQC/] [MULTIQC/] + priority: 999 # Explicit high priority message: """ ~ MultiQC ∞ Aggregat HTML Qualities Reports ~ @@ -116,7 +55,7 @@ rule fastqscreen_contamination_checking: config = FQC_CONFIG, subset = SUBSET input: - fastq = "resources/reads/{fastq}.fastq.gz" + fastq = "resources/symlinks/{fastq}.fastq.gz" output: fastq_screen = 
directory("results/00_Quality_Control/fastq-screen/{fastq}/")
     log:
@@ -146,7 +85,7 @@ rule fastqc_quality_control:
     resources:
         cpus = CPUS
     input:
-        fastq = "resources/reads/{fastq}.fastq.gz"
+        fastq = "resources/symlinks/{fastq}.fastq.gz"
     output:
         fastqc = directory("results/00_Quality_Control/fastqc/{fastq}/")
     log:
diff --git a/workflow/rules/reads_cleaning.smk b/workflow/rules/reads_cleaning.smk
new file mode 100644
index 0000000..40cf7ac
--- /dev/null
+++ b/workflow/rules/reads_cleaning.smk
@@ -0,0 +1,91 @@
+###############################################################################
+################################## TRIMMING ###################################
+###############################################################################
+
+###############################################################################
+rule sickle_trim_quality:
+    # Aim: windowed adaptive trimming of FASTQ files, based on quality
+    # Use: sickle [COMMAND] [OPTIONS]
+    message:
+        """
+        ~ Sickle-trim ∞ Trim Low-Quality Sequences ~
+        Sample: __________ {wildcards.sample}
+        """
+    conda:
+        SICKLE_TRIM
+    params:
+        command = SIC_COMMAND,
+        encoding = SIC_ENCODING,
+        quality = SIC_QUALITY,
+        length = SIC_LENGTH
+    input:
+        fwd_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz",
+        rev_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz"
+    output:
+        fwd_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz"),
+        rev_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz"),
+        single = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz")
+    log:
+        "results/10_Reports/tools-log/sickle-trim/{sample}.log"
+    shell:
+        "sickle "                 # Sickle, a windowed adaptive trimming tool for FASTQ files, based on quality
+        "{params.command} "       # Paired-end or single-end sequence trimming
+        "-t {params.encoding} "   # --qual-type: quality encoding: solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7) or sanger (CASAVA >= 1.8)
+        "-q {params.quality} "    # --qual-threshold: threshold for trimming based on average quality in a window (default: 20)
+        "-l {params.length} "     # --length-threshold: threshold to keep a read based on length after trimming (default: 20)
+        "-f {input.fwd_reads} "   # --pe-file1: input paired-end forward fastq file
+        "-r {input.rev_reads} "   # --pe-file2: input paired-end reverse fastq file
+        "-g "                     # --gzip-output: output gzipped files
+        "-o {output.fwd_reads} "  # --output-pe1: output trimmed forward fastq file
+        "-p {output.rev_reads} "  # --output-pe2: output trimmed reverse fastq file (must use -s option)
+        "-s {output.single} "     # --output-single: output trimmed singles fastq file
+        "&> {log} "               # Log redirection
+
+###############################################################################
+rule cutadapt_adapters_removing:
+    # Aim: find and remove adapter sequences, primers, poly-A tails and other unwanted sequences from high-throughput sequencing reads
+    # Use: cutadapt [OPTIONS] -a/-A [ADAPTER] -o [OUT_FWD.fastq.gz] -p [OUT_REV.fastq.gz] [IN_FWD.fastq.gz] [IN_REV.fastq.gz]
+    # Note: multiple adapter sequences can be given with further -a options, but only the best-matching adapter is removed
+    message:
+        """
+        ~ Cutadapt ∞ Remove Adapters ~
+        Sample: __________ {wildcards.sample}
+        """
+    conda:
+        CUTADAPT
+    resources:
+        cpus = CPUS
+    params:
+        length = CUT_LENGTH,
+        truseq = CUT_TRUSEQ,
+        nextera = CUT_NEXTERA,
+        small = CUT_SMALL,
+        cut = CUT_CLIPPING
+    input:
fwd_reads = "resources/symlinks/{sample}_R1.fastq.gz", + rev_reads = "resources/symlinks/{sample}_R2.fastq.gz" + output: + fwd_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz"), + rev_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz") + log: + "results/10_Reports/tools-log/cutadapt/{sample}.log" + shell: + "cutadapt " # Cutadapt, finds and removes unwanted sequence from your HT-seq reads + "--cores {resources.cpus} " # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1) + "--cut {params.cut} " # -u: Remove 'n' first bases (5') from forward R1 (hard-clipping, default: 0) + "-U {params.cut} " # -U: Remove 'n' first bases (5') from reverse R2 (hard-clipping, default: 0) + "--trim-n " # --trim-n: Trim N's on ends (3') of reads + "--minimum-length {params.length} " # -m: Discard reads shorter than length + "--adapter {params.truseq} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.truseq} " # -A: 3' adapter to be removed from second read in a pair + "--adapter {params.nextera} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.nextera} " # -A: 3' adapter to be removed from second read in a pair + "--adapter {params.small} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.small} " # -A: 3' adapter to be removed from second read in a pair + "--output {output.fwd_reads} " # -o: Write trimmed reads to FILE + "--paired-output {output.rev_reads} " # -p: Write second read in a pair to FILE + "{input.fwd_reads} " # Input forward reads R1.fastq + "{input.rev_reads} " # Input reverse reads R2.fastq + "&> {log} " # Log redirection + +############################################################################### diff --git a/workflow/rules/reads_mapping.smk b/workflow/rules/reads_mapping.smk new file mode 100644 index 0000000..85f1b71 --- /dev/null +++ b/workflow/rules/reads_mapping.smk @@ -0,0 +1,120 @@ +############################################################################### +################################### MAPPING ################################### +############################################################################### + +############################################################################### +rule minimap2_mapping: + # Aim: reads mapping against reference sequence + # Use: minimap2 + message: + """ + ~ Minimap2 ∞ Map Reads ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ MiniMap2 + """ + conda: + MINIMAP2 + resources: + cpus = CPUS + params: + preset = MM2_PRESET + #length = LENGTH + input: + mm2_indexes = "resources/indexes/minimap2/{reference}.mmi", + fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", + rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" + output: + mapped = temp("results/02_Mapping/{reference}/{sample}_minimap2-mapped.sam") + log: + "results/10_Reports/tools-log/minimap2/{sample}_{reference}.log" + shell: + "minimap2 " # Minimap2, a versatile sequence alignment program + "-x {params.preset} " # -x: presets (always applied before other options) + "-t {resources.cpus} " # -t: Number of threads (default: 3) + "-a " # -a: output in the SAM format (PAF by default) + #"-F {params.length} " # -F: max fragment length, effective with -x sr mode (default: 800) + "{input.mm2_indexes} " # Reference index filename prefix.mmi + # (-k, -w, -I and -H can't be changed during 
mapping)
+        #"resources/genomes/{wildcards.reference}.fasta " # Reference genome fasta format (for custom -kwIH)
+        #"-k {params.kmer_size} "      # -k: k-mer size (default: "21", no larger than "28") [INT]
+        #"-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT]
+        #"-I {params.split_size} "     # -I: split index for every {NUM} input bases (default: "8G") [INT]
+        #"{params.homopolymer} "       # -H: use homopolymer-compressed k-mer (preferable for PacBio)
+        "{input.fwd_reads} "           # Forward input reads
+        "{input.rev_reads} "           # Reverse input reads
+        "1> {output.mapped} "          # SAM output
+        "2> {log}"                     # Log redirection
+
+###############################################################################
+rule bwa_mapping:
+    # Aim: reads mapping against reference sequence
+    # Use: bwa mem -t [THREADS] [INDEX_PREFIX] [FWD_R1.fq] [REV_R2.fq] 1> [MAPPED.sam]
+    message:
+        """
+        ~ BWA-MEM ∞ Map Reads ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ BWA
+        """
+    conda:
+        BWA
+    resources:
+        cpus = CPUS
+    input:
+        bwa_indexes = "resources/indexes/bwa/{reference}",
+        fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz",
+        rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz"
    output:
+        mapped = temp("results/02_Mapping/{reference}/{sample}_bwa-mapped.sam")
+    log:
+        "results/10_Reports/tools-log/bwa/{sample}_{reference}.log"
+    shell:
+        "bwa mem "               # BWA-MEM algorithm, performs local alignment
+        "-t {resources.cpus} "   # -t: Number of threads (default: 1)
+        "-v 1 "                  # -v: Verbosity level: 1=error, 2=warning, 3=message, 4+=debugging
+        "{input.bwa_indexes} "   # Reference index filename prefix
+        "{input.fwd_reads} "     # Forward input reads
+        "{input.rev_reads} "     # Reverse input reads
+        "1> {output.mapped} "    # SAM output
+        "2> {log}"               # Log redirection
+
+###############################################################################
+rule bowtie2_mapping:
+    # Aim: reads mapping against reference sequence
+    # Use: bowtie2 -p [THREADS] -x [REFERENCE] -1 [FWD_R1.fq] -2 [REV_R2.fq] -S [MAPPED.sam]
+    message:
+        """
+        ~ Bowtie2 ∞ Map Reads ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ Bowtie2
+        """
+    conda:
+        BOWTIE2
+    resources:
+        cpus = CPUS
+    params:
+        sensitivity = BT2_SENSITIVITY
+    input:
+        bt2_indexes = "resources/indexes/bowtie2/{reference}",
+        fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz",
+        rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz"
+    output:
+        mapped = temp("results/02_Mapping/{reference}/{sample}_bowtie2-mapped.sam")
+    log:
+        "results/10_Reports/tools-log/bowtie2/{sample}_{reference}.log"
+    shell:
+        "bowtie2 "                    # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences
+        "--threads {resources.cpus} " # -p: Number of alignment threads to launch (default: 1)
+        "--reorder "                  # Keep the original read order (when the multi-threading option -p is used)
+        "-x {input.bt2_indexes} "     # -x: Reference index filename prefix (minus trailing .X.bt2)
+        "{params.sensitivity} "       # Preset (default: "--sensitive")
+                                      # sensitive: same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15]
+        "-q "                         # -q: Query input files are FASTQ .fq/.fastq (default)
+        "-1 {input.fwd_reads} "       # Forward input reads
+        "-2 {input.rev_reads} "       # Reverse input reads
+        "1> {output.mapped} "         # -S: File for SAM output (default: stdout)
+        "2> {log}"                    # Log redirection
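+
+###############################################################################
+# Note: the three mapping rules above all emit the pattern
+# "results/02_Mapping/{reference}/{sample}_{aligner}-mapped.sam", each with
+# its aligner name hard-coded in the output, so the {aligner} wildcard value
+# requested downstream (see samtools_sortbynames) is what selects the mapper.
+# Illustrative targets (sample and reference names are assumptions):
+#
+#   results/02_Mapping/MN908947.3/sampleA_minimap2-mapped.sam -> rule minimap2_mapping
+#   results/02_Mapping/MN908947.3/sampleA_bwa-mapped.sam      -> rule bwa_mapping
+#   results/02_Mapping/MN908947.3/sampleA_bowtie2-mapped.sam  -> rule bowtie2_mapping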
+############################################################################### diff --git a/workflow/rules/remove_duplicates.smk b/workflow/rules/remove_duplicates.smk new file mode 100644 index 0000000..6f55da9 --- /dev/null +++ b/workflow/rules/remove_duplicates.smk @@ -0,0 +1,160 @@ +############################################################################### +############################# REMOVE DUPLICATES ############################### +############################################################################### + +############################################################################### +rule samtools_index_markdup: + # Aim: indexing marked as duplicate BAM file + # Use: samtools index -@ [THREADS] -b [MARK-DUP.bam] [INDEX.bai] + message: + """ + ~ SamTools ∞ Index 'Marked as Duplicate' BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + output: + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup-index.log" + shell: + "samtools index " # Samtools index, tools for alignments in the SAM format with command to index alignment + "-@ {resources.cpus} " # --threads: Number of additional threads to use (default: 1)(NB, --threads form dose'nt work) + "-b " # -b: Generate BAI-format index for BAM files (default) + "{input.mark_dup} " # Mark_dup bam input + "{output.index} " # Mark_dup index output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_markdup: + # Aim: marking duplicate alignments + # Use: samtools markdup -@ [THREADS] -r -s -O BAM [SORTED.bam] [MARK-DUP.bam] + message: + """ + ~ SamTools ∞ Mark Duplicate Alignments ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + sorted = "results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam" + output: + mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log" + shell: + "samtools markdup " # Samtools markdup, tools for alignments in the SAM format with command mark duplicates + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-r " # -r: Remove duplicate reads + "-s " # -s: Report stats + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "{input.sorted} " # Sorted bam input + "{output.mark_dup} " # Mark_dup bam output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_sorting: + # Aim: sorting + # Use: samtools sort -@ [THREADS] -m [MEM_GB] -T [TMP_DIR] -O BAM -o [SORTED.bam] [FIX-MATE.bam] + message: + """ + ~ SamTools ∞ Sort BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS, + mem_gb = MEM_GB, + tmp_dir = TMP_DIR + input: + fix_mate = "results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam" + output: + sorted = 
temp("results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sorted.log" + shell: + "samtools sort " # Samtools sort, tools for alignments in the SAM format with command to sort alignment file + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) + "-T {resources.tmp_dir} " # -T: Write temporary files to PREFIX.nnnn.bam + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "-o {output.sorted} " # Sorted bam output + "{input.fix_mate} " # Fixmate bam input + "&> {log}" # Log redirection + +############################################################################### +rule samtools_fixmate: + # Aim: filling in mate coordinates + # Use: samtools fixmate -@ [THREADS] -m -O BAM [SORT-BY-NAMES.bam] [FIX-MATE.bam] + message: + """ + ~ SamTools ∞ Fil Mate Coordinates ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + sort_by_names = "results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam" + output: + fix_mate = temp("results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_fix-mate.log" + shell: + "samtools fixmate " # Samtools fixmate, tools for alignments in the SAM format with command to fix mate information + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m " # -m: Add mate score tag + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "{input.sort_by_names} " # Sort_by_names bam input + "{output.fix_mate} " # Fix_mate bam output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_sortbynames: + # Aim: sorting by names + # Use: samtools sort -t [THREADS] -m [MEM_GB] -n -O BAM -o [SORT-BY-NAMES.bam] [MAPPED.sam] + message: + """ + ~ SamTools ∞ Sort by Names BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS, + mem_gb = MEM_GB + input: + mapped = "results/02_Mapping/{reference}/{sample}_{aligner}-mapped.sam" + output: + sort_by_names = temp("results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sort-by-names.log" + shell: + "samtools sort " # Samtools sort, tools for alignments in the SAM format with command to sort alignment file + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) + "-n " # -n: Sort by read name (not compatible with samtools index command) + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "-o {output.sort_by_names} " # -o: Write final output to FILE rather than standard output + "{input.mapped} " # Mapped reads input + "&> {log}" # Log redirection + +############################################################################### diff --git a/workflow/rules/variant_calling.smk b/workflow/rules/variant_calling.smk new file mode 100644 
index 0000000..54f2766
--- /dev/null
+++ b/workflow/rules/variant_calling.smk
@@ -0,0 +1,142 @@
+###############################################################################
+########################### VARIANT CALLING - IVAR ############################
+###############################################################################
+
+###############################################################################
+rule convert_tsv2vcf:
+    # Aim: convert iVar TSV variant calls into VCF format
+    # Use: python3 ivar_variants_to_vcf.py [IN.tsv] [OUT.vcf]
+    message:
+        """
+        ~ iVar ∞ Convert TSV to VCF file ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ iVar
+        """
+    conda:
+        TSV2VCF
+    input:
+        tsv = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv"
+    output:
+        vcf = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-filt.vcf"
+    log:
+        "results/10_Reports/tools-log/tsv2vcf/{reference}/{sample}_{aligner}_{min_cov}X_ivar_tsv2vcf.log"
+    shell:
+        "python3 "                                  # Python 3
+        "workflow/scripts/ivar_variants_to_vcf.py " # Script (from viralrecon)
+        "{input.tsv} "                              # TSV input
+        "{output.vcf} "                             # VCF output
+        "&> {log}"                                  # Log redirection
+
+###############################################################################
+rule ivar_consensus:
+    # Aim: call a consensus sequence from the aligned BAM file
+    # Use: samtools mpileup -aa -A -d 0 -B -Q 0 <input.bam> | ivar consensus -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum-depth>]
+    message:
+        """
+        ~ iVar ∞ Call Consensus ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ iVar
+        """
+    conda:
+        IVAR
+    params:
+        min_depth = IVAR_MIN_DEPTH,
+        min_freq = IVAR_MIN_FREQ,
+        min_insert = IVAR_MIN_INSERT,
+        max_depth = IVAR_MAX_DEPTH,
+        min_bq = IVAR_MIN_BQ,
+        min_qual = IVAR_MIN_QUAL,
+        baq = IVAR_MAP_QUAL
+    input:
+        mark_dup = get_bam_input,
+        variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv"
+    output:
+        prefix = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus"),
+        cons_tmp = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.fasta.tmp"),
+        qual_txt = "results/05_Consensus/{reference}/ivar_consensus-quality/{sample}_{aligner}_{min_cov}X_ivar_consensus.qual.txt",
+    log:
+        "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.log"
+    shell:
+        "samtools mpileup "               # Samtools mpileup, tools for alignments in the SAM format, with a command for multi-way pileup
+        "--verbosity 0 "                  # Set level of verbosity [INT]
+        "-a "                             # -a: output all positions (including zero depth)
+        "-a "                             # -a -a / -aa: output absolutely all positions, including unused ref. sequences
+        "--count-orphans "                # -A: do not discard anomalous read pairs
+        "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage [INT] (default: 8000)
+        "{params.baq} "                   # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality)
+        "--min-BQ {params.min_bq} "       # -Q: skip bases with baseQ/BAQ smaller than [INT] (default: 13)
+        #"--reference {input.masked_ref} " # Reference sequence FASTA FILE
+        "{input.mark_dup} "               # Markdup BAM input
+        "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Drop the mpileup info line from the merged output
+        "| "                              ### PIPE to iVar
+        "ivar consensus "                 # iVar, with command 'consensus': call a consensus from an aligned BAM file
+        "-p {output.prefix} "             # -p: prefix
+        "-q {params.min_qual} "           # -q: Minimum quality score threshold to count base [INT] (Default: 20)
+        "-t {params.min_freq} "           # -t: Minimum frequency threshold to call consensus [FLOAT] (Default: 0.03)
+        "-c {params.min_insert} "         # -c: Minimum insertion frequency threshold to call consensus [FLOAT] (Default: 0.8)
+        "-m {params.min_depth} "          # -m: Minimum read depth to call consensus [INT] (Default: 0)
+        "-n N "                           # -n: Character to print in regions with less than minimum coverage (Default: N)
+        #"-i {wildcards.sample} "         # -i: Name of fasta header (default: Consensus_<prefix>_threshold_<min_freq>_quality_<min_qual>_<min_insert>)
+        "&> {log} "                       # Log redirection
+        "&& mv {output.prefix}.fa {output.cons_tmp} "       # move consensus.fa (temp) to consensus.fasta.tmp (tmp)
+        "&& mv {output.prefix}.qual.txt {output.qual_txt} " # move consensus.qual.txt into the ivar_consensus-quality/ directory
+        "&& touch {output.prefix}"        # Touch done
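+
+###############################################################################
+# Note on `ivar consensus` thresholds: -t is the minimum frequency a base
+# needs to be written into the consensus; -t 0 keeps the majority base, while
+# higher values emit IUPAC ambiguity codes when no single base reaches the
+# threshold. Example settings (illustrative assumptions, not this workflow's
+# configured values):
+#
+#   -t 0   -m 10   # majority-rule consensus, at least 10x depth
+#   -t 0.7 -m 30   # stricter: a base must reach 70% frequency at 30x depth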
+
+###############################################################################
+rule ivar_variant_calling:
+    # Aim: SNVs and Indels calling
+    # Use: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [REF.fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum-depth>] [-r <reference-fasta>] [-g GFF file]
+    # Note: samtools mpileup output must be piped into ivar variants
+    message:
+        """
+        ~ iVar ∞ Call Variants ~
+        Sample: __________ {wildcards.sample}
+        Reference: _______ {wildcards.reference}
+        Aligner: _________ {wildcards.aligner}
+        Min. cov.: _______ {wildcards.min_cov}X
+        Variant caller: __ iVar
+        """
+    conda:
+        IVAR
+    params:
+        min_depth = IVAR_MIN_DEPTH,
+        min_freq = IVAR_MIN_FREQ,
+        min_insert = IVAR_MIN_INSERT,
+        max_depth = IVAR_MAX_DEPTH,
+        min_bq = IVAR_MIN_BQ,
+        min_qual = IVAR_MIN_QUAL,
+        baq = IVAR_MAP_QUAL
+    input:
+        masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta",
+        mark_dup = get_bam_input
+    output:
+        variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv"
+    log:
+        "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.log"
+    shell:
+        "samtools mpileup "               # Samtools mpileup, tools for alignments in the SAM format, with a command for multi-way pileup
+        "--verbosity 0 "                  # Set level of verbosity [INT]
+        "-a "                             # -a: output all positions (including zero depth)
+        "-a "                             # -a -a / -aa: output absolutely all positions, including unused ref.
sequences + "--count-orphans " # -A: do not discard anomalous read pairs + "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage (default: 8000) [INT] + "{params.baq} " # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality) + "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than (default: 13) [INT] + "--reference {input.masked_ref} " # Reference sequence FASTA FILE + "{input.mark_dup} " # Markdup BAM input + "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Remove this stdout + "| " ### pipe to iVar + "ivar variants " # iVar, with command 'variants': Call variants from aligned BAM file + "-p {output.variant_call} " # -p: prefix + "-q {params.min_qual} " # -q: Minimum quality score threshold to count base (Default: 20) [INT] + "-t {params.min_freq} " # -t: Minimum frequency threshold to call variants (Default: 0.03) [FLOAT] + "-m {params.min_depth} " # -m: Minimum read depth to call variants (Default: 0) [INT] + "-r {input.masked_ref} " # -r: Reference file used for alignment (translate the nuc. sequences and identify intra host single nuc. variants) + #"-g " # -g: A GFF file in the GFF3 format can be supplied to specify coordinates of open reading frames (ORFs) + "&> {log}" # Log redirection + +############################################################################### diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflow/scripts/__pycache__/__init__.cpython-312.pyc b/workflow/scripts/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80e6722fd8b0751239f5fedd373affaa5933c5a7 GIT binary patch literal 154 zcmX@j%ge<81gzq%=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJVS?Y%trxq3K8yOfG z=A{`J80x2Imgu{uh9wsHWa^ja7iFjA<d^FgCl_TFload7$7kkcmc+;F6;$5hu*uC& bDa}c>D`Ewj#0bR2AjU^#Mn=XWW*`dyT5%%y literal 0 HcmV?d00001 diff --git a/workflow/scripts/__pycache__/functions.cpython-312.pyc b/workflow/scripts/__pycache__/functions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..769a4fad5066a794e73ad02064280c41b8dd9c96 GIT binary patch literal 4247 zcmd5<%}*Og6yLSiUK<;1L(-B25}GzePKphLq;1-!p^i(5j6<N1iX5E9JD4r&^=5V< zV1mS<l_K?$15zrrr?wZA{3pFYPu7Y`mD)>g2+~VWeQ$QXHn9`sql!AxJdbDJd-LY^ z-n@D9Yinx@2Q4!2ZDF{B<Nm}y)#h;q`x*@Hb22CMDp%xZcpk?d7<*<sJXhdmyt4Nj zZbpy=pg!3L)Gzyiin0haAP0a3<si@&xdmuQ4gn3zVW6#YE6_H%4QND;0FACh<FQA$ zc$~KclN5FM^^;{?(&Rkv7==FqbT9P%2_W}5)BR>vP3Imsb9LWPinZ#uGH|c)cX*j! 
zfD!e`JpP;H;-2gy6pwo?ucDcjcVc+V5>n$+lW~s~ysfBmUZS#L#fqd@qANL_(409> zi6k4ieoxOo7m`y3p+<7Be{evX>+c^(78El%LZ&5}QIfYybYV^{-A)>Ls_3SXoLkiL zrc%<3guY@OE)X-<pqaLU<}T<)9LPGi(|PR2)DMAQQV)fnN4GoswuHkKvGe<Z-^8v; zSNB%nNF~&@C3NirGj{+xw2)!f2?KC0Zz2n8OC04_JOvIA@MbOX)}o{;=1Tc(N{mGn z%<S*aU9;zMsgh>WlA0{S2jx~ixyf7N*wxEZne^l)PcgI%M6id3fm$)#lA|uCX!@c_ z55fFNWNQbq&Q-+d#>lpKtP&Zjgj&}}X&X!*u-o<d2k@oA?wE;tVomm#uBRV*AK-qW zO+vWBzJ#n5FsV{3Dq5j@(cRcU?n`GfX(;&xl_XaUsi>=DEocAlV?spEO{Nobl40IT z6v`N!HdowBaT^rZwfvLAW8;}Lg;R9^gmwg>y0(MD5#Z@XK7;~Mh;3Zo7Q1#LhqvCC z-Z^pVv6qkb?s9yv_mAeN;I5fB--AH|DxN#A=NdA1+*83~+zKoZqN7D|k8M9{!lS24 z=6t>X+6+@N=5r+oa|#364Sapj4Ya>zb|unrH}qGe<KE@($`FB#K6GR6ILM*3Ydm-h z#&(1F^T=N33<}vg{|x5FBwYueC08k#Ba~=)vR3oaDg<glBXliSl;}dAyr?H;rJ@zg ztHc&7XIxjmxGEhGA~dfmx=v(NOk-d#V1{9CiMS-J=p21J;9Jqx!0EX8Enqwgy^2>` zc73EGwr>=+#qLV`&AVbH)V`j5WxPIng-wF7ur-CA05i+uub`*_EvB{}F`uHzWVsn$ zii6iXV8n-Yc!kH;(=J}o*}J|<D7v0$#47+Y>@?W-!V|gec<m(A=LpjeQmmOM&su?# zy+82}g$F+Szmq+DkUjFHB{&VHQNH4WbQaR#5}{H7(nei)&Z$xXS)0Xa^;=?<ESqb1 z(U{9{(U``PC46m^w76geT~+i=u)-3ZAh*0A68F_*oZv+I^YlbIn@Uqm>Xta186C-{ zCz_|A>e3F@Ns#yyx`79}c9vNRn%y~h2C~pjPp{QA_&6v;hjuw3IJDcsh1;6tXc1H} z{HI~ikd1ntAAge-C!mCb41UlDLYBS@^K~)$6!_f?Uds-vkoI%nz_qWCGLQ8!XC?sn zxIj;XVg@!n10y)vHhbJ(1DhgD3}%{OJY%ETT<Yp{bF8bu;$V$A;0x%@u%36Z2DlOH zIAR@tD%NhS-vosXSf6?stWCVESp(FaQ&GD&gLx=Wh-NGr<?C+fpTkPPo;?uz&jx=r zFR3b~YgZ+^RHaSd6E2_wV2<Ut**C?fx`7TdzSM9g(<JOWFdMr|Ilj0664#*b_XEMW zx6?6;`%=9x@EQ*J@EZ07dw0WJ^U#0Y?x5$u>^esb<)F4yBn`^6`W*C}eps?)`{oqY zOf2Q+=y}jivC?XYYj(8(iv!~)z;grnK0?w^wY_dwte^(jF|SH8Dc^9T^n5O(Xaw@} zOM&BR?4v|4Q4^EJGXP@I=E^uQT>^9nbfZbM8UO!ivjXr2E0k14E6@B#s8$tLLCrrC zR8~A!MI201fM=;5)4JlBzBs0Mak0y*X(=75PpH)gq>~)svE(l(hNQ?AmWA$9pyBL9 zF%j9E&K^c!(G}bSR>YWvA{*~-i$@^~-J9L#_-H2<|DCnsKO2u(VMCJ^ND+EMDN!_; zJv{7n@tMdTR}^b=R;g1gKPlE!6fb*<w-CkjNAd8|OGpq)ipFf(huAJ-10NSl@}f#E z(kt)@A9ThPkX<j&^M9Y__~XBGCwBcDJWa$szrc6zg}r>+UW))EEbvE>bRdcCg+-Wa RXCwflLkGzIA&_9X<6nD_@n8S| literal 0 HcmV?d00001 diff --git a/workflow/scripts/functions.py b/workflow/scripts/functions.py new file mode 100644 index 0000000..4231b28 --- /dev/null +++ b/workflow/scripts/functions.py @@ -0,0 +1,127 @@ +###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### +### ### +### /\ ______ ___ ____ _ _ __ ____ __ ____ ______ /\ ### +### || \ \ \ \ / __( ___( \/ )__\ ( _ ( ) (_ _) / / / / || ### +### || > > > > ( (_-.)__) \ /(__)\ ) /)(__ _)(_ < < < < || ### +### || /_/_/_/ \___(____) \(__)(__(_)\_(____(____) \_\_\_\ || ### +### \/ \/ ### +### ### +###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### +# Name ___________________ functions.py +# Version ________________ v.2025.01 +# Author _________________ Nicolas Fernandez +# Affiliation ____________ IRD_U233_TransVIHMI +# Aim ____________________ Snakefile functions +# Date ___________________ 2021.10.12 +# Latest modifications ___ 2025.01.13 +# Use ____________________ import scripts.functions as functions +############################################################################### + +# import +from snakemake.io import temp +from snakemake.io import expand + +# global variable +MODULES = [] + +# functions +def get_memory_per_thread(wildcards): + memory_per_thread = int(RAM) // int(CPUS) + if memory_per_thread < 1: + memory_per_thread = 1 + return memory_per_thread + +def get_quality_input(wildcards): + if "quality" in MODULES: + return "results/00_Quality_Control/multiqc/" + return [] + +def get_trimming_input(wildcards): + if "trimming" in MODULES: + return 
expand("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz", + sample = SAMPLE) + return [] + +def stash_or_trash(path): + if "keeptrim" in MODULES: + return path + else: + return temp(path) + +def get_bam_input(wildcards): + markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + if "cleapping" in MODULES: + markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam" + return markdup + +def get_bai_input(wildcards): + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" + if "cleapping" in MODULES: + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai" + return index + +def get_flagstat_input(wildcards): + if "covstats" in MODULES: + return expand("results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + ext = ["txt", "tsv", "json"]) + return [] + +def get_covstats_input(wildcards): + if "covstats" in MODULES: + return expand("results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV) + return [] + +def get_consensus_input(wildcards): + if "consensus" in MODULES: + return expand("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER) + return [] + +def get_vcf_input(wildcards): + if "consensus" in MODULES: + return expand("results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_variant-filt.vcf", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER) + return [] + +def get_nextclade_input(wildcards): + if "nextclade" in MODULES: + return expand("results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER) + return [] + +def get_pangolin_input(wildcards): + if "pangolin" in MODULES: + return expand("results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER) + return [] + +def get_gisaid_input(wildcards): + if "gisaid" in MODULES: + return expand("", + ) + return [] + +############################################################################### -- GitLab