diff --git a/Run_GeVarLi.sh b/Run_GeVarLi.sh index 95a7c631a046d4f1ca1806bab50f8eb0b113e16d..c550985e0ae1514d4c880870b1e58c97b69ee522 100755 --- a/Run_GeVarLi.sh +++ b/Run_GeVarLi.sh @@ -10,13 +10,13 @@ ### ### ###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### # Name ___________________ Run_GeVarLi.sh -# Version ________________ v.2024.10 +# Version ________________ v.2025.01 # Author _________________ Nicolas Fernandez # Affiliation ____________ IRD_U233_TransVIHMI # Aim ____________________ Bash script running gevarli.smk snakefile # Date ___________________ 2021.10.12 -# Latest modifications ___ 2024.11.15 (Update Snakemake) -# Use ____________________ 'bash ./Run_GeVarLi.sh' +# Latest modifications ___ 2025.01.08 (Prepare for Snakedeploy) +# Use ____________________ 'bash Run_GeVarLi.sh' ############################################################################### ### COLORS ### @@ -31,7 +31,7 @@ nc="\033[0m" # no color ############################################################################### ### ABOUT ### ############# -version="2024.11" # Version +version="2025.01" # Version workdir=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd) # Get working directory sample_test="SARS-CoV-2_Omicron-BA1_Covid-Seq-Lib-on-MiSeq_250000-reads" @@ -47,8 +47,8 @@ ${blue}Author${nc} _________________ Nicolas Fernandez ${blue}Affiliation${nc} ____________ IRD_U233_TransVIHMI ${blue}Aim${nc} ____________________ Bash script for ${red}Ge${nc}nome assembling, ${red}Var${nc}iant calling and ${red}Li${nc}neage assignation ${blue}Date${nc} ___________________ 2021.10.12 -${blue}Latest modifications${nc} ___ 2024.10.01 (Dedicated installation scripts creation) -${blue}Use${nc} ____________________ '${ylo}./Run_GeVarLi.sh${nc}' +${blue}Latest modifications${nc} ___ 2025.01.08 (Prepare for Snakedeploy) +${blue}Use${nc} ____________________ '${ylo}bash Run_GeVarLi.sh${nc}' " @@ -158,9 +158,9 @@ ${green}#####${nc} ${red}CONDA ACTIVATION${nc} 
${green}#####${nc} ${green}----------------------------${nc} " -echo -e "conda activate ${ylo}workflow-base_v.${version}${nc}" - -conda activate workflow-base_v.${version} +workflowbase_version="2024.11" +echo -e "conda activate ${ylo}workflow-base_v.${workflowbase_version}${nc}" +conda activate workflow-base_v.${workflowbase_version} ############################################################################### @@ -179,6 +179,7 @@ yq_version=$(yq --version | sed 's/yq //') # Yq version rename_version="1.601" # Rename version (ver. 1.601 from 2024-02) graphviz_version="11.0.0" # GraphViz version (ver. 11.0.0 from 2024-02) #graphviz_version=$#(dot -V | sed 's/dot - graphviz version //') # GraphViz version (ver. 11.0.0 from 2024-02) +nextclade_version="" fastq=$(expr $(ls -l ${workdir}/resources/reads/*.fastq.gz 2> /dev/null | wc -l)) # Get fastq.gz files count if [[ "${fastq}" == "0" ]] # If no sample, @@ -241,9 +242,7 @@ ${blue}Samples processed${nc} ______ ${red}${samples}${nc} samples (${ylo}${fast ${blue}Snakemake version${nc} ______ ${ylo}${snakemake_version}${nc} ${blue}Conda version${nc} __________ ${ylo}${conda_version}${nc} -${blue}Conda frontend${nc} _________ ${ylo}${conda_frontend}${nc} ${blue}Mamba version${nc} __________ ${ylo}${mamba_version}${nc} -${blue}Nextclade version${nc} ______ ${ylo}${nextclade_version}${nc} ${blue}Quality Ccontrol${nc} _______ [ ${red}${quality}${nc} ] ${blue}Trimming${nc} _______________ [ ${red}${trimming}${nc} ] @@ -258,6 +257,7 @@ ${blue}Reference genome${nc} _______ ${ylo}${reference}${nc} ${blue}Aligner${nc} ________________ ${ylo}${aligner}${nc} ${blue}Min coverage${nc} ___________ ${red}${min_cov}${nc} X ${blue}Min allele frequency${nc} ___ ${red}${min_freq}${nc} + ${blue}Nextclade dataset${nc} ______ ${red}${nextclade_dataset}${nc} " @@ -324,22 +324,26 @@ fi ############################################################################### -### RENAME SAMPLES ### -###################### +### RENAME SYMLINKS ### 
+####################### echo -e " ${green}------------------------------------------------------------------------${nc} -${green}#####${nc} ${red}RENAME FASTQ FILES${nc} ${green}#####${nc} -${green}------------------------------${nc} +${green}#####${nc} ${red}RENAME FASTQ SYMLINKS${nc} ${green}#####${nc} +${green}---------------------------------${nc} " -# Rename fastq files to remove "_001" Illumina pattern (mandatory) -## De/comment line (#) if you want keep Illumina barcode-ID and/or Illumina line-ID -echo -e "Removing ${red}'_S'${nc} index tag ID" -rename "s/_S\d+_/_/" ${workdir}/resources/reads/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_} -echo -e "Removing ${red}'_L'${nc} line tag ID" -rename "s/_L\d+_/_/" ${workdir}/resources/reads/*.fastq.gz 2> /dev/null # Remove line-ID ID like {_L001_} -echo -e "Removing ${red}'_001'${nc} illumina tag ID" -rename "s/_001.fastq.gz/.fastq.gz/" ${workdir}/resources/reads/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz +# Remove tags from symlinks: +## barcode-ID like {_S001_} +## line-ID ID like {_L001_} +## end-name ID like {_001}.fastq.gz +mkdir -p ${workdir}/resources/symlinks/ +for fastq in ${workdir}/resources/reads/*.fastq.gz; do + symlinks=$(echo $(basename "${fastq}") | \ + sed -E "s/_S\d+_//" | \ + sed -E "s/_L\d+_//" | \ + sed -E "s/_001.fastq.gz/.fastq.gz/") + ln -s "${fastq}" "${workdir}/resources/symlinks/${symlinks}" +done echo -e " If you want to keep Illumina ${blue}barcode-ID${nc} and/or Illumina ${blue}line-ID${nc}, please edit ${ylo}Run_GeVarLi.sh${nc} script (l.335). 
@@ -355,9 +359,6 @@ ${green}#####${nc} ${red}SNAKEMAKE PIPELINES${nc} ${green}#####${nc} ${green}-------------------------------${nc} " -# MODULES -snakefiles_list="indexing_genomes gevarli" - echo -e " ${blue}## Unlocking Working Directory ##${nc} ${blue}---------------------------------${nc} @@ -367,14 +368,11 @@ ${blue}---------------------------------${nc} # Re-run all jobs the output of which is recognized as incomplete. # Remove a lock on the working directory. -for snakefile in ${snakefiles_list} ; do - echo -e "${blue}-- ${snakefile} --${nc}" ; - snakemake \ - --directory ${workdir}/ \ - --snakefile ${workdir}/workflow/snakefiles/${snakefile}.smk \ - --rerun-incomplete \ - --unlock ; -done +snakemake \ + --directory ${workdir}/ \ + --snakefile ${workdir}/workflow/Snakefile \ + --rerun-incomplete \ + --unlock echo -e " ${blue}## Let's Run! ##${nc} @@ -390,22 +388,17 @@ ${blue}----------------${nc} # If defined in the rule, run job in a conda environment. # Print out the shell commands that will be executed. 
-for snakefile in ${snakefiles_list} ; do - echo -e "${blue}-- ${snakefile} --${nc}" ; - snakemake \ - --directory ${workdir}/ \ - --snakefile ${workdir}/workflow/snakefiles/${snakefile}.smk \ - --cores ${max_threads} \ - --max-threads ${max_threads} \ - --resources mem_gb=${max_memory} \ - --rerun-incomplete \ - --keep-going \ - --use-conda ; -# --quiet host \ -# --quiet progress \ -# --quiet rules ; -done - +snakemake \ + --directory ${workdir}/ \ + --snakefile ${workdir}/workflow/Snakefile\ + --cores ${max_threads} \ + --max-threads ${max_threads} \ + --resources mem_gb=${max_memory} \ + --rerun-incomplete \ + --keep-going \ + --use-conda \ + --quiet host +# Possible choices: all, host, progress, rules ############################################################################### ### CONCATENATE RESULTS ### @@ -477,27 +470,23 @@ mkdir -p ${workdir}/results/10_Reports/files-summaries/ 2> /dev/null graph_list="dag rulegraph filegraph" extention_list="pdf png" -for snakefile in ${snakefiles_list} ; do - for graph in ${graph_list} ; do - for extention in ${extention_list} ; do - snakemake \ - --directory ${workdir}/ \ - --snakefile ${workdir}/workflow/snakefiles/${snakefile}.smk \ - --${graph} \ - | dot -T${extention} \ - 2> /dev/null \ - 1> ${workdir}/results/10_Reports/graphs/${snakefile}_${graph}.${extention} ; - done ; +for graph in ${graph_list} ; do + for extention in ${extention_list} ; do + snakemake \ + --directory ${workdir}/ \ + --snakefile ${workdir}/workflow/Snakefile \ + --${graph} \ + | dot -T${extention} \ + 2> /dev/null \ + 1> ${workdir}/results/10_Reports/graphs/${graph}.${extention} ; done ; done -for snakefile in ${snakefiles_list} ; do - snakemake \ - --directory ${workdir} \ - --snakefile ${workdir}/workflow/snakefiles/${snakefile}.smk \ - --summary > ${workdir}/results/10_Reports/files-summaries/${snakefile}_files-summary.txt \ - 2> /dev/null ; -done +snakemake \ + --directory ${workdir} \ + --snakefile ${workdir}/workflow/Snakefile \ + 
--summary > ${workdir}/results/10_Reports/files-summaries/files-summary.txt \ + 2> /dev/null cp ${config_file} ${workdir}/results/10_Reports/config.log 2> /dev/null @@ -512,14 +501,15 @@ ${green}------------------------${nc} " # Save and deactive environments -mkdir -p ${workdir}/results/10_Reports/conda_env/ 2> /dev/null -cp ${workdir}/workflow/environments/*.yaml ${workdir}/results/10_Reports/conda_env/ +mkdir -p ${workdir}/results/10_Reports/conda_envs/ 2> /dev/null +cp ${workdir}/workflow/envs/*.yaml ${workdir}/results/10_Reports/conda_envs/ conda deactivate # Cleanup find ${workdir}/results/ -type f -empty -delete # Remove empty file (like empty log) find ${workdir}/results/ -type d -empty -delete # Remove empty directory rm -f ${workdir}/resources/reads/${sample_test}_R*.fastq.gz 2> /dev/null +rm -rf ${workdir}/resources/symlinks/ 2> /dev/null # Timer time_stamp_end=$(date +"%Y-%m-%d %H:%M") # Get date / hour ending analyzes @@ -542,8 +532,8 @@ Author _________________ Nicolas Fernandez Affiliation ____________ IRD_U233_TransVIHMI Aim ____________________ Bash script for GeVarLi Date ___________________ 2021.10.12 -Latest modifications ___ 2024.10.01 (Dedicated installation scripts creation) -Run ____________________ ./Run_GeVarLi.sh +Latest modifications ___ 2025.01.08 (Prepare for Snakedeploy) +Run ____________________ 'bash Run_GeVarLi.sh' Operating System _______ ${os} Shell __________________ ${shell} @@ -567,9 +557,8 @@ Samples processed _______ ${samples} samples (${ylo}${fastq} fastq files) Snakemake version _______ ${snakemake_version} Conda version ___________ ${conda_version} -Conda frontend __________ ${conda_frontend} Mamba version ___________ ${mamba_version} -Nextclade version _______ ${nextclade_version} + Quality Ccontrol ________ [ ${quality} ] Trimming ________________ [ ${trimming} ] @@ -584,16 +573,17 @@ Reference genome ________ ${reference} Aligner _________________ ${aligner} Min coverage ____________ ${min_cov} X Min allele frequency 
____ ${min_freq} + Nextclade dataset _______ ${nextclade_dataset} " > ${workdir}/results/10_Reports/settings.log # Gzip reports directory cd ${workdir}/results/ tar -zcf 10_Reports_archive.tar.gz 10_Reports +cd ${workdir} # Gzip results directory #mkdir -p ${workdir}/archives/ 2> /dev/null -#cd ${workdir}/ #tar -zcf archives/Results_${time_stamp_archive}_${reference}_${aligner}-${min_cov}X_${samples}sp_archive.tar.gz results/ echo -e " diff --git a/configuration/config.yaml b/configuration/config.yaml index c636b9de6e4c6510e0ddc2c0f228aa076cae5ac4..6b1b4c69752d555ef9a78c8da9819b2444579f74 100755 --- a/configuration/config.yaml +++ b/configuration/config.yaml @@ -36,12 +36,12 @@ resources: ############################################################################### modules: # GeVarLi modules analysis # Available options, set at least one: - - 'quality' # Perform reads quality control (FastQC, Fastq-Screen, MultiQC) default: ON + #- 'quality' # Perform reads quality control (FastQC, Fastq-Screen, MultiQC) default: ON #- 'trimming' # Keep trimmed reads files after alignment (Cutadapt / Sickle-trim) default: OFF #- 'cleapping' # Perform reads clipping (Bamclipper) default: OFF - 'consensus' # Perform reads mapping to reference (BWA, Bowtie2, Minimap2) default: ON - - 'covstats' # Perform coverage statistics (Fagstat, Covstats) default: ON - - 'nextclade' # Perform nextclade clade assignation (Nexclade) default: ON + #- 'covstats' # Perform coverage statistics (Fagstat, Covstats) default: ON + #- 'nextclade' # Perform nextclade clade assignation (Nexclade) default: ON #- 'pangolin' # Perform pangolin lineage assignation (Pangolin) default: OFF ###- 'gisaid' # Perform gisaid submission (GisAid : TODO) default: OFF @@ -51,7 +51,7 @@ consensus: reference: # Your reference, in fasta format (default: "SARS-CoV-2_Wuhan_MN-908947-3") # Available options (not exhaustive), set one: - 'SARS-CoV-2_Wuhan_MN-908947-3' # SARS-CoV-2 (Nextclade and Pangolin) - #- 
'Monkeypox-virus_Zaire_AF-380138-1' # Monkeypox (Nextclade and Pangolin) + - 'Monkeypox-virus_Zaire_AF-380138-1' # Monkeypox (Nextclade and Pangolin) #- 'Monkeypox-virus_UK_MT-903345-1' # Monkeypox (Nextclade and Pangolin) #- 'Swinepox-virus_India_MW-036632-1' # Swinepox (Nextclade) #- 'Ebola-virus_Zaire_AF-272001-1' # Ebola (na) @@ -96,8 +96,8 @@ consensus: aligner: # Select your favorite aligner (default: 'bwa') # Available options, set one: - 'bwa' # Better, Faster, (Stronger, Harder...) - #- 'bowtie2' # Depreciated (slower),"sensitivity" requiried (see below "Bowtie2" options) - #- 'minimap2' # A versatile sequence alignment program + - 'bowtie2' # Depreciated (slower),"sensitivity" requiried (see below "Bowtie2" options) + - 'minimap2' # A versatile sequence alignment program caller: 'ivar' ############################################################################### @@ -275,24 +275,24 @@ fastq_screen: ############################################################################### conda: - yaml: # Conda environement yaml files: - bamclipper: '../environments/bamclipper_v.1.0.0.yaml' # Bamclipper ver. 1.0.0 - bcftools: '../environments/bcftools_v.1.20.yaml' # Bcftools ver. 1.20 - bedtools: '../environments/bedtools_v.2.31.1.yaml' # Bedtools ver. 2.31.1 - bowtie2: '../environments/bowtie2_v.2.5.4.yaml' # Bowtie2 ver. 2.5.4 - bwa: '../environments/bwa_v.0.7.18.yaml' # BWA ver. 0.7.18 - cutadapt: '../environments/cutadapt_v.4.9.yaml' # Cutadapt ver. 4.9 - fastq_screen: '../environments/fastq-screen_v.0.15.3.yaml' # Fastq-Screen ver. 0.15.3 (with BWA ver. 0.7.18) - fastqc: '../environments/fastqc_v.0.12.1.yaml' # FastQC ver. 0.12.1 - gawk: '../environments/gawk_v.5.3.0.yaml' # Awk (GNU) ver. 5.3.0 - ivar: '../environments/ivar_v.1.4.3.yaml' # iVar ver. 1.4.3 (with Samtools ver. 1.20) - minimap2: '../environments/minimap2_v.2.28.yaml' # Minimap2 ver. 2.28 - multiqc: '../environments/multiqc_v.1.23.yaml' # MultiQC ver. 1.23 (with Pandoc ver. 
3.3) - nextclade: '../environments/nextclade_v.3.9.1.yaml' # Nextclade ver. 3.9.1 - pangolin: '../environments/pangolin_v.4.3.yaml' # Pangolin ver. 4.3 - samtools: '../environments/samtools_v.1.20.yaml' # Samtools ver. 1.20 - sickle_trim: '../environments/sickle-trim_v.1.33.yaml' # Sickle-trim ver. 1.33 - tsv2vcf: '../environments/tsv2vcf_v.1.1.yaml' # tsv2vcf ver. 1.1 (bcftools biopython numpy) - workflow_base: '../environments/workflow-base_v.2024.11.yaml' # Workflow-Base ver. 2024.11 + envs: # Conda environement yaml files: + bamclipper: 'envs/bamclipper_v.1.0.0.yaml' # Bamclipper ver. 1.0.0 + bcftools: 'envs/bcftools_v.1.20.yaml' # Bcftools ver. 1.20 + bedtools: 'envs/bedtools_v.2.31.1.yaml' # Bedtools ver. 2.31.1 + bowtie2: 'envs/bowtie2_v.2.5.4.yaml' # Bowtie2 ver. 2.5.4 + bwa: 'envs/bwa_v.0.7.18.yaml' # BWA ver. 0.7.18 + cutadapt: 'envs/cutadapt_v.4.9.yaml' # Cutadapt ver. 4.9 + fastq_screen: 'envs/fastq-screen_v.0.15.3.yaml' # Fastq-Screen ver. 0.15.3 (with BWA ver. 0.7.18) + fastqc: 'envs/fastqc_v.0.12.1.yaml' # FastQC ver. 0.12.1 + gawk: 'envs/gawk_v.5.3.0.yaml' # Awk (GNU) ver. 5.3.0 + ivar: 'envs/ivar_v.1.4.3.yaml' # iVar ver. 1.4.3 (with Samtools ver. 1.20) + minimap2: 'envs/minimap2_v.2.28.yaml' # Minimap2 ver. 2.28 + multiqc: 'envs/multiqc_v.1.23.yaml' # MultiQC ver. 1.23 (with Pandoc ver. 3.3) + nextclade: 'envs/nextclade_v.3.9.1.yaml' # Nextclade ver. 3.9.1 + pangolin: 'envs/pangolin_v.4.3.yaml' # Pangolin ver. 4.3 + samtools: 'envs/samtools_v.1.20.yaml' # Samtools ver. 1.20 + sickle_trim: 'envs/sickle-trim_v.1.33.yaml' # Sickle-trim ver. 1.33 + tsv2vcf: 'envs/tsv2vcf_v.1.1.yaml' # tsv2vcf ver. 1.1 (bcftools biopython numpy) + workflow_base: 'envs/workflow-base_v.2024.11.yaml' # Workflow-Base ver. 
2024.11 ############################################################################### diff --git a/version.txt b/version.txt index 424805bac262e0487a8e9a7c654536c0a5e7e922..eb11b6f54b840a21cb459d580529691f99ac6dcd 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -2024.11 +2025.01 diff --git a/workflow/Snakefile b/workflow/Snakefile index 118850c0b068c9c4e289dc417375bad465b8965f..e58983186c74103324d3ba8252b639ce988a2208 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,4 +1,1454 @@ -include: "workflow/snakefiles/quality_control.smk" -include: "workflow/snakefiles/indexing_genomes.smk" -include: "workflow/snakefiles/gevarli.smk" -conda_prefix: "workflow/environments/" \ No newline at end of file +###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### +### ### +### /\ ______ ___ ____ _ _ __ ____ __ ____ ______ /\ ### +### || \ \ \ \ / __( ___( \/ )__\ ( _ ( ) (_ _) / / / / || ### +### || > > > > ( (_-.)__) \ /(__)\ ) /)(__ _)(_ < < < < || ### +### || /_/_/_/ \___(____) \(__)(__(_)\_(____(____) \_\_\_\ || ### +### \/ \/ ### +### ### +###I###R###D######U###2###3###3#######T###R###A###N###S###V###I###H###M###I#### +# Name ___________________ gevarli.smk +# Version ________________ v.2025.01 +# Author _________________ Nicolas Fernandez +# Affiliation ____________ IRD_U233_TransVIHMI +# Aim ____________________ Snakefile with GeVarLi rules +# Date ___________________ 2021.10.12 +# Latest modifications ___ 2025.01.08 (Prepare for Snakedeploy) +# Use ____________________ snakemake -s Snakemake --use-conda + +############################################################################### +### CONFIGURATION ### +##################### + +configfile: "configuration/config.yaml" + +############################################################################### +### FUNCTIONS ### +################# + +def get_memory_per_thread(wildcards): + memory_per_thread = int(RAM) // int(CPUS) + if memory_per_thread < 1: + memory_per_thread = 1 + 
return memory_per_thread + +def get_quality_input(wildcards): + quality_list = [] + if "quality" in MODULES: + quality_list = "results/00_Quality_Control/multiqc/" + return quality_list + +def get_trimming_input(wildcards): + trimming_list = [] + if "trimming" in MODULES: + trimming_list = expand("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz", + sample = SAMPLE) + return trimming_list + +def stash_or_trash(path): + if "trimming" in MODULES: + return path + else: + return temp(path) + +def get_cleapping_input(wildcards): + cleapping_list = [] + if "cleapping" in MODULES: + cleapping_list = "" + return cleapping_list + +def get_bam_input(wildcards): + markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + if "cleapping" in MODULES: + markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam" + return markdup + +def get_bai_input(wildcards): + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" + if "cleapping" in MODULES: + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai" + return index + +def get_flagstat_input(wildcards): + flagstat_list = [] + if "covstats" in MODULES: + flagstat_list = expand( + "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + ext = ["txt", "tsv", "json"] + ) + return flagstat_list + +def get_covstats_input(wildcards): + covstats_list = [] + if "covstats" in MODULES: + covstats_list = expand( + "results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV + ) + return covstats_list + +def get_consensus_input(wildcards): + consensus_list = [] + if "consensus" in MODULES: + consensus_list = expand( + "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta", + reference = REFERENCE, 
+ sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER + ) + return consensus_list + +def get_vcf_input(wildcards): + vcf_list = [] + if "consensus" in MODULES: + vcf_list = expand( + "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_variant-filt.vcf", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER + ) + return vcf_list + +def get_pangolin_input(wildcards): + pangolin_list = [] + if "pangolin" in MODULES: + pangolin_list = expand( + "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER + ) + return pangolin_list + +def get_nextclade_input(wildcards): + nextclade_list = [] + if "nextclade" in MODULES: + nextclade_list = expand( + "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv", + reference = REFERENCE, + sample = SAMPLE, + aligner = ALIGNER, + min_cov = MIN_COV, + caller = CALLER + ) + return nextclade_list + +def get_gisaid_input(wildcards): + gisaid_list = [] + if "gisaid" in MODULES: + gisaid_list = expand( + "" + ) + return gisaid_list + +############################################################################### +### WILDCARDS ### +################# + +FASTQ, = glob_wildcards("resources/symlinks/{fastq}.fastq.gz") +SAMPLE, = glob_wildcards("resources/symlinks/{sample}_R1.fastq.gz") + +############################################################################### +### RESOURCES ### +################# + +CPUS = config["resources"]["cpus"] # Threads (maximum) +RAM = config["resources"]["ram"] # Memory (RAM) in Gb (maximum) +MEM_GB = get_memory_per_thread # Memory per thread in GB (maximum) +TMP_DIR = config["resources"]["tmp_dir"] # Temporary directory + +############################################################################### +### ENVIRONMENTS ### 
+#################### + +MULTIQC = config["conda"]["envs"]["multiqc"] # Multi-QC conda env +FASTQ_SCREEN = config["conda"]["envs"]["fastq_screen"] # Fastq-Screen conda env +FASTQC= config["conda"]["envs"]["fastqc"] # FastQC conda env +CUTADAPT = config["conda"]["envs"]["cutadapt"] # Cutadapt conda environment +SICKLE_TRIM = config["conda"]["envs"]["sickle_trim"] # Sickle-Trim conda environment +MINIMAP2 = config["conda"]["envs"]["minimap2"] # BWA conda environment +BWA = config["conda"]["envs"]["bwa"] # BWA conda environment +BOWTIE2 = config["conda"]["envs"]["bowtie2"] # Bowtie2 conda environment +SAMTOOLS = config["conda"]["envs"]["samtools"] # SamTools conda environment +BEDTOOLS = config["conda"]["envs"]["bedtools"] # BedTools conda environment +BAMCLIPPER = config["conda"]["envs"]["bamclipper"] # BAMClipper +GAWK = config["conda"]["envs"]["gawk"] # Awk (GNU) conda environment +IVAR = config["conda"]["envs"]["ivar"] # iVar conda environment +TSV2VCF = config["conda"]["envs"]["tsv2vcf"] # tsv2vcf conda environment +BCFTOOLS = config["conda"]["envs"]["bcftools"] # BcfTools conda environment +PANGOLIN = config["conda"]["envs"]["pangolin"] # Pangolin conda environment +NEXTCLADE = config["conda"]["envs"]["nextclade"] # Nextclade conda environment + +############################################################################### +### PARAMETERS ### +################## + +MODULES = config["modules"] # Modules + +SUBSET = config["fastq_screen"]["subset"] # Fastq-Screen --subset +FQC_CONFIG = config["fastq_screen"]["config"] # Fastq-Screen --conf +#MQC_PATH = config["multiqc"]["path"] # MultiQC --conf +#MQC_CONF = config["multiqc"]["config"] # MultiQC --conf +#TAG = config["multiqc"]["tag"] # MultiQC --tag + +KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # MM2 k-mer size +MINIMIZER_SIZE = config["minimap2"]["algorithm"]["minimizer_size"] # MM2 minimizer window size +SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # MM2 split index +#HOMOPOLYMER = 
config["minimap2"]["algorithm"]["homopolymer"] # MM2 if PacBio +BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm +BT2_ALGO = config["bowtie2"]["algorithm"] # BT2 indexing algorithm + +REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format +REF_PATH = config["consensus"]["path"] # Path to genomes references +MIN_COV = config["consensus"]["min_cov"] # Minimum coverage, mask lower regions with 'N' +IUPAC = config["consensus"]["iupac"] # Output variants in the form of IUPAC ambiguity codes +ALIGNER = config["consensus"]["aligner"] # Aligner ('minimap2', 'bwa' or 'bowtie2') +CALLER = config["consensus"]["caller"] # Variant Caller ('lofreq' or 'ivar') + +CUT_LENGTH = config["cutadapt"]["length"] # Cutadapt --minimum-length +CUT_TRUSEQ = config["cutadapt"]["kits"]["truseq"] # Cutadapt --adapter Illumina TruSeq +CUT_NEXTERA = config["cutadapt"]["kits"]["nextera"] # Cutadapt --adapter Illumina Nextera +CUT_SMALL = config["cutadapt"]["kits"]["small"] # Cutadapt --adapter Illumina Small +CUT_CLIPPING = config["cutadapt"]["clipping"] # Cutadapt --cut + +SIC_COMMAND = config["sickle_trim"]["command"] # Sickle-trim command +SIC_ENCODING = config["sickle_trim"]["encoding"] # Sickle-trim --qual-type +SIC_QUALITY = config["sickle_trim"]["quality"] # Sickle-trim --qual-threshold +SIC_LENGTH = config["sickle_trim"]["length"] # Sickle-trim --length-treshold + +MM2_PATH = config["minimap2"]["path"] # Minimpa2 path to indexes +MM2_PRESET = config["minimap2"]["preset"] # Minimap2 preset +#MM2_LENGTH = config["minimap2"]["length"] # Minimap2 length +#MM2_KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # Minimap2 k-mer size +#MM2_MINIMIZER_SIZE = config["minimap2"]["algorithm"]["minimizer_size"] # Minimap2 minimizer window size +#MM2_SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # Minimap2 split index +#MM2_HOMOPOLYMER = config["minimap2"]["algorithm"]["homopolymer"] # Minimap2 for PacBio + +BWA_PATH = 
config["bwa"]["path"] # BWA path to indexes +BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm + +BT2_PATH = config["bowtie2"]["path"] # Bowtie2 path to indexes +BT2_ALGO = config["bowtie2"]["algorithm"] # Bowtie2 indexing algorithm +BT2_SENSITIVITY = config["bowtie2"]["sensitivity"] # Bowtie2 sensitivity preset + +CLIPPATH = config["bamclipper"]["path"] # Bamclipper path to primers +PRIMERS = config["bamclipper"]["primers"] # Bamclipper primers bed files +UPSTREAM = config["bamclipper"]["upstream"] # Bamclipper upstream nucleotides +DOWNSTREAM = config["bamclipper"]["downstream"] # Bamclipper downstream nucleotides + +IVAR_MIN_DEPTH = config["consensus"]["min_cov"] # iVar +IVAR_MIN_FREQ = config["consensus"]["min_freq"] # iVar minimum allele frequency allowed +IVAR_MIN_INSERT = config["consensus"]["min_freq"] # iVar minimum insertion frequency allowed +#IVAR_MIN_DEPTH = config["ivar"]["min_depth"] # iVar +#IVAR_MIN_FREQ = config["ivar"]["min_freq"] # iVar minimum allele frequency allowed +#IVAR_MIN_INSERT = config["ivar"]["min_insert"] # iVar minimum insertion frequency allowed +IVAR_MAX_DEPTH = config["ivar"]["max_depth"] # iVar +IVAR_MIN_BQ = config["ivar"]["min_bq"] # iVar +IVAR_MIN_QUAL = config["ivar"]["min_qual"] # iVar +IVAR_MAP_QUAL = config["ivar"]["map_qual"] # iVar mapping quality + +NEXT_PATH = config["nextclade"]["path"] # Path to nextclade dataset +NEXT_DATASET = config["nextclade"]["dataset"] # Nextclade dataset + +############################################################################### +### RULES ### +############# +rule all: + input: + multiqc = get_quality_input, + trimming = get_trimming_input, + consensus = get_consensus_input, + flagstat = get_flagstat_input, + covstats = get_covstats_input, + vcf = get_vcf_input, + pangolin = get_pangolin_input, + nextclade = get_nextclade_input, + gisaid = get_gisaid_input + + +############################################################################### 
+################################## LINEAGES ################################### +############################################################################### + +############################################################################### +rule nextclade_lineage: + # Aim: nextclade lineage assignation + # Use: nextclade [QUERY.fasta] -t [THREADS] --outfile [NAME.csv] + message: + """ + ~ Nextclade ∞ Assign Lineage ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. cov.: _______ {wildcards.min_cov}X + Variant caller: __ {wildcards.caller} + """ + conda: + NEXTCLADE + resources: + cpus = CPUS + params: + path = NEXT_PATH, + dataset = NEXT_DATASET + input: + consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" + output: + lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-report.tsv", + alignment = directory("results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_nextclade-all/") + log: + "results/10_Reports/tools-log/nextclade/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log" + shell: + "nextclade " # Nextclade, assign queries sequences to clades and reports potential quality issues + "run " # Run analyzis + "--jobs {resources.cpus} " # -j: Number of CPU threads used by the algorithm (default: all available threads) + "--input-dataset {params.path}{params.dataset} " # -raq: Path to a directory containing a dataset (root-seq, tree and qc-config required) + "--output-tsv {output.lineage} " # -t: Path to output TSV results file + "--output-all {output.alignment} " # -O: Produce all of the output files into this directory, using default basename + "{input.consensus} " # Path to a .fasta file with input sequences + "&> {log}" # Log redirection + +############################################################################### +rule pangolin_lineage: + # Aim: lineage 
mapping + # Use: pangolin [QUERY.fasta] -t [THREADS] --outfile [NAME.csv] + message: + """ + ~ Pangolin ∞ Assign Lineage ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. cov.: _______ {wildcards.min_cov}X + Variant caller: __ {wildcards.caller} + """ + conda: + PANGOLIN + resources: + cpus = CPUS, + tmp_dir = TMP_DIR + input: + consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" + output: + lineage = "results/06_Lineages/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_pangolin-report.csv" + log: + "results/10_Reports/tools-log/pangolin/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_lineage.log" + shell: + "pangolin " # Pangolinn, Phylogenetic Assignment of Named Global Outbreak LINeages + "{input.consensus} " # Query fasta file of sequences to analyse + "--threads {resources.cpus} " # -t: Number of threads + "--tempdir {resources.tmp_dir} " # Specify where you want the temp stuff to go (default: $TMPDIR) + "--outfile {output.lineage} " # Optional output file name (default: lineage_report.csv) + "&> {log}" # Log redirection + +############################################################################### +################################## CONSENSUS ################################## +############################################################################### + +############################################################################### +rule sed_rename_headers: + # Aim: rename all fasta header with sample name + # Use: sed 's/[OLD]/[NEW]/' [IN] > [OUT] + message: + """ + ~ Sed ∞ Rename Fasta Header ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X + Variant caller: __ {wildcards.caller} + """ + conda: + GAWK + input: + cons_tmp = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta.tmp" + output: + consensus = "results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_consensus.fasta" + log: + "results/10_Reports/tools-log/sed/{reference}/{sample}_{aligner}_{min_cov}X_{caller}_fasta-header.log" + shell: + "sed " # Sed, a Stream EDitor used to perform basic text transformations on an input stream + "'s/^>.*$/>{wildcards.sample}_{wildcards.aligner}_{wildcards.min_cov}X_{wildcards.caller}/' " + "{input.cons_tmp} " # Input file + "1> {output.consensus} " # Output file + "2> {log}" # Log redirection + +############################################################################### +########################### VARIANTS CALLING - IVAR ########################### +############################################################################### + +############################################################################### +rule convert_tsv2vcf: + message: + """ + ~ iVar ∞ Convert TSV to VCF file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X + Variant caller: __ iVar + """ + conda: + TSV2VCF + input: + tsv = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" + output: + vcf = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-filt.vcf" + log: + "results/10_Reports/tools-log/tsv2vcf/{reference}/{sample}_{aligner}_{min_cov}X_ivar_tsv2vcf.log" + shell: + "python3 " # Python 3 + "workflow/scripts/ivar_variants_to_vcf.py " # Script (from viralrecon) + "{input.tsv} " # TSV input + "{output.vcf} " # VCF output + "&> {log}" # Log redirection + +############################################################################### +rule ivar_consensus: + # Aim: + # Use: + message: + """ + ~ iVar ∞ Call Consensus ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. cov.: _______ {wildcards.min_cov}X + Variant caller: __ iVar + """ + conda: + IVAR + params: + min_depth = IVAR_MIN_DEPTH, + min_freq = IVAR_MIN_FREQ, + min_insert = IVAR_MIN_INSERT, + max_depth = IVAR_MAX_DEPTH, + min_bq = IVAR_MIN_BQ, + min_qual = IVAR_MIN_QUAL, + baq = IVAR_MAP_QUAL + input: + mark_dup = get_bam_input, + variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" + output: + prefix = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus"), + cons_tmp = temp("results/05_Consensus/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.fasta.tmp"), + qual_txt = "results/05_Consensus/{reference}/ivar_consensus-quality/{sample}_{aligner}_{min_cov}X_ivar_consensus.qual.txt", + log: + "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_consensus.log" + shell: + "samtools mpileup " # Samtools mpileup, tools for alignments in the SAM format with command multi-way pileup + "--verbosity 0 " # Set level of verbosity [INT] + "-a " # -a: output all positions (including zero 
depth) + "-a " # -a -a / -aa: output absolutely all positions, including unused ref. sequences + "--count-orphans " # -A: do not discard anomalous read pairs + "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage [INT] (default: 8000) + "{params.baq} " # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality) + "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than [INT] (default: 13) + #"--reference {input.masked_ref} " # Reference sequence FASTA FILE + "{input.mark_dup} " # Markdup BAM input + "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Remove this stdout + "| " ### PIPE to iVar + "ivar consensus " # iVar, with command 'consensus': Call consensus from aligned BAM file + "-p {output.prefix} " # -p: prefix + "-q {params.min_qual} " # -q: Minimum quality score threshold to count base [INT] (Default: 20) + "-t {params.min_freq} " # -t: Minimum frequency threshold to call variants [FLOAT] (Default: 0.03) + "-c {params.min_insert} " # -c: Minimum insertion frequency threshold to call consensus [FLOAT] (Default: 0.8) + "-m {params.min_depth} " # -m: Minimum read depth to call variants [INT] (Default: 0) + "-n N " # -n: Character to print in regions with less than minimum coverage (Default: N) + #"-i {wildcards.sample} " # -i: Name of fasta header (default: Consensus_<prefix>_threshold_<min_freq>_quality_<min_qual>_<min_insert> + "&> {log} " # Log redirection + "&& mv {output.prefix}.fa {output.cons_tmp} " # copy consensus.fa (temp) to consensus.fasta.tmp (tmp) + "&& mv {output.prefix}.qual.txt {output.qual_txt} " # cppty consensus.qual.txt (tmp) to ivar_consensus-quality/ directory + "&& touch {output.prefix}" # Touch done + +############################################################################### +rule ivar_variant_calling: + # Aim: SNVs and Indels calling + # Use: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t 
<min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file] + # Note: samtools mpileup output must be piped into ivar variants + message: + """ + ~ iVar ∞ Call Variants ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. cov.: _______ {wildcards.min_cov}X + Variant caller: __ iVar + """ + conda: + IVAR + params: + min_depth = IVAR_MIN_DEPTH, + min_freq = IVAR_MIN_FREQ, + min_insert = IVAR_MIN_INSERT, + max_depth = IVAR_MAX_DEPTH, + min_bq = IVAR_MIN_BQ, + min_qual = IVAR_MIN_QUAL, + baq = IVAR_MAP_QUAL + input: + masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta", + mark_dup = get_bam_input + output: + variant_call = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.tsv" + log: + "results/10_Reports/tools-log/ivar/{reference}/{sample}_{aligner}_{min_cov}X_ivar_variant-call.log" + shell: + "samtools mpileup " # Samtools mpileup, tools for alignments in the SAM format with command multi-way pileup + "--verbosity 0 " # Set level of verbosity [INT] + "-a " # -a: output all positions (including zero depth) + "-a " # -a -a / -aa: output absolutely all positions, including unused ref. 
sequences + "--count-orphans " # -A: do not discard anomalous read pairs + "--max-depth {params.max_depth} " # -d: max per-file depth; avoids excessive memory usage (default: 8000) [INT] + "{params.baq} " # --no-BAQ / -B: disable BAQ (per-Base Alignment Quality) + "--min-BQ {params.min_bq} " # -Q: skip bases with baseQ/BAQ smaller than (default: 13) [INT] + "--reference {input.masked_ref} " # Reference sequence FASTA FILE + "{input.mark_dup} " # Markdup BAM input + "2>&1 | grep -v '[mpileup] 1 samples in 1 input files' " # Remove this stdout + "| " ### pipe to iVar + "ivar variants " # iVar, with command 'variants': Call variants from aligned BAM file + "-p {output.variant_call} " # -p: prefix + "-q {params.min_qual} " # -q: Minimum quality score threshold to count base (Default: 20) [INT] + "-t {params.min_freq} " # -t: Minimum frequency threshold to call variants (Default: 0.03) [FLOAT] + "-m {params.min_depth} " # -m: Minimum read depth to call variants (Default: 0) [INT] + "-r {input.masked_ref} " # -r: Reference file used for alignment (translate the nuc. sequences and identify intra host single nuc. variants) + #"-g " # -g: A GFF file in the GFF3 format can be supplied to specify coordinates of open reading frames (ORFs) + "&> {log}" # Log redirection + + +############################################################################### +############################ MASKING LOW COVERAGE ############################# +############################################################################### + +############################################################################### +rule bedtools_masking: + # Aim: masking low coverage regions + # Use: bedtools maskfasta [OPTIONS] -fi [REFERENCE.fasta] -bed [RANGE.bed] -fo [MASKEDREF.fasta] + message: + """ + ~ BedTools ∞ Mask Low Coverage Regions ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X + """ + conda: + BEDTOOLS + params: + path = REF_PATH + input: + low_cov_mask = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed" + output: + masked_ref = "results/04_Variants/{reference}/{sample}_{aligner}_{min_cov}X_masked-ref.fasta" + log: + "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_masking.log" + shell: + "bedtools maskfasta " # Bedtools maskfasta, mask a fasta file based on feature coordinates + "-fi {params.path}{wildcards.reference}.fasta " # Input FASTA file + "-bed {input.low_cov_mask} " # BED/GFF/VCF file of ranges to mask in -fi + "-fo {output.masked_ref} " # Output masked FASTA file + "&> {log}" # Log redirection + +############################################################################### +rule bedtools_merged_mask: + # Aim: merging overlaps + # Use: bedtools merge [OPTIONS] -i [FILTERED.bed] -g [GENOME.fasta] + message: + """ + ~ BedTools ∞ Merge Overlaps ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X + """ + conda: + BEDTOOLS + input: + min_cov_filt = "results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed" + output: + low_cov_mask = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_low-cov-mask.bed") + log: + "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_{min_cov}X_merging.log" + shell: + "bedtools merge " # Bedtools merge, merges overlapping BED/GFF/VCF entries into a single interval + "-i {input.min_cov_filt} " # -i: BED/GFF/VCF input to merge + "1> {output.low_cov_mask} " # merged output + "2> {log}" # Log redirection + +############################################################################### +rule awk_min_covfilt: + # Aim: minimum coverage filtration + # Use: awk '$4 < [MIN_COV]' [BEDGRAPH.bed] 1> [FILTERED.bed] + message: + """ + ~ Awk ∞ Minimum Coverage Filtration ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + Min. 
cov.: _______ {wildcards.min_cov}X + """ + conda: + GAWK + input: + genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" + output: + min_cov_filt = temp("results/03_Coverage/{reference}/bed/{sample}_{aligner}_{min_cov}X_min-cov-filt.bed") + log: + "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_min-cov-filt.log" + shell: + "awk " # Awk, a program that you can use to select particular records in a file and perform operations upon them + "'$4 < {wildcards.min_cov}' " # Minimum coverage for masking regions in consensus sequence + "{input.genome_cov} " # BedGraph coverage input + "1> {output.min_cov_filt} " # Minimum coverage filtered bed output + "2> {log} " # Log redirection + +############################################################################### +################################## STATISTICS ################################# +############################################################################### + +############################################################################### +rule awk_coverage_statistics: + # Aim: computing genome coverage stats + # Use: awk {FORMULA} END {{print [RESULTS.tsv] [BEDGRAPH.bed] + message: + """ + ~ Awk ∞ Compute Genome Coverage Statistics from BED file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + GAWK + input: + cutadapt = "results/10_Reports/tools-log/cutadapt/{sample}.log", + sickle = "results/10_Reports/tools-log/sickle-trim/{sample}.log", + samtools = "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log", + flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.json", + histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt", + genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" + output: + cov_stats = 
"results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv" + log: + "results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.log" + shell: + r""" rawReads=$(grep -o -E """ # Get raw reads + r""" 'Total read pairs processed:.+' {input.cutadapt} """ # + r""" | sed -E 's/Total read pairs processed:\ +//' """ # + r""" | sed 's/,//g') ; """ # + # + r""" cutadaptPF=$(grep -o -E """ # Get cutadapt Passing Filtes reads + r""" 'Pairs written \(passing filters\):.+' {input.cutadapt} """ # + r""" | sed -E 's/Pairs written \(passing filters\):\ +//' """ # + r""" | sed 's/,//g') ; """ # + # + r""" sicklePF=$(grep -o -E """ # Get sickle Passing Filtes reads + r""" 'FastQ paired records kept:.+' {input.sickle} """ # + r""" | sed -E 's/FastQ paired records kept:\ +//') ; """ # + # + r""" totalDuplicate=$(grep -o -E """ # Get total duplicated reads + r""" 'DUPLICATE TOTAL:.+' {input.samtools} """ # + r""" | sed -E 's/DUPLICATE TOTAL:\ +//') ; """ # + # + r""" estimatedLibrarySize=$(grep -o -E """ # Get estimated library size + r""" 'ESTIMATED_LIBRARY_SIZE:.+' {input.samtools} """ # + r""" | sed -E 's/ESTIMATED_LIBRARY_SIZE:\ +//') ; """ # + # + r""" samtoolsPF=$(grep -o -E """ # Get samtool Passing Filter reads + r""" 'WRITTEN: .+' {input.samtools} """ # + r""" | sed -E 's/WRITTEN:\ +//') ; """ # + # + r""" mappedReads=$(grep -o -E -m 1 """ # Get mapped reads + r""" '"mapped": .+' {input.flagstat} """ # + r""" | sed -E 's/"mapped":\ +//' """ # + r""" | sed 's/,//g') ; """ # + # + r""" mappedPercentReads=$(grep -o -E -m 1 """ # Get mapped precent reads + r""" '"mapped %": .+' {input.flagstat} """ # + r""" | sed -E 's/"mapped %":\ +//' """ # + r""" | sed 's/,//g') ; """ # + # + r""" covPercentAt1X=$(grep -o -E """ # Get coverage percent @1X + r""" 'Percent covered:.+' {input.histogram} """ # + r""" | sed -E 's/Percent covered:\ +//') ; """ # + # + r""" awk """ # Awk, a program to select particular records in a file and perform 
operations upon them + r""" -v rawReads="${{rawReads}}" """ # Define external variable + r""" -v cutadaptPF="${{cutadaptPF}}" """ # Define external variable + r""" -v sicklePF="${{sicklePF}}" """ # Define external variable + r""" -v totalDuplicate="${{totalDuplicate}}" """ # Define external variable + r""" -v estimatedLibrarySize="${{estimatedLibrarySize}}" """ # Define external variable + r""" -v samtoolsPF="${{samtoolsPF}}" """ # Define external variable + r""" -v mappedReads="${{mappedReads}}" """ # Define external variable + r""" -v mappedPercentReads="${{mappedPercentReads}}" """ # Define external variable + r""" -v covPercentAt1X="${{covPercentAt1X}}" """ # Define external variable + r""" '$4 >= {wildcards.min_cov} {{supMin_Cov+=$3-$2}} ; """ # Genome size (>= min_cov @X) + r""" {{genomeSize+=$3-$2}} ; """ # Genome size (total) + r""" {{totalBases+=($3-$2)*$4}} ; """ # Total bases @1X + r""" {{totalBasesSq+=(($3-$2)*$4)**2}} """ # Total bases square @1X + r""" END """ # END + r""" {{print """ # Print + r""" "sample_id", "\t", """ # header: Sample ID + r""" "raw_paired_reads", "\t", """ # header: Raw paired reads + r""" "cutadapt_pairs_PF", "\t", """ # header: Cutadapt Passing Filters + r""" "sickle_reads_PF", "\t", """ # header: Sickle-trim Passing Filters + r""" "duplicated_reads", "\t", """ # header: + r""" "duplicated_percent_%","\t", """ # header: + r""" "estimated_library_size*", "\t", """ # header: + r""" "samtools_pairs_PF", "\t", """ # header: + r""" "mapped_with", "\t", """ # header: aligner + r""" "mapped_on", "\t", """ # header: reference + r""" "mapped_reads", "\t", """ # header: + r""" "mapped_percent_%", "\t", """ # header: + r""" "mean_depth", "\t", """ # header: mean depth + r""" "standard_deviation", "\t", """ # header: standard deviation + r""" "cov_percent_%_@1X", "\t", """ # header: coverage percentage @1X + r""" "cov_percent_%" "\t", """ # header: coverage percentage + r""" "@_min_cov" """ # header: @_[min_cov]_X + r""" ORS """ # \n 
newline + r""" "{wildcards.sample}", "\t", """ # value: Sample ID + r""" rawReads, "\t", """ # value: Raw sequences + r""" cutadaptPF, "\t", """ # value: Cutadapt Passing Filter + r""" sicklePF, "\t", """ # value: Sickle Passing Filter + r""" totalDuplicate, "\t", """ # value: + r""" int(((totalDuplicate)/(rawReads*2))*100), "%", "\t", """ # value: (divided by 2 to estimated pairs) + r""" estimatedLibrarySize, "\t", """ # value: + r""" samtoolsPF, "\t", """ # value: + r""" "{wildcards.aligner}", "\t", """ # value: aligner + r""" "{wildcards.reference}", "\t", """ # value: reference + r""" mappedReads, "\t", """ # value: + r""" mappedPercentReads, "%", "\t", """ # value: + r""" int(totalBases/genomeSize), "\t", """ # value: mean depth + r""" int(sqrt((totalBasesSq/genomeSize)-(totalBases/genomeSize)**2)), "\t", """ # Standard deviation value + r""" covPercentAt1X, "\t", """ # value + r""" supMin_Cov/genomeSize*100, "%", "\t", """ # Coverage percent (@ min_cov X) value + r""" "@{wildcards.min_cov}X" """ # @ min_cov X value + r""" }}' """ # Close print + r""" {input.genome_cov} """ # BedGraph coverage input + r""" 1> {output.cov_stats} """ # Mean depth output + r""" 2> {log}""" # Log redirection + +############################################################################### +rule bedtools_genome_coverage: + # Aim: computing genome coverage sequencing + # Use: bedtools genomecov [OPTIONS] -ibam [MARK-DUP.bam] 1> [BEDGRAPH.bed] + message: + """ + ~ BedTools ∞ Compute Genome Coverage ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + BEDTOOLS + input: + mark_dup = get_bam_input, + index = get_bai_input + output: + genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed" + log: + "results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_genome-cov.log" + shell: + "bedtools genomecov " # Bedtools genomecov, compute the coverage of a feature file 
among a genome + "-bga " # Report depth in BedGraph format, regions with zero coverage are also reported + "-ibam {input.mark_dup} " # The input file is in BAM format, must be sorted by position + "1> {output.genome_cov} " # BedGraph output + "2> {log} " # Log redirection + +############################################################################### +rule samtools_coverage_histogram: + # Aim: alignment depth and percent coverage histogram + # Use: samtools coverage --histogram [INPUT.bam] + message: + """ + ~ SamTools ∞ Calcul Depth and Coverage from BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + #bins = BINS, + #depth = DEPTH + input: + mark_dup = get_bam_input + output: + histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_coverage-histogram.log" + shell: + "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage + "--histogram " # -m: show histogram instead of tabular output + "--verbosity 4 " # Set level of verbosity [INT] (default: 3) + "--n-bins 149 " # -w: number of bins in histogram (default: terminal width - 40) (todo: {params.bins}) + "--depth 0 " # -d maximum allowed coverage depth [INT] (default: 1000000 ; 0 removing any depth limit) (todo: {params.depth}) + "--output {output.histogram} " # write output to FILE (default: stdout) + "{input.mark_dup} ; " # Mark_dup bam input + "echo >> {output.histogram} ; " # Newline + "samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage + "--verbosity 4 " # Set level of verbosity [INT] (default: 3) + "{input.mark_dup} " # Mark_dup bam input + ">> {output.histogram} " # write output to FILE 
(default: stdout) + "2> {log}" # Log redirection + +############################################################################### +rule samtools_flagstat_ext: + # Aim: simple stats + # Use: samtools flagstat -@ [THREADS] [INPUT.bam] + message: + """ + ~ SamTools ∞ Calcul simple Stats from BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + mark_dup = get_bam_input + output: + flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_flagstat-{ext}.log" + shell: + "samtools flagstat " # Samtools flagstat, tools for alignments in the SAM format with command to simple stat + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "--verbosity 4 " # Set level of verbosity [INT] (default: 3) + "--output-fmt {wildcards.ext} " # -O Specify output format (none, tsv and json) + "{input.mark_dup} " # Mark_dup bam input + "1> {output.flagstat} " # Mark_dup index output + "2> {log}" # Log redirection + +############################################################################### +############################## PRIMER CLIPPING ############################### +############################################################################### + +############################################################################### +rule bamclipper_amplicon_primers: + # Aim: soft-clip amplicon PCR primers from BAM alignments + # Use: bamclipper.sh -n [THREADS] -b [MARKDUP.bam] -p [PRIMER.bed] -u [UPSTREAM] -d [DOWNSTREAM] + message: + """ + ~ BAMClipper ∞ soft-clipping amplicon PCR primers from BAM alignments ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + BAMCLIPPER + resources: + cpus = CPUS + params: + path 
= CLIPPATH, + primers = PRIMERS, + upstream = UPSTREAM, + downstream = DOWNSTREAM + input: + markdup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam", + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" + output: + bamclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam", + baiclip = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.primerclipped.bam.bai" + log: + "results/10_Reports/tools-log/bamclipper/{reference}/{sample}_{aligner}_primers-clip.log" + shell: + "bamclipper.sh " # BAMClipper, remove primer sequences from BAM alignments of PCR amplicons by soft-clipping + "-b {input.markdup} " # Indexed BAM alignment file + "-p {params.path}{params.primers}.bedpe " # BEDPE of primer pair locations + "-n {resources.cpus} " # Number of threads (default: 1) + "-u {params.upstream} " # Number of nuc. upstream for assigning alignments to primers (default: 5) + "-d {params.downstream} " # Number of nuc. downstream for assigning alignments to primers (default: 5) + #"-o results/02_Mapping/ " # Path to write output (if BamClipper v.1.1.2) (todo) + "&> {log} " # Log redirection + "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam {output.bamclip} " # because BamClipper v.1 default output system... + "&& mv {wildcards.sample}_{wildcards.aligner}_mark-dup.primerclipped.bam.bai {output.baiclip}" # because BamClipper v.1 default output system... 
+ +############################################################################### +############################## REMOVE DUPLICATES ############################## +############################################################################### + +############################################################################### +rule samtools_index_markdup: + # Aim: indexing marked as duplicate BAM file + # Use: samtools index -@ [THREADS] -b [MARK-DUP.bam] [INDEX.bai] + message: + """ + ~ SamTools ∞ Index 'Marked as Duplicate' BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + mark_dup = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + output: + index = "results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam.bai" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup-index.log" + shell: + "samtools index " # Samtools index, tools for alignments in the SAM format with command to index alignment + "-@ {resources.cpus} " # --threads: Number of additional threads to use (default: 1)(NB, --threads form doesn't work) + "-b " # -b: Generate BAI-format index for BAM files (default) + "{input.mark_dup} " # Mark_dup bam input + "{output.index} " # Mark_dup index output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_markdup: + # Aim: marking duplicate alignments + # Use: samtools markdup -@ [THREADS] -r -s -O BAM [SORTED.bam] [MARK-DUP.bam] + message: + """ + ~ SamTools ∞ Mark Duplicate Alignments ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + sorted = "results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam" + output: + mark_dup = 
"results/02_Mapping/{reference}/{sample}_{aligner}_mark-dup.bam" + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log" + shell: + "samtools markdup " # Samtools markdup, tools for alignments in the SAM format with command mark duplicates + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-r " # -r: Remove duplicate reads + "-s " # -s: Report stats + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "{input.sorted} " # Sorted bam input + "{output.mark_dup} " # Mark_dup bam output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_sorting: + # Aim: sorting + # Use: samtools sort -@ [THREADS] -m [MEM_GB] -T [TMP_DIR] -O BAM -o [SORTED.bam] [FIX-MATE.bam] + message: + """ + ~ SamTools ∞ Sort BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS, + mem_gb = MEM_GB, + tmp_dir = TMP_DIR + input: + fix_mate = "results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam" + output: + sorted = temp("results/02_Mapping/{reference}/{sample}_{aligner}_sorted.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sorted.log" + shell: + "samtools sort " # Samtools sort, tools for alignments in the SAM format with command to sort alignment file + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) + "-T {resources.tmp_dir} " # -T: Write temporary files to PREFIX.nnnn.bam + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "-o {output.sorted} " # Sorted bam output + "{input.fix_mate} " # Fixmate bam input + "&> {log}" # Log redirection + 
+############################################################################### +rule samtools_fixmate: + # Aim: filling in mate coordinates + # Use: samtools fixmate -@ [THREADS] -m -O BAM [SORT-BY-NAMES.bam] [FIX-MATE.bam] + message: + """ + ~ SamTools ∞ Fil Mate Coordinates ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS + input: + sort_by_names = "results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam" + output: + fix_mate = temp("results/02_Mapping/{reference}/{sample}_{aligner}_fix-mate.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_fix-mate.log" + shell: + "samtools fixmate " # Samtools fixmate, tools for alignments in the SAM format with command to fix mate information + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m " # -m: Add mate score tag + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "{input.sort_by_names} " # Sort_by_names bam input + "{output.fix_mate} " # Fix_mate bam output + "&> {log}" # Log redirection + +############################################################################### +rule samtools_sortbynames: + # Aim: sorting by names + # Use: samtools sort -t [THREADS] -m [MEM_GB] -n -O BAM -o [SORT-BY-NAMES.bam] [MAPPED.sam] + message: + """ + ~ SamTools ∞ Sort by Names BAM file ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ {wildcards.aligner} + """ + conda: + SAMTOOLS + resources: + cpus = CPUS, + mem_gb = MEM_GB + input: + mapped = "results/02_Mapping/{reference}/{sample}_{aligner}-mapped.sam" + output: + sort_by_names = temp("results/02_Mapping/{reference}/{sample}_{aligner}_sort-by-names.bam") + log: + "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_sort-by-names.log" + shell: + "samtools sort " 
# Samtools sort, tools for alignments in the SAM format with command to sort alignment file + "--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1) + "-m {resources.mem_gb}G " # -m: Set maximum memory per thread, suffix K/M/G recognized (default: 768M) + "-n " # -n: Sort by read name (not compatible with samtools index command) + "--output-fmt BAM " # -O: Specify output format: SAM, BAM, CRAM (here, BAM format) + "-o {output.sort_by_names} " # -o: Write final output to FILE rather than standard output + "{input.mapped} " # Mapped reads input + "&> {log}" # Log redirection + + +############################################################################### +################################### MAPPING ################################### +############################################################################### + +############################################################################### +rule minimap2_mapping: + # Aim: reads mapping against reference sequence + # Use: minimap2 + message: + """ + ~ Minimap2 ∞ Map Reads ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ MiniMap2 + """ + conda: + MINIMAP2 + resources: + cpus = CPUS + params: + preset = MM2_PRESET + #length = LENGTH + input: + mm2_indexes = "resources/indexes/minimap2/{reference}.mmi", + fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", + rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" + output: + mapped = temp("results/02_Mapping/{reference}/{sample}_minimap2-mapped.sam") + log: + "results/10_Reports/tools-log/minimap2/{sample}_{reference}.log" + shell: + "minimap2 " # Minimap2, a versatile sequence alignment program + "-x {params.preset} " # -x: presets (always applied before other options) + "-t {resources.cpus} " # -t: Number of threads (default: 3) + "-a " # -a: output in the SAM format (PAF by default) + #"-F {params.length} " # -F: max fragment 
length, effective with -x sr mode (default: 800) + "{input.mm2_indexes} " # Reference index filename prefix.mmi + # (-k, -w, -I and -H can't be changed during mapping) + #"resources/genomes/{wildcards.reference}.fasta " # Reference genome fasta format (for custom -kwIH) + #"-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] + #"-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] + #"-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] + #"{params.homopolymer} " # -H: use homopolymer-compressed k-mer (preferrable for PacBio) + "{input.fwd_reads} " # Forward input reads + "{input.rev_reads} " # Reverse input reads + "1> {output.mapped} " # SAM output + "2> {log}" # Log redirection + +############################################################################### +rule bwa_mapping: + # Aim: reads mapping against reference sequence + # Use: bwa mem -t [THREADS] -x [REFERENCE] [FWD_R1.fq] [REV_R2.fq] 1> [MAPPED.sam] + message: + """ + ~ BWA-MEM ∞ Map Reads ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ BWA + """ + conda: + BWA + resources: + cpus = CPUS + input: + bwa_indexes = "resources/indexes/bwa/{reference}", + fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", + rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" + output: + mapped = temp("results/02_Mapping/{reference}/{sample}_bwa-mapped.sam") + log: + "results/10_Reports/tools-log/bwa/{sample}_{reference}.log" + shell: + "bwa mem " # BWA-MEM algorithm, performs local alignment + "-t {resources.cpus} " # -t: Number of threads (default: 12) + "-v 1 " # -v: Verbosity level: 1=error, 2=warning, 3=message, 4+=debugging + "{input.bwa_indexes} " # Reference index filename prefix + "{input.fwd_reads} " # Forward input reads + "{input.rev_reads} " # Reverse input reads + "1> {output.mapped} " # SAM output + "2> 
{log}" # Log redirection + +############################################################################### +rule bowtie2_mapping: + # Aim: reads mapping against reference sequence + # Use: bowtie2 -p [THREADS] -x [REFERENCE] -1 [FWD_R1.fq] -2 [REV_R2.fq] -S [MAPPED.sam] + message: + """ + ~ Bowtie2 ∞ Map Reads ~ + Sample: __________ {wildcards.sample} + Reference: _______ {wildcards.reference} + Aligner: _________ Bowtie2 + """ + conda: + BOWTIE2 + resources: + cpus = CPUS + params: + sensitivity = BT2_SENSITIVITY + input: + bt2_indexes = "resources/indexes/bowtie2/{reference}", + fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", + rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" + output: + mapped = temp("results/02_Mapping/{reference}/{sample}_bowtie2-mapped.sam") + log: + "results/10_Reports/tools-log/bowtie2/{sample}_{reference}.log" + shell: + "bowtie2 " # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences + "--threads {resources.cpus} " # -p: Number of alignment threads to launch (default: 1) + "--reorder " # Keep the original read order (if multi-processor option -p is used) + "-x {input.bt2_indexes} " # -x: Reference index filename prefix (minus trailing .X.bt2) + "{params.sensitivity} " # Preset (default: "--sensitive") + # sensitive: same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15] + "-q " # -q: Query input files are FASTQ .fq/.fastq (default) + "-1 {input.fwd_reads} " # Forward input reads + "-2 {input.rev_reads} " # Reverse input reads + "1> {output.mapped} " # -S: File for SAM output (default: stdout) + "2> {log}" # Log redirection + +############################################################################### +################################## INDEXING ################################### +############################################################################### + 
+############################################################################### +rule minimap2_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: minimap2 [OPTIONS] -d [INDEX.mmi] <query.fasta> + message: + """ + ~ Minimap2 ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + MINIMAP2 + params: + kmer_size = KMER_SIZE, + minimizer_size = MINIMIZER_SIZE, + split_size = SPLIT_SIZE + #homopolymer = HOMOPOLYMER + input: + fasta = "resources/genomes/{reference}.fasta" + output: + mm2_indexes = multiext("resources/indexes/minimap2/{reference}", + ".mmi") + log: + "results/10_Reports/tools-log/minimap2-indexes/{reference}.log" + shell: + "minimap2 " # Minimap2, index sequences + "-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] + "-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] + "-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] + #"{params.homopolymer} " # use homopolymer-compressed k-mer (preferrable for PacBio) + "-d {output.mm2_indexes} " # -d: dump index to FILE [] + "{input.fasta} " # Reference sequences in the FASTA format + "&> {log}" # Log redirection + +############################################################################### +rule bwa_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: bwa index -a [ALGO] -p [PREFIX] <genome.fasta> + message: + """ + ~ BWA-SW ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + BWA + params: + algorithm = BWA_ALGO + input: + fasta = "resources/genomes/{reference}.fasta" + output: + prefix = "resources/indexes/bwa/{reference}", + bwa_indexes = multiext("resources/indexes/bwa/{reference}", + ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "results/10_Reports/tools-log/bwa-indexes/{reference}.log" + shell: + "bwa index " # BWA-SW algorithm, index sequences + "{params.algorithm} " # -a: Algorithm for constructing BWT index 
(default: auto) + "-p {output.prefix} " # -p: Prefix of the output database + "{input.fasta} " # Reference sequences in the FASTA format + "&> {log} " # Log redirection + "&& touch {output.prefix}" # Touch done + +############################################################################### +rule bowtie2_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: bowtie2-build [OPTIONS] <reference_in> <bt2_index_base> + message: + """ + ~ Bowtie2-build ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + BOWTIE2 + resources: + cpus = CPUS + params: + algorithm = BT2_ALGO + input: + fasta = "resources/genomes/{reference}.fasta" + output: + prefix = "resources/indexes/bowtie2/{reference}", + bt2_indexes = multiext("resources/indexes/bowtie2/{reference}", + ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", + ".rev.1.bt2", ".rev.2.bt2") + log: + "results/10_Reports/tools-log/bowtie2-indexes/{reference}.log" + shell: + "bowtie2-build " # Bowtie2-build, index sequences + "--quiet " # -q: quiet + "--threads {resources.cpus} " # Number of threads + "{params.algorithm} " # Force (or no by default) generated index to be 'large', + # even if ref has fewer than 4 billion nucleotides + "-f " # Reference files are FASTA (default) + "{input.fasta} " # Reference sequences files (comma-separated list) in the FASTA format + "{output.prefix} " # Write bt2 data to files with this dir/basename + "&> {log} " # Log redirection + "&& touch {output.prefix}" # Touch done + + +############################################################################### +################################## TRIMMING ################################### +############################################################################### + +############################################################################### +rule sickle_trim_quality: + # Aim: windowed adaptive trimming tool for FASTQ files using quality + # Use: sickle [COMMAND] [OPTIONS] + message: + """ + ~ Sickle-trim ∞ 
Trim Low Quality Sequences ~ + Sample: __________ {wildcards.sample} + """ + conda: + SICKLE_TRIM + params: + command = SIC_COMMAND, + encoding = SIC_ENCODING, + quality = SIC_QUALITY, + length = SIC_LENGTH + input: + fwd_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz", + rev_reads = "results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz" + output: + fwd_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz"), + rev_reads = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz"), + single = stash_or_trash("results/01_Trimming/sickle/{sample}_sickle-trimmed_SE.fastq.gz") + log: + "results/10_Reports/tools-log/sickle-trim/{sample}.log" + shell: + "sickle " # Sickle, a windowed adaptive trimming tool for FASTQ files using quality + "{params.command} " # Paired-end or single-end sequence trimming + "-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8 + "-q {params.quality} " # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20) + "-l {params.length} " # --length-threshold: Threshold to keep a read based on length after trimming (default: 20) + "-f {input.fwd_reads} " # --pe-file1: Input paired-end forward fastq file + "-r {input.rev_reads} " # --pe-file2: Input paired-end reverse fastq file + "-g " # --gzip-output: Output gzipped files + "-o {output.fwd_reads} " # --output-pe1: Output trimmed forward fastq file + "-p {output.rev_reads} " # --output-pe2: Output trimmed reverse fastq file (must use -s option) + "-s {output.single} " # --output-single: Output trimmed singles fastq file + "&> {log} " # Log redirection + +############################################################################### +rule cutadapt_adapters_removing: + # Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput 
sequencing reads + # Use: cutadapt [OPTIONS] -a/-A [ADAPTER] -o [OUT_FWD.fastq.gz] -p [OUT_REV.fastq.gz] [IN_FWD.fastq.gz] [IN_REV.fastq.gz] + # Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed + message: + """ + ~ Cutadapt ∞ Remove Adapters ~ + Sample: __________ {wildcards.sample} + """ + conda: + CUTADAPT + resources: + cpus = CPUS + params: + length = CUT_LENGTH, + truseq = CUT_TRUSEQ, + nextera = CUT_NEXTERA, + small = CUT_SMALL, + cut = CUT_CLIPPING + input: + fwd_reads = "resources/reads/{sample}_R1.fastq.gz", + rev_reads = "resources/reads/{sample}_R2.fastq.gz" + #fwd_reads = "/users/illumina/local/data/run_1/FATSQ/{sample}_R1.fastq.gz", + #rev_reads = "/users/illumina/local/data/run_1/FATSQ/{sample}_R2.fastq.gz" + output: + fwd_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R1.fastq.gz"), + rev_reads = temp("results/01_Trimming/cutadapt/{sample}_cutadapt-removed_R2.fastq.gz") + log: + "results/10_Reports/tools-log/cutadapt/{sample}.log" + shell: + "cutadapt " # Cutadapt, finds and removes unwanted sequence from your HT-seq reads + "--cores {resources.cpus} " # -j: Number of CPU cores to use. 
Use 0 to auto-detect (default: 1) + "--cut {params.cut} " # -u: Remove 'n' first bases (5') from forward R1 (hard-clipping, default: 0) + "-U {params.cut} " # -U: Remove 'n' first bases (5') from reverse R2 (hard-clipping, default: 0) + "--trim-n " # --trim-n: Trim N's on ends (3') of reads + "--minimum-length {params.length} " # -m: Discard reads shorter than length + "--adapter {params.truseq} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.truseq} " # -A: 3' adapter to be removed from second read in a pair + "--adapter {params.nextera} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.nextera} " # -A: 3' adapter to be removed from second read in a pair + "--adapter {params.small} " # -a: Sequence of an adapter ligated to the 3' end of the first read + "-A {params.small} " # -A: 3' adapter to be removed from second read in a pair + "--output {output.fwd_reads} " # -o: Write trimmed reads to FILE + "--paired-output {output.rev_reads} " # -p: Write second read in a pair to FILE + "{input.fwd_reads} " # Input forward reads R1.fastq + "{input.rev_reads} " # Input reverse reads R2.fastq + "&> {log} " # Log redirection + +############################################################################### +############################### QUALITY CONTROL ############################### +############################################################################### + +############################################################################### +rule multiqc_reports_aggregation: + # Aim: aggregates bioinformatics analyses results into a single report + # Use: multiqc [OPTIONS] --output [MULTIQC/] [FASTQC/] [MULTIQC/] + priority: 999 # Explicit high priority + message: + """ + ~ MultiQC ∞ Aggregat HTML Qualities Reports ~ + """ + conda: + MULTIQC + params: + #config = MQC_CONFIG, + #tag = TAG + input: + fastqc = expand("results/00_Quality_Control/fastqc/{fastq}/", + fastq = FASTQ), + fastq_screen = 
expand("results/00_Quality_Control/fastq-screen/{fastq}/", + fastq = FASTQ) + output: + multiqc = directory("results/00_Quality_Control/multiqc/") + log: + "results/10_Reports/tools-log/multiqc.log" + shell: + "multiqc " # Multiqc, searches in given directories for analysis & compiles a HTML report + "--quiet " # -q: Only show log warning + "--no-ansi " # Disable coloured log + #"--config {params.config} " # Specific config file to load + #"--tag {params.tag} " # Use only modules which tagged with this keyword + #"--pdf " # Creates PDF report with 'simple' template (require xelatex) + "--export " # Export plots as static images in addition to the report + "--outdir {output.multiqc} " # -o: Create report in the specified output directory + "{input.fastqc} " # Input FastQC files + "{input.fastq_screen} " # Input Fastq-Screen + "&> {log}" # Log redirection + +############################################################################### +rule fastqscreen_contamination_checking: + # Aim: screen if the composition of the library matches with what you expect + # Use fastq_screen [OPTIONS] --outdir [DIR/] [FASTQ.GZ] + message: + """ + ~ Fasts-Screen ∞ Screen Contamination ~ + Fastq: __________ {wildcards.fastq} + """ + conda: + FASTQ_SCREEN + resources: + cpus = CPUS + params: + config = FQC_CONFIG, + subset = SUBSET + input: + fastq = "resources/reads/{fastq}.fastq.gz" + output: + fastq_screen = directory("results/00_Quality_Control/fastq-screen/{fastq}/") + log: + "results/10_Reports/tools-log/fastq-screen/{fastq}.log" + shell: + "fastq_screen " # FastqScreen, what did you expect ? 
+ "-q " # --quiet: Only show log warning + "--threads {resources.cpus} " # --threads: Specifies across how many aligner will be allowed to run + "--aligner 'bwa' " # -a: choose aligner 'bowtie', 'bowtie2', 'bwa' + "--conf {params.config} " # path to configuration file + "--subset {params.subset} " # Don't use the whole sequence file, but create a subset of specified size + "--outdir {output.fastq_screen} " # Output directory + "{input.fastq} " # Input file.fastq + "&> {log}" # Log redirection + +############################################################################### +rule fastqc_quality_control: + # Aim: reads sequence files and produces a quality control report + # Use: fastqc [OPTIONS] --output [DIR/] [FASTQ.GZ] + message: + """ + ~ FastQC ∞ Quality Control ~ + Fastq: __________ {wildcards.fastq} + """ + conda: + FASTQC + resources: + cpus = CPUS + input: + fastq = "resources/reads/{fastq}.fastq.gz" + output: + fastqc = directory("results/00_Quality_Control/fastqc/{fastq}/") + log: + "results/10_Reports/tools-log/fastqc/{fastq}.log" + shell: + "mkdir -p {output.fastqc} " # (*) this directory must exist as the program will not create it + "2> /dev/null && " # in silence and then... 
+ "fastqc " # FastQC, a high throughput sequence QC analysis tool + "--quiet " # -q: Supress all progress messages on stdout and only report errors + "--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously + "--outdir {output.fastqc} " # -o: Create all output files in the specified output directory (*) + "{input.fastq} " # Input file.fastq + "&> {log}" # Log redirection + +############################################################################### diff --git a/workflow/environments/bamclipper_v.1.0.0.yaml b/workflow/envs/bamclipper_v.1.0.0.yaml similarity index 100% rename from workflow/environments/bamclipper_v.1.0.0.yaml rename to workflow/envs/bamclipper_v.1.0.0.yaml diff --git a/workflow/environments/bcftools_v.1.20.yaml b/workflow/envs/bcftools_v.1.20.yaml similarity index 100% rename from workflow/environments/bcftools_v.1.20.yaml rename to workflow/envs/bcftools_v.1.20.yaml diff --git a/workflow/environments/bedtools_v.2.31.1.yaml b/workflow/envs/bedtools_v.2.31.1.yaml similarity index 100% rename from workflow/environments/bedtools_v.2.31.1.yaml rename to workflow/envs/bedtools_v.2.31.1.yaml diff --git a/workflow/environments/bowtie2_v.2.5.4.yaml b/workflow/envs/bowtie2_v.2.5.4.yaml similarity index 100% rename from workflow/environments/bowtie2_v.2.5.4.yaml rename to workflow/envs/bowtie2_v.2.5.4.yaml diff --git a/workflow/environments/bwa_v.0.7.18.yaml b/workflow/envs/bwa_v.0.7.18.yaml similarity index 100% rename from workflow/environments/bwa_v.0.7.18.yaml rename to workflow/envs/bwa_v.0.7.18.yaml diff --git a/workflow/environments/cutadapt_v.4.9.yaml b/workflow/envs/cutadapt_v.4.9.yaml similarity index 100% rename from workflow/environments/cutadapt_v.4.9.yaml rename to workflow/envs/cutadapt_v.4.9.yaml diff --git a/workflow/environments/fastq-screen_v.0.15.3.yaml b/workflow/envs/fastq-screen_v.0.15.3.yaml similarity index 100% rename from workflow/environments/fastq-screen_v.0.15.3.yaml rename to 
workflow/envs/fastq-screen_v.0.15.3.yaml diff --git a/workflow/environments/fastqc_v.0.12.1.yaml b/workflow/envs/fastqc_v.0.12.1.yaml similarity index 100% rename from workflow/environments/fastqc_v.0.12.1.yaml rename to workflow/envs/fastqc_v.0.12.1.yaml diff --git a/workflow/environments/gawk_v.5.3.0.yaml b/workflow/envs/gawk_v.5.3.0.yaml similarity index 100% rename from workflow/environments/gawk_v.5.3.0.yaml rename to workflow/envs/gawk_v.5.3.0.yaml diff --git a/workflow/environments/ivar_v.1.4.3.yaml b/workflow/envs/ivar_v.1.4.3.yaml similarity index 100% rename from workflow/environments/ivar_v.1.4.3.yaml rename to workflow/envs/ivar_v.1.4.3.yaml diff --git a/workflow/environments/minimap2_v.2.28.yaml b/workflow/envs/minimap2_v.2.28.yaml similarity index 100% rename from workflow/environments/minimap2_v.2.28.yaml rename to workflow/envs/minimap2_v.2.28.yaml diff --git a/workflow/environments/multiqc_v.1.23.yaml b/workflow/envs/multiqc_v.1.23.yaml similarity index 100% rename from workflow/environments/multiqc_v.1.23.yaml rename to workflow/envs/multiqc_v.1.23.yaml diff --git a/workflow/environments/nextclade_v.3.9.1.yaml b/workflow/envs/nextclade_v.3.9.1.yaml similarity index 100% rename from workflow/environments/nextclade_v.3.9.1.yaml rename to workflow/envs/nextclade_v.3.9.1.yaml diff --git a/workflow/environments/pangolin_v.4.3.yaml b/workflow/envs/pangolin_v.4.3.yaml similarity index 100% rename from workflow/environments/pangolin_v.4.3.yaml rename to workflow/envs/pangolin_v.4.3.yaml diff --git a/workflow/environments/samtools_v.1.20.yaml b/workflow/envs/samtools_v.1.20.yaml similarity index 100% rename from workflow/environments/samtools_v.1.20.yaml rename to workflow/envs/samtools_v.1.20.yaml diff --git a/workflow/environments/sickle-trim_v.1.33.yaml b/workflow/envs/sickle-trim_v.1.33.yaml similarity index 100% rename from workflow/environments/sickle-trim_v.1.33.yaml rename to workflow/envs/sickle-trim_v.1.33.yaml diff --git 
a/workflow/environments/tsv2vcf_v.1.1.yaml b/workflow/envs/tsv2vcf_v.1.1.yaml similarity index 100% rename from workflow/environments/tsv2vcf_v.1.1.yaml rename to workflow/envs/tsv2vcf_v.1.1.yaml diff --git a/workflow/environments/workflow-base_v.2024.11.yaml b/workflow/envs/workflow-base_v.2024.11.yaml similarity index 100% rename from workflow/environments/workflow-base_v.2024.11.yaml rename to workflow/envs/workflow-base_v.2024.11.yaml diff --git a/workflow/snakefiles/gevarli.smk b/workflow/rules/gevarli.smk similarity index 90% rename from workflow/snakefiles/gevarli.smk rename to workflow/rules/gevarli.smk index 501a4228538694728a8ece1447882eb370dfd15a..ba9ad8c82356e3c043054d03171487c97a8a588e 100755 --- a/workflow/snakefiles/gevarli.smk +++ b/workflow/rules/gevarli.smk @@ -203,6 +203,13 @@ FQC_CONFIG = config["fastq_screen"]["config"] # Fastq-Screen --conf #MQC_CONF = config["multiqc"]["config"] # MultiQC --conf #TAG = config["multiqc"]["tag"] # MultiQC --tag +KMER_SIZE = config["minimap2"]["algorithm"]["k-mer_size"] # MM2 k-mer size +MINIMIZER_SIZE = config["minimap2"]["algorithm"]["minimizer_size"] # MM2 minimizer window size +SPLIT_SIZE = config["minimap2"]["algorithm"]["split_size"] # MM2 split index +#HOMOPOLYMER = config["minimap2"]["algorithm"]["homopolymer"] # MM2 if PacBio +BWA_ALGO = config["bwa"]["algorithm"] # BWA indexing algorithm +BT2_ALGO = config["bowtie2"]["algorithm"] # BT2 indexing algorithm + REFERENCE = config["consensus"]["reference"] # Genome reference sequence, in fasta format REF_PATH = config["consensus"]["path"] # Path to genomes references MIN_COV = config["consensus"]["min_cov"] # Minimum coverage, mask lower regions with 'N' @@ -1046,10 +1053,10 @@ rule minimap2_mapping: resources: cpus = CPUS params: - mm2_path = MM2_PATH, preset = MM2_PRESET #length = LENGTH input: + mm2_indexes = "resources/indexes/minimap2/{reference}.mmi", fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", rev_reads = 
"results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" output: @@ -1057,21 +1064,22 @@ rule minimap2_mapping: log: "results/10_Reports/tools-log/minimap2/{sample}_{reference}.log" shell: - "minimap2 " # Minimap2, a versatile sequence alignment program - "-x {params.preset} " # -x: presets (always applied before other options) - "-t {resources.cpus} " # -t: Number of threads (default: 3) - "-a " # -a: output in the SAM format (PAF by default) - #"-F {params.length} " # -F: max fragment length, effective with -x sr mode (default: 800) - "{params.mm2_path}{wildcards.reference}.mmi " # Reference index filename prefix.mmi (-k, -w, -I and -H can't be changed during mapping) + "minimap2 " # Minimap2, a versatile sequence alignment program + "-x {params.preset} " # -x: presets (always applied before other options) + "-t {resources.cpus} " # -t: Number of threads (default: 3) + "-a " # -a: output in the SAM format (PAF by default) + #"-F {params.length} " # -F: max fragment length, effective with -x sr mode (default: 800) + "{input.mm2_indexes} " # Reference index filename prefix.mmi + # (-k, -w, -I and -H can't be changed during mapping) #"resources/genomes/{wildcards.reference}.fasta " # Reference genome fasta format (for custom -kwIH) - #"-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] - #"-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] - #"-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] - #"{params.homopolymer} " # -H: use homopolymer-compressed k-mer (preferrable for PacBio) - "{input.fwd_reads} " # Forward input reads - "{input.rev_reads} " # Reverse input reads - "1> {output.mapped} " # SAM output - "2> {log}" # Log redirection + #"-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] + #"-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] + #"-I {params.split_size} " # -I: split index for 
every {NUM} input bases (default: "8G") [INT] + #"{params.homopolymer} " # -H: use homopolymer-compressed k-mer (preferrable for PacBio) + "{input.fwd_reads} " # Forward input reads + "{input.rev_reads} " # Reverse input reads + "1> {output.mapped} " # SAM output + "2> {log}" # Log redirection ############################################################################### rule bwa_mapping: @@ -1088,9 +1096,8 @@ rule bwa_mapping: BWA resources: cpus = CPUS - params: - bwa_path = BWA_PATH input: + bwa_indexes = "resources/indexes/bwa/{reference}", fwd_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" output: @@ -1098,14 +1105,14 @@ rule bwa_mapping: log: "results/10_Reports/tools-log/bwa/{sample}_{reference}.log" shell: - "bwa mem " # BWA-MEM algorithm, performs local alignment - "-t {resources.cpus} " # -t: Number of threads (default: 12) - "-v 1 " # -v: Verbosity level: 1=error, 2=warning, 3=message, 4+=debugging - "{params.bwa_path}{wildcards.reference} " # Reference index filename prefix - "{input.fwd_reads} " # Forward input reads - "{input.rev_reads} " # Reverse input reads - "1> {output.mapped} " # SAM output - "2> {log}" # Log redirection + "bwa mem " # BWA-MEM algorithm, performs local alignment + "-t {resources.cpus} " # -t: Number of threads (default: 12) + "-v 1 " # -v: Verbosity level: 1=error, 2=warning, 3=message, 4+=debugging + "{input.bwa_indexes} " # Reference index filename prefix + "{input.fwd_reads} " # Forward input reads + "{input.rev_reads} " # Reverse input reads + "1> {output.mapped} " # SAM output + "2> {log}" # Log redirection ############################################################################### rule bowtie2_mapping: @@ -1123,9 +1130,9 @@ rule bowtie2_mapping: resources: cpus = CPUS params: - bt2_path = BT2_PATH, sensitivity = BT2_SENSITIVITY input: + bt2_indexes = "resources/indexes/bowtie2/{reference}", fwd_reads = 
"results/01_Trimming/sickle/{sample}_sickle-trimmed_R1.fastq.gz", rev_reads = "results/01_Trimming/sickle/{sample}_sickle-trimmed_R2.fastq.gz" output: @@ -1136,14 +1143,118 @@ rule bowtie2_mapping: "bowtie2 " # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences "--threads {resources.cpus} " # -p: Number of alignment threads to launch (default: 1) "--reorder " # Keep the original read order (if multi-processor option -p is used) - "-x {params.bt2_path}{wildcards.reference} " # -x: Reference index filename prefix (minus trailing .X.bt2) [Bowtie-1 indexes are not compatible] - "{params.sensitivity} " # Preset (default: "--sensitive", same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15]) + "-x {input.indexes} " # -x: Reference index filename prefix (minus trailing .X.bt2) + "{params.sensitivity} " # Preset (default: "--sensitive") + # sensitive: same as [-D 15 -R 2 -N 0 -L 22 -i S,1,1.15] "-q " # -q: Query input files are FASTQ .fq/.fastq (default) "-1 {input.fwd_reads} " # Forward input reads "-2 {input.rev_reads} " # Reverse input reads "1> {output.mapped} " # -S: File for SAM output (default: stdout) "2> {log}" # Log redirection +############################################################################### +################################## INDEXING ################################### +############################################################################### + +############################################################################### +rule minimap2_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: minimap2 [OPTIONS] -d [INDEX.mmi] <query.fasta> + message: + """ + ~ Minimap2 ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + MINIMAP2 + params: + kmer_size = KMER_SIZE, + minimizer_size = MINIMIZER_SIZE, + split_size = SPLIT_SIZE + #homopolymer = HOMOPOLYMER + input: + fasta = "resources/genomes/{reference}.fasta" + output: + mm2_indexes = 
multiext("resources/indexes/minimap2/{reference}", + ".mmi") + log: + "results/10_Reports/tools-log/minimap2-indexes/{reference}.log" + shell: + "minimap2 " # Minimap2, index sequences + "-k {params.kmer_size} " # -k: k-mer size (default: "21", no larger than "28") [INT] + "-w {params.minimizer_size} " # -w: minimizer window size (default: "11") [INT] + "-I {params.split_size} " # -I: split index for every {NUM} input bases (default: "8G") [INT] + #"{params.homopolymer} " # use homopolymer-compressed k-mer (preferrable for PacBio) + "-d {output.indexes} " # -d: dump index to FILE [] + "{input.fasta} " # Reference sequences in the FASTA format + "&> {log}" # Log redirection + +############################################################################### +rule bwa_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: bwa index -a [ALGO] -p [PREFIX] <genome.fasta> + message: + """ + ~ BWA-SW ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + BWA + params: + algorithm = BWA_ALGO + input: + fasta = "resources/genomes/{reference}.fasta" + output: + prefix = temp("resources/indexes/bwa/{reference}"), + indexes = multiext("resources/indexes/bwa/{ref_seq}", + ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "results/10_Reports/tools-log/bwa-indexes/{reference}.log" + shell: + "bwa index " # BWA-SW algorithm, index sequences + "{params.algorithm} " # -a: Algorithm for constructing BWT index (default: auto) + "-p {output.prefix} " # -p: Prefix of the output database + "{input.fasta} " # Reference sequences in the FASTA format + "&> {log} " # Log redirection + "&& touch {output.prefix}" # Touch done + +############################################################################### +rule bowtie2_genome_indexing: + # Aim: index sequences in the FASTA format + # Use: bowtie2-build [OPTIONS] <reference_in> <bt2_index_base> + message: + """ + ~ Bowtie2-build ∞ Index Genome ~ + Reference: _______ {wildcards.reference} + """ + conda: + 
BOWTIE2 + resources: + cpus = CPUS + params: + algorithm = BT2_ALGO + input: + fasta = "resources/genomes/{reference}.fasta" + output: + prefix = temp("resources/indexes/bowtie2/{reference}"), + indexes = multiext("resources/indexes/bowtie2/{reference}", + ".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", + ".rev.1.bt2", ".rev.2.bt2") + log: + "results/10_Reports/tools-log/bowtie2-indexes/{reference}.log" + shell: + "bowtie2-build " # Bowtie2-build, index sequences + "--quiet " # -q: quiet + "--threads {resources.cpus} " # Number of threads + "{params.algorithm} " # Force (or no by default) generated index to be 'large', + # even if ref has fewer than 4 billion nucleotides + "-f " # Reference files are FASTA (default) + "{input.fasta} " # Reference sequences files (comma-separated list) in the FASTA format + "{output.prefix} " # Write bt2 data to files with this dir/basename + "&> {log} " # Log redirection + "&& touch {output.prefix}" # Touch done + + ############################################################################### ################################## TRIMMING ################################### ############################################################################### diff --git a/workflow/snakefiles/indexing_genomes.smk b/workflow/rules/indexing_genomes.smk similarity index 100% rename from workflow/snakefiles/indexing_genomes.smk rename to workflow/rules/indexing_genomes.smk diff --git a/workflow/snakefiles/quality_control.smk b/workflow/rules/quality_control.smk similarity index 100% rename from workflow/snakefiles/quality_control.smk rename to workflow/rules/quality_control.smk