-
Nicolas FERNANDEZ NUÑEZ authoredNicolas FERNANDEZ NUÑEZ authored
reads_quality_control_pipeline.smk 12.22 KiB
###############################################################################
# Name: Reads Quality Control Pipeline
# Author: Nicolas Fernandez
# Affiliation: IRD_U233_TransVIHMI
# Aim: Reads Quality Control
# Date: 2021.04.30
# Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores
# Latest modification: 2021.10.07
# Todo: ND
###############################################################################
# CONFIGURATION #
configfile: "config/config.yaml"
# FUNCTIONS #
# WILDCARDS #
SAMPLE, = glob_wildcards("results/reads/raw/{sample}_R1.fastq.gz")
QCDIR = config["datasets"]["quality-directory"]
# ENVIRONMENT #
CUTADAPT = config["conda"]["cutadapt"] # Cutadapt
SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim
FASTQJOIN = config["conda"]["fastq-join"] # Fastq-join
FASTQC = config["conda"]["fastqc"] # FastQC
FASTQSCREEN = config["conda"]["fastq-screen"] # FastQ Screen
MULTIQC = config["conda"]["multiqc"] # MultiQC
# RESOURCES #
CPUS = config["resources"]["cpus"] # Resources thread
MEM_GB = config["resources"]["mem_gb"] # Resources mem in Gb
# PARAMETERS #
LENGTHC = config["cutadapt"]["length"] # Cutadapt --minimum-length
TRUSEQ = config["cutadapt"]["kits"]["truseq"] # Cutadapt --adapter Illumina TruSeq
NEXTERA = config["cutadapt"]["kits"]["nextera"] # Cutadapt --adapter Illumina Nextera
SMALL = config["cutadapt"]["kits"]["small"] # Cutadapt --adapter Illumina Small
COMMAND = config["sickle-trim"]["command"] # Sickle-trim command
ENCODING = config["sickle-trim"]["encoding"] # Sickle-trim --qual-type
QUALITY = config["sickle-trim"]["quality"] # Sickle-trim --qual-threshold
LENGTHS = config["sickle-trim"]["length"] # Sickle-trim --length-treshold
PERCENT = config["fastq-join"]["percent"] # Fastq-join -p (percent maximum difference)
OVERLAP = config ["fastq-join"]["overlap"] # Fastq-join -m (minimum overlap)
CONFIG = config["fastq-screen"]["config"] # Fastq-screen --conf
ALIGNER = config["fastq-screen"]["aligner"] # Fastq-screen --aligner
SUBSET = config["fastq-screen"]["subset"] # Fastq-screen --subset
###############################################################################
rule all:
input:
joined = expand("results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
sample = SAMPLE),
multiqc = expand("results/quality/{qcdir}/multiqc/",
qcdir = QCDIR)
###############################################################################
rule multiqc:
# Aim: aggregates bioinformatics analyses results into a single report
# Use: multiqc [OPTIONS] [--output dir] <analysis directory>
message:
"MultiQC aggregate quality results from {wildcards.qcdir} reads"
conda:
MULTIQC
log:
"results/reports/multiqc/{qcdir}.log"
input:
fastqc = "results/quality/{qcdir}/fastqc/",
fastqscreen = "results/quality/{qcdir}/fastqscreen/"
output:
multiqc = directory("results/quality/{qcdir}/multiqc/")
log:
"results/reports/multiqc/{qcdir}.log"
shell:
"multiqc " # Multiqc, searches in given directories for analysis & compiles a HTML report
"--quiet " # -q: Only show log warning
"--outdir {output.multiqc} " # -o: Create report in the specified output directory
"{input.fastqc} " # Input FastQC files
"{input.fastqscreen} " # Input Fastq-Screen files
#"--no-ansi " # Disable coloured log
"&> {log}" # Add redirection for log
###############################################################################
rule fastqscreen:
# Aim: screen if the composition of the library matches with what you expect
# Use fastq_screen [OPTIONS] <fastq.1>... <fastq.n>
message:
"Fastq-Screen on samples {wildcards.qcdir} reads"
conda:
FASTQSCREEN
resources:
cpus = CPUS
params:
config = CONFIG,
aligner = ALIGNER,
subset = SUBSET
input:
fastq = "results/reads/{qcdir}/"
output:
fastqscreen = directory("results/quality/{qcdir}/fastqscreen/")
log:
"results/reports/fastq-screen/{qcdir}.log"
shell:
"fastq_screen " # FastqScreen, what did you expect ?
"--quiet " # -q: Only show log warning
"--threads {resources.cpus} " # -t: Specifies across how many threads bowtie will be allowed to run
"--conf {params.config} " # path to configuration file
"--aligner {params.aligner} " # -a: choose aligner 'bowtie', 'bowtie2', 'bwa'
"--subset {params.subset} " # Don't use the whole sequence file, but create a subset of specified size
"--outdir {output.fastqscreen} " # Output directory
"{input.fastq}/*.fastq.gz " # Input file.fastq
"&> {log}" # Add redirection for log
###############################################################################
rule fastqc:
# Aim: reads sequence files and produces a quality control report
# Use: fastqc [OPTIONS] [--output dir] <fastq.1> ... <fastq.n>
message:
"FastQC on samples {wildcards.qcdir} reads"
conda:
FASTQC
resources:
cpus = CPUS
input:
fastq = "results/reads/{qcdir}/"
output:
fastqc = directory("results/quality/{qcdir}/fastqc/")
log:
"results/reports/fastqc/{qcdir}.log"
shell:
"fastqc " # FastQC, a high throughput sequence QC analysis tool
"--quiet " # -q: Supress all progress messages on stdout and only report errors
"--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously
"--outdir {output.fastqc} " # -o: Create all output files in the specified output directory
"{input.fastq}/*.fastq.gz " # Input file.fastq
"&> {log}" # Add redirection for log
###############################################################################
rule fastqjoin:
# Aim: joins two paired-end reads on the overlapping ends
# Use: fastq-join [OPTIONS] <read1.fastq> <read2.fastq> [mate.fastq] -o <read.fastq> -o <read.fastq> -o <read.fastq>
priority: 1
message:
"Fastq-join assemble R1 and R2 from {wildcards.sample} reads"
conda:
FASTQJOIN
params:
percent = PERCENT,
overlap = OVERLAP
input:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz"
output:
forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz"
log:
"results/reports/fastq-join/{sample}.log"
shell:
"fastq-join " # Fastq-Join, joins two paired-end reads on the overlapping ends
"{input.forward} " # Input forward
"{input.reverse} " # Input reverse
"-p {params.percent} " # Percent maximum difference (default: 8)
"-m {params.overlap} " # Minimum overlap (default: 6)
"-o {output.forward} " # for unique forward files
"-o {output.reverse} " # for unique reverse files
"-o {output.joined} " # for join files
"-r {log} " # Verbose stitch length report
#"&> {log}" # Add redirection for log
###############################################################################
rule sickletrim:
# Aim: windowed adaptive trimming tool for FASTQ files using quality
# Use: sickle <command> [OPTIONS]
message:
"Sickle-trim remove low quality sequences from {wildcards.sample} reads"
conda:
SICKLETRIM
params:
command = COMMAND,
encoding = ENCODING,
quality = QUALITY,
length = LENGTHS
input:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
output:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz",
single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz"
log:
"results/reports/sickle-trim/{sample}.log"
shell:
"sickle " # Sickle, a windowed adaptive trimming tool for FASTQ files using quality
"{params.command} " # Paired-end or single-end sequence trimming
"-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8
"-q {params.quality} " # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20)
"-l {params.length} " # --length-threshold: Threshold to keep a read based on length after trimming (default: 20)
"-f {input.forward} " # --pe-file1: Input paired-end forward fastq file
"-r {input.reverse} " # --pe-file2: Input paired-end reverse fastq file
"-g " # --gzip-output: Output gzipped files
"-o {output.forward} " # --output-pe1: Output trimmed forward fastq file
"-p {output.reverse} " # --output-pe2: Output trimmed reverse fastq file (must use -s option)
"-s {output.single} " # --output-single: Output trimmed singles fastq file
"&> {log}" # Add redirection for log
###############################################################################
rule cutadapt:
# Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads
# Use: cutadapt -a ADAPTER [OPTIONS] [-o output.forward] [-p output.reverse] <input.forward> <input.reverse>
# Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed
message:
"Cutadapt remove adapters from {wildcards.sample} reads"
conda:
CUTADAPT
resources:
cpus = CPUS
params:
length = LENGTHC,
truseq = TRUSEQ,
nextera = NEXTERA,
small = SMALL
input:
forward = "results/reads/raw/{sample}_R1.fastq.gz",
reverse = "results/reads/raw/{sample}_R2.fastq.gz"
output:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
log:
"results/reports/cutadapt/{sample}.log"
shell:
"cutadapt " # Cutadapt, finds and removes unwanted sequence from your HT-seq reads
"--cores {resources.cpus} " # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1)
"--trim-n " # --trim-n: Trim N's on ends of reads
"--minimum-length {params.length} " # -m: Discard reads shorter than length
"--adapter {params.truseq} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.truseq} " # -A: 3' adapter to be removed from second read in a pair
"--adapter {params.nextera} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.nextera} " # -A: 3' adapter to be removed from second read in a pair
"--adapter {params.small} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.small} " # -A: 3' adapter to be removed from second read in a pair
"--output {output.forward} " # -o: Write trimmed reads to FILE
"--paired-output {output.reverse} " # -p: Write second read in a pair to FILE
"{input.forward} " # Input forward reads R1.fastq
"{input.reverse} " # Input reverse reads R2.fastq
"&> {log} " # Add redirection for log
###############################################################################