Skip to content
Snippets Groups Projects
reads_quality_control_pipeline.smk 12.3 KiB
Newer Older
###############################################################################
# Name: Reads Quality Control Pipeline
# Author: Nicolas Fernandez
# Affiliation: IRD_U233_TransVIHMI
# Aim: Reads Quality Control
# Date: 2021.04.30
# Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores 
# Todo: ND

###############################################################################
# CONFIGURATION #
configfile: "config/config.yaml"

# FUNCTIONS #

# WILDCARDS #
SAMPLE, = glob_wildcards("results/reads/raw/{sample}_R1.fastq.gz")
QCDIR = config["datasets"]["quality-directory"]

# ENVIRONMENT #
CUTADAPT = config["conda"]["cutadapt"]      # Cutadapt
SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim
FASTQJOIN = config["conda"]["fastq-join"]   # Fastq-join

FASTQC = config["conda"]["fastqc"]            # FastQC
FASTQSCREEN = config["conda"]["fastq-screen"] # FastQ Screen
MULTIQC = config["conda"]["multiqc"]          # MultiQC

# RESOURCES #
CPUS = config["resources"]["cpus"]     # Resources thread
MEM_GB = config["resources"]["mem_gb"] # Resources mem in Gb

# PARAMETERS #
LENGTHC = config["cutadapt"]["length"]          # Cutadapt --minimum-length
TRUSEQ = config["cutadapt"]["kits"]["truseq"]   # Cutadapt --adapter Illumina TruSeq
NEXTERA = config["cutadapt"]["kits"]["nextera"] # Cutadapt --adapter Illumina Nextera
SMALL = config["cutadapt"]["kits"]["small"]     # Cutadapt --adapter Illumina Small

COMMAND = config["sickle-trim"]["command"]   # Sickle-trim command
ENCODING = config["sickle-trim"]["encoding"] # Sickle-trim --qual-type 
QUALITY = config["sickle-trim"]["quality"]   # Sickle-trim --qual-threshold
LENGTHS = config["sickle-trim"]["length"]    # Sickle-trim --length-treshold

PERCENT = config["fastq-join"]["percent"]  # Fastq-join -p (percent maximum difference)
OVERLAP = config ["fastq-join"]["overlap"] # Fastq-join -m (minimum overlap)

CONFIG = config["fastq-screen"]["config"]   # Fastq-screen --conf
ALIGNER = config["fastq-screen"]["aligner"] # Fastq-screen --aligner
SUBSET = config["fastq-screen"]["subset"]   # Fastq-screen --subset

###############################################################################
rule all:
    input:
        joined = expand("results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
                         sample = SAMPLE),
        multiqc = expand("results/quality/{qcdir}/multiqc/",
                         qcdir = QCDIR)

###############################################################################
rule multiqc:
    # Aim: aggregates bioinformatics analyses results into a single report
    # Use: multiqc [OPTIONS] [--output dir] <analysis directory>
    message:
        "MultiQC aggregate quality results from {wildcards.qcdir} reads"
    conda:
        MULTIQC
    log:
       "results/reports/multiqc/{qcdir}.log"
    input:
        fastqc = "results/quality/{qcdir}/fastqc/",
        fastqscreen = "results/quality/{qcdir}/fastqscreen/"
    output:
        multiqc = directory("results/quality/{qcdir}/multiqc/")
    log:
        "results/reports/multiqc/{qcdir}.log"
    shell:
        "multiqc "                   # Multiqc, searches in given directories for analysis & compiles a HTML report
        "--quiet "                   # -q: Only show log warning
        "--outdir {output.multiqc} " # -o: Create report in the specified output directory
        "{input.fastqc} "            # Input FastQC files
        "{input.fastqscreen} "       # Input Fastq-Screen files
        #"--no-ansi "                 # Disable coloured log
        "&> {log}"                   # Add redirection for log

###############################################################################
rule fastqscreen:
    # Aim: screen if the composition of the library matches with  what you expect
    # Use fastq_screen [OPTIONS] <fastq.1>... <fastq.n>
    message:
        "Fastq-Screen on samples {wildcards.qcdir} reads"
    conda:
        FASTQSCREEN
    resources:
        cpus = CPUS
    params:
        config = CONFIG,
        aligner = ALIGNER,
        subset = SUBSET
    input:
        fastq = "results/reads/{qcdir}/"
    output:
        fastqscreen = directory("results/quality/{qcdir}/fastqscreen/")
    log:
        "results/reports/fastq-screen/{qcdir}.log"
    shell:
        "fastq_screen "                  # FastqScreen, what did you expect ?
        "-q "                            # --quiet: Only show log warning
        "--threads {resources.cpus} "    # --threads: Specifies across how many threads bowtie will be allowed to run
        "--conf {params.config} "        # path to configuration file
        "--aligner {params.aligner} "    # -a: choose aligner 'bowtie', 'bowtie2', 'bwa'
        "--subset {params.subset} "      # Don't use the whole sequence file, but create a subset of specified size
        "--outdir {output.fastqscreen} " # Output directory
        "{input.fastq}/*.fastq.gz "      # Input file.fastq
        "&> {log}"                       # Add redirection for log

###############################################################################
rule fastqc:
    # Aim: reads sequence files and produces a quality control report
    # Use: fastqc [OPTIONS] [--output dir] <fastq.1> ... <fastq.n>
    message:
        "FastQC on samples {wildcards.qcdir} reads"
    conda:
        FASTQC
    resources:
        cpus = CPUS
    input:
        fastq = "results/reads/{qcdir}/"
    output:
        fastqc = directory("results/quality/{qcdir}/fastqc/")
    log:
        "results/reports/fastqc/{qcdir}.log"
    shell:
        "fastqc "                     # FastQC, a high throughput sequence QC analysis tool
        "--quiet "                    # -q: Supress all progress messages on stdout and only report errors
        "--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously
        "--outdir {output.fastqc} "   # -o: Create all output files in the specified output directory
        "{input.fastq}/*.fastq.gz "   # Input file.fastq
        "&> {log}"                    # Add redirection for log
        
###############################################################################
rule fastqjoin:
    # Aim: joins two paired-end reads on the overlapping ends
    # Use: fastq-join [OPTIONS] <read1.fastq> <read2.fastq> [mate.fastq] -o <read.fastq> -o <read.fastq> -o <read.fastq>
    priority: 1
    message:
        "Fastq-join assemble R1 and R2 from {wildcards.sample} reads"
    conda:
        FASTQJOIN
    params:
        percent = PERCENT,
        overlap = OVERLAP
    input:
        forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
        reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz"
    output:
        forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
        reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
        joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
        report = "results/reports/fastq-join/{sample}_stich_length_report.txt"
    log:
        "results/reports/fastq-join/{sample}.log"
    shell:
        "fastq-join "          # Fastq-Join, joins two paired-end reads on the overlapping ends
        "{input.forward} "     # Input forward
        "{input.reverse} "     # Input reverse
        "-p {params.percent} " # Percent maximum difference (default: 8)
        "-m {params.overlap} " # Minimum overlap (default: 6)
        "-o {output.forward} " # for unique forward files
        "-o {output.reverse} " # for unique reverse files
        "-o {output.joined} "  # for join files
        "-r {output.report} "  # Verbose stitch length report
        "&> {log}"             # Add redirection for log

###############################################################################
rule sickletrim:
    # Aim: windowed adaptive trimming tool for FASTQ files using quality
    # Use: sickle <command> [OPTIONS]
    message:
        "Sickle-trim remove low quality sequences from  {wildcards.sample} reads"
    conda:
        SICKLETRIM
    params:
        command = COMMAND,
        encoding = ENCODING,
        quality = QUALITY,
        length = LENGTHS
    input:
        forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
        reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
    output:
        forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
        reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz",
        single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz"
    log:
        "results/reports/sickle-trim/{sample}.log"
    shell:
       "sickle "                # Sickle, a windowed adaptive trimming tool for FASTQ files using quality
        "{params.command} "     # Paired-end or single-end sequence trimming
        "-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8
        "-q {params.quality} "  # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20)
        "-l {params.length} "   # --length-threshold: Threshold to keep a read based on length after trimming (default: 20)
        "-f {input.forward} "   # --pe-file1: Input paired-end forward fastq file
        "-r {input.reverse} "   # --pe-file2: Input paired-end reverse fastq file
        "-g "                   # --gzip-output: Output gzipped files
        "-o {output.forward} "  # --output-pe1: Output trimmed forward fastq file
        "-p {output.reverse} "  # --output-pe2: Output trimmed reverse fastq file (must use -s option)
        "-s {output.single} "   # --output-single: Output trimmed singles fastq file
        "&> {log}"              # Add redirection for log

###############################################################################
rule cutadapt:
    # Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads
    # Use: cutadapt -a ADAPTER [OPTIONS] [-o output.forward] [-p output.reverse] <input.forward> <input.reverse>
    # Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed
    message:
        "Cutadapt remove adapters from {wildcards.sample} reads"
    conda:
        CUTADAPT
    resources:
        cpus = CPUS
    params:
        length = LENGTHC,
        truseq = TRUSEQ,
        nextera = NEXTERA,
        small = SMALL
    input:
        forward = "results/reads/raw/{sample}_R1.fastq.gz",
        reverse = "results/reads/raw/{sample}_R2.fastq.gz"
    output:
        forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
        reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
    log:
        "results/reports/cutadapt/{sample}.log"
    shell:
        "cutadapt "                         # Cutadapt, finds and removes unwanted sequence from your HT-seq reads
        "--cores {resources.cpus} "         # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1)
        "--trim-n "                         # --trim-n: Trim N's on ends of reads
        "--minimum-length {params.length} " # -m: Discard reads shorter than length
        "--adapter {params.truseq} "        # -a: Sequence of an adapter ligated to the 3' end of the first read
        "-A {params.truseq} "               # -A: 3' adapter to be removed from second read in a pair
        "--adapter {params.nextera} "       # -a: Sequence of an adapter ligated to the 3' end of the first read
        "-A {params.nextera} "              # -A: 3' adapter to be removed from second read in a pair
        "--adapter {params.small} "         # -a: Sequence of an adapter ligated to the 3' end of the first read
        "-A {params.small} "                # -A: 3' adapter to be removed from second read in a pair
        "--output {output.forward} "        # -o: Write trimmed reads to FILE
        "--paired-output {output.reverse} " # -p: Write second read in a pair to FILE
        "{input.forward} "                  # Input forward reads R1.fastq
        "{input.reverse} "                  # Input reverse reads R2.fastq
        "&> {log} "                         # Add redirection for log

###############################################################################