Newer
Older
Nicolas FERNANDEZ NUÑEZ
committed
###############################################################################
# Name: Reads Quality Control Pipeline
# Author: Nicolas Fernandez
# Affiliation: IRD_U233_TransVIHMI
# Aim: Reads Quality Control
# Date: 2021.04.30
# Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores
Nicolas FERNANDEZ NUÑEZ
committed
# Latest modification: 2021.10.07
Nicolas FERNANDEZ NUÑEZ
committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Todo: ND
###############################################################################
# CONFIGURATION #
configfile: "config/config.yaml"
# FUNCTIONS #
# WILDCARDS #
SAMPLE, = glob_wildcards("results/reads/raw/{sample}_R1.fastq.gz")
QCDIR = config["datasets"]["quality-directory"]
# ENVIRONMENT #
CUTADAPT = config["conda"]["cutadapt"] # Cutadapt
SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim
FASTQJOIN = config["conda"]["fastq-join"] # Fastq-join
FASTQC = config["conda"]["fastqc"] # FastQC
FASTQSCREEN = config["conda"]["fastq-screen"] # FastQ Screen
MULTIQC = config["conda"]["multiqc"] # MultiQC
# RESOURCES #
CPUS = config["resources"]["cpus"] # Resources thread
MEM_GB = config["resources"]["mem_gb"] # Resources mem in Gb
# PARAMETERS #
LENGTHC = config["cutadapt"]["length"] # Cutadapt --minimum-length
TRUSEQ = config["cutadapt"]["kits"]["truseq"] # Cutadapt --adapter Illumina TruSeq
NEXTERA = config["cutadapt"]["kits"]["nextera"] # Cutadapt --adapter Illumina Nextera
SMALL = config["cutadapt"]["kits"]["small"] # Cutadapt --adapter Illumina Small
COMMAND = config["sickle-trim"]["command"] # Sickle-trim command
ENCODING = config["sickle-trim"]["encoding"] # Sickle-trim --qual-type
QUALITY = config["sickle-trim"]["quality"] # Sickle-trim --qual-threshold
LENGTHS = config["sickle-trim"]["length"] # Sickle-trim --length-treshold
PERCENT = config["fastq-join"]["percent"] # Fastq-join -p (percent maximum difference)
OVERLAP = config ["fastq-join"]["overlap"] # Fastq-join -m (minimum overlap)
CONFIG = config["fastq-screen"]["config"] # Fastq-screen --conf
ALIGNER = config["fastq-screen"]["aligner"] # Fastq-screen --aligner
SUBSET = config["fastq-screen"]["subset"] # Fastq-screen --subset
###############################################################################
rule all:
input:
joined = expand("results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
sample = SAMPLE),
multiqc = expand("results/quality/{qcdir}/multiqc/",
qcdir = QCDIR)
###############################################################################
rule multiqc:
# Aim: aggregates bioinformatics analyses results into a single report
# Use: multiqc [OPTIONS] [--output dir] <analysis directory>
message:
"MultiQC aggregate quality results from {wildcards.qcdir} reads"
conda:
MULTIQC
log:
"results/reports/multiqc/{qcdir}.log"
input:
fastqc = "results/quality/{qcdir}/fastqc/",
fastqscreen = "results/quality/{qcdir}/fastqscreen/"
output:
multiqc = directory("results/quality/{qcdir}/multiqc/")
log:
"results/reports/multiqc/{qcdir}.log"
shell:
"multiqc " # Multiqc, searches in given directories for analysis & compiles a HTML report
"--quiet " # -q: Only show log warning
"--outdir {output.multiqc} " # -o: Create report in the specified output directory
"{input.fastqc} " # Input FastQC files
"{input.fastqscreen} " # Input Fastq-Screen files
#"--no-ansi " # Disable coloured log
"&> {log}" # Add redirection for log
###############################################################################
rule fastqscreen:
# Aim: screen if the composition of the library matches with what you expect
# Use fastq_screen [OPTIONS] <fastq.1>... <fastq.n>
message:
"Fastq-Screen on samples {wildcards.qcdir} reads"
conda:
FASTQSCREEN
resources:
cpus = CPUS
params:
config = CONFIG,
aligner = ALIGNER,
subset = SUBSET
input:
fastq = "results/reads/{qcdir}/"
output:
fastqscreen = directory("results/quality/{qcdir}/fastqscreen/")
log:
"results/reports/fastq-screen/{qcdir}.log"
shell:
"fastq_screen " # FastqScreen, what did you expect ?
Nicolas FERNANDEZ NUÑEZ
committed
"-q " # --quiet: Only show log warning
"--threads {resources.cpus} " # --threads: Specifies across how many threads bowtie will be allowed to run
Nicolas FERNANDEZ NUÑEZ
committed
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"--conf {params.config} " # path to configuration file
"--aligner {params.aligner} " # -a: choose aligner 'bowtie', 'bowtie2', 'bwa'
"--subset {params.subset} " # Don't use the whole sequence file, but create a subset of specified size
"--outdir {output.fastqscreen} " # Output directory
"{input.fastq}/*.fastq.gz " # Input file.fastq
"&> {log}" # Add redirection for log
###############################################################################
rule fastqc:
# Aim: reads sequence files and produces a quality control report
# Use: fastqc [OPTIONS] [--output dir] <fastq.1> ... <fastq.n>
message:
"FastQC on samples {wildcards.qcdir} reads"
conda:
FASTQC
resources:
cpus = CPUS
input:
fastq = "results/reads/{qcdir}/"
output:
fastqc = directory("results/quality/{qcdir}/fastqc/")
log:
"results/reports/fastqc/{qcdir}.log"
shell:
"fastqc " # FastQC, a high throughput sequence QC analysis tool
"--quiet " # -q: Supress all progress messages on stdout and only report errors
"--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously
"--outdir {output.fastqc} " # -o: Create all output files in the specified output directory
"{input.fastq}/*.fastq.gz " # Input file.fastq
"&> {log}" # Add redirection for log
###############################################################################
rule fastqjoin:
# Aim: joins two paired-end reads on the overlapping ends
# Use: fastq-join [OPTIONS] <read1.fastq> <read2.fastq> [mate.fastq] -o <read.fastq> -o <read.fastq> -o <read.fastq>
priority: 1
message:
"Fastq-join assemble R1 and R2 from {wildcards.sample} reads"
conda:
FASTQJOIN
params:
percent = PERCENT,
overlap = OVERLAP
input:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz"
output:
forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
Nicolas FERNANDEZ NUÑEZ
committed
joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
report = "results/reports/fastq-join/{sample}_stich_length_report.txt"
Nicolas FERNANDEZ NUÑEZ
committed
log:
"results/reports/fastq-join/{sample}.log"
shell:
"fastq-join " # Fastq-Join, joins two paired-end reads on the overlapping ends
"{input.forward} " # Input forward
"{input.reverse} " # Input reverse
"-p {params.percent} " # Percent maximum difference (default: 8)
"-m {params.overlap} " # Minimum overlap (default: 6)
"-o {output.forward} " # for unique forward files
"-o {output.reverse} " # for unique reverse files
"-o {output.joined} " # for join files
Nicolas FERNANDEZ NUÑEZ
committed
"-r {output.report} " # Verbose stitch length report
"&> {log}" # Add redirection for log
Nicolas FERNANDEZ NUÑEZ
committed
###############################################################################
rule sickletrim:
# Aim: windowed adaptive trimming tool for FASTQ files using quality
# Use: sickle <command> [OPTIONS]
message:
"Sickle-trim remove low quality sequences from {wildcards.sample} reads"
conda:
SICKLETRIM
params:
command = COMMAND,
encoding = ENCODING,
quality = QUALITY,
length = LENGTHS
input:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
output:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz",
single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz"
log:
"results/reports/sickle-trim/{sample}.log"
shell:
Nicolas FERNANDEZ NUÑEZ
committed
"sickle " # Sickle, a windowed adaptive trimming tool for FASTQ files using quality
"{params.command} " # Paired-end or single-end sequence trimming
"-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8
"-q {params.quality} " # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20)
"-l {params.length} " # --length-threshold: Threshold to keep a read based on length after trimming (default: 20)
"-f {input.forward} " # --pe-file1: Input paired-end forward fastq file
"-r {input.reverse} " # --pe-file2: Input paired-end reverse fastq file
"-g " # --gzip-output: Output gzipped files
"-o {output.forward} " # --output-pe1: Output trimmed forward fastq file
"-p {output.reverse} " # --output-pe2: Output trimmed reverse fastq file (must use -s option)
"-s {output.single} " # --output-single: Output trimmed singles fastq file
"&> {log}" # Add redirection for log
Nicolas FERNANDEZ NUÑEZ
committed
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
###############################################################################
rule cutadapt:
# Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads
# Use: cutadapt -a ADAPTER [OPTIONS] [-o output.forward] [-p output.reverse] <input.forward> <input.reverse>
# Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed
message:
"Cutadapt remove adapters from {wildcards.sample} reads"
conda:
CUTADAPT
resources:
cpus = CPUS
params:
length = LENGTHC,
truseq = TRUSEQ,
nextera = NEXTERA,
small = SMALL
input:
forward = "results/reads/raw/{sample}_R1.fastq.gz",
reverse = "results/reads/raw/{sample}_R2.fastq.gz"
output:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
log:
"results/reports/cutadapt/{sample}.log"
shell:
"cutadapt " # Cutadapt, finds and removes unwanted sequence from your HT-seq reads
"--cores {resources.cpus} " # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1)
"--trim-n " # --trim-n: Trim N's on ends of reads
"--minimum-length {params.length} " # -m: Discard reads shorter than length
"--adapter {params.truseq} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.truseq} " # -A: 3' adapter to be removed from second read in a pair
"--adapter {params.nextera} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.nextera} " # -A: 3' adapter to be removed from second read in a pair
"--adapter {params.small} " # -a: Sequence of an adapter ligated to the 3' end of the first read
"-A {params.small} " # -A: 3' adapter to be removed from second read in a pair
"--output {output.forward} " # -o: Write trimmed reads to FILE
"--paired-output {output.reverse} " # -p: Write second read in a pair to FILE
"{input.forward} " # Input forward reads R1.fastq
"{input.reverse} " # Input reverse reads R2.fastq
"&> {log} " # Add redirection for log
###############################################################################