diff --git a/.gitignore b/.gitignore
index 58b00f83e0f13d912b59939be24f9429b3ccace4..a654588b85a3b593cd8af7b729281dfef65af1c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@
 *.txt
 /results*
 /resources/databases/bowtie2
-/resources/databases/bwa
\ No newline at end of file
+/resources/databases/bwa
+/resources/databases/bwa_large_genomes.tar.gz
\ No newline at end of file
diff --git a/RQC.sh b/RQC.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5105ef0e900bd700da374911128ec8e30f55c1ac
--- /dev/null
+++ b/RQC.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+
+###### About ######
+echo ""
+echo "##### ABOUT #####"
+echo "-----------------"
+echo "Name: Reads Quality Control pipeline"
+echo "Author: Nicolas Fernandez"
+echo "Affiliation: IRD_U233_TransVIHMI"
+echo "Aim: Bash script for RQC pipeline"
+echo "Date: 2021.04.30"
+echo "Run: snakemake --snakefile rqc.smk --cores --use-conda"
+echo "Latest modification: 2022.01.25"
+echo "Todo: done"
+echo "________________________________________________________________________"
+
+###### Hardware check ######
+echo ""
+echo "##### HARDWARE #####"
+echo "--------------------"
+echo ""
+
+physicalcpu=$(sysctl -n hw.physicalcpu)    # Get physical CPU count (sysctl is macOS-specific; on Linux use nproc)
+echo "Physical CPU: ${physicalcpu}"        # Print physical CPU count
+
+logicalcpu=$(sysctl -n hw.logicalcpu)      # Get logical CPU count
+echo "Logical CPU: ${logicalcpu}"          # Print logical CPU count
+
+hwmemsize=$(sysctl -n hw.memsize)          # Get memory size in bytes
+ramsize=$((hwmemsize / 1024**3))           # 1024**3 bytes = 1 GB
+echo "System Memory: ${ramsize} GB"        # Print RAM size
+
+echo "________________________________________________________________________"
+
+##### Working directory #####
+echo ""
+echo "##### WORKING DIRECTORY #####"
+echo "-----------------------------"
+
+workdir=${0%/*}    # Directory containing this script (run it via a path, e.g. ./RQC.sh)
+echo "Working directory: ${workdir}/"
+
+fastq=$(ls -l ${workdir}/resources/reads/*.fastq.gz | wc -l)
+echo "Fastq files: ${fastq}"
+
+SECONDS=0
+timestampstart=$(date +'%Y-%m-%d %H:%M')
+echo "Start time: ${timestampstart}"
+
+echo "________________________________________________________________________"
+
+###### Get threads number ######
+#echo ""
+#echo "##### GET THREADS NUMBER #####"
+#echo "------------------------------"
+#
+#threads=$logicalcpu
+#read -p "Please enter threads number (default, all: ${logicalcpu}): " input_threads
+#if [[ $input_threads -gt 0 ]] && [[ $input_threads -le $logicalcpu ]] 2> /dev/null
+#then
+#    threads=$input_threads
+#fi
+#echo "Pipeline running with ${threads} threads"
+#
+#echo "________________________________________________________________________"
+#
+###### Get project name ######
+#echo ""
+#echo "##### GET PROJECT NAME #####"
+#echo "----------------------------"
+#
+#project="project"
+#read -p "Please enter a project name: " input_project
+#if [[ $input_project != $project ]] 2> /dev/null
+#then
+#    project=$input_project
+#fi
+#echo "Project name: ${project}"
+#
+#echo "________________________________________________________________________"
+
+###### Rename samples ######
+# de/comment the first line of each block to keep or remove the barcode-ID in sample names
+echo ""
+echo "##### RENAME FASTQ FILES #####"
+echo "------------------------------"
+
+# With the perl rename command (macOS, or perl-rename on Linux)
+#rename --verbose 's/_S\d+_/_/' ${workdir}/resources/reads/*.fastq.gz 2> /dev/null                # Remove barcode-ID like {_S001_}
+rename --verbose 's/_L\d+_/_/' ${workdir}/resources/reads/*.fastq.gz 2> /dev/null                 # Remove lane-ID like {_L001_}
+rename --verbose 's/_001.fastq.gz/.fastq.gz/' ${workdir}/resources/reads/*.fastq.gz 2> /dev/null  # Remove end-name ID like {_001}.fastq.gz
+
+# With the rename command from the util-linux package (literal substitution only, no regex support)
+#rename --verbose _S1_ _ ${workdir}/resources/reads/*.fastq.gz 2> /dev/null                   # Remove a literal barcode-ID like {_S1_}
+rename --verbose _L001_ _ ${workdir}/resources/reads/*.fastq.gz 2> /dev/null                  # Remove a literal lane-ID like {_L001_}
+rename --verbose _001.fastq.gz .fastq.gz ${workdir}/resources/reads/*.fastq.gz 2> /dev/null   # Remove end-name ID like {_001}.fastq.gz
+
+echo "________________________________________________________________________"
+
+###### Extract bwa indexes for small and large genomes ######
+echo ""
+echo "##### INDEXES EXTRACTION #####"
+echo "------------------------------"
+
+#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/
+#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/
+tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_small_genomes.tar.gz -C ${workdir}/resources/databases/
+tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_large_genomes.tar.gz -C ${workdir}/resources/databases/
+
+echo "________________________________________________________________________"
+
+###### Call snakemake pipeline ######
+echo ""
+echo "##### SNAKEMAKE PIPELINE #####"
+echo "------------------------------"
+
+echo "Conda environments list:"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# List all conda environments and their location on disk.
+snakemake \
+    --directory ${workdir}/ \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --list-conda-envs
+
+echo ""
+echo "Conda environments update:"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# Use at most N CPU cores/jobs in parallel. If N is omitted or 'all', the limit is set to the number of available CPU cores.
+# Cleanup unused conda environments.
+snakemake \
+    --directory ${workdir}/ \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --cores \
+    --conda-cleanup-envs
+
+echo ""
+echo "Conda environments setup:"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# Use at most N CPU cores/jobs in parallel. If N is omitted or 'all', the limit is set to the number of available CPU cores.
+# If defined in the rule, run job in a conda environment.
+# If specified, only create the job-specific conda environments, then exit; the --use-conda flag must also be set.
+# Choose the conda frontend used to install environments (default: 'mamba').
+snakemake \
+    --directory ${workdir}/ \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --cores \
+    --use-conda \
+    --conda-create-envs-only \
+    --conda-frontend mamba
+
+echo ""
+echo "Unlocking working directory:"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# Remove a lock on the working directory.
+snakemake \
+    --directory ${workdir} \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --unlock
+
+echo ""
+echo "Dry run:"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# Use at most N CPU cores/jobs in parallel. If N is omitted or 'all', the limit is set to the number of available CPU cores.
+# If defined in the rule, run job in a conda environment.
+# Do not execute anything, and display what would be done. For a very large workflow, --dry-run --quiet just prints a summary of the DAG of jobs.
+snakemake \
+    --directory ${workdir} \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --cores \
+    --use-conda \
+    --dryrun \
+    --quiet
+
+echo ""
+echo "Let's go!"
+# Specify working directory (relative paths in the snakefile will use this as their origin).
+# The workflow definition in form of a snakefile.
+# Use at most N CPU cores/jobs in parallel. If N is omitted or 'all', the limit is set to the number of available CPU cores.
+# If defined in the rule, run job in a conda environment.
+# Print out the shell commands that will be executed.
+# Go on with independent jobs if a job fails.
+# Re-run all jobs the output of which is recognized as incomplete.
+snakemake \
+    --directory ${workdir} \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --cores \
+    --use-conda \
+    --printshellcmds \
+    --keep-going \
+    --rerun-incomplete
+
+echo "________________________________________________________________________"
+
+###### Create useful graphs and summary ######
+echo ""
+echo "##### SNAKEMAKE PIPELINE GRAPHS #####"
+echo "-------------------------------------"
+
+mkdir -p ${workdir}/results/10_Graphs/ ${workdir}/results/11_Reports/ 2> /dev/null
+
+graphList="dag rulegraph filegraph"
+extensionList="pdf png"
+
+for graph in ${graphList} ; do
+    for extension in ${extensionList} ; do
+        snakemake \
+            --directory ${workdir}/ \
+            --snakefile ${workdir}/workflow/rules/rqc.smk \
+            --${graph} | \
+            dot -T${extension} > \
+            ${workdir}/results/10_Graphs/${graph}.${extension} ;
+    done ;
+done
+
+snakemake \
+    --directory ${workdir} \
+    --snakefile ${workdir}/workflow/rules/rqc.smk \
+    --summary > ${workdir}/results/11_Reports/files_summary.txt
+
+echo "________________________________________________________________________"
+
+###### Clean End ######
+echo ""
+echo "##### SCRIPT END #####"
+echo "----------------------"
+
+find ${workdir}/results/ -type f -empty -delete    # Remove empty files (e.g. empty logs)
+find ${workdir}/results/ -type d -empty -delete    # Remove empty directories
+
+echo "________________________________________________________________________"
+
+###### Report time ######
+echo ""
+echo "##### TIMER #####"
+echo "-----------------"
+
+timestampend=$(date +'%Y-%m-%d %H:%M')
+echo "End time: ${timestampend}"
+
+elapsedtime=${SECONDS}
+echo "Processing time: $((elapsedtime/60)) minutes and $((elapsedtime%60)) seconds elapsed"
+
+echo "________________________________________________________________________"
diff --git a/RQCP.sh b/RQCP.sh
deleted file mode 100755
index 8171ee6ba077670ef0ae5e2955314718de06f06d..0000000000000000000000000000000000000000
--- a/RQCP.sh
+++ /dev/null
@@ -1,208 +0,0 @@
-#!/bin/bash
-
-##### About #####
-echo ""
-echo "##### ABOUT #####"
-echo "-----------------"
-echo "Name: Reads Quality Control Pipeline"
-echo "Author: Nicolas Fernandez"
-echo "Affiliation: IRD_U233_TransVIHMI"
-echo "Aim: Reads Quality Control and Cleaning"
-echo "Date: 2021.04.30" -echo "Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores" -echo "Latest modification: 2021.10.14" -echo "Todo: na" -echo "________________________________________________________________________" - -##### Hardware check ##### -echo "" -echo "##### HARDWARE #####" -echo "--------------------" - -physicalcpu=$(sysctl -n hw.physicalcpu) # Get physical cpu -echo "Physical CPU: ${physicalcpu}" # Print physical cpu - -logicalcpu=$(sysctl -n hw.logicalcpu) # Get logical cpu -echo "Logical CPU: ${logicalcpu}" # Print logical cpu - -hwmemsize=$(sysctl -n hw.memsize) # Get memory size -ramsize=$(expr $hwmemsize / $((1024**3))) # 1024**3 = GB -echo "System Memory: ${ramsize} GB" # Print RAM size - -timestamp=$(date +'%Y-%m-%d_%H-%M') -echo "Date: ${timestamp}" - -echo "________________________________________________________________________" - -##### Working directory ##### -echo "" -echo "##### WORKING DIRECTORY #####" -echo "-----------------------------" - -workdir=${0%/*} # for MacOS users, running script with double-click on "RQCP.sh" -#workdir="." # for Linux users (todo), running script with terminal command "bash ./RQCP.sh" -echo "CWD: ${workdir}" - -echo "________________________________________________________________________" - -###### Get threads number ##### -#echo "" -#echo "##### GET THREADS NUMBER ####" -#echo "--------------------------" -# -#threads=$logicalcpu -#read -p "Please enter threads number (default, all: ${logicalcpu}): " input_threads -#if [[ $input_threads -gt 0 ]] && [[ $input_threads -le $logicalcpu ]] 2> /dev/null -#then -# threads=$input_threads -#fi -#echo "Pipeline running with ${threads} threads" -# -#echo "________________________________________________________________________" -# -###### Get project name ##### -#echo "" -#echo "##### GET PROJECT NAME #####" -#echo "----------------------------" -# -#project="project" -#read -p "Please enter a project name: " input_project -#if [[ $input_project != $project ]] 2> /dev/null -#then -# project=$input_project -#fi -#echo "Project name: ${project}" -# -#echo "________________________________________________________________________" - -###### Create links for raw samples ##### -echo "" -echo "##### CREATE RAW READ LINKS #####" -echo "---------------------------------" - -mkdir ${workdir}/results/ 2> /dev/null -mkdir ${workdir}/results/reads/ 2> /dev/null -mkdir ${workdir}/results/reads/raw_links/ 2> /dev/null - -# Create Symbolic links for raw reads - -for fastq in ${workdir}/resources/reads/*.fastq.gz ; do - ln -sFv $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) ; -done - -echo "________________________________________________________________________" - -###### Rename links ##### -# comment first line if you want do keep barcode-ID in name -echo "" -echo "##### RENAME FASTQ FILES #####" -echo "------------------------------" - -rename --force --verbose 's/_S\d+_L00\d_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove barcode-ID and line-ID like {_S10_L001} -rename --force --verbose 's/_001.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove end-name ID like {_001}.fastq.gz - -# Create dir for QC-tools and Conf-backup ##### -mkdir ${workdir}/results/reads/cutadapt_removed/ 2> /dev/null -mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/ 2> /dev/null -mkdir ${workdir}/results/reads/fastq-join_single-end/ 2> /dev/null -mkdir ${workdir}/results/reads/all_merged_single-end/ 2> /dev/null -cp -R ${workdir}/config/ 
${workdir}/results/config/ 2> /dev/null - -echo "________________________________________________________________________" - - -###### Extract bwa indexes for small genomes ##### -echo "" -echo "##### INDEXES EXTRACTION #####" -echo "---------------------------------" - -#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/ -#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/ -tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_small_genomes.tar.gz -C ${workdir}/resources/databases/ -#tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_large_genomes.tar.gz -C ${workdir}/resources/databases/ - -echo "________________________________________________________________________" - -###### Call snakemake pipeline ##### -echo "" -echo "##### SNAKEMAKE PIPELIE #####" -echo "-----------------------------" - -echo "" -echo "Unlocking working directory:" -snakemake \ - --directory ${workdir} \ - --use-conda \ - --printshellcmds \ - --keep-going \ - --rerun-incomplete \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --cores \ - --unlock # unlock first, if previous error - -echo "" -echo "Dry run:" -snakemake \ - --directory ${workdir} \ - --use-conda \ - --printshellcmds \ - --keep-going \ - --rerun-incomplete \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --cores \ - --dryrun # then dry run, check error like missing file - -echo "" -echo "Let's go!" -snakemake \ - --directory ${workdir} \ - --use-conda \ - --printshellcmds \ - --keep-going \ - --rerun-incomplete \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --cores 1 # at last, real run - -echo "________________________________________________________________________" - -###### Create usefull graphs and summary ###### -echo "" -echo "##### SNAKEMAKE PIPELINE GRAPHS ######" -echo "--------------------------------------" - -mkdir ${workdir}/results/graphs/ 2> /dev/null - -snakemake \ - --directory ${workdir} \ - --use-conda \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --dag | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf - -snakemake \ - --directory ${workdir} \ - --use-conda \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf - -snakemake \ - --directory ${workdir} \ - --use-conda \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf - -snakemake \ - --directory ${workdir} \ - --use-conda \ - --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ - --summary > ${workdir}/results/graphs/Summary.txt - -echo "________________________________________________________________________" - -###### Rename results directory with timestamp and project name ###### -echo "" -echo "##### SCRIPT END ######" -echo "-----------------------" - -#mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/ 2> /dev/null - -echo "________________________________________________________________________" diff --git a/config/config.yaml b/config/config.yaml index 3439dc259110e4fe30f99ce659c3d80adc9ebb17..deaea535bb8f6e3cbe903c1a6e646f95430c69be 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,70 +1,25 @@ +--- 
############################################################################### # Author: Nicolas Fernandez # Affiliation: IRD_TransVIHMI -# Aim: Config file for the Reads Quality Control Pipeline (RQCP) +# Aim: Config file for RQC pipeline # Date: 2021.04.30 -# Use: Edit or (de)comment values you want modify -# Latest modification: 2021.10.14 -# Todo: ... +# Use: Edit or (de)comment settings +# Latest modification: 2022.01.25 +# Todo: done ############################################################################### -## RESOURCES ------------------------------------------------------------------------------------------- +## RESOURCES ----------------------------------------------------------------------------------------- resources: - cpus: 12 # cpus (overwrited by RQCP.sh snakemake call) - mem_mb: 64000 # mem in Mb - mem_gb: 64 # mem in Gb - time: 1140 # time in Min + cpus: 12 # cpus + mem_gb: 16 # mem in Gb ## ENVIRONNEMENTS ------------------------------------------------------------------------------------ conda: - cutadapt: '../envs/cutadapt-3.4.yaml' - sickle-trim: '../envs/sickle-trim-1.33.yaml' - fastq-join: '../envs/fastq-join-1.3.1.yaml' fastqc: '../envs/fastqc-0.11.9.yaml' fastq-screen: '../envs/fastq-screen-0.14.0.yaml' - multiqc: '../envs/multiqc-1.10.1.yaml' - -## DATASETS ------------------------------------------------------------------------------------------ -datasets: - quality-directory: - - 'raw_links' - #- 'cutadapt_removed' - - 'sickle-trim_paired-end_filtered' - #- 'sickle-trim_single-end' - #- 'fastq-join_single-end' - #- 'fastq-join_paired-end' - #- 'all_merged_single-end' - -## CUTADAPT ------------------------------------------------------------------------------------------ -cutadapt: - length: '35' # Discard reads shorter than length, after trim - kits: # Sequence of an adapter ligated to the 3' end of the first read - truseq: 'AGATCGGAAGAGC' # Illumina TruSeq / ScriptSeq based kits libraries - nextera: 'CTGTCTCTTATACACATC' # Illumina Nextera / TruSight based kits libraries - small: 'TGGAATTCTCGGGTGCCAAGG' # Illumina Small based kits libraries - #TruSeqR1: 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCA' # TruSeq-LT and TruSeq-HT based kits R1 - #TruSeqR2: 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT' # TruSeq-LT and TruSeq-HT based kits R2 - #ScriptSeqR1: 'AGATCGGAAGAGCACACGTCTGAAC' # ScriptSeq and TruSeq DNA Methylation based kits R1 - #ScriptSeqR2: 'AGATCGGAAGAGCGTCGTGTAGGGA' # ScriptSeq and TruSeq DNA Methylation based kits R2 - #TruSeqRibo: 'AGATCGGAAGAGCACACGTCT' # TruSeq Ribo Profile based kits - -## SICKLE-TRIM --------------------------------------------------------------------------------------- -sickle-trim: - command: # choose one - #- 'se' # if single-end sequence - - 'pe' # if paired-end sequence - encoding: # choose one - - 'sanger' # if sanger (CASAVA >= 1.8) - #- 'illumina' # if illumina (CASAVA 1.3 to 1.7) - #- 'solexa' # if solexa (CASAVA < 1.3) - quality: '30' # phred score limit - length: '35' # read length limit, after trim - -## FASTQ-JOIN ---------------------------------------------------------------------------------------- -fastq-join: - percent: '8' # N-percent maximum difference (default: 8) - overlap: '6' # N-minimum overlap (default: 6) + multiqc: '../envs/multiqc-1.11.yaml' ## FASTQSCREEN --------------------------------------------------------------------------------------- fastq-screen: diff --git a/config/fastq-screen.conf b/config/fastq-screen.conf index 2a6f366e17f9269f7fb3122ed18b7ef4a0266c50..872861655b9ed8b38b7bba4f5d4d4a4dae37e44b 100644 
--- a/config/fastq-screen.conf +++ b/config/fastq-screen.conf @@ -48,7 +48,7 @@ THREADS 1 ## ##---------- ## Human h38 -#DATABASE Human resources/databases/bwa/Human/Homo_sapiens_h38 +DATABASE Human resources/databases/bwa/Human/Homo_sapiens_h38 #DATABASE Human resources/databases/bowtie2/Human/Homo_sapiens_h38 ## ## Mouse m39 @@ -56,7 +56,7 @@ THREADS 1 #DATABASE Mouse resources/databases/bowtie2/Mouse/Mus_musculus_m39 ## ## Arabidopsis thaliana - sequence from NCBI: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.3_TAIR10/GCF_000001735.3_TAIR10_genomic.fna.gz -#DATABASE Arabidopsis resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10 +DATABASE Arabidopsis resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10 #DATABASE Arabidopsis resources/databases/bowtie2/Arabidopsis/Arabidopsis_thaliana_t10 ## ## Ecoli - sequence available from EMBL accession U00096.2 @@ -86,7 +86,7 @@ DATABASE Vectors resources/databases/bwa/Vectors/UniVec_wo_phi-X174 #DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3 ## ## Bat 10 -#DATABASE Bat resources/databases/bwa/Bat/Pteropus_vampyrus_v1 +DATABASE Bat resources/databases/bwa/Bat/Pteropus_vampyrus_v1 #DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1 ## ## HIV - HXB2 diff --git a/workflow/envs/cutadapt-3.4.yaml b/workflow/envs/cutadapt-3.4.yaml deleted file mode 100644 index 62220f7106acf66ed889ba93ea6d947a210ef1c8..0000000000000000000000000000000000000000 --- a/workflow/envs/cutadapt-3.4.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: cutadapt-3.4 -channels: - - bioconda - - conda-forge - - r - - anaconda - - defaults -dependencies: - - ca-certificates=2021.4.13=hecd8cb5_1 - - certifi=2020.12.5=py39h6e9494a_1 - - cutadapt=3.4=py39ha492530_1 - - dnaio=0.5.1=py39ha492530_0 - - isa-l=2.30.0=h0d85af4_4 - - libcxx=11.1.0=habf9029_0 - - libffi=3.3=h046ec9c_2 - - ncurses=6.2=h2e338ed_4 - - openssl=1.1.1k=h0d85af4_0 - - pigz=2.6=h5dbffcc_0 - - pip=21.1=pyhd8ed1ab_0 - - python=3.9.4=h88f2d9e_0 - - python-isal=0.10.0=py39h89e85a6_0 - - python_abi=3.9=1_cp39 - - readline=8.1=h05e3726_0 - - setuptools=52.0.0=py39hecd8cb5_0 - - sqlite=3.35.5=h44b9ce1_0 - - tk=8.6.10=h0419947_1 - - tzdata=2021a=he74cb21_0 - - wheel=0.36.2=pyhd3deb0d_0 - - xopen=1.1.0=py39h6e9494a_2 - - xz=5.2.5=haf1e3a3_1 - - zlib=1.2.11=h7795811_1010 -prefix: /Users/2021nf001/miniconda3/envs/cutadapt-3.4 diff --git a/workflow/envs/cutadapt-3.5.yaml b/workflow/envs/cutadapt-3.5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9340f4577dfd3b174ba09e490e3a9fbd2129dec7 --- /dev/null +++ b/workflow/envs/cutadapt-3.5.yaml @@ -0,0 +1,40 @@ +name: cutadapt-3.5 +channels: + - bioconda + - conda-forge + - r + - anaconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2021.10.26=h06a4308_2 + - cutadapt=3.5=py39h38f01e4_0 + - dnaio=0.6.0=py39h38f01e4_0 + - isa-l=2.30.0=ha770c72_4 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1d223b6_11 + - libgomp=11.2.0=h1d223b6_11 + - libnsl=2.0.0=h7f98852_0 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.11=h36c2ea0_1013 + - ncurses=6.2=h58526e2_4 + - openssl=3.0.0=h7f98852_2 + - pbzip2=1.1.13=0 + - pigz=2.6=h27826a3_0 + - pip=21.3.1=pyhd8ed1ab_0 + - python=3.9.9=h543edf9_0_cpython + - python-isal=0.11.1=py39h3811e60_1 + - python_abi=3.9=2_cp39 + - readline=8.1=h46c0cb4_0 + - setuptools=59.6.0=py39hf3d152e_0 + - sqlite=3.37.0=h9cd32fc_0 + - 
tk=8.6.11=h27826a3_1 + - tzdata=2021e=he74cb21_0 + - wheel=0.37.0=pyhd8ed1ab_1 + - xopen=1.2.1=py39hf3d152e_1 + - xz=5.2.5=h516909a_1 + - zlib=1.2.11=h36c2ea0_1013 + diff --git a/workflow/envs/multiqc-1.10.1.yaml b/workflow/envs/multiqc-1.10.1.yaml deleted file mode 100644 index 8c04a8b0ec170a1dd55fad091aa6f9716ce6657b..0000000000000000000000000000000000000000 --- a/workflow/envs/multiqc-1.10.1.yaml +++ /dev/null @@ -1,84 +0,0 @@ -name: multiqc-1.10.1 -channels: - - bioconda - - conda-forge - - r - - anaconda - - defaults -dependencies: - - brotlipy=0.7.0=py39hcbf5805_1001 - - ca-certificates=2021.4.13=hecd8cb5_1 - - certifi=2020.12.5=py39h6e9494a_1 - - cffi=1.14.5=py39h319c39b_0 - - chardet=4.0.0=py39h6e9494a_1 - - click=7.1.2=pyh9f0ad1d_0 - - colorama=0.4.4=pyh9f0ad1d_0 - - coloredlogs=15.0=py39h6e9494a_0 - - colormath=3.0.0=py_2 - - commonmark=0.9.1=py_0 - - cryptography=3.4.7=py39ha2c9959_0 - - cycler=0.10.0=py_2 - - decorator=5.0.7=pyhd8ed1ab_0 - - freetype=2.10.4=h4cff582_1 - - future=0.18.2=py39h6e9494a_3 - - humanfriendly=9.1=py39h6e9494a_0 - - idna=2.10=pyh9f0ad1d_0 - - importlib-metadata=4.0.1=py39h6e9494a_0 - - jinja2=2.11.3=pyh44b312d_0 - - jpeg=9d=hbcb3906_0 - - kiwisolver=1.3.1=py39hedf5dff_1 - - lcms2=2.12=h577c468_0 - - libblas=3.9.0=8_openblas - - libcblas=3.9.0=8_openblas - - libcxx=11.1.0=habf9029_0 - - libffi=3.3=h046ec9c_2 - - libgfortran=5.0.0=9_3_0_h6c81a4c_22 - - libgfortran5=9.3.0=h6c81a4c_22 - - liblapack=3.9.0=8_openblas - - libopenblas=0.3.12=openmp_h54245bb_1 - - libpng=1.6.37=h7cec526_2 - - libtiff=4.2.0=h7c11950_1 - - libwebp-base=1.2.0=h0d85af4_2 - - llvm-openmp=11.1.0=hda6cdc1_1 - - lz4-c=1.9.3=h046ec9c_0 - - lzstring=1.0.4=py_1001 - - markdown=3.3.4=pyhd8ed1ab_0 - - markupsafe=1.1.1=py39hcbf5805_3 - - matplotlib-base=3.4.1=py39hb07454d_0 - - multiqc=1.10.1=py_0 - - ncurses=6.2=h2e338ed_4 - - networkx=2.5=py_0 - - numpy=1.20.2=py39h7eed0ac_0 - - olefile=0.46=pyh9f0ad1d_1 - - openssl=1.1.1k=h0d85af4_0 - - pillow=8.2.0=py39h5270095_0 - - pip=21.1=pyhd8ed1ab_0 - - pycparser=2.20=pyh9f0ad1d_2 - - pygments=2.8.1=pyhd8ed1ab_0 - - pyopenssl=20.0.1=pyhd8ed1ab_0 - - pyparsing=2.4.7=pyh9f0ad1d_0 - - pysocks=1.7.1=py39h6e9494a_3 - - python=3.9.4=h88f2d9e_0 - - python-dateutil=2.8.1=py_0 - - python_abi=3.9=1_cp39 - - pyyaml=5.4.1=py39hcbf5805_0 - - readline=8.1=h05e3726_0 - - requests=2.25.1=pyhd3deb0d_0 - - rich=10.1.0=py39h6e9494a_0 - - setuptools=52.0.0=py39hecd8cb5_0 - - simplejson=3.17.2=py39hcbf5805_2 - - six=1.15.0=pyh9f0ad1d_0 - - spectra=0.0.11=py_1 - - sqlite=3.35.5=h44b9ce1_0 - - tk=8.6.10=h0419947_1 - - tornado=6.1=py39hcbf5805_1 - - typing_extensions=3.7.4.3=py_0 - - tzdata=2021a=he74cb21_0 - - urllib3=1.26.4=pyhd8ed1ab_0 - - wheel=0.36.2=pyhd3deb0d_0 - - xz=5.2.5=haf1e3a3_1 - - yaml=0.2.5=haf1e3a3_0 - - zipp=3.4.1=pyhd8ed1ab_0 - - zlib=1.2.11=h7795811_1010 - - zstd=1.4.9=h582d3a0_0 -prefix: /Users/2021nf001/miniconda3/envs/multiqc diff --git a/workflow/envs/multiqc-1.11.yaml b/workflow/envs/multiqc-1.11.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab5764cdcce21a14e0fd1be7e67461866ca6ce5c --- /dev/null +++ b/workflow/envs/multiqc-1.11.yaml @@ -0,0 +1,108 @@ +name: multiqc-1.11 +channels: + - bioconda + - conda-forge + - r + - anaconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - brotli=1.0.9=h7f98852_6 + - brotli-bin=1.0.9=h7f98852_6 + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2021.10.26=h06a4308_2 + - charset-normalizer=2.0.9=pyhd8ed1ab_0 + - colorama=0.4.4=pyh9f0ad1d_0 + - 
colormath=3.0.0=py_2 + - commonmark=0.9.1=py_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - dataclasses=0.8=pyhc8e2a94_3 + - freetype=2.11.0=h70c0345_0 + - idna=3.3=pyhd3eb1b0_0 + - jbig=2.1=h7f98852_2003 + - jinja2=3.0.3=pyhd8ed1ab_0 + - jpeg=9d=h36c2ea0_0 + - lcms2=2.12=hddcbb42_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - lerc=3.0=h9c3ff4c_0 + - libblas=3.9.0=12_linux64_openblas + - libbrotlicommon=1.0.9=h7f98852_6 + - libbrotlidec=1.0.9=h7f98852_6 + - libbrotlienc=1.0.9=h7f98852_6 + - libcblas=3.9.0=12_linux64_openblas + - libdeflate=1.8=h7f98852_0 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1d223b6_11 + - libgfortran-ng=11.2.0=h69a702a_11 + - libgfortran5=11.2.0=h5c6108e_11 + - libgomp=11.2.0=h1d223b6_11 + - liblapack=3.9.0=12_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.18=pthreads_h8fe5266_0 + - libpng=1.6.37=h21135ba_2 + - libstdcxx-ng=11.2.0=he4da1e4_11 + - libtiff=4.3.0=h6f004c6_2 + - libuuid=2.32.1=h7f98852_1000 + - libwebp-base=1.2.1=h7f98852_0 + - libzlib=1.2.11=h36c2ea0_1013 + - lz4-c=1.9.3=h9c3ff4c_1 + - lzstring=1.0.4=py_1001 + - markdown=3.3.6=pyhd8ed1ab_0 + - matplotlib-base=3.5.1=py310h23f4a51_0 + - multiqc=1.11=pyhdfd78af_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - ncurses=6.2=h58526e2_4 + - networkx=2.6.3=pyhd8ed1ab_1 + - olefile=0.46=pyh9f0ad1d_1 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1l=h7f98852_0 + - packaging=21.3=pyhd8ed1ab_0 + - pip=21.3.1=pyhd8ed1ab_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.10.0=pyhd8ed1ab_0 + - pyopenssl=21.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.6=pyhd8ed1ab_0 + - python=3.10.1=h62f1059_1_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2021.3=pyhd8ed1ab_0 + - readline=8.1=h46c0cb4_0 + - requests=2.26.0=pyhd8ed1ab_1 + - rich=10.16.1=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - spectra=0.0.11=py_1 + - sqlite=3.37.0=h9cd32fc_0 + - tk=8.6.11=h27826a3_1 + - typing_extensions=4.0.1=pyha770c72_0 + - tzdata=2021e=he74cb21_0 + - urllib3=1.26.7=pyhd8ed1ab_0 + - wheel=0.37.0=pyhd8ed1ab_1 + - xz=5.2.5=h516909a_1 + - yaml=0.2.5=h516909a_0 + - zipp=3.6.0=pyhd8ed1ab_0 + - zlib=1.2.11=h36c2ea0_1013 + - zstd=1.5.0=ha95c52a_0 + - pip: + - brotlipy==0.7.0 + - certifi==2021.10.8 + - cffi==1.15.0 + - click==8.0.3 + - coloredlogs==15.0.1 + - cryptography==36.0.1 + - fonttools==4.28.5 + - future==0.18.2 + - humanfriendly==10.0 + - importlib-metadata==4.10.0 + - kiwisolver==1.3.2 + - markupsafe==2.0.1 + - matplotlib==3.5.1 + - numpy==1.21.4 + - pandas==1.3.5 + - pillow==8.4.0 + - pysocks==1.7.1 + - pyyaml==6.0 + - scipy==1.7.3 + - setuptools==59.6.0 + - simplejson==3.17.6 + diff --git a/workflow/rules/reads_quality_control_pipeline.smk b/workflow/rules/reads_quality_control_pipeline.smk deleted file mode 100644 index bb1044ec232fc2b6947b683c9506bb3baee399f1..0000000000000000000000000000000000000000 --- a/workflow/rules/reads_quality_control_pipeline.smk +++ /dev/null @@ -1,273 +0,0 @@ -############################################################################### -# Name: Reads Quality Control Pipeline -# Author: Nicolas Fernandez -# Affiliation: IRD_U233_TransVIHMI -# Aim: Reads Quality Control -# Date: 2021.04.30 -# Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores -# Latest modification: 2021.10.19 -# Todo: ND - -############################################################################### -# CONFIGURATION # -configfile: "config/config.yaml" - -# FUNCTIONS # - -# WILDCARDS # -SAMPLE, = glob_wildcards("results/reads/raw_links/{sample}_R1.fastq.gz") -QCDIR = 
config["datasets"]["quality-directory"] - -# ENVIRONMENT # -CUTADAPT = config["conda"]["cutadapt"] # Cutadapt -SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim -FASTQJOIN = config["conda"]["fastq-join"] # Fastq-join - -FASTQC = config["conda"]["fastqc"] # FastQC -FASTQSCREEN = config["conda"]["fastq-screen"] # FastQ Screen -MULTIQC = config["conda"]["multiqc"] # MultiQC - -# RESOURCES # -CPUS = config["resources"]["cpus"] # Resources thread -MEM_GB = config["resources"]["mem_gb"] # Resources mem in Gb - -# PARAMETERS # -LENGTHC = config["cutadapt"]["length"] # Cutadapt --minimum-length -TRUSEQ = config["cutadapt"]["kits"]["truseq"] # Cutadapt --adapter Illumina TruSeq -NEXTERA = config["cutadapt"]["kits"]["nextera"] # Cutadapt --adapter Illumina Nextera -SMALL = config["cutadapt"]["kits"]["small"] # Cutadapt --adapter Illumina Small - -COMMAND = config["sickle-trim"]["command"] # Sickle-trim command -ENCODING = config["sickle-trim"]["encoding"] # Sickle-trim --qual-type -QUALITY = config["sickle-trim"]["quality"] # Sickle-trim --qual-threshold -LENGTHS = config["sickle-trim"]["length"] # Sickle-trim --length-treshold - -PERCENT = config["fastq-join"]["percent"] # Fastq-join -p (percent maximum difference) -OVERLAP = config ["fastq-join"]["overlap"] # Fastq-join -m (minimum overlap) - -CONFIG = config["fastq-screen"]["config"] # Fastq-screen --conf -ALIGNER = config["fastq-screen"]["aligner"] # Fastq-screen --aligner -SUBSET = config["fastq-screen"]["subset"] # Fastq-screen --subset - -############################################################################### -rule all: - input: - merged = expand("results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz", - sample = SAMPLE), - multiqc = expand("results/quality/{qcdir}/multiqc/", - qcdir = QCDIR) - -############################################################################### -rule multiqc: - # Aim: aggregates bioinformatics analyses results into a single report - # Use: multiqc [OPTIONS] [--output dir] <analysis directory> - message: - "MultiQC aggregate quality results from {wildcards.qcdir} reads" - conda: - MULTIQC - log: - "results/reports/multiqc/{qcdir}.log" - input: - fastqc = "results/quality/{qcdir}/fastqc/", - fastqscreen = "results/quality/{qcdir}/fastq-screen/" - output: - multiqc = directory("results/quality/{qcdir}/multiqc/") - log: - "results/reports/multiqc/{qcdir}.log" - shell: - "multiqc " # Multiqc, searches in given directories for analysis & compiles a HTML report - "--quiet " # -q: Only show log warning - "--outdir {output.multiqc} " # -o: Create report in the specified output directory - "{input.fastqc} " # Input FastQC files - "{input.fastqscreen} " # Input Fastq-Screen - #"--no-ansi " # Disable coloured log - "&> {log}" # Add redirection for log - -############################################################################### -rule fastqscreen: - # Aim: screen if the composition of the library matches with what you expect - # Use fastq_screen [OPTIONS] <fastq.1>... <fastq.n> - message: - "Fastq-Screen on samples {wildcards.qcdir} reads" - conda: - FASTQSCREEN - resources: - cpus = CPUS - params: - config = CONFIG, - aligner = ALIGNER, - subset = SUBSET - input: - fastq = "results/reads/{qcdir}/" - output: - fastqscreen = directory("results/quality/{qcdir}/fastq-screen/") - log: - "results/reports/fastq-screen/{qcdir}.log" - shell: - "fastq_screen " # FastqScreen, what did you expect ? 
- "-q " # --quiet: Only show log warning - "--threads {resources.cpus} " # --threads: Specifies across how many threads bowtie will be allowed to run - "--conf {params.config} " # path to configuration file - "--aligner {params.aligner} " # -a: choose aligner 'bowtie', 'bowtie2', 'bwa' - "--subset {params.subset} " # Don't use the whole sequence file, but create a subset of specified size - "--outdir {output.fastqscreen} " # Output directory - "{input.fastq}/*.fastq.gz " # Input file.fastq - "&> {log}" # Add redirection for log - -############################################################################### -rule fastqc: - # Aim: reads sequence files and produces a quality control report - # Use: fastqc [OPTIONS] [--output dir] <fastq.1> ... <fastq.n> - message: - "FastQC on samples {wildcards.qcdir} reads" - conda: - FASTQC - resources: - cpus = CPUS - input: - fastq = "results/reads/{qcdir}/" - output: - fastqc = directory("results/quality/{qcdir}/fastqc/") - log: - "results/reports/fastqc/{qcdir}.log" - shell: - "fastqc " # FastQC, a high throughput sequence QC analysis tool - "--quiet " # -q: Supress all progress messages on stdout and only report errors - "--threads {resources.cpus} " # -t: Specifies files number which can be processed simultaneously - "--outdir {output.fastqc} " # -o: Create all output files in the specified output directory - "{input.fastq}/*.fastq.gz " # Input file.fastq - "&> {log}" # Add redirection for log - -############################################################################### -rule merge: - # Aim: Merge all passing filters reads - # Use: cat {input} > {output} - priority: 1 - message: - "Take all passing filters {wildcards.sample} reads" - input: - single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz", - forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz", - reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz", - joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz", - output: - merged = "results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz" - shell: - "cat " # Cat, concatenate - "{input.single} " # Input trimmed singles fastq file from Sickle-trim - "{input.forward} " # Input unique forward files from Fastq-join - "{input.reverse} " # Input unique reverse files from Fastq-join - "{input.joined} " # Input join files from Fastq-join - "> {output.merged}" # Output merged - -############################################################################### -rule fastqjoin: - # Aim: joins two paired-end reads on the overlapping ends - # Use: fastq-join [OPTIONS] <read1.fastq> <read2.fastq> [mate.fastq] -o <read.fastq> -o <read.fastq> -o <read.fastq> - message: - "Fastq-join assemble R1 and R2 from {wildcards.sample} reads" - conda: - FASTQJOIN - params: - percent = PERCENT, - overlap = OVERLAP - input: - forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz", - reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz" - output: - forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz", - reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz", - joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz", - report = "results/reports/fastq-join/{sample}_stich_length_report.txt" - log: - 
"results/reports/fastq-join/{sample}.log" - shell: - "fastq-join " # Fastq-Join, joins two paired-end reads on the overlapping ends - "{input.forward} " # Input forward - "{input.reverse} " # Input reverse - "-p {params.percent} " # Percent maximum difference (default: 8) - "-m {params.overlap} " # Minimum overlap (default: 6) - "-o {output.forward} " # for unique forward files - "-o {output.reverse} " # for unique reverse files - "-o {output.joined} " # for join files - "-r {output.report} " # Verbose stitch length report - "&> {log}" # Add redirection for log - -############################################################################### -rule sickletrim: - # Aim: windowed adaptive trimming tool for FASTQ files using quality - # Use: sickle <command> [OPTIONS] - message: - "Sickle-trim remove low quality sequences from {wildcards.sample} reads" - conda: - SICKLETRIM - params: - command = COMMAND, - encoding = ENCODING, - quality = QUALITY, - length = LENGTHS - input: - forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz", - reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz" - output: - forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz", - reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz", - single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz" - log: - "results/reports/sickle-trim/{sample}.log" - shell: - "sickle " # Sickle, a windowed adaptive trimming tool for FASTQ files using quality - "{params.command} " # Paired-end or single-end sequence trimming - "-t {params.encoding} " # --qual-type: Type of quality values, solexa ; illumina ; sanger ; CASAVA, < 1.3 ; 1.3 to 1.7 ; >= 1.8 - "-q {params.quality} " # --qual-threshold: Threshold for trimming based on average quality in a window (default: 20) - "-l {params.length} " # --length-threshold: Threshold to keep a read based on length after trimming (default: 20) - "-f {input.forward} " # --pe-file1: Input paired-end forward fastq file - "-r {input.reverse} " # --pe-file2: Input paired-end reverse fastq file - "-g " # --gzip-output: Output gzipped files - "-o {output.forward} " # --output-pe1: Output trimmed forward fastq file - "-p {output.reverse} " # --output-pe2: Output trimmed reverse fastq file (must use -s option) - "-s {output.single} " # --output-single: Output trimmed singles fastq file - "&> {log}" # Add redirection for log - -############################################################################### -rule cutadapt: - # Aim: finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads - # Use: cutadapt -a ADAPTER [OPTIONS] [-o output.forward] [-p output.reverse] <input.forward> <input.reverse> - # Rmq: multiple adapter sequences can be given using further -a options, but only the best-matching adapter will be removed - message: - "Cutadapt remove adapters from {wildcards.sample} reads" - conda: - CUTADAPT - resources: - cpus = CPUS - params: - length = LENGTHC, - truseq = TRUSEQ, - nextera = NEXTERA, - small = SMALL - input: - forward = "results/reads/raw_links/{sample}_R1.fastq.gz", - reverse = "results/reads/raw_links/{sample}_R2.fastq.gz" - output: - forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz", - reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz" - log: - "results/reports/cutadapt/{sample}.log" - shell: - "cutadapt " # Cutadapt, 
finds and removes unwanted sequence from your HT-seq reads
-        "--cores {resources.cpus} "          # -j: Number of CPU cores to use. Use 0 to auto-detect (default: 1)
-        "--trim-n "                          # --trim-n: Trim N's on ends of reads
-        "--minimum-length {params.length} "  # -m: Discard reads shorter than length
-        "--adapter {params.truseq} "         # -a: Sequence of an adapter ligated to the 3' end of the first read
-        "-A {params.truseq} "                # -A: 3' adapter to be removed from second read in a pair
-        "--adapter {params.nextera} "        # -a: Sequence of an adapter ligated to the 3' end of the first read
-        "-A {params.nextera} "               # -A: 3' adapter to be removed from second read in a pair
-        "--adapter {params.small} "          # -a: Sequence of an adapter ligated to the 3' end of the first read
-        "-A {params.small} "                 # -A: 3' adapter to be removed from second read in a pair
-        "--output {output.forward} "         # -o: Write trimmed reads to FILE
-        "--paired-output {output.reverse} "  # -p: Write second read in a pair to FILE
-        "{input.forward} "                   # Input forward reads R1.fastq
-        "{input.reverse} "                   # Input reverse reads R2.fastq
-        "&> {log}"                           # Add redirection for log
-
-###############################################################################
diff --git a/workflow/rules/rqc.smk b/workflow/rules/rqc.smk
new file mode 100644
index 0000000000000000000000000000000000000000..a68c4363052d0ae5dbc2bef92f48562f7b9d29a9
--- /dev/null
+++ b/workflow/rules/rqc.smk
@@ -0,0 +1,127 @@
+###############################################################################
+# Name: RQC pipeline
+# Author: Nicolas Fernandez
+# Affiliation: IRD_U233_TransVIHMI
+# Aim: Snakefile for RQC pipeline
+# Date: 2021.04.30
+# Run: snakemake --snakefile rqc.smk --cores --use-conda
+# Latest modification: 2022.01.25
+# Todo: done
+
+###############################################################################
+# PUBLICATIONS #
+
+
+
+###############################################################################
+# CONFIGURATION #
+configfile: "config/config.yaml"
+
+# FUNCTIONS #
+
+# WILDCARDS #
+RUN, = glob_wildcards("resources/reads/{run}/")
+
+# ENVIRONMENTS #
+FASTQC = config["conda"]["fastqc"]             # FastQC
+FASTQSCREEN = config["conda"]["fastq-screen"]  # FastQ Screen
+MULTIQC = config["conda"]["multiqc"]           # MultiQC
+
+# RESOURCES #
+CPUS = config["resources"]["cpus"]             # Resources thread
+MEM_GB = config["resources"]["mem_gb"]         # Resources mem in Gb
+
+# PARAMETERS #
+CONFIG = config["fastq-screen"]["config"]      # Fastq-screen --conf
+ALIGNER = config["fastq-screen"]["aligner"]    # Fastq-screen --aligner
+SUBSET = config["fastq-screen"]["subset"]      # Fastq-screen --subset
+
+###############################################################################
+rule all:
+    input:
+        multiqc = expand("results/{run}/quality/multiqc/",
+                         run = RUN)
+
+###############################################################################
+rule multiqc_reports_aggregation:
+    # Aim: aggregates bioinformatics analyses results into a single report
+    # Use: multiqc [OPTIONS] --outdir [MULTIQC/] [FASTQC/] [FASTQSCREEN/]
+    message:
+        "MultiQC reports aggregating ({wildcards.run})"
+    conda:
+        MULTIQC
+    input:
+        fastqc = "results/{run}/fastqc/",
+        fastqscreen = "results/{run}/fastq-screen/"
+    output:
+        multiqc = directory("results/{run}/quality/multiqc/")
+    log:
+        "results/{run}/reports/multiqc.log"
+    shell:
+        "multiqc "                        # Multiqc, searches in given directories for analysis results & compiles a HTML report
+        "--quiet "                        # -q: Only show log warnings
+        "--outdir {output.multiqc} "      # -o: Create report in the specified output directory
+        "{input.fastqc} "                 # Input FastQC files
+        "{input.fastqscreen} "            # Input Fastq-Screen files
+        "--no-ansi "                      # Disable coloured log
+        "&> {log}"                        # Add redirection for log
+
+###############################################################################
+rule fastqscreen_contamination_checking:
+    # Aim: screen if the composition of the library matches with what you expect
+    # Use: fastq_screen [OPTIONS] --outdir [DIR/] [SAMPLE_1.fastq] ... [SAMPLE_n.fastq]
+    message:
+        "Fastq-Screen reads contamination checking ({wildcards.run})"
+    conda:
+        FASTQSCREEN
+    resources:
+        cpus = CPUS
+    params:
+        config = CONFIG,
+        aligner = ALIGNER,
+        subset = SUBSET
+    input:
+        fastq = "resources/reads/{run}"
+    output:
+        fastqscreen = directory("results/{run}/fastq-screen/")
+    log:
+        "results/{run}/reports/fastq-screen.log"
+    shell:
+        "fastq_screen "                   # FastqScreen, what did you expect ?
+        "-q "                             # --quiet: Only show log warnings
+        "--threads {resources.cpus} "     # --threads: Specifies across how many threads bowtie will be allowed to run
+        "--conf {params.config} "         # Path to configuration file
+        "--aligner {params.aligner} "     # -a: choose aligner 'bowtie', 'bowtie2' or 'bwa'
+        "--subset {params.subset} "       # Don't use the whole sequence file, but create a subset of the specified size
+        "--outdir {output.fastqscreen} "  # Output directory
+        "{input.fastq}/*.fastq.gz "       # Input file.fastq
+        "&> {log}"                        # Add redirection for log
+
+###############################################################################
+rule fastqc_quality_control:
+    # Aim: reads sequence files and produces a quality control report
+    # Use: fastqc [OPTIONS] --outdir [DIR/] [SAMPLE_1.fastq] ... [SAMPLE_n.fastq]
+    message:
+        "FastQC reads quality control ({wildcards.run})"
+    conda:
+        FASTQC
+    resources:
+        cpus = CPUS
+    input:
+        fastq = "resources/reads/{run}"
+    output:
+        fastqc = directory("results/{run}/fastqc/")
+    log:
+        "results/{run}/reports/fastqc.log"
+    shell:
+        "mkdir -p {output.fastqc} "       # (*) this directory must exist, as the program will not create it
+        "2> /dev/null && "                # in silence, and then...
+        "fastqc "                         # FastQC, a high throughput sequence QC analysis tool
+        "--quiet "                        # -q: Suppress all progress messages on stdout and only report errors
+        "--threads {resources.cpus} "     # -t: Specifies the number of files which can be processed simultaneously
+        "--outdir {output.fastqc} "       # -o: Create all output files in the specified output directory
+        "{input.fastq}/*.fastq.gz "       # Input file.fastq
+        "&> {log}"                        # Add redirection for log
+
+###############################################################################
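
Usage sketch. With the layout declared in rqc.smk above, glob_wildcards resolves one {run} value per directory under resources/reads/; a minimal tree and the manual call equivalent to the one RQC.sh wraps look like this (the run name "run01" and sample file names are illustrative assumptions, not part of the diff):

    # Example input layout (hypothetical run directory)
    resources/reads/run01/sampleA_R1.fastq.gz
    resources/reads/run01/sampleA_R2.fastq.gz

    # Run from the repository root; same flags as the "Let's go!" call in RQC.sh
    snakemake \
        --directory . \
        --snakefile workflow/rules/rqc.smk \
        --cores \
        --use-conda

FastQC and Fastq-Screen then write to results/run01/fastqc/ and results/run01/fastq-screen/, and MultiQC aggregates both into results/run01/quality/multiqc/.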