From f7c572c000050b3ffc050be805c6a55c12b5bee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20FERNANDEZ=20NU=C3=91EZ?= <fernandez.nunez.nicolas@gmail.com> Date: Fri, 22 Oct 2021 16:47:21 +0200 Subject: [PATCH] Use BWA instead of Bowtie2 as the fastq-screen aligner --- RQCP.sh | 149 ++++++++++++++---- config/config.yaml | 10 +- config/fastq-screen.conf | 39 +++-- .../rules/reads_quality_control_pipeline.smk | 12 +- 4 files changed, 155 insertions(+), 55 deletions(-) diff --git a/RQCP.sh b/RQCP.sh index 6ce6092..bd74bcf 100755 --- a/RQCP.sh +++ b/RQCP.sh @@ -18,47 +18,78 @@ echo "________________________________________________________________________" echo "" echo "##### HARDWARE #####" echo "--------------------" + physicalcpu=$(sysctl -n hw.physicalcpu) # Get physical cpu echo "Physical CPU: ${physicalcpu}" # Print physical cpu + logicalcpu=$(sysctl -n hw.logicalcpu) # Get logical cpu echo "Logical CPU: ${logicalcpu}" # Print logical cpu + hwmemsize=$(sysctl -n hw.memsize) # Get memory size ramsize=$(expr $hwmemsize / $((1024**3))) # 1024**3 = GB echo "System Memory: ${ramsize} GB" # Print RAM size + timestamp=$(date +'%Y-%m-%d_%H-%M') echo "Date: ${timestamp}" + echo "________________________________________________________________________" ##### Working directory ##### echo "" echo "##### WORKING DIRECTORY #####" echo "-----------------------------" + workdir=${0%/*} # for MacOS users, running script with double-click on "RQCP.sh" #workdir="." 
# for Linux users (todo), running script with terminal command "bash ./RQCP.sh" echo "CWD: ${workdir}" -echo "________________________________________________________________________" -##### Get project name ##### -echo "" -echo "##### GET PROJECT NAME #####" -echo "----------------------------" -project="project" -read -p "Please enter a project name: " project -echo "Project name: ${project}" echo "________________________________________________________________________" +###### Get threads number ##### +#echo "" +#echo "##### GET THREADS NUMBER ####" +#echo "--------------------------" +# +#threads=$logicalcpu +#read -p "Please enter threads number (default, all: ${logicalcpu}): " input_threads +#if [[ $input_threads -gt 0 ]] && [[ $input_threads -le $logicalcpu ]] 2> /dev/null +#then +# threads=$input_threads +#fi +#echo "Pipeline running with ${threads} threads" +# +#echo "________________________________________________________________________" +# +###### Get project name ##### +#echo "" +#echo "##### GET PROJECT NAME #####" +#echo "----------------------------" +# +#project="project" +#read -p "Please enter a project name: " input_project +#if [[ $input_project != $project ]] 2> /dev/null +#then +# project=$input_project +#fi +#echo "Project name: ${project}" +# +#echo "________________________________________________________________________" + ###### Create links for raw samples ##### echo "" echo "##### CREATE RAW READ LINKS #####" echo "---------------------------------" + mkdir ${workdir}/results/ 2> /dev/null mkdir ${workdir}/results/reads/ 2> /dev/null mkdir ${workdir}/results/reads/raw_links/ 2> /dev/null + # Create Symbolic links for raw reads + for fastq in ${workdir}/resources/reads/*.fastq.gz ; do - ln -s $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) 2> /dev/null ; + ln -s $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) ; done -echo "Silence mode" + echo 
"________________________________________________________________________" ###### Rename links ##### @@ -66,54 +97,114 @@ echo "________________________________________________________________________" echo "" echo "##### RENAME FASTQ FILES #####" echo "------------------------------" -rename -v 's/_S\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_} -rename -v 's/_L\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove line-ID ID like {_L001_} -rename -v 's/_\d+.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz -echo "Silence mode" + +#rename -v 's/_S\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove barcode-ID like {_S10_} +#rename -v 's/_L00\d_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove line-ID ID like {_L001_} +rename -v 's/_S\d+_L00\d_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove barcode-ID and line-ID like {_S10_L001} +rename -v 's/_001.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz # Remove end-name ID like {_001}.fastq.gz + +# Create dir for QC-tools and Conf-backup ##### +mkdir ${workdir}/results/reads/cutadapt_removed/ +mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/ +mkdir ${workdir}/results/reads/fastq-join_single-end/ +mkdir ${workdir}/results/reads/all_merged_single-end/ +cp -R ${workdir}/config/ ${workdir}/results/config/ + echo "________________________________________________________________________" -###### Create dir for QC tools ##### -mkdir ${workdir}/results/reads/cutadapt_removed/ 2> /dev/null -mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/ 2> /dev/null -mkdir ${workdir}/results/reads/fastq-join_single-end/ 2> /dev/null -mkdir ${workdir}/results/reads/all_merged_single-end/ 2> /dev/null ###### Extract bwa indexes for small genomes ##### echo "" echo "##### BOWTIE2 INDEXES EXTRACTION #####" echo 
"---------------------------------" -#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null -tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null + +#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/ +#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/ +tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_small_genomes.tar.gz -C ${workdir}/resources/databases/ +tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_large_genomes.tar.gz -C ${workdir}/resources/databases/ + echo "________________________________________________________________________" ###### Call snakemake pipeline ##### echo "" echo "##### SNAKEMAKE PIPELIE #####" echo "-----------------------------" + echo "" echo "Unlocking working directory:" -snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --unlock # unlock first, if previous error +snakemake \ + --directory ${workdir} \ + --use-conda \ + --printshellcmds \ + --keep-going \ + --rerun-incomplete \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --cores \ + --unlock # unlock first, if previous error + echo "" echo "Dry run:" -snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --dryrun # then dry run, check error like missing file +snakemake \ + --directory ${workdir} \ + --use-conda \ + --printshellcmds \ + --keep-going \ + --rerun-incomplete \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --cores \ + --dryrun # then dry run, check error like missing file + echo "" 
echo "Let's go!" -snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores # at last, real run +snakemake \ + --directory ${workdir} \ + --use-conda \ + --printshellcmds \ + --keep-going \ + --rerun-incomplete \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --cores 1 # at last, real run + echo "________________________________________________________________________" ###### Create usefull graphs and summary ###### echo "" echo "##### SNAKEMAKE PIPELINE GRAPHS ######" echo "--------------------------------------" + mkdir ${workdir}/results/graphs/ 2> /dev/null -snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --dag | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf -snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf -snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf -snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --summary > ${workdir}/results/graphs/Summary.txt + +snakemake \ + --directory ${workdir} \ + --use-conda \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --dag | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf + +snakemake \ + --directory ${workdir} \ + --use-conda \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf + +snakemake \ + --directory ${workdir} \ + --use-conda \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf + +snakemake \ + 
--directory ${workdir} \ + --use-conda \ + --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \ + --summary > ${workdir}/results/graphs/Summary.txt + echo "________________________________________________________________________" ###### Rename results directory with timestamp and project name ###### echo "" echo "##### SCRIPT END ######" echo "-----------------------" -mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/ 2> /dev/null + +#mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/ 2> /dev/null + +echo "________________________________________________________________________" diff --git a/config/config.yaml b/config/config.yaml index afc07dc..fa30364 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,4 +1,3 @@ ---- ############################################################################### # Author: Nicolas Fernandez # Affiliation: IRD_TransVIHMI @@ -36,9 +35,6 @@ datasets: #- 'fastq-join_single-end' #- 'fastq-join_paired-end' #- 'all_merged' - quality-tool: - - 'fastqc' - - 'fastq-screen' ## CUTADAPT ------------------------------------------------------------------------------------------ cutadapt: @@ -73,10 +69,10 @@ fastq-join: ## FASTQSCREEN --------------------------------------------------------------------------------------- fastq-screen: config: 'config/fastq-screen.conf' # path to the fastq-screen configuration file - subset: '100000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset) + subset: '1000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset) aligner: - #- 'bwa' # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome + - 'bwa' # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome #- 'bowtie' # Bowtie, an 
ultrafast, memory-efficient short read aligner - - 'bowtie2' # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences + #- 'bowtie2' # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences ## --------------------------------------------------------------------------------------------------- diff --git a/config/fastq-screen.conf b/config/fastq-screen.conf index d104722..7cff70c 100644 --- a/config/fastq-screen.conf +++ b/config/fastq-screen.conf @@ -7,11 +7,12 @@ ## Uncomment the relevant line below and set the appropriate location. ## Please note, this path should INCLUDE the executable filename. -BOWTIE /usr/local/bowtie -#BOWTIE2 /usr/local/bowtie2-2.3.4.1/bowtie2 -BOWTIE2 .snakemake/conda/db59fc2b/bin/bowtie2 -#BWA /usr/local/bwa -BWA .snakemake/conda/db59fc2b/bin/bwa +##----- BOWTIE ----- +#BOWTIE ${CONDA_PREFIX}/bin/bowtie +##----- BOWTIE 2 ----- +#BOWTIE2 ${CONDA_PREFIX}/bin/bowtie2 +##----- BWA ----- +#BWA ${CONDA_PREFIX}/bin/bwa ############################################ ## Bismark (for bisulfite sequencing only) # @@ -47,43 +48,55 @@ THREADS 1 ## ##---------- ## Human h38 +DATABASE Human resources/databases/bwa/Human/Homo_sapiens_h38 #DATABASE Human resources/databases/bowtie2/Human/Homo_sapiens_h38 ## ## Mouse m39 +DATABASE Mouse resources/databases/bwa/Mouse/Mus_musculus_m39 #DATABASE Mouse resources/databases/bowtie2/Mouse/Mus_musculus_m39 ## ## Arabidopsis thaliana - sequence from NCBI: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.3_TAIR10/GCF_000001735.3_TAIR10_genomic.fna.gz +DATABASE Arabidopsis resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10 #DATABASE Arabidopsis resources/databases/bowtie2/Arabidopsis/Arabidopsis_thaliana_t10 ## ## Ecoli - sequence available from EMBL accession U00096.2 -DATABASE Ecoli resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096 +DATABASE Ecoli 
resources/databases/bwa/Ecoli/Echerichia_coli_U00096 +#DATABASE Ecoli resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096 ## ##----- ## PhiX - sequence available from Refseq accession NC_001422.1 -DATABASE PhiX resources/databases/bowtie2/Phix/PhiX +DATABASE PhiX resources/databases/bwa/Phix/PhiX +#DATABASE PhiX resources/databases/bowtie2/Phix/PhiX ## ## Adapters - sequence derived from the FastQC contaminats file found at: www.bioinformatics.babraham.ac.uk/projects/fastqc -DATABASE Adapters resources/databases/bowtie2/Adapters/Adapters +DATABASE Adapters resources/databases/bwa/Adapters/Adapters +#DATABASE Adapters resources/databases/bowtie2/Adapters/Adapters ## ## Vector - Sequence taken from the UniVec database: http://www.ncbi.nlm.nih.gov/VecScreen/UniVec.html, BUT, without phi-X174 -DATABASE Vectors resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174 +DATABASE Vectors resources/databases/bwa/Vectors/UniVec_wo_phi-X174 +#DATABASE Vectors resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174 ## ##----------- ## Gorilla g4 +DATABASE Gorilla resources/databases/bwa/Gorilla/Gorilla_gorilla_g4 #DATABASE Gorilla resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4 ## ## Chimpanzee +DATABASE Chimpanzee resources/databases/bwa/Chimpanzee/Pan_troglodytes_t3 #DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3 ## ## Bat 10 +DATABASE Bat resources/databases/bwa/Bat/Pteropus_vampyrus_v1 #DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1 ## ## HIV - HXB2 -DATABASE HIV resources/databases/bowtie2/HIV/HXB2 +DATABASE HIV resources/databases/bwa/HIV/HXB2 +#DATABASE HIV resources/databases/bowtie2/HIV/HXB2 ## ## Ebola - sequence available from NCBI accession NC_002549 -DATABASE Ebola resources/databases/bowtie2/Ebola/ZEBOV +DATABASE Ebola resources/databases/bwa/Ebola/ZEBOV +#DATABASE Ebola resources/databases/bowtie2/Ebola/ZEBOV ## ## SARS-CoV-2 - sequence from Whuhan available from NCBI accession NC_045512.2 -DATABASE 
SARS-CoV-2 resources/databases/bowtie2/SARS-CoV-2/Wuhan-WIV04-2019 - +DATABASE SARS-CoV-2 resources/databases/bwa/SARS-CoV-2/Wuhan-WIV04-2019 +#DATABASE SARS-CoV-2 resources/databases/bowtie2/SARS-CoV-2/Wuhan-WIV04-2019 diff --git a/workflow/rules/reads_quality_control_pipeline.smk b/workflow/rules/reads_quality_control_pipeline.smk index 14fdb3b..bb1044e 100644 --- a/workflow/rules/reads_quality_control_pipeline.smk +++ b/workflow/rules/reads_quality_control_pipeline.smk @@ -5,7 +5,7 @@ # Aim: Reads Quality Control # Date: 2021.04.30 # Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores -# Latest modification: 2021.10.14 +# Latest modification: 2021.10.19 # Todo: ND ############################################################################### @@ -17,7 +17,7 @@ configfile: "config/config.yaml" # WILDCARDS # SAMPLE, = glob_wildcards("results/reads/raw_links/{sample}_R1.fastq.gz") QCDIR = config["datasets"]["quality-directory"] - + # ENVIRONMENT # CUTADAPT = config["conda"]["cutadapt"] # Cutadapt SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim @@ -69,7 +69,7 @@ rule multiqc: "results/reports/multiqc/{qcdir}.log" input: fastqc = "results/quality/{qcdir}/fastqc/", - fastqscreen = "results/quality/{qcdir}/fastqscreen/" + fastqscreen = "results/quality/{qcdir}/fastq-screen/" output: multiqc = directory("results/quality/{qcdir}/multiqc/") log: @@ -79,7 +79,7 @@ rule multiqc: "--quiet " # -q: Only show log warning "--outdir {output.multiqc} " # -o: Create report in the specified output directory "{input.fastqc} " # Input FastQC files - "{input.fastqscreen} " # Input Fastq-Screen files + "{input.fastqscreen} " # Input Fastq-Screen #"--no-ansi " # Disable coloured log "&> {log}" # Add redirection for log @@ -100,7 +100,7 @@ rule fastqscreen: input: fastq = "results/reads/{qcdir}/" output: - fastqscreen = directory("results/quality/{qcdir}/fastqscreen/") + fastqscreen = directory("results/quality/{qcdir}/fastq-screen/") log: 
"results/reports/fastq-screen/{qcdir}.log" shell: @@ -268,6 +268,6 @@ rule cutadapt: "--paired-output {output.reverse} " # -p: Write second read in a pair to FILE "{input.forward} " # Input forward reads R1.fastq "{input.reverse} " # Input reverse reads R2.fastq - "&> {log} " # Add redirection for log + "&> {log}" # Add redirection for log ############################################################################### -- GitLab