From f7c572c000050b3ffc050be805c6a55c12b5bee4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20FERNANDEZ=20NU=C3=91EZ?=
 <fernandez.nunez.nicolas@gmail.com>
Date: Fri, 22 Oct 2021 16:47:21 +0200
Subject: [PATCH] bwa as aligner

---
 RQCP.sh                                       | 149 ++++++++++++++----
 config/config.yaml                            |  10 +-
 config/fastq-screen.conf                      |  39 +++--
 .../rules/reads_quality_control_pipeline.smk  |  12 +-
 4 files changed, 155 insertions(+), 55 deletions(-)

diff --git a/RQCP.sh b/RQCP.sh
index 6ce6092..bd74bcf 100755
--- a/RQCP.sh
+++ b/RQCP.sh
@@ -18,47 +18,78 @@ echo "________________________________________________________________________"
 echo ""
 echo "##### HARDWARE #####"
 echo "--------------------"
+
 physicalcpu=$(sysctl -n hw.physicalcpu)   # Get physical cpu
 echo "Physical CPU: ${physicalcpu}"       # Print physical cpu
+
 logicalcpu=$(sysctl -n hw.logicalcpu)     # Get logical cpu
 echo "Logical CPU: ${logicalcpu}"         # Print logical cpu
+
 hwmemsize=$(sysctl -n hw.memsize)         # Get memory size
 ramsize=$(expr $hwmemsize / $((1024**3))) # 1024**3 = GB
 echo "System Memory: ${ramsize} GB"       # Print RAM size
+
 timestamp=$(date +'%Y-%m-%d_%H-%M')
 echo "Date: ${timestamp}"
+
 echo "________________________________________________________________________"
 
 ##### Working directory #####
 echo ""
 echo "##### WORKING DIRECTORY #####"
 echo "-----------------------------"
+
 workdir=${0%/*} # for MacOS users, running script with double-click on "RQCP.sh"
 #workdir="." # for Linux users (todo), running script with terminal command "bash ./RQCP.sh"
 echo "CWD: ${workdir}"
-echo "________________________________________________________________________"
 
-##### Get project name ##### 
-echo ""
-echo "##### GET PROJECT NAME #####"
-echo "----------------------------"
-project="project" 
-read -p "Please enter a project name: " project
-echo "Project name: ${project}"
 echo "________________________________________________________________________"
 
+###### Get threads number #####
+#echo ""
+#echo "##### GET THREADS NUMBER ####"
+#echo "--------------------------"
+#
+#threads=$logicalcpu 
+#read -p "Please enter threads number (default, all: ${logicalcpu}): " input_threads
+#if [[ $input_threads -gt 0 ]] && [[ $input_threads -le $logicalcpu ]] 2> /dev/null
+#then
+#    threads=$input_threads
+#fi
+#echo "Pipeline running with ${threads} threads"
+#
+#echo "________________________________________________________________________"
+#
+###### Get project name ##### 
+#echo ""
+#echo "##### GET PROJECT NAME #####"
+#echo "----------------------------"
+#
+#project="project"
+#read -p "Please enter a project name: " input_project
+#if [[ $input_project != $project ]] 2> /dev/null
+#then
+#    project=$input_project
+#fi
+#echo "Project name: ${project}"
+#
+#echo "________________________________________________________________________"
+
 ###### Create links for raw samples #####
 echo ""
 echo "##### CREATE RAW READ LINKS #####"
 echo "---------------------------------"
+
 mkdir ${workdir}/results/ 2> /dev/null
 mkdir ${workdir}/results/reads/ 2> /dev/null
 mkdir ${workdir}/results/reads/raw_links/ 2> /dev/null
+
 # Create Symbolic links for raw reads
+
 for fastq in ${workdir}/resources/reads/*.fastq.gz ; do
-    ln -s $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) 2> /dev/null ;
+    ln -s $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) ;
 done
-echo "Silence mode"
+
 echo "________________________________________________________________________"
 
 ###### Rename links #####
@@ -66,54 +97,114 @@ echo "________________________________________________________________________"
 echo ""
 echo "##### RENAME FASTQ FILES #####"
 echo "------------------------------"
-rename -v 's/_S\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null                # Remove barcode-ID like {_S001_}
-rename -v 's/_L\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null                # Remove line-ID ID like {_L001_}
-rename -v 's/_\d+.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz
-echo "Silence mode"
+
+#rename -v 's/_S\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz                 # Remove barcode-ID like {_S10_}
+#rename -v 's/_L00\d_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz                # Remove lane-ID like {_L001_}
+rename -v 's/_S\d+_L00\d_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz            # Remove barcode-ID and lane-ID like {_S10_L001_}
+rename -v 's/_001.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz   # Remove end-name ID like {_001}.fastq.gz
+
+# Create dir for QC-tools and Conf-backup #####
+mkdir ${workdir}/results/reads/cutadapt_removed/
+mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/
+mkdir ${workdir}/results/reads/fastq-join_single-end/
+mkdir ${workdir}/results/reads/all_merged_single-end/
+cp -R ${workdir}/config/ ${workdir}/results/config/
+
 echo "________________________________________________________________________"
 
-###### Create dir for QC tools #####
-mkdir ${workdir}/results/reads/cutadapt_removed/ 2> /dev/null
-mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/ 2> /dev/null
-mkdir ${workdir}/results/reads/fastq-join_single-end/ 2> /dev/null
-mkdir ${workdir}/results/reads/all_merged_single-end/ 2> /dev/null
 
 ###### Extract bwa indexes for small genomes #####
 echo ""
 echo "##### BOWTIE2 INDEXES EXTRACTION #####"
 echo "---------------------------------"
-#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null
-tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null
+
+#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/
+#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/
+tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_small_genomes.tar.gz -C ${workdir}/resources/databases/
+tar --keep-old-files -xzvf ${workdir}/resources/databases/bwa_large_genomes.tar.gz -C ${workdir}/resources/databases/
+
 echo "________________________________________________________________________"
 
 ###### Call snakemake pipeline #####
 echo ""
 echo "##### SNAKEMAKE PIPELIE #####"
 echo "-----------------------------"
+
 echo ""
 echo "Unlocking working directory:"
-snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --unlock # unlock first, if previous error
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --printshellcmds \
+    --keep-going \
+    --rerun-incomplete \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --cores \
+    --unlock # unlock first, in case a previous run failed
+
 echo ""
 echo "Dry run:"
-snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --dryrun # then dry run, check error like missing file
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --printshellcmds \
+    --keep-going \
+    --rerun-incomplete \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --cores \
+    --dryrun # then dry run, to catch errors such as missing input files
+
 echo ""
 echo "Let's go!"
-snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores          # at last, real run 
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --printshellcmds \
+    --keep-going \
+    --rerun-incomplete \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --cores 1 # finally, the real run
+
 echo "________________________________________________________________________"
 
 ###### Create useful graphs and summary ######
 echo ""
 echo "##### SNAKEMAKE PIPELINE GRAPHS ######"
 echo "--------------------------------------"
+
 mkdir ${workdir}/results/graphs/ 2> /dev/null
-snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --dag       | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf
-snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf
-snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf
-snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --summary   >             ${workdir}/results/graphs/Summary.txt
+
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --dag | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf
+
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf
+
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf
+
+snakemake \
+    --directory ${workdir} \
+    --use-conda \
+    --snakefile ${workdir}/workflow/rules/reads_quality_control_pipeline.smk \
+    --summary > ${workdir}/results/graphs/Summary.txt
+
 echo "________________________________________________________________________"
 
 ###### Rename results directory with timestamp and project name ######
 echo ""
 echo "##### SCRIPT END ######"
 echo "-----------------------"
-mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/ 2> /dev/null
+
+#mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/ 2> /dev/null
+
+echo "________________________________________________________________________"
diff --git a/config/config.yaml b/config/config.yaml
index afc07dc..fa30364 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,4 +1,3 @@
----
 ###############################################################################
 # Author: Nicolas Fernandez
 # Affiliation: IRD_TransVIHMI
@@ -36,9 +35,6 @@ datasets:
     #- 'fastq-join_single-end'
     #- 'fastq-join_paired-end'
     #- 'all_merged'
-  quality-tool:
-    - 'fastqc'
-    - 'fastq-screen'
 
 ## CUTADAPT ------------------------------------------------------------------------------------------
 cutadapt:
@@ -73,10 +69,10 @@ fastq-join:
 ## FASTQSCREEN ---------------------------------------------------------------------------------------
 fastq-screen:
   config: 'config/fastq-screen.conf' # path to the fastq-screen configuration file
-  subset: '100000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset)
+  subset: '1000' # Don't use the whole sequence file, but create a temporary dataset with this number of reads (default: '100000'; set '0' to use the whole dataset)
   aligner:
-    #- 'bwa'     # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome
+    - 'bwa'     # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome
     #- 'bowtie'  # Bowtie, an ultrafast, memory-efficient short read aligner
-    - 'bowtie2' # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences
+    #- 'bowtie2' # Bowtie2, an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences
 
 ## ---------------------------------------------------------------------------------------------------
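For context, the aligner and subset values above are passed through to fastq-screen by the pipeline. A rough, hand-run equivalent for a single FASTQ file, assuming fastq_screen is on PATH inside the conda environment (the sample name and output directory are placeholders, not taken from the patch):

    fastq_screen \
        --conf config/fastq-screen.conf \
        --aligner bwa \
        --subset 1000 \
        --outdir results/quality/raw_links/fastq-screen/ \
        results/reads/raw_links/sample_R1.fastq.gz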
diff --git a/config/fastq-screen.conf b/config/fastq-screen.conf
index d104722..7cff70c 100644
--- a/config/fastq-screen.conf
+++ b/config/fastq-screen.conf
@@ -7,11 +7,12 @@
 ## Uncomment the relevant line below and set the appropriate location.
 ## Please note, this path should INCLUDE the executable filename.
 
-BOWTIE	  /usr/local/bowtie
-#BOWTIE2	  /usr/local/bowtie2-2.3.4.1/bowtie2
-BOWTIE2	  .snakemake/conda/db59fc2b/bin/bowtie2
-#BWA	  /usr/local/bwa
-BWA	  .snakemake/conda/db59fc2b/bin/bwa
+##----- BOWTIE -----
+#BOWTIE	  ${CONDA_PREFIX}/bin/bowtie
+##----- BOWTIE 2 -----
+#BOWTIE2	       ${CONDA_PREFIX}/bin/bowtie2
+##----- BWA -----
+#BWA	  ${CONDA_PREFIX}/bin/bwa
 
 ############################################
 ## Bismark (for bisulfite sequencing only) #
@@ -47,43 +48,55 @@ THREADS 1
 ##
 ##----------
 ## Human h38
+DATABASE	Human	 resources/databases/bwa/Human/Homo_sapiens_h38
 #DATABASE	Human	 resources/databases/bowtie2/Human/Homo_sapiens_h38
 ##
 ## Mouse m39
+DATABASE	Mouse	 resources/databases/bwa/Mouse/Mus_musculus_m39
 #DATABASE	Mouse	 resources/databases/bowtie2/Mouse/Mus_musculus_m39
 ##
 ## Arabidopsis thaliana - sequence from NCBI: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.3_TAIR10/GCF_000001735.3_TAIR10_genomic.fna.gz
+DATABASE	Arabidopsis	 resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10
 #DATABASE	Arabidopsis	 resources/databases/bowtie2/Arabidopsis/Arabidopsis_thaliana_t10
 ##
 ## Ecoli - sequence available from EMBL accession U00096.2
-DATABASE	Ecoli	 resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096
+DATABASE	Ecoli	 resources/databases/bwa/Ecoli/Echerichia_coli_U00096
+#DATABASE	Ecoli	 resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096
 ##
 ##-----
 ## PhiX - sequence available from Refseq accession NC_001422.1
-DATABASE	PhiX	 resources/databases/bowtie2/Phix/PhiX
+DATABASE	PhiX	 resources/databases/bwa/Phix/PhiX
+#DATABASE	PhiX	 resources/databases/bowtie2/Phix/PhiX
 ##
 ## Adapters - sequence derived from the FastQC contaminants file found at: www.bioinformatics.babraham.ac.uk/projects/fastqc
-DATABASE	Adapters        resources/databases/bowtie2/Adapters/Adapters
+DATABASE	Adapters        resources/databases/bwa/Adapters/Adapters
+#DATABASE	Adapters        resources/databases/bowtie2/Adapters/Adapters
 ##
 ## Vector - Sequence taken from the UniVec database: http://www.ncbi.nlm.nih.gov/VecScreen/UniVec.html, BUT, without phi-X174
-DATABASE	Vectors		 resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174
+DATABASE	Vectors		 resources/databases/bwa/Vectors/UniVec_wo_phi-X174
+#DATABASE	Vectors		 resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174
 ##
 ##-----------
 ## Gorilla g4	
+DATABASE	Gorilla	 resources/databases/bwa/Gorilla/Gorilla_gorilla_g4
 #DATABASE	Gorilla	 resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4
 ##
 ## Chimpanzee
+DATABASE	Chimpanzee	 resources/databases/bwa/Chimpanzee/Pan_troglodytes_t3
 #DATABASE	Chimpanzee	 resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3
 ##
 ## Bat 10
+DATABASE	Bat	 resources/databases/bwa/Bat/Pteropus_vampyrus_v1
 #DATABASE	Bat	 resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1
 ##
 ## HIV - HXB2
-DATABASE	HIV	 resources/databases/bowtie2/HIV/HXB2
+DATABASE	HIV	 resources/databases/bwa/HIV/HXB2
+#DATABASE	HIV	 resources/databases/bowtie2/HIV/HXB2
 ##
 ## Ebola - sequence available from NCBI accession NC_002549
-DATABASE	Ebola	 resources/databases/bowtie2/Ebola/ZEBOV
+DATABASE	Ebola	 resources/databases/bwa/Ebola/ZEBOV
+#DATABASE	Ebola	 resources/databases/bowtie2/Ebola/ZEBOV
 ##
 ## SARS-CoV-2 - sequence from Wuhan available from NCBI accession NC_045512.2
-DATABASE	SARS-CoV-2	 resources/databases/bowtie2/SARS-CoV-2/Wuhan-WIV04-2019
-
+DATABASE	SARS-CoV-2	 resources/databases/bwa/SARS-CoV-2/Wuhan-WIV04-2019
+#DATABASE	SARS-CoV-2	 resources/databases/bowtie2/SARS-CoV-2/Wuhan-WIV04-2019
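A quick sanity check (not part of the patch) that every active DATABASE entry above has a bwa index behind it, assuming the indexes were built with the DATABASE path as prefix, so that a <prefix>.bwt file exists:

    # Illustrative only: warn about DATABASE prefixes with no bwa index files.
    grep '^DATABASE' config/fastq-screen.conf | while read -r _ name prefix _ ; do
        [ -e "${prefix}.bwt" ] || echo "WARNING: no bwa index found for ${name} (${prefix}.bwt)"
    done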
diff --git a/workflow/rules/reads_quality_control_pipeline.smk b/workflow/rules/reads_quality_control_pipeline.smk
index 14fdb3b..bb1044e 100644
--- a/workflow/rules/reads_quality_control_pipeline.smk
+++ b/workflow/rules/reads_quality_control_pipeline.smk
@@ -5,7 +5,7 @@
 # Aim: Reads Quality Control
 # Date: 2021.04.30
 # Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores 
-# Latest modification: 2021.10.14
+# Latest modification: 2021.10.19
 # Todo: ND
 
 ###############################################################################
@@ -17,7 +17,7 @@ configfile: "config/config.yaml"
 # WILDCARDS #
 SAMPLE, = glob_wildcards("results/reads/raw_links/{sample}_R1.fastq.gz")
 QCDIR = config["datasets"]["quality-directory"]
-
+ 
 # ENVIRONMENT #
 CUTADAPT = config["conda"]["cutadapt"]      # Cutadapt
 SICKLETRIM = config["conda"]["sickle-trim"] # Sickle-trim
@@ -69,7 +69,7 @@ rule multiqc:
        "results/reports/multiqc/{qcdir}.log"
     input:
         fastqc = "results/quality/{qcdir}/fastqc/",
-        fastqscreen = "results/quality/{qcdir}/fastqscreen/"
+        fastqscreen = "results/quality/{qcdir}/fastq-screen/"
     output:
         multiqc = directory("results/quality/{qcdir}/multiqc/")
     log:
@@ -79,7 +79,7 @@ rule multiqc:
         "--quiet "                   # -q: Only show log warning
         "--outdir {output.multiqc} " # -o: Create report in the specified output directory
         "{input.fastqc} "            # Input FastQC files
-        "{input.fastqscreen} "       # Input Fastq-Screen files
+        "{input.fastqscreen} "       # Input Fastq-Screen
         #"--no-ansi "                 # Disable coloured log
         "&> {log}"                   # Add redirection for log
 
@@ -100,7 +100,7 @@ rule fastqscreen:
     input:
         fastq = "results/reads/{qcdir}/"
     output:
-        fastqscreen = directory("results/quality/{qcdir}/fastqscreen/")
+        fastqscreen = directory("results/quality/{qcdir}/fastq-screen/")
     log:
         "results/reports/fastq-screen/{qcdir}.log"
     shell:
@@ -268,6 +268,6 @@ rule cutadapt:
         "--paired-output {output.reverse} " # -p: Write second read in a pair to FILE
         "{input.forward} "                  # Input forward reads R1.fastq
         "{input.reverse} "                  # Input reverse reads R2.fastq
-        "&> {log} "                         # Add redirection for log
+        "&> {log}"                          # Add redirection for log
 
 ###############################################################################
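The rule edits above only rename the Fastq-Screen output directory from fastqscreen/ to fastq-screen/, with MultiQC still aggregating both QC outputs. A hand-run equivalent of the multiqc rule for one quality directory, assuming 'raw_links' as the {qcdir} wildcard value (placeholder):

    multiqc \
        --quiet \
        --outdir results/quality/raw_links/multiqc/ \
        results/quality/raw_links/fastqc/ \
        results/quality/raw_links/fastq-screen/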
-- 
GitLab