Skip to content
Snippets Groups Projects
Commit 0bbb25b1 authored by Nicolas FERNANDEZ NUÑEZ's avatar Nicolas FERNANDEZ NUÑEZ
Browse files

Fix issue with untarring bowtie2 indexes and add date and project name to results dir

parent 4fe6ad34
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,6 @@
*.done
*.pdf
*.txt
/results
/results*
/resources/databases/bowtie2
/resources/databases/bwa
\ No newline at end of file
#!/bin/bash
# Launcher for the reads quality-control pipeline: prints host information,
# links and renames raw FASTQ files, extracts bowtie2 indexes, then drives a
# Snakemake workflow and finally archives the results directory with a
# timestamp and a user-supplied project name (see the sections below).
##### About #####
echo ""
echo "##### ABOUT #####"
......@@ -10,7 +10,7 @@ echo "Affiliation: IRD_U233_TransVIHMI"
echo "Aim: Reads Quality Control and Cleaning"
echo "Date: 2021.04.30"
echo "Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores"
echo "Latest modification: 2021.10.05"
echo "Latest modification: 2021.10.14"
echo "Todo: na"
echo "________________________________________________________________________"
......@@ -25,23 +25,39 @@ echo "Logical CPU: ${logicalcpu}" # Print logical cpu
hwmemsize=$(sysctl -n hw.memsize) # Get memory size in bytes (macOS-specific sysctl key; empty on Linux)
ramsize=$(( hwmemsize / 1024**3 )) # bytes -> GB with shell arithmetic (replaces the deprecated external 'expr' call)
echo "System Memory: ${ramsize} GB" # Print RAM size
# Timestamp is reused at the end of the script to rename the results directory
timestamp=$(date +'%Y-%m-%d_%H-%M')
echo "Date: ${timestamp}"
echo "________________________________________________________________________"
##### Working directory #####
echo ""
echo "##### WORKING DIRECTORY #####"
echo "-----------------------------"
WORKDIR=${0%/*}
echo "CWD: ${WORKDIR}"
workdir=${0%/*} # for MacOS users, running script with double-click on "RQCP.sh"
#workdir="." # for Linux users (todo), running script with terminal command "bash ./RQCP.sh"
echo "CWD: ${workdir}"
echo "________________________________________________________________________"
#### Get project name
echo ""
echo "#### GET PROJECT NAME ####"
echo "--------------------------"
# -r keeps backslashes in the typed name literal (SC2162); the value is used
# later to rename the results directory, so it should be stored verbatim.
read -r -p "Please enter a project name: " project
echo "Project name: ${project}"
###### Create links for raw samples #####
echo ""
echo "##### CREATE RAW READ LINKS #####"
echo "---------------------------------"
mkdir ${WORKDIR}/results/ ${WORKDIR}/results/reads/ ${WORKDIR}/results/reads/raw/ 2> /dev/null
for FASTQ in ${WORKDIR}/resources/reads/*.fastq.gz ; do ln -s $FASTQ ${WORKDIR}/results/reads/raw/$(basename $FASTQ) 2> /dev/null ; done # Create Symbol links, quiet: 2> /dev/null
echo "Quiet."
# Create the whole results tree in one idempotent call; quoting protects
# ${workdir} paths that contain spaces (e.g. macOS user folders).
mkdir -p "${workdir}/results/reads/raw_links/" 2> /dev/null
# Create Symbolic links for raw reads
for fastq in "${workdir}"/resources/reads/*.fastq.gz ; do
    # When the glob matches nothing it stays literal; skip it instead of
    # creating a dangling '*.fastq.gz' symlink.
    [ -e "$fastq" ] || continue
    ln -s "$fastq" "${workdir}/results/reads/raw_links/$(basename "$fastq")" 2> /dev/null
done
echo "Silence mode"
echo "________________________________________________________________________"
###### Rename links #####
......@@ -49,21 +65,24 @@ echo "________________________________________________________________________"
echo ""
echo "##### RENAME FASTQ FILES #####"
echo "------------------------------"
rename -v 's/_S\d+_/_/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_}, quiet: 2> /dev/null
rename -v 's/_L\d+_/_/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove line-ID ID like {_L001_}, quiet: 2> /dev/null
rename -v 's/_\d+.fastq.gz/.fastq.gz/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz, quiet: 2> /dev/null
echo "Quiet."
# NOTE(review): requires the Perl 'rename' (File::Rename); the util-linux
# rename found on some Linux distros uses a different syntax — confirm on target hosts.
rename -v 's/_S\d+_/_/' "${workdir}"/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_}
rename -v 's/_L\d+_/_/' "${workdir}"/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove lane-ID like {_L001_}
rename -v 's/_\d+\.fastq\.gz$/.fastq.gz/' "${workdir}"/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz (dots escaped, anchored at end)
echo "Silence mode"
echo "________________________________________________________________________"
###### Create dir for QC tools #####
mkdir ${WORKDIR}/results/reads/cutadapt/ ${WORKDIR}/results/reads/sickle-trim/ ${WORKDIR}/results/reads/fastq-join/ ${WORKDIR}/results/reads/merged/ 2> /dev/null
# One output directory per cleaning step; a single quoted 'mkdir -p' replaces
# four unquoted calls and is safe to re-run (no error when dirs exist).
mkdir -p \
    "${workdir}/results/reads/cutadapt_removed/" \
    "${workdir}/results/reads/sickle-trim_paired-end_filtered/" \
    "${workdir}/results/reads/fastq-join_single-end/" \
    "${workdir}/results/reads/all_merged_single-end/" 2> /dev/null
###### Extract bowtie2 indexes for small genomes #####
echo ""
echo "##### BOWTIE2 INDEXES EXTRATION #####"
echo "##### BOWTIE2 INDEXES EXTRACTION #####"
echo "---------------------------------"
echo ${WORKDIR}
tar --strip-components=2 -xzvf ${WORKDIR}/resources/databases/bowtie2.tar.gz -C ${WORKDIR}/resources/databases/
# --keep-old-files keeps indexes already extracted (GNU tar exits non-zero on
# conflicts, which is why stderr suppression was considered — left visible here).
tar --keep-old-files -xzvf "${workdir}/resources/databases/bowtie2_small_genomes.tar.gz" -C "${workdir}/resources/databases/" # 2> /dev/null
#tar --keep-old-files -xzvf "${workdir}/resources/databases/bowtie2_large_genomes.tar.gz" -C "${workdir}/resources/databases/" # 2> /dev/null
echo "________________________________________________________________________"
###### Call snakemake pipeline #####
......@@ -72,23 +91,28 @@ echo "##### SNAKEMAKE PIPELIE #####"
echo "-----------------------------"
echo ""
echo "Unlocking working directory:"
snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores --unlock # unlock first, if previous error
snakemake --directory "${workdir}" --use-conda --keep-going --rerun-incomplete -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --cores --unlock # unlock first, if previous error
echo ""
echo "Dry run:"
snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores --dryrun # then dry run, check error like missing file
snakemake --directory "${workdir}" --use-conda --keep-going --rerun-incomplete -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --cores --dryrun # then dry run, check error like missing file
echo ""
echo "Let's go!"
snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores # at last, real run
snakemake --directory "${workdir}" --use-conda --keep-going --rerun-incomplete -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --cores # at last, real run
echo "________________________________________________________________________"
###### Create useful graphs and summary
###### Create useful graphs and summary ######
echo ""
echo "##### SNAKEMAKE PIPELINE GRAPHS ######"
echo "--------------------------------------"
mkdir ${WORKDIR}/results/graphs/ 2> /dev/null
snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --dag | dot -Tpdf > ${WORKDIR}/results/graphs/DAGraph.pdf
snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --rulegraph | dot -Tpdf > ${WORKDIR}/results/graphs/RuleGraph.pdf
snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --filegraph | dot -Tpdf > ${WORKDIR}/results/graphs/FileGraph.pdf
snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --summary > ${WORKDIR}/results/graphs/Summary.txt
echo "Done."
mkdir -p "${workdir}/results/graphs/" 2> /dev/null
# Export pipeline graphs (DAG, rule graph, file graph) and a summary table;
# quoting protects workdir paths containing spaces.
snakemake --directory "${workdir}" --use-conda -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --dag | dot -Tpdf > "${workdir}/results/graphs/DAGraph.pdf"
snakemake --directory "${workdir}" --use-conda -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --rulegraph | dot -Tpdf > "${workdir}/results/graphs/RuleGraph.pdf"
snakemake --directory "${workdir}" --use-conda -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --filegraph | dot -Tpdf > "${workdir}/results/graphs/FileGraph.pdf"
snakemake --directory "${workdir}" --use-conda -s "${workdir}/workflow/rules/reads_quality_control_pipeline.smk" --summary > "${workdir}/results/graphs/Summary.txt"
echo "________________________________________________________________________"
###### Rename results directory with timestamp and project name ######
echo ""
echo "##### SCRIPT END ######"
echo "-----------------------"
# Stamp the results dir so successive runs never collide; quoting protects a
# user-typed ${project} (or a ${workdir}) containing spaces.
mv "${workdir}/results/" "${workdir}/results_${timestamp}_${project}/"
......@@ -29,13 +29,13 @@ conda:
## DATASETS ------------------------------------------------------------------------------------------
datasets:
quality-directory:
- 'raw'
- 'cutadapt'
- 'sickle-trim'
#- 'sickle-single'
- 'fastq-join'
#- 'fastq-join-unique'
- 'merged'
- 'raw_links'
#- 'cutadapt_removed'
- 'sickle-trim_paired-end_filtered'
#- 'sickle-trim_single-end'
#- 'fastq-join_single-end'
#- 'fastq-join_paired-end'
#- 'all_merged'
quality-tool:
- 'fastqc'
- 'fastq-screen'
......@@ -73,7 +73,7 @@ fastq-join:
## FASTQSCREEN ---------------------------------------------------------------------------------------
fastq-screen:
config: 'config/fastq-screen.conf' # path to the fastq-screen configuration file
subset: '10000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset)
subset: '100000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset)
aligner:
#- 'bwa' # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome
#- 'bowtie' # Bowtie, an ultrafast, memory-efficient short read aligner
......
......@@ -47,13 +47,13 @@ THREADS 1
##
##----------
## Human h38
#DATABASE Human resources/databases/bwa/Human/Homo_sapiens_h38
DATABASE Human resources/databases/bowtie2/Human/Homo_sapiens_h38
##
## Mouse m39
#DATABASE Mouse resources/databases/bwa/Mouse/Mus_musculus_m39
DATABASE Mouse resources/databases/bowtie2/Mouse/Mus_musculus_m39
##
## Arabidopsis thaliana - sequence from NCBI: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.3_TAIR10/GCF_000001735.3_TAIR10_genomic.fna.gz
##DATABASE Arabidopsis resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10
DATABASE Arabidopsis resources/databases/bowtie2/Arabidopsis/Arabidopsis_thaliana_t10
##
## Ecoli - sequence available from EMBL accession U00096.2
DATABASE Ecoli resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096
......@@ -70,13 +70,13 @@ DATABASE Vectors resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174
##
##-----------
## Gorilla g4
#DATABASE Gorilla resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4
DATABASE Gorilla resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4
##
## Chimpanzee
#DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3
DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3
##
## Bat 10
#DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1
DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1
##
## HIV - HXB2
DATABASE HIV resources/databases/bowtie2/HIV/HXB2
......
......@@ -15,7 +15,7 @@ configfile: "config/config.yaml"
# FUNCTIONS #
# WILDCARDS #
SAMPLE, = glob_wildcards("results/reads/raw/{sample}_R1.fastq.gz")
SAMPLE, = glob_wildcards("results/reads/raw_links/{sample}_R1.fastq.gz")
QCDIR = config["datasets"]["quality-directory"]
# ENVIRONMENT #
......@@ -52,7 +52,7 @@ SUBSET = config["fastq-screen"]["subset"] # Fastq-screen --subset
###############################################################################
rule all:
input:
merged = expand("results/reads/merged/{sample}_cutadapt_sickle-trim_fastq-join_merged.fastq.gz",
merged = expand("results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz",
sample = SAMPLE),
multiqc = expand("results/quality/{qcdir}/multiqc/",
qcdir = QCDIR)
......@@ -146,12 +146,12 @@ rule merge:
message:
"Take all passing filters {wildcards.sample} reads"
input:
single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz",
forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz",
forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz",
reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz",
joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz",
output:
merged = "results/reads/merged/{sample}_cutadapt_sickle-trim_fastq-join_merged.fastq.gz"
merged = "results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz"
shell:
"cat " # Cat, concatenate
"{input.single} " # Input trimmed singles fastq file from Sickle-trim
......@@ -172,12 +172,12 @@ rule fastqjoin:
percent = PERCENT,
overlap = OVERLAP
input:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz"
forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz",
reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz"
output:
forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz",
reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz",
joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz",
report = "results/reports/fastq-join/{sample}_stich_length_report.txt"
log:
"results/reports/fastq-join/{sample}.log"
......@@ -207,12 +207,12 @@ rule sickletrim:
quality = QUALITY,
length = LENGTHS
input:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz"
output:
forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz",
reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz",
single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz"
forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz",
reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz",
single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz"
log:
"results/reports/sickle-trim/{sample}.log"
shell:
......@@ -246,11 +246,11 @@ rule cutadapt:
nextera = NEXTERA,
small = SMALL
input:
forward = "results/reads/raw/{sample}_R1.fastq.gz",
reverse = "results/reads/raw/{sample}_R2.fastq.gz"
forward = "results/reads/raw_links/{sample}_R1.fastq.gz",
reverse = "results/reads/raw_links/{sample}_R2.fastq.gz"
output:
forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz"
forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz",
reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz"
log:
"results/reports/cutadapt/{sample}.log"
shell:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment