diff --git a/.gitignore b/.gitignore
index cfacd838abff6e40dac5ee3f8932cfd8f9b35fea..8f169918d07779fd795964035064a8dabe48c02f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,6 @@
 *.done
 *.pdf
 *.txt
-/results
 /results*
 /resources/databases/bowtie2
 /resources/databases/bwa
\ No newline at end of file
diff --git a/RQCP.sh b/RQCP.sh
index eaed198d85d20a0281f71ad4225d39e84bf03d88..55474727da15a8da7fc101735a335a56ad08c9e3 100755
--- a/RQCP.sh
+++ b/RQCP.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-
+
 ##### About #####
 echo ""
 echo "##### ABOUT #####"
@@ -10,7 +10,7 @@ echo "Affiliation: IRD_U233_TransVIHMI"
 echo "Aim: Reads Quality Control and Cleaning"
 echo "Date: 2021.04.30"
 echo "Run: snakemake --use-conda -s reads_quality_control_pipeline.smk --cores"
-echo "Latest modification: 2021.10.05"
+echo "Latest modification: 2021.10.14"
 echo "Todo: na"
 echo "________________________________________________________________________"
 
@@ -25,23 +25,39 @@ echo "Logical CPU: ${logicalcpu}" # Print logical cpu
 hwmemsize=$(sysctl -n hw.memsize) # Get memory size
 ramsize=$(expr $hwmemsize / $((1024**3))) # 1024**3 = GB
 echo "System Memory: ${ramsize} GB" # Print RAM size
+timestamp=$(date +'%Y-%m-%d_%H-%M')
+echo "Date: ${timestamp}"
 echo "________________________________________________________________________"
 
 ##### Working directory #####
 echo ""
 echo "##### WORKING DIRECTORY #####"
 echo "-----------------------------"
-WORKDIR=${0%/*}
-echo "CWD: ${WORKDIR}"
+workdir=${0%/*} # for MacOS users, running script with double-click on "RQCP.sh"
+#workdir="." # for Linux users (todo), running script with terminal command "bash ./RQCP.sh"
+echo "CWD: ${workdir}"
+echo "________________________________________________________________________"
+
+#### Get project name
+echo ""
+echo "#### GET PROJECT NAME ####"
+echo "--------------------------"
+read -p "Please enter a project name: " project
+echo "Project name: ${project}"
 echo "________________________________________________________________________"
 
 ###### Create links for raw samples #####
 echo ""
 echo "##### CREATE RAW READ LINKS #####"
 echo "---------------------------------"
-mkdir ${WORKDIR}/results/ ${WORKDIR}/results/reads/ ${WORKDIR}/results/reads/raw/ 2> /dev/null
-for FASTQ in ${WORKDIR}/resources/reads/*.fastq.gz ; do ln -s $FASTQ ${WORKDIR}/results/reads/raw/$(basename $FASTQ) 2> /dev/null ; done # Create Symbol links, quiet: 2> /dev/null
-echo "Quiet."
+mkdir ${workdir}/results/ 2> /dev/null
+mkdir ${workdir}/results/reads/ 2> /dev/null
+mkdir ${workdir}/results/reads/raw_links/ 2> /dev/null
+# Create symbolic links for raw reads
+for fastq in ${workdir}/resources/reads/*.fastq.gz ; do
+    ln -s $fastq ${workdir}/results/reads/raw_links/$(basename $fastq) 2> /dev/null ;
+done
+echo "Silent mode"
 echo "________________________________________________________________________"
 
 ###### Rename links #####
@@ -49,21 +65,24 @@ echo "________________________________________________________________________"
 echo ""
 echo "##### RENAME FASTQ FILES #####"
 echo "------------------------------"
-rename -v 's/_S\d+_/_/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_}, quiet: 2> /dev/null
-rename -v 's/_L\d+_/_/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove line-ID ID like {_L001_}, quiet: 2> /dev/null
-rename -v 's/_\d+.fastq.gz/.fastq.gz/' ${WORKDIR}/results/reads/raw/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz, quiet: 2> /dev/null
-echo "Quiet."
+rename -v 's/_S\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove barcode-ID like {_S001_}
+rename -v 's/_L\d+_/_/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove line-ID like {_L001_}
+rename -v 's/_\d+.fastq.gz/.fastq.gz/' ${workdir}/results/reads/raw_links/*.fastq.gz 2> /dev/null # Remove end-name ID like {_001}.fastq.gz
+echo "Silent mode"
 echo "________________________________________________________________________"
 
 ###### Create dir for QC tools #####
-mkdir ${WORKDIR}/results/reads/cutadapt/ ${WORKDIR}/results/reads/sickle-trim/ ${WORKDIR}/results/reads/fastq-join/ ${WORKDIR}/results/reads/merged/ 2> /dev/null
+mkdir ${workdir}/results/reads/cutadapt_removed/ 2> /dev/null
+mkdir ${workdir}/results/reads/sickle-trim_paired-end_filtered/ 2> /dev/null
+mkdir ${workdir}/results/reads/fastq-join_single-end/ 2> /dev/null
+mkdir ${workdir}/results/reads/all_merged_single-end/ 2> /dev/null
 
 ###### Extract bwa indexes for small genomes #####
 echo ""
-echo "##### BOWTIE2 INDEXES EXTRATION #####"
+echo "##### BOWTIE2 INDEXES EXTRACTION #####"
 echo "---------------------------------"
-echo ${WORKDIR}
-tar --strip-components=2 -xzvf ${WORKDIR}/resources/databases/bowtie2.tar.gz -C ${WORKDIR}/resources/databases/
+tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_small_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null
+#tar --keep-old-files -xzvf ${workdir}/resources/databases/bowtie2_large_genomes.tar.gz -C ${workdir}/resources/databases/ # 2> /dev/null
 echo "________________________________________________________________________"
 
 ###### Call snakemake pipeline #####
@@ -72,23 +91,28 @@ echo "##### SNAKEMAKE PIPELIE #####"
 echo "-----------------------------"
 echo ""
 echo "Unlocking working directory:"
-snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores --unlock # unlock first, if previous error
+snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --unlock # unlock first, if previous error
 echo ""
 echo "Dry run:"
-snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores --dryrun # then dry run, check error like missing file
+snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores --dryrun # then dry run, check errors like missing files
 echo ""
 echo "Let's go!"
-snakemake --directory ${WORKDIR} --use-conda --keep-going --rerun-incomplete -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --cores # at last, real run
+snakemake --directory ${workdir} --use-conda --keep-going --rerun-incomplete -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --cores # at last, real run
 echo "________________________________________________________________________"
 
-###### Create usefull graphs and summary
+###### Create useful graphs and summary ######
 echo ""
 echo "##### SNAKEMAKE PIPELINE GRAPHS ######"
 echo "--------------------------------------"
-mkdir ${WORKDIR}/results/graphs/ 2> /dev/null
-snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --dag | dot -Tpdf > ${WORKDIR}/results/graphs/DAGraph.pdf
-snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --rulegraph | dot -Tpdf > ${WORKDIR}/results/graphs/RuleGraph.pdf
-snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --filegraph | dot -Tpdf > ${WORKDIR}/results/graphs/FileGraph.pdf
-snakemake --directory ${WORKDIR} --use-conda -s ${WORKDIR}/workflow/rules/reads_quality_control_pipeline.smk --summary > ${WORKDIR}/results/graphs/Summary.txt
-echo "Done."
+mkdir ${workdir}/results/graphs/ 2> /dev/null
+snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --dag | dot -Tpdf > ${workdir}/results/graphs/DAGraph.pdf
+snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --rulegraph | dot -Tpdf > ${workdir}/results/graphs/RuleGraph.pdf
+snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --filegraph | dot -Tpdf > ${workdir}/results/graphs/FileGraph.pdf
+snakemake --directory ${workdir} --use-conda -s ${workdir}/workflow/rules/reads_quality_control_pipeline.smk --summary > ${workdir}/results/graphs/Summary.txt
 echo "________________________________________________________________________"
+
+###### Rename results directory with timestamp and project name ######
+echo ""
+echo "##### SCRIPT END ######"
+echo "-----------------------"
+mv ${workdir}/results/ ${workdir}/results_${timestamp}_${project}/
diff --git a/config/config.yaml b/config/config.yaml
index db1a5479c2900bfd7e6bd6d6422d46a48ef0d912..a97394e3854b35b711d5eed757aa3d985924becd 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -29,13 +29,13 @@ conda:
 ## DATASETS ------------------------------------------------------------------------------------------
 datasets:
   quality-directory:
-    - 'raw'
-    - 'cutadapt'
-    - 'sickle-trim'
-    #- 'sickle-single'
-    - 'fastq-join'
-    #- 'fastq-join-unique'
-    - 'merged'
+    - 'raw_links'
+    #- 'cutadapt_removed'
+    - 'sickle-trim_paired-end_filtered'
+    #- 'sickle-trim_single-end'
+    #- 'fastq-join_single-end'
+    #- 'fastq-join_paired-end'
+    #- 'all_merged'
   quality-tool:
     - 'fastqc'
     - 'fastq-screen'
@@ -73,7 +73,7 @@ fastq-join:
 ## FASTQSCREEN ---------------------------------------------------------------------------------------
 fastq-screen:
   config: 'config/fastq-screen.conf' # path to the fastq-screen configuration file
-  subset: '10000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of read (default: '100000', set '0' for all dataset)
+  subset: '100000' # Don't use the whole sequence file, but create a temporary dataset of this specified number of reads (default: '100000', set '0' for the full dataset)
   aligner:
     #- 'bwa' # Burrows-Wheeler Aligner, for mapping low-divergent sequences against a large reference genome
     #- 'bowtie' # Bowtie, an ultrafast, memory-efficient short read aligner
diff --git a/config/fastq-screen.conf b/config/fastq-screen.conf
index 0862031355a421f8cb3353f2b21f53e4276faf7f..6316b02a7420c8a0e9e738db3dc4f1f0f37b1c90 100644
--- a/config/fastq-screen.conf
+++ b/config/fastq-screen.conf
@@ -47,13 +47,13 @@ THREADS 1
 ##
 ##----------
 ## Human h38
-#DATABASE Human resources/databases/bwa/Human/Homo_sapiens_h38
+DATABASE Human resources/databases/bowtie2/Human/Homo_sapiens_h38
 ##
 ## Mouse m39
-#DATABASE Mouse resources/databases/bwa/Mouse/Mus_musculus_m39
+DATABASE Mouse resources/databases/bowtie2/Mouse/Mus_musculus_m39
 ##
 ## Arabidopsis thaliana - sequence from NCBI: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.3_TAIR10/GCF_000001735.3_TAIR10_genomic.fna.gz
-##DATABASE Arabidopsis resources/databases/bwa/Arabidopsis/Arabidopsis_thaliana_t10
+DATABASE Arabidopsis resources/databases/bowtie2/Arabidopsis/Arabidopsis_thaliana_t10
 ##
 ## Ecoli - sequence available from EMBL accession U00096.2
 DATABASE Ecoli resources/databases/bowtie2/Ecoli/Echerichia_coli_U00096
@@ -70,13 +70,13 @@ DATABASE Vectors resources/databases/bowtie2/Vectors/UniVec_wo_phi-X174
 ##
 ##-----------
 ## Gorilla g4
-#DATABASE Gorilla resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4
+DATABASE Gorilla resources/databases/bowtie2/Gorilla/Gorilla_gorilla_g4
 ##
 ## Chimpanzee
-#DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3
+DATABASE Chimpanzee resources/databases/bowtie2/Chimpanzee/Pan_troglodytes_t3
 ##
 ## Bat 10
-#DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1
+DATABASE Bat resources/databases/bowtie2/Bat/Pteropus_vampyrus_v1
 ##
 ## HIV - HXB2
 DATABASE HIV resources/databases/bowtie2/HIV/HXB2
diff --git a/workflow/rules/reads_quality_control_pipeline.smk b/workflow/rules/reads_quality_control_pipeline.smk
index f1f47db9ef0eb412479ff97406177b5d37c8b0b3..99422818e0f4a87b7d85cd1800bd8ad7fae3a293 100644
--- a/workflow/rules/reads_quality_control_pipeline.smk
+++ b/workflow/rules/reads_quality_control_pipeline.smk
@@ -15,7 +15,7 @@ configfile: "config/config.yaml"
 # FUNCTIONS #
 
 # WILDCARDS #
-SAMPLE, = glob_wildcards("results/reads/raw/{sample}_R1.fastq.gz")
+SAMPLE, = glob_wildcards("results/reads/raw_links/{sample}_R1.fastq.gz")
 QCDIR = config["datasets"]["quality-directory"]
 
 # ENVIRONMENT #
@@ -52,7 +52,7 @@ SUBSET = config["fastq-screen"]["subset"] # Fastq-screen --subset
 ###############################################################################
 rule all:
     input:
-        merged = expand("results/reads/merged/{sample}_cutadapt_sickle-trim_fastq-join_merged.fastq.gz",
+        merged = expand("results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz",
                         sample = SAMPLE),
         multiqc = expand("results/quality/{qcdir}/multiqc/",
                          qcdir = QCDIR)
@@ -146,12 +146,12 @@ rule merge:
     message:
         "Take all passing filters {wildcards.sample} reads"
     input:
-        single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz",
-        forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz",
-        reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz",
-        joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz",
+        single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz",
"results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz", + forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz", + reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz", + joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz", output: - merged = "results/reads/merged/{sample}_cutadapt_sickle-trim_fastq-join_merged.fastq.gz" + merged = "results/reads/all_merged_single-end/{sample}_cutadapt_sickle_join_merged_SE.fastq.gz" shell: "cat " # Cat, concatenate "{input.single} " # Input trimmed singles fastq file from Sickle-trim @@ -172,12 +172,12 @@ rule fastqjoin: percent = PERCENT, overlap = OVERLAP input: - forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz", - reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz" + forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz", + reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz" output: - forward= "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R1.fastq.gz", - reverse = "results/reads/fastq-join-unique/{sample}_cutadapt_sickle-trim_fastq-join_Unique_R2.fastq.gz", - joined = "results/reads/fastq-join/{sample}_cutadapt_sickle-trim_fastq-join_Mate.fastq.gz", + forward= "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R1.fastq.gz", + reverse = "results/reads/fastq-join_paired-end/{sample}_cutadapt_sickle_join_R2.fastq.gz", + joined = "results/reads/fastq-join_single-end/{sample}_cutadapt_sickle_join_SE.fastq.gz", report = "results/reports/fastq-join/{sample}_stich_length_report.txt" log: "results/reports/fastq-join/{sample}.log" @@ -207,12 +207,12 @@ rule sickletrim: quality = QUALITY, length = LENGTHS input: - forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz", - reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz" + forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz", + reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz" output: - forward = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R1.fastq.gz", - reverse = "results/reads/sickle-trim/{sample}_cutadapt_sickle-trim_R2.fastq.gz", - single = "results/reads/sickle-single/{sample}_cutadapt_sickle-trim_Single.fastq.gz" + forward = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R1.fastq.gz", + reverse = "results/reads/sickle-trim_paired-end_filtered/{sample}_cutadapt_sickle_R2.fastq.gz", + single = "results/reads/sickle-trim_single-end/{sample}_cutadapt_sickle_SE.fastq.gz" log: "results/reports/sickle-trim/{sample}.log" shell: @@ -246,11 +246,11 @@ rule cutadapt: nextera = NEXTERA, small = SMALL input: - forward = "results/reads/raw/{sample}_R1.fastq.gz", - reverse = "results/reads/raw/{sample}_R2.fastq.gz" + forward = "results/reads/raw_links/{sample}_R1.fastq.gz", + reverse = "results/reads/raw_links/{sample}_R2.fastq.gz" output: - forward = "results/reads/cutadapt/{sample}_cutadapt_R1.fastq.gz", - reverse = "results/reads/cutadapt/{sample}_cutadapt_R2.fastq.gz" + forward = "results/reads/cutadapt_removed/{sample}_cutadapt_R1.fastq.gz", + reverse = "results/reads/cutadapt_removed/{sample}_cutadapt_R2.fastq.gz" log: "results/reports/cutadapt/{sample}.log" shell: