###############################################################################
################################## STATISTICS #################################
###############################################################################
###############################################################################
rule awk_coverage_statistics:
# Aim: computing genome coverage stats
# Use: awk '{FORMULA} END {print ...}' [BEDGRAPH.bed] 1> [RESULTS.tsv]
message:
"""
~ Awk ∞ Compute Genome Coverage Statistics from BED file ~
Sample: __________ {wildcards.sample}
Reference: _______ {wildcards.reference}
Aligner: _________ {wildcards.aligner}
"""
conda:
GAWK
input:
cutadapt = "results/10_Reports/tools-log/cutadapt/{sample}.log",
sickle = "results/10_Reports/tools-log/sickle-trim/{sample}.log",
samtools = "results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_mark-dup.log",
flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.json",
histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt",
genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed"
output:
cov_stats = "results/03_Coverage/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.tsv"
log:
"results/10_Reports/tools-log/awk/{reference}/{sample}_{aligner}_{min_cov}X_coverage-stats.log"
shell:
r""" rawReads=$(grep -o -E """ # Get raw reads
r""" 'Total read pairs processed:.+' {input.cutadapt} """ #
r""" | sed -E 's/Total read pairs processed:\ +//' """ #
r""" | sed 's/,//g') ; """ #
#
r""" cutadaptPF=$(grep -o -E """ # Get cutadapt Passing Filtes reads
r""" 'Pairs written \(passing filters\):.+' {input.cutadapt} """ #
r""" | sed -E 's/Pairs written \(passing filters\):\ +//' """ #
r""" | sed 's/,//g') ; """ #
#
r""" sicklePF=$(grep -o -E """ # Get sickle Passing Filtes reads
r""" 'FastQ paired records kept:.+' {input.sickle} """ #
r""" | sed -E 's/FastQ paired records kept:\ +//') ; """ #
#
r""" totalDuplicate=$(grep -o -E """ # Get total duplicated reads
r""" 'DUPLICATE TOTAL:.+' {input.samtools} """ #
r""" | sed -E 's/DUPLICATE TOTAL:\ +//') ; """ #
#
r""" estimatedLibrarySize=$(grep -o -E """ # Get estimated library size
r""" 'ESTIMATED_LIBRARY_SIZE:.+' {input.samtools} """ #
r""" | sed -E 's/ESTIMATED_LIBRARY_SIZE:\ +//') ; """ #
#
r""" samtoolsPF=$(grep -o -E """ # Get samtool Passing Filter reads
r""" 'WRITTEN: .+' {input.samtools} """ #
r""" | sed -E 's/WRITTEN:\ +//') ; """ #
#
r""" mappedReads=$(grep -o -E -m 1 """ # Get mapped reads
r""" '"mapped": .+' {input.flagstat} """ #
r""" | sed -E 's/"mapped":\ +//' """ #
r""" | sed 's/,//g') ; """ #
#
r""" mappedPercentReads=$(grep -o -E -m 1 """ # Get mapped precent reads
r""" '"mapped %": .+' {input.flagstat} """ #
r""" | sed -E 's/"mapped %":\ +//' """ #
r""" | sed 's/,//g') ; """ #
#
r""" covPercentAt1X=$(grep -o -E """ # Get coverage percent @1X
r""" 'Percent covered:.+' {input.histogram} """ #
r""" | sed -E 's/Percent covered:\ +//') ; """ #
#
r""" awk """ # Awk, a program to select particular records in a file and perform operations upon them
r""" -v rawReads="${{rawReads}}" """ # Define external variable
r""" -v cutadaptPF="${{cutadaptPF}}" """ # Define external variable
r""" -v sicklePF="${{sicklePF}}" """ # Define external variable
r""" -v totalDuplicate="${{totalDuplicate}}" """ # Define external variable
r""" -v estimatedLibrarySize="${{estimatedLibrarySize}}" """ # Define external variable
r""" -v samtoolsPF="${{samtoolsPF}}" """ # Define external variable
r""" -v mappedReads="${{mappedReads}}" """ # Define external variable
r""" -v mappedPercentReads="${{mappedPercentReads}}" """ # Define external variable
r""" -v covPercentAt1X="${{covPercentAt1X}}" """ # Define external variable
r""" '$4 >= {wildcards.min_cov} {{supMin_Cov+=$3-$2}} ; """ # Genome size (>= min_cov @X)
r""" {{genomeSize+=$3-$2}} ; """ # Genome size (total)
r""" {{totalBases+=($3-$2)*$4}} ; """ # Total bases @1X
r""" {{totalBasesSq+=(($3-$2)*$4)**2}} """ # Total bases square @1X
r""" END """ # END
r""" {{print """ # Print
r""" "sample_id", "\t", """ # header: Sample ID
r""" "raw_paired_reads", "\t", """ # header: Raw paired reads
r""" "cutadapt_pairs_PF", "\t", """ # header: Cutadapt Passing Filters
r""" "sickle_reads_PF", "\t", """ # header: Sickle-trim Passing Filters
r""" "duplicated_reads", "\t", """ # header:
r""" "duplicated_percent_%","\t", """ # header:
r""" "estimated_library_size*", "\t", """ # header:
r""" "samtools_pairs_PF", "\t", """ # header:
r""" "mapped_with", "\t", """ # header: aligner
r""" "mapped_on", "\t", """ # header: reference
r""" "mapped_reads", "\t", """ # header:
r""" "mapped_percent_%", "\t", """ # header:
r""" "mean_depth", "\t", """ # header: mean depth
r""" "standard_deviation", "\t", """ # header: standard deviation
r""" "cov_percent_%_@1X", "\t", """ # header: coverage percentage @1X
r""" "cov_percent_%" "\t", """ # header: coverage percentage
r""" "@_min_cov" """ # header: @_[min_cov]_X
r""" ORS """ # \n newline
r""" "{wildcards.sample}", "\t", """ # value: Sample ID
r""" rawReads, "\t", """ # value: Raw sequences
r""" cutadaptPF, "\t", """ # value: Cutadapt Passing Filter
r""" sicklePF, "\t", """ # value: Sickle Passing Filter
r""" totalDuplicate, "\t", """ # value:
r""" int(((totalDuplicate)/(rawReads*2))*100), "%", "\t", """ # value: (divided by 2 to estimated pairs)
r""" estimatedLibrarySize, "\t", """ # value:
r""" samtoolsPF, "\t", """ # value:
r""" "{wildcards.aligner}", "\t", """ # value: aligner
r""" "{wildcards.reference}", "\t", """ # value: reference
r""" mappedReads, "\t", """ # value:
r""" mappedPercentReads, "%", "\t", """ # value:
r""" int(totalBases/genomeSize), "\t", """ # value: mean depth
r""" int(sqrt((totalBasesSq/genomeSize)-(totalBases/genomeSize)**2)), "\t", """ # Standard deviation value
r""" covPercentAt1X, "\t", """ # value
r""" supMin_Cov/genomeSize*100, "%", "\t", """ # Coverage percent (@ min_cov X) value
r""" "@{wildcards.min_cov}X" """ # @ min_cov X value
r""" }}' """ # Close print
r""" {input.genome_cov} """ # BedGraph coverage input
r""" 1> {output.cov_stats} """ # Mean depth output
r""" 2> {log}""" # Log redirection
###############################################################################
rule bedtools_genome_coverage:
# Aim: computing genome-wide sequencing coverage
# Use: bedtools genomecov [OPTIONS] -ibam [MARK-DUP.bam] 1> [BEDGRAPH.bed]
message:
"""
~ BedTools ∞ Compute Genome Coverage ~
Sample: __________ {wildcards.sample}
Reference: _______ {wildcards.reference}
Aligner: _________ {wildcards.aligner}
"""
conda:
BEDTOOLS
input:
mark_dup = get_bam_input,
index = get_bai_input
output:
genome_cov = "results/02_Mapping/{reference}/{sample}_{aligner}_genome-cov.bed"
log:
"results/10_Reports/tools-log/bedtools/{reference}/{sample}_{aligner}_genome-cov.log"
shell:
"bedtools genomecov " # Bedtools genomecov, compute the coverage of a feature file among a genome
"-bga " # Report depth in BedGraph format, regions with zero coverage are also reported
"-ibam {input.mark_dup} " # The input file is in BAM format, must be sorted by position
"1> {output.genome_cov} " # BedGraph output
"2> {log} " # Log redirection
###############################################################################
rule samtools_coverage_histogram:
# Aim: alignment depth and percent coverage histogram
# Use: samtools coverage --histogram [INPUT.bam]
message:
"""
~ SamTools ∞ Compute Depth and Coverage from BAM file ~
Sample: __________ {wildcards.sample}
Reference: _______ {wildcards.reference}
Aligner: _________ {wildcards.aligner}
"""
conda:
SAMTOOLS
resources:
cpus = CPUS
#bins = BINS,
#depth = DEPTH
input:
mark_dup = get_bam_input
output:
histogram = "results/03_Coverage/{reference}/histogram/{sample}_{aligner}_coverage-histogram.txt"
log:
"results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_coverage-histogram.log"
shell:
"samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage
"--histogram " # -m: show histogram instead of tabular output
"--verbosity 4 " # Set level of verbosity [INT] (default: 3)
"--n-bins 149 " # -w: number of bins in histogram (default: terminal width - 40) (todo: {params.bins})
"--depth 0 " # -d maximum allowed coverage depth [INT] (default: 1000000 ; 0 removing any depth limit) (todo: {params.depth})
"--output {output.histogram} " # write output to FILE (default: stdout)
"{input.mark_dup} ; " # Mark_dup bam input
"echo >> {output.histogram} ; " # Newline
"samtools coverage " # Samtools coverage, tools for alignments in the SAM format with command to alignment depth and percent coverage
"--verbosity 4 " # Set level of verbosity [INT] (default: 3)
"{input.mark_dup} " # Mark_dup bam input
">> {output.histogram} " # write output to FILE (default: stdout)
"2> {log}" # Log redirection
###############################################################################
rule samtools_flagstat_ext:
# Aim: simple stats
# Use: samtools flagstat -@ [THREADS] [INPUT.bam]
message:
"""
~ SamTools ∞ Compute simple Stats from BAM file ~
Sample: __________ {wildcards.sample}
Reference: _______ {wildcards.reference}
Aligner: _________ {wildcards.aligner}
"""
conda:
SAMTOOLS
resources:
cpus = CPUS
input:
mark_dup = get_bam_input
output:
flagstat = "results/03_Coverage/{reference}/flagstat/{sample}_{aligner}_flagstat.{ext}"
log:
"results/10_Reports/tools-log/samtools/{reference}/{sample}_{aligner}_flagstat-{ext}.log"
shell:
"samtools flagstat " # Samtools flagstat, tools for alignments in the SAM format with command to simple stat
"--threads {resources.cpus} " # -@: Number of additional threads to use (default: 1)
"--verbosity 4 " # Set level of verbosity [INT] (default: 3)
"--output-fmt {wildcards.ext} " # -O Specify output format (none, tsv and json)
"{input.mark_dup} " # Mark_dup bam input
"1> {output.flagstat} " # Mark_dup index output
"2> {log}" # Log redirection
###############################################################################