Skip to content
Snippets Groups Projects
Commit a580c3db authored by aucomte's avatar aucomte
Browse files

update

parent 40306eeb
No related branches found
No related tags found
No related merge requests found
FileName,SampleName,Treatment,Experiment
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC1_S1_L001_R1_001_trimmed.fastq.gz,FC1,FC,E1
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC2_S2_L001_R1_001_trimmed.fastq.gz,FC2,FC,E2
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC3_S3_L001_R1_001_trimmed.fastq.gz,FC3,FC,E3
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK1_S4_L001_R1_001_trimmed.fastq.gz,FK1,FK,E1
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK2_S5_L001_R1_001_trimmed.fastq.gz,FK2,FK,E2
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK3_S6_L001_R1_001_trimmed.fastq.gz,FK3,FK,E3
\ No newline at end of file
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC1_S1_L001_R1_001_trimmed.fastq.gz,FC_1,FC,E1
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC2_S2_L001_R1_001_trimmed.fastq.gz,FC_2,FC,E2
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FC3_S3_L001_R1_001_trimmed.fastq.gz,FC_3,FC,E3
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK1_S4_L001_R1_001_trimmed.fastq.gz,FK_1,FK,E1
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK2_S5_L001_R1_001_trimmed.fastq.gz,FK_2,FK,E2
/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/fastqtrimmed/FK3_S6_L001_R1_001_trimmed.fastq.gz,FK_3,FK,E3
\ No newline at end of file
......@@ -5,16 +5,9 @@
'out_dir': "output2" # Out directory
'files':
'reference': "DATA/ref/allcon.fasta"
'annotation': "DATA/ref/msu7.gtf"
'annotation': "DATA/ref/msu7.gff3"
'sample_info': "DATA/sample_info.txt"
'de_comparisons_file': "DATA/treatmentsComparisons.csv"
'DE':
# filtering
'cutoff_cpm': 5
'cutoff_nb_echantillons': 2
# threshold DE
'thres_FDR': 0.01
'thres_logFC': 1.5
SINGULARITY:
"MAIN": "RNAja/envs/Singularity.RNAja.sif"
suppressMessages(library('knitr', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('edgeR', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('limma', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('RColorBrewer', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('mixOmics', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('baySeq', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('Cairo', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('plotly', warn.conflict = FALSE, quietly = TRUE))
suppressMessages(library('Rsubread', warn.conflict = FALSE, quietly = TRUE))
sample_info <- snakemake@input[["sample_info"]]
comparaisons_file <- snakemake@input[["de_comparisons_file"]]
gtf_file <- snakemake@input[["gtf_file"]]
outDir <- snakemake@params[["out_dir"]]
invisible(dir.exists(outDir) || dir.create(path = outDir, mode = "770"))
normCount_file <- snakemake@params[["normCount"]]
counts_file <- snakemake@params[["Counts"]]
#resDE_file <- snakemake@params[["resDE"]]
diane_counts_file <- snakemake@params[["dianeCounts"]]
# ---------------------------------------------------------------------------------
pheno_data <- read.csv(sample_info)
sampleInfo <- data.frame(fileName= pheno_data$SampleName, condition = pheno_data$Treatment, replicate= pheno_data$Experiment)
rawCountTable <- as.matrix(read.csv(gcsv_hisat,row.names="gene_id"))
# Output for Diane
df = as.data.frame(rawCountTable)
df <- data.frame(Gene = row.names(df), df)
write.table(df, file=diane_counts_file, sep=";", row.names = FALSE)
dgList <- DGEList(rawCountTable, group=sampleInfo$condition, genes=rownames(rawCountTable))
# filter
keep <- filterByExpr(dgList)
dgList <- dgList[keep, , keep.lib.sizes=FALSE]
# normalisation
dgList <- calcNormFactors(dgList, method="TMM")
eff.lib.size <- dgList$samples$lib.size*dgList$samples$norm.factors
normCounts <- cpm(dgList)
# output for diffexdb
write.table(dgList$counts,file=counts_file,sep=";")
write.table(normCounts, file=normCount_file, sep=";")
......@@ -44,6 +44,8 @@ suppressMessages(library('Rsubread', warn.conflict = FALSE, quietly = TRUE))
```
```{r include=FALSE}
#http://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
#opts_knit$set(root.dir = snakemake@params[["out_dir"]])
##################### LOADING PARAMS FROM SNAKEMAKE OBJECT ######################
......@@ -67,9 +69,10 @@ thres_logFC <- snakemake@params[["thres_logFC"]]
```
```{r include=FALSE}
#test
#gcsv_hisat <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/output2/3_count/STRINGTIE/HISAT_gene_count_matrix.csv"
#sample_info <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/sample_info.txt"
#comparaisons_file <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/treatmentsComparisons.csv"
#gcsv_hisat <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/output/3_count/STRINGTIE/HISAT_gene_count_matrix.csv"
#sample_info <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/sample_info_all.txt"
#comparaisons_file <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/treatmentsComparisons_all.csv"
#outDir <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/test"
#invisible(dir.exists(outDir) || dir.create(path = outDir, mode = "770"))
#normCount_file <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/test/normCounts"
......@@ -77,34 +80,44 @@ thres_logFC <- snakemake@params[["thres_logFC"]]
#resDE_file <- "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/test/resde"
#gtf_file="/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/ref/msu7.gtf"
#cutoff_cpm <- 5
#cutoff_nb_echantillons <- 2
#thres_FDR <- 0.01
#thres_logFC <- 1.5
```
```{r include = FALSE}
# test en fonction des paires
comp = read.csv(comparaisons_file)
pair_txt = matrix(nrow=nrow(comp),ncol=2)
for (lines in 1:nrow(comp)){
pair_txt[lines,1] = lines
pair_txt[lines,2] = knit_expand(text= "\n\n## {{comp[lines,1]}},{{comp[lines,2]}}\n\n")
}
```
# creation DGEList object
```{r}
pheno_data <- read.csv(sample_info)
sampleInfo <- data.frame(fileName= pheno_data$SampleName, condition = pheno_data$Treatment, replicate= pheno_data$Experiment)
rawCountTable <- as.matrix(read.csv(gcsv_hisat,row.names="gene_id"))
dgList <- DGEList(rawCountTable, group=sampleInfo$condition, genes=rownames(rawCountTable))
# longueur des genes utile si on veut faire du RPKM :
SAF <- Rsubread::flattenGTF(gtf_file)
GeneLength <- rowsum(SAF$End-SAF$Start+1, SAF$GeneID)
GeneLength <- as.matrix(GeneLength)
dgList$genes$Length=GeneLength
dgList$genes$Length = as.numeric(as.character(dgList$genes$Length))
dgList
summary(cpm(dgList))
pheno_data <- read.csv(sample_info)
sampleInfo <- data.frame(fileName= pheno_data$SampleName, condition = pheno_data$Treatment, replicate= pheno_data$Experiment)
rawCountTable <- as.matrix(read.csv(gcsv_hisat,row.names="gene_id"))
dgList <- DGEList(rawCountTable, group=sampleInfo$condition, genes=rownames(rawCountTable))
summary(cpm(dgList))
```
# filtering
# filtering and normalization
First get rid of genes which did not occur frequently enough. We can choose this cutoff by saying we must have at least 2 counts per million (calculated with cpm() in R) on any particular gene that we want to keep. In this example, we're only keeping a gene if it has a cpm of 2 or greater for at least two samples.
Genes with very low counts across all libraries provide little evidence for differential expression.
In the biological point of view, a gene must be expressed at some minimal level before it is likely
to be translated into a protein or to be biologically important.
The filterByExpr function keeps rows that have worthwhile counts in a minumum number of samples
```{r}
countsPerMillion <- cpm(dgList)
countCheck <- countsPerMillion > cutoff_cpm
keep <- which(rowSums(countCheck) >= cutoff_nb_echantillons)
dgList <- dgList[keep,]
keep <- filterByExpr(dgList)
table(keep)
dgList <- dgList[keep, , keep.lib.sizes=FALSE]
summary(cpm(dgList))
```
```{r}
......@@ -112,7 +125,6 @@ dgList <- calcNormFactors(dgList, method="TMM")
eff.lib.size <- dgList$samples$lib.size*dgList$samples$norm.factors
normCounts <- cpm(dgList)
#normCounts <- rpkm(dgList)
write.table(dgList$counts,file=counts_file,sep=";")
write.table(normCounts, file=normCount_file, sep=";")
......@@ -124,51 +136,50 @@ write.table(normCounts, file=normCount_file, sep=";")
plotMDS(dgList)
```
```{r}
pseudoCounts <- log2(dgList$counts+1)
sampleDists <- as.matrix(dist(t(pseudoCounts)))
#Boxplot for pseudo-counts
boxplot(pseudoCounts, col="gray", las=3)
# design matrix and estimating the dispersion
```
```{r}
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(16)
cim(sampleDists, color=cimColor, symkey=FALSE, zoom=FALSE)
#cim(sampleDists, color=cimColor, symkey=FALSE, zoom=TRUE)
#mypalette <- brewer.pal(11,"RdYlBu")
#morecols <- colorRampPalette(mypalette)
#heatmap(sampleDists, col = morecols)
Treat = factor(sampleInfo$condition)
Treat = relevel(Treat, ref="FC")
design <- model.matrix(~Treat)
logFC <- predFC(dgList,design,prior.count=1,dispersion=0.05)
cor(logFC[,4:6])
design <- model.matrix(~Time+Treat)
rownames(design) <- colnames(dgList)
design
library(statmod)
dgList <- estimateDisp(dgList, design, robust=TRUE)
fit <- glmQLFit(dgList, design, robust=TRUE)
plotQLDisp(fit)
qlf <- glmQLFTest(fit, coef=2:3)
topTags(qlf)
FDR <- p.adjust(qlf$table$PValue, method="BH")
sum(FDR < 0.05)
qlf <- glmQLFTest(fit)
topTags(qlf)
top <- rownames(topTags(qlf))
cpm(dgList)[top,]
summary(decideTests(qlf))
plotMD(qlf)
abline(h=c(-1,1), col="blue")
```
# Differential expression analysis
## estimating the dispersion
```{r}
dgList <- estimateCommonDisp(dgList)
dgList <- estimateTagwiseDisp(dgList)
dgList
plotBCV(dgList)
```
# DE and result
## DE and result
```{r include = FALSE}
# test en fonction des paires
comp = read.csv(comparaisons_file)
pair_txt = matrix(nrow=nrow(comp),ncol=2)
for (lines in 1:nrow(comp)){
pair_txt[lines,1] = lines
pair_txt[lines,2] = knit_expand(text= "\n\n### {{comp[lines,1]}},{{comp[lines,2]}}\n\n")
}
```
```{r, echo=FALSE,include = FALSE}
# You need this code to conduct the magic dependences attaching...
DT::datatable(matrix())
```
```{r, results='asis', echo=FALSE, message = FALSE, warning = FALSE}
for (lines in 1:nrow(comp)){
cat(paste(knit(text = pair_txt[lines,2]), collapse = '\n'))
......@@ -185,7 +196,7 @@ for (lines in 1:nrow(comp)){
cat("\n\nnumber of differencialy expressed genes (risk: 1%)\n\n")
sum(resNoFilt$table$FDR < thres_FDR)
sigDownReg <- resNoFilt$table[resNoFilt$table$FDR<thres_FDR,]
sigDownReg <- sigDownReg[order(sigDownReg$logFC),]
cat(knitr::knit_print((DT::datatable(sigDownReg))))
......@@ -207,7 +218,7 @@ for (lines in 1:nrow(comp)){
plot(finalHM$ddc, leaflab="none")
abline(h=10, lwd=2, col="pink")
geneClust <- cutree(as.hclust(finalHM$ddc), h=10)
x = matrix(nrow = length(unique(geneClust)), ncol = 2)
......@@ -217,5 +228,4 @@ for (lines in 1:nrow(comp)){
}
cat(knitr::knit_print(DT::datatable(x)))
}
```
```
\ No newline at end of file
import re
import csv
import sys
maxInt = sys.maxsize
while True:
# decrease the maxInt value by factor 10
# as long as the OverflowError occurs.
try:
csv.field_size_limit(maxInt)
break
except OverflowError:
maxInt = int(maxInt/10)
listeGene = "/home/comtea/Documents/PHIM/AUTRES/RNASEQ/DE/DownRegFDR0.01logFC1.5.csv"
gff = "/home/comtea/Documents/PHIM/BURKADAPT/RNAja/DATA/ref/msu7.gff3"
output = "/home/comtea/Documents/PHIM/AUTRES/RNASEQ/DE/TEST.csv"
gene_functions={}
with open(gff, 'r') as f2:
spamreader = csv.reader(f2, delimiter="\t")
next(spamreader, None)
for line in spamreader:
gene = ""
if line[2] == "gene":
gene=re.search(r"ID=([^;,]+)[,;]",line[8]).group(1)
gene_functions[gene] = line[8]
with open(output, 'w') as fichier, open(listeGene, 'r') as f1:
spamreader = csv.reader(f1, delimiter=",")
for line in spamreader:
if line[0] == "genes":
line = f"{','.join(line)},functions\n"
fichier.write(line)
else:
gene1 = line[0]
line = f"{','.join(line)},{gene_functions[gene1]}\n"
fichier.write(line)
......@@ -86,10 +86,10 @@ def final_return(wildcards):
"baminfo" : f"{out_dir}/2_mapping/bamfile_info.txt",
"STRINGTIE" : expand(f'{out_dir}/3_count/STRINGTIE/{{fastq}}.gtf', fastq = SAMPLE_NAME),
"STRINGTIE_list" : f'{out_dir}/3_count/STRINGTIE/HISAT_Stringtie_list.txt',
"gffcompare_stat" : f'{out_dir}/3_count/STRINGTIE/merged.stats',
#"gffcompare_stat" : f'{out_dir}/3_count/STRINGTIE/merged.stats',
"gcsv_hisat" : f'{out_dir}/3_count/STRINGTIE/HISAT_gene_count_matrix.csv',
"tcsv_hisat" : f'{out_dir}/3_count/STRINGTIE/HISAT_transcript_count_matrix.csv',
"sRNA_diff_exp_html" : f"{out_dir}/4_DE_analysis/edger.html"
"RNA_diff_exp" : f"{out_dir}/4_DE_analysis/diane_Counts.csv"
}
return dico_final
......@@ -469,10 +469,11 @@ rule diff_exp_analysis:
normCount= f"{out_dir}/4_DE_analysis/normcount.csv",
Counts=f"{out_dir}/4_DE_analysis/Counts.csv",
resDE= f"{out_dir}/4_DE_analysis/resDE.csv",
cutoff_cpm=config["DE"]["cutoff_cpm"],
cutoff_nb_echantillons=config["DE"]["cutoff_nb_echantillons"],
thres_FDR=config["DE"]["thres_FDR"],
thres_logFC=config["DE"]["thres_logFC"],
dianeCounts=f"{out_dir}/4_DE_analysis/diane_Counts.csv",
#cutoff_cpm=config["DE"]["cutoff_cpm"],
#cutoff_nb_echantillons=config["DE"]["cutoff_nb_echantillons"],
#thres_FDR=config["DE"]["thres_FDR"],
#thres_logFC=config["DE"]["thres_logFC"],
output:
html_output = f"{out_dir}/4_DE_analysis/edger.html",
log:
......@@ -481,4 +482,4 @@ rule diff_exp_analysis:
singularity:
config["SINGULARITY"]["MAIN"]
script:
f"{RNAja.RNAJA_SCRIPTS}/edger.Rmd"
\ No newline at end of file
f"{RNAja.RNAJA_SCRIPTS}/EdgeR.R"
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment