###Differential Methylation Analysis with Whole Genome Bisulfite Sequencing (WGBS) Data in TCGA
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
library(BiocManager)
BiocManager::install("tcgaWGBSData.hg19", version = "devel")
Data can be extracted in the following way:
tcga_data <- eh[["EH1661"]]
TCGA_bs <- eh[["EH1662"]]
tcga_eh_directory <- dirname(tcga_data)
assay_file <- tcga_data
rds_file <- paste0(tcga_eh_directory, '/1662')
file.rename(from=assay_file, to=paste0(tcga_eh_directory, '/assays.h5'))
file.rename(from=rds_file, to=paste0(tcga_eh_directory, '/se.rds'))
TCGA_bs <- HDF5Array::loadHDF5SummarizedExperiment(tcga_eh_directory)
Phenotypic data can be extracted by
Methylation Comparison between normal and tumor sample
library(ggplot2)
cov_matrix <- bsseq::getCoverage(TCGA_bs)
meth_matrix <- bsseq::getCoverage(TCGA_bs, type='M')
meth_matrix <- meth_matrix/cov_matrix
# Get total CpG coverage
totCov <- colSums(cov_matrix>0)
# Restrict to CpGs with minimum read covergae of 10
meth_matrix[cov_matrix<10] <- NA
meanMethylation <- DelayedArray::colMeans(meth_matrix, na.rm=TRUE)
sampleType <- phenoData$sample.type
Df <- data.frame('mean-methylation' = meanMethylation, 'type' = sampleType)
g <- ggplot2::ggplot()
g <- g + ggplot2::geom_boxplot(data=Df,ggplot2::aes(x=type,y=mean.methylation))
g <- g + ggplot2::xlab('sample type') + ggplot2::ylab('Methylation')
g <- g + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 1))
g
As expected overall methylation is lower in tumor samples compared to the normal samples.
In this analysis we are using functions from bsseq package to conduct the differntial methylation analysis.
# In this analysis we restrict the data to just chromosome 22
chrIndex <- seqnames(TCGA_bs) == 'chr22'
# Also we are only conducting the analysis for 3 normal and tumor pairs
group1 <- c(11, 6, 23) # normal samples
group2 <- c(20, 26, 25) # Tumor samples
subSample <- c(group1, group2)
TCGA_bs_sub <- updateObject(TCGA_bs[chrIndex, subSample])
TCGA_bs_sub.fit <- bsseq::BSmooth(TCGA_bs_sub, mc.cores = 2, verbose = TRUE)
TCGA_bs_sub.tstat <- bsseq::BSmooth.tstat(TCGA_bs_sub.fit,
group1 = c(1, 2, 3),
group2 = c(4, 5, 6),
estimate.var = "group2",
local.correct = TRUE,
verbose = TRUE)
plot(TCGA_bs_sub.tstat)
dmrs0 <- bsseq::dmrFinder(TCGA_bs_sub.tstat, cutoff = c(-4.6, 4.6))
dmrs <- subset(dmrs0, n >= 3 & abs(meanDiff) >= 0.1)
nrow(dmrs)
head(dmrs, n = 3)
pData <- pData(TCGA_bs_sub.fit)
pData$col <- rep(c("red", "blue"), each = 3)
pData(TCGA_bs_sub.fit) <- pData
bsseq::plotRegion(TCGA_bs_sub.fit, dmrs[1,], extend = 5000, addRegions = dmrs)
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.11-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.11-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.0 magrittr_1.5 htmltools_0.4.0 tools_4.0.0
## [5] yaml_2.2.1 Rcpp_1.0.4.6 stringi_1.4.6 rmarkdown_2.1
## [9] knitr_1.28 stringr_1.4.0 digest_0.6.25 xfun_0.13
## [13] rlang_0.4.6 evaluate_0.14