Confident fold change

Paul Harrison

2019-05-02

This document shows typical Topconfects usage with limma, edgeR, or DESeq2.

The first step is to load a dataset. Here, we’re looking at RNA-seq data that investigates the response of Arabodopsis thaliana to a bacterial pathogen. Besides the experimental and control conditions, there is also a batch effect. This dataset is also examined in section 4.2 of the edgeR user manual, and I’ve followed the initial filtering steps in the edgeR manual.

library(topconfects)

library(NBPSeq)
library(edgeR)
library(limma)

library(dplyr)
library(ggplot2)

data(arab)

# Retrieve symbol and description for each gene
info <- 
    AnnotationDbi::select(
        org.At.tair.db::org.At.tair.db, 
        rownames(arab), c("SYMBOL","GENENAME")) %>%
    group_by(TAIR) %>% 
    summarize(
        SYMBOL=paste(unique(na.omit(SYMBOL)),collapse="/"), 
        GENENAME=paste(unique(na.omit(GENENAME)),collapse="/"))
arab_info <- 
    info[match(rownames(arab),info$TAIR),] %>% 
    select(-TAIR) %>%
    as.data.frame
rownames(arab_info) <- rownames(arab)

# Extract experimental design from sample names
Treat <- factor(substring(colnames(arab),1,4)) %>% relevel(ref="mock")
Time <- factor(substring(colnames(arab),5,5))

y <- DGEList(arab, genes=as.data.frame(arab_info))

# Keep genes with at least 3 samples having an RPM of more than 2
keep <- rowSums(cpm(y)>2) >= 3
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)

limma analysis

Standard limma analysis steps

##   (Intercept) Time2 Time3 Treathrcc
## 1           1     0     0         0
## 2           1     1     0         0
## 3           1     0     1         0
## 4           1     0     0         1
## 5           1     1     0         1
## 6           1     0     1         1

Apply topconfects

Find largest fold changes that we can be confident of at FDR 0.05.

## $table
##  rank index confect effect   AveExpr  name      SYMBOL       
##   1    9023 3.090   4.849117 6.566691 AT3G46280              
##   2   10774 2.895   4.330654 9.155218 AT4G12500              
##   3    4945 2.895   4.493384 6.052661 AT2G19190 FRK1         
##   4   10773 2.608   3.838721 9.165703 AT4G12490              
##   5    2562 2.608   3.951678 6.615271 AT1G51800 IOS1         
##   6    6264 2.608   4.299425 5.440243 AT2G39530              
##   7    6239 2.606   3.907806 7.898640 AT2G39200 ATMLO12/MLO12
##   8   16247 2.606   3.710435 8.728518 AT5G64120 AtPRX71/PRX71
##   9   14449 2.595   5.346437 1.806933 AT5G31702              
##  10    4642 2.563   4.888477 2.382601 AT2G08986              
##  GENENAME                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  Receptor-like protein kinase.  Involved in early defense signaling.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  A member of a large family of seven-transmembrane domain proteins specific to plants, homologs of the barley mildew resistance locus o (MLO) protein. The Arabidopsis genome contains 15 genes encoding MLO proteins, with localization in plasma membrane. Phylogenetic analysis revealed four clades of closely-related AtMLO genes. ATMLO6 belongs to the clade IV, with AtMLO2, AtMLO3 and AtMLO12. The gene is expressed during early seedling growth, in root tips and cotyledon vascular system, in floral organs (anthers and stigma), and in fruit abscission zone, as shown by GUS activity patterns. The expression of several phylogenetically closely-related AtMLO genes showed similar or overlapping tissue specificity and analogous responsiveness to external stimuli, suggesting functional redundancy, co-function, or antagonistic function(s).
##  encodes a cell wall bound peroxidase that is induced by hypo-osmolarity                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## ...
## 2184 of 16526 non-zero log2 fold change at FDR 0.05
## Prior df 18.2

Looking at the result

Here the usual logFC values estimated by limma are shown as dots, with lines to the confect value.

confects_plot_me overlays the confects (black) on a Mean-Difference Plot (grey) (as might be produced by limma::plotMD). As we should expect, the very noisy differences with low mean expression are removed if we look at the confects.

Let’s compare this to the ranking we obtain from topTable.

You can see that the top 19 genes from topTable are all within the top 40 for topconfects ranking, but topconfects has also highly ranked some other genes. These have a large effect size, and sufficient if not overwhelming evidence of this.

An MD-plot highlighting the positions of the top 40 genes in both rankings also illustrates the differences between these two ways of ranking genes.

edgeR analysis

An analysis in edgeR produces similar results. Note that only quasi-likelihood testing from edgeR is supported.

Standard edgeR analysis

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values

Apply topconfects

A step of 0.05 is used here merely so that the vignette will build quickly. edger_confects calls edgeR::glmTreat repeatedly, which is necessarily slow. In practice a smaller value such as 0.01 should be used.

## $table
##  rank index confect effect   logCPM   name      SYMBOL           
##   1   15207 3.05    6.319297 6.729025 AT5G48430                  
##   2    9023 2.95    4.784830 8.096900 AT3G46280                  
##   3    9617 2.95    5.778590 4.902951 AT3G55150 ATEXO70H1/EXO70H1
##   4    4945 2.95    4.497055 7.373501 AT2G19190 FRK1             
##   5    6253 2.95    4.942601 5.771324 AT2G39380 ATEXO70H2/EXO70H2
##   6    2565 2.95    5.321291 5.418659 AT1G51850                  
##   7    6619 2.95    5.414120 5.199481 AT2G44370                  
##   8    6264 2.85    4.340932 6.709049 AT2G39530                  
##   9    2564 2.55    4.336766 6.373316 AT1G51820                  
##  10    5662 2.55    4.526719 5.587339 AT2G30750 CYP71A12         
##  GENENAME                                                                                                                                                                                                        
##                                                                                                                                                                                                                  
##                                                                                                                                                                                                                  
##  A member of EXO70 gene family, putative exocyst subunits, conserved in land plants. Arabidopsis thaliana contains 23 putative EXO70 genes, which can be classified into eight clusters on the phylogenetic tree.
##  Receptor-like protein kinase.  Involved in early defense signaling.                                                                                                                                             
##  A member of EXO70 gene family, putative exocyst subunits, conserved in land plants. Arabidopsis thaliana contains 23 putative EXO70 genes, which can be classified into eight clusters on the phylogenetic tree.
##                                                                                                                                                                                                                  
##                                                                                                                                                                                                                  
##                                                                                                                                                                                                                  
##                                                                                                                                                                                                                  
##  putative cytochrome P450                                                                                                                                                                                        
## ...
## 1729 of 16526 non-zero log2 fold change at FDR 0.05
## Dispersion 0.045 to 0.31
## Biological CV 21.1% to 55.8%

Looking at the result

DESeq2 analysis

DESeq2 does its own filtering of lowly expressed genes, so we start from the original count matrix. The initial steps are as for a normal DESeq2 analysis.

library(DESeq2)

dds <- DESeqDataSetFromMatrix(
    countData = arab,
    colData = data.frame(Time, Treat),
    rowData = arab_info,
    design = ~Time+Treat)

dds <- DESeq(dds)

Apply topconfects

The contrast or coefficient to test is specified as in the DESeq2::results function. The step of 0.05 is merely so that this vignette will build quickly, in practice a smaller value such as 0.01 should be used. deseq2_confects calls results repeatedly, and in fairness results has not been optimized for this.

DESeq2 offers shrunken estimates of LFC. This is another sensible way of ranking genes. Let’s compare them to the confect values.

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## $table
##  rank index confect effect   baseMean   name      filtered shrunk  
##   1   14551 3.40    4.785076  586.43055 AT3G46280 FALSE    4.636526
##   2    8130 3.15    4.500949  354.89430 AT2G19190 FALSE    4.362666
##   3   17495 3.10    4.320279 2997.59547 AT4G12500 FALSE    4.210779
##   4   10088 2.90    4.976427  115.19576 AT2G39380 FALSE    4.595049
##   5    4063 2.90    5.433121   89.38966 AT1G51850 FALSE    4.829164
##   6   10106 2.90    4.350196  223.26235 AT2G39530 FALSE    4.176095
##   7   10602 2.85    5.453274   77.12549 AT2G44370 FALSE    4.784629
##   8   15411 2.85    5.875086   62.87522 AT3G55150 FALSE    4.930467
##   9   17494 2.85    3.838114 2532.56056 AT4G12490 FALSE    3.762829
##  10    4058 2.80    3.969472  448.54556 AT1G51800 FALSE    3.863938
## ...
## 1219 of 26222 non-zero effect size at FDR 0.05

DESeq2 filters some genes, these are placed last in the table. If your intention is to obtain a ranking of all genes, you should disable this with deseq2_confects(..., cooksCutoff=Inf, independentFiltering=FALSE).

## 
## FALSE  TRUE 
## 11479 14743
##        rank index confect     effect  baseMean      name filtered
## 26217 26217 26203      NA  0.8487098  6.324289 AT5G67460     TRUE
## 26218 26218 26206      NA -0.6852089  1.220362 AT5G67488     TRUE
## 26219 26219 26207      NA  1.4505333  4.657085 AT5G67490     TRUE
## 26220 26220 26210      NA -0.5940465  6.699483 AT5G67520     TRUE
## 26221 26221 26213      NA  0.6767779  1.542612 AT5G67550     TRUE
## 26222 26222 26219      NA  0.1805069 12.111601 AT5G67610     TRUE
##            shrunk
## 26217  0.08464033
## 26218 -0.01672139
## 26219  0.13703565
## 26220 -0.06727665
## 26221  0.02225846
## 26222  0.03021758

Looking at the result

Shrunk LFC estimates are shown in red.

lfcShrink aims for a best estimate of the LFC, whereas confect is a conservative estimate. lfcShrink can produce non-zero values for genes which can’t be said to significantly differ from zero – it doesn’t do double duty as an indication of significance – whereas the confect value will be NA in this case. The plot below compares these two quantities. Only un-filtered genes are shown (see above).

Comparing results

rank_rank_plot(confects$table$name, econfects$table$name, 
    "limma confects", "edgeR confects")

rank_rank_plot(confects$table$name, dconfects$table$name, 
    "limma confects", "DESeq2 confects")

rank_rank_plot(econfects$table$name, dconfects$table$name, 
    "edgeR confects", "DESeq2 confects")


sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.9-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.9-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DESeq2_1.24.0               SummarizedExperiment_1.14.0
##  [3] DelayedArray_0.10.0         BiocParallel_1.18.0        
##  [5] matrixStats_0.54.0          Biobase_2.44.0             
##  [7] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
##  [9] IRanges_2.18.0              S4Vectors_0.22.0           
## [11] BiocGenerics_0.30.0         ggplot2_3.1.1              
## [13] dplyr_0.8.0.1               edgeR_3.26.0               
## [15] limma_3.40.0                NBPSeq_0.3.0               
## [17] topconfects_1.0.0          
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            doParallel_1.0.14     
##  [4] RColorBrewer_1.1-2     tools_3.6.0            backports_1.1.4       
##  [7] R6_2.4.0               rpart_4.1-15           Hmisc_4.2-0           
## [10] DBI_1.0.0              lazyeval_0.2.2         colorspace_1.4-1      
## [13] nnet_7.3-12            withr_2.1.2            tidyselect_0.2.5      
## [16] gridExtra_2.3          bit_1.1-14             compiler_3.6.0        
## [19] htmlTable_1.13.1       labeling_0.3           scales_1.0.0          
## [22] checkmate_1.9.1        SQUAREM_2017.10-1      genefilter_1.66.0     
## [25] mixsqp_0.1-97          stringr_1.4.0          digest_0.6.18         
## [28] foreign_0.8-71         rmarkdown_1.12         XVector_0.24.0        
## [31] pscl_1.5.2             base64enc_0.1-3        pkgconfig_2.0.2       
## [34] htmltools_0.3.6        org.At.tair.db_3.8.2   htmlwidgets_1.3       
## [37] rlang_0.3.4            rstudioapi_0.10        RSQLite_2.1.1         
## [40] acepack_1.4.1          RCurl_1.95-4.12        magrittr_1.5          
## [43] GenomeInfoDbData_1.2.1 Formula_1.2-3          Matrix_1.2-17         
## [46] Rcpp_1.0.1             munsell_0.5.0          stringi_1.4.3         
## [49] yaml_2.2.0             MASS_7.3-51.4          zlibbioc_1.30.0       
## [52] plyr_1.8.4             qvalue_2.16.0          grid_3.6.0            
## [55] blob_1.1.1             crayon_1.3.4           lattice_0.20-38       
## [58] splines_3.6.0          annotate_1.62.0        locfit_1.5-9.1        
## [61] knitr_1.22             pillar_1.3.1           codetools_0.2-16      
## [64] geneplotter_1.62.0     reshape2_1.4.3         XML_3.98-1.19         
## [67] glue_1.3.1             evaluate_0.13          latticeExtra_0.6-28   
## [70] data.table_1.12.2      foreach_1.4.4          gtable_0.3.0          
## [73] purrr_0.3.2            assertthat_0.2.1       ashr_2.2-32           
## [76] xfun_0.6               xtable_1.8-4           survival_2.44-1.1     
## [79] truncnorm_1.0-8        tibble_2.1.1           iterators_1.0.10      
## [82] AnnotationDbi_1.46.0   memoise_1.1.0          cluster_2.0.9         
## [85] statmod_1.4.30