You can install the release version of sparseMatrixStats from BioConductor:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("sparseMatrixStats")
The sparseMatrixStats package implements a number of summary functions for sparse matrices from the Matrix package.
Let us load the package and create a synthetic sparse matrix.
library(sparseMatrixStats)
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
# Matrix defines the sparse Matrix class
# dgCMatrix that we will use
library(Matrix)
# For reproducibility
set.seed(1)
Create a synthetic table with customers, items, and how often they bought that item.
customer_ids <- seq_len(100)
item_ids <- seq_len(30)
n_transactions <- 1000
customer <- sample(customer_ids, size = n_transactions, replace = TRUE,
prob = runif(100))
item <- sample(item_ids, size = n_transactions, replace = TRUE,
prob = runif(30))
tmp <- table(paste0(customer, "-", item))
tmp2 <- strsplit(names(tmp), "-")
purchase_table <- data.frame(
customer = as.numeric(sapply(tmp2, function(x) x[1])),
item = as.numeric(sapply(tmp2, function(x) x[2])),
n = as.numeric(tmp)
)
head(purchase_table, n = 10)
#> customer item n
#> 1 1 15 2
#> 2 1 20 1
#> 3 1 22 1
#> 4 1 23 1
#> 5 1 30 1
#> 6 100 11 1
#> 7 100 15 3
#> 8 100 17 1
#> 9 100 19 1
#> 10 100 20 2
Let us turn the table into a matrix to simplify the analysis:
purchase_matrix <- sparseMatrix(purchase_table$customer, purchase_table$item,
x = purchase_table$n,
dims = c(100, 30),
dimnames = list(customer = paste0("Customer_", customer_ids),
item = paste0("Item_", item_ids)))
purchase_matrix[1:10, 1:15]
#> 10 x 15 sparse Matrix of class "dgCMatrix"
#> [[ suppressing 15 column names 'Item_1', 'Item_2', 'Item_3' ... ]]
#>
#> Customer_1 . . . . . . . . . . . . . . 2
#> Customer_2 . . 2 . . . . . . . . . . . 1
#> Customer_3 . . 1 . . . . . 1 . . 1 . . 1
#> Customer_4 . . . . . 1 . . 1 1 . 1 . 1 3
#> Customer_5 . . . . . . . . . . . 1 . . .
#> Customer_6 . . . 1 . 1 1 . . 1 1 . . . 1
#> Customer_7 2 . 1 . . 1 . 1 1 . 1 2 . . 1
#> Customer_8 . . . . . . . . 1 . . . . . .
#> Customer_9 . . . . . 2 . . 1 . . . . 1 .
#> Customer_10 . . . . . . . . . . . . . . .
We can see that some customers did not buy anything, where as some bought a lot.
sparseMatrixStats
can help us to identify interesting patterns in this data:
# How often was each item bough in total?
colSums2(purchase_matrix)
#> [1] 34 13 30 10 12 34 29 31 71 28 20 40 0 48 52 27 30 55 14 46 8 41 40 73 56
#> [26] 83 20 18 13 24
# What is the range of number of items each
# customer bought?
head(rowRanges(purchase_matrix))
#> [,1] [,2]
#> [1,] 0 2
#> [2,] 0 2
#> [3,] 0 3
#> [4,] 0 3
#> [5,] 0 1
#> [6,] 0 4
# What is the variance in the number of items
# each customer bought?
head(rowVars(purchase_matrix))
#> [1] 0.23448276 0.40919540 0.39195402 0.74022989 0.06436782 0.81034483
# How many items did a customer not buy at all, one time, 2 times,
# or exactly 4 times?
head(rowTabulates(purchase_matrix, values = c(0, 1, 2, 4)))
#> 0 1 2 4
#> Customer_1 25 4 1 0
#> Customer_2 25 2 3 0
#> Customer_3 25 4 0 0
#> Customer_4 19 8 1 0
#> Customer_5 28 2 0 0
#> Customer_6 20 7 2 1
In the previous section, I demonstrated how to create a sparse matrix from scratch using the sparseMatrix()
function.
However, often you already have an existing matrix and want to convert it to a sparse representation.
mat <- matrix(0, nrow=10, ncol=6)
mat[sample(seq_len(60), 4)] <- 1:4
# Convert dense matrix to sparse matrix
sparse_mat <- as(mat, "dgCMatrix")
sparse_mat
#> 10 x 6 sparse Matrix of class "dgCMatrix"
#>
#> [1,] . . . . . .
#> [2,] . . . . 4 .
#> [3,] . . . . . .
#> [4,] . . . . . .
#> [5,] . . . . . .
#> [6,] 1 . . 2 . .
#> [7,] . . 3 . . .
#> [8,] . . . . . .
#> [9,] . . . . . .
#> [10,] . . . . . .
The sparseMatrixStats package is a derivative of the matrixStats package and implements it’s API for
sparse matrices. For example, to calculate the variance for each column of mat
you can do
apply(mat, 2, var)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
However, this is quite inefficient and matrixStats provides the direct function
matrixStats::colVars(mat)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
Now for sparse matrices, you can also just call
sparseMatrixStats::colVars(sparse_mat)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
If you have a large matrix with many exact zeros, working on the sparse representation can considerably speed up the computations.
I generate a dataset with 10,000 rows and 50 columns that is 99% empty
big_mat <- matrix(0, nrow=1e4, ncol=50)
big_mat[sample(seq_len(1e4 * 50), 5000)] <- rnorm(5000)
# Convert dense matrix to sparse matrix
big_sparse_mat <- as(big_mat, "dgCMatrix")
I use the bench package to benchmark the performance difference:
bench::mark(
sparseMatrixStats=sparseMatrixStats::colVars(big_sparse_mat),
matrixStats=matrixStats::colVars(big_mat),
apply=apply(big_mat, 2, var)
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sparseMatrixStats 69.86µs 81.24µs 11908. NA 2.02
#> 2 matrixStats 1.98ms 2.01ms 491. NA 0
#> 3 apply 5.78ms 6.05ms 160. NA 62.8
As you can see sparseMatrixStats
is ca. 50 times fast than matrixStats
, which in turn is 7 times faster than the apply()
version.
sessionInfo()
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] Matrix_1.4-1 sparseMatrixStats_1.8.0 MatrixGenerics_1.8.0
#> [4] matrixStats_0.62.0 BiocStyle_2.24.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.8.3 bslib_0.3.1 compiler_4.2.0
#> [4] pillar_1.7.0 BiocManager_1.30.17 jquerylib_0.1.4
#> [7] tools_4.2.0 digest_0.6.29 jsonlite_1.8.0
#> [10] evaluate_0.15 tibble_3.1.6 lifecycle_1.0.1
#> [13] lattice_0.20-45 pkgconfig_2.0.3 rlang_1.0.2
#> [16] bench_1.1.2 cli_3.3.0 yaml_2.3.5
#> [19] xfun_0.30 fastmap_1.1.0 stringr_1.4.0
#> [22] knitr_1.38 vctrs_0.4.1 sass_0.4.1
#> [25] grid_4.2.0 glue_1.6.2 R6_2.5.1
#> [28] fansi_1.0.3 rmarkdown_2.14 bookdown_0.26
#> [31] magrittr_2.0.3 htmltools_0.5.2 ellipsis_0.3.2
#> [34] utf8_1.2.2 stringi_1.7.6 crayon_1.5.1