Contents

0.1 Introduction

The Barcode, UMI, Set (BUS) format is a new way to represent pseudoalignments of reads from RNA-seq. Files of this format can be efficiently generated by the command line tool kallisto bus. With kallisto bus and this package, we go from the fastq files to the sparse matrix used for downstream analysis such as with Seurat within half an hour, while CellRanger would take hours.

In this vignette, we convert an 10x 1:1 mouse and human cell mixture dataset from the BUS format to a sparse matrix. To see how the BUS format can be generated from fastq file, as well as more in depth vignettes, see the website of this package.

0.2 Download the dataset

# The dataset package
library(TENxBUSData)
library(BUSpaRse)
library(Matrix)
library(DropletUtils)
library(zeallot)
library(ggplot2)
TENxBUSData(".", dataset = "hgmm100")
#> snapshotDate(): 2019-10-22
#> see ?TENxBUSData and browseVignettes('TENxBUSData') for documentation
#> loading from cache
#> The downloaded files are in /tmp/RtmpaZOu6E/Rbuild666031986928/BUSpaRse/vignettes/out_hgmm100
#> [1] "/tmp/RtmpaZOu6E/Rbuild666031986928/BUSpaRse/vignettes/out_hgmm100"

0.3 Convert to sparse matrix

First, we map transcripts, as in the kallisto index, to the corresponding genes.

tr2g <- transcript2gene(species = c("Homo sapiens", "Mus musculus"), type = "vertebrate",
                         kallisto_out_path = "./out_hgmm100", ensembl_version = 94)
#> Querying biomart for transcript and gene IDs of Homo sapiens
#> Querying biomart for transcript and gene IDs of Mus musculus
#> Cache found
#> Sorting transcripts
head(tr2g)
#>           transcript              gene gene_name
#> 1: ENST00000632684.1 ENSG00000282431.1     TRBD1
#> 2: ENST00000434970.2 ENSG00000237235.2     TRDD2
#> 3: ENST00000448914.1 ENSG00000228985.1     TRDD3
#> 4: ENST00000415118.1 ENSG00000223997.1     TRDD1
#> 5: ENST00000631435.1 ENSG00000282253.1     TRBD1
#> 6: ENST00000390583.1 ENSG00000211923.1  IGHD3-10

Here we make both the gene count matrix and the TCC matrix.

c(gene_count, tcc) %<-% make_sparse_matrix("./out_hgmm100/output.sorted.txt",
                               tr2g = tr2g, est_ncells = 1e5,
                              est_ngenes = nrow(tr2g), ncores = 2)
#> Reading matrix.ec
#> Processing ECs
#> Matching genes to ECs
#> Reading data
#> Constructing gene count matrix
#> Constructing TCC matrix

0.4 Remove empty droplets

Here we have a sparse matrix with genes in rows and cells in columns.

dim(gene_count)
#> [1]  29148 138965

This dataset should only have about 100 cells, but here we get over 100,000. In fact, most of the barcodes correspond to empty droplets; they can be removed by filtering out barcodes with too few UMI.

tot_counts <- Matrix::colSums(gene_count)
summary(tot_counts)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>     1.00     1.00     2.00    22.17     4.00 55869.00
bc_rank <- barcodeRanks(gene_count)
qplot(bc_rank$rank, bc_rank$total, geom = "line") +
  geom_hline(yintercept = metadata(bc_rank)$knee, color = "blue", linetype = 2) +
  geom_hline(yintercept = metadata(bc_rank)$inflection, color = "green", linetype = 2) +
  annotate("text", x = 1000, y = 1.5 * c(metadata(bc_rank)$knee, metadata(bc_rank)$inflection),
           label = c("knee", "inflection"), color = c("blue", "green")) +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Rank", y = "Total UMI counts") +
  theme_bw()

gene_count <- gene_count[, tot_counts > metadata(bc_rank)$inflection]
dim(gene_count)
#> [1] 29148   121

Then this sparse matrix can be used in Seurat for downstream analysis.

Likewise, we can remove empty droplets from the TCC matrix.

dim(tcc)
#> [1] 142704 157340

This dataset should only have about 100 cells, but here we get over 100,000. In fact, most of the barcodes correspond to empty droplets; they can be removed by filtering out barcodes with too few UMI.

tot_counts <- Matrix::colSums(tcc)
summary(tot_counts)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>     1.00     1.00     2.00    25.76     5.00 68968.00
bc_rank <- barcodeRanks(tcc)
qplot(bc_rank$rank, bc_rank$total, geom = "line") +
  geom_hline(yintercept = metadata(bc_rank)$knee, color = "blue", linetype = 2) +
  geom_hline(yintercept = metadata(bc_rank)$inflection, color = "green", linetype = 2) +
  annotate("text", x = 1000, y = 1.5 * c(metadata(bc_rank)$knee, metadata(bc_rank)$inflection),
           label = c("knee", "inflection"), color = c("blue", "green")) +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Rank", y = "Total UMI counts") +
  theme_bw()

tcc <- tcc[, tot_counts > metadata(bc_rank)$inflection]
dim(tcc)
#> [1] 142704    121
sessionInfo()
#> R version 3.6.1 (2019-07-05)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.10-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.10-bioc/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] ggplot2_3.2.1               zeallot_0.1.0              
#>  [3] DropletUtils_1.6.0          SingleCellExperiment_1.8.0 
#>  [5] SummarizedExperiment_1.16.0 DelayedArray_0.12.0        
#>  [7] BiocParallel_1.20.0         matrixStats_0.55.0         
#>  [9] Biobase_2.46.0              GenomicRanges_1.38.0       
#> [11] GenomeInfoDb_1.22.0         IRanges_2.20.0             
#> [13] S4Vectors_0.24.0            BiocGenerics_0.32.0        
#> [15] Matrix_1.2-17               BUSpaRse_1.0.0             
#> [17] TENxBUSData_0.99.3          BiocStyle_2.14.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] ProtGenerics_1.18.0           bitops_1.0-6                 
#>  [3] bit64_0.9-7                   progress_1.2.2               
#>  [5] httr_1.4.1                    tools_3.6.1                  
#>  [7] backports_1.1.5               R6_2.4.0                     
#>  [9] HDF5Array_1.14.0              colorspace_1.4-1             
#> [11] DBI_1.0.0                     lazyeval_0.2.2               
#> [13] withr_2.1.2                   tidyselect_0.2.5             
#> [15] prettyunits_1.0.2             bit_1.1-14                   
#> [17] curl_4.2                      compiler_3.6.1               
#> [19] rtracklayer_1.46.0            bookdown_0.14                
#> [21] scales_1.0.0                  askpass_1.1                  
#> [23] rappdirs_0.3.1                stringr_1.4.0                
#> [25] digest_0.6.22                 Rsamtools_2.2.0              
#> [27] rmarkdown_1.16                R.utils_2.9.0                
#> [29] XVector_0.26.0                pkgconfig_2.0.3              
#> [31] htmltools_0.4.0               limma_3.42.0                 
#> [33] dbplyr_1.4.2                  fastmap_1.0.1                
#> [35] ensembldb_2.10.0              BSgenome_1.54.0              
#> [37] rlang_0.4.1                   RSQLite_2.1.2                
#> [39] shiny_1.4.0                   R.oo_1.22.0                  
#> [41] dplyr_0.8.3                   RCurl_1.95-4.12              
#> [43] magrittr_1.5                  GenomeInfoDbData_1.2.2       
#> [45] munsell_0.5.0                 Rcpp_1.0.2                   
#> [47] Rhdf5lib_1.8.0                R.methodsS3_1.7.1            
#> [49] lifecycle_0.1.0               edgeR_3.28.0                 
#> [51] stringi_1.4.3                 yaml_2.2.0                   
#> [53] zlibbioc_1.32.0               rhdf5_2.30.0                 
#> [55] BiocFileCache_1.10.0          AnnotationHub_2.18.0         
#> [57] grid_3.6.1                    blob_1.2.0                   
#> [59] dqrng_0.2.1                   promises_1.1.0               
#> [61] ExperimentHub_1.12.0          crayon_1.3.4                 
#> [63] lattice_0.20-38               Biostrings_2.54.0            
#> [65] GenomicFeatures_1.38.0        hms_0.5.1                    
#> [67] locfit_1.5-9.1                knitr_1.25                   
#> [69] pillar_1.4.2                  biomaRt_2.42.0               
#> [71] XML_3.98-1.20                 glue_1.3.1                   
#> [73] BiocVersion_3.10.1            evaluate_0.14                
#> [75] data.table_1.12.6             BiocManager_1.30.9           
#> [77] RcppParallel_4.4.4            vctrs_0.2.0                  
#> [79] httpuv_1.5.2                  gtable_0.3.0                 
#> [81] openssl_1.4.1                 purrr_0.3.3                  
#> [83] tidyr_1.0.0                   assertthat_0.2.1             
#> [85] xfun_0.10                     mime_0.7                     
#> [87] xtable_1.8-4                  AnnotationFilter_1.10.0      
#> [89] later_1.0.0                   tibble_2.1.3                 
#> [91] GenomicAlignments_1.22.0      AnnotationDbi_1.48.0         
#> [93] plyranges_1.6.0               memoise_1.1.0                
#> [95] interactiveDisplayBase_1.24.0