Preprocessing

Timothy Keyes

2024-10-29

library(tidytof)
library(dplyr)

Preprocessing with tof_preprocess

Generally speaking, the raw ion counts measured for each analyte on a mass cytometer (the content of raw FCS files obtained directly from a mass cytometer) need to be transformed before CyTOF data analysis. Common preprocessing steps may include variance-stabilizing transformations - such as the hyperbolic arcsine (arcsinh) transformation or a log transformation - scaling/centering, and/or denoising.

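To perform standard preprocessing tasks with {tidytof}, use tof_preprocess. By default, tof_preprocess applies the arcsinh transformation (with a cofactor of 5) to each numeric column in the input tof_tibble. It can also remove the Gaussian noise that Fluidigm software adds to each ion count (this noise is added for visualization purposes, but for most analyses, removing it is recommended); this step is controlled by the undo_noise argument. Note that in the example output below, the default call returns exactly asinh(x / 5) of the raw values (e.g. asinh(3.23 / 5) ≈ 0.608), i.e. the noise is left in place unless undo_noise = TRUE is passed.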

As an example, we can preprocess {tidytof}’s built-in phenograph_data tof_tibble and see how our first few measurements change before and after.

# load the example dataset that ships with {tidytof}
data(phenograph_data)

# raw values of three channels, before any preprocessing
phenograph_data |>
  select(cd45, cd34, cd38) |>
  head()
#> # A tibble: 6 × 3
#>    cd45   cd34  cd38
#>   <dbl>  <dbl> <dbl>
#> 1  131.  3.23   1.51
#> 2  230. -0.582 11.4 
#> 3  293.  5.20   1.84
#> 4  431.  0.363 13.3 
#> # ℹ 2 more rows
# run tof_preprocess() with its defaults, then preview the same three
# channels to see how the values changed
phenograph_data |>
  tof_preprocess() |>
  select(cd45, cd34, cd38) |>
  head()
#> # A tibble: 6 × 3
#>    cd45    cd34  cd38
#>   <dbl>   <dbl> <dbl>
#> 1  3.96  0.608  0.298
#> 2  4.52 -0.116  1.56 
#> 3  4.76  0.909  0.360
#> 4  5.15  0.0725 1.70 
#> # ℹ 2 more rows

To alter tof_preprocess’s default behavior, change the channel_cols argument to specify which columns of tof_tibble should be transformed. Alter the transform_fun argument to specify a vector-valued function that should be used to transform each of the channel_cols. For example, suppose we want to center and scale each of our numeric columns instead of arcsinh-transforming them:

phenograph_data |>
  # center and scale each channel; scale() returns a one-column matrix, so
  # flatten it with as.numeric() to keep ordinary numeric columns
  # (otherwise the result has matrix columns printed as `cd45[,1]`, ...)
  tof_preprocess(transform_fun = function(x) as.numeric(scale(x))) |>
  # inspect new values
  select(cd45, cd34, cd38) |>
  head()
#> # A tibble: 6 × 3
#>     cd45   cd34    cd38
#>    <dbl>  <dbl>   <dbl>
#> 1 -1.40   1.01  -0.437 
#> 2 -1.15  -0.911  0.0316
#> 3 -0.999  2.00  -0.422 
#> 4 -0.661 -0.436  0.120 
#> # ℹ 2 more rows

To keep the Gaussian noise added by Fluidigm software (or if you are working with a dataset that does not have this noise), set the undo_noise argument to FALSE; set it to TRUE to remove the noise.

Postprocessing with tof_postprocess

As a final note, the built-in function tof_postprocess works nearly identically to tof_preprocess, but provides different default behavior (namely, applying the reverse arcsinh transformation with a cofactor of 5 to all numeric columns). See ?tof_postprocess for details.

# original data, for comparison with the round trip below
# (no print() here: it would dump the full 3,000 x 25 tibble before the
# three-column preview)
phenograph_data |>
  select(cd45, cd34, cd38) |>
  head()
#> # A tibble: 6 × 3
#>    cd45   cd34  cd38
#>   <dbl>  <dbl> <dbl>
#> 1  131.  3.23   1.51
#> 2  230. -0.582 11.4 
#> 3  293.  5.20   1.84
#> 4  431.  0.363 13.3 
#> # ℹ 2 more rows

# round trip: preprocess, then undo it with tof_postprocess();
# the result matches the original data, except that the noise component
# re-added by redo_noise = TRUE is different for each value
phenograph_data |>
  tof_preprocess() |>
  tof_postprocess(redo_noise = TRUE) |>
  select(cd45, cd34, cd38) |>
  head()
#> # A tibble: 6 × 3
#>    cd45   cd34   cd38
#>   <dbl>  <dbl>  <dbl>
#> 1  130.  2.50   0.924
#> 2  230. -1.55  10.9  
#> 3  293.  4.92   1.67 
#> 4  431.  0.229 12.9  
#> # ℹ 2 more rows

Session info

# report the R version, platform, and package versions of this session
utils::sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.20-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] tidyr_1.3.1                 stringr_1.5.1              
#>  [3] HDCytoData_1.25.0           flowCore_2.18.0            
#>  [5] SummarizedExperiment_1.36.0 Biobase_2.66.0             
#>  [7] GenomicRanges_1.58.0        GenomeInfoDb_1.42.0        
#>  [9] IRanges_2.40.0              S4Vectors_0.44.0           
#> [11] MatrixGenerics_1.18.0       matrixStats_1.4.1          
#> [13] ExperimentHub_2.14.0        AnnotationHub_3.14.0       
#> [15] BiocFileCache_2.14.0        dbplyr_2.5.0               
#> [17] BiocGenerics_0.52.0         forcats_1.0.0              
#> [19] ggplot2_3.5.1               dplyr_1.1.4                
#> [21] tidytof_1.0.0              
#> 
#> loaded via a namespace (and not attached):
#>   [1] jsonlite_1.8.9          shape_1.4.6.1           magrittr_2.0.3         
#>   [4] farver_2.1.2            rmarkdown_2.28          zlibbioc_1.52.0        
#>   [7] vctrs_0.6.5             memoise_2.0.1           htmltools_0.5.8.1      
#>  [10] S4Arrays_1.6.0          curl_5.2.3              SparseArray_1.6.0      
#>  [13] sass_0.4.9              parallelly_1.38.0       bslib_0.8.0            
#>  [16] lubridate_1.9.3         cachem_1.1.0            commonmark_1.9.2       
#>  [19] igraph_2.1.1            mime_0.12               lifecycle_1.0.4        
#>  [22] iterators_1.0.14        pkgconfig_2.0.3         Matrix_1.7-1           
#>  [25] R6_2.5.1                fastmap_1.2.0           GenomeInfoDbData_1.2.13
#>  [28] future_1.34.0           digest_0.6.37           colorspace_2.1-1       
#>  [31] furrr_0.3.1             AnnotationDbi_1.68.0    irlba_2.3.5.1          
#>  [34] RSQLite_2.3.7           philentropy_0.8.0       labeling_0.4.3         
#>  [37] filelock_1.0.3          cytolib_2.18.0          fansi_1.0.6            
#>  [40] yardstick_1.3.1         timechange_0.3.0        httr_1.4.7             
#>  [43] polyclip_1.10-7         abind_1.4-8             compiler_4.4.1         
#>  [46] bit64_4.5.2             withr_3.0.2             doParallel_1.0.17      
#>  [49] viridis_0.6.5           DBI_1.2.3               hexbin_1.28.4          
#>  [52] highr_0.11              ggforce_0.4.2           MASS_7.3-61            
#>  [55] lava_1.8.0              embed_1.1.4             rappdirs_0.3.3         
#>  [58] DelayedArray_0.32.0     tools_4.4.1             future.apply_1.11.3    
#>  [61] nnet_7.3-19             glue_1.8.0              grid_4.4.1             
#>  [64] Rtsne_0.17              generics_0.1.3          recipes_1.1.0          
#>  [67] gtable_0.3.6            tzdb_0.4.0              class_7.3-22           
#>  [70] rsample_1.2.1           data.table_1.16.2       hms_1.1.3              
#>  [73] tidygraph_1.3.1         utf8_1.2.4              XVector_0.46.0         
#>  [76] RcppAnnoy_0.0.22        markdown_1.13           ggrepel_0.9.6          
#>  [79] BiocVersion_3.20.0      foreach_1.5.2           pillar_1.9.0           
#>  [82] vroom_1.6.5             RcppHNSW_0.6.0          splines_4.4.1          
#>  [85] tweenr_2.0.3            lattice_0.22-6          survival_3.7-0         
#>  [88] bit_4.5.0               emdist_0.3-3            RProtoBufLib_2.18.0    
#>  [91] tidyselect_1.2.1        Biostrings_2.74.0       knitr_1.48             
#>  [94] gridExtra_2.3           xfun_0.48               graphlayouts_1.2.0     
#>  [97] hardhat_1.4.0           timeDate_4041.110       stringi_1.8.4          
#> [100] UCSC.utils_1.2.0        yaml_2.3.10             evaluate_1.0.1         
#> [103] codetools_0.2-20        ggraph_2.2.1            tibble_3.2.1           
#> [106] BiocManager_1.30.25     cli_3.6.3               uwot_0.2.2             
#> [109] rpart_4.1.23            munsell_0.5.1           jquerylib_0.1.4        
#> [112] Rcpp_1.0.13             globals_0.16.3          png_0.1-8              
#> [115] parallel_4.4.1          gower_1.0.1             readr_2.1.5            
#> [118] blob_1.2.4              listenv_0.9.1           glmnet_4.1-8           
#> [121] viridisLite_0.4.2       ipred_0.9-15            ggridges_0.5.6         
#> [124] scales_1.3.0            prodlim_2024.06.25      purrr_1.0.2            
#> [127] crayon_1.5.3            rlang_1.1.4             KEGGREST_1.46.0