# (PART) Case studies {-}

# Human PBMCs (10X Genomics)

<script>
document.addEventListener("click", function (event) {
    if (event.target.classList.contains("rebook-collapse")) {
        event.target.classList.toggle("active");
        var content = event.target.nextElementSibling;
        if (content.style.display === "block") {
            content.style.display = "none";
        } else {
            content.style.display = "block";
        }
    }
})
</script>

<style>
.rebook-collapse {
  background-color: #eee;
  color: #444;
  cursor: pointer;
  padding: 18px;
  width: 100%;
  border: none;
  text-align: left;
  outline: none;
  font-size: 15px;
}

.rebook-content {
  padding: 0 18px;
  display: none;
  overflow: hidden;
  background-color: #f1f1f1;
}
</style>

## Introduction

This performs an analysis of the public PBMC ID dataset generated by 10X Genomics [@zheng2017massively],
starting from the filtered count matrix.

## Data loading


``` r
library(TENxPBMCData)
all.sce <- list(
    pbmc3k=TENxPBMCData('pbmc3k'),
    pbmc4k=TENxPBMCData('pbmc4k'),
    pbmc8k=TENxPBMCData('pbmc8k')
)
```

## Quality control


``` r
unfiltered <- all.sce
```

Cell calling implicitly serves as a QC step to remove libraries with low total counts and number of detected genes.
Thus, we will only filter on the mitochondrial proportion.


``` r
library(scater)
stats <- high.mito <- list()
for (n in names(all.sce)) {
    current <- all.sce[[n]]
    is.mito <- grep("MT", rowData(current)$Symbol_TENx)
    stats[[n]] <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
    high.mito[[n]] <- isOutlier(stats[[n]]$subsets_Mito_percent, type="higher")
    all.sce[[n]] <- current[,!high.mito[[n]]]
}
```


``` r
qcplots <- list()
for (n in names(all.sce)) {
    current <- unfiltered[[n]]
    colData(current) <- cbind(colData(current), stats[[n]])
    current$discard <- high.mito[[n]]
    qcplots[[n]] <- plotColData(current, x="sum", y="subsets_Mito_percent",
        colour_by="discard") + scale_x_log10()
}
do.call(gridExtra::grid.arrange, c(qcplots, ncol=3))
```

<div class="figure">
<img src="tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-pbmc-filtered-var-1.png" alt="Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded." width="672" />
<p class="caption">(\#fig:unref-pbmc-filtered-var)Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded.</p>
</div>


``` r
lapply(high.mito, summary)
```

```
## $pbmc3k
##    Mode   FALSE    TRUE 
## logical    2609      91 
## 
## $pbmc4k
##    Mode   FALSE    TRUE 
## logical    4182     158 
## 
## $pbmc8k
##    Mode   FALSE    TRUE 
## logical    8157     224
```

## Normalization

We perform library size normalization, simply for convenience when dealing with file-backed matrices.


``` r
all.sce <- lapply(all.sce, logNormCounts)
```


``` r
lapply(all.sce, function(x) summary(sizeFactors(x)))
```

```
## $pbmc3k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.234   0.748   0.926   1.000   1.157   6.604 
## 
## $pbmc4k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.315   0.711   0.890   1.000   1.127  11.027 
## 
## $pbmc8k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.296   0.704   0.877   1.000   1.118   6.794
```

## Variance modelling


``` r
library(scran)
all.dec <- lapply(all.sce, modelGeneVar)
all.hvgs <- lapply(all.dec, getTopHVGs, prop=0.1)
```


``` r
par(mfrow=c(1,3))
for (n in names(all.dec)) {
    curdec <- all.dec[[n]]
    plot(curdec$mean, curdec$total, pch=16, cex=0.5, main=n,
        xlab="Mean of log-expression", ylab="Variance of log-expression")
    curfit <- metadata(curdec)
    curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
```

<div class="figure">
<img src="tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-variance-1.png" alt="Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances." width="672" />
<p class="caption">(\#fig:unref-filtered-pbmc-variance)Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances.</p>
</div>

## Dimensionality reduction

For various reasons, we will first analyze each PBMC dataset separately rather than merging them together.
We use randomized SVD, which is more efficient for file-backed matrices.


``` r
library(BiocSingular)
set.seed(10000)
all.sce <- mapply(FUN=runPCA, x=all.sce, subset_row=all.hvgs, 
    MoreArgs=list(ncomponents=25, BSPARAM=RandomParam()), 
    SIMPLIFY=FALSE)

set.seed(100000)
all.sce <- lapply(all.sce, runTSNE, dimred="PCA")

set.seed(1000000)
all.sce <- lapply(all.sce, runUMAP, dimred="PCA")
```

## Clustering


``` r
for (n in names(all.sce)) {
    g <- buildSNNGraph(all.sce[[n]], k=10, use.dimred='PCA')
    clust <- igraph::cluster_walktrap(g)$membership
    colLabels(all.sce[[n]])  <- factor(clust)
}
```


``` r
lapply(all.sce, function(x) table(colLabels(x)))
```

```
## $pbmc3k
## 
##   1   2   3   4   5   6   7   8   9  10 
## 475 636 153 476 164  31 159 164 340  11 
## 
## $pbmc4k
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 127 594 518 775 211 394 187 993  55 201  91  36 
## 
## $pbmc8k
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  292 1603  388   94  738 1035 1049  156  203  153 2098  261   64   14    9
```


``` r
all.tsne <- list()
for (n in names(all.sce)) {
    all.tsne[[n]] <- plotTSNE(all.sce[[n]], colour_by="label") + ggtitle(n)
}
do.call(gridExtra::grid.arrange, c(all.tsne, list(ncol=2)))
```

<div class="figure">
<img src="tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-tsne-1.png" alt="Obligatory $t$-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster." width="672" />
<p class="caption">(\#fig:unref-filtered-pbmc-tsne)Obligatory $t$-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster.</p>
</div>

## Data integration

With the per-dataset analyses out of the way, we will now repeat the analysis after merging together the three batches.


``` r
# Intersecting the common genes.
universe <- Reduce(intersect, lapply(all.sce, rownames))
all.sce2 <- lapply(all.sce, "[", i=universe,)
all.dec2 <- lapply(all.dec, "[", i=universe,)

# Renormalizing to adjust for differences in depth.
library(batchelor)
normed.sce <- do.call(multiBatchNorm, all.sce2)

# Identifying a set of HVGs using stats from all batches.
combined.dec <- do.call(combineVar, all.dec2)
combined.hvg <- getTopHVGs(combined.dec, n=5000)

set.seed(1000101)
merged.pbmc <- do.call(fastMNN, c(normed.sce, 
    list(subset.row=combined.hvg, BSPARAM=RandomParam())))
```

We use the percentage of lost variance as a diagnostic measure.


``` r
metadata(merged.pbmc)$merge.info$lost.var
```

```
##         pbmc3k    pbmc4k   pbmc8k
## [1,] 7.044e-03 3.129e-03 0.000000
## [2,] 6.876e-05 4.912e-05 0.003008
```

We proceed to clustering:


``` r
g <- buildSNNGraph(merged.pbmc, use.dimred="corrected")
colLabels(merged.pbmc) <- factor(igraph::cluster_louvain(g)$membership)
table(colLabels(merged.pbmc), merged.pbmc$batch)
```

```
##     
##      pbmc3k pbmc4k pbmc8k
##   1     535    426    830
##   2     331    588   1126
##   3     182    122    217
##   4     150    179    292
##   5     170    345    573
##   6     292    538   1020
##   7     342    630   1236
##   8     437    749   1538
##   9       9     18     95
##   10     97    365    782
##   11     34    120    201
##   12     11     54    159
##   13     11      3      9
##   14      4     36     64
##   15      4      9     15
```

And visualization:


``` r
set.seed(10101010)
merged.pbmc <- runTSNE(merged.pbmc, dimred="corrected")
gridExtra::grid.arrange(
    plotTSNE(merged.pbmc, colour_by="label", text_by="label", text_colour="red"),
    plotTSNE(merged.pbmc, colour_by="batch")
)
```

<div class="figure">
<img src="tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-merged-tsne-1.png" alt="Obligatory $t$-SNE plots for the merged PBMC datasets, where each point represents a cell and is colored by cluster (top) or batch (bottom)." width="672" />
<p class="caption">(\#fig:unref-filtered-pbmc-merged-tsne)Obligatory $t$-SNE plots for the merged PBMC datasets, where each point represents a cell and is colored by cluster (top) or batch (bottom).</p>
</div>

## Session Info {-}

<button class="rebook-collapse">View session info</button>
<div class="rebook-content">
```
R Under development (unstable) (2025-10-20 r88955)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS

Matrix products: default
BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB              LC_COLLATE=C              
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] batchelor_1.27.0            BiocSingular_1.27.0        
 [3] scran_1.39.0                scater_1.39.0              
 [5] ggplot2_4.0.0               scuttle_1.21.0             
 [7] TENxPBMCData_1.29.0         HDF5Array_1.39.0           
 [9] h5mread_1.3.0               rhdf5_2.55.4               
[11] DelayedArray_0.37.0         SparseArray_1.11.1         
[13] S4Arrays_1.11.0             abind_1.4-8                
[15] Matrix_1.7-4                SingleCellExperiment_1.33.0
[17] SummarizedExperiment_1.41.0 Biobase_2.71.0             
[19] GenomicRanges_1.63.0        Seqinfo_1.1.0              
[21] IRanges_2.45.0              S4Vectors_0.49.0           
[23] BiocGenerics_0.57.0         generics_0.1.4             
[25] MatrixGenerics_1.23.0       matrixStats_1.5.0          
[27] BiocStyle_2.39.0            rebook_1.21.0              

loaded via a namespace (and not attached):
  [1] DBI_1.2.3                 gridExtra_2.3            
  [3] httr2_1.2.1               CodeDepends_0.6.6        
  [5] rlang_1.1.6               magrittr_2.0.4           
  [7] RcppAnnoy_0.0.22          compiler_4.6.0           
  [9] RSQLite_2.4.3             DelayedMatrixStats_1.33.0
 [11] dir.expiry_1.19.0         png_0.1-8                
 [13] vctrs_0.6.5               pkgconfig_2.0.3          
 [15] crayon_1.5.3              fastmap_1.2.0            
 [17] dbplyr_2.5.1              XVector_0.51.0           
 [19] labeling_0.4.3            rmarkdown_2.30           
 [21] graph_1.89.0              ggbeeswarm_0.7.2         
 [23] purrr_1.2.0               bit_4.6.0                
 [25] bluster_1.21.0            xfun_0.54                
 [27] cachem_1.1.0              beachmat_2.27.0          
 [29] jsonlite_2.0.0            blob_1.2.4               
 [31] rhdf5filters_1.23.0       Rhdf5lib_1.33.0          
 [33] BiocParallel_1.45.0       cluster_2.1.8.1          
 [35] irlba_2.3.5.1             parallel_4.6.0           
 [37] R6_2.6.1                  bslib_0.9.0              
 [39] RColorBrewer_1.1-3        limma_3.67.0             
 [41] jquerylib_0.1.4           Rcpp_1.1.0               
 [43] bookdown_0.45             knitr_1.50               
 [45] FNN_1.1.4.1               igraph_2.2.1             
 [47] tidyselect_1.2.1          viridis_0.6.5            
 [49] dichromat_2.0-0.1         yaml_2.3.10              
 [51] codetools_0.2-20          curl_7.0.0               
 [53] lattice_0.22-7            tibble_3.3.0             
 [55] withr_3.0.2               KEGGREST_1.51.0          
 [57] S7_0.2.0                  Rtsne_0.17               
 [59] evaluate_1.0.5            BiocFileCache_3.1.0      
 [61] ExperimentHub_3.1.0       Biostrings_2.79.1        
 [63] pillar_1.11.1             BiocManager_1.30.26      
 [65] filelock_1.0.3            BiocVersion_3.23.1       
 [67] sparseMatrixStats_1.23.0  scales_1.4.0             
 [69] glue_1.8.0                metapod_1.19.0           
 [71] tools_4.6.0               AnnotationHub_4.1.0      
 [73] BiocNeighbors_2.5.0       ScaledMatrix_1.19.0      
 [75] locfit_1.5-9.12           XML_3.99-0.19            
 [77] cowplot_1.2.0             grid_4.6.0               
 [79] edgeR_4.9.0               AnnotationDbi_1.73.0     
 [81] beeswarm_0.4.0            vipor_0.4.7              
 [83] cli_3.6.5                 rsvd_1.0.5               
 [85] rappdirs_0.3.3            viridisLite_0.4.2        
 [87] dplyr_1.1.4               ResidualMatrix_1.21.0    
 [89] uwot_0.2.3                gtable_0.3.6             
 [91] sass_0.4.10               digest_0.6.37            
 [93] dqrng_0.4.1               ggrepel_0.9.6            
 [95] farver_2.1.2              memoise_2.0.1            
 [97] htmltools_0.5.8.1         lifecycle_1.0.4          
 [99] httr_1.4.7                statmod_1.5.1            
[101] bit64_4.6.0-1            
```
</div>
