This is a walkthrough comparing SWNE, UMAP, and t-SNE on a hematopoiesis dataset from the Mouse Cell Atlas. Based off of the analysis done by in Figure 2d of Becht, McInnes et al.
Load the required libraries and data. The Mouse Cell Atlas data as well as the UMAP, tSNE, and PCA reductions can be downloaded here courtesy of Becht, McInnes et al.
library(Matrix)
library(swne)
## Load data
load("~/swne/Data/Han/BM_BMcKit_PB_RData/xp.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/g2.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/cells_AUC.RData")
## Filter dataset
w2 <- !is.na(g2)
xp <- Matrix::t(xp[w2,])
load("~/swne/Data/Han/BM_BMcKit_PB_RData/pca_g2.RData")
rownames(pca) <- colnames(xp)
Assign labels to cells using the classifier results from Becht, McInnes et al.
## Assign labels to cells
lineages <- c("Multi Potential Progenitor", "Macrophage Lineage", "Neutrophil Lineage",
"Erythrocyte Lineage", "B Cell Lineage", "T Cell Lineage", "NK Cell Lineage")
cutoffs <- setNames(c(0.04,0.09,0.05,0.045,0.09,0.075,0.04), lineages)
labels <- sapply(lineages, function(i) cells_AUC@assays$data[[1]][i,][w2] >= cutoffs[i])
labels <- apply(labels, 1, which)
labels <- sapply(labels, function(x) { if(length(x) == 1) {x} else {0} })
labels[labels != 0] <- lineages[labels[labels != 0]]
labels[labels == 0] <- NA
names(labels) <- colnames(xp)
labels <- factor(labels)
labels <- plyr::revalue(labels, replace = c("Multi Potential Progenitor" = "MPP",
"Macrophage Lineage" = "Macrophage",
"Neutrophil Lineage" = "Neutrophil",
"Erythrocyte Lineage" = "Erythrocyte",
"B Cell Lineage" = "B Cell",
"T Cell Lineage" = "T Cell",
"NK Cell Lineage" = "NK Cell"))
table(labels); paste("Cells with missing labels:", sum(is.na(labels)))
## labels
## B Cell Erythrocyte Macrophage MPP Neutrophil NK Cell
## 366 521 830 276 10844 297
## T Cell
## 623
## [1] "Cells with missing labels: 26453"
Make the t-SNE plot using the pre-computed t-SNE from Becht, McInnes et al.
## Set a seed to make sure the cluster colors are consistent
plot.seed <- 312525
load("~/swne/Data/Han/BM_BMcKit_PB_RData/tsne_g2.RData")
rownames(tsne) <- names(labels)
PlotDims(tsne, sample.groups = labels, show.legend = F, show.axes = F,
alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
seed = plot.seed, use.brewer.pal = T)
Make the UMAP plot using the pre-computed UMAP from Becht, McInnes et al.
load("~/swne/Data/Han/BM_BMcKit_PB_RData/umap_g2.RData")
rownames(umap) <- names(labels)
PlotDims(umap, sample.groups = labels, show.legend = F, show.axes = F,
alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
seed = plot.seed, use.brewer.pal = T)
Filter lowly expressed genes and get gene variance info
norm.xp <- xp*1000
norm.xp <- FilterData(norm.xp, min.samples.frac = 2.5e-4, trim = 1e-4, min.nonzero.features = 0,
max.sample.sum = Inf)
var.df <- AdjustVariance(norm.xp, verbose = F, plot = F)
Stabilize gene variances with a log-transformation
norm.xp@x <- log(norm.xp@x + 1)
Select variable genes to use
n.genes <- 4e3
var.df <- var.df[order(var.df$lp),]
var.genes <- rownames(var.df[1:n.genes,])
Run SWNE
n.cores <- 16
nmf.res <- RunNMF(norm.xp[var.genes,], k = 40, n.cores = n.cores, ica.fast = T)
nmf.res$W <- ProjectFeatures(norm.xp, nmf.res$H, n.cores = n.cores)
snn <- CalcSNN(t(pca), k = 30, prune.SNN = 0.0)
knn <- CalcKNN(t(pca), k = 30)
snn <- PruneSNN(snn, knn, qval.cutoff = 1e-3)
swne.embedding <- EmbedSWNE(nmf.res$H, SNN = snn, alpha.exp = 1.25, snn.exp = 0.25, n_pull = 3,
proj.method = "sammon")
## Initial stress : 0.11857
## stress after 10 iters: 0.03777, magic = 0.500
## stress after 20 iters: 0.03709, magic = 0.500
## stress after 30 iters: 0.03686, magic = 0.500
## stress after 40 iters: 0.03663, magic = 0.500
## stress after 50 iters: 0.03652, magic = 0.500
## stress after 60 iters: 0.03648, magic = 0.500
swne.embedding$H.coords$name <- ""
Embed some hematopoiesis marker genes and plot SWNE embedding
## Embed selected genes onto swne plot
genes.embed <- c("Ms4a1", "Cd4", "Ly6g", "Fcgr1")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed,
n_pull = 3)
## SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.6, sample.groups = labels, do.label = T,
label.size = 6, pt.size = 0.75, show.legend = F, seed = plot.seed,
use.brewer.pal = T)
## Warning: Removed 26453 rows containing missing values (geom_point).
Next, we’ll define some helper functions for quantitative benchmarking of these embeddings
library(FNN)
library(proxy)
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
my.knn <- FNN::get.knn(t(emb), k)
n.cells <- ncol(emb)
nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)])
j <- as.numeric(x = t(x = nn.ranked))
i <- ((1:length(x = j)) - 1) %/% k + 1
nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix")
rownames(nn.matrix) <- colnames(emb)
colnames(nn.matrix) <- colnames(emb)
nn.matrix
}
## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
a <- sum(x)
b <- sum(y)
c <- sum(x == 1 & y == 1)
c/(a + b - c)
}
## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}
Compute how well each embedding maintains the pairwise distances between cell types relative to the original gene expression space. This is meant to capture how well each embedding maintains the global structure of the data.
## Compile embeddings
embeddings <- list(tsne = t(tsne), umap = t(umap))
swne.emb <- t(as.matrix(swne.embedding$sample.coords))
## Compute cluster distance correlations
label.cells <- names(labels[!is.na(labels)])
ref.dist <- CalcPairwiseDist(xp[,label.cells], labels[label.cells])
embeddings.cor <- sapply(embeddings, function(emb) {
emb.dist <- CalcPairwiseDist(emb[,label.cells], labels[label.cells])
cor(ref.dist, emb.dist)
})
## Compare the SWNE embedding to the variance stabilized expression
## space to ensure we're comparing apples to apples
norm.ref.dist <- CalcPairwiseDist(norm.xp[,label.cells], labels[label.cells])
swne.emb.dist <- CalcPairwiseDist(swne.emb[,label.cells], labels[label.cells])
embeddings.cor <- c(embeddings.cor, cor(norm.ref.dist, swne.emb.dist))
names(embeddings.cor) <- c("tsne", "umap", "swne")
Compute how well each embedding maintains the nearest neighbors of each cell relative to the original gene expression space. This is meant to capture how well each embedding maintains the local structure of the data.
# Calculate neighborhood fidelity
n.neighbors <- 30
# ref.knn <- ComputeKNN(xp[,label.cells], k = n.neighbors)
# norm.ref.knn <- ComputeKNN(norm.xp[,label.cells], k = n.neighbors)
load("~/swne/Data/Han/BM_BMcKit_PB_RData/Han_hemato_ref_knn.RData") ## Load pre-computed kNN networks to save time. Computing the kNN networks in the original expression space can take quite a bit of time.
## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, function(x) ComputeKNN(x, k = n.neighbors))
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})
swne.knn <- ComputeKNN(swne.emb, k = n.neighbors)
knn.simil <- c(knn.simil, mean(sapply(1:ncol(swne.knn), function(i) CalcJaccard(swne.knn[,i], ref.knn[,i]))))
names(knn.simil) <- c("tsne", "umap", "swne")
Finally we’ll plot the combined global/local structure results
library(ggplot2)
library(ggrepel)
scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings.cor))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
xlab("Neighborhood Similarity") + ylab("Cluster Distance Correlation") +
geom_text_repel(aes(x, y, label = name), size = 6.5) +
xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))