This is a walkthrough comparing SWNE, UMAP, and t-SNE on a hematopoiesis dataset from the Mouse Cell Atlas. Based off of the analysis done by in Figure 2d of Becht, McInnes et al.

Load the required libraries and data. The Mouse Cell Atlas data as well as the UMAP, tSNE, and PCA reductions can be downloaded here courtesy of Becht, McInnes et al.

library(Matrix)
library(swne)

## Load data
load("~/swne/Data/Han/BM_BMcKit_PB_RData/xp.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/g2.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/cells_AUC.RData")

## Filter dataset
w2 <- !is.na(g2)
xp <- Matrix::t(xp[w2,])

load("~/swne/Data/Han/BM_BMcKit_PB_RData/pca_g2.RData")
rownames(pca) <- colnames(xp)

Assign labels to cells using the classifier results from Becht, McInnes et al.

## Assign labels to cells
lineages <- c("Multi Potential Progenitor", "Macrophage Lineage", "Neutrophil Lineage",
              "Erythrocyte Lineage", "B Cell Lineage", "T Cell Lineage", "NK Cell Lineage")
cutoffs <- setNames(c(0.04,0.09,0.05,0.045,0.09,0.075,0.04), lineages)

labels <- sapply(lineages, function(i) cells_AUC@assays$data[[1]][i,][w2] >= cutoffs[i])
labels <- apply(labels, 1, which)
labels <- sapply(labels, function(x) { if(length(x) == 1) {x} else {0} })
labels[labels != 0] <- lineages[labels[labels != 0]]
labels[labels == 0] <- NA
names(labels) <- colnames(xp)
labels <- factor(labels)
labels <- plyr::revalue(labels, replace = c("Multi Potential Progenitor" = "MPP",
                                            "Macrophage Lineage" = "Macrophage",
                                            "Neutrophil Lineage" = "Neutrophil",
                                            "Erythrocyte Lineage" = "Erythrocyte",
                                            "B Cell Lineage" = "B Cell",
                                            "T Cell Lineage" = "T Cell",
                                            "NK Cell Lineage" = "NK Cell"))
table(labels); paste("Cells with missing labels:", sum(is.na(labels)))
## labels
##      B Cell Erythrocyte  Macrophage         MPP  Neutrophil     NK Cell 
##         366         521         830         276       10844         297 
##      T Cell 
##         623
## [1] "Cells with missing labels: 26453"

Make the t-SNE plot using the pre-computed t-SNE from Becht, McInnes et al.

## Set a seed to make sure the cluster colors are consistent
plot.seed <- 312525

load("~/swne/Data/Han/BM_BMcKit_PB_RData/tsne_g2.RData")
rownames(tsne) <- names(labels)
PlotDims(tsne, sample.groups = labels, show.legend = F, show.axes = F,
         alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
         seed = plot.seed, use.brewer.pal = T)

Make the UMAP plot using the pre-computed UMAP from Becht, McInnes et al.

load("~/swne/Data/Han/BM_BMcKit_PB_RData/umap_g2.RData")
rownames(umap) <- names(labels)
PlotDims(umap, sample.groups = labels, show.legend = F, show.axes = F,
         alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
         seed = plot.seed, use.brewer.pal = T)

Filter lowly expressed genes and get gene variance info

norm.xp <- xp*1000
norm.xp <- FilterData(norm.xp, min.samples.frac = 2.5e-4, trim = 1e-4, min.nonzero.features = 0,
                      max.sample.sum = Inf)
var.df <- AdjustVariance(norm.xp, verbose = F, plot = F)

Stabilize gene variances with a log-transformation

norm.xp@x <- log(norm.xp@x + 1)

Select variable genes to use

n.genes <- 4e3
var.df <- var.df[order(var.df$lp),]
var.genes <- rownames(var.df[1:n.genes,])

Run SWNE

n.cores <- 16
nmf.res <- RunNMF(norm.xp[var.genes,], k = 40, n.cores = n.cores, ica.fast = T)
nmf.res$W <- ProjectFeatures(norm.xp, nmf.res$H, n.cores = n.cores)

snn <- CalcSNN(t(pca), k = 30, prune.SNN = 0.0)
knn <- CalcKNN(t(pca), k = 30)
snn <- PruneSNN(snn, knn, qval.cutoff = 1e-3)
swne.embedding <- EmbedSWNE(nmf.res$H, SNN = snn, alpha.exp = 1.25, snn.exp = 0.25, n_pull = 3, 
                            proj.method = "sammon")
## Initial stress        : 0.11857
## stress after  10 iters: 0.03777, magic = 0.500
## stress after  20 iters: 0.03709, magic = 0.500
## stress after  30 iters: 0.03686, magic = 0.500
## stress after  40 iters: 0.03663, magic = 0.500
## stress after  50 iters: 0.03652, magic = 0.500
## stress after  60 iters: 0.03648, magic = 0.500
swne.embedding$H.coords$name <- ""

Embed some hematopoiesis marker genes and plot SWNE embedding

## Embed selected genes onto swne plot
genes.embed <- c("Ms4a1", "Cd4", "Ly6g", "Fcgr1")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed,
                                n_pull = 3)

## SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.6, sample.groups = labels, do.label = T,
         label.size = 6, pt.size = 0.75, show.legend = F, seed = plot.seed,
         use.brewer.pal = T)
## Warning: Removed 26453 rows containing missing values (geom_point).

Next, we’ll define some helper functions for quantitative benchmarking of these embeddings

library(FNN)
library(proxy)
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
  my.knn <- FNN::get.knn(t(emb), k)

  n.cells <- ncol(emb)
  nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)])

  j <- as.numeric(x = t(x = nn.ranked))
  i <- ((1:length(x = j)) - 1) %/% k + 1
  nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix")
  rownames(nn.matrix) <- colnames(emb)
  colnames(nn.matrix) <- colnames(emb)

  nn.matrix
}


## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
  a <- sum(x)
  b <- sum(y)
  c <- sum(x == 1 & y == 1)
  c/(a + b - c)
}


## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
  data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
  return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}

Compute how well each embedding maintains the pairwise distances between cell types relative to the original gene expression space. This is meant to capture how well each embedding maintains the global structure of the data.

## Compile embeddings
embeddings <- list(tsne = t(tsne), umap = t(umap))
swne.emb <- t(as.matrix(swne.embedding$sample.coords))

## Compute cluster distance correlations
label.cells <- names(labels[!is.na(labels)])
ref.dist <- CalcPairwiseDist(xp[,label.cells], labels[label.cells])

embeddings.cor <- sapply(embeddings, function(emb) {
  emb.dist <- CalcPairwiseDist(emb[,label.cells], labels[label.cells])
  cor(ref.dist, emb.dist)
})

## Compare the SWNE embedding to the variance stabilized expression
## space to ensure we're comparing apples to apples
norm.ref.dist <- CalcPairwiseDist(norm.xp[,label.cells], labels[label.cells])
swne.emb.dist <- CalcPairwiseDist(swne.emb[,label.cells], labels[label.cells])
embeddings.cor <- c(embeddings.cor, cor(norm.ref.dist, swne.emb.dist))
names(embeddings.cor) <- c("tsne", "umap", "swne")

Compute how well each embedding maintains the nearest neighbors of each cell relative to the original gene expression space. This is meant to capture how well each embedding maintains the local structure of the data.

# Calculate neighborhood fidelity
n.neighbors <- 30
# ref.knn <- ComputeKNN(xp[,label.cells], k = n.neighbors)
# norm.ref.knn <- ComputeKNN(norm.xp[,label.cells], k = n.neighbors)
load("~/swne/Data/Han/BM_BMcKit_PB_RData/Han_hemato_ref_knn.RData") ## Load pre-computed kNN networks to save time. Computing the kNN networks in the original expression space can take quite a bit of time.

## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, function(x) ComputeKNN(x, k = n.neighbors))
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
  mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})

swne.knn <- ComputeKNN(swne.emb, k = n.neighbors)
knn.simil <- c(knn.simil, mean(sapply(1:ncol(swne.knn), function(i) CalcJaccard(swne.knn[,i], ref.knn[,i])))) 
names(knn.simil) <- c("tsne", "umap", "swne")

Finally we’ll plot the combined global/local structure results

library(ggplot2)
library(ggrepel)

scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings.cor))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
  theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
  xlab("Neighborhood Similarity") + ylab("Cluster Distance Correlation") +
  geom_text_repel(aes(x, y, label = name), size = 6.5) +
  xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))