This is a walkthrough on how to recreate the hematopoiesis visualizations from Figure 2 of our Cell Systems paper.

Load the required libraries and data. The hematopoiesis data (from Paul et al) can be downloaded here.

suppressWarnings(library(Seurat))
suppressWarnings(library(monocle))
library(swne)
library(ggplot2)

## Load monocle2 object
load("~/swne/Data/hemato_data/hemato_monocle_orig_cds.RData")

## Load counts and informative genes
counts <- ReadData("~/swne/Data/hemato_data/hemato_expr_debatched.tsv", make.sparse = T)
info.genes <- scan("~/swne/Data/hemato_data/hemato_info_genes.txt", sep = "\n", what = character())

## Load clusters
clusters.df <- read.table("~/swne/Data/hemato_data/hemato_cluster_mapping.csv", sep = ",")
clusters <- clusters.df[[2]]; names(clusters) <- clusters.df[[1]]
counts <- counts[,names(clusters)]

Filter genes and cells and make Seurat object

counts <- FilterData(counts, min.samples.frac = 0.005, trim = 0.005, min.nonzero.features = 200)
info.genes <- info.genes[info.genes %in% rownames(counts)]
dim(counts)
## [1] 8653 2730
se.obj <- CreateSeuratObject(counts)

Set cluster names, exclude lymphoid cells and dendritic cells (which are not part of the developmental trajectory), set cluster colors.

se.obj <- SetIdent(se.obj, value = clusters)
Idents(se.obj) <- plyr::revalue(Idents(se.obj), 
                                c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery',
                                  "6" = 'Ery', "7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
                                  "11" = 'DC', "12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M',
                                  "16" = 'Neu', "17" = 'Neu', "18" = 'Eos', "19" = 'lymphoid'))

se.obj <- SubsetData(se.obj, ident.remove = c("lymphoid", "DC"))

clusters <- Idents(se.obj)
cluster_colors <- c("Bas" = "#ff6347", "Eos" = "#EFAD1E", 
                    "Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859", 
                    "GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")

Scale data, run PCA, t-SNE, and UMAP, and build the SNN.

## Scale data
VariableFeatures(se.obj) <- info.genes
norm.counts <- ScaleCounts(counts)[,colnames(se.obj)]
## calculating variance fit ... using gam
rownames(norm.counts) <- rownames(se.obj)

se.obj <- SetAssayData(se.obj, new.data = norm.counts)
se.obj <- SetAssayData(se.obj, slot = "scale.data", new.data = as.matrix(norm.counts - Matrix::rowMeans(norm.counts)))

## Run PCA
se.obj <- RunPCA(se.obj, features = info.genes, verbose = F, pcs.compute = 40)
ElbowPlot(se.obj)

## Run t-SNE, UMAP, and SNN
se.obj <- RunTSNE(se.obj, dims = 1:10)
se.obj <- RunUMAP(se.obj, reduction.use = "pca", dims = 1:10, n.neighbors = 40, min.dist = 0.5, verbose = F)
pc.emb <- Embeddings(se.obj, reduction = "pca")[,1:15]

Identify number of factors to use for SWNE

k.range <- seq(2,20,2) ## Range of factor values to test
n.cores <- 12 ## Number of cores to use

k.err <- FindNumFactors(norm.counts[info.genes,], k.range = k.range, n.cores = n.cores, 
                        seed = 32590, do.plot = F)

PlotFactorSelection(k.err, font.size = 15)

Run SWNE

k <- 12
nmf.res <- RunNMF(norm.counts[info.genes,], k = k, init = "ica", n.cores = n.cores, ica.fast = F) ## Run NMF
nmf.scores <- nmf.res$H

nmf.res$W <- ProjectFeatures(norm.counts, nmf.res$H, n.cores = n.cores) ## Project all genes onto NMF

## Compute and prune SNN
snn <- CalcSNN(t(pc.emb), k = 40, prune.SNN = 0)
knn <- CalcKNN(t(pc.emb), k = 40) ## Extract kNN matrix
snn <- PruneSNN(snn, knn, clusters = NULL, qval.cutoff = 1e-3)

## Run SWNE embedding
swne.embedding <- EmbedSWNE(nmf.scores, SNN = snn, alpha.exp = 1.5, snn.exp = 0.1, n_pull = 3,
                            dist.use = "cosine")
## Initial stress        : 0.04686
## stress after  10 iters: 0.01816, magic = 0.500
## stress after  20 iters: 0.01808, magic = 0.500

Embed key marker genes. See Table S1 from the Cell Systems paper for how key genes were selected.

swne.embedding$H.coords$name <- ""
genes.embed <- c("Apoe", "Mt2", "Gpr56", "Sun2", "Flt3")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3)

Make SWNE plot

PlotSWNE(swne.embedding, alpha.plot = 0.4, sample.groups = clusters, do.label = T,
         label.size = 3.5, pt.size = 2, show.legend = F) +
  scale_color_manual(values = cluster_colors)

Make t-SNE plot

## tSNE plots
tsne.scores <- Embeddings(se.obj, "tsne")
PlotDims(tsne.scores, sample.groups = clusters, show.legend = F, show.axes = F,
         alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
  scale_color_manual(values = cluster_colors)

Make UMAP plot

umap.scores <- Embeddings(se.obj, "umap")
PlotDims(umap.scores, sample.groups = clusters, show.legend = F, show.axes = F,
         alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
  scale_color_manual(values = cluster_colors)

Extract pre-computed pseudotime (computed using Monocle2)

pseudotime <- cds$Pseudotime; names(pseudotime) <- colnames(cds@reducedDimS);
pseudotime <- pseudotime[names(clusters)]

SWNE pseudotime plot

FeaturePlotSWNE(swne.embedding, pseudotime, alpha.plot = 0.4, label.size = 3.5, pt.size = 1.5)

t-SNE pseudotime plot

FeaturePlotDims(tsne.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)

UMAP pseudotime plot

FeaturePlotDims(umap.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)

Now let's compute a quantitative evaluation of SWNE, tSNE, and UMAP as well as other embeddings.

First we need to compute those other embeddings, including dmaps, MDS, Isomap, LLE, and PCA

library(destiny)
dm <- DiffusionMap(t(as.matrix(norm.counts[info.genes,])), k = 20, n_eigs = 2)
diff.scores <- dm@eigenvectors; rownames(diff.scores) <- colnames(norm.counts);

library(RDRToolbox)
isomap.scores <- Isomap(t(as.matrix(norm.counts[info.genes,])), dims = 2, k = 20)$dim2
rownames(isomap.scores) <- colnames(norm.counts)

lle.scores <- LLE(t(as.matrix(norm.counts[info.genes,])), dim = 2, k = 20)
rownames(lle.scores) <- colnames(norm.counts)

mds.scores <- cmdscale(dist(t(as.matrix(norm.counts[info.genes,]))), k = 2)
pc.scores <- pc.emb[,1:2]

embeddings <- list(swne = t(as.matrix(swne.embedding$sample.coords)),
                   tsne = t(tsne.scores),
                   pca = t(pc.scores),
                   lle = t(lle.scores),
                   mds = t(mds.scores),
                   isomap = t(isomap.scores),
                   dmaps = t(diff.scores),
                   umap = t(umap.scores))

Next we define some helper functions that will help us evaluate these embeddings

library(FNN)
library(proxy)
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:destiny':
## 
##     as.matrix
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
  my.knn <- FNN::get.knn(t(emb), k)

  n.cells <- ncol(emb)
  nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)])

  j <- as.numeric(x = t(x = nn.ranked))
  i <- ((1:length(x = j)) - 1) %/% k + 1
  nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix")
  rownames(nn.matrix) <- colnames(emb)
  colnames(nn.matrix) <- colnames(emb)

  nn.matrix
}


## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
  a <- sum(x)
  b <- sum(y)
  c <- sum(x == 1 & y == 1)
  c/(a + b - c)
}


## Function for identifying cells in the same path and time step
GetPathStep <- function(metadata, step.size, make.factor = T) {
  path.step <- as.character(metadata$Group); names(path.step) <- rownames(metadata);
  for (path in levels(factor(metadata$Group))) {
    steps <- sort(unique(subset(metadata, Group == path)$Step))
    step.range <- seq(min(steps), max(steps), step.size)
    for(i in step.range) {
      cells.step <- rownames(subset(metadata, Group == path & Step %in% seq(i, i + step.size - 1, 1)))
      path.step[cells.step] <- paste(path, i, sep = ".")
    }
  }
  if (make.factor) {
    path.step <- factor(path.step)
  }
  path.step
}


## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
  data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
  return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}

Compute how well each embedding maintains local structure compared to the original gene expression space by comparing kNN networks.

n.neighbors <- 40
ref.knn <- ComputeKNN(norm.counts[info.genes,], k = n.neighbors)

## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, ComputeKNN, k = n.neighbors)

knn.simil <- sapply(embeddings.knn, function(knn.emb) {
  mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})

Compute how well each embedding maintains global strucutre by computing the centroids of each trajectory-cluster grouping in the embedding space and original gene expression space and correlating the pairwise distances between the centroids.

metadata.df <- data.frame(Group = clusters, Step = order(pseudotime[names(clusters)]))
clusters.steps <- GetPathStep(metadata.df, step.size = 50, make.factor = T)
traj.dist <- CalcPairwiseDist(norm.counts[info.genes,], clusters.steps)

embeddings.cor <- sapply(embeddings, function(emb) {
  emb.dist <- CalcPairwiseDist(emb, clusters.steps)
  cor(traj.dist, emb.dist)
})

Plot how well each embedding maintaings global vs local structure.

library(ggplot2)
library(ggrepel)

scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
  theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
  xlab("Neighborhood Score") + ylab("Path-Time Distance Correlation") +
  geom_text_repel(aes(x, y, label = name), size = 5) +
  xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))