This is a walkthrough on how to recreate the hematopoiesis visualizations from Figure 2 of our Cell Systems paper.
Load the required libraries and data. The hematopoiesis data (from Paul et al) can be downloaded here.
suppressWarnings(library(Seurat))
suppressWarnings(library(monocle))
library(swne)
library(ggplot2)
## Load monocle2 object
load("~/swne/Data/hemato_data/hemato_monocle_orig_cds.RData")
## Load counts and informative genes
counts <- ReadData("~/swne/Data/hemato_data/hemato_expr_debatched.tsv", make.sparse = T)
info.genes <- scan("~/swne/Data/hemato_data/hemato_info_genes.txt", sep = "\n", what = character())
## Load clusters
clusters.df <- read.table("~/swne/Data/hemato_data/hemato_cluster_mapping.csv", sep = ",")
clusters <- clusters.df[[2]]; names(clusters) <- clusters.df[[1]]
counts <- counts[,names(clusters)]
Filter genes and cells and make Seurat object
counts <- FilterData(counts, min.samples.frac = 0.005, trim = 0.005, min.nonzero.features = 200)
info.genes <- info.genes[info.genes %in% rownames(counts)]
dim(counts)
## [1] 8653 2730
se.obj <- CreateSeuratObject(counts)
Set cluster names, exclude lymphoid cells and dendritic cells (which are not part of the developmental trajectory), set cluster colors.
se.obj <- SetIdent(se.obj, value = clusters)
Idents(se.obj) <- plyr::revalue(Idents(se.obj),
c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery',
"6" = 'Ery', "7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
"11" = 'DC', "12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M',
"16" = 'Neu', "17" = 'Neu', "18" = 'Eos', "19" = 'lymphoid'))
se.obj <- SubsetData(se.obj, ident.remove = c("lymphoid", "DC"))
clusters <- Idents(se.obj)
cluster_colors <- c("Bas" = "#ff6347", "Eos" = "#EFAD1E",
"Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859",
"GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")
Scale data, run PCA, t-SNE, and UMAP, and build the SNN.
## Scale data
VariableFeatures(se.obj) <- info.genes
norm.counts <- ScaleCounts(counts)[,colnames(se.obj)]
## calculating variance fit ... using gam
rownames(norm.counts) <- rownames(se.obj)
se.obj <- SetAssayData(se.obj, new.data = norm.counts)
se.obj <- SetAssayData(se.obj, slot = "scale.data", new.data = as.matrix(norm.counts - Matrix::rowMeans(norm.counts)))
## Run PCA
se.obj <- RunPCA(se.obj, features = info.genes, verbose = F, pcs.compute = 40)
ElbowPlot(se.obj)
## Run t-SNE, UMAP, and SNN
se.obj <- RunTSNE(se.obj, dims = 1:10)
se.obj <- RunUMAP(se.obj, reduction.use = "pca", dims = 1:10, n.neighbors = 40, min.dist = 0.5, verbose = F)
pc.emb <- Embeddings(se.obj, reduction = "pca")[,1:15]
Identify number of factors to use for SWNE
k.range <- seq(2,20,2) ## Range of factor values to test
n.cores <- 12 ## Number of cores to use
k.err <- FindNumFactors(norm.counts[info.genes,], k.range = k.range, n.cores = n.cores,
seed = 32590, do.plot = F)
PlotFactorSelection(k.err, font.size = 15)
Run SWNE
k <- 12
nmf.res <- RunNMF(norm.counts[info.genes,], k = k, init = "ica", n.cores = n.cores, ica.fast = F) ## Run NMF
nmf.scores <- nmf.res$H
nmf.res$W <- ProjectFeatures(norm.counts, nmf.res$H, n.cores = n.cores) ## Project all genes onto NMF
## Compute and prune SNN
snn <- CalcSNN(t(pc.emb), k = 40, prune.SNN = 0)
knn <- CalcKNN(t(pc.emb), k = 40) ## Extract kNN matrix
snn <- PruneSNN(snn, knn, clusters = NULL, qval.cutoff = 1e-3)
## Run SWNE embedding
swne.embedding <- EmbedSWNE(nmf.scores, SNN = snn, alpha.exp = 1.5, snn.exp = 0.1, n_pull = 3,
dist.use = "cosine")
## Initial stress : 0.04686
## stress after 10 iters: 0.01816, magic = 0.500
## stress after 20 iters: 0.01808, magic = 0.500
Embed key marker genes. See Table S1 from the Cell Systems paper for how key genes were selected.
swne.embedding$H.coords$name <- ""
genes.embed <- c("Apoe", "Mt2", "Gpr56", "Sun2", "Flt3")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3)
Make SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.4, sample.groups = clusters, do.label = T,
label.size = 3.5, pt.size = 2, show.legend = F) +
scale_color_manual(values = cluster_colors)
Make t-SNE plot
## tSNE plots
tsne.scores <- Embeddings(se.obj, "tsne")
PlotDims(tsne.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
Make UMAP plot
umap.scores <- Embeddings(se.obj, "umap")
PlotDims(umap.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
Extract pre-computed pseudotime (computed using Monocle2)
pseudotime <- cds$Pseudotime; names(pseudotime) <- colnames(cds@reducedDimS);
pseudotime <- pseudotime[names(clusters)]
SWNE pseudotime plot
FeaturePlotSWNE(swne.embedding, pseudotime, alpha.plot = 0.4, label.size = 3.5, pt.size = 1.5)
t-SNE pseudotime plot
FeaturePlotDims(tsne.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
UMAP pseudotime plot
FeaturePlotDims(umap.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
Now let's compute a quantitative evaluation of SWNE, tSNE, and UMAP as well as other embeddings.
First we need to compute those other embeddings, including dmaps, MDS, Isomap, LLE, and PCA
library(destiny)
dm <- DiffusionMap(t(as.matrix(norm.counts[info.genes,])), k = 20, n_eigs = 2)
diff.scores <- dm@eigenvectors; rownames(diff.scores) <- colnames(norm.counts);
library(RDRToolbox)
isomap.scores <- Isomap(t(as.matrix(norm.counts[info.genes,])), dims = 2, k = 20)$dim2
rownames(isomap.scores) <- colnames(norm.counts)
lle.scores <- LLE(t(as.matrix(norm.counts[info.genes,])), dim = 2, k = 20)
rownames(lle.scores) <- colnames(norm.counts)
mds.scores <- cmdscale(dist(t(as.matrix(norm.counts[info.genes,]))), k = 2)
pc.scores <- pc.emb[,1:2]
embeddings <- list(swne = t(as.matrix(swne.embedding$sample.coords)),
tsne = t(tsne.scores),
pca = t(pc.scores),
lle = t(lle.scores),
mds = t(mds.scores),
isomap = t(isomap.scores),
dmaps = t(diff.scores),
umap = t(umap.scores))
Next we define some helper functions that will help us evaluate these embeddings
library(FNN)
library(proxy)
##
## Attaching package: 'proxy'
## The following object is masked from 'package:destiny':
##
## as.matrix
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
my.knn <- FNN::get.knn(t(emb), k)
n.cells <- ncol(emb)
nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)])
j <- as.numeric(x = t(x = nn.ranked))
i <- ((1:length(x = j)) - 1) %/% k + 1
nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix")
rownames(nn.matrix) <- colnames(emb)
colnames(nn.matrix) <- colnames(emb)
nn.matrix
}
## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
a <- sum(x)
b <- sum(y)
c <- sum(x == 1 & y == 1)
c/(a + b - c)
}
## Function for identifying cells in the same path and time step
GetPathStep <- function(metadata, step.size, make.factor = T) {
path.step <- as.character(metadata$Group); names(path.step) <- rownames(metadata);
for (path in levels(factor(metadata$Group))) {
steps <- sort(unique(subset(metadata, Group == path)$Step))
step.range <- seq(min(steps), max(steps), step.size)
for(i in step.range) {
cells.step <- rownames(subset(metadata, Group == path & Step %in% seq(i, i + step.size - 1, 1)))
path.step[cells.step] <- paste(path, i, sep = ".")
}
}
if (make.factor) {
path.step <- factor(path.step)
}
path.step
}
## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}
Compute how well each embedding maintains local structure compared to the original gene expression space by comparing kNN networks.
n.neighbors <- 40
ref.knn <- ComputeKNN(norm.counts[info.genes,], k = n.neighbors)
## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, ComputeKNN, k = n.neighbors)
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})
Compute how well each embedding maintains global strucutre by computing the centroids of each trajectory-cluster grouping in the embedding space and original gene expression space and correlating the pairwise distances between the centroids.
metadata.df <- data.frame(Group = clusters, Step = order(pseudotime[names(clusters)]))
clusters.steps <- GetPathStep(metadata.df, step.size = 50, make.factor = T)
traj.dist <- CalcPairwiseDist(norm.counts[info.genes,], clusters.steps)
embeddings.cor <- sapply(embeddings, function(emb) {
emb.dist <- CalcPairwiseDist(emb, clusters.steps)
cor(traj.dist, emb.dist)
})
Plot how well each embedding maintaings global vs local structure.
library(ggplot2)
library(ggrepel)
scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
xlab("Neighborhood Score") + ylab("Path-Time Distance Correlation") +
geom_text_repel(aes(x, y, label = name), size = 5) +
xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))