Run benchmarking for all the methods. Do it within R, as both Scanorama and BBKNN can be driven through reticulate, and this way I don't have to rpy2 over gigantic Splatter count matrices.
Import the python stuff first, as Scanorama refuses to load if it is imported after the R library block below, for reasons that remain unclear — the import order matters.
library(reticulate)
use_python('/Users/kp9/anaconda3/envs/orig/bin/python')
bbknn = import('bbknn')
scanorama = import('scanorama')
library(splatter)
library(harmony)
library(irlba)
library(magrittr)
library(scran)
library(Seurat)
library(RcppCNPy)
Loading required package: SingleCellExperiment Loading required package: SummarizedExperiment Loading required package: GenomicRanges Loading required package: stats4 Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min Loading required package: S4Vectors Attaching package: ‘S4Vectors’ The following object is masked from ‘package:base’: expand.grid Loading required package: IRanges Loading required package: GenomeInfoDb Loading required package: Biobase Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. 
Loading required package: DelayedArray Loading required package: matrixStats Attaching package: ‘matrixStats’ The following objects are masked from ‘package:Biobase’: anyMissing, rowMedians Loading required package: BiocParallel Attaching package: ‘DelayedArray’ The following objects are masked from ‘package:matrixStats’: colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges The following objects are masked from ‘package:base’: aperm, apply Loading required package: Rcpp Loading required package: Matrix Attaching package: ‘Matrix’ The following object is masked from ‘package:S4Vectors’: expand Loading required package: ggplot2 Loading required package: cowplot Attaching package: ‘cowplot’ The following object is masked from ‘package:ggplot2’: ggsave
Run the algorithms on datasets with cell totals scaling up in powers of two. Start by running all the methods up to a total of 2^15 (~32,000) cells. The data is simulated via Splatter, split evenly into two batches, each of those with two cell types, with 5000 differentially expressed genes emulating a highly variable gene pool. Not like the actual numeric content is of much relevance here, but may as well keep it proper.
Note the use_faiss = FALSE
in BBKNN. For some reason, faiss is not happy with being run via this particular reticulate setup. It's going to be benchmarked separately in python.
#Timing accumulators, one entry (seconds) per dataset size.
scanorama_time = c()
mnn_time = c()
cca_time = c()
harmony_time = c()
bbknn_time = c()
annoy_time = c()
#Benchmark loop: i sets the cells per batch (2^i), so each simulated
#dataset has 2^(i+1) cells total, split evenly across two batches.
for (i in 10:14)
{
print(i)
#prepare splatter data: 5000 genes, all differentially expressed
#(de.prob = 1) to emulate a highly variable gene pool, two batches of
#2^i cells each, two equally likely cell groups
params = newSplatParams()
params = setParam(params, "nGenes", 5000)
params = setParam(params, "de.prob", 1)
params = setParam(params, "batchCells", c(2^i,2^i))
params = setParam(params, "group.prob", c(0.5,0.5))
sim = splatSimulate(params, method="groups", verbose=FALSE)
#data prep for scanorama - it expects a list of per-batch cell-by-gene
#matrices (hence the transposes) plus a matching list of gene IDs;
#the simulation puts batch 1 in the first 2^i columns, batch 2 in the rest
counts = list()
counts[[1]] = t(counts(sim)[,1:(2^i)])
counts[[2]] = t(counts(sim)[,(2^i+1):(2^(i+1))])
genes = list()
genes[[1]] = 1:5000
genes[[2]] = 1:5000
#run scanorama, timing only the correction call itself
t1 = Sys.time()
corrected = scanorama$correct(counts,genes)
t2 = Sys.time()
scanorama_time = c(scanorama_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(scanorama_time,n=1))
#repurpose scanorama input for mnncorrect, which wants gene-by-cell
#matrices, so transpose back
counts[[1]] = t(counts[[1]])
counts[[2]] = t(counts[[2]])
#run mnncorrect, parallelised across all available cores
t1 = Sys.time()
mnn = mnnCorrect(counts[[1]], counts[[2]], BPPARAM=MulticoreParam(detectCores()))
t2 = Sys.time()
mnn_time = c(mnn_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(mnn_time,n=1))
#prepare Seurat stuff - log-normalise, declare every gene variable and
#scale the data; this pre-processing is deliberately left outside the
#timed region, which covers only the batch correction itself
srat = CreateSeuratObject(raw.data = counts(sim))
srat = NormalizeData(object = srat, normalization.method = "LogNormalize", scale.factor = 10000)
srat@var.genes = rownames(srat@data)
srat = ScaleData(object = srat)
srat@meta.data[,'Batch'] = colData(sim)[,'Batch']
#and then it needs to be split so that CCA can see it
srat1 = SubsetData(srat, subset.name='Batch', accept.value='Batch1')
srat2 = SubsetData(srat, subset.name='Batch', accept.value='Batch2')
#run CCA - time both RunCCA and the subspace alignment, as both steps
#are needed to arrive at a batch-corrected result
t1 = Sys.time()
srat = RunCCA(srat1, srat2, num.cc=20)
srat = AlignSubspace(srat, reduction.type='cca', grouping.var='Batch', dims.align=1:20)
t2 = Sys.time()
cca_time = c(cca_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(cca_time,n=1))
#extract batch vector
batch = colData(sim)[,'Batch']
#run irlba PCA for 50 components; cells are the columns of counts(sim),
#so pca$v holds the per-cell coordinates. PCA time is not counted
#towards harmony/BBKNN as it is a shared pre-processing step
pca = irlba(A=counts(sim),nv=50)
pca = pca$v
#run harmony on the PCA coordinates
t1 = Sys.time()
hem = HarmonyMatrix(pca, batch)
t2 = Sys.time()
harmony_time = c(harmony_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(harmony_time,n=1))
#run BBKNN with exact cKDTree neighbours (approx = FALSE);
#use_faiss = FALSE as faiss misbehaves under this reticulate setup
#and is benchmarked separately in python
t1 = Sys.time()
bem = bbknn$bbknn_pca_matrix(pca = pca, batch_list = batch, approx = FALSE, use_faiss = FALSE)
t2 = Sys.time()
bbknn_time = c(bbknn_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(bbknn_time,n=1))
#run BBKNN with approximate neighbours (the default settings)
t1 = Sys.time()
bem = bbknn$bbknn_pca_matrix(pca = pca, batch_list = batch)
t2 = Sys.time()
annoy_time = c(annoy_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(annoy_time,n=1))
}
[1] 10 [1] 6.228694 [1] 76.0922
Scaling data matrix Scaling data matrix Scaling data matrix Scaling data matrix
[1] 97.57821
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 149 iterations
Harmony 4/10
Clustered for 78 iterations
Harmony 5/10
Clustered for 75 iterations
Harmony 6/10
Clustered for 45 iterations
Harmony 7/10
Clustered for 35 iterations
Harmony 8/10
Clustered for 24 iterations
Harmony 9/10
Clustered for 16 iterations
Harmony 10/10
Clustered for 17 iterations [1] 8.485609 [1] 0.2886209 [1] 0.29918 [1] 11 [1] 12.72698 [1] 250.6735
Scaling data matrix Scaling data matrix Scaling data matrix Scaling data matrix
[1] 163.2185
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 133 iterations
Harmony 4/10
Clustered for 63 iterations
Harmony 5/10
Clustered for 41 iterations
Harmony 6/10
Clustered for 12 iterations
Harmony 7/10
Clustered for 28 iterations
Harmony 8/10
Clustered for 26 iterations
Harmony 9/10
Clustered for 15 iterations
Harmony 10/10
Clustered for 12 iterations [1] 12.75697 [1] 0.727396 [1] 0.4967551 [1] 12 [1] 31.04822 [1] 924.8357
Scaling data matrix Scaling data matrix Scaling data matrix Scaling data matrix
[1] 314.9184
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 158 iterations
Harmony 4/10
Clustered for 51 iterations
Harmony 5/10
Clustered for 33 iterations
Harmony 6/10
Clustered for 15 iterations
Harmony 7/10
Clustered for 12 iterations
Harmony 8/10
Clustered for 12 iterations
Harmony 9/10
Clustered for 12 iterations
Harmony 10/10
Clustered for 12 iterations [1] 29.8317 [1] 2.040226 [1] 0.9200101 [1] 13 [1] 70.94677 [1] 3636.818
Scaling data matrix Scaling data matrix Scaling data matrix Scaling data matrix
[1] 730.423
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 109 iterations
Harmony 4/10
Clustered for 20 iterations
Harmony 5/10
Clustered for 12 iterations
Harmony 6/10
Clustered for 12 iterations
Harmony 7/10
Clustered for 12 iterations
Harmony 8/10
Clustered for 12 iterations Harmony converged after 8 iterations [1] 54.75862 [1] 5.953090 [1] 1.787516 [1] 14 [1] 160.7911 [1] 13138.54
Scaling data matrix Scaling data matrix Scaling data matrix Scaling data matrix
[1] 2688.891
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 106 iterations
Harmony 4/10
Clustered for 20 iterations
Harmony 5/10
Clustered for 12 iterations Harmony converged after 5 iterations [1] 108.129 [1] 28.7195 [1] 4.139049
#Write a vector of run times (seconds) to a file, one value per line.
#times - numeric vector of per-dataset run times
#path - output file path
write_times = function(times, path)
{
fid = file(path)
writeLines(as.character(times), fid)
close(fid)
}
#Save the measured run times, one file per method.
dir.create('benchmark-times',showWarnings = FALSE)
write_times(scanorama_time, 'benchmark-times/scanorama.txt')
write_times(mnn_time, 'benchmark-times/mnn.txt')
write_times(cca_time, 'benchmark-times/cca.txt')
write_times(harmony_time, 'benchmark-times/harmony.txt')
write_times(bbknn_time, 'benchmark-times/bbknn.txt')
write_times(annoy_time, 'benchmark-times/annoy.txt')
We're entering big dataset land. Run Harmony and BBKNN, as Scanorama is resource intensive and dies on the very first of the larger datasets (possibly due to some overhead from being reticulated, don't know, but it's not happy). Shrink Splatter gene pool to 500 in the interest of resource efficiency - the actual gene count doesn't matter in the slightest for run time benchmarking of these two, as they both operate on PCA space. Just in case, write out run times after each loop iteration.
#Larger datasets: 2^15 through 2^18 cells per batch. Only harmony and
#BBKNN are run here; the gene pool is shrunk to 500 as both methods
#operate on PCA space, so total gene count does not affect their run
#times. Results are appended to the vectors from the previous loop and
#flushed to disk after every iteration in case a run dies.
for (i in 15:18)
{
print(i)
#prepare splatter data - same design as before, but only 500 genes
params = newSplatParams()
params = setParam(params, "nGenes", 500)
params = setParam(params, "de.prob", 1)
params = setParam(params, "batchCells", c(2^i,2^i))
params = setParam(params, "group.prob", c(0.5,0.5))
sim = splatSimulate(params, method="groups", verbose=FALSE)
#extract batch vector
batch = colData(sim)[,'Batch']
#run irlba PCA for 50 components; pca$v holds the per-cell coordinates
pca = irlba(A=counts(sim),nv=50)
pca = pca$v
#run harmony
t1 = Sys.time()
hem = HarmonyMatrix(pca, batch)
t2 = Sys.time()
harmony_time = c(harmony_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(harmony_time,n=1))
#run BBKNN with exact cKDTree neighbours; use_faiss = FALSE as faiss
#misbehaves under this reticulate setup
t1 = Sys.time()
bem = bbknn$bbknn_pca_matrix(pca = pca, batch_list = batch, approx = FALSE, use_faiss = FALSE)
t2 = Sys.time()
bbknn_time = c(bbknn_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(bbknn_time,n=1))
#run BBKNN with approximate neighbours
t1 = Sys.time()
bem = bbknn$bbknn_pca_matrix(pca = pca, batch_list = batch)
t2 = Sys.time()
annoy_time = c(annoy_time, as.numeric(difftime(t2,t1,units='secs')))
print(tail(annoy_time,n=1))
#write output, in case something goes south
fid = file('benchmark-times/harmony.txt')
writeLines(as.character(harmony_time),fid)
close(fid)
fid = file('benchmark-times/bbknn.txt')
writeLines(as.character(bbknn_time),fid)
close(fid)
fid = file('benchmark-times/annoy.txt')
writeLines(as.character(annoy_time),fid)
close(fid)
}
[1] 15
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 78 iterations
Harmony 4/10
Clustered for 22 iterations
Harmony 5/10
Clustered for 12 iterations Harmony converged after 5 iterations [1] 211.84 [1] 141.256 [1] 7.87409 [1] 16
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 81 iterations
Harmony 4/10
Clustered for 19 iterations Harmony converged after 4 iterations [1] 479.0176 [1] 522.1306 [1] 15.69402 [1] 17
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 75 iterations
Harmony 4/10
Clustered for 20 iterations
Harmony 5/10
Clustered for 12 iterations Harmony converged after 5 iterations [1] 1218.27 [1] 2538.578 [1] 31.73074 [1] 18
Harmony 1/10 Harmony 2/10 Harmony 3/10
Clustered for 71 iterations
Harmony 4/10
Clustered for 17 iterations Harmony converged after 4 iterations [1] 2484.623 [1] 13600.3 [1] 65.80895
For whatever reason, faiss just doesn't seem to get along with R at all - trying to rpy2 in some Splatter data makes it fail as well. As such, let's generate the PCAs to run faiss on from within this R notebook, and then import them into Python and feed them to faiss. In retrospect, I could have saved them as I made them while benchmarking previously, but I did not foresee faiss being quite this picky with regards to R and the benchmark itself sure took a while to run.
#Regenerate the PCA coordinate matrices for the python-side faiss
#benchmark, saving each one in numpy format (pca10.npy ... pca18.npy).
for (i in 10:18)
{
#simulate the same Splatter design as the large benchmark loop:
#500 genes, all differentially expressed, two batches of 2^i cells,
#two equally likely cell groups
sp = newSplatParams()
sp = setParam(sp, "nGenes", 500)
sp = setParam(sp, "de.prob", 1)
sp = setParam(sp, "batchCells", c(2^i,2^i))
sp = setParam(sp, "group.prob", c(0.5,0.5))
simulated = splatSimulate(sp, method="groups", verbose=FALSE)
#50-component PCA via irlba; the right singular vectors ($v) are the
#per-cell coordinates, as cells are the columns of the count matrix
decomp = irlba(A=counts(simulated),nv=50)
cell_pca = decomp$v
#save as .npy so the python notebook can load it and feed it to faiss
npySave(paste0('pca',i,'.npy'),cell_pca)
}