import matplotlib.pyplot as pl import numpy as np import pandas as pd import scanpy as sc from time import time import scvelo as scv scv.logging.print_versions() scv.settings.set_figure_params(style='scvelo') scv.settings.verbosity = 0 def run_scv(adata, mode='steady_state'): scv.pp.filter_and_normalize(adata, min_counts=20, min_counts_u=10, n_top_genes=1000) scv.pp.moments(adata, method='hnsw', n_pcs=30, n_neighbors=30) if mode is 'dynamical': scv.tl.velocity(adata, mode="steady_state", filter_genes=True) scv.tl.recover_dynamics(adata) scv.tl.velocity(adata, mode=mode) else: scv.tl.velocity(adata, mode=mode, filter_genes=True) try: scv.tl.velocity_graph(adata) scv.tl.velocity_embedding(adata) except: print("neighbor graph corrupted") def run_vcy(vlm): vlm.score_detection_levels(min_expr_counts=20, min_expr_counts_U=10) vlm.filter_genes(by_detection_levels=True) if vlm.S.shape[0] > 1000: vlm.score_cv_vs_mean(1000) vlm.filter_genes(by_cv_vs_mean=True) vlm._normalize_S(relative_size=vlm.initial_cell_size, target_size=np.median(vlm.initial_cell_size)) vlm._normalize_U(relative_size=vlm.initial_Ucell_size, target_size=np.median(vlm.initial_Ucell_size)) vlm.S_norm = np.log1p(vlm.S_sz) vlm.perform_PCA(n_components=30) vlm.knn_imputation(n_pca_dims=30, k=30) vlm.normalize_median() vlm.fit_gammas() vlm.filter_genes_good_fit() vlm.predict_U() vlm.calculate_velocity() vlm.calculate_shift() vlm.extrapolate_cell_at_t() vlm.estimate_transition_prob() vlm.calculate_embedding_shift() vlm.calculate_grid_arrows() adata_ref = scv.read('data/pancreas/endocrinogenesis.h5ad') test = False #scvelo's steady-state and stochastic model np.random.seed(42) n_cells = np.array([1e3, 5e3, 10e3, 15e3, 20e3, 25919, 30e3, 35e3, 40e3, 50e3, 60e3], dtype=int) if test: n_cells = np.array([.5e3, 1e3], dtype=int) scv_stoch_time, scv_time = [], [] for n in n_cells: indices = np.random.choice(adata_ref.n_obs, n) adata = adata_ref[indices] adata.obs_names_make_unique() scv.utils.cleanup(adata, clean='all') adata_run = adata.copy() t_init = time() run_scv(adata_run) scv_time.extend([(time() - t_init) / 60]) adata_run = adata.copy() t_init = time() run_scv(adata_run, mode='stochastic') scv_stoch_time.extend([(time() - t_init) / 60]) np.save('data/runtime/scv_time' + str(n), scv_time[-1]) np.save('data/runtime/scv_stoch_time' + str(n), scv_stoch_time[-1]) scv_time = np.array(scv_time) scv_stoch_time = np.array(scv_stoch_time) np.save('data/runtime/n_cells', n_cells) np.save('data/runtime/scv_time', scv_time) np.save('data/runtime/scv_stoch_time', scv_stoch_time) #velocyto's steady-state model np.random.seed(42) n_cells = np.array([1e3, 5e3, 10e3, 15e3, 20e3, 25919, 30e3, 35e3, 40e3, 50e3, 60e3], dtype=int) # run out of 64GB RAM at 40k cells if test: n_cells = np.array([.5e3, 1e3], dtype=int) vcy_time = [] for n in n_cells: indices = np.random.choice(adata_ref.n_obs, n) adata = adata_ref[indices] adata.obs_names_make_unique() scv.utils.cleanup(adata, clean='all') adata_run = adata.copy() vlm = scv.utils.convert_to_loom(adata_run, basis='umap') t_init = time() run_vcy(vlm) vcy_time.extend([(time() - t_init) / 60]) np.save('data/runtime/vcy_time' + str(n), vcy_time[-1]) vcy_time = np.array(vcy_time) np.save('data/runtime/vcy_time', vcy_time) vcy_time = np.array(vcy_time) np.save('data/runtime/vcy_time', vcy_time) #scvelo's dynamical model np.random.seed(42) n_dyn_cells = np.array([1e3, 10e3, 20e3, 25919, 30e3, 40e3, 60e3], dtype=int) if test: n_dyn_cells = np.array([.5e3, 1e3], dtype=int) scv_dyn_time = [] for n in n_dyn_cells: indices = np.random.choice(adata_ref.n_obs, n) adata = adata_ref[indices] adata.obs_names_make_unique() scv.utils.cleanup(adata, clean='all') adata_run = adata.copy() t_init = time() run_scv(adata_run, mode='dynamical') scv_dyn_time.extend([(time() - t_init) / 60]) np.save('data/runtime/scv_dyn_time' + str(n), scv_dyn_time[-1]) scv_dyn_time = np.array(scv_dyn_time) np.save('data/runtime/n_dyn_cells', n_dyn_cells) np.save('data/runtime/scv_dyn_time', scv_dyn_time) #scvelo's steady-state and stochastic model second run with large cell numbers np.random.seed(2020) n_large_cells = np.array([100e3, 150e3, 200e3, 250e3, 300e3], dtype=int) if test: n_large_cells = np.array([.5e3, 1e3], dtype=int) scv_stoch_large_time, scv_large_time = [], [] for n in n_large_cells: indices = np.random.choice(adata_ref.n_obs, n) adata = adata_ref[indices] adata.obs_names_make_unique() scv.utils.cleanup(adata, clean='all') adata_run = adata.copy() t_init = time() run_scv(adata_run) scv_large_time.extend([(time() - t_init) / 60]) adata_run = adata.copy() t_init = time() run_scv(adata_run, mode='stochastic') scv_stoch_large_time.extend([(time() - t_init) / 60]) np.save('data/runtime/scv_large_time' + str(n), scv_large_time[-1]) np.save('data/runtime/scv_stoch_large_time' + str(n), scv_stoch_large_time[-1]) scv_large_time = np.array(scv_large_time) scv_stoch_large_time = np.array(scv_stoch_large_time) np.save('data/runtime/n_large_cells', n_large_cells) np.save('data/runtime/scv_large_time', scv_large_time) np.save('data/runtime/scv_stoch_large_time', scv_stoch_large_time) n_cells = scv.load('data/runtime/n_cells.npy') scv_time = scv.load('data/runtime/scv_time.npy') scv_stoch_time = scv.load('data/runtime/scv_stoch_time.npy') vcy_time = scv.load('data/runtime/vcy_time.npy') n_dyn_cells = scv.load('data/runtime/n_dyn_cells.npy') scv_dyn_time = scv.load('data/runtime/scv_dyn_time.npy') n_large_cells = scv.load('data/runtime/n_large_cells.npy') scv_large_time = scv.load('data/runtime/scv_large_time.npy') scv_stoch_large_time = scv.load('data/runtime/scv_stoch_large_time.npy') vcy_time_ext = np.hstack([vcy_time, np.ones(len(n_cells) - len(vcy_time))*np.nan]) pd.DataFrame(np.vstack([n_cells, scv_time, scv_stoch_time, vcy_time_ext]).T.round(2), columns=['n_cells', 'scvelo steady-state', 'scvelo stochastic', 'velocyto steady-state']) kwargs = {'show': False, 'fontsize': 20, 'size': 100, 'linewidth': 3} labels = ['velocyto steady-state', 'scVelo steady-state', 'scVelo stochastic', 'scVelo dynamical'] ax = scv.pl.scatter(x=n_cells[:len(vcy_time)], y=vcy_time, **kwargs, show_polyfit=5, label=labels[0]) ax = scv.pl.scatter(x=n_cells, y=scv_time, color='lightblue', ax=ax, **kwargs, show_polyfit=True, label=labels[1]) ax = scv.pl.scatter(x=n_cells, y=scv_stoch_time, ax=ax, color='royalblue', **kwargs, show_polyfit=True, label=labels[2]) ax = scv.pl.scatter(x=n_dyn_cells, y=scv_dyn_time, ax=ax, color='purple', **kwargs, show_polyfit=True, label=labels[3]) pl.legend(fontsize=14) pl.xlabel('# cells', fontsize=20); pl.ylabel('runtime (min)', fontsize=20); from scipy.optimize import curve_fit def expo(x, a, b, c): o = 1 if np.isscalar(x) else np.ones(len(x)) return a * x ** b + c * o def fit(x, y, n_extrapol=None): def do_fit(function, x, y): popt, pcov = curve_fit(eval(function), x, y, maxfev=10000) print('complexity: ', popt[1]) #print('Standard error fit: ', np.sqrt(np.diag(pcov))) return popt, pcov ax=pl.figure(dpi=50).gca() x = np.array(x) ax.plot(x, y) popt, pcov = do_fit('expo', x, y) ax.plot(x, expo(x, popt[0], popt[1], popt[2])) pl.legend(['Data', 'fit']) pl.show() if n_extrapol is not None: return expo(n_extrapol, popt[0], popt[1], popt[2]) fit(n_cells, scv_time) fit(n_dyn_cells, scv_dyn_time) fit(n_cells[:len(vcy_time)], vcy_time)