In [1]:

import anndata
import numpy as np
import pandas as pd
import scplot as sp

Read in data. The data consists of 3K PBMCs from a healthy donor from 10x Genomics.

In [2]:

# Load the dataset from the h5ad file. `read_h5ad` is the explicit reader for this
# format; `anndata.read` is only a deprecated alias for it.
adata = anndata.read_h5ad('pbmc3k.h5ad')

Violin plot of QC metrics

In [3]:

# QC metrics with no grouping variable passed.
sp.violin(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
)

Out[3]:

Scatter plot matrix of QC metrics. You can optionally color the plot by cluster assignment.

In [4]:

# Scatter plot matrix of the QC metrics, colored by 'louvain'.
sp.scatter_matrix(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    color='louvain',
)

Out[4]:

Violin plot of QC metrics by cluster assignment

In [5]:

# Same QC metrics, grouped by 'louvain'; width, height and column count are set explicitly.
sp.violin(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    by='louvain',
    width=450,
    height=400,
    cols=2,
)

Out[5]:

Violin plot of gene expression by cluster

In [6]:

# Expression of two genes, grouped by 'louvain'.
sp.violin(
    adata,
    ['CST3', 'NKG7'],
    by='louvain',
    width=450,
    height=400,
)

Out[6]:

Embedding of gene expression and cluster assignments

In [7]:

# UMAP embedding drawn once per key: a gene (CST3) and the 'louvain' column.
sp.embedding(
    adata,
    basis=['umap'],
    keys=['CST3', 'louvain'],
)

Out[7]:

We can display the labels directly on the plot

In [8]:

# legend='data' is the option this notebook uses to draw the labels directly on the plot.
sp.embedding(
    adata,
    basis='umap',
    keys=['louvain'],
    legend='data',
    width=500,
)

Out[8]:

Link plots across multiple embeddings

In [9]:

# Two embeddings (UMAP and PCA) for the same keys, so the plots can be linked.
sp.embedding(
    adata,
    basis=['umap', 'pca'],
    keys=['CST3', 'louvain'],
    sort=False,
    brush_categorical=True,
)

Out[9]:

Visualize binned number of counts within each louvain cluster

In [10]:

# Bin n_counts into 10 equal-width intervals; labels=False stores the integer bin index.
adata.obs['binned_n_counts'] = pd.cut(
    adata.obs['n_counts'],
    bins=10,
    labels=False,
)

# You can optionally provide a color map
# NOTE(review): the keys here are strings while the bin indices above are integers;
# confirm which values composition_plot matches the cmap keys against.
cmap = {
    '0': '#e41a1c',
    '8': 'LightGrey',
    '9': 'black',
}
sp.composition_plot(
    adata,
    'louvain',
    'binned_n_counts',
    cmap=cmap,
    height=400,
)

Out[10]:

In [11]:

# Sort binned_n_counts by mean fraction across all louvain clusters
sp.composition_plot(
    adata,
    'louvain',
    'binned_n_counts',
    cmap=cmap,
    height=400,
    condition_sort_by='mean',
)

Out[11]:

Gene expression heatmap

In [12]:

# Marker genes shown in the heatmap below; the order of the list is kept as written.
marker_genes = [
    'IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ',
    'CD14', 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
    'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP',
]
sp.heatmap(
    adata,
    keys=marker_genes,
    by='louvain',
)

Out[12]:

Gene expression dotplot

In [13]:

# Dot plot of the marker genes, grouped by 'louvain'.
sp.dotplot(
    adata,
    keys=marker_genes,
    by='louvain',
)

Out[13]:

Scatter plot of FCGR3A versus MS4A7, colored by expression of CD14

In [14]:

# Gene-vs-gene scatter: FCGR3A on x, MS4A7 on y, points colored by CD14.
sp.scatter(
    adata,
    x='FCGR3A',
    y='MS4A7',
    color='CD14',
)

Out[14]:

Use the box select tool to select cells. After selection is complete, you can get the selected range.

In [15]:

# Keep the plot object in a variable: a later cell reads the selection bounds from it.
embedding_plot = sp.embedding(
    adata,
    basis='umap',
    keys=['CST3'],
)
# Bare last expression so the notebook displays the plot.
embedding_plot

Out[15]:

In [16]:

from IPython.display import display, clear_output
import ipywidgets as widgets  # NOTE(review): not used in this cell; kept in case other cells rely on `widgets`

# Clear this cell's output so a re-run after a new selection starts clean.
clear_output()
basis = 'umap'
# Embedding coordinates as stored on the plot's data frame.
x_coord = embedding_plot.df['X_{}1'.format(basis)]
y_coord = embedding_plot.df['X_{}2'.format(basis)]
# Box-select bounds of the first panel; indexed below as (x_min, y_min, x_max, y_max).
bounds = sp.get_bounds(embedding_plot[0,0])

if bounds is None:
    # Nothing has been box-selected (e.g. on Run All): say so instead of finishing silently.
    print('No selection found: use the box select tool on the plot above, then re-run this cell.')
else:
    selected_bounds = (
        (x_coord >= bounds[0]) & (x_coord <= bounds[2])
        & (y_coord >= bounds[1]) & (y_coord <= bounds[3])
    )
    # Index with a plain boolean array, exactly as the union below does, so both
    # selections are positional.
    # NOTE(review): assumes the rows of embedding_plot.df are in the same order as adata.obs - confirm.
    selected_adata = adata[selected_bounds.values]
    if selected_adata.shape[0] == 0:
        print('The selected region contains no cells.')
    else:
        print('{} cells selected'.format(selected_adata.shape[0]))
        # get union of selected cells and cells belonging to the clusters in the selection
        cluster_selection = adata.obs['louvain'].isin(selected_adata.obs['louvain'].unique())
        selected_adata = adata[selected_bounds.values | cluster_selection.values]
        print('{} clusters'.format(len(selected_adata.obs['louvain'].unique())))
        print('{} total cells'.format(selected_adata.shape[0]))
        # you can also run a tool here to generate a new embedding using the selected data only
        display(sp.embedding(selected_adata, basis=basis, keys=['CST3']))

In [17]:

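# Save plot to png
# NOTE: `hv` and `p` are not defined in this notebook; presumably this needs
# `import holoviews as hv` and `p` set to a plot object such as `embedding_plot` - confirm before enabling.
# hv.save(p, 'test.png')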

Duplicate cells to create a dataset with 5 million cells. We include only 3 genes to conserve memory. You can also open a large AnnData file in backed mode to load data on demand.

In [18]:

# Genes kept in the upsampled dataset.
genes_to_include = ['CST3', 'NKG7', 'PPBP']

# New AnnData built from the raw matrix restricted to those genes, a copy of obs,
# and a var frame indexed by the gene names.
upsampled_adata = anndata.AnnData(
    X=adata.raw[:, genes_to_include].X,
    obs=adata.obs.copy(),
    var=pd.DataFrame(index=genes_to_include),
)
upsampled_adata.obsm['X_umap'] = adata.obsm['X_umap']

# Repeat every row index 2000 times, so each cell appears 2000 times in a row.
upsampled_adata = upsampled_adata[
    np.repeat(np.arange(upsampled_adata.shape[0]), 2000)
]
"{:,} cells".format(upsampled_adata.shape[0])

Out[18]:

'5,276,000 cells'

In [19]:

# will automatically bin
sp.embedding(
    upsampled_adata,
    basis='umap',
    keys=['CST3', 'louvain'],
)

Out[19]:

In [20]:

# will automatically bin
sp.scatter(
    upsampled_adata,
    x='CST3',
    y='NKG7',
    color='PPBP',
)

Out[20]: