Single cell RNA-seq (scRNA-seq) is a powerful method to interrogate gene expression across thousands of single cells. This method provides thousands of measurements (single cells) across thousands of dimensions (genes). This notebook uses Clustergrammer2 to interactively explore an example dataset measuring the gene expression of 2,700 PBMCs obtained from 10X Genomics. Bulg gene expression signatures of cell types from CIBERSORT were used to obtain a tentative cell type for each cell.
from clustergrammer2 import net
df = {}
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
def calc_mean_var_disp(df_inst):
mean_arr = []
var_arr = []
mean_names = []
for inst_gene in df_inst.index.tolist():
mean_arr.append( df_inst.loc[inst_gene].mean() )
var_arr.append(df_inst.loc[inst_gene].var())
mean_names.append(inst_gene)
ser_mean = pd.Series(data=mean_arr, index=mean_names)
ser_var = pd.Series(data=var_arr, index=mean_names)
return ser_mean, ser_var
def cell_umi_count(df):
sum_arr = []
sum_names = []
for inst_cell in df:
sum_arr.append( df[inst_cell].sum() )
sum_names.append(inst_cell)
ser_sum = pd.Series(data=sum_arr, index=sum_names)
return ser_sum
df = net.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df.shape
(32738, 2700)
all_genes = df.index.tolist()
print(len(all_genes))
keep_genes = [x for x in all_genes if 'RPL' not in x]
keep_genes = [x for x in keep_genes if 'RPS' not in x]
print(len(keep_genes))
df = df.loc[keep_genes]
df.shape
# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']
all_genes = df.index.tolist()
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or
x.split('_')[0] in list_mito_genes]
print(mito_genes)
keep_genes = [x for x in all_genes if x not in mito_genes]
df = df.loc[keep_genes]
32738 32546 ['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']
ser_mean, ser_var = calc_mean_var_disp(df)
num_keep_mean = 5000
num_top_var = 250
# filter for top expressing genes
keep_mean = ser_mean.sort_values(ascending=False)[:num_keep_mean].index.tolist()
df = df.loc[keep_mean]
df.shape
(5000, 2700)
ser_keep_var = ser_var[keep_mean]
# filter for top variance based
keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]
len(keep_var)
250
%%time
ser_sum = cell_umi_count(df)
df = df.div(ser_sum)
print(df.shape)
print(df.sum().head())
(5000, 2700) AAACATACAACCAC 1.0 AAACATTGAGCTAC 1.0 AAACATTGATCAGC 1.0 AAACCGTGCTTCCG 1.0 AAACCGTGTATGCG 1.0 dtype: float64 CPU times: user 889 ms, sys: 248 ms, total: 1.14 s Wall time: 779 ms
ser_keep_var = ser_var[keep_mean]
# filter for top variance based
keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]
# ArcSinh transform
df = np.arcsinh(df/5)
# Z-score genes
net.load_df(df)
net.normalize(axis='row', norm_type='zscore')
# round to two decimal points
df = net.export_df().round(2)
print(df.shape)
(5000, 2700)
# df.columns = [(x, 'Cell Type: Unknown') for x in df.columns.tolist()]
net.load_df(df.loc[keep_var])
net.clip(lower=-5, upper=5)
# net.manual_category(col='Cell Type')
net.widget()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-49-a4fbed862154> in <module>() ----> 1 net.load_df(df.loc[keep_var]) 2 net.clip(lower=-5, upper=5) 3 net.manual_category(col='Cell Type') 4 net.widget() ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key) 1476 1477 maybe_callable = com._apply_if_callable(key, self.obj) -> 1478 return self._getitem_axis(maybe_callable, axis=axis) 1479 1480 def _is_scalar_access(self, key): ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1899 raise ValueError('Cannot index with multidimensional key') 1900 -> 1901 return self._getitem_iterable(key, axis=axis) 1902 1903 # nested tuple slicing ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis) 1141 if labels.is_unique and Index(keyarr).is_unique: 1142 indexer = ax.get_indexer_for(key) -> 1143 self._validate_read_indexer(key, indexer, axis) 1144 1145 d = {axis: [ax.reindex(keyarr)[0], indexer]} ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis) 1204 raise KeyError( 1205 u"None of [{key}] are in the [{axis}]".format( -> 1206 key=key, axis=self.obj._get_axis_name(axis))) 1207 1208 # we skip the warning on Categorical/Interval KeyError: "None of [['FTL', 'FTH1', 'MALAT1', 'TMSB4X', 'B2M', 'LYZ', 'ACTB', 'S100A9', 'CD74', 'HLA-DRA', 'TMSB10', 'CST3', 'EEF1A1', 'S100A4', 'S100A8', 'TPT1', 'HLA-DPB1', 'PTMA', 'NKG7', 'GNLY', 'TYROBP', 'JUNB', 'GNB2L1', 'HLA-C', 'NACA', 'GAPDH', 'LTB', 'HLA-DPA1', 'OAZ1', 'PFN1', 'HLA-DRB1', 'HLA-A', 'COTL1', 'S100A6', 'ACTG1', 'FOS', 'SAT1', 'EIF1', 'LGALS1', 'LST1', 'CCL5', 'AIF1', 'VIM', 'H3F3B', 'SH3BGRL3', 'CYBA', 'FCER1G', 'UBA52', 'EEF1D', 'DUSP1', 'FAU', 'ARHGDIB', 'CFL1', 'HLA-B', 'CTSS', 'IGJ', 'FCN1', 'IFITM2', 'MYL6', 'BTG1', 'COX4I1', 'HLA-E', 'CD52', 'S100A11', 'IL32', 'YBX1', 'GZMB', 'SRGN', 'MYL12A', 'ARPC1B', 'ARPC3', 'PFDN5', 'JUN', 'CD37', 'BTF3', 'EEF1B2', 'PABPC1', 'PSAP', 'UBB', 'ANXA1', 'HNRNPA1', 'S100A10', 'NPC2', 'ATP5G2', 'PPBP', 'UBC', 'GLTSCR2', 'NPM1', 'SLC25A6', 'GSTP1', 'IGLL5', 'ARPC2', 'ZFP36', 'LDHB', 'EMP3', 'GPX1', 'RBM3', 'HLA-DRB5', 'LY6E', 'EIF3K', 'KLF6', 'EEF2', 'CORO1A', 'ISG15', 'ITM2B', 'EIF4A1', 'FXYD5', 'IFITM3', 'NEAT1', 'TYMP', 'LAPTM5', 'PSMA7', 'FCGR3A', 'TUBA1B', 'CLIC1', 'SERF2', 'LGALS2', 'CD48', 'TMEM66', 'HSP90AA1', 'HLA-DQA1', 'CD79A', 'SRSF5', 'GMFG', 'IER2', 'EIF3H', 'CD3D', 'DDX5', 'ATP6V0E1', 'TXNIP', 'PPIB', 'CXCR4', 'GIMAP7', 'HNRNPA2B1', 'PSME1', 'YWHAB', 'ARPC5', 'PYCARD', 'UBXN4', 'MYL12B', 'LIMD2', 'PTPRCAP', 'HMGB2', 'PRDX1', 'CIRBP', 'HMGB1', 'RAC2', 'TIMP1', 'CCL4', 'TALDO1', 'FUS', 'PNRC1', 'HLA-DQB1', 'ALDOA', 'SRP14', 'CEBPB', 'C1orf162', 'CALM1', 'TSC22D3', 'SLC25A5', 'PRELID1', 'HINT1', 'ENO1', 'ID2', 'GIMAP4', 'GIMAP5', 'SRSF7', 'NFKBIA', 'UBXN1', 'EIF1AY', 'CHCHD2', 'UBE2D3', 'CCL3', 'NAP1L1', 'CST7', 'SSR2', 'IFI6', 'HLA-DMA', 'VAMP8', 'CCNI', 'SNX3', 'PRF1', 'IL7R', 'CFD', 'SNRPB', 'C6orf48', 'PLAC8', 'RAB2A', 'GPSM3', 'ARL6IP5', 'ANXA2', 'CTSW', 'SF3B5', 'STK17A', 'VPS28', 'FYB', 'EDF1', 'ATP5L', 'LDHA', 'PSMB8', 'TRAF3IP3', 'SMARCA4', 'PPDPF', 'CAPZB', 'SELL', 'CD79B', 'AP2S1', 'SRSF3', 'CD3E', 'HSPA8', 'CNBP', 'ATP5D', 'C4orf3', 'NDUFA2', 'SH3BGRL', 'SDHB', 'GTF3A', 'PPIA', 'NDUFA4', 'ZFP36L2', 'RBM39', 'CD2', 'BRK1', 'PRDX3', 'RARRES3', 'PSME2', 'ATP5E', 'TPI1', 'RHOG', 'GZMA', 'SERP1', 'CCND3', 'PSMB9', 'AES', 'UBE2D2', 'KIF5B', 'RAN', 'H2AFZ', 'TOMM7', 'ATP5A1', 'EIF4A2', 'RAC1', 'ATP5O', 'DRAP1', 'NOSIP', 'PSMB6', 'ATP5H', 'TMBIM6', 'FGFBP2', 'PPA1']] are in the [index]"
# man_cat = net.get_manual_category('col', 'Cell Type')
# man_cat['Cell Type'].value_counts()
net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')
net.normalize(axis='row', norm_type='zscore')
df_sig = net.export_df()
print(df_sig.shape)
rows = df_sig.index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df_sig.index = new_rows
(523, 22)
ct_color = {}
ct_color['T cells CD8'] = 'red'
ct_color['T cells CD4 naive'] = 'blue'
ct_color['T cells CD4 memory activated'] = 'blue'
ct_color['T cells CD4 memory resting'] = '#87cefa' # sky blue
ct_color['B cells naive'] = 'purple'
ct_color['B cells memory'] = '#DA70D6' # orchid
ct_color['NK cells activated'] = 'yellow'
ct_color['NK cells resting'] = '#FCD116' # sign yellow
ct_color['Monocytes'] = '#98ff98' # mint green
ct_color['Macrophages M0'] = '#D3D3D3' # light grey
ct_color['Macrophages M1'] = '#C0C0C0' # silver
ct_color['Macrophages M2'] = '#A9A9A9' # dark grey
ct_color['N.A.'] = 'white'
def set_cat_colors(axis, cat_index, cat_title=False):
for inst_ct in ct_color:
if cat_title != False:
cat_name = cat_title + ': ' + inst_ct
else:
cat_name = inst_ct
inst_color = ct_color[inst_ct]
net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color)
set_cat_colors('col', 1)
gene_sig = df_sig.idxmax(axis=1)
gs_dict = {}
for inst_gene in gene_sig.index.tolist():
gs_dict[inst_gene] = gene_sig[inst_gene][0]
df_sig_cat = deepcopy(df_sig)
rows = df_sig_cat.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df_sig_cat.index = new_rows
net.load_df(df_sig_cat)
set_cat_colors('row', 1, 'Cell Type')
net.load_df(df_sig_cat)
net.clip(lower=-5, upper=5)
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "ABCB4", "ini": 523, "clust": 318, "rank": 341, "rankvar": 8, "…
df_pred_cat, df_sig_sim, y_info = net.predict_cats_from_sigs(df, df_sig,
predict_level='Cell Type', unknown_thresh=0.05)
df.columns = df_pred_cat.columns.tolist()
print(df_pred_cat.shape)
(188, 2700)
df_sig_sim = df_sig_sim.round(2)
net.load_df(df_sig_sim)
set_cat_colors('col', 1, cat_title='Cell Type')
set_cat_colors('row', 1)
df_sig_sim.columns = df_pred_cat.columns.tolist()
net.load_df(df_sig_sim)
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "B cells naive", "ini": 22, "clust": 10, "rank": 0, "rankvar": …
rows = df_pred_cat.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df_pred_cat.index = new_rows
net.load_df(df_pred_cat)
net.clip(lower=-5, upper=5)
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "C5AR1", "ini": 188, "clust": 129, "rank": 140, "rankvar": 33, …
df = df.loc[keep_var]
rows = df.index.tolist()
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df.index = new_rows
net.load_df(df)
net.clip(lower=-5, upper=5)
net.load_df(net.export_df().round(2))
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "FTL", "ini": 250, "clust": 236, "rank": 245, "rankvar": 247, "…
!mkdir ../jsons
net.save_dict_to_json(net.viz, '../jsons/pbmc_2700.json')
mkdir: ../jsons: File exists