#!/usr/bin/env python # coding: utf-8 # # 3.0 2,700 PBMC scRNA-seq # Single cell RNA-seq (scRNA-seq) is a powerful method to interrogate gene expression across thousands of single cells. This method provides thousands of measurements (single cells) across thousands of dimensions (genes). This notebook uses Clustergrammer2 to interactively explore an example dataset measuring the gene expression of 2,700 PBMCs obtained from [10X Genomics](https://www.10xgenomics.com/resources/datasets/). Bulg gene expression signatures of cell types from [CIBERSORT](https://cibersort.stanford.edu/) were used to obtain a tentative cell type for each cell. # In[20]: from clustergrammer2 import net df = {} # In[21]: from sklearn.metrics import f1_score import pandas as pd import numpy as np from copy import deepcopy import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import warnings warnings.filterwarnings('ignore') # In[22]: def calc_mean_var_disp(df_inst): mean_arr = [] var_arr = [] mean_names = [] for inst_gene in df_inst.index.tolist(): mean_arr.append( df_inst.loc[inst_gene].mean() ) var_arr.append(df_inst.loc[inst_gene].var()) mean_names.append(inst_gene) ser_mean = pd.Series(data=mean_arr, index=mean_names) ser_var = pd.Series(data=var_arr, index=mean_names) return ser_mean, ser_var # In[23]: def cell_umi_count(df): sum_arr = [] sum_names = [] for inst_cell in df: sum_arr.append( df[inst_cell].sum() ) sum_names.append(inst_cell) ser_sum = pd.Series(data=sum_arr, index=sum_names) return ser_sum # ### Load Data # In[24]: df = net.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/') df.shape # ### Remove Ribosomal and Mitochondrial Genes # In[25]: all_genes = df.index.tolist() print(len(all_genes)) keep_genes = [x for x in all_genes if 'RPL' not in x] keep_genes = [x for x in keep_genes if 'RPS' not in x] print(len(keep_genes)) df = df.loc[keep_genes] df.shape # Removing Mitochondrial Genes list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4'] all_genes = df.index.tolist() mito_genes = [x for x in all_genes if 'MT-' == x[:3] or x.split('_')[0] in list_mito_genes] print(mito_genes) keep_genes = [x for x in all_genes if x not in mito_genes] df = df.loc[keep_genes] # ### Keep top 5K Expressing Genes # In[26]: ser_mean, ser_var = calc_mean_var_disp(df) num_keep_mean = 5000 num_top_var = 250 # filter for top expressing genes keep_mean = ser_mean.sort_values(ascending=False)[:num_keep_mean].index.tolist() df = df.loc[keep_mean] df.shape # ### Find top 250 Variable Genes # In[27]: ser_keep_var = ser_var[keep_mean] # filter for top variance based keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var] len(keep_var) # ### UMI Normalize GEX Data # In[28]: get_ipython().run_cell_magic('time', '', 'ser_sum = cell_umi_count(df)\ndf = df.div(ser_sum)\nprint(df.shape)\nprint(df.sum().head())\n') # ### Find top expressing genes # In[29]: ser_keep_var = ser_var[keep_mean] # filter for top variance based keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var] # ### ArcSinh Transform and Z-score GEX Data # In[30]: # ArcSinh transform df = np.arcsinh(df/5) # Z-score genes net.load_df(df) net.normalize(axis='row', norm_type='zscore') # round to two decimal points df = net.export_df().round(2) print(df.shape) # In[48]: # df.columns = [(x, 'Cell Type: Unknown') for x in df.columns.tolist()] # # Unlabeled Cells # In[49]: net.load_df(df.loc[keep_var]) net.clip(lower=-5, upper=5) # net.manual_category(col='Cell Type') net.widget() # In[33]: # man_cat = net.get_manual_category('col', 'Cell Type') # man_cat['Cell Type'].value_counts() # ### Load CIBERSORT gene sigantures # In[34]: net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt') net.normalize(axis='row', norm_type='zscore') df_sig = net.export_df() print(df_sig.shape) rows = df_sig.index.tolist() new_rows = [x.split('_')[0] for x in rows] df_sig.index = new_rows # In[35]: ct_color = {} ct_color['T cells CD8'] = 'red' ct_color['T cells CD4 naive'] = 'blue' ct_color['T cells CD4 memory activated'] = 'blue' ct_color['T cells CD4 memory resting'] = '#87cefa' # sky blue ct_color['B cells naive'] = 'purple' ct_color['B cells memory'] = '#DA70D6' # orchid ct_color['NK cells activated'] = 'yellow' ct_color['NK cells resting'] = '#FCD116' # sign yellow ct_color['Monocytes'] = '#98ff98' # mint green ct_color['Macrophages M0'] = '#D3D3D3' # light grey ct_color['Macrophages M1'] = '#C0C0C0' # silver ct_color['Macrophages M2'] = '#A9A9A9' # dark grey ct_color['N.A.'] = 'white' # In[36]: def set_cat_colors(axis, cat_index, cat_title=False): for inst_ct in ct_color: if cat_title != False: cat_name = cat_title + ': ' + inst_ct else: cat_name = inst_ct inst_color = ct_color[inst_ct] net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color) # In[37]: set_cat_colors('col', 1) # In[38]: gene_sig = df_sig.idxmax(axis=1) gs_dict = {} for inst_gene in gene_sig.index.tolist(): gs_dict[inst_gene] = gene_sig[inst_gene][0] df_sig_cat = deepcopy(df_sig) rows = df_sig_cat.index.tolist() new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ] df_sig_cat.index = new_rows net.load_df(df_sig_cat) set_cat_colors('row', 1, 'Cell Type') # In[39]: net.load_df(df_sig_cat) net.clip(lower=-5, upper=5) net.widget() # # Predict Cell Types using CIBERSORT Signatures # In[40]: df_pred_cat, df_sig_sim, y_info = net.predict_cats_from_sigs(df, df_sig, predict_level='Cell Type', unknown_thresh=0.05) df.columns = df_pred_cat.columns.tolist() print(df_pred_cat.shape) # ### Cell Type Similarity # In[41]: df_sig_sim = df_sig_sim.round(2) net.load_df(df_sig_sim) set_cat_colors('col', 1, cat_title='Cell Type') set_cat_colors('row', 1) # In[42]: df_sig_sim.columns = df_pred_cat.columns.tolist() net.load_df(df_sig_sim) net.widget() # # Cells in CIBERSORT GEX Space # In[43]: rows = df_pred_cat.index.tolist() new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ] df_pred_cat.index = new_rows # In[44]: net.load_df(df_pred_cat) net.clip(lower=-5, upper=5) net.widget() # # Cells with CIBERSORT Predictions, Top Genes Based on Variance # In[45]: df = df.loc[keep_var] rows = df.index.tolist() new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ] df.index = new_rows # In[46]: net.load_df(df) net.clip(lower=-5, upper=5) net.load_df(net.export_df().round(2)) net.widget() # In[47]: get_ipython().system('mkdir ../jsons') net.save_dict_to_json(net.viz, '../jsons/pbmc_2700.json') # In[ ]: