In this notebook I will generate cell types (e.g. NK cells) from hierarchical clustering of the downsampled data and transfer these cell type categories to the original data. I will then save two TSV files with cell types, called Plasma_CT.txt and PMA_CT.txt.
import numpy as np
import pandas as pd
from clustergrammer_widget import *
# create the single Network object that is reused for every load, cluster,
# and visualization step in this notebook
net = Network(clustergrammer_widget)
# load plasma.txt original data (e.g. not normalized, etc)
net.load_file('../cytof_data/Plasma_clean.txt')
# keep the original Plasma matrix as a DataFrame; its row tuples get a
# cell-type entry appended further down before it is written back out
df_plasma = net.export_df()
# load pma.txt original data
net.load_file('../cytof_data/PMA_clean.txt')
# keep the original PMA matrix as a DataFrame (same later use as df_plasma)
df_pma = net.export_df()
# load plasma data
net.load_df(df_plasma)
# cluster based on surface markers only to identify cell types
net.filter_cat('col', 1, 'Marker-type: surface marker')
# first, normalize columns so that all markers are comparable
net.normalize(axis='col', norm_type='zscore', keep_orig=False)
# downsample to 1000 cell-clusters and save downsampled data to associate
# clusters with original data; ds_data_plasma is later indexed once per
# original row to find the downsampled row that cell was assigned to
ds_data_plasma = net.downsample(ds_type='kmeans', axis='row', num_samples=1000)
# clip z-scores since we do not care about extreme outliers
net.clip(-10,10)
# produce categories from dendrogram level 5
# NOTE(review): the first cluster() call presumably builds the dendrogram
# that dendro_cats reads from — confirm against the clustergrammer docs
net.cluster()
net.dendro_cats('row', dendro_level=5)
# cluster again now that the dendrogram categories have been added;
# views=[] presumably skips the extra filtered views — TODO confirm
net.cluster(views=[])
# export df with new cats for later use; the 'Group 5: cat-N' label is read
# from position 3 of each row tuple when cell types are assigned below
df_plasma_cat = net.export_df()
# set the colors of plasma and pma treated cells
net.set_cat_color('row', 1, 'Majority-Treatment: Plasma', 'blue')
net.set_cat_color('row', 1, 'Majority-Treatment: PMA', 'red')
# visualize
net.widget()
/Users/nickfernandez/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.py:1382: RuntimeWarning: init_size=300 should be larger than k=1000. Setting it to 3*k init_size=init_size)
# load pma data
net.load_df(df_pma)
# cluster based on surface markers only to identify cell types
net.filter_cat('col', 1, 'Marker-type: surface marker')
# first, normalize columns so that all markers are comparable
net.normalize(axis='col', norm_type='zscore', keep_orig=False)
# downsample to 1000 cell-clusters and save downsampled data to associate
# clusters with original data; ds_data_pma is later indexed once per
# original row to find the downsampled row that cell was assigned to
ds_data_pma = net.downsample(ds_type='kmeans', axis='row', num_samples=1000)
# clip z-scores since we do not care about extreme outliers
net.clip(-10,10)
# produce categories from dendrogram level 5
# NOTE(review): the first cluster() call presumably builds the dendrogram
# that dendro_cats reads from — confirm against the clustergrammer docs
net.cluster()
net.dendro_cats('row', dendro_level=5)
# cluster again now that the dendrogram categories have been added;
# views=[] presumably skips the extra filtered views — TODO confirm
net.cluster(views=[])
# export df with new cats for later use; the 'Group 5: cat-N' label is read
# from position 3 of each row tuple when cell types are assigned below
df_pma_cat = net.export_df()
# visualize
net.widget()
# Manual annotation of the level-5 dendrogram groups: for each treatment,
# map a 'Group 5: cat-N' row category to the cell-type category string that
# replaces it. Several PMA groups are assigned the same cell type.
cell_type = {
    'plasma': {
        'Group 5: cat-4': 'Cell Types: T cells',
        'Group 5: cat-3': 'Cell Types: CD8 T cells',
        'Group 5: cat-2': 'Cell Types: Monocytes and Granulocytes',
        'Group 5: cat-1': 'Cell Types: NK cells',
    },
    'pma': {
        'Group 5: cat-7': 'Cell Types: NK cells',
        'Group 5: cat-6': 'Cell Types: NK cells',
        'Group 5: cat-5': 'Cell Types: NK cells',
        'Group 5: cat-4': 'Cell Types: Monocytes and Granulocytes',
        'Group 5: cat-3': 'Cell Types: CD8 T cells',
        'Group 5: cat-2': 'Cell Types: T cells',
        'Group 5: cat-1': 'Cell Types: T cells',
    },
}
# replace these categories with cell type categories
# Each downsampled row is a tuple: element 0 is the cluster name and element 3
# is the 'Group 5: cat-N' dendrogram label. The label is looked up in
# cell_type['plasma'] and the row is rebuilt as
# (cluster name, treatment, cell type).
rows = df_plasma_cat.index.tolist()
new_rows = [
    (inst_row[0], 'Majority-Treatment: Plasma', cell_type['plasma'][inst_row[3]])
    for inst_row in rows
]
# Work on a copy: a plain assignment would alias df_plasma_cat, so setting the
# index below would also overwrite df_plasma_cat's row tuples and re-running
# this cell would then fail on inst_row[3] (the rows would only have 3 items).
df_plasma_CT = df_plasma_cat.copy()
df_plasma_CT.index = new_rows
# replace these categories with cell type categories
# Each downsampled row is a tuple: element 0 is the cluster name and element 3
# is the 'Group 5: cat-N' dendrogram label. The label is looked up in
# cell_type['pma'] and the row is rebuilt as
# (cluster name, treatment, cell type).
rows = df_pma_cat.index.tolist()
new_rows = [
    (inst_row[0], 'Majority-Treatment: PMA', cell_type['pma'][inst_row[3]])
    for inst_row in rows
]
# Work on a copy: a plain assignment would alias df_pma_cat, so setting the
# index below would also overwrite df_pma_cat's row tuples and re-running
# this cell would then fail on inst_row[3] (the rows would only have 3 items).
df_pma_CT = df_pma_cat.copy()
df_pma_CT.index = new_rows
# sanity check: show the first relabeled Plasma row tuple
# (the line after the expression is the recorded notebook output)
df_plasma_CT.index.tolist()[0]
('Cluster: cluster-0', 'Majority-Treatment: Plasma', 'Cell Types: T cells')
# sanity check: show the first relabeled PMA row tuple
# (the line after the expression is the recorded notebook output)
df_pma_CT.index.tolist()[0]
('Cluster: cluster-0', 'Majority-Treatment: PMA', 'Cell Types: T cells')
We'll verify that the categories are being set correctly, and see whether the downsampled versions of the Plasma and PMA data cluster similarly, by stacking the two datasets.
# stack the relabeled downsampled Plasma and PMA rows into one DataFrame
df_merge = pd.concat([df_plasma_CT, df_pma_CT])
# load the combined data into the shared Network object, replacing the PMA
# data that was loaded before
net.load_df(df_merge)
# cluster the combined rows; views=[] presumably skips the extra filtered
# views — TODO confirm against the clustergrammer docs
net.cluster(views=[])
# visualize
net.widget()
We see that the combined datasets form four large cell clusters that are primarily composed of a single cell type. We also see that Plasma and PMA cells are well mixed.
I will transfer the cell type data that was calculated from the downsampled data to the original data.
# Transfer the cell types from the downsampled clusters to the original rows.
# Entry i of each ds_list_* is used as the position, within the relabeled
# downsampled DataFrame, of the cluster that original row i belongs to.
ds_list_plasma = list(ds_data_plasma)
ds_list_pma = list(ds_data_pma)

# generate new Plasma rows: append the cluster's cell type (element 2 of the
# downsampled row tuple) to every original row tuple
rows = df_plasma.index.tolist()
rows_ds = df_plasma_CT.index.tolist()
new_rows = [
    orig_row + (rows_ds[ds_list_plasma[pos]][2],)
    for pos, orig_row in enumerate(rows)
]
df_plasma.index = new_rows

# generate new PMA rows in the same way
rows = df_pma.index.tolist()
rows_ds = df_pma_CT.index.tolist()
new_rows = [
    orig_row + (rows_ds[ds_list_pma[pos]][2],)
    for pos, orig_row in enumerate(rows)
]
df_pma.index = new_rows
# write the original Plasma data, whose rows now carry a cell-type entry,
# to a TSV file
net.load_df(df_plasma)
net.write_matrix_to_tsv('../cytof_data/Plasma_CT.txt')
# write the original PMA data, whose rows now carry a cell-type entry,
# to a TSV file
net.load_df(df_pma)
net.write_matrix_to_tsv('../cytof_data/PMA_CT.txt')
These TSVs with cell type categories will be used for later analysis.