Here we will use hierarchical clustering and surface markers to identify cell types. First, we will downsample our Plasma and PMA data, then merge and cluster, then identify cell clusters by selecting a level of the dendrogram tree, finally label cell clusters based on surface marker composition and transfer these labels back to the single cell data.
import pandas as pd
import numpy as np
from clustergrammer_widget import *
net = Network(clustergrammer_widget)
Here we are loading the data and setting the cell type colors for consistent coloring across multiple heatmaps.
# load plasma.txt original data (e.g. not normalized, etc)
net.load_file('../cytof_data/Plasma_clean.txt')
df_plasma = net.export_df()
# load pma.txt original data
net.load_file('../cytof_data/PMA_clean.txt')
df_pma = net.export_df()
# manually set treatment colors
net.set_cat_color('col', 1, 'Marker-type: phospho marker', 'red')
net.set_cat_color('col', 1, 'Marker-type: surface marker', 'blue')
# manually set row colors: subsample
net.set_cat_color('row', 1, 'B cells', '#22316C')
net.set_cat_color('row', 1, 'Basophils', '#000033')
net.set_cat_color('row', 1, 'CD14hi monocytes', 'yellow')
net.set_cat_color('row', 1, 'CD14low monocytes', '#93b8bf')
net.set_cat_color('row', 1, 'CD1c DCs', '#3636e2')
net.set_cat_color('row', 1, 'CD4 Tcells', 'blue')
net.set_cat_color('row', 1, 'CD4 Tcells_CD127hi', '#FF6347')
net.set_cat_color('row', 1, 'CD4 Tcells CD161hi', '#F87531')
net.set_cat_color('row', 1, 'CD4 Tcells_Tregs', '#8B4513')
net.set_cat_color('row', 1, 'CD4 Tcells+CD27hi', '#330303')
net.set_cat_color('row', 1, 'CD8 Tcells', '#ffb247')
net.set_cat_color('row', 1, 'Neutrophils', 'purple')
net.set_cat_color('row', 1, 'NK cells_CD16hi', 'red')
net.set_cat_color('row', 1, 'NK cells_CD16hi_CD57hi', 'orange')
net.set_cat_color('row', 1, 'NK cells_CD56hi', '#e052e5')
net.set_cat_color('row', 1, 'Undefined', 'gray')
Here we will normalize the surface marker distributions (using Z-score) and downsample the Plasma and PMA cells (to obtain 1000 cell-clusters) using K-means clustering. We will keep track of which single cells are associated with each cluster using ds_data_plasma
and ds_data_pma
- this will be used to transfer labels back to the single cell data. We will store the downsampled datasets as ds_plasma
and ds_pma
.
net.load_df(df_plasma)
net.filter_cat('col', 1, 'Marker-type: surface marker')
net.normalize(axis='col', norm_type='zscore', keep_orig=False)
ds_data_plasma = net.downsample(ds_type='kmeans', axis='row', num_samples=1000)
net.clip(-10,10)
ds_plasma = net.export_df()
/Users/nickfernandez/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.py:1382: RuntimeWarning: init_size=300 should be larger than k=1000. Setting it to 3*k init_size=init_size)
net.load_df(df_pma)
net.filter_cat('col', 1, 'Marker-type: surface marker')
net.normalize(axis='col', norm_type='zscore', keep_orig=False)
ds_data_pma = net.downsample(ds_type='kmeans', axis='row', num_samples=1000)
net.clip(-10,10)
ds_pma = net.export_df()
Downsampled versions of the Plasma and PMA surface marker data will be merged and hierarchically clustered.
ds_merge = pd.concat([ds_plasma, ds_pma])
# drop number in clust from row cateogory
rows = ds_merge.index.tolist()
new_rows = []
for inst_row in rows:
inst_name = inst_row[0]
inst_cat = inst_row[1]
inst_tuple = (inst_name, inst_cat)
new_rows.append(inst_tuple)
ds_merge.index = new_rows
We will identify clusters of cells by selecting the clusters identified at level three of the dendrogram, which gives us 27 cell clusters. Below we use the dendro_cats
method to produce categories from level three of the dendrogram. These clusters can be seen under the 'Group 3' row category.
net.load_df(ds_merge)
net.cluster(views=[])
net.dendro_cats('row', dendro_level=3)
net.cluster(views=[])
net.widget()
Here we will average the data in each of the 27 clusters, which will give us the average surface marker level in each of the 27 clusters. This simplified view of the data was used by our collaborators at the Icahn School of Medicine Immune Core to manually assign cell types. These cell types will then be passed back to the single cell data below.
ds_merge_3 = net.export_df()
ds_merge_3.to_csv('../cytof_data/ds_merge_level_3.txt', sep='\t')
rows = ds_merge_3.index.tolist()
dendro_cats = []
for inst_row in rows:
dendro_cats.append(inst_row[2])
dendro_cats = sorted(list(set(dendro_cats)))
num_cats = len(dendro_cats)
print(num_cats)
27
mean_merge_3 = pd.DataFrame()
rows = []
for i in range(num_cats):
inst_cat = 'Group 3: cat-' + str(i + 1)
net.load_df(ds_merge_3)
net.filter_cat('row', 2, inst_cat)
tmp = net.export_df()
tmp_mean = tmp.mean(axis=0)
mean_merge_3 = pd.concat([mean_merge_3, tmp_mean], axis=1)
rows.append(inst_cat.replace('cat-', 'cluster-'))
# transpose
mean_merge_3 = mean_merge_3.transpose()
mean_merge_3.index = rows
mean_merge_3.to_csv('../cytof_data/mean_merge_3.txt', sep='\t')
mean_merge_3.shape
(27, 18)
The heatmap below visualizes the average surface marker levels in the 27 clusters defined by level-3 of the dendrogram.
net.load_df(mean_merge_3)
net.cluster(views=[])
net.widget()
Our collaborators at the Icahn School of Medicine Immune Core labeled cell cell clusters based on marker expression. Here we will load this data and transfer them back to the single cell data.
f = open('../cytof_data/dendro_level_3_labeled.txt')
lines = f.readlines()
f.close()
cell_cats = {}
for inst_line in lines:
inst_line = inst_line.strip().split('\t')
clust_num = inst_line[0].split(' ')[2]
cell_type = inst_line[1]
cell_cats[clust_num] = cell_type
Below we have added manually assigned labels to the average cluster data. This is the same data visualized in the previous heatmap, but now cell types have been assigned to each cluster.
rows = mean_merge_3.index.tolist()
new_rows = []
for inst_row in rows:
clust_num = inst_row.split('-')[1]
inst_type = cell_cats['cluster-'+str(clust_num)]
new_rows.append(inst_type)
mean_merge_3.index = new_rows
mean_merge_3.to_csv('../cytof_data/plasma_pma_dendro_avg_SM.txt', sep='\t')
net.load_df(mean_merge_3)
net.cluster()
net.widget()
Here we add cell type as a category to the downsampled data. This gives us a more detailed view of the global distribution of cell types.
rows = ds_merge_3.index.tolist()
new_rows = []
for inst_row in rows:
clust_num = inst_row[2].split('-')[1]
inst_type = cell_cats['cluster-'+str(clust_num)]
inst_tuple = (inst_row[0], inst_row[1], inst_type)
new_rows.append(inst_tuple)
ds_merge_3.index = new_rows
net.load_df(ds_merge_3)
net.cluster()
net.widget()
Finally, we will transfer the cell types to the original single cell data and save updated TSVs: Plasma_UCT.txt
and PMA_UCT.txt
.
rows = ds_merge_3.index.tolist()
plasma_ds_rows = rows[0:1000]
pma_ds_rows = rows[1000:]
print('number of downsampled clusters')
print(len(plasma_ds_rows))
print(len(pma_ds_rows))
print('\ndownsampled cluster mapping data')
print(len(list(set(ds_data_plasma))))
print(len(list(set(ds_data_pma))))
number of downsampled clusters 1000 1000 downsampled cluster mapping data 1000 1000
plasma_rows = df_plasma.index.tolist()
pma_rows = df_pma.index.tolist()
plasma_rows_cat = []
for i in range(len(plasma_rows)):
inst_row = list(plasma_rows[i])
clust_num = ds_data_plasma[i]
inst_type = plasma_ds_rows[clust_num][2]
inst_row.append(inst_type)
inst_tuple = tuple(inst_row)
plasma_rows_cat.append(inst_tuple)
pma_rows_cat = []
for i in range(len(pma_rows)):
inst_row = list(pma_rows[i])
clust_num = ds_data_pma[i]
inst_type = pma_ds_rows[clust_num][2]
inst_row.append(inst_type)
inst_tuple = tuple(inst_row)
pma_rows_cat.append(inst_tuple)
df_plasma.index = plasma_rows_cat
df_plasma.to_csv('../cytof_data/Plasma_UCT.txt', sep='\t')
df_pma.index = pma_rows_cat
df_pma.to_csv('../cytof_data/Pma_UCT.txt', sep='\t')
df_plasma.index.tolist()[0]
('Cell-1', 'Treatment: Plasma', 'CD8 Tcells')