1, Get data
2, PCA
3, T-SNE
4, K-means
5, Hierarchical clustering
Dezso Ribli
%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
figsize(8,8)
mpl.rcParams['font.size']=16
The data set is available here: https://datadryad.org/resource/doi:10.5061/dryad.2t41r64
%%bash
# wget https://datadryad.org/resource/doi:10.5061/dryad.2t41r64
# Load the measurement table from the local CSV file.
data = pd.read_csv('hurricane.csv') # renamed it
# Preview the first rows.
data.head()
ID | Hurricane | Origin | Sex | SVL | Femur | Tibia | Metatarsal | LongestToe | Humerus | ... | FingerArea2 | FingerArea3 | ToeArea1 | ToeArea2 | ToeArea3 | MeanFingerArea | MeanToeArea | SumFingers | SumToes | MaxFingerForce | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 537 | After | Pine Cay | Male | 48.69 | 10.39 | 11.87 | 7.52 | 7.43 | 8.66 | ... | 1.338 | 1.339 | 2.529 | 2.402 | 2.369 | 1.332667 | 2.433333 | 2.663 | 4.791 | 0.116 |
1 | 539 | After | Pine Cay | Female | 40.31 | 8.66 | 9.79 | 6.18 | 6.20 | 8.01 | ... | 0.950 | 0.972 | 1.498 | 1.525 | 1.530 | 0.961333 | 1.517667 | 2.595 | 3.678 | 0.048 |
2 | 540 | After | Pine Cay | Male | 58.30 | 12.87 | 14.76 | 9.45 | 9.58 | 11.72 | ... | 2.702 | 2.685 | 4.157 | 4.140 | 3.996 | 2.631333 | 4.097667 | 7.347 | 4.682 | 0.424 |
3 | 541 | After | Pine Cay | Female | 43.15 | 8.55 | 10.29 | 6.60 | 6.26 | 7.43 | ... | 1.175 | 1.186 | 1.898 | 1.871 | 1.867 | 1.177667 | 1.878667 | 2.786 | 5.378 | 0.171 |
4 | 542 | After | Pine Cay | Female | 45.51 | 10.26 | 11.02 | 6.89 | 7.02 | 7.71 | ... | 1.357 | 1.420 | 2.627 | 2.435 | 2.529 | 1.384333 | 2.530333 | 3.575 | 6.646 | 0.014 |
5 rows × 26 columns
# Transposed view: every attribute becomes a row, so all of them show up in the preview.
data.T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | 537 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | ... | WC61 | WC62 | WC63 | WC64 | WC65 | WC66 | WC69 | WC70 | WC71 | WC72 |
Hurricane | After | After | After | After | After | After | After | After | After | After | ... | Before | Before | Before | Before | Before | Before | Before | Before | Before | Before |
Origin | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | Pine Cay | ... | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay | Water Cay |
Sex | Male | Female | Male | Female | Female | Female | Male | Male | Female | Male | ... | Male | Male | Male | Female | Female | Female | Female | Female | Female | Female |
SVL | 48.69 | 40.31 | 58.3 | 43.15 | 45.51 | 46.97 | 52.88 | 57.01 | 43.17 | 54.2 | ... | 55.89 | 55.5 | 55.76 | 41.92 | 41.06 | 43.04 | 42.38 | 45.74 | 40.95 | 40.62 |
Femur | 10.39 | 8.66 | 12.87 | 8.55 | 10.26 | 10.02 | 12.74 | 11.87 | 9.99 | 11.32 | ... | 12.35 | 13.12 | 12.32 | 9.77 | 9.18 | 9.23 | 9.21 | 9.79 | 9.04 | 8.64 |
Tibia | 11.87 | 9.79 | 14.76 | 10.29 | 11.02 | 10.78 | 12.43 | 12.91 | 11.13 | 13.07 | ... | 13.73 | 14.26 | 14.32 | 10.09 | 10.13 | 9.96 | 9.8 | 10.08 | 10.08 | 9.77 |
Metatarsal | 7.52 | 6.18 | 9.45 | 6.6 | 6.89 | 6.85 | 7.9 | 8.24 | 6.88 | 7.77 | ... | 8.47 | 8.83 | 8.97 | 6.2 | 6.57 | 6.29 | 6.68 | 6.61 | 6.26 | 6.14 |
LongestToe | 7.43 | 6.2 | 9.58 | 6.26 | 7.02 | 7.18 | 8.23 | 8.02 | 6.7 | 7.7 | ... | 8.67 | 8.35 | 8.37 | 5.99 | 6.23 | 5.72 | 6.29 | 6.54 | 5.52 | 6.61 |
Humerus | 8.66 | 8.01 | 11.72 | 7.43 | 7.71 | 8.45 | 9.88 | 10.31 | 7.78 | 10.19 | ... | 10.33 | 10.89 | 9.94 | 7.78 | 7.42 | 7.12 | 7.38 | 8.05 | 7.27 | 6.91 |
Radius | 7.99 | 6.51 | 9.54 | 6.6 | 7.25 | 7.15 | 8.4 | 8.79 | 7.11 | 8.7 | ... | 9.02 | 9.38 | 9.11 | 6.69 | 6.3 | 6.73 | 6.62 | 7.23 | 6.66 | 6.38 |
Metacarpal | 2.22 | 2.38 | 3.54 | 2.79 | 2.52 | 2.39 | 3.15 | 3.18 | 2.82 | 3.05 | ... | 3.28 | 3.5 | 2.87 | 2.6 | 2.26 | 2.5 | 2.17 | 2.4 | 2.24 | 2.52 |
LongestFinger | 3.19 | 3.55 | 5.09 | 3.55 | 3.37 | 3.26 | 4.3 | 4.2 | 3.36 | 4.12 | ... | 4.47 | 4.36 | 4.3 | 3.2 | 2.6 | 3.07 | 3.28 | 3.41 | 3 | 2.94 |
FingerCount | 10 | 10 | 14 | 11 | 11 | 12 | 11 | 12 | 12 | 12 | ... | 12 | 12 | 12 | 9 | 10 | 10 | 10 | 11 | 10 | 11 |
ToeCount | 12 | 13 | 15 | 12 | 13 | 14 | 14 | 12 | 13 | 16 | ... | 14 | 14 | 15 | 11 | 13 | 12 | 12 | 13 | 12 | 13 |
FingerArea1 | 1.321 | 0.962 | 2.507 | 1.172 | 1.376 | 1.428 | 1.873 | 2.558 | 1.114 | 2.284 | ... | 1.835 | 1.796 | 1.807 | 0.919 | 0.797 | 1.015 | 0.912 | 1.136 | 0.782 | 0.827 |
FingerArea2 | 1.338 | 0.95 | 2.702 | 1.175 | 1.357 | 1.41 | 1.85 | 2.544 | 1.08 | 2.344 | ... | 1.914 | 1.794 | 1.825 | 0.913 | 0.77 | 1.008 | 0.92 | 1.119 | 0.793 | 0.842 |
FingerArea3 | 1.339 | 0.972 | 2.685 | 1.186 | 1.42 | 1.44 | 1.85 | 2.574 | 1.04 | 2.27 | ... | 1.901 | 1.766 | 1.764 | 0.931 | 0.782 | 1.031 | 0.894 | 1.146 | 0.777 | 0.845 |
ToeArea1 | 2.529 | 1.498 | 4.157 | 1.898 | 2.627 | 2.061 | 2.984 | 4.016 | 1.794 | 3.916 | ... | 2.897 | 3.269 | 2.793 | 1.383 | 1.372 | 1.17 | 1.542 | 1.719 | 1.227 | 1.122 |
ToeArea2 | 2.402 | 1.525 | 4.14 | 1.871 | 2.435 | 2.018 | 2.983 | 3.952 | 1.716 | 3.913 | ... | 2.916 | 3.325 | 2.746 | 1.331 | 1.345 | 1.127 | 1.526 | 1.716 | 1.234 | 1.203 |
ToeArea3 | 2.369 | 1.53 | 3.996 | 1.867 | 2.529 | 2.029 | 2.958 | 3.968 | 1.805 | 3.88 | ... | 2.869 | 3.258 | 2.748 | 1.365 | 1.342 | 1.148 | 1.527 | 1.703 | 1.252 | 1.116 |
MeanFingerArea | 1.33267 | 0.961333 | 2.63133 | 1.17767 | 1.38433 | 1.426 | 1.85767 | 2.55867 | 1.078 | 2.29933 | ... | 1.88333 | 1.78533 | 1.79867 | 0.921 | 0.783 | 1.018 | 0.908667 | 1.13367 | 0.784 | 0.838 |
MeanToeArea | 2.43333 | 1.51767 | 4.09767 | 1.87867 | 2.53033 | 2.036 | 2.975 | 3.97867 | 1.77167 | 3.903 | ... | 2.894 | 3.284 | 2.76233 | 1.35967 | 1.353 | 1.14833 | 1.53167 | 1.71267 | 1.23767 | 1.147 |
SumFingers | 2.663 | 2.595 | 7.347 | 2.786 | 3.575 | 3.829 | 5.453 | 7.812 | 2.68 | 6.907 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
SumToes | 4.791 | 3.678 | 4.682 | 5.378 | 6.646 | 5.771 | 8.427 | 9.427 | 4.243 | 10.913 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
MaxFingerForce | 0.116 | 0.048 | 0.424 | 0.171 | 0.014 | 0.267 | 0.356 | 0.191 | 0.191 | 0.151 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
26 rows × 164 columns
Some attributes were only measured after the hurricane, and one data point has a lot of missing values.
# Map of missing values: one image row per attribute, one image column per sample.
imshow(data.isnull().values.T)
<matplotlib.image.AxesImage at 0x1a171a83d0>
# Look at the record at position 39, the one with many missing measurements.
data.iloc[39,:]
ID 588 Hurricane After Origin Pine Cay Sex Male SVL 47.44 Femur 10.32 Tibia 11.6 Metatarsal 7.73 LongestToe 6.66 Humerus 8.5 Radius 7.56 Metacarpal 2.35 LongestFinger 3.65 FingerCount NaN ToeCount NaN FingerArea1 NaN FingerArea2 NaN FingerArea3 NaN ToeArea1 NaN ToeArea2 NaN ToeArea3 NaN MeanFingerArea NaN MeanToeArea NaN SumFingers NaN SumToes NaN MaxFingerForce 0.274 Name: 39, dtype: object
# Remove the single record (row label 39) that has lots of missing values.
data = data.drop([39])

# Keep only the numerical body measurements: leave out the identifier/label
# columns as well as the columns that were not measured for every sample.
label_cols = ['ID', 'Hurricane', 'Origin', 'Sex']
unmeasured_cols = ['SumFingers', 'SumToes', 'MaxFingerForce']
data_num = data.drop(label_cols + unmeasured_cols, axis=1)

# Re-draw the missing-value map for the cleaned numeric table.
imshow(data_num.isnull().values.T)
<matplotlib.image.AxesImage at 0x1a1ab76b10>
# Standardise every feature before PCA.
scaler = StandardScaler()
X = scaler.fit_transform(data_num)

# Fit PCA with all components and project the samples onto them.
pca = PCA()
X_pca = pca.fit_transform(X)

# Scatter of the first two principal components.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plot(pc1, pc2, 'o')
xlabel('PCA 1')
ylabel('PCA 2')
show()
# First two principal components, split by sex.
male_idx = data['Sex'] == 'Male'
for grp_mask, grp_marker, grp_label in [(male_idx, 'o', 'Male'),
                                        (~male_idx, 'x', 'Female')]:
    plot(X_pca[grp_mask, 0], X_pca[grp_mask, 1], grp_marker, label=grp_label)
xlabel('PCA 1')
ylabel('PCA 2')
legend()
<matplotlib.legend.Legend at 0x1a1aec2dd0>
# Second vs. third principal component for all samples.
pc2, pc3 = X_pca[:, 1], X_pca[:, 2]
plot(pc2, pc3, 'o')
xlabel('PCA 2')
ylabel('PCA 3')
Text(0,0.5,'PCA 3')
# Second vs. third principal component, split by island of origin.
orig_idx = data['Origin'] == 'Pine Cay'
for grp_mask, grp_marker, grp_label in [(orig_idx, 'o', 'Pine Cay'),
                                        (~orig_idx, 'x', 'Water Cay')]:
    plot(X_pca[grp_mask, 1], X_pca[grp_mask, 2], grp_marker, label=grp_label)
xlabel('PCA 2')
ylabel('PCA 3')
legend()
<matplotlib.legend.Legend at 0x1a1ae5c710>
# Second vs. third principal component, split by before/after the hurricane.
hurr_idx = data['Hurricane'] == 'After'
for grp_mask, grp_marker, grp_label in [(hurr_idx, 'o', 'After hurricane'),
                                        (~hurr_idx, 'x', 'Before hurricane')]:
    plot(X_pca[grp_mask, 1], X_pca[grp_mask, 2], grp_marker, label=grp_label)
xlabel('PCA 2')
ylabel('PCA 3')
legend()
<matplotlib.legend.Legend at 0x1a1b0c48d0>
# First vs. third principal component, split by before/after the hurricane.
hurr_idx = data['Hurricane'] == 'After'
after_pts = X_pca[hurr_idx]
before_pts = X_pca[~hurr_idx]
plot(after_pts[:, 0], after_pts[:, 2], 'o', label='After hurricane')
plot(before_pts[:, 0], before_pts[:, 2], 'x', label='Before hurricane')
xlabel('PCA 1')
ylabel('PCA 3')
legend()
<matplotlib.legend.Legend at 0x1a1b32ca50>
# Loadings of the first principal component, paired with the feature names.
# list(...) keeps the pairs visible on Python 3, where zip() is lazy.
list(zip(data_num.columns, pca.components_[0]))
[('SVL', 0.2396699068873259), ('Femur', 0.22787727077890135), ('Tibia', 0.23911059988009495), ('Metatarsal', 0.2366058325405837), ('LongestToe', 0.22661416394099954), ('Humerus', 0.23583888230278138), ('Radius', 0.24081352224425923), ('Metacarpal', 0.20109839039008381), ('LongestFinger', 0.22255726865127848), ('FingerCount', 0.17064296922266192), ('ToeCount', 0.17997059568328241), ('FingerArea1', 0.23971929539505535), ('FingerArea2', 0.23710651531690163), ('FingerArea3', 0.23914791932390145), ('ToeArea1', 0.24113144273838197), ('ToeArea2', 0.24130280317578753), ('ToeArea3', 0.24108709961903993), ('MeanFingerArea', 0.23949943955814071), ('MeanToeArea', 0.241448303304203)]
# Loadings of the second principal component, paired with the feature names.
# list(...) keeps the pairs visible on Python 3, where zip() is lazy.
list(zip(data_num.columns, pca.components_[1]))
[('SVL', -0.1264116423487418), ('Femur', -0.087180321654185419), ('Tibia', -0.034287079581090056), ('Metatarsal', -0.017872575956949147), ('LongestToe', -0.079592633212735089), ('Humerus', -0.062324471858596381), ('Radius', -0.071317747466594902), ('Metacarpal', -0.003123450078838077), ('LongestFinger', 0.049457065395697442), ('FingerCount', 0.71832636566670749), ('ToeCount', 0.61730112255325165), ('FingerArea1', -0.0052252564903880914), ('FingerArea2', -0.03794270634308515), ('FingerArea3', -0.0039468185410039503), ('ToeArea1', -0.1220862932300881), ('ToeArea2', -0.11718960228925009), ('ToeArea3', -0.12297391062949543), ('MeanFingerArea', -0.015894230241668198), ('MeanToeArea', -0.12088714442673387)]
# Loadings of the third principal component, paired with the feature names.
# list(...) keeps the pairs visible on Python 3, where zip() is lazy.
list(zip(data_num.columns, pca.components_[2]))
[('SVL', -0.10031708144735453), ('Femur', -0.34801099979186972), ('Tibia', -0.23757229624192613), ('Metatarsal', -0.28738164612456424), ('LongestToe', -0.3287004624442737), ('Humerus', -0.11076157225672248), ('Radius', -0.16281208805504277), ('Metacarpal', 0.0099190929537464369), ('LongestFinger', -0.1959173293427677), ('FingerCount', 0.1537247004016587), ('ToeCount', -0.20524728861177247), ('FingerArea1', 0.30879125133348778), ('FingerArea2', 0.34186501685373183), ('FingerArea3', 0.33016763335354538), ('ToeArea1', 0.11254794694959888), ('ToeArea2', 0.11285274974123567), ('ToeArea3', 0.10911972833480814), ('MeanFingerArea', 0.32822325347183928), ('MeanToeArea', 0.11163157253758042)]
from sklearn.manifold import TSNE

# t-SNE embedding (default settings) of the standardised measurements.
tsne = TSNE()
X_tsne = tsne.fit_transform(X)

tsne_1, tsne_2 = X_tsne[:, 0], X_tsne[:, 1]
plot(tsne_1, tsne_2, 'o')
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
# t-SNE embedding, split by sex.
male_idx = data['Sex'] == 'Male'
for grp_mask, grp_marker, grp_label in [(male_idx, 'o', 'Male'),
                                        (~male_idx, 'x', 'Female')]:
    plot(X_tsne[grp_mask, 0], X_tsne[grp_mask, 1], grp_marker, label=grp_label)
legend()
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
# t-SNE embedding, split by before/after the hurricane.
after_pts = X_tsne[hurr_idx]
before_pts = X_tsne[~hurr_idx]
plot(after_pts[:, 0], after_pts[:, 1], 'o', label='After hurricane')
plot(before_pts[:, 0], before_pts[:, 1], 'x', label='Before hurricane')
legend()
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
# t-SNE embedding, split by island of origin.
for grp_mask, grp_marker, grp_label in [(orig_idx, 'o', 'Pine Cay'),
                                        (~orig_idx, 'x', 'Water Cay')]:
    plot(X_tsne[grp_mask, 0], X_tsne[grp_mask, 1], grp_marker, label=grp_label)
legend()
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
# Run t-SNE with three different random seeds and draw the embeddings side by
# side (shared axes) to compare the results across seeds.
figsize(16,4)
f, a = subplots(1, 3, figsize=(16, 4), sharey=True, sharex=True)
for ax, seed in zip(a, [0, 1, 2]):
    plt.sca(ax)
    tsne = TSNE(random_state=seed)
    X_tsne = tsne.fit_transform(X)
    plot(X_tsne[:, 0], X_tsne[:, 1], 'o')
    xlabel('T-SNE 1')
    ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
figsize(9,9)

# Three-component t-SNE embedding; components 1 and 2 split by sex.
tsne = TSNE(n_components=3)
X_tsne = tsne.fit_transform(X)
for grp_mask, grp_marker, grp_label in [(male_idx, 'o', 'Male'),
                                        (~male_idx, 'x', 'Female')]:
    plot(X_tsne[grp_mask, 0], X_tsne[grp_mask, 1], grp_marker, label=grp_label)
legend()
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 2')
# Components 2 and 3 of the three-component t-SNE embedding, split by sex.
male_pts = X_tsne[male_idx]
female_pts = X_tsne[~male_idx]
plot(male_pts[:, 1], male_pts[:, 2], 'o', label='Male')
plot(female_pts[:, 1], female_pts[:, 2], 'x', label='Female')
legend()
xlabel('T-SNE 2')
ylabel('T-SNE 3')
Text(0,0.5,'T-SNE 3')
# Components 1 and 2 of the three-component t-SNE embedding, split by
# before/after the hurricane.
plot(X_tsne[hurr_idx,0],X_tsne[hurr_idx,1],'o', label='After hurricane')
# ~hurr_idx selects the samples taken before the hurricane.
plot(X_tsne[~hurr_idx,0],X_tsne[~hurr_idx,1],'x',label='Before hurricane')
legend()
# Columns 0 and 1 are plotted, i.e. the first and second t-SNE components.
xlabel('T-SNE 1')
ylabel('T-SNE 2')
Text(0,0.5,'T-SNE 3')
# Components 2 and 3 of the three-component t-SNE embedding, split by
# before/after the hurricane.
plot(X_tsne[hurr_idx,1],X_tsne[hurr_idx,2],'o', label='After hurricane')
# ~hurr_idx selects the samples taken before the hurricane.
plot(X_tsne[~hurr_idx,1],X_tsne[~hurr_idx,2],'x',label='Before hurricane')
legend()
xlabel('T-SNE 2')
ylabel('T-SNE 3')
Text(0,0.5,'T-SNE 3')
# Split the standardised measurements into two k-means clusters and measure
# how well the cluster assignment agrees with sex.
kmeans = KMeans(n_clusters=2)
acc = (male_idx.astype('int')==kmeans.fit_predict(X)).astype('int').mean()
# Cluster ids are arbitrary: if the agreement is below 50%, the labels are
# simply swapped, so report the complementary fraction.
if acc<0.5:
    acc = 1 -acc
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print('K-means correctly categorized gender: %.3f'%acc)
K-means correctly categorized gender: 0.957
# Same check as on the raw features, but cluster the two-component t-SNE
# embedding instead.
kmeans = KMeans(n_clusters=2)
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
acc = (male_idx.astype('int')==kmeans.fit_predict(X_tsne)).astype('int').mean()
# Cluster ids are arbitrary: if the agreement is below 50%, the labels are
# simply swapped, so report the complementary fraction.
if acc<0.5:
    acc = 1 -acc
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print('K-means correctly categorized gender: %.3f'%acc)
K-means correctly categorized gender: 0.963
# K-means with three clusters on the standardised measurements.
# The cluster count is defined once so the plotting loop cannot drift from it.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters)
x_kmeans = kmeans.fit_predict(X)

figsize(14,6)

# Left panel: cluster membership on the first two principal components.
subplot(1,2,1)
hurr_idx = data['Hurricane'] == 'After'
for i in range(n_clusters):
    in_cluster = x_kmeans == i
    plot(X_pca[in_cluster,0], X_pca[in_cluster,1], 'o', label=str(i)+'. cluster')
xlabel('PCA 1')
ylabel('PCA 2')
legend()

# Right panel: before/after hurricane on principal components 1 and 3.
subplot(1,2,2)
plot(X_pca[hurr_idx,0],X_pca[hurr_idx,2],'o', label='After hurricane')
plot(X_pca[~hurr_idx,0],X_pca[~hurr_idx,2],'x', label='Before hurricane')
xlabel('PCA 1')
ylabel('PCA 3')
legend()
<matplotlib.legend.Legend at 0x1a1cdcd410>
# K-means with four clusters on the standardised measurements.
# The cluster count is defined once so the plotting loop cannot drift from it.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
x_kmeans = kmeans.fit_predict(X)

figsize(14,6)

# Left panel: cluster membership on the first two principal components.
subplot(1,2,1)
hurr_idx = data['Hurricane'] == 'After'
for i in range(n_clusters):
    in_cluster = x_kmeans == i
    plot(X_pca[in_cluster,0], X_pca[in_cluster,1], 'o', label=str(i)+'. cluster')
xlabel('PCA 1')
ylabel('PCA 2')
legend()

# Right panel: before/after hurricane on principal components 1 and 3.
subplot(1,2,2)
plot(X_pca[hurr_idx,0],X_pca[hurr_idx,2],'o', label='After hurricane')
plot(X_pca[~hurr_idx,0],X_pca[~hurr_idx,2],'x', label='Before hurricane')
xlabel('PCA 1')
ylabel('PCA 3')
legend()
<matplotlib.legend.Legend at 0x1a1d08f410>
# Hierarchically clustered heatmap of the numeric measurements (z_score=1 requested).
sns.clustermap(data_num, z_score=1, cmap='coolwarm')
<seaborn.matrix.ClusterGrid at 0x1a1c83f2d0>
Male is black
# Row colour strip encodes sex: 'k' (black) for male, 'w' (white) for female.
lut = {'Male': 'k', 'Female': 'w'}
rc = data['Sex'].map(lut)
sns.clustermap(data_num, z_score=1, row_colors=rc, cmap='coolwarm')
<seaborn.matrix.ClusterGrid at 0x1a1d6ae290>
Pine Cay is black
# Row colour strip encodes origin: 'k' (black) for Pine Cay, 'w' (white) for Water Cay.
lut = {'Pine Cay': 'k', 'Water Cay': 'w'}
rc = data['Origin'].map(lut)
sns.clustermap(data_num, z_score=1, row_colors=rc, cmap='coolwarm')
<seaborn.matrix.ClusterGrid at 0x1a1da92150>
After hurricane is black
# Row colour strip encodes timing: 'k' (black) for after, 'w' (white) for before the hurricane.
lut = {'After': 'k', 'Before': 'w'}
rc = data['Hurricane'].map(lut)
sns.clustermap(data_num, z_score=1, row_colors=rc, cmap='coolwarm')
<seaborn.matrix.ClusterGrid at 0x1a1d6ae2d0>