Data is simulated using ResilUS (Miles and Chang, 2011).
Coded by Scott Miles
Import some cool stuff for doing the analysis: Scikit Learn, Matplotlib, and Pandas.
from sklearn import decomposition # allows you to do PCA
from sklearn import cluster # allows you to do k means clustering
from matplotlib import cm, colors # helps with creating a legend
import pandas as pd # awesome data handling
import plotly as py
import warnings
warnings.filterwarnings('ignore')
Set display options for pandas for easier viewing of tables
pandas.set_option('display.height', 500)
pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.width', 500)
pandas.set_option('display.mpl_style', False)
username='geomando'
api_key = '83orx28wm4'
py = plotly.plotly(username=username, key=api_key)
Read data for PCA and pull out columns and rows for labeling later
rec_data = pd.read_csv('sample_data.csv') # our recovery data table
row_labels = rec_data.index.astype('int')
column_labels = rec_data.columns
Print out recovery data. Notice how pandas permits pretty printing of tables
rec_data
% No Mitigation | % High Income | % Low Income | % MFR | % SFR | % Owner | % Renter | % Damaged | % Low Damage | % Medium Damage | % High Damage | % MFR Damaged | % Injured | Health Recovery Wks | # Residents Left | % Residents Left | Household Debt Wk 13 | Household Debt Wk 208 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 88.902861 | 14.397988 | 85.602012 | 29.424709 | 70.575291 | 48.066646 | 51.933354 | 0.125747 | 25.000000 | 50.000000 | 25.000000 | 0.427350 | 100.000000 | 4 | 64 | 0.1981 | 0.0011 | 0.0007 |
1 | 80.258398 | 6.201550 | 93.798450 | 56.227390 | 43.772610 | 21.705426 | 78.294574 | 0.671835 | 7.692308 | 61.538462 | 30.769231 | 1.011029 | 92.307692 | 1 | 62 | 0.3116 | 0.0099 | 0.0040 |
2 | 94.282311 | 5.479452 | 94.520548 | 34.901727 | 65.098273 | 34.633711 | 65.366289 | 0.268017 | 0.000000 | 88.888889 | 11.111111 | 0.597270 | 100.000000 | 1 | 78 | 0.2415 | 0.0009 | 0.0006 |
3 | 86.457342 | 12.781478 | 87.218522 | 51.538218 | 48.461782 | 41.801459 | 58.198541 | 1.998097 | 53.968254 | 38.095238 | 7.936508 | 3.753846 | 96.825397 | 120 | 69 | 0.2270 | 0.0347 | 0.0327 |
4 | 86.523929 | 9.319899 | 90.680101 | 45.541562 | 54.458438 | 37.002519 | 62.997481 | 0.377834 | 13.333333 | 40.000000 | 46.666667 | 0.829646 | 100.000000 | 105 | 91 | 0.2258 | 0.0111 | 0.0063 |
5 | 67.575758 | 17.595960 | 82.404040 | 79.878788 | 20.121212 | 22.222222 | 77.777778 | 1.757576 | 36.781609 | 44.827586 | 18.390805 | 2.124431 | 88.505747 | 120 | 130 | 0.2808 | 0.0152 | 0.0098 |
6 | 69.792439 | 17.550037 | 82.449963 | 73.202372 | 26.797628 | 24.369904 | 75.630096 | 1.593773 | 41.860465 | 38.372093 | 19.767442 | 2.101266 | 94.186047 | 120 | 145 | 0.2599 | 0.0162 | 0.0090 |
7 | 72.354740 | 9.816514 | 90.183486 | 71.376147 | 28.623853 | 19.143731 | 80.856269 | 0.550459 | 22.222222 | 61.111111 | 16.666667 | 0.771208 | 88.888889 | 8 | 100 | 0.3226 | 0.0112 | 0.0052 |
8 | 75.730230 | 49.465685 | 50.534315 | 37.117074 | 62.882926 | 63.452862 | 36.547138 | 1.733555 | 38.356164 | 43.835616 | 17.808219 | 4.094690 | 78.082192 | 12 | 84 | 0.1795 | 0.0220 | 0.0117 |
9 | 76.138681 | 17.165194 | 82.834806 | 61.080897 | 38.919103 | 33.752549 | 66.247451 | 6.084296 | 50.279330 | 35.754190 | 13.966480 | 9.015025 | 93.854749 | 27 | 81 | 0.2765 | 0.0340 | 0.0112 |
10 | 87.480438 | 19.092332 | 80.907668 | 20.031299 | 79.968701 | 64.241002 | 35.758998 | 11.189358 | 60.839161 | 27.272727 | 11.888112 | 19.531250 | 90.209790 | 31 | 28 | 0.1972 | 0.0913 | 0.0327 |
11 | 93.720336 | 12.340270 | 87.659730 | 9.602045 | 90.397955 | 36.692223 | 63.307777 | 7.192406 | 51.269036 | 37.055838 | 11.675127 | 16.730038 | 92.893401 | 120 | 66 | 0.2661 | 0.0397 | 0.0255 |
12 | 95.712695 | 12.193764 | 87.806236 | 6.737194 | 93.262806 | 53.229399 | 46.770601 | 15.367483 | 60.144928 | 24.637681 | 15.217391 | 23.966942 | 90.217391 | 120 | 40 | 0.2299 | 0.0486 | 0.0344 |
13 | 95.726496 | 14.574899 | 85.425101 | 3.463788 | 96.536212 | 63.607737 | 36.392263 | 0.314890 | 0.000000 | 71.428571 | 28.571429 | 2.597403 | 100.000000 | 120 | 60 | 0.2469 | 0.0160 | 0.0084 |
14 | 95.519542 | 21.401335 | 78.598665 | 2.621544 | 97.378456 | 76.739752 | 23.260248 | 26.358437 | 66.184448 | 21.518987 | 12.296564 | 43.636364 | 82.640145 | 120 | 42 | 0.1963 | 0.1018 | 0.0353 |
15 | 94.163424 | 25.052380 | 74.947620 | 4.160431 | 95.839569 | 67.015864 | 32.984136 | 3.621670 | 59.504132 | 29.752066 | 10.743802 | 9.352518 | 86.776860 | 53 | 76 | 0.2190 | 0.0526 | 0.0354 |
16 | 94.011229 | 28.820961 | 71.179039 | 4.179663 | 95.820337 | 69.931379 | 30.068621 | 15.127885 | 61.443299 | 25.360825 | 13.195876 | 24.626866 | 79.587629 | 120 | 66 | 0.1935 | 0.0560 | 0.0291 |
17 | 93.702178 | 21.542084 | 78.457916 | 5.002943 | 94.997057 | 62.831077 | 37.168923 | 3.766922 | 50.000000 | 35.156250 | 14.843750 | 7.058824 | 87.500000 | 120 | 76 | 0.2190 | 0.0508 | 0.0284 |
18 | 90.293542 | 24.422701 | 75.577299 | 12.054795 | 87.945205 | 40.273973 | 59.726027 | 1.487280 | 57.894737 | 23.684211 | 18.421053 | 6.493506 | 84.210526 | 5 | 47 | 0.1888 | 0.0252 | 0.0164 |
19 | 93.202694 | 17.677077 | 82.322923 | 9.471321 | 90.528679 | 40.640947 | 59.359053 | 0.796081 | 30.769231 | 56.410256 | 12.820513 | 4.094828 | 94.871795 | 120 | 141 | 0.2640 | 0.0233 | 0.0102 |
20 | 96.365173 | 11.355311 | 88.644689 | 7.213300 | 92.786700 | 43.927867 | 56.072133 | 0.028177 | 0.000000 | 0.000000 | 100.000000 | 0.390625 | 100.000000 | 1 | 94 | 0.2500 | 0.0005 | 0.0005 |
Reduce the dimensionality of the data using principal components analysis and visualize
Run PCA to with neighborhoods as observations
pca = decomposition.ProbabilisticPCA(n_components=2, whiten=True)
myPCA = pca.fit_transform(rec_data)
pc1 = myPCA[:, 0]
pc2 = myPCA[:, 1]
Run PCA to with recovery indicators as observations (transpose of original data)
pca_rec = decomposition.ProbabilisticPCA(n_components=2, whiten=True)
myPCA_rec = pca.fit_transform(rec_data.T) # took the transpose of the recovery data table
pc1_rec = myPCA_rec[:, 0]
pc2_rec = myPCA_rec[:, 1]
Create a bubble chart showing six variables:
figure(figsize=(10,8))
# Choose recovery indicator for graduated symbol
# Set arbitrary scale value
area_scale = 10
indicator_for_area = rec_data['# Residents Left']*area_scale
# Choose recovery indicator for graduated colors
# Do various stuff for implementing graduated colors
indicator_for_color = rec_data['% Renter']
color_norm = [item/100. for item in indicator_for_color] # normalize from 0 to 1 in order to use as grayscale colormap
color_norm_str = [str(item) for item in color_norm] # convert each entry to string for use in plot command
# create scatter plot of the two neighborhood principal components with graduated symbols and colors.
scatter(pc1, pc2, s=indicator_for_area, c=color_norm_str, alpha=1)
# Plot the neighborhood labels
# have to use parallel for loop using zip because annotate function will only label one point at a time
for label, x, y in zip(row_labels, pc1, pc2):
annotate(
label,
xy = (x, y), xytext = (0, 0),
textcoords = 'offset points', ha = 'center', va = 'center', size='8')
# label x and y axes of plot, inserting the calculated variance explained by the first and second principal components
xlabel("Principal Component 1" + " (" + str(round(100*pca.explained_variance_ratio_[0],2)) + "% of variance)") # the myPCA.fracs bit add the variance percentage to the label
ylabel("Principal Component 2" + " (" + str(round(100*pca.explained_variance_ratio_[1],2)) + "% of variance)") # the myPCA.fracs bit add the variance percentage to the label
# Create colorbar legend
m = cm.ScalarMappable()
m.set_array(indicator_for_color)
m.set_cmap(cmap='gray')
norm = colors.Normalize(vmin=0, vmax=100)
m.set_norm(norm)
m.set_clim(vmin=0.0, vmax=100.0)
cbar = plt.colorbar(m)
cbar.set_label('% Renter') # Too lazy to generalize legend label
# Anyone know a good solution for making a graduated symbol legend?
# None of my attempts have been generalizable
annotate('Number Residents Left', xy = (1.8, 2.25), xytext = (0, 0),
textcoords = 'offset points', ha = 'right', va = 'center', size='12')
annotate('as Graduated Symbols', xy = (1.8, 2.1), xytext = (0, 0),
textcoords = 'offset points', ha = 'right', va = 'center', size='12')
# Plot the principal components -- as just labels -- of the recovery indicators
for label, x, y in zip(column_labels, pc1_rec, pc2_rec):
annotate(
label,
xy = (x, y), xytext = (0, 0),
textcoords = 'offset points', ha = 'center', va = 'center', size='8')
Identify neighborhoods with similar resilience using k-means clustering and visualize
Run kmeans cluster on neighborhoods PCA to identify and symbolize potential clusters in pca results
kmeans = cluster.KMeans(n_clusters=5)
clust = kmeans.fit(myPCA)
Run kmeans cluster on recovery indicators PCA to identify and symbolize potential clusters in pca results
kmeans_rec = cluster.KMeans(n_clusters=5)
clusters_rec = kmeans_rec.fit(myPCA_rec)
figure(figsize=(10,8))
# Plot two principal components of neigbhorhoods with color determined by which of the five k-means clusters assigned to components
scatter(pc1, pc2, s=200, c=clust.labels_, alpha=.5)
# label x and y axes
xlabel("Principal Component 1" + " (" + str(round(100*pca.explained_variance_ratio_[0],2)) + "% of variance)") # the myPCA.fracs bit add the variance percentage to the label
ylabel("Principal Component 2" + " (" + str(round(100*pca.explained_variance_ratio_[1],2)) + "% of variance)") # the myPCA.fracs bit add the variance percentage to the label
# plot labels of PCA recovery indicators
# have to use parallel for loop using zip because annotate function will only label one point at a time
for label, x, y in zip(column_labels, pc1_rec, pc2_rec):
annotate(
label,
xy = (x, y), xytext = (0, 0),
textcoords = 'offset points', ha = 'center', va = 'center', size='8')
# plot labels of neighborhood IDs
for label, x, y in zip(row_labels, pc1, pc2):
annotate(
label,
xy = (x, y), xytext = (0, 0),
textcoords = 'offset points', ha = 'center', va = 'center', size='8')
Using plot.ly can make the bubble chart interactive...the size scale is different between the two plots and I'm too lazy to figure out the different units...
plotly_color_scale = ['hsl('+ str(200) + ',' + str(100) + ',' + str(indicator_for_color.values[i]) + ')' for i in indicator_for_color.index]
data = [
{
'x': pc1,
'y': pc2,
'marker':
{'size': indicator_for_area.values / 30.0,
'color': plotly_color_scale,
'line':{'width':2}
},
'type':'scatter',
'mode': 'markers'
},
{
"x": pc1_rec, "y": pc2_rec,
"text": list(column_labels),
"type": "scatter",
"mode": "text",
"textposition": "top",
}
]
xlabel = "Principal Component 1" + " (" + str(round(100*pca.explained_variance_ratio_[0],2)) + "% of variance)"
ylabel = "Principal Component 2" + " (" + str(round(100*pca.explained_variance_ratio_[1],2)) + "% of variance)"
layout = {
'autosize': False,
'height': 750,
'width': 750,
'xaxis':{'title': xlabel},
'yaxis':{'title': ylabel},
'showlegend':False
}
filename = 'resilus_pca_bubble'
py.iplot(data, layout=layout, filename=filename, fileopt='overwrite')