%run 1.0-adm-load-data-2012.ipynb
Populating the interactive namespace from numpy and matplotlib Variables now available: df
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer, StandardScaler
# Pre-processing and PCA building blocks.
# NOTE(review): to my knowledge `Imputer` was removed from
# sklearn.preprocessing in scikit-learn 0.22 (successor:
# sklearn.impute.SimpleImputer) -- confirm against the installed version.
imp = Imputer(strategy='mean')
scl = StandardScaler()
pca = PCA()
# Full pipeline: mean-impute -> standardise -> PCA.
pipeline = Pipeline([
('imp', imp),
('scl', scl),
('pca', pca),
])
# Impute + standardise only (no PCA step).  This pipeline is built from the
# SAME `imp` and `scl` objects as `pipeline` and is never fitted directly:
# the transform() calls below rely on those objects having been fitted
# through `pipeline`.
# NOTE(review): this presumes Pipeline fits its step objects in place rather
# than fitting clones -- confirm for the installed scikit-learn version.
scaler_pipeline = Pipeline([
('imp', imp),
('scl', scl),
])
# Fit all three steps on the full data set and get the principal-component
# scores of every row.
data_pca = pipeline.fit_transform(df)
# Imputed + standardised version of the full data set (not used again in
# this section).
_scaled = scaler_pipeline.transform(df)
# Split the rows of `df` on the sign of their first-component score.
# NOTE(review): the names suggest score <= 0 ~ Democrats and > 0 ~
# Republicans; the sign of a principal component is not inherently
# meaningful, so verify the direction against the data.
dem = df[data_pca[:, 0] <= 0]
rep = df[data_pca[:, 0] > 0]
# Re-fit the pipeline on the `rep` subset only.  Because the step objects are
# shared, `imp`, `scl` and `pca` now hold the fit for `rep`, and the
# transform() on the next line uses that fit -- statement order matters here.
rep_pca = pipeline.fit_transform(rep)
rep_scaled = scaler_pipeline.transform(rep)
def plot_explained_variance(pca):
    """Plot individual and cumulative explained variance of a fitted PCA.

    Renders an inline plotly figure with one bar per principal component
    (its explained-variance ratio) and a line showing the running total.
    Components are numbered from 1 on the x axis, matching the
    "Principal Component N" labels used by ``biplot``.

    Parameters
    ----------
    pca : fitted PCA-like estimator
        Must expose ``explained_variance_ratio_``.

    Returns
    -------
    None
        The figure is displayed through ``plotly.offline.iplot``.
    """
    # Imported locally so the function does not depend on names that the
    # notebook's pylab namespace happens to provide.
    import numpy as np
    import plotly.offline
    from plotly.graph_objs import Bar, Layout, Scatter

    plotly.offline.init_notebook_mode()  # run at the start of every notebook

    explained_var = pca.explained_variance_ratio_
    cum_var_exp = np.cumsum(explained_var)
    # 1-based component numbers for the x axis (PC1, PC2, ...).
    component_numbers = np.arange(1, len(explained_var) + 1)

    plotly.offline.iplot({
        "data": [
            Bar(x=component_numbers, y=explained_var,
                name='individual explained variance'),
            Scatter(x=component_numbers, y=cum_var_exp,
                    name='cumulative explained variance'),
        ],
        # Plain dicts are accepted for axis settings and avoid the legacy
        # top-level XAxis/YAxis classes.
        "layout": Layout(
            xaxis=dict(title='Principal components'),
            yaxis=dict(title='Explained variance ratio'),
        ),
    })
# Explained-variance plot for `pca`.  `pca` is the step object inside
# `pipeline`, whose most recent fit_transform() above was on the `rep`
# subset, so this presumably shows the fit for `rep`, not for all of `df`.
plot_explained_variance(pca)
def biplot(pca, dat, title='', show_points=True, components=(0, 1)):
    """Draw a PCA biplot (scores plus feature loading vectors) with plotly.

    The rows of ``dat`` are projected with ``pca.transform`` and optionally
    drawn as a point cloud.  For every feature, a labelled line is drawn
    from the origin to that feature's loading on the two chosen components,
    scaled by the largest score along each axis.

    Parameters
    ----------
    pca : fitted PCA-like estimator
        Must expose ``components_`` and ``transform``.
    dat : pandas.DataFrame
        Data passed straight to ``pca.transform`` (so it must already be in
        the feature space ``pca`` was fitted on).  Its column names label
        the loading vectors.
    title : str, optional
        Figure title.
    show_points : bool, optional
        Whether to draw the projected rows as markers.
    components : tuple of two ints, optional
        Zero-based indices of the components for the x and y axes;
        ``(0, 1)`` means PC1 against PC2.

    Returns
    -------
    None
        The figure is displayed through ``plotly.offline.iplot``.
    """
    import plotly.offline
    from plotly.graph_objs import Layout, Scatter

    plotly.offline.init_notebook_mode()  # run at the start of every notebook

    # 0,1 denote PC1 and PC2; change values for other PCs
    pc1, pc2 = components
    x_loadings = pca.components_[pc1]
    y_loadings = pca.components_[pc2]

    scores = pca.transform(dat.values)
    xs = scores[:, pc1]
    ys = scores[:, pc2]

    # The scale factors do not change per feature, so compute them once
    # instead of on every loop iteration.
    x_scale = xs.max()
    y_scale = ys.max()

    traces = []
    if show_points:
        traces.append(
            Scatter(x=xs, y=ys, mode='markers', marker=dict(size=1),
                    name='observations'))

    # One labelled segment from the origin per feature.
    for column, x_load, y_load in zip(dat.columns, x_loadings, y_loadings):
        label = str(column)
        traces.append(
            Scatter(
                x=[0, x_load * x_scale],
                y=[0, y_load * y_scale],
                mode='lines+text',
                text=['', label],
                name=label,
            ))

    plotly.offline.iplot({
        "data": traces,
        "layout": Layout(
            xaxis=dict(title='Principal Component ' + str(pc1 + 1)),
            yaxis=dict(title='Principal Component ' + str(pc2 + 1)),
            title=title,
        ),
    })
# Biplot of PC1 vs PC2 for the `rep` subset.  `rep_scaled` is wrapped in a
# DataFrame carrying df's column names because biplot() reads `dat.columns`
# to label the loading vectors (presumably `rep_scaled` is a bare array).
biplot(pca, pd.DataFrame(rep_scaled, columns=df.columns), title='Biplot for conservatives', components=(0, 1))
# Per-column means of the `rep` subset (taken from `rep`, not `rep_scaled`).
rep.mean()
campfin_limcorp 0.404807 pid_self -0.369540 spsrvpr_ssself 2.838948 defsppr_self -4.629902 inspre_self -5.249009 gun_control 0.179421 guarpr_self -5.256634 immig_policy 2.282252 aidblack_self -5.851562 envjob_self -4.224597 aa_uni -0.676372 fedspend_ss 0.276444 fedspend_schools 0.217910 fedspend_scitech 0.208832 fedspend_crime 0.358561 fedspend_welfare -0.666667 envir_gwarm -1.322544 gayrt_marry -2.135054 penalty_favdpen 1.148679 relig_churchoft 2.157926 dem_edu 10.723117 dem_veteran 1.832222 budget_rdefctax 0.037067 budget_rdefmil -0.414125 patriot_amident 1.485612 milln_milltax 0.424157 budget_rdef250k 1.704536 fairjob_opin -1.792572 immigpo_jobs 2.419887 wiretap_warrant 0.368209 postvote_presvt 1.014540 dtype: float64