#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('pylab', '--no-import-all inline')


# # Clustering analysis
#
# Use various clustering techniques to identify a good subset of questions.
#
# ---

# In[2]:


import os
import sys

import pandas as pd
import seaborn as sns

# Enable the "autoreload" extension; mode 1 re-imports only the modules
# registered through "%aimport" below.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '1')

# Make the sibling 'src' directory importable (resolved relative to the
# current working directory).
src_dir = os.path.join(os.pardir, 'src')
sys.path.append(src_dir)

# Register the project modules for autoreload, then pull in the plotting
# helpers used by the later cells.
get_ipython().run_line_magic('aimport', 'features.build_features')
get_ipython().run_line_magic('aimport', 'visualization.visualize')
from visualization.visualize import biplot, plot_explained_variance, triplot


# In[3]:


YEAR = 2012


# In[4]:


# The first CSV column is used as the row index.
df = pd.read_csv(
    "../data/processed/{}.csv".format(YEAR),
    index_col=0,
)


# ---
#
# ## Correlations in data

# In[5]:


# Spearman (rank) correlation is the recommended choice for ordinal data.
correlations = df.corr(method='spearman')
sns.heatmap(correlations, square=True)


# Note that if we were to scale the data, the correlation matrix would be unchanged.

# In[6]:


cg = sns.clustermap(correlations, square=True)

# Keep the y-axis tick labels of the clustered heatmap horizontal.
heatmap_y_labels = cg.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(heatmap_y_labels, rotation=0)


# The expected clusters emerged. Party ID got grouped with economics more than with moral attitudes. Economics and race line up with one another.
# ## Principal component analysis

# In[7]:


from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22 in favour of `sklearn.impute.SimpleImputer`. Prefer the
# new class and fall back to the old name only on installations that
# predate it, so the notebook runs on both.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Fill missing answers with the column mean, standardize, then run PCA.
imp = SimpleImputer(strategy='mean')
scl = StandardScaler()
pca = PCA()

pipeline = Pipeline([
    ('imp', imp),
    ('scl', scl),
    ('pca', pca),
])

# NOTE: this pipeline deliberately reuses the *same* `imp` and `scl`
# instances as `pipeline`. It is never fitted on its own; it only becomes
# usable once `pipeline.fit_transform` below has fitted the shared steps.
scaler_pipeline = Pipeline([
    ('imp', imp),
    ('scl', scl),
])

data_pca = pipeline.fit_transform(df)
# Imputed + standardized data, i.e. the input PCA actually saw.
_scaled = scaler_pipeline.transform(df)


# ### Explained variance
#
# How much of the variance in the data is explained by each successive component?

# In[8]:


plot_explained_variance(pca)


# ### Biplot
#
# A scatterplot projected onto the first two principal components.

# In[9]:


# The transformers return a bare array, so restore the column labels.
data_scaled = pd.DataFrame(_scaled, columns=df.columns)

triplot(pca, data_scaled, title='ANES {} Biplot'.format(YEAR), color=data_scaled.PartyID)


# In[10]:


biplot(pca, data_scaled, title='ANES {} Biplot'.format(YEAR), color=data_scaled.PartyID)


# In[11]:


pca.explained_variance_


# ## Dropping na

# In[12]:


# Repeat the analysis on complete cases only instead of imputing.
df2 = df.dropna()

# No imputation step here: every row with a missing value was dropped above.
scl = StandardScaler()
pca = PCA()

pipeline = Pipeline([
    ('scl', scl),
    ('pca', pca),
])

# Shares the `scl` instance with `pipeline`; valid only after the fit below.
scaler_pipeline = Pipeline([
    ('scl', scl),
])

data_pca = pipeline.fit_transform(df2)
_scaled = scaler_pipeline.transform(df2)

# Label the columns from the frame that was actually transformed.
data_scaled = pd.DataFrame(_scaled, columns=df2.columns)


# In[13]:


biplot(pca, data_scaled, title='ANES {} Biplot'.format(YEAR), color=data_scaled.PartyID)


# In[14]:


plot_explained_variance(pca)


# In[ ]: