"""Principal component analysis of the indicators in data/unemployment.tsv.

The script standardizes six indicator columns, computes their principal
components with an SVD, stores the component scores back on the data frame
as columns ``v1`` .. ``v6``, and plots:

* the squared singular values (scree plot),
* the first component against the ``rank`` column,
* the first component against the second.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import statsmodels.api as sm

# NOTE: this replaces the ``%pylab inline`` IPython magic, which is not valid
# Python outside IPython and star-imports numpy/pyplot into the namespace.
# When running inside a notebook, execute ``%matplotlib inline`` first if
# figures are not rendered inline by default.

# Columns that enter the PCA.
PCA_COLUMNS = ['education', 'income', 'unemployment',
               'disability', 'life', 'obesity']

df = pd.read_table("data/unemployment.tsv")

# Take an explicit copy: the in-place standardization below must not be
# applied to (or warn about) a selection that still refers back to ``df``.
df_pca = df[PCA_COLUMNS].copy()

# Standardize every column to zero mean and unit (sample) variance.
df_pca -= df_pca.mean()
df_pca /= df_pca.var() ** .5

X = df_pca.values

# Gram matrix X^T X of the standardized data; its eigenvalues are ``d ** 2``.
# Not used below, kept because it is a module-level name of the original.
var = X.T.dot(X)

# Rows of Vt (columns of V) are the principal directions; d holds the
# singular values in decreasing order.
U, d, Vt = np.linalg.svd(X)
V = Vt.T

# Scores: the standardized data projected onto the principal directions.
values = X.dot(V)
for i, scores in enumerate(values.T, start=1):
    df['v' + str(i)] = scores

# Squared singular values, one point per component.
plt.figure()
plt.plot(d ** 2)

# Loadings: one row per component, one column per original indicator.
print(pd.DataFrame(V.T, columns=df_pca.columns))

# Each plot gets its own figure so that labels and titles do not overwrite
# one another when the script runs outside a notebook.
plt.figure()
plt.plot(df['rank'], df['v1'], 'o')
plt.xlabel("Rank according to the NYT")
plt.ylabel("First Principal Component Value")
plt.title("First principal component matches rank")

plt.figure()
plt.plot(df.v1, df.v2, 'o')
plt.xlabel("First PC")
plt.ylabel("Second PC")

# ``DataFrame.sort`` was removed from pandas; ``sort_values`` is the
# replacement. It returns a new frame, ``df`` itself is left unsorted.
print(df.sort_values('v2'))

plt.figure()
plt.plot(d ** 2)
plt.title("Skree plot")

plt.show()