import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
%pylab inline

# Load the tab-separated data set.
df = pd.read_table("data/unemployment.tsv")

# Take an explicit copy of the PCA input columns so the in-place
# standardisation below works on an independent frame and never on a
# selection derived from `df`.
df_pca = df[['education', 'income', 'unemployment', 'disability', 'life', 'obesity']].copy()

# Standardise every column: zero mean, unit sample standard deviation.
# (`std()` is the square root of `var()`; both default to ddof=1.)
df_pca -= df_pca.mean()
df_pca /= df_pca.std()

# Gram matrix X^T X of the standardised data (not divided by n - 1).
var = df_pca.values.T.dot(df_pca.values)

# Thin SVD: X = U * diag(d) * Vt.  `full_matrices=False` avoids building an
# n-by-n U that nothing here uses; `d` and `Vt` are the same as in the full
# decomposition whenever there are at least as many rows as columns.
# `d ** 2` are the eigenvalues of `var`; the columns of V are the principal axes.
U, d, Vt = np.linalg.svd(df_pca.values, full_matrices=False)
V = Vt.T

# Project the standardised data onto the principal axes and store each
# score column on `df` as v1, v2, ...  The number of components follows
# the data instead of being hard-coded.
values = df_pca.values.dot(V)
for i, scores in enumerate(values.T, start=1):
    df['v' + str(i)] = scores

# Each chart gets its own figure so that successive `plot` calls do not pile
# up on the same axes when this code runs as one script or one notebook cell.

# Squared singular values, one point per principal component.
figure()
plot(d ** 2)

# Loadings table: one row per principal component, one column per variable.
pd.DataFrame(V.T, columns=df_pca.columns)

# First principal component score against the `rank` column.
figure()
plot(df['rank'], df['v1'], 'o')
xlabel("Rank according to the NYT")
ylabel("First Principal Component Value")
title("First principal component matches rank")

# Scores on the first two principal components.
figure()
plot(df.v1, df.v2, 'o')
xlabel("First PC")
ylabel("Second PC")

# Rows ordered by their second-component score.  `DataFrame.sort` was removed
# from pandas; `sort_values` is its replacement and, like the original call,
# returns a new sorted frame without modifying `df`.
df.sort_values('v2')

# Same squared-singular-value plot as above, this time with a title.
figure()
plot(d ** 2)
title("Skree plot")