import warnings

import numpy as np
import pandas as pd
import scipy.linalg
import scipy.stats
from sklearn import datasets
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning) # annoying pandas bug
Populating the interactive namespace from numpy and matplotlib
** Helpful explanations: http://flowingmotion.jojordan.org/2011/12/04/down-to-earth-principal-components-analysis-in-5-steps/ **
# plants data with 4 dimensions: sepal length, sepal width, petal length, petal width
iris = datasets.load_iris()
X = iris.data
y = iris.target
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
raw = pd.DataFrame(list(X), columns=features)
raw['class'] = y
print 'Raw dataset:'
print raw[:5]
# normalize each feature/column to mean = 0, std = 1
normed = raw.copy()
for col in features:
normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()) / normed[col].std())
print '\nNormalized dataset:'
print normed[:5]
Raw dataset: sepal_length sepal_width petal_length petal_width class 0 5.1 3.5 1.4 0.2 0 1 4.9 3.0 1.4 0.2 0 2 4.7 3.2 1.3 0.2 0 3 4.6 3.1 1.5 0.2 0 4 5.0 3.6 1.4 0.2 0 Normalized dataset: sepal_length sepal_width petal_length petal_width class 0 -0.897674 1.028611 -1.336794 -1.308593 0 1 -1.139200 -0.124540 -1.336794 -1.308593 0 2 -1.380727 0.336720 -1.393470 -1.308593 0 3 -1.501490 0.106090 -1.280118 -1.308593 0 4 -1.018437 1.259242 -1.336794 -1.308593 0
# since our data is already normalized, cov(x1, x2) = sum(x1*x2) / num_observations
cov_df = pd.DataFrame(index=features)
for colA in features:
column = []
for colB in features:
cov = normed[colA].cov(normed[colB])
column.append(cov)
cov_df[colA] = column
# we lose a little bit to rounding errors in normalization
print 'Covariance matrix:'
print cov_df
Covariance matrix: sepal_length sepal_width petal_length petal_width sepal_length 1.000000 -0.109369 0.871754 0.817954 sepal_width -0.109369 1.000000 -0.420516 -0.356544 petal_length 0.871754 -0.420516 1.000000 0.962757 petal_width 0.817954 -0.356544 0.962757 1.000000
** [U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component. **
** [S]: The singular values — the variance (eigenvalue) captured by each component, in decreasing order. **
(I need more practice with linear algebra, so I'll admit some of this feels a little bit like magic to me.)
# use numpy's SVD implementation
u, s, v = scipy.linalg.svd(cov_df)
print 'U: (feature loading for each component)'
print pd.DataFrame(u, index=features)
print '\nExplained variance:\n', s
# now reduce the data down to two components
# the loading is essentially the weight of each original feature in the bundle
scores = {}
for component in xrange(4):
reduced = normed.apply(lambda row:
row[features[0]]*u[0][component] +
row[features[1]]*u[1][component] +
row[features[2]]*u[2][component] +
row[features[3]]*u[3][component], axis=1)
scores[component] = reduced
scores = pd.DataFrame(scores)
print '\nFirst few reduced data points:\n', scores.head()
plt.scatter(scores[0], scores[1], c=y)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()
U: (feature loading for each component) 0 1 2 3 sepal_length -0.522372 -0.372318 0.721017 0.261996 sepal_width 0.263355 -0.925556 -0.242033 -0.124135 petal_length -0.581254 -0.021095 -0.140892 -0.801154 petal_width -0.565611 -0.065416 -0.633801 0.523546 Explained variance: [ 2.91081808 0.92122093 0.14735328 0.02060771] First few reduced data points: 0 1 2 3 0 2.256981 -0.504015 0.121536 0.022996 1 2.079459 0.653216 0.226492 0.102864 2 2.360044 0.317414 -0.051308 0.027732 3 2.296504 0.573447 -0.098530 -0.066090 4 2.380802 -0.672514 -0.021356 -0.037272
# let's see what scikit comes up with...
pca = PCA()
pca.fit(iris.data)
reduced = pca.transform(iris.data)
print 'Components:'
print pd.DataFrame(pca.components_.T, index=features)
print '\nExplained variance ratio:\n', pca.explained_variance_ratio_
print '\nFirst few reduced data points:\n', reduced[:4]
plt.scatter([x[0] for x in reduced], [x[1] for x in reduced], c=y)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()
Components: 0 1 2 3 sepal_length 0.361590 -0.656540 0.580997 0.317255 sepal_width -0.082269 -0.729712 -0.596418 -0.324094 petal_length 0.856572 0.175767 -0.072524 -0.479719 petal_width 0.358844 0.074706 -0.549061 0.751121 Explained variance ratio: [ 0.92461621 0.05301557 0.01718514 0.00518309] First few reduced data points: [[ -2.68420713e+00 -3.26607315e-01 2.15118370e-02 1.00615724e-03] [ -2.71539062e+00 1.69556848e-01 2.03521425e-01 9.96024240e-02] [ -2.88981954e+00 1.37345610e-01 -2.47092410e-02 1.93045428e-02] [ -2.74643720e+00 3.11124316e-01 -3.76719753e-02 -7.59552741e-02]]