import warnings

import numpy as np
import pandas as pd
import scipy.linalg
import scipy.stats
from sklearn import datasets
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning) # annoying pandas bug
Populating the interactive namespace from numpy and matplotlib
** Helpful explanations: http://flowingmotion.jojordan.org/2011/12/04/down-to-earth-principal-components-analysis-in-5-steps/ **
# plants data with 4 dimensions: sepal length, sepal width, petal length, petal width
iris = datasets.load_iris()
X = iris.data
y = iris.target
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
raw = pd.DataFrame(list(X), columns=features)
raw['class'] = y
print 'Raw dataset:'
print raw[:5]
# normalize each feature/column to mean = 0, std = 1
normed = raw.copy()
for col in features:
normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()) / normed[col].std())
print '\nNormalized dataset:'
print normed[:5]
Raw dataset: sepal_length sepal_width petal_length petal_width class 0 5.1 3.5 1.4 0.2 0 1 4.9 3.0 1.4 0.2 0 2 4.7 3.2 1.3 0.2 0 3 4.6 3.1 1.5 0.2 0 4 5.0 3.6 1.4 0.2 0 Normalized dataset: sepal_length sepal_width petal_length petal_width class 0 -0.897674 1.028611 -1.336794 -1.308593 0 1 -1.139200 -0.124540 -1.336794 -1.308593 0 2 -1.380727 0.336720 -1.393470 -1.308593 0 3 -1.501490 0.106090 -1.280118 -1.308593 0 4 -1.018437 1.259242 -1.336794 -1.308593 0
# since our data is already normalized, cov(x1, x2) = sum(x1*x2) / num_observations
cov_df = pd.DataFrame(index=features)
for colA in features:
column = []
for colB in features:
cov = normed[colA].cov(normed[colB])
column.append(cov)
cov_df[colA] = column
# we lose a little bit to rounding errors in normalization
print 'Covariance matrix:'
print cov_df
Covariance matrix: sepal_length sepal_width petal_length petal_width sepal_length 1.000000 -0.109369 0.871754 0.817954 sepal_width -0.109369 1.000000 -0.420516 -0.356544 petal_length 0.871754 -0.420516 1.000000 0.962757 petal_width 0.817954 -0.356544 0.962757 1.000000
** [U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component. **
** [S]: The singular values — the variance (eigenvalue) captured by each component, in decreasing order. **
(I need more practice with linear algebra, so I'll admit some of this feels a little bit like magic to me.)
# use numpy's SVD implementation
u, s, v = scipy.linalg.svd(cov_df)
print 'U: (feature loading for each component)'
print pd.DataFrame(u, index=features)
print '\nExplained variance:\n', s
# now reduce the data down to two components
# the loading is essentially the weight of each original feature in the bundle
scores = {}
for component in xrange(4):
reduced = normed.apply(lambda row:
row[features[0]]*u[0][component] +
row[features[1]]*u[1][component] +
row[features[2]]*u[2][component] +
row[features[3]]*u[3][component], axis=1)
scores[component] = reduced
scores = pd.DataFrame(scores)
print '\nFirst few reduced data points:\n', scores.head()
plt.scatter(scores[0], scores[1], c=y)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()
U: (feature loading for each component) 0 1 2 3 sepal_length -0.522372 -0.372318 0.721017 0.261996 sepal_width 0.263355 -0.925556 -0.242033 -0.124135 petal_length -0.581254 -0.021095 -0.140892 -0.801154 petal_width -0.565611 -0.065416 -0.633801 0.523546 Explained variance: [ 2.91081808 0.92122093 0.14735328 0.02060771] First few reduced data points: 0 1 2 3 0 2.256981 -0.504015 0.121536 0.022996 1 2.079459 0.653216 0.226492 0.102864 2 2.360044 0.317414 -0.051308 0.027732 3 2.296504 0.573447 -0.098530 -0.066090 4 2.380802 -0.672514 -0.021356 -0.037272
# let's see what scikit comes up with...
pca = PCA()
pca.fit(iris.data)
reduced = pca.transform(iris.data)
print 'Components:'
print pd.DataFrame(pca.components_.T, index=features)
print '\nExplained variance ratio:\n', pca.explained_variance_ratio_
print '\nFirst few reduced data points:\n', reduced[:4]
plt.scatter([x[0] for x in reduced], [x[1] for x in reduced], c=y)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()
Components: 0 1 2 3 sepal_length 0.361590 -0.656540 0.580997 0.317255 sepal_width -0.082269 -0.729712 -0.596418 -0.324094 petal_length 0.856572 0.175767 -0.072524 -0.479719 petal_width 0.358844 0.074706 -0.549061 0.751121 Explained variance ratio: [ 0.92461621 0.05301557 0.01718514 0.00518309] First few reduced data points: [[ -2.68420713e+00 -3.26607315e-01 2.15118370e-02 1.00615724e-03] [ -2.71539062e+00 1.69556848e-01 2.03521425e-01 9.96024240e-02] [ -2.88981954e+00 1.37345610e-01 -2.47092410e-02 1.93045428e-02] [ -2.74643720e+00 3.11124316e-01 -3.76719753e-02 -7.59552741e-02]]