Dimensionality Reduction

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import sys
sys.path.append('/Users/kaonpark/workspace/github.com/likejazz/kaon-learn')
import kaonlearn
from kaonlearn.plots import plot_decision_regions
In [2]:
from sklearn.datasets import load_iris
iris = load_iris()

iris.data[:5]
Out[2]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])
In [3]:
classes = np.array(["sentosa", "versicolor", "virginica"])

iris_data = np.array(iris.data)
iris_data = pd.DataFrame(iris_data)
iris_data['species'] = pd.Series(
    classes[iris.target],
    index=iris_data.index, dtype=str)

iris_data = iris_data.rename(columns={
    0: 'sepal_length', 
    1: 'sepal_width', 
    2: 'petal_length', 
    3: 'petal_width'})

iris_data.head()
Out[3]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 sentosa
1 4.9 3.0 1.4 0.2 sentosa
2 4.7 3.2 1.3 0.2 sentosa
3 4.6 3.1 1.5 0.2 sentosa
4 5.0 3.6 1.4 0.2 sentosa
In [4]:
sns.pairplot(iris_data, hue="species")
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1052f77f0>
In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(iris.data)
iris.data[:3], X_scaled[:3]
Out[5]:
(array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2]]),
 array([[-0.90068117,  1.03205722, -1.3412724 , -1.31297673],
        [-1.14301691, -0.1249576 , -1.3412724 , -1.31297673],
        [-1.38535265,  0.33784833, -1.39813811, -1.31297673]]))
In [6]:
from sklearn.decomposition import PCA
# keep the first two principal components of the data
pca = PCA(n_components=2)
# fit PCA model to beast cancer data
pca.fit(X_scaled)

# transform data onto the first two principal components
X_pca = pca.transform(X_scaled)
print("Original shape: {}".format(str(X_scaled.shape)))
print("Reduced shape: {}".format(str(X_pca.shape)))
Original shape: (150, 4)
Reduced shape: (150, 2)
In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
p = make_pipeline(SVC(C=1))
p.fit(X_pca, iris.target)
Out[7]:
Pipeline(steps=[('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
In [8]:
# plot fist vs second principal component, color by class
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
plt.title("SVC to classify a dataset with applying PCA")
plot_decision_regions(X_pca, iris.target, p, target_names=iris.target_names, shows=True)
Score: 0.92
In [9]:
from sklearn.manifold import TSNE
tsne = TSNE(random_state=42)
In [10]:
X_tsne = tsne.fit_transform(iris.data)

p = make_pipeline(SVC(C=1, gamma=0.001))
p.fit(X_tsne, iris.target)

plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
plt.title("SVC to classify a dataset with applying t-SNE")
plot_decision_regions(X_tsne, iris.target, p, target_names=iris.target_names, shows=True)
Score: 0.89
In [11]:
X_tsne = tsne.fit_transform(X_scaled)

p = make_pipeline(SVC(C=1, gamma=0.001))
p.fit(X_tsne, iris.target)

plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
plt.title("SVC to classify a dataset with applying t-SNE, StandardScaler")
plot_decision_regions(X_tsne, iris.target, p, target_names=iris.target_names, shows=True)
Score: 0.87
In [12]:
import pandas as pd
# create a dataframe with an integer feature and a categorical string feature
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
demo_df
Out[12]:
Categorical Feature Integer Feature
0 socks 0
1 fox 1
2 socks 2
3 box 1
In [13]:
pd.get_dummies(demo_df)
Out[13]:
Integer Feature Categorical Feature_box Categorical Feature_fox Categorical Feature_socks
0 0 0 0 1
1 1 0 1 0
2 2 0 0 1
3 1 1 0 0
In [14]:
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature'])
Out[14]:
Integer Feature_0 Integer Feature_1 Integer Feature_2 Categorical Feature_box Categorical Feature_fox Categorical Feature_socks
0 1 0 0 0 0 1
1 0 1 0 0 1 0
2 0 0 1 0 0 1
3 0 1 0 1 0 0
In [15]:
kaonlearn.plots.plot_scaling()
In [16]:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

cancer = load_breast_cancer()
cancer.data
Out[16]:
array([[  1.79900000e+01,   1.03800000e+01,   1.22800000e+02, ...,
          2.65400000e-01,   4.60100000e-01,   1.18900000e-01],
       [  2.05700000e+01,   1.77700000e+01,   1.32900000e+02, ...,
          1.86000000e-01,   2.75000000e-01,   8.90200000e-02],
       [  1.96900000e+01,   2.12500000e+01,   1.30000000e+02, ...,
          2.43000000e-01,   3.61300000e-01,   8.75800000e-02],
       ..., 
       [  1.66000000e+01,   2.80800000e+01,   1.08300000e+02, ...,
          1.41800000e-01,   2.21800000e-01,   7.82000000e-02],
       [  2.06000000e+01,   2.93300000e+01,   1.40100000e+02, ...,
          2.65000000e-01,   4.08700000e-01,   1.24000000e-01],
       [  7.76000000e+00,   2.45400000e+01,   4.79200000e+01, ...,
          0.00000000e+00,   2.87100000e-01,   7.03900000e-02]])

Before we apply PCA, we scale our data so that each feature has unit variance using StandardScaler.

In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cancer.data)
X_scaled
Out[17]:
array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ..., 
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])
In [18]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

clf = SVC(C=1, gamma=0.01).fit(X_pca, cancer.target)

plt.xlabel("First principal component")
plt.ylabel("Second principal component")
plt.title("SVC, PCA, StandardScaler")
plot_decision_regions(X_pca, cancer.target, clf=clf, res=1, target_names=cancer.target_names, shows=True)
Score: 0.95
In [19]:
pca.components_
Out[19]:
array([[ 0.21890244,  0.10372458,  0.22753729,  0.22099499,  0.14258969,
         0.23928535,  0.25840048,  0.26085376,  0.13816696,  0.06436335,
         0.20597878,  0.01742803,  0.21132592,  0.20286964,  0.01453145,
         0.17039345,  0.15358979,  0.1834174 ,  0.04249842,  0.10256832,
         0.22799663,  0.10446933,  0.23663968,  0.22487053,  0.12795256,
         0.21009588,  0.22876753,  0.25088597,  0.12290456,  0.13178394],
       [-0.23385713, -0.05970609, -0.21518136, -0.23107671,  0.18611302,
         0.15189161,  0.06016536, -0.0347675 ,  0.19034877,  0.36657547,
        -0.10555215,  0.08997968, -0.08945723, -0.15229263,  0.20443045,
         0.2327159 ,  0.19720728,  0.13032156,  0.183848  ,  0.28009203,
        -0.21986638, -0.0454673 , -0.19987843, -0.21935186,  0.17230435,
         0.14359317,  0.09796411, -0.00825724,  0.14188335,  0.27533947]])
In [20]:
pca.components_.shape
Out[20]:
(2, 30)
In [21]:
pca_data = pd.DataFrame(data=pca.components_, columns=cancer.feature_names)

plt.figure(figsize=(30,2))
sns.heatmap(pca_data, yticklabels=["First component", "Second component"], annot=True)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f58a240>
In [22]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(cancer.data)

clf = SVC(C=1, gamma=0.1).fit(X_tsne, cancer.target)

plt.title("SVC, t-SNE")
plot_decision_regions(X_tsne, cancer.target, clf=clf, res=1, target_names=cancer.target_names, shows=True)
Score: 0.94
In [23]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(scaler.fit_transform(cancer.data))

clf = SVC(C=1, gamma=0.1).fit(X_tsne, cancer.target)

plt.title("SVC, t-SNE, StandardScaler")
plot_decision_regions(X_tsne, cancer.target, clf=clf, res=1, target_names=cancer.target_names, shows=True)
Score: 0.97