Learning scikit-learn

An Introduction to Machine Learning in Python

at PyData Chicago 2016

In [ ]:
%load_ext watermark
%watermark -a "Sebastian Raschka" -u -d -p numpy,scipy,matplotlib,sklearn,pandas,mlxtend
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

1 Introduction to Machine Learning

2 Linear Regression

Loading the dataset

Source: R.J. Gladstone (1905). "A Study of the Relations of the Brain to the Size of the Head", Biometrika, Vol. 4, pp. 105-123

Description: Brain weight (grams) and head size (cubic cm) for 237 adults classified by gender and age group.

Variables/Columns

  • Gender (1=Male, 2=Female)
  • Age Range (1=20-46, 2=46+)
  • Head size (cm^3)
  • Brain weight (grams)
In [ ]:
df = pd.read_csv('dataset_brain.txt', 
                 encoding='utf-8', 
                 comment='#',
                 sep=r'\s+')
df.tail()
In [ ]:
plt.scatter(df['head-size'], df['brain-weight'])
plt.xlabel('Head size (cm^3)')
plt.ylabel('Brain weight (grams)');

Preparing the dataset

In [ ]:
y = df['brain-weight'].values
y.shape
In [ ]:
X = df['head-size'].values
X = X[:, np.newaxis]
X.shape
In [ ]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)
In [ ]:
plt.scatter(X_train, y_train, c='blue', marker='o')
plt.scatter(X_test, y_test, c='red', marker='s')
plt.xlabel('Head size (cm^3)')
plt.ylabel('Brain weight (grams)');

Fitting the model

In [ ]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

Evaluating the model

In [ ]:
res_sum_of_squares = ((y_test - y_pred) ** 2).sum()
tot_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
r2_score = 1 - (res_sum_of_squares / tot_sum_of_squares)
print('R2 score: %.3f' % r2_score)
In [ ]:
print('R2 score: %.3f' % lr.score(X_test, y_test))
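
The same metric is also available as a standalone function in sklearn.metrics, which serves as a cross-check of the manual computation above:

In [ ]:
from sklearn import metrics

# should match the manually computed R2 and lr.score above
metrics.r2_score(y_test, y_pred)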
In [ ]:
lr.coef_
In [ ]:
lr.intercept_
In [ ]:
min_pred = X_train.min() * lr.coef_ + lr.intercept_
max_pred = X_train.max() * lr.coef_ + lr.intercept_

plt.scatter(X_train, y_train, c='blue', marker='o')
plt.plot([X_train.min(), X_train.max()],
         [min_pred, max_pred],
         color='red',
         linewidth=4)
plt.xlabel('Head size (cm^3)')
plt.ylabel('Brain weight (grams)');

3 Introduction to Classification

The Iris dataset

In [ ]:
df = pd.read_csv('dataset_iris.txt', 
                 encoding='utf-8', 
                 comment='#',
                 sep=',')
df.tail()
In [ ]:
X = df.iloc[:, :4].values 
y = df['class'].values
np.unique(y)

Class label encoding

In [ ]:
from sklearn.preprocessing import LabelEncoder

l_encoder = LabelEncoder()
l_encoder.fit(y)
l_encoder.classes_
In [ ]:
y_enc = l_encoder.transform(y)
np.unique(y_enc)
In [ ]:
np.unique(l_encoder.inverse_transform(y_enc))

Scikit-learn's built-in datasets

In [ ]:
from sklearn.datasets import load_iris

iris = load_iris()
print(iris['DESCR'])

Test/train splits

In [ ]:
X, y = iris.data[:, :2], iris.target
# Note: we only use 2 features for visualization purposes

print('Class labels:', np.unique(y))
print('Class counts:', np.bincount(y))
In [ ]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

print('Class labels:', np.unique(y_train))
print('Class counts:', np.bincount(y_train))
In [ ]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123,
        stratify=y)

print('Class labels:', np.unique(y_train))
print('Class counts:', np.bincount(y_train))

Logistic Regression

In [ ]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='newton-cg', 
                        multi_class='multinomial', 
                        random_state=1)

lr.fit(X_train, y_train)
print('Test accuracy %.2f' % lr.score(X_test, y_test))
In [ ]:
from mlxtend.evaluate import plot_decision_regions

plot_decision_regions(X=X, y=y, clf=lr, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');

K-Nearest Neighbors

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=4)

kn.fit(X_train, y_train)
print('Test accuracy %.2f' % kn.score(X_test, y_test))
In [ ]:
plot_decision_regions(X=X, y=y, clf=kn, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');

3 - Exercises

  • Which of the two models above would you prefer if you had to choose? Why?
  • What would be possible ways to resolve ties in KNN when n_neighbors is an even number?
  • Can you find the right spot in the scikit-learn documentation to read about how scikit-learn handles this?
  • Train & evaluate the Logistic Regression and KNN algorithms on the 4-dimensional iris dataset (a possible starting point is sketched after this list).
    • What performance do you observe?
    • Why is it different vs. using only 2 dimensions?
    • Would adding more dimensions help?
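
One possible starting point for the last exercise, reusing the estimators and split settings from above on all four iris features:

In [ ]:
# Refit both models on the full 4-dimensional iris data
X4, y4 = iris.data, iris.target

X4_train, X4_test, y4_train, y4_test = train_test_split(
        X4, y4, test_size=0.3, random_state=123, stratify=y4)

lr4 = LogisticRegression(solver='newton-cg',
                         multi_class='multinomial',
                         random_state=1)
lr4.fit(X4_train, y4_train)
print('Test accuracy (logistic regression): %.2f' % lr4.score(X4_test, y4_test))

kn4 = KNeighborsClassifier(n_neighbors=4)
kn4.fit(X4_train, y4_train)
print('Test accuracy (KNN): %.2f' % kn4.score(X4_test, y4_test))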

4 - Feature Preprocessing & scikit-learn Pipelines

Categorical features: nominal vs ordinal

In [ ]:
import pandas as pd

df = pd.DataFrame([
            ['green', 'M', 10.0], 
            ['red', 'L', 13.5], 
            ['blue', 'XL', 15.3]])

df.columns = ['color', 'size', 'price']
df
In [ ]:
from sklearn.feature_extraction import DictVectorizer

dvec = DictVectorizer(sparse=False)

X = dvec.fit_transform(df.transpose().to_dict().values())
X
In [ ]:
size_mapping = {
           'XL': 3,
           'L': 2,
           'M': 1}

df['size'] = df['size'].map(size_mapping)
df
In [ ]:
X = dvec.fit_transform(df.transpose().to_dict().values())
X
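
As an alternative to DictVectorizer, pandas can one-hot encode the remaining nominal column directly; the numeric size and price columns pass through unchanged:

In [ ]:
# one-hot encode the nominal 'color' column; 'size' and 'price' stay numeric
pd.get_dummies(df)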

Normalization

In [ ]:
df = pd.DataFrame([1., 2., 3., 4., 5., 6.], columns=['feature'])
df
In [ ]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

mmxsc = MinMaxScaler()
stdsc = StandardScaler()

X = df['feature'].values[:, np.newaxis]

df['minmax'] = mmxsc.fit_transform(X)
df['z-score'] = stdsc.fit_transform(X)

df
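
Both transformations are easy to verify by hand: min-max scaling computes (x - min) / (max - min), and the z-score is (x - mean) / std, where StandardScaler uses the population standard deviation (ddof=0):

In [ ]:
x = df['feature'].values

# manual min-max scaling and z-score standardization
print((x - x.min()) / (x.max() - x.min()))
print((x - x.mean()) / x.std(ddof=0))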

Pipelines

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123,
        stratify=y)

lr = LogisticRegression(solver='newton-cg', 
                        multi_class='multinomial', 
                        random_state=1)

lr_pipe = make_pipeline(StandardScaler(), lr)

lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)
In [ ]:
lr_pipe.named_steps
In [ ]:
lr_pipe.named_steps['standardscaler'].transform(X[:5])
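
At prediction time the pipeline applies the fitted scaler before handing the data to the classifier, so the two calls below should yield identical results:

In [ ]:
# scale manually, then predict with the classifier step ...
X_std = lr_pipe.named_steps['standardscaler'].transform(X_test[:5])
print(lr_pipe.named_steps['logisticregression'].predict(X_std))

# ... which is equivalent to predicting with the pipeline on the raw features
print(lr_pipe.predict(X_test[:5]))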

4 - Exercises

  • Why is it important that we scale test and training sets separately?
  • Fit a KNN classifier to the standardized Iris dataset. Do you notice a difference in the predictive performance compared to the non-standardized model? Why or why not? (A sketch follows this list.)
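
A possible sketch for the second exercise, wrapping a KNN classifier in the same kind of pipeline:

In [ ]:
kn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=4))
kn_pipe.fit(X_train, y_train)
print('Test accuracy %.2f' % kn_pipe.score(X_test, y_test))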

5 - Dimensionality Reduction: Feature Selection & Extraction

In [ ]:
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123, stratify=y)

Recursive Feature Elimination

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

lr = LogisticRegression()
rfe = RFECV(lr, step=1, cv=5, scoring='accuracy')

rfe.fit(X_train, y_train)
print('Number of features:', rfe.n_features_)
print('Feature ranking:', rfe.ranking_)
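
The fitted selector also exposes a boolean mask of the retained columns and can reduce the data to the selected features:

In [ ]:
print('Selected features:', rfe.support_)
rfe.transform(X_train).shape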

Sequential Feature Selection

In [ ]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import plot_sequential_feature_selection as plot_sfs


sfs = SFS(lr, 
          k_features=4, 
          forward=True, 
          floating=False, 
          scoring='accuracy',
          cv=2)

sfs = sfs.fit(X, y)
fig1 = plot_sfs(sfs.get_metric_dict())

plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
In [ ]:
sfs.subsets_

Principal Component Analysis

In [ ]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
pca = PCA(n_components=4)

# standardize the features, then fit PCA on the standardized training data
X_train_std = sc.fit_transform(X_train)
pca.fit(X_train_std)

var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

idx = [i for i in range(len(var_exp))]
labels = [str(i + 1) for i in idx]
with plt.style.context('seaborn-whitegrid'):
    plt.bar(range(4), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(range(4), cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.xticks(idx, labels)
    plt.legend(loc='center right')
    plt.tight_layout()
    plt.show()
In [ ]:
X_train_pca = pca.transform(X_train_std)

for lab, col, mar in zip((0, 1, 2),
                         ('blue', 'red', 'green'),
                         ('o', 's', '^')):
    plt.scatter(X_train_pca[y_train == lab, 0],
                X_train_pca[y_train == lab, 1],
                label=lab,
                marker=mar,
                c=col)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower right')
plt.tight_layout()
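
For reference, the cumulative explained variance computed earlier shows how much of the total variance the 2D scatter plot above captures:

In [ ]:
print('Variance explained by the first two components: %.2f' % cum_var_exp[1])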

6 - Model Evaluation & Hyperparameter Tuning

Wine Dataset

In [ ]:
from mlxtend.data import wine_data

X, y = wine_data()

Wine dataset.

Source: https://archive.ics.uci.edu/ml/datasets/Wine

Number of samples: 178

Class labels: {0, 1, 2}, distribution: [59, 71, 48]

Dataset Attributes:

  1. Alcohol
  2. Malic acid
  3. Ash
  4. Alcalinity of ash
  5. Magnesium
  6. Total phenols
  7. Flavanoids
  8. Nonflavanoid phenols
  9. Proanthocyanins
  10. Color intensity
  11. Hue
  12. OD280/OD315 of diluted wines
  13. Proline
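
The sample size and class distribution quoted above can be confirmed directly from the loaded arrays:

In [ ]:
print('Number of samples:', X.shape[0])
print('Class distribution:', np.bincount(y))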

Stratified K-Fold

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier as KNN

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123, stratify=y)

pipe_kn = make_pipeline(StandardScaler(), 
                        PCA(n_components=1),
                        KNN(n_neighbors=3))

kfold = StratifiedKFold(y=y_train, 
                        n_folds=10,
                        random_state=1)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_kn.fit(X_train[train], y_train[train])
    score = pipe_kn.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))
    
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
In [ ]:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(estimator=pipe_kn,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=2)

print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
In [ ]:
pipe_kn.named_steps
In [ ]:
from sklearn.grid_search import GridSearchCV


param_grid = {'pca__n_components': [1, 2, 3, 4, 5, 6, None],
              'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9, 11]}

gs = GridSearchCV(estimator=pipe_kn, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=2,
                  refit=True)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
In [ ]:
gs.score(X_test, y_test)
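
Since refit=True, the grid search keeps the best pipeline refit on the entire training set (this is what gs.score used above); it can also be accessed directly:

In [ ]:
gs.best_estimator_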