# Learning scikit-learn¶

## An Introduction to Machine Learning in Python¶

### at PyData Chicago 2016¶

In [ ]:
# Record author, last-updated date, and library versions for reproducibility.
%load_ext watermark
%watermark -a "Sebastian Raschka" -u -d -p numpy,scipy,matplotlib,sklearn,pandas,mlxtend

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# 2 Linear Regression¶

Source: R.J. Gladstone (1905). "A Study of the Relations of the Brain to the Size of the Head", Biometrika, Vol. 4, pp105-123

Description: Brain weight (grams) and head size (cubic cm) for 237 adults classified by gender and age group.

Variables/Columns

• Gender (1=Male, 2=Female)
• Age Range (1=20-46, 2=46+)
• Head size (cubic cm)
• Brain weight (grams)
In [ ]:
# Load the brain-weight / head-size dataset; lines starting with '#'
# in the file are metadata and are skipped.
df = pd.read_csv(
    'dataset_brain.txt',
    encoding='utf-8',
    comment='#',
    sep='\s+',  # columns are whitespace-separated
)
df.tail()

In [ ]:
# Scatter of the raw relationship we want to model.
# Fix: the figure previously had no x-axis label.
plt.scatter(df['head-size'], df['brain-weight'])
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)');


### Preparing the dataset¶

In [ ]:
# Target: brain weight (grams) as a 1-D NumPy array.
y = df['brain-weight'].values
y.shape

In [ ]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features),
# so reshape the single feature into a column vector.
X = df['head-size'].values.reshape(-1, 1)
X.shape

In [ ]:
# Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

In [ ]:
# Visualize the train/test split.
# Fix: added x-axis label and a legend so the figure stands alone.
plt.scatter(X_train, y_train, c='blue', marker='o', label='training set')
plt.scatter(X_test, y_test, c='red', marker='s', label='test set')
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)')
plt.legend();


### Fitting the model¶

In [ ]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit returns the
# estimator, so the construction and fit can be chained), then predict
# brain weight for the held-out test samples.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


### Evaluating the model¶

In [ ]:
# R^2 = 1 - SS_res / SS_tot.
# Fix: the original variable names were swapped -- the first quantity is
# the *residual* sum of squares (prediction errors) and the second is the
# *total* sum of squares (variance around the mean).
residual_sum_of_squares = ((y_test - y_pred) ** 2).sum()
total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
r2_score = 1 - (residual_sum_of_squares / total_sum_of_squares)
print('R2 score: %.3f' % r2_score)

In [ ]:
# Same R^2, computed by the estimator itself -- should match the manual value.
print('R2 score: %.3f' % lr.score(X_test, y_test))

In [ ]:
# Fitted slope(s): one coefficient per feature.
lr.coef_

In [ ]:
# Fitted intercept (bias term).
lr.intercept_

In [ ]:
# Draw the fitted line across the training range.
# Fix: use lr.predict for the endpoints instead of re-implementing
# w * x + b by hand -- same values, but stays consistent with the model.
x_ends = np.array([[X_train.min()],
                   [X_train.max()]])
y_ends = lr.predict(x_ends)

plt.scatter(X_train, y_train, c='blue', marker='o')
plt.plot(x_ends.ravel(), y_ends,
         color='red',
         linewidth=4)
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)');


# 3 Introduction to Classification¶

### The Iris dataset¶

In [ ]:
# Load the Iris data (comma-separated; '#' lines in the file are skipped).
df = pd.read_csv(
    'dataset_iris.txt',
    encoding='utf-8',
    comment='#',
    sep=',',
)
df.tail()

In [ ]:
# First four columns are the numeric features; 'class' holds the string
# species labels.
X = df.iloc[:, :4].values
y = df['class'].values
np.unique(y)


### Class label encoding¶

In [ ]:
from sklearn.preprocessing import LabelEncoder

# Map the string class names to integer codes; fit returns the encoder,
# so construction and fitting can be chained. classes_ lists the learned
# label vocabulary.
l_encoder = LabelEncoder().fit(y)
l_encoder.classes_

In [ ]:
# Encoded labels are integers 0..n_classes-1.
y_enc = l_encoder.transform(y)
np.unique(y_enc)

In [ ]:
# inverse_transform recovers the original string labels from the codes.
np.unique(l_encoder.inverse_transform(y_enc))


### Scikit-learn's built-in datasets¶

In [ ]:
from sklearn.datasets import load_iris

# Fix: the dataset was never actually loaded -- `iris` was referenced
# without being assigned, which raises NameError on a fresh kernel.
iris = load_iris()

print(iris['DESCR'])


### Test/train splits¶

In [ ]:
# Keep only the first two features (sepal length & width) so decision
# regions can be drawn in 2-D later.
X, y = iris.data[:, :2], iris.target
# ! We only use 2 features for visual purposes

print('Class labels:', np.unique(y))
print('Class proportions:', np.bincount(y))

In [ ]:
# Fix: train_test_split moved from sklearn.cross_validation (removed in
# scikit-learn 0.20) to sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

# Without stratification the class proportions in the training split can
# deviate from the original balanced distribution.
print('Class labels:', np.unique(y_train))
print('Class proportions:', np.bincount(y_train))

In [ ]:
# Fix: import from sklearn.model_selection (sklearn.cross_validation was
# removed in 0.20).
from sklearn.model_selection import train_test_split

# stratify=y preserves the class distribution in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123,
    stratify=y)

print('Class labels:', np.unique(y_train))
print('Class proportions:', np.bincount(y_train))


### Logistic Regression¶

In [ ]:
from sklearn.linear_model import LogisticRegression

# Multinomial (softmax) logistic regression; the newton-cg solver
# supports the multinomial objective.
lr = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=1,
)

lr.fit(X_train, y_train)
print('Test accuracy %.2f' % lr.score(X_test, y_test))

In [ ]:
from mlxtend.evaluate import plot_decision_regions
# NOTE(review): in newer mlxtend versions this helper lives in
# mlxtend.plotting -- confirm against the installed version.

# Fixes: removed a stray no-op expression line that merely referenced
# plot_decision_regions, and corrected the second label call, which
# overwrote xlabel instead of setting the y-axis label.
plot_decision_regions(X=X, y=y, clf=lr, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');


### K-Nearest Neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbor classifier (k=4) on the same 2-feature split; fit
# returns the estimator, so construction and fitting are chained.
kn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
print('Test accuracy %.2f' % kn.score(X_test, y_test))

In [ ]:
# Fix: the second label call should set the y-axis label, not overwrite
# the x-axis label.
plot_decision_regions(X=X, y=y, clf=kn, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');


### 3 - Exercises¶

• Which of the two models above would you prefer if you had to choose? Why?
• What would be possible ways to resolve ties in KNN when n_neighbors is an even number?
• Can you find the right spot in the scikit-learn documentation to read about how scikit-learn handles this?
• Train & evaluate the Logistic Regression and KNN algorithms on the 4-dimensional iris datasets.
• What performance do you observe?
• Why is it different vs. using only 2 dimensions?
• Would adding more dimensions help?

# 4 - Feature Preprocessing & scikit-learn Pipelines¶

### Categorical features: nominal vs ordinal¶

In [ ]:
import pandas as pd

df = pd.DataFrame([
['green', 'M', 10.0],
['red', 'L', 13.5],
['blue', 'XL', 15.3]])

df.columns = ['color', 'size', 'prize']
df

In [ ]:
from sklearn.feature_extraction import DictVectorizer

dvec = DictVectorizer(sparse=False)

X = dvec.fit_transform(df.transpose().to_dict().values())
X

In [ ]:
# Encode the ordinal 'size' column with an explicit ordering M < L < XL.
size_mapping = {'M': 1, 'L': 2, 'XL': 3}

df['size'] = df['size'].map(size_mapping)
df

In [ ]:
# Re-encode: 'size' is now numeric, so only 'color' gets one-hot columns.
X = dvec.fit_transform(df.transpose().to_dict().values())
X


### Normalization¶

In [ ]:
# Single numeric feature used to demonstrate the two scaling schemes below.
df = pd.DataFrame([1., 2., 3., 4., 5., 6.], columns=['feature'])
df

In [ ]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Compare min-max scaling (maps to [0, 1]) with z-score standardization
# (zero mean, unit variance).
mmxsc = MinMaxScaler()
stdsc = StandardScaler()

# Scalers expect a 2-D (n_samples, n_features) array.
X = df['feature'].values.reshape(-1, 1)

df['minmax'] = mmxsc.fit_transform(X)
df['z-score'] = stdsc.fit_transform(X)

df


### Pipelines¶

In [ ]:
from sklearn.pipeline import make_pipeline
# Fix: train_test_split moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection.
from sklearn.model_selection import train_test_split

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123,
    stratify=y)

lr = LogisticRegression(solver='newton-cg',
                        multi_class='multinomial',
                        random_state=1)

# The pipeline standardizes inside fit/score, so the scaler is fit on the
# training data only -- no information leaks from the test set.
lr_pipe = make_pipeline(StandardScaler(), lr)

lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)

In [ ]:
# Dict of step-name -> estimator; names are auto-generated lowercased
# class names (e.g. 'standardscaler').
lr_pipe.named_steps

In [ ]:
# Apply only the (already fitted) scaling step to the first five samples.
lr_pipe.named_steps['standardscaler'].transform(X[:5])


### 4 - Exercises¶

• Why is it important that we scale test and training sets separately?
• Fit a KNN classifier to the standardized Iris dataset. Do you notice difference in the predictive performance of the model compared to the non-standardized one? Why or why not?

# 5 - Dimensionality Reduction: Feature Selection & Extraction¶

In [ ]:
# Fix: train_test_split moved to sklearn.model_selection
# (sklearn.cross_validation was removed in scikit-learn 0.20).
from sklearn.model_selection import train_test_split

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)


### Recursive Feature Elimination¶

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 5-fold cross-validation: drop one
# feature per iteration (step=1) and keep the subset with the best
# CV accuracy.
lr = LogisticRegression()
rfe = RFECV(lr, step=1, cv=5, scoring='accuracy')
rfe.fit(X_train, y_train)

print('Number of features:', rfe.n_features_)
print('Feature ranking', rfe.ranking_)


### Sequential Feature Selection¶

In [ ]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import plot_sequential_feature_selection as plot_sfs
# NOTE(review): newer mlxtend versions expose the plotting helper as
# mlxtend.plotting.plot_sequential_feature_selection -- confirm against
# the installed version.

# Fix: the original built and fit a first SFS instance that was
# immediately discarded and replaced; that redundant fit is removed.
# Sequential forward selection: greedily add the feature that most
# improves CV accuracy, up to all 4 features.
sfs = SFS(lr,
          k_features=4,
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=2)

sfs = sfs.fit(X, y)
fig1 = plot_sfs(sfs.get_metric_dict())

plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()

In [ ]:
# Per-step record of the selected feature indices and their CV scores.
sfs.subsets_


### Principal Component Analysis¶

In [ ]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# NOTE(review): PCA runs on the *unscaled* features here. The original
# created a StandardScaler but never applied it; standardizing before a
# variance-based PCA is usually recommended -- kept unscaled to preserve
# the original behavior.
pca = PCA(n_components=4)

# Fixes: fit() suffices since the fit_transform() result was discarded
# (and the passed y is ignored by PCA); the unused scaler instance was
# removed; the body of the `with` block lost its indentation in the
# original and would not run -- restored here.
pca.fit(X_train)

var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

idx = [i for i in range(len(var_exp))]
labels = [str(i + 1) for i in idx]
with plt.style.context('seaborn-whitegrid'):
    plt.bar(range(4), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(range(4), cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.xticks(idx, labels)
    plt.legend(loc='center right')
    plt.tight_layout()
    plt.show()

In [ ]:
# Project the training data onto the principal components and plot the
# first two, one color/marker per class.
# Fix: the for-loop body lost its indentation in the original and would
# not run -- restored here.
X_train_pca = pca.transform(X_train)

for lab, col, mar in zip((0, 1, 2),
                         ('blue', 'red', 'green'),
                         ('o', 's', '^')):
    plt.scatter(X_train_pca[y_train == lab, 0],
                X_train_pca[y_train == lab, 1],
                label=lab,
                marker=mar,
                c=col)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower right')
plt.tight_layout()


# 6 - Model Evaluation & Hyperparameter Tuning¶

### Wine Dataset¶

In [ ]:
# Load the Wine dataset bundled with mlxtend (178 samples, 13 features,
# 3 classes -- see the description below).
from mlxtend.data import wine_data

X, y = wine_data()


Wine dataset.

Number of samples : 178

Class labels : {0, 1, 2}, distribution: [59, 71, 48]

Dataset Attributes:

1. Alcohol
2. Malic acid
3. Ash
4. Alcalinity of ash
5. Magnesium
6. Total phenols
7. Flavanoids
8. Nonflavanoid phenols
9. Proanthocyanins
10. Color intensity
11. Hue
12. OD280/OD315 of diluted wines
13. Proline

### Stratified K-Fold¶

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
# Fix: StratifiedKFold moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection, with a new API: configure with
# n_splits and pass the data to .split().
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier as KNN

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)

# Scale -> project onto 1 principal component -> 3-NN, as one leak-free
# unit (all steps are re-fit inside each CV fold).
pipe_kn = make_pipeline(StandardScaler(),
                        PCA(n_components=1),
                        KNN(n_neighbors=3))

# The original passed random_state=1 without shuffling, which had no
# effect, so it is dropped here. The loop body also lost its indentation
# in the original -- restored.
kfold = StratifiedKFold(n_splits=10)

scores = []
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    pipe_kn.fit(X_train[train], y_train[train])
    score = pipe_kn.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [ ]:
# Fix: cross_val_score moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection. It performs the stratified 10-fold
# loop above in one call.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_kn,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=2)

print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [ ]:
# Step names double as GridSearchCV parameter prefixes (e.g. 'pca__...').
pipe_kn.named_steps

In [ ]:
# Fix: GridSearchCV moved from sklearn.grid_search (removed in 0.20) to
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# Jointly tune the number of PCA components and k for the kNN step;
# n_components=None keeps all components.
param_grid = {'pca__n_components': [1, 2, 3, 4, 5, 6, None],
              'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9, 11]}

# refit=True retrains the best parameter setting on the full training set.
gs = GridSearchCV(estimator=pipe_kn,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=2,
                  refit=True)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [ ]:
# Evaluate the refit best model on the held-out test set.
gs.score(X_test, y_test)