Copyright (c) 2015-2017 Sebastian Raschka
Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
%load_ext watermark
%watermark -a 'Sebastian Raschka' -u -d -p numpy,pandas,matplotlib,sklearn
Sebastian Raschka last updated: 2017-03-10 numpy 1.12.0 pandas 0.19.2 matplotlib 2.0.0 sklearn 0.18.1
The use of watermark
is optional. You can install this IPython extension via "pip install watermark
". For more information, please see: https://github.com/rasbt/watermark.
from IPython.display import Image
%matplotlib inline
# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
import pandas as pd
from io import StringIO
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
# If you are using Python 2.7, you need
# to convert the string to unicode:
# csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
df
A | B | C | D | |
---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 |
1 | 5.0 | 6.0 | NaN | 8.0 |
2 | 10.0 | 11.0 | 12.0 | NaN |
df.isnull().sum()
A 0 B 0 C 1 D 1 dtype: int64
df.dropna()
A | B | C | D | |
---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 |
df.dropna(axis=1)
A | B | |
---|---|---|
0 | 1.0 | 2.0 |
1 | 5.0 | 6.0 |
2 | 10.0 | 11.0 |
# only drop rows where all columns are NaN
df.dropna(how='all')
A | B | C | D | |
---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 |
1 | 5.0 | 6.0 | NaN | 8.0 |
2 | 10.0 | 11.0 | 12.0 | NaN |
# drop rows that have not at least 4 non-NaN values
df.dropna(thresh=4)
A | B | C | D | |
---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 |
# only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])
A | B | C | D | |
---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 |
2 | 10.0 | 11.0 | 12.0 | NaN |
from sklearn.preprocessing import Imputer
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
imputed_data
array([[ 1. , 2. , 3. , 4. ], [ 5. , 6. , 7.5, 8. ], [ 10. , 11. , 12. , 6. ]])
df.values
array([[ 1., 2., 3., 4.], [ 5., 6., nan, 8.], [ 10., 11., 12., nan]])
Image(filename='./images/04_04.png', width=400)
Image(filename='./images/04_05.png', width=400)
import pandas as pd
df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
['red', 'L', 13.5, 'class2'],
['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df
color | size | price | classlabel | |
---|---|---|---|---|
0 | green | M | 10.1 | class1 |
1 | red | L | 13.5 | class2 |
2 | blue | XL | 15.3 | class1 |
size_mapping = {'XL': 3,
'L': 2,
'M': 1}
df['size'] = df['size'].map(size_mapping)
df
color | size | price | classlabel | |
---|---|---|---|---|
0 | green | 1 | 10.1 | class1 |
1 | red | 2 | 13.5 | class2 |
2 | blue | 3 | 15.3 | class1 |
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)
0 M 1 L 2 XL Name: size, dtype: object
import numpy as np
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping
{'class1': 0, 'class2': 1}
df['classlabel'] = df['classlabel'].map(class_mapping)
df
color | size | price | classlabel | |
---|---|---|---|---|
0 | green | 1 | 10.1 | 0 |
1 | red | 2 | 13.5 | 1 |
2 | blue | 3 | 15.3 | 0 |
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df
color | size | price | classlabel | |
---|---|---|---|---|
0 | green | 1 | 10.1 | class1 |
1 | red | 2 | 13.5 | class2 |
2 | blue | 3 | 15.3 | class1 |
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y
array([0, 1, 0])
class_le.inverse_transform(y)
array(['class1', 'class2', 'class1'], dtype=object)
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X
array([[1, 1, 10.1], [2, 2, 13.5], [0, 3, 15.3]], dtype=object)
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()
array([[ 0. , 1. , 0. , 1. , 10.1], [ 0. , 0. , 1. , 2. , 13.5], [ 1. , 0. , 0. , 3. , 15.3]])
pd.get_dummies(df[['price', 'color', 'size']])
price | size | color_blue | color_green | color_red | |
---|---|---|---|---|---|
0 | 10.1 | 1 | 0 | 1 | 0 |
1 | 13.5 | 2 | 0 | 0 | 1 |
2 | 15.3 | 3 | 1 | 0 | 0 |
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
'ml/machine-learning-databases/wine/wine.data',
header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
'Alcalinity of ash', 'Magnesium', 'Total phenols',
'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
'Proline']
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()
Class labels [1 2 3]
Class label | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
If the link to the Wine dataset provided above does not work for you, you can find a local copy in this repository at ./../datasets/wine/wine.data.
Or you could fetch it via
df_wine = pd.read_csv('https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
'Alcalinity of ash', 'Magnesium', 'Total phenols',
'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
df_wine.head()
Class label | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
if Version(sklearn_version) < '0.18':
from sklearn.cross_validation import train_test_split
else:
from sklearn.model_selection import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
A visual example:
ex = pd.DataFrame([0, 1, 2, 3, 4, 5])
# standardize
ex[1] = (ex[0] - ex[0].mean()) / ex[0].std(ddof=0)
# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# uses ddof=0 (population standard deviation)
# normalize
ex[2] = (ex[0] - ex[0].min()) / (ex[0].max() - ex[0].min())
ex.columns = ['input', 'standardized', 'normalized']
ex
input | standardized | normalized | |
---|---|---|---|
0 | 0 | -1.46385 | 0.0 |
1 | 1 | -0.87831 | 0.2 |
2 | 2 | -0.29277 | 0.4 |
3 | 3 | 0.29277 | 0.6 |
4 | 4 | 0.87831 | 0.8 |
5 | 5 | 1.46385 | 1.0 |
...
Image(filename='./images/04_12.png', width=500)
Image(filename='./images/04_13.png', width=500)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=0.1)
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))
Training accuracy: 0.983870967742 Test accuracy: 0.981481481481
lr.intercept_
array([-0.38383346, -0.15808134, -0.70036089])
lr.coef_
array([[ 0.28030813, 0. , 0. , -0.02801734, 0. , 0. , 0.71015558, 0. , 0. , 0. , 0. , 0. , 1.23621266], [-0.64400768, -0.06879233, -0.05720974, 0. , 0. , 0. , 0. , 0. , 0. , -0.92679949, 0.06011963, 0. , -0.37101927], [ 0. , 0.06144394, 0. , 0. , 0. , 0. , -0.63694567, 0. , 0. , 0.49850798, -0.35806863, -0.57045459, 0. ]])
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan',
'magenta', 'yellow', 'black',
'pink', 'lightgreen', 'lightblue',
'gray', 'indigo', 'orange']
weights, params = [], []
for c in np.arange(-4., 6.):
lr = LogisticRegression(penalty='l1', C=10.**c, random_state=0)
lr.fit(X_train_std, y_train)
weights.append(lr.coef_[1])
params.append(10.**c)
weights = np.array(weights)
for column, color in zip(range(weights.shape[1]), colors):
plt.plot(params, weights[:, column],
label=df_wine.columns[column + 1],
color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
bbox_to_anchor=(1.38, 1.03),
ncol=1, fancybox=True)
# plt.savefig('./figures/l1_path.png', dpi=300)
plt.show()
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
if Version(sklearn_version) < '0.18':
from sklearn.cross_validation import train_test_split
else:
from sklearn.model_selection import train_test_split
class SBS():
def __init__(self, estimator, k_features, scoring=accuracy_score,
test_size=0.25, random_state=1):
self.scoring = scoring
self.estimator = clone(estimator)
self.k_features = k_features
self.test_size = test_size
self.random_state = random_state
def fit(self, X, y):
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=self.test_size,
random_state=self.random_state)
dim = X_train.shape[1]
self.indices_ = tuple(range(dim))
self.subsets_ = [self.indices_]
score = self._calc_score(X_train, y_train,
X_test, y_test, self.indices_)
self.scores_ = [score]
while dim > self.k_features:
scores = []
subsets = []
for p in combinations(self.indices_, r=dim - 1):
score = self._calc_score(X_train, y_train,
X_test, y_test, p)
scores.append(score)
subsets.append(p)
best = np.argmax(scores)
self.indices_ = subsets[best]
self.subsets_.append(self.indices_)
dim -= 1
self.scores_.append(scores[best])
self.k_score_ = self.scores_[-1]
return self
def transform(self, X):
return X[:, self.indices_]
def _calc_score(self, X_train, y_train, X_test, y_test, indices):
self.estimator.fit(X_train[:, indices], y_train)
y_pred = self.estimator.predict(X_test[:, indices])
score = self.scoring(y_test, y_pred)
return score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)
# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('./sbs.png', dpi=300)
plt.show()
k5 = list(sbs.subsets_[8])
print(df_wine.columns[1:][k5])
Index(['Alcohol', 'Malic acid', 'Alcalinity of ash', 'Hue', 'Proline'], dtype='object')
knn.fit(X_train_std, y_train)
print('Training accuracy:', knn.score(X_train_std, y_train))
print('Test accuracy:', knn.score(X_test_std, y_test))
Training accuracy: 0.983870967742 Test accuracy: 0.944444444444
knn.fit(X_train_std[:, k5], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k5], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k5], y_test))
Training accuracy: 0.959677419355 Test accuracy: 0.962962962963
from sklearn.ensemble import RandomForestClassifier
feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=10000,
random_state=0,
n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
feat_labels[indices[f]],
importances[indices[f]]))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
importances[indices],
color='lightblue',
align='center')
plt.xticks(range(X_train.shape[1]),
feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('./random_forest.png', dpi=300)
plt.show()
1) Color intensity 0.182483 2) Proline 0.158610 3) Flavanoids 0.150948 4) OD280/OD315 of diluted wines 0.131987 5) Alcohol 0.106589 6) Hue 0.078243 7) Total phenols 0.060718 8) Alcalinity of ash 0.032033 9) Malic acid 0.025400 10) Proanthocyanins 0.022351 11) Magnesium 0.022078 12) Nonflavanoid phenols 0.014645 13) Ash 0.013916
if Version(sklearn_version) < '0.18':
X_selected = forest.transform(X_train, threshold=0.15)
else:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
X_selected = sfm.transform(X_train)
X_selected.shape
(124, 3)
Now, let's print the 3 features that met the threshold criterion for feature selection that we set earlier (note that this code snippet does not appear in the actual book but was added to this notebook later for illustrative purposes):
for f in range(X_selected.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
feat_labels[indices[f]],
importances[indices[f]]))
1) Color intensity 0.182483 2) Proline 0.158610 3) Flavanoids 0.150948
...