# Learning scikit-learn¶

## An Introduction to Machine Learning in Python¶

### at PyData Chicago 2016¶

In [ ]:
# Record author, last-updated date, and library versions for reproducibility.
%load_ext watermark
%watermark -a "Sebastian Raschka" -u -d -p numpy,scipy,matplotlib,sklearn,pandas,mlxtend

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# 2 Linear Regression¶

Source: R.J. Gladstone (1905). "A Study of the Relations of the Brain to the Size of the Head", Biometrika, Vol. 4, pp105-123

Description: Brain weight (grams) and head size (cubic cm) for 237 adults classified by gender and age group.

Variables/Columns

• Gender (1=Male, 2=Female)
• Age Range (1=20-46, 2=46+)
• Head size (cubic cm)
• Brain weight (grams)
In [ ]:
# Load the brain-weight / head-size dataset; lines starting with '#'
# in the file are metadata and are skipped.
df = pd.read_csv(
    'dataset_brain.txt',
    encoding='utf-8',
    comment='#',
    sep='\s+',  # columns are whitespace-separated
)
df.tail()

In [ ]:
# Scatter of the raw relationship we want to model.
# Fix: the figure previously had no x-axis label.
plt.scatter(df['head-size'], df['brain-weight'])
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)');


### Preparing the dataset¶

In [ ]:
# Target: brain weight (grams) as a 1-D NumPy array.
y = df['brain-weight'].values
y.shape

In [ ]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features),
# so reshape the single feature into a column vector.
X = df['head-size'].values.reshape(-1, 1)
X.shape

In [ ]:
# Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

In [ ]:
# Visualize the train/test split.
# Fix: added x-axis label and a legend so the figure stands alone.
plt.scatter(X_train, y_train, c='blue', marker='o', label='training set')
plt.scatter(X_test, y_test, c='red', marker='s', label='test set')
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)')
plt.legend();


### Fitting the model¶

In [ ]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit returns the
# estimator, so the construction and fit can be chained), then predict
# brain weight for the held-out test samples.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


### Evaluating the model¶

In [ ]:
# R^2 = 1 - SS_res / SS_tot.
# Fix: the original variable names were swapped -- the first quantity is
# the *residual* sum of squares (prediction errors) and the second is the
# *total* sum of squares (variance around the mean).
residual_sum_of_squares = ((y_test - y_pred) ** 2).sum()
total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
r2_score = 1 - (residual_sum_of_squares / total_sum_of_squares)
print('R2 score: %.3f' % r2_score)

In [ ]:
# Same R^2, computed by the estimator itself -- should match the manual value.
print('R2 score: %.3f' % lr.score(X_test, y_test))

In [ ]:
# Fitted slope(s): one coefficient per feature.
lr.coef_

In [ ]:
# Fitted intercept (bias term).
lr.intercept_

In [ ]:
# Draw the fitted line across the training range.
# Fix: use lr.predict for the endpoints instead of re-implementing
# w * x + b by hand -- same values, but stays consistent with the model.
x_ends = np.array([[X_train.min()],
                   [X_train.max()]])
y_ends = lr.predict(x_ends)

plt.scatter(X_train, y_train, c='blue', marker='o')
plt.plot(x_ends.ravel(), y_ends,
         color='red',
         linewidth=4)
plt.xlabel('Head size (cubic cm)')
plt.ylabel('Brain weight (grams)');


# 3 Introduction to Classification¶

### The Iris dataset¶

In [ ]:
# Load the Iris data (comma-separated; '#' lines in the file are skipped).
df = pd.read_csv(
    'dataset_iris.txt',
    encoding='utf-8',
    comment='#',
    sep=',',
)
df.tail()

In [ ]:
# First four columns are the numeric features; 'class' holds the string
# species labels.
X = df.iloc[:, :4].values
y = df['class'].values
np.unique(y)


### Class label encoding¶

In [ ]:
from sklearn.preprocessing import LabelEncoder

# Map the string class names to integer codes; fit returns the encoder,
# so construction and fitting can be chained. classes_ lists the learned
# label vocabulary.
l_encoder = LabelEncoder().fit(y)
l_encoder.classes_

In [ ]:
# Encoded labels are integers 0..n_classes-1.
y_enc = l_encoder.transform(y)
np.unique(y_enc)

In [ ]:
# inverse_transform recovers the original string labels from the codes.
np.unique(l_encoder.inverse_transform(y_enc))


### Scikit-learn's built-in datasets¶

In [ ]:
from sklearn.datasets import load_iris

# Fix: the dataset was never actually loaded -- `iris` was referenced
# without being assigned, which raises NameError on a fresh kernel.
iris = load_iris()

print(iris['DESCR'])


### Test/train splits¶

In [ ]:
# Keep only the first two features (sepal length & width) so decision
# regions can be drawn in 2-D later.
X, y = iris.data[:, :2], iris.target
# ! We only use 2 features for visual purposes

print('Class labels:', np.unique(y))
print('Class proportions:', np.bincount(y))

In [ ]:
# Fix: train_test_split moved from sklearn.cross_validation (removed in
# scikit-learn 0.20) to sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

# Without stratification the class proportions in the training split can
# deviate from the original balanced distribution.
print('Class labels:', np.unique(y_train))
print('Class proportions:', np.bincount(y_train))

In [ ]:
# Fix: import from sklearn.model_selection (sklearn.cross_validation was
# removed in 0.20).
from sklearn.model_selection import train_test_split

# stratify=y preserves the class distribution in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123,
    stratify=y)

print('Class labels:', np.unique(y_train))
print('Class proportions:', np.bincount(y_train))


### Logistic Regression¶

In [ ]:
from sklearn.linear_model import LogisticRegression

# Multinomial (softmax) logistic regression; the newton-cg solver
# supports the multinomial objective.
lr = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=1,
)

lr.fit(X_train, y_train)
print('Test accuracy %.2f' % lr.score(X_test, y_test))

In [ ]:
from mlxtend.evaluate import plot_decision_regions
# NOTE(review): in newer mlxtend versions this helper lives in
# mlxtend.plotting -- confirm against the installed version.

# Fixes: removed a stray no-op expression line that merely referenced
# plot_decision_regions, and corrected the second label call, which
# overwrote xlabel instead of setting the y-axis label.
plot_decision_regions(X=X, y=y, clf=lr, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');


### K-Nearest Neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbor classifier (k=4) on the same 2-feature split; fit
# returns the estimator, so construction and fitting are chained.
kn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
print('Test accuracy %.2f' % kn.score(X_test, y_test))

In [ ]:
# Fix: the second label call should set the y-axis label, not overwrite
# the x-axis label.
plot_decision_regions(X=X, y=y, clf=kn, X_highlight=X_test)
plt.xlabel('sepal length [cm]')
plt.ylabel('sepal width [cm]');


### 3 - Exercises¶

• Which of the two models above would you prefer if you had to choose? Why?
• What would be possible ways to resolve ties in KNN when n_neighbors is an even number?
• Can you find the right spot in the scikit-learn documentation to read about how scikit-learn handles this?
• Train & evaluate the Logistic Regression and KNN algorithms on the 4-dimensional iris datasets.
• What performance do you observe?
• Why is it different vs. using only 2 dimensions?
• Would adding more dimensions help?

# 4 - Feature Preprocessing & scikit-learn Pipelines¶

### Categorical features: nominal vs ordinal¶

In [ ]:
import pandas as pd

df = pd.DataFrame([
['green', 'M', 10.0],
['red', 'L', 13.5],
['blue', 'XL', 15.3]])

df.columns = ['color', 'size', 'prize']
df

In [ ]:
from sklearn.feature_extraction import DictVectorizer

dvec = DictVectorizer(sparse=False)

X = dvec.fit_transform(df.transpose().to_dict().values())
X

In [ ]:
# Encode the ordinal 'size' column with an explicit ordering M < L < XL.
size_mapping = {'M': 1, 'L': 2, 'XL': 3}

df['size'] = df['size'].map(size_mapping)
df

In [ ]:
# Re-encode: 'size' is now numeric, so only 'color' gets one-hot columns.
X = dvec.fit_transform(df.transpose().to_dict().values())
X


### Normalization¶

In [ ]:
# Single numeric feature used to demonstrate the two scaling schemes below.
df = pd.DataFrame([1., 2., 3., 4., 5., 6.], columns=['feature'])
df

In [ ]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Compare min-max scaling (maps to [0, 1]) with z-score standardization
# (zero mean, unit variance).
mmxsc = MinMaxScaler()
stdsc = StandardScaler()

# Scalers expect a 2-D (n_samples, n_features) array.
X = df['feature'].values.reshape(-1, 1)

df['minmax'] = mmxsc.fit_transform(X)
df['z-score'] = stdsc.fit_transform(X)

df


### Pipelines¶

In [ ]:
from sklearn.pipeline import make_pipeline
# Fix: train_test_split moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection.
from sklearn.model_selection import train_test_split

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123,
    stratify=y)

lr = LogisticRegression(solver='newton-cg',
                        multi_class='multinomial',
                        random_state=1)

# The pipeline standardizes inside fit/score, so the scaler is fit on the
# training data only -- no information leaks from the test set.
lr_pipe = make_pipeline(StandardScaler(), lr)

lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)

In [ ]:
# Dict of step-name -> estimator; names are auto-generated lowercased
# class names (e.g. 'standardscaler').
lr_pipe.named_steps

In [ ]:
# Apply only the (already fitted) scaling step to the first five samples.
lr_pipe.named_steps['standardscaler'].transform(X[:5])


### 4 - Exercises¶

• Why is it important that we scale test and training sets separately?
• Fit a KNN classifier to the standardized Iris dataset. Do you notice difference in the predictive performance of the model compared to the non-standardized one? Why or why not?

# 5 - Dimensionality Reduction: Feature Selection & Extraction¶

In [ ]:
# Fix: train_test_split moved to sklearn.model_selection
# (sklearn.cross_validation was removed in scikit-learn 0.20).
from sklearn.model_selection import train_test_split

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)


### Recursive Feature Elimination¶

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 5-fold cross-validation: drop one
# feature per iteration (step=1) and keep the subset with the best
# CV accuracy.
lr = LogisticRegression()
rfe = RFECV(lr, step=1, cv=5, scoring='accuracy')
rfe.fit(X_train, y_train)

print('Number of features:', rfe.n_features_)
print('Feature ranking', rfe.ranking_)


### Sequential Feature Selection¶

In [ ]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import plot_sequential_feature_selection as plot_sfs
# NOTE(review): newer mlxtend versions expose the plotting helper as
# mlxtend.plotting.plot_sequential_feature_selection -- confirm against
# the installed version.

# Fix: the original built and fit a first SFS instance that was
# immediately discarded and replaced; that redundant fit is removed.
# Sequential forward selection: greedily add the feature that most
# improves CV accuracy, up to all 4 features.
sfs = SFS(lr,
          k_features=4,
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=2)

sfs = sfs.fit(X, y)
fig1 = plot_sfs(sfs.get_metric_dict())

plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()

In [ ]:
# Per-step record of the selected feature indices and their CV scores.
sfs.subsets_


### Principal Component Analysis¶

In [ ]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# NOTE(review): PCA runs on the *unscaled* features here. The original
# created a StandardScaler but never applied it; standardizing before a
# variance-based PCA is usually recommended -- kept unscaled to preserve
# the original behavior.
pca = PCA(n_components=4)

# Fixes: fit() suffices since the fit_transform() result was discarded
# (and the passed y is ignored by PCA); the unused scaler instance was
# removed; the body of the `with` block lost its indentation in the
# original and would not run -- restored here.
pca.fit(X_train)

var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

idx = [i for i in range(len(var_exp))]
labels = [str(i + 1) for i in idx]
with plt.style.context('seaborn-whitegrid'):
    plt.bar(range(4), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(range(4), cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.xticks(idx, labels)
    plt.legend(loc='center right')
    plt.tight_layout()
    plt.show()

In [ ]:
# Project the training data onto the principal components and plot the
# first two, one color/marker per class.
# Fix: the for-loop body lost its indentation in the original and would
# not run -- restored here.
X_train_pca = pca.transform(X_train)

for lab, col, mar in zip((0, 1, 2),
                         ('blue', 'red', 'green'),
                         ('o', 's', '^')):
    plt.scatter(X_train_pca[y_train == lab, 0],
                X_train_pca[y_train == lab, 1],
                label=lab,
                marker=mar,
                c=col)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower right')
plt.tight_layout()


# 6 - Model Evaluation & Hyperparameter Tuning¶

### Wine Dataset¶

In [ ]:
# Load the Wine dataset bundled with mlxtend (178 samples, 13 features,
# 3 classes -- see the description below).
from mlxtend.data import wine_data

X, y = wine_data()


Wine dataset.

Number of samples : 178

Class labels : {0, 1, 2}, distribution: [59, 71, 48]

Dataset Attributes:

1. Alcohol
2. Malic acid
3. Ash
4. Alcalinity of ash
5. Magnesium
6. Total phenols
7. Flavanoids
8. Nonflavanoid phenols
9. Proanthocyanins
10. Color intensity
11. Hue
12. OD280/OD315 of diluted wines
13. Proline

### Stratified K-Fold¶

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
# Fix: StratifiedKFold moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection, with a new API: configure with
# n_splits and pass the data to .split().
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier as KNN

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)

# Scale -> project onto 1 principal component -> 3-NN, as one leak-free
# unit (all steps are re-fit inside each CV fold).
pipe_kn = make_pipeline(StandardScaler(),
                        PCA(n_components=1),
                        KNN(n_neighbors=3))

# The original passed random_state=1 without shuffling, which had no
# effect, so it is dropped here. The loop body also lost its indentation
# in the original -- restored.
kfold = StratifiedKFold(n_splits=10)

scores = []
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    pipe_kn.fit(X_train[train], y_train[train])
    score = pipe_kn.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [ ]:
# Fix: cross_val_score moved from sklearn.cross_validation (removed in
# 0.20) to sklearn.model_selection. It performs the stratified 10-fold
# loop above in one call.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_kn,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=2)

print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [ ]:
# Step names double as GridSearchCV parameter prefixes (e.g. 'pca__...').
pipe_kn.named_steps

In [ ]:
# Fix: GridSearchCV moved from sklearn.grid_search (removed in 0.20) to
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# Jointly tune the number of PCA components and k for the kNN step;
# n_components=None keeps all components.
param_grid = {'pca__n_components': [1, 2, 3, 4, 5, 6, None],
              'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9, 11]}

# refit=True retrains the best parameter setting on the full training set.
gs = GridSearchCV(estimator=pipe_kn,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=2,
                  refit=True)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [ ]:
# Evaluate the refit best model on the held-out test set.
gs.score(X_test, y_test)