%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNetCV, ElasticNet, Ridge
from sklearn.model_selection import learning_curve, ShuffleSplit
from sklearn.model_selection import GridSearchCV, cross_val_score
n_samples = 500
n_features = 500
all_inputs = np.random.randn(n_samples, n_features)
# Ground truth model: linear combination of the first two variables
w_true = np.zeros(n_features)
w_true[:2] = 1.
y = np.dot(all_inputs, w_true)
# Partial observation: the first informative variable cannot be observed
X = all_inputs[:, 1:]
y.shape
(500,)
plt.hist(y);
X.shape
(500, 499)
gs_ridge = GridSearchCV(Ridge(), {'alpha': np.logspace(-6, 2, 20)},
scoring='neg_median_absolute_error', cv=10).fit(X, y)
gs_ridge.best_params_
{'alpha': 100.0}
gs_ridge.best_score_
-0.88770978619896734
enet_params = {
'alpha': np.logspace(-6, -1, 6),
'l1_ratio': [0.1, 0.2, 0.3],
}
gs_enet = GridSearchCV(ElasticNet(max_iter=10000), enet_params,
scoring='neg_median_absolute_error', cv=10, n_jobs=10)
gs_enet.fit(X, y);
gs_enet.best_score_
-0.62234474540259543
gs_enet.best_params_
{'alpha': 0.10000000000000001, 'l1_ratio': 0.3}
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
scoring='neg_median_absolute_error')
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
title = "Learning Curves (ElasticNet)"
cv = ShuffleSplit(n_splits=50, test_size=0.1, random_state=0)
enet = ElasticNet(alpha=1e-1, l1_ratio=0.3, max_iter=10000)
plot_learning_curve(enet, title, X, y, ylim=(-2, 0.1), cv=cv, n_jobs=4);
title = "Learning Curves (Ridge)"
cv = ShuffleSplit(n_splits=50, test_size=0.1, random_state=0)
ridge = Ridge(alpha=100)
plot_learning_curve(ridge, title, X, y, ylim=(-2, 0.1), cv=cv, n_jobs=4);