import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import cross_validation
# %matplotlib inline  # NOTE(review): IPython/Jupyter magic — invalid in a plain .py script; keep commented out
# Reproducible synthetic data set: noisy samples of cos(1.5*pi*x) on [0, 1).
np.random.seed(0)
n_samples = 30
degrees = [1, 4, 15]  # polynomial degrees compared below (under/good/over-fit)


def true_fun(X):
    """Ground-truth function the polynomial models try to recover."""
    # def instead of a lambda assignment (PEP 8 E731); same callable contract.
    return np.cos(1.5 * np.pi * X)


# Sorted x values make the line plots monotone left-to-right.
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1
# Compare under-/well-/over-fitting: fit a polynomial of each degree to the
# noisy samples and report the 10-fold cross-validated MSE in each title.
# (Indentation restored — the notebook-to-script paste had flattened it.)
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=())

    polynomial_features = PolynomialFeatures(degree=degrees[i],
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)

    # Evaluate the models using cross-validation.
    # NOTE(review): the cross_validation module and the "mean_squared_error"
    # scoring name are deprecated since sklearn 0.18 (removed in 0.20);
    # migrate to model_selection / "neg_mean_squared_error" when upgrading.
    scores = cross_validation.cross_val_score(
        pipeline, X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)

    X_test = np.linspace(0, 1, 100)
    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    # Scores are negative MSE under this scoring, hence the sign flip.
    plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degrees[i], -scores.mean(), scores.std()))
def gen_regression_plots(alpha):
    """Plot a degree-15 OLS fit next to a degree-15 Ridge fit.

    Parameters
    ----------
    alpha : float
        Regularization strength passed to ``Ridge`` for the right panel.

    Both panels show the fitted curve, the true function, the noisy samples,
    and the 10-fold cross-validated MSE in the title.
    """
    # The original body duplicated ~20 lines for the two panels; the shared
    # fit/score/plot logic now lives in _fit_and_plot (indentation restored).
    fig, axes = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    _fit_and_plot(axes[0], LinearRegression())
    _fit_and_plot(axes[1], Ridge(alpha=alpha))


def _fit_and_plot(ax, regressor, degree=15):
    """Fit a polynomial pipeline around `regressor`, cross-validate it,
    and draw model / true function / samples on the given axes."""
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", regressor)])
    pipeline.fit(X[:, np.newaxis], y)

    # Evaluate the model using cross-validation.
    # NOTE(review): cross_validation / "mean_squared_error" are deprecated
    # sklearn APIs (removed in 0.20); kept for consistency with this file.
    scores = cross_validation.cross_val_score(
        pipeline, X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)

    X_test = np.linspace(0, 1, 100)
    ax.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    ax.plot(X_test, true_fun(X_test), label="True function")
    ax.scatter(X, y, label="Samples")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.legend(loc="best")
    # Scores are negative MSE under this scoring, hence the sign flip.
    ax.set_title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degree, -scores.mean(), scores.std()))
# Sweep the Ridge penalty from moderate, to very strong, to effectively none:
# large alpha over-smooths, tiny alpha reproduces the unregularized over-fit.
for ridge_alpha in (1, 1000, 0.00001, 1e-30):
    gen_regression_plots(ridge_alpha)
# Boston housing data: eyeball price vs. average number of rooms (column 5).
# NOTE(review): load_boston is deprecated and was removed in scikit-learn 1.2;
# switch to another dataset (e.g. California housing) when upgrading.
from sklearn.datasets import load_boston
boston = load_boston()
plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Apartment price")
# (pasted notebook output "<matplotlib.text.Text at 0x21bc1e45588>" removed —
# it was a syntax error in a .py file)
# Fit ordinary least squares on the single "rooms" feature and overlay the line.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
x = boston.data[:, 5]
y = boston.target  # NOTE: rebinds y away from the synthetic data above
x = x[:, np.newaxis]  # sklearn expects a 2-D (n_samples, 1) design matrix
lr.fit(x, y)
y_predicted = lr.predict(x)

plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Apartment price")
# A straight line only needs its two extreme x values to be drawn.
xs = np.array([x.min(), x.max()])
plt.plot(xs, lr.predict(xs[:, np.newaxis]), color='g')
# (pasted notebook output "[<matplotlib.lines.Line2D at 0x21bc0a49278>]" removed)
# In-sample MSE and RMSE for the one-feature fit; RMSE is in price units.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y, lr.predict(x))
rmse = np.sqrt(mse)
print(mse)
print(rmse)
# (pasted notebook output "43.6005517712 6.60307138922" removed)
# R^2 computed three equivalent ways: metrics.r2_score, the estimator's own
# score(), and the definition 1 - SS_res / SS_tot — all three print the same.
from sklearn.metrics import r2_score
print(r2_score(y, lr.predict(x)))
print(lr.score(x, y))
print(1 - np.sum((y - y_predicted) ** 2) / np.sum((y - np.mean(y)) ** 2))
# (pasted notebook output "0.483525455991 ..." removed)
# 5-fold cross-validated R^2 for the single-feature model. The per-fold
# scores vary wildly and the mean is ~0, i.e. it barely generalizes.
# NOTE(review): sklearn.cross_validation is deprecated since 0.18 (removed
# in 0.20); use sklearn.model_selection when upgrading.
from sklearn import cross_validation
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
print(scores)
# (pasted notebook outputs removed — scores.mean() was ~ -0.0295)
scores.mean()
# Refit using all 13 Boston features: in-sample RMSE drops and R^2 rises,
# but the cross-validated R^2 shows the in-sample score is optimistic.
x = boston.data
y = boston.target
lr.fit(x, y)
y_predicted = lr.predict(x)
mse = mean_squared_error(y, lr.predict(x))
# Bare expressions below were notebook display cells; in a script they
# produce no output — wrap in print() if run as a plain .py file.
np.sqrt(mse)      # was ~4.68 (vs ~6.60 with one feature)
lr.score(x, y)    # was ~0.74 in-sample R^2
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
scores.mean()     # was ~0.35 cross-validated R^2
from sklearn.linear_model import Lasso

# Lasso regularization path: trace how every feature weight shrinks toward
# zero as the penalty strength grows.
lasso_model = Lasso()
alphas, coefs, _ = lasso_model.path(x, y, alphas=np.logspace(-5, 2, 1000))

fig, ax = plt.subplots()
ax.plot(alphas, coefs.T)
ax.set_xscale('log')
ax.set_xlim(alphas.max(), alphas.min())  # strongest penalty on the left
ax.set_ylabel("features weights")
ax.set_xlabel("alpha")
# Ridge regularization path: refit at each alpha and record the coefficients.
alphas = np.logspace(-5, 5, 1000)
clf = Ridge()
coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(x, y)
    coefs.append(clf.coef_)

ax = plt.gca()
# set_color_cycle was deprecated in matplotlib 1.5 (the old code emitted a
# MatplotlibDeprecationWarning); set_prop_cycle is the supported replacement.
ax.set_prop_cycle(color=['b', 'r', 'g', 'c', 'k', 'y', 'm'])
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis: strong penalty on the left
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
# (pasted deprecation-warning output removed — it was not valid Python)
# A strongly regularized Ridge generalizes better than plain OLS here:
# mean 5-fold R^2 was ~0.50 vs ~0.35 for the unregularized fit above.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=200)
scores = cross_validation.cross_val_score(ridge, x, y, cv=5)
scores.mean()
# (pasted notebook output "0.49775159276568087" removed)
# Grid-search the Ridge penalty over six orders of magnitude; alpha=100 won
# on (default) cross-validated R^2 in the recorded run.
# NOTE(review): sklearn.grid_search is deprecated since 0.18 (removed in
# 0.20); GridSearchCV lives in sklearn.model_selection there, and the
# grid_scores_ attribute became cv_results_.
from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(ridge, {'alpha': [0.0001, 0.01, 1, 10, 100, 1000000]})
gs.fit(x, y)
# Bare expressions below were notebook display cells; print() them in a script.
gs.grid_scores_
gs.best_params_   # was {'alpha': 100}
# (pasted notebook outputs removed — they were not valid Python)