import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import cross_validation
%matplotlib inline
# Demo: fit polynomials of degree 1, 4 and 15 to noisy samples of
# cos(1.5*pi*x) and compare under-/over-fitting via cross-validated MSE.
# (Original notebook paste had lost the loop indentation; restored here.)
np.random.seed(0)

n_samples = 30
degrees = [1, 4, 15]

# Ground-truth function the noisy samples are drawn from.
true_fun = lambda X: np.cos(1.5 * np.pi * X)
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=())

    polynomial_features = PolynomialFeatures(degree=degrees[i],
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)

    # Evaluate the models using crossvalidation.
    # NOTE: "mean_squared_error" scoring returns negated values in this
    # sklearn version, hence the minus sign in the title below.
    scores = cross_validation.cross_val_score(
        pipeline, X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)

    X_test = np.linspace(0, 1, 100)
    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degrees[i], -scores.mean(), scores.std()))
def gen_regression_plots(alpha):
    """Compare an unregularized degree-15 fit against a Ridge fit.

    Fits a degree-15 polynomial to the module-level (X, y) data twice --
    left panel with plain least squares, right panel with Ridge
    regularization of strength *alpha* -- and plots both fits with their
    10-fold cross-validated MSE in the title.

    Parameters
    ----------
    alpha : float
        Ridge regularization strength for the right-hand panel.
    """
    def _fit_and_plot(axis, regressor):
        # Fit a degree-15 polynomial pipeline ending in *regressor*,
        # cross-validate it, and draw the fit on *axis*.
        pipeline = Pipeline([
            ("polynomial_features", PolynomialFeatures(degree=15,
                                                       include_bias=False)),
            ("linear_regression", regressor),
        ])
        pipeline.fit(X[:, np.newaxis], y)
        # Evaluate the models using crossvalidation.  Scoring is negated
        # MSE in this sklearn version, hence the minus sign below.
        scores = cross_validation.cross_val_score(
            pipeline, X[:, np.newaxis], y,
            scoring="mean_squared_error", cv=10)
        X_test = np.linspace(0, 1, 100)
        axis.plot(X_test, pipeline.predict(X_test[:, np.newaxis]),
                  label="Model")
        axis.plot(X_test, true_fun(X_test), label="True function")
        axis.scatter(X, y, label="Samples")
        axis.set_xlabel("x")
        axis.set_ylabel("y")
        axis.set_xlim((0, 1))
        axis.set_ylim((-2, 2))
        axis.legend(loc="best")
        axis.set_title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
            15, -scores.mean(), scores.std()))

    fig, axes = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    _fit_and_plot(axes[0], LinearRegression())
    _fit_and_plot(axes[1], Ridge(alpha=alpha))
# Sweep regularization strengths, from moderate down to effectively none.
for ridge_alpha in (1, 1e-5, 0.001, 1e-30):
    gen_regression_plots(ridge_alpha)
# Load the Boston housing dataset and eyeball price vs. number of rooms.
# NOTE(review): load_boston was deprecated and later removed from
# scikit-learn (1.2+); this notebook targets an older version.
from sklearn.datasets import load_boston
boston = load_boston()
boston.data.shape         # -> (506, 13)
boston.data[:, 5].shape   # -> (506,)

plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Appartment price")
# Out: <matplotlib.text.Text at 0x2329e353da0>   (pasted notebook output)
# Expand the "rooms" feature into polynomial features (x, x^2, x^3).
from sklearn.preprocessing import PolynomialFeatures
polynomial_features = PolynomialFeatures(degree=3, include_bias=False)
x = boston.data[:, 5]
y = boston.target
x = x[:, np.newaxis]   # column vector: shape (506, 1)
x[:10]
# Out: array([[ 6.575], [ 6.421], [ 7.185], [ 6.998], [ 7.147], [ 6.43 ], [ 6.012], [ 6.172], [ 5.631], [ 6.004]])
x_new = polynomial_features.fit_transform(x)
x_new[:10]
# Out: array([[ 6.575 , 43.230625 , 284.24135938], [ 6.421 , 41.229241 , 264.73295646], [ 7.185 , 51.624225 , 370.92005662], [ 6.998 , 48.972004 , 342.70608399], [ 7.147 , 51.079609 , 365.06596552], [ 6.43 , 41.3449 , 265.847707 ], [ 6.012 , 36.144144 , 217.29859373], [ 6.172 , 38.093584 , 235.11360045], [ 5.631 , 31.708161 , 178.54865459], [ 6.004 , 36.048016 , 216.43228806]])

# With two input columns, the degree-3 expansion produces all monomials
# of total degree <= 3 (minus the bias): 9 columns.
x = boston.data[:10, 5:7]
x
# Out: array([[ 6.575, 65.2 ], [ 6.421, 78.9 ], [ 7.185, 61.1 ], [ 6.998, 45.8 ], [ 7.147, 54.2 ], [ 6.43 , 58.7 ], [ 6.012, 66.6 ], [ 6.172, 96.1 ], [ 5.631, 100. ], [ 6.004, 85.9 ]])
polynomial_features = PolynomialFeatures(degree=3, include_bias=False)
x_new = polynomial_features.fit_transform(x)
x_new.shape   # -> (10, 9)
x_new
# Out: array([[ 6.57500000e+00, 6.52000000e+01, 4.32306250e+01, 4.28690000e+02, 4.25104000e+03, 2.84241359e+02, 2.81863675e+03, 2.79505880e+04, 2.77167808e+05], ... ])  (pasted notebook output, truncated rows follow the same pattern)
# Degree-2 expansion of ALL 13 features (bias column included): 105 columns.
polynomial_features = PolynomialFeatures(2)
x = polynomial_features.fit_transform(boston.data)
x.shape   # -> (506, 105)

# Heavily regularized ridge on the expanded features; one CV fold is
# catastrophically bad.
lr = Ridge(2000)
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
scores
# Out: array([ 0.42974417, 0.56656339, 0.74587033, 0.35513514, -12.73529056])
# Plain linear regression on the single "rooms" feature.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
x = boston.data[:, 5]
y = boston.target
x = x[:, np.newaxis]
lr.fit(x, y)
y_predicted = lr.predict(x)

lr.coef_, lr.intercept_
# Out: (array([ 9.10210898]), -34.670620776438554)

# Three equivalent ways to compute the prediction for the first sample.
np.sum(x[0] * lr.coef_) + lr.intercept_   # -> 25.175745774821969
x[0].dot(lr.coef_) + lr.intercept_        # -> 25.175745774821969
y_predicted[0]                            # -> 25.175745774821969

y[:10]
# Out: array([ 24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])
np.round(y_predicted[:10])
# Out: array([ 25., 24., 31., 29., 30., 24., 20., 22., 17., 20.])

# Scatter the data and overlay the fitted line between x.min() and x.max().
plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Appartment price")
xs = np.array([x.min(), x.max()])
plt.plot(xs, lr.predict(xs[:, np.newaxis]), color='g')
# Out: [<matplotlib.lines.Line2D at 0x2329c76b4e0>]   (pasted notebook output)
# Error metrics for the single-feature fit.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y, lr.predict(x))
rmse = np.sqrt(mse)
print(mse)    # 43.6005517712
print(rmse)   # 6.60307138922
y.mean()      # -> 22.532806324110677

# R^2 three equivalent ways: metric function, estimator score, by hand.
from sklearn.metrics import r2_score
print(r2_score(y, lr.predict(x)))
print(lr.score(x, y))
print(1 - np.sum((y - y_predicted) ** 2) / np.sum((y - np.mean(y)) ** 2))
# All three print 0.483525455991

# Cross-validated R^2 is far worse than train R^2: the single-feature
# model does not generalize.
from sklearn import cross_validation
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
print(scores)
# Out: [ 0.70708692  0.63476138  0.50385441 -0.21594318 -1.77736913]
scores.mean()   # -> -0.029521919957810328
# Refit using all 13 features.
x = boston.data
y = boston.target
lr.fit(x, y)
y_predicted = lr.predict(x)

# Reuse the predictions already computed instead of calling predict again.
mse = mean_squared_error(y, y_predicted)
np.sqrt(mse)     # -> 4.6795063006355164 (train RMSE)
lr.score(x, y)   # -> 0.7406077428649428 (train R^2)

scores = cross_validation.cross_val_score(lr, x, y, cv=5)
scores.mean()    # -> 0.35074135093252512 (cross-validated R^2)
# Lasso regularization path: how each feature weight evolves as the
# regularization strength alpha decreases.
from sklearn.linear_model import Lasso

lasso_model = Lasso()
alphas = np.logspace(-5, 2, 1000)
alphas, weight_paths, _ = lasso_model.path(x, y, alphas=alphas)

fig, ax = plt.subplots()
ax.plot(alphas, weight_paths.T)
ax.set_xscale('log')
# Strongest regularization on the left.
ax.set_xlim(alphas.max(), alphas.min())
ax.set_ylabel("features weights")
ax.set_xlabel("alpha");
# Ridge regularization path: coefficients as a function of alpha.
alphas = np.logspace(-5, 5, 1000)
clf = Ridge()
coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(x, y)
    coefs.append(clf.coef_)

ax = plt.gca()
# set_color_cycle was deprecated in matplotlib 1.5 (the original run
# emitted a MatplotlibDeprecationWarning); set_prop_cycle is the
# replacement with identical effect.
ax.set_prop_cycle(color=['b', 'r', 'g', 'c', 'k', 'y', 'm'])
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis: strong regularization left
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight');
# Grid-search the ridge alpha.
# NOTE(review): sklearn.grid_search is the pre-0.18 module location;
# newer scikit-learn moved GridSearchCV to sklearn.model_selection.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=200)
scores = cross_validation.cross_val_score(ridge, x, y, cv=5)
scores.mean()   # -> 0.49775159276568087

from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(ridge, {'alpha': [0.0001, 0.01, 1, 10, 100, 200, 1000000]},)
gs.fit(x, y)
# Out: GridSearchCV(cv=None, error_score='raise', estimator=Ridge(alpha=200, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), fit_params={}, iid=True, n_jobs=1, param_grid={'alpha': [0.0001, 0.01, 1, 10, 100, 200, 1000000]}, pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

gs.grid_scores_
# Out: [mean: -1.57007, std: 3.02069, params: {'alpha': 0.0001}, mean: -1.54538, std: 2.98794, params: {'alpha': 0.01}, mean: -0.66233, std: 1.79668, params: {'alpha': 1}, mean: -0.06774, std: 0.95313, params: {'alpha': 10}, mean: 0.40866, std: 0.20802, params: {'alpha': 100}, mean: 0.45021, std: 0.12174, params: {'alpha': 200}, mean: -0.21091, std: 0.23242, params: {'alpha': 1000000}]
gs.best_params_   # -> {'alpha': 200}