In [74]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import cross_validation

%matplotlib inline

np.random.seed(0)

n_samples = 30
degrees = [1, 4, 15]

def true_fun(X):
    return np.cos(1.5 * np.pi * X)

# Noisy samples of the true function on [0, 1]
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=())

    polynomial_features = PolynomialFeatures(degree=degrees[i],
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)

    # Evaluate the model using 10-fold cross-validation
    scores = cross_validation.cross_val_score(pipeline,
        X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)

    X_test = np.linspace(0, 1, 100)
    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    plt.title("Degree {}\nMSE = {:.2e} (+/- {:.2e})".format(
        degrees[i], -scores.mean(), scores.std()))
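For reference, a small sketch (reusing X, y, and degrees from above, and the same cross_validation API; not part of the original figure) that prints the training error next to the cross-validated error for each degree. A large gap between the two is the signature of overfitting:
In [ ]:
# Sketch: training MSE vs. 10-fold CV MSE for each polynomial degree.
for degree in degrees:
    model = Pipeline([("polynomial_features",
                       PolynomialFeatures(degree=degree, include_bias=False)),
                      ("linear_regression", LinearRegression())])
    model.fit(X[:, np.newaxis], y)
    train_mse = np.mean((model.predict(X[:, np.newaxis]) - y) ** 2)
    cv_mse = -cross_validation.cross_val_score(model, X[:, np.newaxis], y,
        scoring="mean_squared_error", cv=10).mean()
    print("degree {:2d}: train MSE = {:.2e}, CV MSE = {:.2e}".format(
        degree, train_mse, cv_mse))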
In [75]:
def gen_regression_plots(alpha):
    """Plot a degree-15 fit without regularization (left) and with a
    ridge penalty of strength alpha (right), side by side."""
    fig, axes = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)

    models = [("LinearRegression", LinearRegression()),
              ("Ridge", Ridge(alpha=alpha))]
    for ax, (name, regressor) in zip(axes, models):
        pipeline = Pipeline([("polynomial_features",
                              PolynomialFeatures(degree=15, include_bias=False)),
                             ("linear_regression", regressor)])
        pipeline.fit(X[:, np.newaxis], y)

        # Evaluate the model using 10-fold cross-validation
        scores = cross_validation.cross_val_score(pipeline,
            X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)

        X_test = np.linspace(0, 1, 100)
        ax.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
        ax.plot(X_test, true_fun(X_test), label="True function")
        ax.scatter(X, y, label="Samples")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.set_xlim((0, 1))
        ax.set_ylim((-2, 2))
        ax.legend(loc="best")
        ax.set_title("{}, degree 15\nMSE = {:.2e} (+/- {:.2e})".format(
            name, -scores.mean(), scores.std()))
gen_regression_plots(1)
In [76]:
gen_regression_plots(1e-5)
In [81]:
gen_regression_plots(0.001)
In [78]:
gen_regression_plots(1e-30)
In [82]:
from sklearn.datasets import load_boston
boston = load_boston()
In [83]:
boston.data.shape
Out[83]:
(506, 13)
In [84]:
boston.data[:, 5].shape
Out[84]:
(506,)
In [85]:
plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Appartment price")
Out[85]:
<matplotlib.text.Text at 0x2329e353da0>
In [141]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_features = PolynomialFeatures(degree=3, include_bias=False)
In [152]:
x = boston.data[:, 5]
y = boston.target
x = x[:, np.newaxis]
In [153]:
x[:10]
Out[153]:
array([[ 6.575],
       [ 6.421],
       [ 7.185],
       [ 6.998],
       [ 7.147],
       [ 6.43 ],
       [ 6.012],
       [ 6.172],
       [ 5.631],
       [ 6.004]])
In [154]:
x_new = polynomial_features.fit_transform(x)
x_new[:10]
Out[154]:
array([[   6.575     ,   43.230625  ,  284.24135938],
       [   6.421     ,   41.229241  ,  264.73295646],
       [   7.185     ,   51.624225  ,  370.92005662],
       [   6.998     ,   48.972004  ,  342.70608399],
       [   7.147     ,   51.079609  ,  365.06596552],
       [   6.43      ,   41.3449    ,  265.847707  ],
       [   6.012     ,   36.144144  ,  217.29859373],
       [   6.172     ,   38.093584  ,  235.11360045],
       [   5.631     ,   31.708161  ,  178.54865459],
       [   6.004     ,   36.048016  ,  216.43228806]])
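The three columns of x_new are just the powers of the single input column; a quick check (a sketch reusing x and x_new from above):
In [ ]:
# Sketch: with one feature and include_bias=False, the degree-3 expansion
# is exactly x, x**2, x**3.
print(np.allclose(x_new[:, 0], x[:, 0]))
print(np.allclose(x_new[:, 1], x[:, 0] ** 2))
print(np.allclose(x_new[:, 2], x[:, 0] ** 3))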
In [158]:
x = boston.data[:10, 5:7]
x
Out[158]:
array([[   6.575,   65.2  ],
       [   6.421,   78.9  ],
       [   7.185,   61.1  ],
       [   6.998,   45.8  ],
       [   7.147,   54.2  ],
       [   6.43 ,   58.7  ],
       [   6.012,   66.6  ],
       [   6.172,   96.1  ],
       [   5.631,  100.   ],
       [   6.004,   85.9  ]])
In [162]:
polynomial_features = PolynomialFeatures(degree=3, include_bias=False)
In [163]:
x_new = polynomial_features.fit_transform(x)
x_new.shape
Out[163]:
(10, 9)
In [164]:
x_new
Out[164]:
array([[  6.57500000e+00,   6.52000000e+01,   4.32306250e+01,
          4.28690000e+02,   4.25104000e+03,   2.84241359e+02,
          2.81863675e+03,   2.79505880e+04,   2.77167808e+05],
       [  6.42100000e+00,   7.89000000e+01,   4.12292410e+01,
          5.06616900e+02,   6.22521000e+03,   2.64732956e+02,
          3.25298711e+03,   3.99720734e+04,   4.91169069e+05],
       [  7.18500000e+00,   6.11000000e+01,   5.16242250e+01,
          4.39003500e+02,   3.73321000e+03,   3.70920057e+02,
          3.15424015e+03,   2.68231138e+04,   2.28099131e+05],
       [  6.99800000e+00,   4.58000000e+01,   4.89720040e+01,
          3.20508400e+02,   2.09764000e+03,   3.42706084e+02,
          2.24291778e+03,   1.46792847e+04,   9.60719120e+04],
       [  7.14700000e+00,   5.42000000e+01,   5.10796090e+01,
          3.87367400e+02,   2.93764000e+03,   3.65065966e+02,
          2.76851481e+03,   2.09953131e+04,   1.59220088e+05],
       [  6.43000000e+00,   5.87000000e+01,   4.13449000e+01,
          3.77441000e+02,   3.44569000e+03,   2.65847707e+02,
          2.42694563e+03,   2.21557867e+04,   2.02262003e+05],
       [  6.01200000e+00,   6.66000000e+01,   3.61441440e+01,
          4.00399200e+02,   4.43556000e+03,   2.17298594e+02,
          2.40719999e+03,   2.66665867e+04,   2.95408296e+05],
       [  6.17200000e+00,   9.61000000e+01,   3.80935840e+01,
          5.93129200e+02,   9.23521000e+03,   2.35113600e+02,
          3.66079342e+03,   5.69997161e+04,   8.87503681e+05],
       [  5.63100000e+00,   1.00000000e+02,   3.17081610e+01,
          5.63100000e+02,   1.00000000e+04,   1.78548655e+02,
          3.17081610e+03,   5.63100000e+04,   1.00000000e+06],
       [  6.00400000e+00,   8.59000000e+01,   3.60480160e+01,
          5.15743600e+02,   7.37881000e+03,   2.16432288e+02,
          3.09652457e+03,   4.43023752e+04,   6.33839779e+05]])
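With two input features a (RM) and b (AGE), degree 3 and no bias give the 9 monomials a, b, a**2, a*b, b**2, a**3, a**2*b, a*b**2, b**3, in that order. A quick check of two of the cross terms (a sketch reusing x and x_new from above):
In [ ]:
# Sketch: verify the column ordering of the two-feature expansion.
a, b = x[:, 0], x[:, 1]
print(np.allclose(x_new[:, 3], a * b))       # interaction term a*b
print(np.allclose(x_new[:, 7], a * b ** 2))  # mixed cubic term a*b**2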
In [177]:
polynomial_features = PolynomialFeatures(degree=2)
x = polynomial_features.fit_transform(boston.data)
In [178]:
x.shape
Out[178]:
(506, 105)
In [184]:
lr = Ridge(alpha=2000)
In [185]:
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
scores
Out[185]:
array([  0.42974417,   0.56656339,   0.74587033,   0.35513514, -12.73529056])
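The per-fold scores are erratic, and one fold is strongly negative, partly because the 105 polynomial columns live on very different scales, so a single alpha penalizes them unevenly. One possible remedy, sketched below with a placeholder alpha of 1 (not part of the original analysis; alpha would need re-tuning after scaling), is to standardize the features inside the pipeline:
In [ ]:
# Sketch: standardize the polynomial features before the ridge penalty.
# alpha=1.0 is only a placeholder here, not a tuned value.
from sklearn.preprocessing import StandardScaler
scaled_ridge = Pipeline([("scaler", StandardScaler()),
                         ("ridge", Ridge(alpha=1.0))])
print(cross_validation.cross_val_score(scaled_ridge, x, y, cv=5))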
In [86]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [130]:
x = boston.data[:, 5]
y = boston.target
x = x[:, np.newaxis]
lr.fit(x, y)
y_predicted = lr.predict(x)
In [88]:
lr.coef_, lr.intercept_
Out[88]:
(array([ 9.10210898]), -34.670620776438554)
In [89]:
np.sum(x[0] * lr.coef_) + lr.intercept_
Out[89]:
25.175745774821969
In [90]:
x[0].dot(lr.coef_) + lr.intercept_
Out[90]:
25.175745774821969
In [91]:
y_predicted[0]
Out[91]:
25.175745774821969
In [92]:
y[:10]
Out[92]:
array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,  18.9])
In [93]:
np.round(y_predicted[:10])
Out[93]:
array([ 25.,  24.,  31.,  29.,  30.,  24.,  20.,  22.,  17.,  20.])
In [94]:
plt.scatter(boston.data[:, 5], boston.target)
plt.xlabel("Mean number of rooms")
plt.ylabel("Appartment price")
xs = np.array([x.min(), x.max()])
plt.plot(xs, lr.predict(xs[:,np.newaxis]), color='g')
Out[94]:
[<matplotlib.lines.Line2D at 0x2329c76b4e0>]
In [95]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y, lr.predict(x))
rmse = np.sqrt(mse)
print(mse)
print(rmse)
43.6005517712
6.60307138922
In [96]:
y.mean()
Out[96]:
22.532806324110677
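For scale, compare the RMSE above with a constant model that always predicts the mean target (a quick sketch reusing y):
In [ ]:
# Sketch: RMSE of always predicting the mean; it equals the standard
# deviation of y, and any useful model should beat it.
print(np.sqrt(mean_squared_error(y, np.full_like(y, y.mean()))))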
In [97]:
from sklearn.metrics import r2_score
print(r2_score(y, lr.predict(x))) 
print(lr.score(x, y))
print(1 - np.sum((y - y_predicted) ** 2) / np.sum((y - np.mean(y)) ** 2))
0.483525455991
0.483525455991
0.483525455991
In [98]:
from sklearn import cross_validation
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
print(scores)
[ 0.70708692  0.63476138  0.50385441 -0.21594318 -1.77736913]
In [99]:
scores.mean()
Out[99]:
-0.029521919957810328
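The fold scores above swing from 0.71 down to -1.78. The Boston rows are not stored in random order, so the default unshuffled folds can be unrepresentative; shuffling the folds with a fixed seed (a sketch, using the same pre-0.18 cross_validation API as the rest of the notebook) typically gives steadier scores:
In [ ]:
# Sketch: 5-fold CV with shuffled folds; random_state pins the split.
kf = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=0)
print(cross_validation.cross_val_score(lr, x, y, cv=kf).mean())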
In [100]:
x = boston.data
y = boston.target
lr.fit(x, y)
y_predicted = lr.predict(x)
In [101]:
mse = mean_squared_error(y, lr.predict(x))
np.sqrt(mse)
Out[101]:
4.6795063006355164
In [102]:
lr.score(x, y)
Out[102]:
0.7406077428649428
In [108]:
scores = cross_validation.cross_val_score(lr, x, y, cv=5)
scores.mean()
Out[108]:
0.35074135093252512
In [124]:
from sklearn.linear_model import Lasso
las = Lasso()
alphas = np.logspace(-5, 2, 1000)
# Lasso.path (the lasso_path function) computes the coefficients of the
# model along the whole regularization path in one call
alphas, coefs, _ = las.path(x, y, alphas=alphas)

fig, ax = plt.subplots()
ax.plot(alphas, coefs.T)
ax.set_xscale('log')
ax.set_xlim(alphas.max(), alphas.min())
ax.set_ylabel("features weights")
ax.set_xlabel("alpha");
In [125]:
alphas = np.logspace(-5, 5, 1000)
clf = Ridge()

coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(x, y)
    coefs.append(clf.coef_)

ax = plt.gca()
ax.set_prop_cycle(color=['b', 'r', 'g', 'c', 'k', 'y', 'm'])

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization strength')
plt.axis('tight');
In [115]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=200)
scores = cross_validation.cross_val_score(ridge, x, y, cv=5)
scores.mean()
Out[115]:
0.49775159276568087
In [121]:
from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(ridge, {'alpha': [0.0001, 0.01, 1, 10, 100, 200, 1000000]})
gs.fit(x, y)
Out[121]:
GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=200, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0001, 0.01, 1, 10, 100, 200, 1000000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
In [122]:
gs.grid_scores_
Out[122]:
[mean: -1.57007, std: 3.02069, params: {'alpha': 0.0001},
 mean: -1.54538, std: 2.98794, params: {'alpha': 0.01},
 mean: -0.66233, std: 1.79668, params: {'alpha': 1},
 mean: -0.06774, std: 0.95313, params: {'alpha': 10},
 mean: 0.40866, std: 0.20802, params: {'alpha': 100},
 mean: 0.45021, std: 0.12174, params: {'alpha': 200},
 mean: -0.21091, std: 0.23242, params: {'alpha': 1000000}]
In [123]:
gs.best_params_
Out[123]:
{'alpha': 200}
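The fitted search also keeps the best mean cross-validated score and, because refit=True, an estimator already refit on all of x and y:
In [ ]:
print(gs.best_score_)  # best mean CV score found on the grid
gs.best_estimator_     # Ridge refit on the full data with the best alpha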