In [1]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt

# Record library versions for reproducibility.
versions = (sklearn.__version__, np.__version__)
print(*versions)
0.20.0 1.15.4
In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
In [3]:
# Fetch the California housing dataset (8 numeric features, median house
# value as the regression target).
housing = fetch_california_housing()
n_samples, n_features = housing.data.shape
print((n_samples, n_features), housing.target.shape)
(20640, 8) (20640,)
In [4]:
# Use only the first feature (median income), kept 2-D via the 0:1 slice
# so sklearn estimators accept it. Fixed random_state for reproducibility.
median_income = housing.data[:, 0:1]
X_train, X_test, y_train, y_test = train_test_split(
    median_income, housing.target, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(15480, 1) (5160, 1) (15480,) (5160,)

## Solving with the normal equation

In [5]:
# Prepend a column of ones so the intercept is absorbed into the weight
# vector: X = [1 | X_train], giving the model y = w0 + w1 * x.
bias_column = np.ones((X_train.shape[0], 1))
X = np.column_stack((bias_column, X_train))
print(X.shape)
(15480, 2)
In [6]:
# Closed-form ordinary least squares. Forming inv(X^T X) explicitly is
# numerically unstable when X^T X is ill-conditioned and does extra work;
# np.linalg.lstsq solves min ||Xw - y||^2 directly via SVD and returns the
# same w = [intercept, slope] for a full-rank X.
w, _residuals, _rank, _sv = np.linalg.lstsq(X, y_train, rcond=None)
print(w)
[0.44967564 0.41788087]
In [7]:
# Scatter the training data and overlay the fitted line y = w0 + w1*x,
# drawn between x=0 and x=10.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train)
ax.plot([0, 10], [w[0], 10 * w[1] + w[0]], 'r')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

## Linear regression with scikit-learn

In [8]:
# Fit sklearn's ordinary least squares on the single-feature training set;
# the cell's last expression displays the test-set R^2 score.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
Out[8]:
0.47083837938023365
In [9]:
# Recompute R^2 by hand to verify lr.score: R^2 = 1 - SS_res / SS_tot.
y_pred = lr.predict(X_test)
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - y_test.mean()) ** 2)
r2 = 1 - ss_res / ss_tot
print(r2)
0.47083837938023365
In [10]:
# Fitted slope and intercept; these should match the normal-equation
# solution w = [intercept, slope] computed earlier in the notebook.
print(lr.coef_, lr.intercept_)
[0.41788087] 0.44967564199686194
In [11]:
# Same visual check as before, using the sklearn-fitted coefficients.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train)
ax.plot([0, 10], [lr.intercept_, 10 * lr.coef_ + lr.intercept_], 'r')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

## Fitting without an intercept

In [12]:
# Force the regression line through the origin; intercept_ is fixed at 0.0
# and only the slope is estimated.
lr_no_intercept = LinearRegression(fit_intercept=False).fit(X_train, y_train)
print(lr_no_intercept.coef_, lr_no_intercept.intercept_)
[0.51131441] 0.0
In [13]:
# Visualize the origin-constrained fit against the training data.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train)
ax.plot(
    [0, 10],
    [lr_no_intercept.intercept_, 10 * lr_no_intercept.coef_ + lr_no_intercept.intercept_],
    'r',
)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()