import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
# Fixed seed so the synthetic dataset is reproducible
rng = np.random.RandomState(123)
mean = [100, 1000]
cov = [[1, 0.9], [0.9, 1]]
# Draw 100 correlated (x, y) pairs from a bivariate normal
sample = rng.multivariate_normal(mean, cov, size=100)
x, y = sample.T
plt.scatter(x, y)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
np.corrcoef(np.vstack([x, y]))
array([[ 1. , 0.87552229], [ 0.87552229, 1. ]])
$w_1 = \frac{\sigma_{xy}}{\sigma_{x}^{2}}$
$b = \bar{y} - w_1\bar{x}$
where
$\text{covariance: } \sigma_{xy} = \frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$
$\text{variance: } \sigma^{2}_{x} = \frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2$
# Closed-form simple linear regression: slope = cov(x, y) / var(x).
# ddof=0 gives the population (1/n) estimators to match the formulas above.
cov_xy = np.cov(x, y, ddof=0)[0, 1]
var_x = np.var(x, ddof=0)
w1 = cov_xy / var_x
# Intercept chosen so the line passes through (mean(x), mean(y))
b = np.mean(y) - w1 * np.mean(x)
print('slope: %.2f' % w1)
print('y-intercept: %.2f' % b)
slope: 0.84 y-intercept: 915.59
# Least-squares fit in matrix form.
X = x[:, np.newaxis]
# Prepend a column of ones so the first weight acts as the intercept.
Xb = np.hstack((np.ones((X.shape[0], 1)), X))
# Solve min ||Xb w - y||^2 directly with lstsq: numerically more stable
# than forming inv(Xb.T @ Xb) explicitly, and drops the dead
# `w = np.zeros(...)` initialization that was immediately overwritten.
w, *_ = np.linalg.lstsq(Xb, y, rcond=None)
b, w1 = w[0], w[1]
print('slope: %.2f' % w1)
print('y-intercept: %.2f' % b)
slope: 0.84 y-intercept: 915.59
# np.polyfit returns coefficients highest degree first: [slope, intercept]
w = np.polyfit(x, y, deg=1)
w1, b = w
print('slope: %.2f' % w1)
print('y-intercept: %.2f' % b)
slope: 0.84 y-intercept: 915.59
# Draw the fitted line through the leftmost and rightmost x values
line_x = np.array([x.min(), x.max()])
line_y = line_x * w1 + b
plt.plot(x, y, marker='o', linestyle='')
plt.plot(line_x, line_y)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
# Mean squared error of the fitted line on the training sample
y_predicted = b + w1 * x
residuals = y - y_predicted
mse = np.mean(residuals ** 2)
mse
0.21920128791624113
# Root mean squared error: back in the units of y
rmse = mse ** 0.5
rmse
0.46818937185314358
# Residual plot: vertical offset of each observation from the fitted line
indices = np.arange(y.shape[0])
plt.scatter(indices, y - y_predicted)
plt.xlabel('index')
plt.ylabel('vertical offset')
<matplotlib.text.Text at 0x113272b38>
# Coefficient of determination (R^2).
mean_y = np.mean(y)
# Total sum of squares: variability of y around its mean.
SS_total = np.sum((y - mean_y)**2)
# Explained (regression) sum of squares -- previously mis-named
# `SS_residual`.  For an OLS fit with an intercept,
# R^2 = SS_explained / SS_total (equivalently 1 - SS_residual / SS_total).
SS_explained = np.sum((y_predicted - mean_y)**2)
r_squared = SS_explained / SS_total
r_squared
0.76653928492766576
# Cross-check R^2 against scipy's linear regression
result = scipy.stats.linregress(x, y)
slope, intercept, r_value, p_value, std_err = result
r_value ** 2
0.766539284927652