#!/usr/bin/env python
# coding: utf-8
# # Least-Squares Regression Workbook
# ## CH EN 2450 - Numerical Methods
# **Prof. Tony Saad (www.tsaad.net)<br/>Department of Chemical Engineering<br/>University of Utah**
# # Example 1
# The height of a person vs femur length is given by the following data:
#
# | femur length (cm) | height (cm) |
# | :---------------: | :---------: |
# | 40 | 163 |
# | 41 | 165 |
# | 43 | 167 |
# | 43 | 164 |
# | 44 | 168 |
# | 44 | 169 |
# | 45 | 170 |
# | 45 | 167 |
# | 48 | 170 |
# | 51 | 175 |
#
# Plot this data and then perform regression to a line and a quadratic using the standard derivation as well as the normal equations. Also use `numpy`'s `polyfit` and compare to your solutions.
# Let's first get the boilerplate out of the way and import matplotlib and numpy.
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")
import matplotlib.pyplot as plt
import numpy as np
# Now place the input data in `numpy` arrays. We can enter these manually or import from a text file.
# In[2]:
xi = np.array([40,41,43,43,44,44,45,45,48,51])
yi = np.array([163,165,167,164,168,169,170,167,170,175])
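# Alternatively, the same data could be read from a text file with `np.loadtxt`. A minimal sketch, assuming a hypothetical whitespace-delimited file `femur_data.txt` whose two columns hold femur length and height:
# In[ ]:
import os
if os.path.exists('femur_data.txt'):  # hypothetical file name
    xi, yi = np.loadtxt('femur_data.txt', unpack=True)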
# Now plot the data
# In[3]:
plt.plot(xi,yi,'ro')
plt.xlabel('Femur length (cm)')
plt.ylabel("Person's height (cm)")
plt.title('Height vs femur length')
plt.grid()
# ## Regression to a straight line
# Here, we regress the data to a straight line $a_0 + a_1 x$. Recall that, for regression to a straight line, we must solve the following system of equations:
# \begin{equation}
# \label{eq:regression-line}
# \left[
# \begin{array}{*{20}{c}}
# N& \sum{x_i}\\
# \sum{x_i} &\sum{x_i^2}
# \end{array}
# \right]
# \left(
# \begin{array}{*{20}{c}}
# a_0\\
# a_1
# \end{array} \right)
# = \left(
# \begin{array}{*{20}{c}}
# \sum{y_i}\\
# \sum {x_i y_i}
# \end{array}
# \right)
# \end{equation}
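# This system comes from minimizing the sum of squared residuals
# \begin{equation}
# S_r = \sum{(y_i - a_0 - a_1 x_i)^2}
# \end{equation}
# with respect to the coefficients: setting $\partial S_r/\partial a_0 = 0$ and $\partial S_r/\partial a_1 = 0$ produces exactly the two linear equations above.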
# First, build the coefficient matrix as a list of lists, `[[a, b], [c, d]]`. We will use `numpy.sum` to compute the various summations in the system \eqref{eq:regression-line}.
# In[4]:
N = len(xi)
A = np.array([[N, np.sum(xi)],
[np.sum(xi), np.sum(xi**2)]])
# Next, construct the right-hand side using a 1D `numpy` array.
# In[5]:
b = np.array([np.sum(yi), np.sum(xi*yi)])
# Now find the solution of $[\mathbf{A}]\mathbf{a} = \mathbf{b}$, where $\mathbf{a} = (a_0, a_1)$ contains the coefficients of the regressed line. Use `numpy`'s built-in linear solver.
# In[6]:
sol = np.linalg.solve(A,b)
# print the solution. Note that in this case, the solution should contain two values, a0 and a1, respectively.
print(sol)
# The variable `sol` contains the solution of the system of equations. It is a list of two entries corresponding to the coefficients $a_0$ and $a_1$ of the regressed line.
# We can now plot the line $a_0 + a_1 x$ that we just fitted to the data.
# In[7]:
# first get the coefficients a0 and a1 from the variable sol
a0 = sol[0]
a1 = sol[1]
# construct a function f(x) for the regressed line
y_line = lambda x: a0 + a1*x
# now plot the regressed line as a function of the input data xi
plt.plot(xi, y_line(xi), label='least-squares fit')
# plot the original data
plt.plot(xi, yi,'ro', label='original data')
plt.xlabel('Femur length (cm)')
plt.ylabel("Person's height (cm)")
plt.title('Height vs femur length')
plt.legend()
plt.grid()
# ## Standard Error
# The standard error of the model quantifies the spread of the data around the regression curve. For a two-coefficient fit it is given by
# \begin{equation}
# S_{y/x} = \sqrt{\frac{S_r}{N-2}} = \sqrt{\frac{\sum{(y_i - f_i)^2}}{N-2}}
# \end{equation}
# where $f_i$ is the value of the fitted model at $x_i$ and $N$ is the number of data points.
# In[8]:
ybar = np.average(yi)  # mean of the measured heights
print(ybar)
fi = y_line(xi)        # model predictions at the data points
# standard deviation of the data about its mean, for comparison with Syx
stdev = np.sqrt(np.sum((yi - ybar)**2)/(N - 1))
print(stdev)
Sr = np.sum((yi - fi)**2)  # sum of squared residuals
Syx = np.sqrt(Sr/(N - 2))  # standard error of the fit
print(Syx)
# Syx is noticeably smaller than stdev, so the line explains much of the spread in the data.
# ### R2 Value
# The $R^2$ value can be computed as:
# \begin{equation}R^2 = 1 - \frac{\sum{(y_i - f_i)^2}}{\sum{(y_i - \bar{y})^2}}\end{equation}
# where
# \begin{equation}
# \bar{y} = \frac{1}{N}\sum{y_i}
# \end{equation}
# In[9]:
ybar = np.average(yi)
fi = y_line(xi)
rsq = 1 - np.sum( (yi - fi)**2 )/np.sum( (yi - ybar)**2 )
print(rsq)
# To enable quick calculation of the R2 value for other fits, let's declare a function that computes it for an arbitrary model.
# In[10]:
def rsquared(xi, yi, ymodel):
    '''
    xi: vector of length n representing the known x values.
    yi: vector of length n representing the known y values that correspond to xi.
    ymodel: a python function (of x only) that can be evaluated at xi and represents a model fit of
    the data (e.g. a regressed curve).
    '''
    ybar = np.average(yi)
    fi = ymodel(xi)
    result = 1 - np.sum((yi - fi)**2)/np.sum((yi - ybar)**2)
    return result
# In[11]:
rsq = rsquared(xi,yi,y_line)
print(rsq)
# ## Regression to a quadratic
# Here, we regress the data to a quadratic polynomial $a_0 + a_1 x + a_2 x^2$. Recall that, for regression to a quadratic, we must solve the following system of equations:
# \begin{equation}
# \label{eq:regression-quadratic}
# \left[
# \begin{array}{*{20}{c}}
# N & \sum{x_i} & \sum{x_i^2}\\
# \sum{x_i} &\sum{x_i^2} & \sum{x_i^3} \\
# \sum{x_i^2} & \sum{x_i^3} & \sum{x_i^4}
# \end{array}
# \right]
# \left(
# \begin{array}{*{20}{c}}
# a_0\\
# a_1 \\
# a_2
# \end{array} \right)
# = \left(
# \begin{array}{*{20}{c}}
# \sum{y_i}\\
# \sum {x_i y_i}\\
# \sum {x_i^2 y_i}
# \end{array}
# \right)
# \end{equation}
# In[12]:
A = np.array([[len(xi), np.sum(xi) , np.sum(xi**2) ],
[np.sum(xi), np.sum(xi**2), np.sum(xi**3)],
[np.sum(xi**2), np.sum(xi**3), np.sum(xi**4)] ])
b = np.array([np.sum(yi), np.sum(xi*yi), np.sum(xi*xi*yi)])
print(A,b)
# In[13]:
sol = np.linalg.solve(A,b)
print(sol)
# In[14]:
a0 = sol[0]
a1 = sol[1]
a2 = sol[2]
y_quad = lambda x: a0 + a1*x + a2*x**2
# now plot the regressed quadratic as a function of the input data xi
plt.plot(xi, y_quad(xi), label='least-squares fit (quadratic)')
# plot the original data
plt.plot(xi, yi,'ko', label='original data')
plt.xlabel('Femur length (cm)')
plt.ylabel("Person's height (cm)")
plt.title('Height vs femur length')
plt.legend()
plt.grid()
# Let's now compute the R2 value for the quadratic fit
# In[15]:
rsq = rsquared(xi,yi,y_quad)
print(rsq)
# This value of about 85% is not much different from the one we got with the straight-line fit, so the quadratic term adds little; the data is most likely linear.
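# To make the comparison explicit, we can print both values side by side using the `rsquared` helper:
# In[ ]:
print('line:     ', rsquared(xi, yi, y_line))
print('quadratic:', rsquared(xi, yi, y_quad))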
# ## Using the normal equations
# The normal equations solve the least-squares problem directly in matrix form: given a design matrix $\mathbf{A}$ whose columns are the basis functions evaluated at the data points, the coefficients satisfy $\mathbf{A}^T\mathbf{A}\,\mathbf{a} = \mathbf{A}^T\mathbf{y}$. Let's first fit a straight line to the data using this approach.
# In[16]:
A = np.array([np.ones(len(xi)),xi]).T
print(A)
ATA = A.T @ A
sol = np.linalg.solve(ATA,A.T@yi)
print(sol)
# For a quadratic fit using the normal equations
# In[17]:
A = np.array([np.ones(len(xi)),xi,xi**2]).T
ATA = A.T @ A
sol = np.linalg.solve(ATA,A.T@yi)
print(sol)
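# As a quick sanity check, the normal-equations solution should reproduce the coefficients $a_0$, $a_1$, $a_2$ we computed earlier from the summation form (those variables are still in scope):
# In[ ]:
print(np.allclose(sol, [a0, a1, a2]))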
# ## Using Numpy's Polyfit
# It is possible to repeat the previous analysis using `numpy`'s `polyfit` function. Let's try it out and compare to our results.
# We first compare a straight-line fit
# In[18]:
coefs = np.polyfit(xi,yi,1)
print(coefs)
# Indeed, the coefficients are the same as the ones we obtained from our straight-line fit. Note that `polyfit` returns the coefficients in descending order of power: `coefs[0]` is the slope $a_1$ and `coefs[1]` is the intercept $a_0$.
# To use the result returned by `polyfit`, we can construct a polynomial using `poly1d` and simply pass in the coefficients returned by `polyfit`.
# In[19]:
p = np.poly1d(coefs)
# Now, simply call `p` on any value, say `p(43)`
# In[20]:
print(p(43))
# We can also plot the fit
# In[21]:
# now plot the regressed line as a function of the input data xi
plt.plot(xi, p(xi), label='least-squares fit (line)')
# plot the original data
plt.plot(xi, yi,'ko', label='original data')
plt.xlabel('Femur length (cm)')
plt.ylabel("Person's height (cm)")
plt.title('Height vs femur length')
plt.legend()
plt.grid()
# We can do the same for the quadratic fit: simply change the degree of the polynomial in `polyfit`.
# In[22]:
coefs = np.polyfit(xi,yi,2)
pquad = np.poly1d(coefs)
print(pquad(43))
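# Because `pquad` is itself a callable function of x, we can pass it directly to our `rsquared` helper to confirm it agrees with the quadratic fit above:
# In[ ]:
print(rsquared(xi, yi, pquad))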
# ## Using Numpy's Least Squares
# Finally, `numpy` provides `np.linalg.lstsq`, which solves the least-squares problem $\min_\mathbf{a} \|\mathbf{A}\mathbf{a} - \mathbf{y}\|^2$ directly from the design matrix, without forming $\mathbf{A}^T\mathbf{A}$ explicitly. Here `A` is still the quadratic design matrix built in the normal-equations section.
# In[23]:
sol, residuals, rank, singular_values = np.linalg.lstsq(A, yi, rcond=None)
print(sol)
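# The same call also fits the straight line if we rebuild the linear design matrix. A minimal sketch (the names `Alin` and `sol_lin` are just illustrative):
# In[ ]:
Alin = np.array([np.ones(len(xi)), xi]).T            # design matrix for a0 + a1*x
sol_lin, *_ = np.linalg.lstsq(Alin, yi, rcond=None)  # least-squares solution (a0, a1)
print(sol_lin)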