#!/usr/bin/env python
# coding: utf-8

# You can read an overview of this Numerical Linear Algebra course in [this blog post](http://www.fast.ai/2017/07/17/num-lin-alg/). The course was originally taught in the [University of San Francisco MS in Analytics](https://www.usfca.edu/arts-sciences/graduate-programs/analytics) graduate program. Course lecture videos are [available on YouTube](https://www.youtube.com/playlist?list=PLtmWHNX-gukIc92m1K0P6bIOnZb-mg0hY) (note that the notebook numbers and video numbers do not line up, since some notebooks took more than one video to cover).
# 
# You can ask questions about the course on [our fast.ai forums](http://forums.fast.ai/c/lin-alg).

# # 6. How to Implement Linear Regression

# In the previous lesson, we calculated the least squares linear regression for a diabetes dataset using scikit-learn's implementation. Today, we will look at how we could write our own implementation.

# In[2]:

from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import math, scipy, numpy as np
from scipy import linalg

# In[3]:

np.set_printoptions(precision=6)

# In[4]:

data = datasets.load_diabetes()

# In[5]:

feature_names=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

# In[6]:

trn,test,y_trn,y_test = train_test_split(data.data, data.target, test_size=0.2)

# In[7]:

trn.shape, test.shape

# In[8]:

def regr_metrics(act, pred):
    # returns (RMSE, MAE)
    return (math.sqrt(metrics.mean_squared_error(act, pred)),
            metrics.mean_absolute_error(act, pred))

# ### How did sklearn do it?

# How is sklearn doing this? By checking [the source code](https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/linear_model/base.py#L417), you can see that in the dense case, it calls [scipy.linalg.lstsq](https://github.com/scipy/scipy/blob/v0.19.0/scipy/linalg/basic.py#L892-L1058), which calls a LAPACK method:
# 
#     Options are ``'gelsd'``, ``'gelsy'``, ``'gelss'``. Default
#     (``'gelsd'``) is a good choice.  However, ``'gelsy'`` can be slightly
#     faster on many problems.  ``'gelss'`` was used historically.  It is
#     generally slow but uses less memory.
# 
# - [gelsd](https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/_gelsd.htm): uses SVD and a divide-and-conquer method
# - [gelsy](https://software.intel.com/en-us/node/521113): uses QR factorization
# - [gelss](https://software.intel.com/en-us/node/521114): uses SVD

# #### Scipy Sparse Least Squares

# We will not get into too much detail about the sparse version of least squares. Here is a bit of info if you are interested:

# [Scipy sparse lsqr](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.lsqr.html#id1) uses an iterative method called [Golub and Kahan bidiagonalization](https://web.stanford.edu/class/cme324/paige-saunders2.pdf).

# From the [scipy sparse lsqr source code](https://github.com/scipy/scipy/blob/v0.14.0/scipy/sparse/linalg/isolve/lsqr.py#L96):
# 
#     Preconditioning is another way to reduce the number of iterations. If it is
#     possible to solve a related system ``M*x = b`` efficiently, where M approximates
#     A in some helpful way (e.g. M - A has low rank or its elements are small relative
#     to those of A), LSQR may converge more rapidly on the system ``A*M(inverse)*z = b``,
#     after which x can be recovered by solving M*x = z.
# 
#     If A is symmetric, LSQR should not be used! Alternatives are the symmetric
#     conjugate-gradient method (cg) and/or SYMMLQ. SYMMLQ is an implementation of
#     symmetric cg that applies to any symmetric A and will converge more rapidly
#     than LSQR. If A is positive definite, there are other implementations of
#     symmetric cg that require slightly less work per iteration than SYMMLQ
#     (but will take the same number of iterations).
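# A minimal, self-contained sketch (my own addition, not from the original notebook) of the iterative sparse solver discussed above. `scipy.sparse.linalg.lsqr` solves a least squares problem without ever forming $A^TA$; the random sparse system below is purely illustrative.

# In[ ]:

from scipy import sparse
from scipy.sparse.linalg import lsqr

A_sp = sparse.random(1000, 20, density=0.1, format='csr', random_state=0)  # sparse 1000 x 20 system
x_true = np.random.randn(20)
rhs = A_sp @ x_true                 # noiseless right-hand side
x_lsqr = lsqr(A_sp, rhs)[0]         # first element of the returned tuple is the solution
np.linalg.norm(x_lsqr - x_true)     # should be small (lsqr iterates to its default tolerance)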
# ### linalg.lstsq

# The sklearn implementation handled adding a constant term for us (since the y-intercept is presumably not 0 for the line we are learning). We will need to do that by hand now:

# In[9]:

trn_int = np.c_[trn, np.ones(trn.shape[0])]
test_int = np.c_[test, np.ones(test.shape[0])]

# Since `linalg.lstsq` lets us specify which LAPACK routine we want to use, let's try them all and do some timing comparisons:

# In[10]:

get_ipython().run_line_magic('timeit', 'coef, _,_,_ = linalg.lstsq(trn_int, y_trn, lapack_driver="gelsd")')

# In[11]:

get_ipython().run_line_magic('timeit', 'coef, _,_,_ = linalg.lstsq(trn_int, y_trn, lapack_driver="gelsy")')

# In[12]:

get_ipython().run_line_magic('timeit', 'coef, _,_,_ = linalg.lstsq(trn_int, y_trn, lapack_driver="gelss")')

# ### Naive Solution

# Recall that we want to find $\hat{x}$ that minimizes:
# $$ \big\vert\big\vert Ax - b \big\vert\big\vert_2$$
# 
# Another way to think about this is that we are interested in the point in the subspace spanned by the columns of $A$ (called the *range of* $A$) that is closest to $b$. This is the projection of $b$ onto that subspace. Since $b - A\hat{x}$ must be perpendicular to the subspace spanned by $A$, we see that
# 
# $$A^T (b - A\hat{x}) = 0 $$
# 
# (we are using $A^T$ because we want to take the dot product of each column of $A$ with $b - A\hat{x}$).
# 
# This leads us to the *normal equations*, $A^TA x = A^T b$, whose solution is:
# $$ x = (A^TA)^{-1}A^T b $$

# In[13]:

def ls_naive(A, b):
     return np.linalg.inv(A.T @ A) @ A.T @ b

# In[14]:

get_ipython().run_line_magic('timeit', 'coeffs_naive = ls_naive(trn_int, y_trn)')

# In[15]:

coeffs_naive = ls_naive(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_naive)
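# A quick sanity check of the derivation above (an added check, not part of the original notebook): the residual $b - A\hat{x}$ should be numerically orthogonal to the columns of $A$, so the relative size of $A^T(b - A\hat{x})$ should be tiny.

# In[ ]:

# relative norm of A^T (b - A x_hat); should be near machine precision,
# scaled up somewhat by the conditioning of A^T A
np.linalg.norm(trn_int.T @ (y_trn - trn_int @ coeffs_naive)) / np.linalg.norm(trn_int.T @ y_trn)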
# ### Normal Equations (Cholesky)

# Normal equations:
# $$ A^TA x = A^T b $$
# 
# If $A$ has full rank, then $A^TA$ is a **square, hermitian positive definite** matrix. The standard way of solving such a system is *Cholesky factorization*, which finds an upper-triangular $R$ s.t. $A^TA = R^TR$.

# The following steps are based on Algorithm 11.1 from Trefethen:

# In[16]:

A = trn_int

# In[17]:

b = y_trn

# In[18]:

AtA = A.T @ A
Atb = A.T @ b

# **Warning:** Numpy and Scipy default to different upper/lower for Cholesky

# In[19]:

R = scipy.linalg.cholesky(AtA)

# In[196]:

np.set_printoptions(suppress=True, precision=4)
R

# check our factorization:

# In[20]:

np.linalg.norm(AtA - R.T @ R)

# $$ A^T A x = A^T b $$
# $$ R^T R x = A^T b $$
# $$ R^T w = A^T b $$
# $$ R x = w $$

# In[21]:

w = scipy.linalg.solve_triangular(R, Atb, lower=False, trans='T')

# It's always good to check that our result is what we expect it to be, in case we passed the wrong params, the function didn't return what we thought, or the docs are outdated:

# In[22]:

np.linalg.norm(R.T @ w - Atb)

# In[23]:

coeffs_chol = scipy.linalg.solve_triangular(R, w, lower=False)

# In[24]:

np.linalg.norm(R @ coeffs_chol - w)

# In[25]:

def ls_chol(A, b):
    R = scipy.linalg.cholesky(A.T @ A)
    w = scipy.linalg.solve_triangular(R, A.T @ b, trans='T')
    return scipy.linalg.solve_triangular(R, w)

# In[26]:

get_ipython().run_line_magic('timeit', 'coeffs_chol = ls_chol(trn_int, y_trn)')

# In[27]:

coeffs_chol = ls_chol(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_chol)

# ### QR Factorization

# $$ A x = b $$
# $$ A = Q R $$
# $$ Q R x = b $$
# $$ R x = Q^T b $$

# In[28]:

def ls_qr(A,b):
    Q, R = scipy.linalg.qr(A, mode='economic')
    return scipy.linalg.solve_triangular(R, Q.T @ b)

# In[29]:

get_ipython().run_line_magic('timeit', 'coeffs_qr = ls_qr(trn_int, y_trn)')

# In[30]:

coeffs_qr = ls_qr(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_qr)

# ### SVD

# $$ A x = b $$
# 
# $$ A = U \Sigma V^T $$
# 
# $$ \Sigma V^T x = U^T b $$
# 
# $$ \Sigma w = U^T b \quad \text{where } w = V^T x $$
# 
# $$ x = V w $$

# SVD gives the pseudo-inverse.

# In[253]:

def ls_svd(A,b):
    m, n = A.shape
    U, sigma, Vh = scipy.linalg.svd(A, full_matrices=False, lapack_driver='gesdd')
    w = (U.T @ b) / sigma
    return Vh.T @ w

# In[32]:

get_ipython().run_line_magic('timeit', 'coeffs_svd = ls_svd(trn_int, y_trn)')

# In[255]:

coeffs_svd = ls_svd(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_svd)

# ### Random Sketching Technique for Least Squares Regression

# [Linear Sketching](http://researcher.watson.ibm.com/researcher/files/us-dpwoodru/journal.pdf) (Woodruff)
# 
# 1. Sample an r x n random matrix S, r << n
# 2. Compute S A and S b
# 3. Find the exact solution x to the regression SA x = Sb
# 
# (A rough illustration follows below.)
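# A rough sketch of the idea above (my own illustration, not from the lesson; the Woodruff survey covers sketching matrices that are much cheaper to apply than the dense Gaussian used here). S compresses the rows of A before solving the smaller regression.

# In[ ]:

m, n = trn_int.shape
r = 50                                          # sketch size (assumed for illustration; needs r >= n)
S = np.random.randn(r, m) / np.sqrt(r)          # dense Gaussian sketching matrix
coeffs_sketch = ls_qr(S @ trn_int, S @ y_trn)   # solve the much smaller r x n regression exactly
regr_metrics(y_test, test_int @ coeffs_sketch)  # approximates the full least squares fit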
# ### Timing Comparison

# In[244]:

import timeit
import pandas as pd

# In[245]:

def scipylstq(A, b):
    return scipy.linalg.lstsq(A,b)[0]

# In[246]:

row_names = ['Normal Eqns- Naive',
             'Normal Eqns- Cholesky',
             'QR Factorization',
             'SVD',
             'Scipy lstsq']

name2func = {'Normal Eqns- Naive': 'ls_naive',
             'Normal Eqns- Cholesky': 'ls_chol',
             'QR Factorization': 'ls_qr',
             'SVD': 'ls_svd',
             'Scipy lstsq': 'scipylstq'}

# In[247]:

m_array = np.array([100, 1000, 10000])
n_array = np.array([20, 100, 1000])

# In[248]:

index = pd.MultiIndex.from_product([m_array, n_array], names=['# rows', '# cols'])

# In[249]:

pd.options.display.float_format = '{:,.6f}'.format
df = pd.DataFrame(index=row_names, columns=index)
df_error = pd.DataFrame(index=row_names, columns=index)

# In[256]:

# %%prun
for m in m_array:
    for n in n_array:
        if m >= n:
            x = np.random.uniform(-10,10,n)
            A = np.random.uniform(-40,40,[m,n])   # removed np.asfortranarray
            b = np.matmul(A, x) + np.random.normal(0,2,m)
            for name in row_names:
                fcn = name2func[name]
                t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
                df.loc[name, (m,n)] = t
                coeffs = locals()[fcn](A, b)
                reg_met = regr_metrics(b, A @ coeffs)
                df_error.loc[name, (m,n)] = reg_met[0]

# In[257]:

df

# In[252]:

df_error

# In[618]:

store = pd.HDFStore('least_squares_results.h5')

# In[619]:

store['df'] = df

# ## Notes

# I used the magic %prun to profile my code.

# Alternative: least absolute deviation (L1 regression)
# - Less sensitive to outliers than least squares.
# - No closed form solution, but can be solved with linear programming.

# ## Conditioning & stability

# #### Condition Number

# The *condition number* is a measure of how much small changes to the input cause the output to change.
# 
# **Question**: Why do we care about behavior under small changes to the input in numerical linear algebra?

# The *relative condition number* is defined by
# 
# $$ \kappa = \sup_{\delta x} \frac{\|\delta f\|}{\| f(x) \|}\bigg/ \frac{\| \delta x \|}{\| x \|} $$
# 
# where $\delta x$ is infinitesimal.
# 
# According to Trefethen (pg. 91), a problem is *well-conditioned* if $\kappa$ is small (e.g. $1$, $10$, $10^2$) and *ill-conditioned* if $\kappa$ is large (e.g. $10^6$, $10^{16}$).

# **Conditioning**: perturbation behavior of a mathematical problem (e.g. least squares)
# 
# **Stability**: perturbation behavior of an algorithm used to solve that problem on a computer (e.g. least squares algorithms, Householder, back substitution, Gaussian elimination)

# #### Conditioning example

# The problem of computing eigenvalues of a non-symmetric matrix is often ill-conditioned.

# In[178]:

A = [[1, 1000], [0, 1]]
B = [[1, 1000], [0.001, 1]]

# In[179]:

wA, vrA = scipy.linalg.eig(A)
wB, vrB = scipy.linalg.eig(B)

# In[180]:

wA, wB

# #### Condition Number of a Matrix

# The product $\| A\| \|A^{-1} \|$ comes up so often it has its own name: the *condition number* of $A$. Note that normally we talk about the conditioning of problems, not matrices.
# 
# The *condition number* of $A$ relates to:
# - computing $b$ given $A$ and $x$ in $Ax = b$
# - computing $x$ given $A$ and $b$ in $Ax = b$

# ## Loose ends from last time

# ### Full vs Reduced Factorizations

# **SVD**

# Diagrams of the full vs reduced SVD from Trefethen (not shown here).

# **QR Factorization exists for ALL matrices**

# Just like with SVD, there are full and reduced versions of the QR factorization (Trefethen's diagrams not shown here); the shapes are compared in the sketch below.
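# A quick illustration (my addition, not from the original notebook) of full vs reduced factorizations on the training matrix: the reduced ("economic") versions keep only the first $n$ columns of $Q$ and $U$.

# In[ ]:

Q_full, R_full = scipy.linalg.qr(trn_int)                     # full:    Q is m x m, R is m x n
Q_red,  R_red  = scipy.linalg.qr(trn_int, mode='economic')    # reduced: Q is m x n, R is n x n
U_full, s, Vh  = scipy.linalg.svd(trn_int, full_matrices=True)
U_red,  s, Vh  = scipy.linalg.svd(trn_int, full_matrices=False)
Q_full.shape, Q_red.shape, U_full.shape, U_red.shape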
# ### Matrix Inversion is Unstable

# In[197]:

from scipy.linalg import hilbert

# In[229]:

n = 14
A = hilbert(n)
x = np.random.uniform(-10,10,n)
b = A @ x

# In[230]:

A_inv = np.linalg.inv(A)

# In[233]:

np.linalg.norm(np.eye(n) - A @ A_inv)

# In[231]:

np.linalg.cond(A)

# In[232]:

A @ A_inv

# In[237]:

row_names = ['Normal Eqns- Naive',
             'QR Factorization',
             'SVD',
             'Scipy lstsq']

name2func = {'Normal Eqns- Naive': 'ls_naive',
             'QR Factorization': 'ls_qr',
             'SVD': 'ls_svd',
             'Scipy lstsq': 'scipylstq'}

# In[238]:

pd.options.display.float_format = '{:,.9f}'.format
df = pd.DataFrame(index=row_names, columns=['Time', 'Error'])

# In[239]:

for name in row_names:
    fcn = name2func[name]
    t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
    coeffs = locals()[fcn](A, b)
    df.loc[name, 'Time'] = t
    df.loc[name, 'Error'] = regr_metrics(b, A @ coeffs)[0]

# #### SVD is best here!

# DO NOT RERUN

# In[240]:

df

# **Another reason not to take the inverse:**
# 
# Even if $A$ is incredibly sparse, $A^{-1}$ is generally dense. For large matrices, $A^{-1}$ could be so dense as to not fit in memory.

# ## Runtime

# Matrix Inversion: $2n^3$
# 
# Matrix Multiplication: $n^3$
# 
# Cholesky: $\frac{1}{3}n^3$
# 
# QR, Gram Schmidt: $2mn^2$, $m\geq n$ (chapter 8 of Trefethen)
# 
# QR, Householder: $2mn^2 - \frac{2}{3}n^3$ (chapter 10 of Trefethen)
# 
# Solving a triangular system: $n^2$

# **Why Cholesky Factorization is Fast:** (diagram not shown here)
# 
# (source: [Stanford Convex Optimization: Numerical Linear Algebra Background Slides](http://stanford.edu/class/ee364a/lectures/num-lin-alg.pdf))

# ### A Case Where QR is the Best

# In[65]:

m=100
n=15
t=np.linspace(0, 1, m)

# In[66]:

# Vandermonde matrix
A=np.stack([t**i for i in range(n)], 1)

# In[67]:

b=np.exp(np.sin(4*t))

# This will turn out to normalize the solution to be 1
b /= 2006.787453080206

# In[68]:

from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[69]:

plt.plot(t, b)

# Check that we get 1:

# In[58]:

1 - ls_qr(A, b)[14]

# Bad condition number:

# In[60]:

kappa = np.linalg.cond(A); kappa

# In[181]:

row_names = ['Normal Eqns- Naive',
             'QR Factorization',
             'SVD',
             'Scipy lstsq']

name2func = {'Normal Eqns- Naive': 'ls_naive',
             'QR Factorization': 'ls_qr',
             'SVD': 'ls_svd',
             'Scipy lstsq': 'scipylstq'}

# In[74]:

pd.options.display.float_format = '{:,.9f}'.format
df = pd.DataFrame(index=row_names, columns=['Time', 'Error'])

# In[75]:

for name in row_names:
    fcn = name2func[name]
    t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
    coeffs = locals()[fcn](A, b)
    df.loc[name, 'Time'] = t
    df.loc[name, 'Error'] = np.abs(1 - coeffs[-1])

# In[76]:

df

# The solution for least squares via the normal equations is unstable in general, although stable for problems with small condition numbers.
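# One way to see why (an added note, not part of the original): forming $A^TA$ squares the 2-norm condition number, so the normal equations lose roughly twice as many digits of accuracy as a method that works with $A$ directly.

# In[ ]:

# condition number of the Vandermonde matrix above vs. that of A^T A (roughly kappa squared)
np.linalg.cond(A), np.linalg.cond(A.T @ A)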
# ### Low-rank

# In[258]:

m = 100
n = 10
x = np.random.uniform(-10,10,n)
A2 = np.random.uniform(-40,40, [m, int(n/2)])   # removed np.asfortranarray
A = np.hstack([A2, A2])

# In[259]:

A.shape, A2.shape

# In[260]:

b = A @ x + np.random.normal(0,1,m)

# In[263]:

row_names = ['Normal Eqns- Naive',
             'QR Factorization',
             'SVD',
             'Scipy lstsq']

name2func = {'Normal Eqns- Naive': 'ls_naive',
             'QR Factorization': 'ls_qr',
             'SVD': 'ls_svd',
             'Scipy lstsq': 'scipylstq'}

# In[264]:

pd.options.display.float_format = '{:,.9f}'.format
df = pd.DataFrame(index=row_names, columns=['Time', 'Error'])

# In[265]:

for name in row_names:
    fcn = name2func[name]
    t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
    coeffs = locals()[fcn](A, b)
    df.loc[name, 'Time'] = t
    df.loc[name, 'Error'] = regr_metrics(b, A @ coeffs)[0]

# In[266]:

df

# ## Comparison

# Our results from above:

# In[257]:

df

# From Trefethen (page 84):
# 
# Normal equations/Cholesky is fastest when it works. Cholesky can only be used on symmetric, positive definite matrices. Also, normal equations/Cholesky is unstable for matrices with high condition numbers or that are low-rank.
# 
# Linear regression via QR has been recommended by numerical analysts as the standard method for years. It is natural, elegant, and good for "daily use".
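# A closing aside connecting the quote's low-rank point back to the example above (my addition, not from the notes): A was built with only n/2 independent columns, and the SVD-based pseudo-inverse returns the minimum-norm least squares solution, which copes with the rank deficiency gracefully.

# In[ ]:

# numerical rank of the low-rank A, and the fit from the minimum-norm solution
np.linalg.matrix_rank(A), regr_metrics(b, A @ (np.linalg.pinv(A) @ b))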