#!/usr/bin/env python
# coding: utf-8

# You can read an overview of this Numerical Linear Algebra course in [this blog post](http://www.fast.ai/2017/07/17/num-lin-alg/). The course was originally taught in the [University of San Francisco MS in Analytics](https://www.usfca.edu/arts-sciences/graduate-programs/analytics) graduate program. Course lecture videos are [available on YouTube](https://www.youtube.com/playlist?list=PLtmWHNX-gukIc92m1K0P6bIOnZb-mg0hY) (note that the notebook numbers and video numbers do not line up, since some notebooks took longer than 1 video to cover).
# 
# You can ask questions about the course on [our fast.ai forums](http://forums.fast.ai/c/lin-alg).

# # 5. Health Outcomes with Linear Regression

# In[26]:

from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import math, scipy, numpy as np
from scipy import linalg

# ## Diabetes Dataset

# We will use a dataset from patients with diabetes. The data consists of 442 samples and 10 variables (all are physiological characteristics), so it is tall and skinny. The dependent variable is a quantitative measure of disease progression one year after baseline.

# This is a classic dataset, famously used by Efron, Hastie, Johnstone, and Tibshirani in their [Least Angle Regression](https://arxiv.org/pdf/math/0406456.pdf) paper, and one of the [many datasets included with scikit-learn](http://scikit-learn.org/stable/datasets/).

# In[27]:

data = datasets.load_diabetes()

# In[28]:

feature_names=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

# In[29]:

trn,test,y_trn,y_test = train_test_split(data.data, data.target, test_size=0.2)

# In[30]:

trn.shape, test.shape

# ## Linear regression in Scikit Learn

# Consider a system $X\beta = y$, where $X$ has more rows than columns. This occurs when you have more data samples than variables. We want to find $\hat{\beta}$ that minimizes:
# $$ \big\vert\big\vert X\beta - y \big\vert\big\vert_2$$

# Let's start by using the sklearn implementation:

# In[173]:

regr = linear_model.LinearRegression()
get_ipython().run_line_magic('timeit', 'regr.fit(trn, y_trn)')

# In[7]:

pred = regr.predict(test)

# It will be helpful to have some metrics on how good our prediction is. We will look at the root mean squared error (L2) and the mean absolute error (L1).

# In[128]:

def regr_metrics(act, pred):
    return (math.sqrt(metrics.mean_squared_error(act, pred)),
            metrics.mean_absolute_error(act, pred))

# In[129]:

regr_metrics(y_test, regr.predict(test))

# ## Polynomial Features

# Linear regression finds the best coefficients $\beta_i$ for:
# 
# $$ x_0\beta_0 + x_1\beta_1 + x_2\beta_2 = y $$
# 
# Adding polynomial features is still a linear regression problem, just with more terms:
# 
# $$ x_0\beta_0 + x_1\beta_1 + x_2\beta_2 + x_0^2\beta_3 + x_0 x_1\beta_4 + x_0 x_2\beta_5 + x_1^2\beta_6 + x_1 x_2\beta_7 + x_2^2\beta_8 = y $$
# 
# We need to use our original data $X$ to calculate the additional polynomial features.

# In[172]:

trn.shape

# Now, we want to try to improve our model's performance by adding some more features. Currently, our model is linear in each variable, but we can add polynomial features to change this.
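# A minimal sketch, added here for illustration (it is not part of the original notebook): the degree-2 expansion above, built by hand with NumPy on a toy matrix. The helper name `manual_poly_features` is made up; scikit-learn's `PolynomialFeatures` (used next) does the same job, plus an optional bias column.

import numpy as np

def manual_poly_features(X):
    "All linear terms x_i, followed by all products x_i * x_j for j >= i (squares included)."
    n = X.shape[1]
    cols = [X[:, i] for i in range(n)]                                   # x_0, x_1, ..., x_{n-1}
    cols += [X[:, i] * X[:, j] for i in range(n) for j in range(i, n)]   # x_i * x_j
    return np.stack(cols, axis=1)

# manual_poly_features(np.arange(6.).reshape(2, 3)).shape   # (2, 9): 3 linear + 6 quadratic terms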
# In[130]:

poly = PolynomialFeatures(include_bias=False)

# In[131]:

trn_feat = poly.fit_transform(trn)

# In[132]:

', '.join(poly.get_feature_names(feature_names))

# In[133]:

trn_feat.shape

# In[134]:

regr.fit(trn_feat, y_trn)

# In[135]:

regr_metrics(y_test, regr.predict(poly.transform(test)))

# Time is quadratic in the number of features and linear in the number of points, so this will get very slow!

# In[136]:

get_ipython().run_line_magic('timeit', 'poly.fit_transform(trn)')

# ## Speeding up feature generation

# We would like to speed this up. We will use [Numba](http://numba.pydata.org/numba-doc/0.12.2/tutorial_firststeps.html), a Python library that compiles Python code directly to fast machine code (via LLVM).
# 
# **Numba is a compiler.**

# #### Resources
# 
# [This tutorial](https://jakevdp.github.io/blog/2012/08/24/numba-vs-cython/) from Jake VanderPlas is a nice introduction. Here Jake [implements a non-trivial algorithm](https://jakevdp.github.io/blog/2015/02/24/optimizing-python-with-numpy-and-numba/) (non-uniform fast Fourier transform) with Numba.
# 
# Cython is another alternative. I've found Cython to require more knowledge to use than Numba (it's closer to C), but to provide similar speed-ups.
# 
# Here is a [thorough answer](https://softwareengineering.stackexchange.com/questions/246094/understanding-the-differences-traditional-interpreter-jit-compiler-jit-interp) on the differences between an Ahead Of Time (AOT) compiler, a Just In Time (JIT) compiler, and an interpreter.

# ### Experiments with vectorization and native code

# Let's first get acquainted with Numba, and then we will return to our problem of polynomial features for regression on the diabetes data set.

# In[140]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[141]:

import math, numpy as np, matplotlib.pyplot as plt
from pandas_summary import DataFrameSummary
from scipy import ndimage

# In[142]:

from numba import jit, vectorize, guvectorize, cuda, float32, void, float64

# We will show the impact of:
# - Avoiding memory allocations and copies (these can be slower than CPU calculations)
# - Better locality
# - Vectorization
# 
# If we use NumPy on whole arrays at a time, it creates lots of temporaries and can't use cache. If we use Numba to loop through an array one item at a time, then we don't have to allocate large temporary arrays, and we can reuse cached data since we're doing multiple calculations on each array item.

# In[175]:

# Untyped and unvectorized
def proc_python(xx,yy):
    zz = np.zeros(nobs, dtype='float32')
    for j in range(nobs):
        x, y = xx[j], yy[j]
        x = x*2 - ( y * 55 )
        y = x + y*2
        z = x + y + 99
        z = z * ( z - .88 )
        zz[j] = z
    return zz

# In[176]:

nobs = 10000
x = np.random.randn(nobs).astype('float32')
y = np.random.randn(nobs).astype('float32')

# In[177]:

get_ipython().run_line_magic('timeit', 'proc_python(x,y)   # Untyped and unvectorized')

# #### Numpy

# Numpy lets us vectorize this:

# In[146]:

# Typed and vectorized
def proc_numpy(x,y):
    z = np.zeros(nobs, dtype='float32')
    x = x*2 - ( y * 55 )
    y = x + y*2
    z = x + y + 99
    z = z * ( z - .88 )
    return z

# In[147]:

np.allclose( proc_numpy(x,y), proc_python(x,y), atol=1e-4 )

# In[148]:

get_ipython().run_line_magic('timeit', 'proc_numpy(x,y)    # Typed and vectorized')

# #### Numba

# Numba offers several different decorators. We will try two different ones:
# 
# - `@jit`: very general
# - `@vectorize`: you don't need to write a for loop. Useful when operating on vectors of the same size
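# A quick aside, not part of the original notebook: a minimal sketch of `@vectorize` with an explicit type signature (the function `add_scaled` and the sample array are made up purely for illustration).

# With an explicit signature, Numba compiles the scalar function eagerly and returns a
# NumPy-style ufunc, so applying it elementwise over arrays (with broadcasting) comes for free.
from numba import vectorize
import numpy as np

@vectorize(['float32(float32, float32)'])
def add_scaled(a, b):
    return a * 2 + b          # written as scalar code; Numba applies it elementwise

# a_arr = np.ones(5, dtype='float32')
# add_scaled(a_arr, a_arr)    # -> array of 3.0s, with no explicit Python loop
# With the bare `@jit()` / `@vectorize` forms used below, compilation instead happens lazily
# on the first call, so that first call also includes compile time.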
# First, we will use Numba's jit (just-in-time) compiler decorator, without explicitly vectorizing. This avoids large memory allocations, so we have better locality:

# In[149]:

@jit()
def proc_numba(xx,yy,zz):
    for j in range(nobs):
        x, y = xx[j], yy[j]
        x = x*2 - ( y * 55 )
        y = x + y*2
        z = x + y + 99
        z = z * ( z - .88 )
        zz[j] = z
    return zz

# In[150]:

z = np.zeros(nobs).astype('float32')
np.allclose( proc_numpy(x,y), proc_numba(x,y,z), atol=1e-4 )

# In[151]:

get_ipython().run_line_magic('timeit', 'proc_numba(x,y,z)')

# Now we will use Numba's `vectorize` decorator. Numba's compiler optimizes this in a smarter way than what is possible with plain Python and NumPy.

# In[152]:

@vectorize
def vec_numba(x,y):
    x = x*2 - ( y * 55 )
    y = x + y*2
    z = x + y + 99
    return z * ( z - .88 )

# In[153]:

np.allclose(vec_numba(x,y), proc_numba(x,y,z), atol=1e-4 )

# In[154]:

get_ipython().run_line_magic('timeit', 'vec_numba(x,y)')

# Numba is **amazing**. Look how fast this is!

# ### Numba polynomial features

# In[155]:

@jit(nopython=True)
def vec_poly(x, res):
    m,n = x.shape
    feat_idx = 0
    for i in range(n):
        v1 = x[:,i]
        for k in range(m):
            res[k,feat_idx] = v1[k]
        feat_idx += 1
        for j in range(i,n):
            for k in range(m):
                res[k,feat_idx] = v1[k]*x[k,j]
            feat_idx += 1

# #### Row-Major vs Column-Major Storage

# From this [blog post by Eli Bendersky](http://eli.thegreenplace.net/2015/memory-layout-of-multi-dimensional-arrays/):
# 
# "The row-major layout of a matrix puts the first row in contiguous memory, then the second row right after it, then the third, and so on. Column-major layout puts the first column in contiguous memory, then the second, etc.... While knowing which layout a particular data set is using is critical for good performance, there's no single answer to the question which layout 'is better' in general.
# 
# "It turns out that matching the way your algorithm works with the data layout can make or break the performance of an application.
# 
# "The short takeaway is: **always traverse the data in the order it was laid out**."
# 
# **Column-major layout**: Fortran, Matlab, R, and Julia
# 
# **Row-major layout**: C, C++, Python (NumPy's default), Pascal, Mathematica

# In[156]:

trn = np.asfortranarray(trn)
test = np.asfortranarray(test)

# In[157]:

m,n = trn.shape
n_feat = n*(n+1)//2 + n
trn_feat = np.zeros((m,n_feat), order='F')
test_feat = np.zeros((len(y_test), n_feat), order='F')

# In[158]:

vec_poly(trn, trn_feat)
vec_poly(test, test_feat)

# In[159]:

regr.fit(trn_feat, y_trn)

# In[160]:

regr_metrics(y_test, regr.predict(test_feat))

# In[161]:

get_ipython().run_line_magic('timeit', 'vec_poly(trn, trn_feat)')

# Recall, this was the time for scikit-learn's PolynomialFeatures implementation, which was written by experts:

# In[136]:

get_ipython().run_line_magic('timeit', 'poly.fit_transform(trn)')

# In[162]:

605/7.7

# This is a big deal! Numba is **amazing**! With a simple function and a single `@jit` decorator, we are getting a ~78x speed-up over scikit-learn (which was optimized by experts).

# ## Regularization and noise

# Regularization is a way to reduce over-fitting and create models that better generalize to new data.

# ### Regularization

# Lasso regression uses an L1 penalty, which pushes towards sparse coefficients. The parameter $\alpha$ is used to weight the penalty term. Scikit-learn's LassoCV performs cross validation over a number of different values for $\alpha$.
# 
# Watch this [Coursera video on Lasso regression](https://www.coursera.org/learn/machine-learning-data-analysis/lecture/0KIy7/what-is-lasso-regression) for more info.
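# For reference (this note is an addition, not part of the original notebook), the objective that scikit-learn's Lasso/LassoCV minimizes, in its documented parameterization, is:
# 
# $$ \frac{1}{2\, n_{\text{samples}}} \big\vert\big\vert X\beta - y \big\vert\big\vert_2^2 + \alpha \big\vert\big\vert \beta \big\vert\big\vert_1 $$
# 
# so a larger $\alpha$ means a heavier L1 penalty and more coefficients pushed exactly to zero; LassoCV simply tries a grid of $\alpha$ values and keeps the one with the best cross-validation score.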
# In[163]:

reg_regr = linear_model.LassoCV(n_alphas=10)

# In[164]:

reg_regr.fit(trn_feat, y_trn)

# In[165]:

reg_regr.alpha_

# In[166]:

regr_metrics(y_test, reg_regr.predict(test_feat))

# ### Noise

# Now we will add some noise to the data:

# In[167]:

idxs = np.random.randint(0, len(trn), 10)

# In[168]:

y_trn2 = np.copy(y_trn)
y_trn2[idxs] *= 10 # label noise

# In[169]:

regr = linear_model.LinearRegression()
regr.fit(trn, y_trn)
regr_metrics(y_test, regr.predict(test))

# In[170]:

regr.fit(trn, y_trn2)
regr_metrics(y_test, regr.predict(test))

# Huber loss is a loss function that is less sensitive to outliers than squared error loss. It is quadratic for small error values, and linear for large values.
# 
# $$L(x)=
# \begin{cases}
#  \frac{1}{2}x^2,                               & \text{for } \lvert x\rvert\leq \delta \\
#  \delta(\lvert x \rvert - \frac{1}{2}\delta),  & \text{otherwise}
# \end{cases}$$

# In[171]:

hregr = linear_model.HuberRegressor()
hregr.fit(trn, y_trn2)
regr_metrics(y_test, hregr.predict(test))

# # End