#!/usr/bin/env python
# coding: utf-8

# # Data Driven Modeling
# ### PhD seminar series at the Chair for Computer Aided Architectural Design (CAAD), ETH Zurich
#
# ### [Vahid Moosavi](https://vahidmoosavi.com/)
# # Fourth Session
# ### 11 October 2016
#
# ### Topics to be discussed
#
# * Density Learning
# * Meta-Parameters in Machine Learning
# * Regression Problem
# * Least Squares Method
# * Linear Regression
# * Polynomial Regression
# * Local Polynomial Regression

# In[1]:

import warnings
warnings.filterwarnings("ignore")
import sys
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
pd.__version__
get_ipython().run_line_magic('matplotlib', 'inline')

from bokeh.models import CustomJS, ColumnDataSource, Slider, TextInput, TapTool
from bokeh.models import (
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)
from bokeh.models.widgets import Slider
from bokeh.layouts import column, gridplot, widgetbox
from bokeh.plotting import figure, Figure, output_file, show
from bokeh.io import output_notebook, push_notebook, show

output_notebook()


# # Density Learning
# ## Or "Non-parametric Density Models"

# In[2]:

plt.subplot(2,1,1)
X1 = np.random.normal(loc=0,scale=1,size=1000)
plt.hist(X1,bins=100);
plt.title('A well shaped distribution: A Gaussian Distribution?')

plt.subplot(2,1,2)
X1 = np.concatenate((np.random.normal(loc=0,scale=1,size=1000),
                     np.random.normal(loc=10,scale=2,size=1000),
                     np.random.normal(loc=20,scale=1,size=200)))
plt.title('Not a well shaped distribution: A mixture of three Gaussian Distributions?!')
plt.hist(X1,bins=100);
plt.tight_layout()


# # Now the question is: can we learn these (not so well shaped) distributions?

# # Histograms
# ## The first and the most data driven density model is the histogram of the data itself!
# * We just need to select bandwidths (or the number of bins) to discretize the observed data into groups
# * Then, we count the number of observations within each group
# * And then, we have it!

# In[3]:

# X1 = np.random.randint(-1,2,size=400)
X1 = np.concatenate((np.random.normal(loc=0,scale=1,size=1000),
                     np.random.normal(loc=10,scale=2,size=1000),
                     np.random.normal(loc=20,scale=1,size=200)))
# X1 = np.random.normal(loc=0,scale=1,size=200)
X1 = np.asarray(X1)
a = plt.hist(X1,bins=100);

# a[0] has the counts
print a[0]
# a[1] has the bin edges
print a[1]
print np.sum(a[0])


# In[4]:

plt.plot(a[1][:-1],a[0],'.-');


# # But this looks noisy!
#
# ## Now, if we want to do some smoothing (denoising), we are in the field of Kernel Density Estimation:
#
# ## $$\hat{f}_h(x) = \frac{1}{nh}\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)$$
#
# ## where K(•) is the kernel — a non-negative function that integrates to one and has mean zero — and h > 0 is a smoothing parameter called the bandwidth.
# # * **what happens here is that we draw a small distribution around each of the observed points and then, sum it over the whole range** # * ** These Kernels can be any of the parametric distributions we discusses before.** # # In[5]: # A basic example with few data points X1 = [-2,-1,0,2,5,6,-2,0] X1 = np.asarray(X1) a = plt.hist(X1,bins=20); # In[6]: def GaussianKernel(u): # here we assume a "standard normal distribution" with mean zero and variance 1 return np.exp(-1*np.power(u, 2)/2)/(np.sqrt(2*np.pi)) def Kde_Gaussian(h=3): mn = np.min(X1)-3 mx = np.max(X1)+3 # h is the Bandwidth # range of the the data xrng = np.linspace(mn,mx,num=500) counts = np.zeros((1,500)) # this is just to histogram of the data # a = plt.hist(X1,bins=len(X1),normed=True); # plt.close() # for i,d in enumerate(a[0]): # plt.plot([a[1][i],a[1][i]],[0,d],'-or',markersize=10); # Now we go through all the data points one by one. # Assume that the observed point is the center of Gaussian Distribution with standard deviation equal to the bandwidth for xi in X1: local_counts = [GaussianKernel((x-xi)/h) for x in xrng] # (x-xi)/h is to transform the data to "standard normal distribution"! counts = counts + np.asarray(local_counts)/(h*len(X1)) # this to normalize the counts. Look at the formula above plt.plot(xrng,np.asarray(local_counts)/(h*len(X1)),'r'); plt.plot(xrng,counts.T,linewidth=4); # In[7]: Kde_Gaussian(1) # In[8]: # #ipywidgets # from ipywidgets import interact, HTML, FloatSlider # interact(Kde_Gaussian,h=(.1,5,.1)); # In[9]: mn = np.min(X1)-3 mx = np.max(X1)+3 # range of the the data xrng = np.linspace(mn,mx,num=500).T[:,np.newaxis] counts = np.zeros((1,500)).T # local_counts = np.zeros((1,500)).T #unfortunately, source in bokeh needs columns with the same size. So we fake it observations = np.zeros((1,500)).T observations[:len(X1),0] = X1.T dlen = np.asarray([len(X1) for i in range(len(xrng))])[:,np.newaxis] df = pd.DataFrame(data=np.concatenate((xrng,counts,observations,dlen),axis=1),columns=['xrng','counts','observations','dlen']) for i in range(len(X1)): df['local_counts'+str(i)] = np.zeros((1,500)).T h= 1 for i in range(dlen[0]): local_counts = df['local_counts'+str(i)].values[:] for j in range(len(xrng)): u = (xrng[j]-observations[i])/h local_counts[j] = np.exp(-1*np.power(u, 2)/2)/(np.sqrt(2*np.pi))/(h*dlen[0]) counts[j] = counts[j] + local_counts[j] df['local_counts'+str(i)] = local_counts df['counts'] = counts source = ColumnDataSource(data=df) p = figure(plot_width=600, plot_height=400,title="Kernel Density Estimation") p.circle(X1, np.zeros(len(X1)), size=10, line_color="navy", fill_color="red", fill_alpha=0.99,legend="observations") for i in range(len(X1)): p.line('xrng', 'local_counts'+str(i),alpha=0.6, line_width=1 ,source=source, line_color="red",legend="kernels") p.line('xrng','counts',line_width=4,source=source, line_color="navy",legend="KDE"); def callback1(source=source, window=None): data = source.data h = cb_obj.value dlen = data['dlen'] xrng = data['xrng'] counts = data['counts'] for i in range(len(counts)): counts[i] = 0 # observations =data['observations'] for i in range(dlen[0]): local_counts = data['local_counts'+str(i)] # window.console.log(local_counts) for j in range(len(xrng)): u = (xrng[j]-observations[i])/h local_counts[j] = window.Math.exp(-1*window.Math.pow(u, 2)/2)/(window.Math.sqrt(2*window.Math.PI))/(h*dlen[0]) counts[j] = counts[j] + local_counts[j] source.change.emit() slider1 = Slider(start=.0001, end=8, value=1, step=.05, title="bandwidth", 
callback=CustomJS.from_py_func(callback1)) slider1.callback.update() layout = column(slider1, p) show(layout) # ## So, now what is the optimum bandwidth in the above density learning problem? # * minimizing some error functions in relation to the bandwidth # * Rules of thumb methods # * cross validation ---> testing the learned density with new data sets # # # A note on the issue of meta-parameters in Data Driven Modeling # # # # ### Bandwidth value is usually called a " Meta-Parameter" or "Hyper-Parameter". It is important to know that in all of data driven modeling techniques, we will always have these types of parameters, where we need to find their best values in a "Meta- optimization" problem. # ### Usually, we take any of the following approaches separately, or together: # * **Exhaustive search** # * **Random grid search** # * **Cross Validation** # # # # # # # # # # # # Density Learning in Practice # # # However, usually there are optimum libraries for these kinds of tasks! # * **Scipy** # * **Scikit learn** # * **PyQt-Fit** # In[10]: # A two dimensional example fig = plt.figure() d0 = 1.6*np.random.randn(1000,2) d0[:,0]= d0[:,0] - 3 plt.plot(d0[:,0],d0[:,1],'.b') d1 = .6*np.random.randn(1000,2)+7.6 plt.plot(d1[:,0],d1[:,1],'.b') d2 = .8*np.random.randn(1000,2) d2[:,0]= d2[:,0] + 5 d2[:,1]= d2[:,1] + 1 plt.plot(d2[:,0],d2[:,1],'.b') d3 = .8*np.random.randn(1000,2) d3[:,0]= d3[:,0] - 5 d3[:,1]= d3[:,1] + 4 plt.plot(d3[:,0],d3[:,1],'.b') d4 = 1.8*np.random.randn(1000,2) d4[:,0]= d4[:,0] - 15 d4[:,1]= d4[:,1] + 14 plt.plot(d4[:,0],d4[:,1],'.b') d5 = 2.8*np.random.randn(1000,2) d5[:,0]= d5[:,0] + 25 d5[:,1]= d5[:,1] + 14 plt.plot(d5[:,0],d5[:,1],'.b') fig.set_size_inches(5,5) Data = np.concatenate((d0,d1,d2,d3,d4,d5)) # from scipy import stats # def measure(n): # "Measurement model, return two coupled measurements." 
# m1 = np.random.normal(size=n) # m2 = np.random.normal(scale=0.5, size=n) # return m1+m2, m1-m2 # m1, m2 = measure(2000) m1,m2 = Data[:,0],Data[:,1] xmin = Data[:,0].min() xmax = Data[:,0].max() ymin = Data[:,1].min() ymax = Data[:,1].max() Xg, Yg = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] # In[11]: val_grids = Xg # values = np.vstack([m1, m2]) # values = Data.T kernel = stats.gaussian_kde(Data[:,0]) Z = kernel(val_grids.ravel()) import matplotlib.pyplot as plt fig, ax = plt.subplots() a = ax.hist(Data[:,0],bins=300,normed=True); ax.plot(val_grids.ravel(),Z,linewidth=5); ax.plot(a[1][:-1],a[0]) plt.show() # In[12]: X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] positions = np.vstack([X.ravel(), Y.ravel()]) # values = np.vstack([m1, m2]) values = Data.T kernel = stats.gaussian_kde(values) Z = np.reshape(kernel(positions).T, X.shape) import matplotlib.pyplot as plt fig, ax = plt.subplots() ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,) plt.contour(np.rot90(Z)) # ax.plot(m1, m2, 'k.', markersize=2) # ax.set_xlim([xmin, xmax]) # ax.set_ylim([ymin, ymax]) fig.set_size_inches(7,7) plt.show() # The importance of the learned densities # # * **Resampling**: to generate data without observations # * **Clustering and pattern recognition** and its relation to **"Manifold Learning"** # * Here we know the likelihood of the high dimensional patterns in comparison to each other # * Here, in a way we are learning the **joint probability distributions** # * In Machine Learning terms, this is an **unsupervised learning** problem # * **Prediction**: Using it for predictions/ transfer function by conditioning the outcomes to a set of dimensions # * In this case, we are interested to know the likelihood of certain variables (**dependent variables**) conditioned on certain variables (** independent variables") # * In this sense, we can derive "supervised learning" from "unsupervised learning" # # # ## A similar Approach # * **Gaussian Mixture Model**: To learn the distribution of the data as a weighted sum of several globally defined Gaussian Distributions: # ## $$g(X) = \sum_{i = 1}^k p_i. g(X,\theta_i)$$ # ### $ g(X,\theta_i)$ is a parametric known distribution (e.g. Gaussian) and $p_i$ is the share of each of them. # # ### Further readings: # * ** on Kernel Methods: http://www.stat.rice.edu/~scottdw/ss.nh.pdf** # * **On mixture Models: https://en.wikipedia.org/wiki/Mixture_model** # # # # # # ## An alternative approach from ML (Hopefully the next session): # ## Manifold Learning # * **Self Organizing Maps** # # # # # #
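# ### A hedged, minimal sketch of the mixture-model idea above, assuming a scikit-learn version that provides sklearn.mixture.GaussianMixture (older releases call it GMM). The choice of six components simply mirrors the six blobs generated for `Data` earlier and is an assumption, not a rule:

# In[ ]:

from sklearn.mixture import GaussianMixture

# fit a weighted sum of k Gaussians to the 2D point cloud `Data` from above;
# k (n_components) is itself a meta-parameter that would normally be tuned
gmm = GaussianMixture(n_components=6, covariance_type='full', random_state=0)
gmm.fit(Data)

# p_i: the share (weight) of each component; theta_i: its mean and covariance
print(gmm.weights_)
print(gmm.means_)

# score_samples returns log-densities, so exponentiate to get the learned density
print(np.exp(gmm.score_samples(Data[:5])))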
# # #
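# ### Coming back to the bandwidth question raised above: a hedged sketch of picking h by cross validation, assuming a scikit-learn version that provides KernelDensity and sklearn.model_selection.GridSearchCV (in older releases GridSearchCV lives in sklearn.grid_search):

# In[ ]:

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

# KernelDensity.score returns the total log-likelihood of held-out data,
# so GridSearchCV can rank candidate bandwidths by out-of-sample likelihood
x_obs = Data[:, 0][:, np.newaxis]
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': np.linspace(0.1, 5.0, 30)},
                    cv=5)
grid.fit(x_obs)
print('best bandwidth: {}'.format(grid.best_params_['bandwidth']))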
# # # # Prediction, using the learned densities # # ### An example of prediction with the learned density in the previous example # In[19]: fig = plt.figure(figsize=(10,7)) X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] cm = plt.cm.get_cmap('RdYlBu_r') positions = np.vstack([X.ravel(), Y.ravel()]) # values = np.vstack([m1, m2]) values = Data.T kernel = stats.gaussian_kde(values) Z = np.reshape(kernel(positions).T, X.shape) zmn = np.min(Z) zmx = np.max(Z) rng = zmx-zmn Z = Z.ravel() X = X.ravel() Y = Y.ravel() # for i in range(len(X)): var = (Z-zmn)/float(rng) # sc = plt.scatter(X, Y, s=10, marker=u'o', facecolor=plt.cm.RdYlBu_r(var), vmin=zmn, vmax=zmx, edgecolor='None') sc = plt.scatter(X,Y,c = Z,s=20,vmin=zmn,marker='o',edgecolor='None', vmax=zmx, cmap=cm ,alpha=1); plt.colorbar(sc,ticks=np.round(np.linspace(zmn,zmx,5),decimals=3),shrink=0.6); sc = plt.scatter(X[3000:3100],Y[3000:3100],c = Z[3000:3100],s=20,vmin=zmn,marker='o',edgecolor='red', vmax=zmx, cmap=cm ,alpha=1); plt.xlim(xmin,xmax); plt.ylim(ymin,ymax); plt.xlabel('X'); plt.ylabel('Y'); # In[20]: # For example, here we fix x to a specific value and would like to know what are the possible y values x = -5 X, Y = np.mgrid[x:x+1:1j, ymin:ymax:100j] positions = np.vstack([X.ravel(), Y.ravel()]) # values = np.vstack([m1, m2]) Z = np.reshape(kernel(positions).T, X.shape) plt.plot(Y.T,Z.T); plt.ylabel('Relative likelihood'); plt.xlabel('Expected y, given x:{}'.format(x)); #
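# ### A small follow-up sketch: the conditional slice above can be turned into a point prediction by taking the density-weighted mean of y at the fixed x (Y and Z come from the previous cell):

# In[ ]:

# density-weighted average of the y grid = a kernel estimate of E[y | x]
y_hat = np.sum(Y * Z) / np.sum(Z)
print('expected y given x = {}: {}'.format(x, y_hat))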
# # From joint probability models to targeted models
# # Regression problem
# ### In comparison to density learning, here we have the problem of targeted learning.

# In[13]:

N = 50
x1 = np.random.normal(loc=0,scale=1,size=N)[:,np.newaxis]
x2 = np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis]
y = 3*x1 + np.random.normal(loc=.0, scale=.7, size=N)[:,np.newaxis]

fig = plt.figure(figsize=(7,7))
ax1 = plt.subplot(111)
plt.plot(x1,y,'or',markersize=10,alpha=.4);


# # How to approach this problem?

# # Least Squares Method
# * **Legendre 1805**
# * **Gauss 1809**
#
# ### Imagine we want to estimate the observed pattern with a parametric function such as
# ## $$f(x,\beta) = \sum_{j = 0}^m \beta_j x^j$$
#
# ### First order linear regression:
# ## $$f(x,\beta) = \beta_0 + \beta_1 x $$
#
# ## Squared residuals:
# ## $$S = \sum_{i = 1}^n (y_i - f(x_i,\beta))^2 = \sum_{i = 1}^n r_i^2$$
#
# ### The least squares method minimizes the above error term by finding the optimum value of $\beta$.
#
# ### While in linear regression it is easy to find the optimum parameters analytically, for nonlinear cases we need iterative methods.
# * Linear: https://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)#Motivational_example
# * Nonlinear: https://en.wikipedia.org/wiki/Non-linear_least_squares
#
# ### The idea underlying the least squares method is the essence of many (if not all) machine learning methods.
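# ### For the first-order model above, the optimum $\beta_0, \beta_1$ have a closed form; a minimal sketch with numpy's least-squares solver, using the x1 and y generated above:

# In[ ]:

# design matrix with a column of ones (intercept) and a column of x values
X_design = np.concatenate((np.ones_like(x1), x1), axis=1)
beta, residuals, rank, sv = np.linalg.lstsq(X_design, y)
print('beta_0 (intercept): {}, beta_1 (slope): {}'.format(beta[0, 0], beta[1, 0]))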
# # Intuitive interpretations to the least squares method # # * **solving the equations of forces of several of (selfish) bodies to the line** # * **finding the line with the minimum energy and maximum stability** # * **Looking at it as a democratic voting system with a apriori given structure** # # ## And once again it's all about the inversion from knowing to learning # ### The first historical example, from Wikipedia: # An early demonstration of the strength of Gauss' Method came when it was used **to predict the future location of the newly discovered asteroid Ceres**. On 1 January 1801, the Italian astronomer Giuseppe Piazzi discovered Ceres and was able to track its path for 40 days before it was lost in the glare of the sun. Based on these data, astronomers desired to determine the location of Ceres after it emerged from behind the sun ** without solving Kepler's complicated nonlinear equations of planetary motion**. The only predictions that successfully allowed Hungarian astronomer Franz Xaver von Zach to relocate Ceres were those performed by the 24-year-old Gauss using least-squares analysis. # # In[14]: def linear_regressor(a,b): # y_ = ax+b mn = np.min(x1) mx = np.max(x1) xrng = np.linspace(mn,mx,num=500) y_ = [a*x + b for x in xrng] fig = plt.figure(figsize=(7,7)) ax1= plt.subplot(111) plt.plot(x1,y,'or',markersize=10,alpha=.4 ); plt.xlabel('x1'); plt.ylabel('y'); plt.plot(xrng,y_,'-b',linewidth=1) # plt.xlim(mn-1,mx+1) # plt.ylim(np.min(y)+1,np.max(y)-1) yy = [a*x + b for x in x1] [plt.plot([x1[i],x1[i]],[yy[i],y[i]],'-r',linewidth=1) for i in range(len(x1))]; print 'average squared error is {}'.format(np.mean((yy-y)**2)) # In[15]: linear_regressor(2,0) # In[17]: # from ipywidgets import interact, HTML, FloatSlider # interact(linear_regressor,a=(-3,6,.2),b=(-2,5,.2)); # In[20]: a = np.asarray([1 for i in range(len(x1))])[:,np.newaxis] b = np.asarray([1 for i in range(len(x1))])[:,np.newaxis] y = y # xx = np.linspace(mn,mx,num=len(x1)) yy = np.asarray([a[0]*x + b[0] for x in x1]) df = pd.DataFrame(data=np.concatenate((x1,y,yy,a,b),axis=1),columns=['x1','y','yy','a','b']) df = df.sort_values('x1') source = ColumnDataSource(data=df) p = figure(plot_width=600, plot_height=400,title="") p.circle('x1', 'y', size=15,source=source, line_color="navy", fill_color="red", fill_alpha=0.99,legend="observations") p.segment(x0='x1', y0='y', x1='x1', y1='yy', color='navy', alpha=0.6, line_width=1, source=source, ) p.line('x1','yy',source=source, line_color="navy",alpha=0.99, legend="estimated: y= ax+b") p.title.text = "average squared error is: " +str(np.mean((yy-y)**2)) p.legend.location = "top_left" def callback1(source=source,p=p, window=None): data = source.data f = cb_obj.value x1, y = data['x1'], data['y'] yy = data['yy'] a = data['a'] b = data['b'] s = 0 for i in range(len(x1)): yy[i] = a[0]*x1[i] + b[0] a[i] = f s = s + window.Math.pow(yy[i]-y[i],2) p.title.text = "average squared error is: " +str(s/len(x1)) source.change.emit() slider1 = Slider(start=-4, end=4, value=1, step=.1, title="a", callback=CustomJS.from_py_func(callback1)) def callback2(source=source,p=p, window=None): data = source.data f = cb_obj.value x1, y = data['x1'], data['y'] yy = data['yy'] a = data['a'] b = data['b'] s = 0 for i in range(len(x1)): yy[i] = a[0]*x1[i] + b[0] b[i] = f s = s + window.Math.pow(yy[i]-y[i],2) p.title.text = "average squared error is: " +str(s/len(x1)) source.change.emit() slider2 = Slider(start=-4, end=4, value=1, step=.1, title="b", callback=CustomJS.from_py_func(callback2)) layout = 
column(slider1,slider2, p) show(layout) # # Finding optimum parameters by grid search # ### For the case of linear regression it is easy to find the optimum parameters analytically # * Linear: https://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)#Motivational_example # In[21]: # Linear regression A,B =np.mgrid[-6:6:200j,-5:5:200j] A =A.ravel() B =B.ravel() losses = [] r2s = [] for i in range(len(A)): r2 = ([A[i]*x + B[i] for x in x1]-y)**2 r2s.append(r2) # In[22]: def calculate_loss(r2s,loss='mean'): if loss == 'mean': return [np.mean(r2) for r2 in r2s] if loss == 'variation': return [np.std(r2) for r2 in r2s] if loss == 'skewness': return np.abs([stats.skew(r2) for r2 in r2s]) if loss == 'kurtosis': return np.abs([stats.kurtosis(r2) for r2 in r2s]) # In[23]: losses = calculate_loss(r2s,loss='mean') lossmn = np.min(losses) lossmx = np.max(losses) sc = plt.scatter(A,B,c = losses,s=20,vmin=lossmn,marker='o',edgecolor='None', vmax=lossmx, cmap=cm ,alpha=1); plt.colorbar(sc,ticks=np.round(np.linspace(lossmn,lossmx,5),decimals=3),shrink=0.6); plt.xlabel('a values') plt.xlabel('b values') plt.title('loss: {}'.format('Sum of Squared Errors')) ind_min = np.argmin(losses) amn = A[ind_min] bmn = B[ind_min] print 'a*: {} and b*: {} and the minimum loss is {}'.format(amn,bmn,losses[ind_min]) linear_regressor(amn,bmn) # # polynomial regression # ## $$f(x,\beta) = \sum_{j = 0}^m \beta_j x^j\ $$ # # Regression problem from the point of view of linear algebra # # # ## Or # # # ## And the solution is # # # # # #
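# ### In matrix form, stacking one row per observation with columns $1, x_i, x_i^2, \ldots, x_i^m$, the model reads
# ## $$y = X\beta + r$$
# ### and minimizing $S = \|y - X\beta\|^2$ gives the standard normal-equation solution
# ## $$\hat{\beta} = (X^T X)^{-1} X^T y$$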
#
# Looking at this formula, we can see that polynomial regression is a type of multiple regression, where the different powers of x (from 0 to m) act as new features; the sketch below spells this out.
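# ### A minimal sketch of that view, assuming only numpy (np.vander builds the matrix of powers; the degree m = 3 is an arbitrary illustrative choice):

# In[ ]:

m = 3
X_poly = np.vander(x1[:, 0], N=m + 1, increasing=True)  # columns: x^0, x^1, ..., x^m
beta_hat, _, _, _ = np.linalg.lstsq(X_poly, y)          # multiple regression on the power features
y_fit = X_poly.dot(beta_hat)
print('mean squared error of the degree-{} fit: {}'.format(m, np.mean((y_fit - y) ** 2)))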
# # # #
# # Regression in practice, using scikit-learn
# In[31]: N = 400 x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis] x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis] y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis] # x1 = np.random.uniform(size=N)[:,np.newaxis] # y = np.sin(2*np.pi*x1**3)**3 + .1*np.random.randn(*x1.shape) y =-.1*x1**3 + 2*x1*x1 + 2*np.sqrt(x1)+ 10*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis] fig = plt.figure(figsize=(7,7)) ax1= plt.subplot(111) plt.plot(x1,y,'.r',markersize=5,alpha=1 ); # In[32]: def polynomial_regr(degree=1): from sklearn.preprocessing import PolynomialFeatures from sklearn import linear_model X_tr = x1[:].astype(float) y_tr = y[:].astype(float) # X_ts = x1[150:] # y_ts = y[150:] poly = PolynomialFeatures(degree=degree) X_tr_ = poly.fit_transform(X_tr) # X_ts_ = poly.fit_transform(X_ts) regr = linear_model.LinearRegression() regr.fit(X_tr_, y_tr) y_pred_tr = regr.predict(X_tr_)[:] # y_pred_ts = regr.predict(X_ts_)[:] # Predicting the training data plt.plot(X_tr,y_tr,'.r',markersize=10,alpha=.4 ); plt.plot(X_tr,y_pred_tr,'.b',markersize=10,alpha=.4 ); # In[33]: polynomial_regr(degree=2) # In[34]: from ipywidgets import interact, HTML, FloatSlider interact(polynomial_regr,degree=(3,150,1)); # # Limits of polynomial regression # ### 1- They can't get better after a certain threshold (e.g. polynomial degree) # ### 2- They suffer from the curse of dimensionality # ### 3- Ther training time increases exponentially by using higher polynomial degrees # ### 4- Assumes functional relations between variables # ### 5- Structural assumptions about the form of functions # #
#
# # # # # # # # # # ## Limits of Regression Models # ### They can't get better after a certain threshold (e.g. polynomial degree) # In[35]: import time from sklearn.preprocessing import PolynomialFeatures from sklearn import linear_model y_preds = pd.DataFrame(data=np.zeros((y.shape[0],0))) ARE = pd.DataFrame(data=np.zeros((y.shape[0],0))) MAE = pd.DataFrame(data=np.zeros((1,0))) training_times = [] max_degree = 40 for degree in range(1,max_degree): poly = PolynomialFeatures(degree=degree) x1_ = poly.fit_transform(x1) t0 = time.time() regr = linear_model.LinearRegression() regr.fit(x1_, y) print 'dimension of training data for polynomial degree {} is {}'.format(degree,x1_.shape[1]) print 'degree {} is done in {} seconds'.format(degree,time.time()-t0) y_pred = regr.predict(x1_)[:] y_preds['degree-{}'.format(degree)] = y_pred MAE['degree-{}'.format(degree)] = np.mean(np.abs(y_pred-y)) ARE['degree-{}'.format(degree)] = 100*np.abs(y_pred-y)/y # In[36]: quality = [1,5,10,15,20] percentiles = pd.DataFrame(data= np.zeros((len(quality),ARE.shape[1])),index=quality,columns=ARE.columns) for percent in quality: for m in range(ARE.shape[1]): a = ARE.values[:,m] n = float(a.shape[0]) percentiles.ix[percent,m]= 100*a[a<=percent].shape[0]/n # percentiles.ix[percent,m]= 100*(a<=percent).sum()/float(a.shape[0]) percentiles med_error = pd.DataFrame(data=ARE.median(axis=0).values[:][np.newaxis,:].copy(),columns=ARE.columns,index=['median']) med_error percentiles = pd.concat((med_error,percentiles)) percentiles.ix[:] = np.around(percentiles.values[:],decimals=2) # percentiles.index.name = 'ARE' percentiles.T # In[37]: ax1 = plt.subplot(111) # plt.title('degree-{} ARE for ts'.format(degree)) plt.plot(percentiles.T['median'].values[1:],'r') plt.xlabel('polynomial degree') plt.ylabel('median absolute relative error') # ## Limits of Regression Models # ### They suffer from the curse of dimensionality # ### The training time increases exponentially by using higher polynomial degrees # ### An example, by real estate data # ### predicting the rental price of houses # In[38]: path = './Images/rentalprice.csv' rental = pd.read_csv(path) rental.head() # In[39]: # To separate training and testing data from each other import random Target = 0 #Data Selection Mat_all = rental.ix[:,:].copy() a = range(Mat_all.shape[1]) ind_col = a !=np.tile(Target,(len(a))) Tr_data_size = int(.6*Mat_all.shape[0]) print Tr_data_size ind_row_train = random.sample(range(Mat_all.shape[0]),Tr_data_size) ind_row_test = range(Mat_all.shape[0]) for i in ind_row_train: ind_row_test.remove(i) print len(ind_row_test) #This is fixed always y_tr = Mat_all.values[ind_row_train].astype(float) y_tr= y_tr[:,Target].astype(float) y_ts = Mat_all.values[ind_row_test].astype(float) y_ts = y_ts[:,Target].astype(float) X_tr = Mat_all.values[ind_row_train].astype(float) X_tr = X_tr[:,ind_col].astype(float) std = X_tr.std(axis=0) randomization = 1e-25*np.random.randn(X_tr.shape[0],1) ind_std0 = std==0 X_tr[:,ind_std0] = X_tr[:,ind_std0] + randomization X_ts = Mat_all.values[ind_row_test].astype(float) X_ts = X_ts[:,ind_col].astype(float) # In[40]: import time y_preds_tr = pd.DataFrame(data=np.zeros((y_tr.shape[0],0))) ARE_tr = pd.DataFrame(data=np.zeros((y_tr.shape[0],0))) MAE_tr = pd.DataFrame(data=np.zeros((1,0))) y_preds_ts = pd.DataFrame(data=np.zeros((y_ts.shape[0],0))) ARE_ts = pd.DataFrame(data=np.zeros((y_ts.shape[0],0))) MAE_ts = pd.DataFrame(data=np.zeros((1,0))) training_times = [] max_degree = 6 for degree in range(1,max_degree): poly = 
PolynomialFeatures(degree=degree) X_tr_ = poly.fit_transform(X_tr) X_ts_ = poly.fit_transform(X_ts) t0 = time.time() regr = linear_model.LinearRegression() regr.fit(X_tr_, y_tr) print 'dimension of training data for polynomial degree {} is {}'.format(degree,X_tr_.shape[1]) training_times.append(time.time()-t0) print 'degree {} is done in {} seconds'.format(degree,time.time()-t0) y_pred_tr = regr.predict(X_tr_)[:] y_pred_ts = regr.predict(X_ts_)[:] y_preds_tr['degree-{}'.format(degree)] = y_pred_tr MAE_tr['degree-{}'.format(degree)] = np.mean(np.abs(y_pred_tr-y_tr)) ARE_tr['degree-{}'.format(degree)] = 100*np.abs(y_pred_tr-y_tr)/y_tr y_preds_ts['degree-{}'.format(degree)] = y_pred_ts MAE_ts['degree-{}'.format(degree)] = np.mean(np.abs(y_pred_ts-y_ts)) ARE_ts['degree-{}'.format(degree)] = 100*np.abs(y_pred_ts-y_ts)/y_ts plt.plot(range(1,max_degree),training_times) plt.xlabel('polynomials degree') plt.ylabel('training time') # In[45]: quality = [1,5,10,15,20] percentiles = pd.DataFrame(data= np.zeros((len(quality),ARE_tr.shape[1])),index=quality,columns=ARE_tr.columns) for percent in quality: for m in range(ARE_tr.shape[1]): a = ARE_tr.values[:,m] n = float(a.shape[0]) percentiles.ix[percent,m]= 100*a[a<=percent].shape[0]/n # percentiles.ix[percent,m]= 100*(a<=percent).sum()/float(a.shape[0]) percentiles med_error = pd.DataFrame(data=ARE_tr.median(axis=0).values[:][np.newaxis,:].copy(),columns=ARE_tr.columns,index=['median']) med_error percentiles = pd.concat((med_error,percentiles)) percentiles.ix[:] = np.around(percentiles.values[:],decimals=2) # percentiles.index.name = 'ARE' percentiles.T # In[42]: quality = [1,5,10,15,20] percentiles = pd.DataFrame(data= np.zeros((len(quality),ARE_ts.shape[1])),index=quality,columns=ARE_ts.columns) for percent in quality: for m in range(ARE_ts.shape[1]): a = ARE_ts.values[:,m] n = float(a.shape[0]) percentiles.ix[percent,m]= 100*a[a<=percent].shape[0]/n # percentiles.ix[percent,m]= 100*(a<=percent).sum()/float(a.shape[0]) percentiles med_error = pd.DataFrame(data=ARE_ts.median(axis=0).values[:][np.newaxis,:].copy(),columns=ARE_ts.columns,index=['median']) med_error percentiles = pd.concat((med_error,percentiles)) percentiles.ix[:] = np.around(percentiles.values[:],decimals=2) # percentiles.index.name = 'ARE' percentiles.T # # Limits of Regression Models # ## Non-continuous patterns (Clustered Data) # In[46]: N = 500 x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis] x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis] y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis] # y = 2*x1 x1 = np.concatenate((np.random.normal(loc=10,scale=.9,size=N) , np.random.normal(loc=13,scale=.2,size=N),np.random.normal(loc=5,scale=.1,size=N)))[:,np.newaxis] y =-.1*x1**3 + 2*x1*x1 + 7*np.sin(x1) + 3*np.sqrt(x1)+ 1*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis] # x_tmp = x1.copy() # x1 = y # y = x_tmp x1 = Data[:,0][:,np.newaxis] y = Data[:,1][:,np.newaxis] fig = plt.figure(figsize=(7,7)) ax1= plt.subplot(111) plt.plot(x1,y,'.r',markersize=5,alpha=1 ); # In[47]: from ipywidgets import interact, HTML, FloatSlider interact(polynomial_regr,degree=(1,50,1)); #
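# ### A hedged aside on the rental-price example above, previewing the cross-validation topic planned for the next session: instead of a single train/test split, k-fold cross validation scores the model on several held-out folds. This assumes a scikit-learn version that ships sklearn.model_selection and the 'neg_mean_absolute_error' scorer:

# In[ ]:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# degree-1 pipeline on the rental data (X_tr, y_tr come from the split above)
cv_model = make_pipeline(PolynomialFeatures(degree=1), linear_model.LinearRegression())
scores = cross_val_score(cv_model, X_tr, y_tr, cv=5, scoring='neg_mean_absolute_error')
print('mean absolute error across 5 folds: {}'.format(-scores.mean()))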
# # Extensions to the original regression problem:
# ## Local Polynomial Regression
# ## Nonlinear Regression and curve fitting
# ## Basis functions (see the sketch right after this list)
# ## Generalized Linear Models
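# ### A hedged sketch of the "basis functions" idea from the list above: instead of powers of x, use Gaussian bumps centered on a grid of points as features and fit an ordinary linear model on them (the 20 centers and the width of 1.0 are arbitrary illustrative choices):

# In[ ]:

from sklearn import linear_model

centers = np.linspace(x1.min(), x1.max(), 20)
width = 1.0
# feature matrix: one Gaussian bump per center, evaluated at every x
Phi = np.exp(-(x1 - centers[np.newaxis, :]) ** 2 / (2 * width ** 2))

rbf_regr = linear_model.LinearRegression()
rbf_regr.fit(Phi, y)
y_rbf = rbf_regr.predict(Phi)

plt.figure(figsize=(7, 7))
plt.plot(x1, y, '.r', markersize=5, alpha=.3, label='data')
plt.plot(x1, y_rbf, '.b', markersize=5, alpha=.6, label='RBF basis fit')
plt.legend();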
#
# # # An Example: Local Polynomial Regression # ### Using kernel smoother to give more imortance on local information. # # ### Original Least Squares Model: # ### $$S = \sum_{i = 1}^n (x_i - f(x_i,\beta))^2 = \sum_{i = 1}^n r^2$$ # # # ### Local Least Squares Model: # # ### $$\hat{f}_n(x) = argmin_{f} \sum_i K\left(\frac{x-x_i}{h}\right) \left(y_i - f(x-x_i)\right)^2$$ # # Compare it with the kernel denisty learning we had above. # # # ### $$\hat{f}_n(x) = argmin_{f} \sum_i K\left(\frac{x-x_i}{h}\right) \left(y_i - a_0 - a_1(x-x_i) - \ldots - a_n\frac{(x-x_i)^n}{n!}\right)^2$$ # In[2]: N = 1500 x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis] x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis] y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis] # y = 2*x1 x1 = np.concatenate((np.random.normal(loc=10,scale=.9,size=N) , np.random.normal(loc=13,scale=.2,size=N),np.random.normal(loc=5,scale=.1,size=N)))[:,np.newaxis] y =-.1*x1**3 + 2*x1*x1 + 7*np.sin(x1) + 3*np.sqrt(x1)+ 1*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis] y[y<100] =y[y<100] +100 y[x1>12] =y[x1>12] -50 # x_tmp = x1.copy() # x1 = y # y = x_tmp # x1 = Data[:,0][:,np.newaxis] # y = Data[:,1][:,np.newaxis] fig = plt.figure(figsize=(7,7)) ax1= plt.subplot(111) plt.plot(x1,y,'.r',markersize=5,alpha=1 ); # In[2]: # It is use is pretty similar to the previous one # Here, we use PyQt library import pyqt_fit.nonparam_regression as smooth from pyqt_fit import npr_methods import time poly_order=5 y_preds_ = pd.DataFrame(data=np.zeros((y.shape[0],0))) ARE = pd.DataFrame(data=np.zeros((y.shape[0],0))) MAE = pd.DataFrame(data=np.zeros((1,0))) t0 = time.time() regr = smooth.NonParamRegression(x1.T, y[:,0], method=npr_methods.LocalPolynomialKernel(q=poly_order)) name = 'Poly-nonparametric-'+str(poly_order) regr.fit() y_preds_local = regr(x1.T)[:,np.newaxis] print 'local polynomial with degree {} is done in {} seconds'.format(poly_order,time.time()-t0) y_preds_['local-degree-{}'.format(poly_order)] = y_preds_local MAE['local-degree-{}'.format(poly_order)] = np.mean(np.abs(y_preds_local-y)) ARE['local-degree-{}'.format(poly_order)] = 100*np.abs(y_preds_local-y)/y t0 = time.time() poly = PolynomialFeatures(degree=degree) x1_ = poly.fit_transform(x1) regr = linear_model.LinearRegression() regr.fit(x1_, y) y_preds_gloabal = regr.predict(x1_) print 'Global polynomial with degree {} is done in {} seconds'.format(poly_order,time.time()-t0) y_preds_['global-degree-{}'.format(poly_order)] = y_preds_gloabal MAE['global-degree-{}'.format(poly_order)] = np.mean(np.abs(y_preds_gloabal-y)) ARE['global-degree-{}'.format(poly_order)] = 100*np.abs(y_preds_gloabal-y)/y figure =plt.figure(figsize=(7,7)) plt.plot(x1, y, '.', alpha=0.1, label='Data') plt.plot(x1, y_preds_local, '.g', alpha=1, label='Local') plt.plot(x1, y_preds_gloabal, '.r', alpha=1, label='Global') plt.legend(bbox_to_anchor = (1.3,1)) MAE # In[ ]: quality = [1,5,10,15,20] percentiles = pd.DataFrame(data= np.zeros((len(quality),ARE.shape[1])),index=quality,columns=ARE.columns) for percent in quality: for m in range(ARE.shape[1]): a = ARE.values[:,m] n = float(a.shape[0]) percentiles.ix[percent,m]= 100*a[a<=percent].shape[0]/n # percentiles.ix[percent,m]= 100*(a<=percent).sum()/float(a.shape[0]) percentiles med_error = pd.DataFrame(data=ARE.median(axis=0).values[:][np.newaxis,:].copy(),columns=ARE.columns,index=['median']) med_error percentiles = pd.concat((med_error,percentiles)) percentiles.ix[:] = np.around(percentiles.values[:],decimals=2) # 
percentiles.index.name = 'ARE' percentiles.T # #
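# ### A hedged alternative to PyQt-Fit for the same task: LOWESS (locally weighted regression) from statsmodels is a closely related local-regression method; its frac parameter plays the role of the bandwidth (0.2 is an arbitrary choice here):

# In[ ]:

from statsmodels.nonparametric.smoothers_lowess import lowess

# returns a 2-column array [sorted x, fitted y]
smoothed = lowess(y[:, 0], x1[:, 0], frac=0.2)

plt.figure(figsize=(7, 7))
plt.plot(x1, y, '.', alpha=0.1, label='Data')
plt.plot(smoothed[:, 0], smoothed[:, 1], '-g', linewidth=3, label='LOWESS')
plt.legend(bbox_to_anchor=(1.3, 1));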
#
# # (Possibly) in the next session
# * **Practice with scikit-learn using different regression models**
#   * Cross validation
#   * Performance evaluation
# * **Structure Learning and Manifold Learning**
# * **Space transformations and Representation Learning**