#!/usr/bin/env python
# coding: utf-8

# ## Linear regression with simple basis extension

# ### Linear regression on fake BMI data

# In[4]:

# Fake BMI data: weight follows the identity weight = height**2 * bmi,
# plus additive Gaussian noise.  Age is an independent nuisance feature.
import numpy.random as random


def generate_fake_bmi_data(N=100, noise=2, seed=2):
    """Generate N synthetic (ages, heights, bmis, weights) samples.

    Parameters
    ----------
    N : int      -- number of samples.
    noise : float -- std-dev of the Gaussian noise added to the weights.
    seed : int   -- RNG seed, so the data set is reproducible.

    Returns a tuple of four 1-D arrays of length N.
    """
    random.seed(seed)
    ages = random.uniform(low=15, high=80, size=N)
    heights = random.normal(1.75, 0.15, size=N)
    bmis = random.normal(30, 5, size=N)
    # weight = height^2 * bmi exactly, perturbed by N(0, noise^2)
    weights = heights * heights * bmis + noise * random.normal(0, 1, N)
    return (ages, heights, bmis, weights)


# In[5]:

def compute_weights(x_train, y_train):
    """Ordinary least-squares weights.

    x_train is a (features x samples) design matrix; y_train holds the
    targets.  The pseudo-inverse solves min ||X^T w - y||, and is
    numerically safer than the explicit normal equations:
    # w = np.dot(np.dot(np.linalg.inv(np.dot(x_train, x_train.transpose())), x_train), y_train)
    """
    w = np.dot(np.linalg.pinv(x_train.transpose()), y_train)
    return w


def compute_ridge_weights(x_train, y_train, l):
    """Ridge-regression weights with regularization strength l.

    Closed form: w = (l*I + X X^T)^{-1} X y, X being features x samples.
    """
    gram = np.dot(x_train, x_train.transpose())
    reg = l * np.eye(x_train.shape[0])
    w = np.dot(np.dot(np.linalg.inv(reg + gram), x_train), y_train)
    return w


def compute_mse(w, x_test, y_test):
    """Return (mse, predictions) of the linear model w on x_test / y_test."""
    y = np.dot(x_test.transpose(), w)
    mse = np.dot((y - y_test).transpose(), (y - y_test)) / y.size
    if len(mse.shape) == 2:
        # y_test was a column vector, so mse came out as a 1x1 matrix.
        mse = mse[0][0]
    return mse, y


# In[6]:

# Generate higher-order features: all products of the three base features
# up to total degree max_order.
def gen_high_order_features(x1, x2, x3, max_order=2):
    """Design matrix whose rows are a constant 1 followed by every
    monomial x1**i * x2**j * x3**k with 1 <= i+j+k <= max_order.

    BUG FIX: the previous sympy-based version squared the polynomial each
    iteration (p = p*p), so it emitted degrees 1, 2, 4, 8, ... instead of
    1, 2, ..., max_order (and referenced `sympy` without importing it).
    The exponents are now enumerated directly, which also drops the
    sympy dependency.  Degree k contributes (k+1)(k+2)/2 rows.
    """
    mat = [np.ones(x1.size)]
    for degree in range(1, max_order + 1):
        for i in range(degree + 1):
            for j in range(degree + 1 - i):
                k = degree - i - j
                mat.append(x1 ** i * x2 ** j * x3 ** k)
    return np.array(mat)


def bmi_train(ages, heights, bmis, weights, order):
    """Fit a degree-`order` polynomial model to the BMI data.

    Returns (ws, mse, y): the trained weight column vector, the training
    MSE, and the in-sample predictions.
    """
    N = ages.size
    x_train = gen_high_order_features(ages, heights, bmis, order)
    y_train = weights.reshape(N, 1)
    ws = compute_weights(x_train, y_train)
    mse, y = compute_mse(ws, x_train, y_train)
    return (ws, mse, y)


def bmi_test(agest, heightst, bmist, weightst, ws, order=None):
    """Evaluate trained weights ws on held-out data.

    BUG FIX: `order` used to be read from a module-level global; it is now
    an explicit parameter.  When omitted it is inferred from the length of
    ws (inverting the feature count of gen_high_order_features), so the
    existing five-argument call sites keep working.  Also returns the
    model predictions instead of echoing y_test back to the caller.

    Returns (mse_test, predictions).
    """
    if order is None:
        # feature count for degree d is 1 + sum_{k=1..d} (k+1)(k+2)/2;
        # walk up until it matches the trained weight vector.
        order, count = 0, 1
        while count < ws.shape[0]:
            order += 1
            count += (order + 1) * (order + 2) // 2
        if count != ws.shape[0]:
            raise ValueError("ws length does not match any polynomial order")
    N_test = agest.size
    x_test = gen_high_order_features(agest, heightst, bmist, order)
    y_test = weightst.reshape(N_test, 1)
    mse_test, y = compute_mse(ws, x_test, y_test)
    return (mse_test, y)


# In[571]:

import numpy as np

N = 30
N_test = 20
max_order = 3

# BUG FIX: plt was used below before matplotlib was ever imported (the
# import only appeared in a later cell); import it up front.
import matplotlib.pyplot as plt

ages, heights, bmis, weights = generate_fake_bmi_data(N, 2, 1)
# Held-out data: same noise level (2) but a different seed, so the test
# set is disjoint from the training set.  (The old comment claimed
# "no noise when testing", which did not match the arguments.)
agest, heightst, bmist, weightst = generate_fake_bmi_data(N_test, 2, 5)
# x_train=np.concatenate((np.ones(N),heights,bmis,ages),axis=0).reshape(4,N)

# Sweep the maximum polynomial degree and record train/test error.
mse_train_list = []
mse_test_list = []
for order in range(1, max_order + 1):
    (ws, mse, y_train) = bmi_train(ages, heights, bmis, weights, order)
    (mse_test, y_test) = bmi_test(agest, heightst, bmist, weightst, ws)
    mse_train_list.append(mse)
    mse_test_list.append(mse_test)

print(np.array(mse_train_list))
print(np.array(mse_test_list))

line1, = plt.plot(range(1, max_order + 1), mse_train_list, label='Training error')
line2, = plt.plot(range(1, max_order + 1), mse_test_list, label='Testing error')
plt.xlabel('Maximum degree of features')
plt.ylabel('MSE')
plt.legend(handles=[line1, line2])
# plt.savefig("bmi0.pdf")


# ### Linear regression for quadratic fitting

# In[7]:

# General polynomial data: y = (x-3)^2, optionally with Gaussian noise.
import numpy as np


def gen_poly_data(N, isRandom=True, seed=1):
    """Sample N points of y = (x-3)**2 on [0, 10].

    isRandom=True : random x positions plus N(0, 2) noise on y (seeded).
    isRandom=False: evenly spaced, noise-free points (ground-truth curve).

    Returns (x, y) as 1-D arrays.
    """
    if isRandom:
        random.seed(seed)
        x = random.uniform(0, 10, N)
        y = (x - 3) ** 2 + random.normal(0, 2, N)
    else:
        x = np.linspace(0, 10, N)
        y = (x - 3) ** 2
    return (x, y)


# In[8]:

def gen_poly_features(x, max_order=3):
    """Vandermonde-style design matrix: rows 1, x, x**2, ..., x**max_order."""
    mat = [np.ones(x.size)]
    p = x
    for order in range(1, max_order + 1):
        mat.append(p)
        p = p * x
    return np.array(mat)


# In[580]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# Fit polynomials of increasing degree to 10 noisy points and compare
# against the noise-free curve on a dense grid.
max_order = 9
mse_train_list = []
mse_test_list = []
for order in range(1, max_order + 1):
    x, y = gen_poly_data(10, True, 3)
    x_train = gen_poly_features(x, order)
    y_train = y
    xt, yt = gen_poly_data(100, False)
    x_test = gen_poly_features(xt, order)
    y_test = yt
    ws = compute_weights(x_train, y_train)
    mse, _ = compute_mse(ws, x_train, y_train)
    print(mse)
    mse_train_list.append(mse)
    mse_test, ye = compute_mse(ws, x_test, y_test)
    print(mse_test)
    mse_test_list.append(mse_test)
    line1, = plt.plot(xt, ye, label="Estimated curve")
    plt.plot(x, y, 'x')
    line2, = plt.plot(xt, yt, 'r', label="Original curve")
    plt.legend(handles=[line1, line2])
    # plt.savefig('curve_fit'+str(order)+'.pdf') # save figure
# Plot training vs testing error as the polynomial degree grows
# (restricted to the first p_order degrees so the scale stays readable).
plt.cla()
p_order = 6
degrees = range(1, p_order + 1)
train_line, = plt.plot(degrees, mse_train_list[:p_order], label="Training error")
test_line, = plt.plot(degrees, mse_test_list[:p_order], label="Testing error")
plt.legend(handles=[train_line, test_line])
plt.xlabel('Max degree')
plt.ylabel('MSE')
plt.ylim([0, 10])
# plt.savefig('curve_fit_err.pdf') # save figure


# #### Incorporate regularization

# In[9]:

get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn import linear_model
import matplotlib.pyplot as plt

order = 9

# Needed to parse for matplotlib to parse latex input
# plt.clf()
# plt.rc('text', usetex=True)
# plt.rc('font', family='serif')

# For each regularization strength, fit a degree-9 polynomial two ways:
# sklearn Lasso (L1) and the closed-form ridge solution (L2), then plot
# both against the noise-free curve.
for alpha in [0.15, 0.3, 0.5, 1, 2, 4, 8]:
    x_obs, y_obs = gen_poly_data(10, True, 3)
    train_feats = gen_poly_features(x_obs, order)
    train_targets = y_obs
    x_grid, y_true = gen_poly_data(100, False)
    grid_feats = gen_poly_features(x_grid, order)

    # L1-regularized fit via sklearn (expects samples x features).
    lasso = linear_model.Lasso(alpha=alpha)
    lasso.fit(train_feats.T, train_targets)

    # L2-regularized fit via the closed-form ridge weights.
    ridge_w = compute_ridge_weights(train_feats, train_targets, alpha)
    ridge_mse, ridge_pred = compute_mse(ridge_w, grid_feats, y_true)
    # clfr = linear_model.Ridge(alpha=alpha)
    # clfr.fit(x_train.T,y_train)

    lasso_pred = lasso.predict(grid_feats.T)
    # yer = clf.predict(x_test.T)

    plt.clf()
    lasso_line, = plt.plot(x_grid, lasso_pred, label="Lasso")
    ridge_line, = plt.plot(x_grid, ridge_pred, label="Ridge regression")
    plt.plot(x_obs, y_obs, 'x')
    true_line, = plt.plot(x_grid, y_true, 'r', label="Original curve")
    plt.legend(handles=[lasso_line, ridge_line, true_line])
    # plt.title(r'\lambda = '+str(alpha)) # matplotlib appears to have bug to parse latex input
    plt.title('lambda = ' + str(alpha))
    # plt.savefig('regularized_fit_'+str(alpha)+'.pdf') # save figure


# In[ ]: