Linear regression with simple basis extension¶

Linear regression on fake BMI data¶

In [4]:
# fake BMI data

import numpy.random as random

def generate_fake_bmi_data(N=100,noise=2,seed=2):
random.seed(seed)
ages=random.uniform(low=15,high=80,size=N)
heights=random.normal(1.75,0.15,size=N)
bmis=random.normal(30,5,size=N)
weights=heights*heights*bmis+noise*random.normal(0,1,N)
return (ages,heights,bmis,weights)

In [5]:
def compute_weights(x_train,y_train):
#w=np.dot(np.dot(np.linalg.inv(np.dot(x_train,x_train.transpose())),x_train),y_train)
w=np.dot(np.linalg.pinv(x_train.transpose()),y_train)
return w

def compute_ridge_weights(x_train,y_train,l):
w=np.dot(np.dot(np.linalg.inv(l*np.eye(x_train.shape[0])+np.dot(x_train,x_train.transpose())),x_train),y_train)
#w=np.dot(np.linalg.pinv(x_train.transpose()),y_train)
return w

def compute_mse(w,x_test,y_test):
y=np.dot(x_test.transpose(),w)
mse=np.dot((y-y_test).transpose(),(y-y_test))/y.size
if len(mse.shape)==2:
mse=mse[0][0]

return mse,y

In [6]:
# generate higher order features by padding products of features up to max_order

def gen_high_order_features(x1,x2,x3,max_order=2):
#    assert(xs.shape[0] == 4) # 'Only work with 3 basics features'

from sympy.abc import x
from sympy.abc import y
from sympy.abc import z

mat=[np.ones(x1.size)]
p=sympy.poly(x+y+z)

for order in range(1,max_order+1):
for mon in p.monoms():
mat.append(x1**mon[0]*x2**mon[1]*x3**mon[2])
p=p*p
return np.array(mat)

def bmi_train(ages,heights,bmis,weights,order):
N=ages.size
x_train=gen_high_order_features(ages,heights,bmis,order)
y_train=weights.reshape(N,1)
ws=compute_weights(x_train,y_train)
mse,y=compute_mse(ws,x_train,y_train)

return (ws,mse,y)

def bmi_test(agest,heightst,bmist,weightst,ws):
N_test=agest.size
x_test=gen_high_order_features(agest,heightst,bmist,order)
y_test=weightst.reshape(N_test,1)

mse_test,y=compute_mse(ws,x_test,y_test)
return (mse_test,y_test)

In [571]:
import numpy as np

N=30
N_test=20
max_order=3

ages,heights,bmis,weights=generate_fake_bmi_data(N,2,1)
agest,heightst,bmist,weightst=generate_fake_bmi_data(N_test,2,5) # no noise when testing
#x_train=np.concatenate((np.ones(N),heights,bmis,ages),axis=0).reshape(4,N)

mse_train_list=[]
mse_test_list=[]
for order in range(1,max_order+1):
(ws,mse,y_train)=bmi_train(ages,heights,bmis,weights,order)
(mse_test,y_test)=bmi_test(agest,heightst,bmist,weightst,ws)
mse_train_list.append(mse)
mse_test_list.append(mse_test)

print(np.array(mse_train_list))
print(np.array(mse_test_list))

line1,=plt.plot(range(1,max_order+1),mse_train_list,label='Training error')
line2,=plt.plot(range(1,max_order+1),mse_test_list,label='Testing error')
plt.xlabel('Maximum degree of features')
plt.ylabel('MSE')
plt.legend(handles=[line1,line2])
#plt.savefig("bmi0.pdf")

[ 6.62858971  1.00510313  0.3199968 ]
[ 8.64919034  4.7262191   7.70046893]

Out[571]:
<matplotlib.legend.Legend at 0x7f9debd33990>

In [7]:
# general polynomial data
import numpy as np

def gen_poly_data(N,isRandom=True,seed=1):
if isRandom:
random.seed(seed)
x=random.uniform(0,10,N)
y=(x-3)**2+random.normal(0,2,N)
else:
x=np.linspace(0,10,N)
y=(x-3)**2
return (x,y)

In [8]:
def gen_poly_features(x,max_order=3):
mat=[np.ones(x.size)]
p=x
for order in range(1,max_order+1):
mat.append(p)
p=p*x
return np.array(mat)

In [580]:
%matplotlib inline

import matplotlib.pyplot as plt

max_order=9
mse_train_list=[]
mse_test_list=[]
for order in range(1,max_order+1):
x,y=gen_poly_data(10,True,3)
x_train = gen_poly_features(x,order)
y_train = y

xt,yt=gen_poly_data(100,False)
x_test = gen_poly_features(xt,order)
y_test = yt

ws=compute_weights(x_train,y_train)
mse,_=compute_mse(ws,x_train,y_train)
print(mse)
mse_train_list.append(mse)

mse_test,ye=compute_mse(ws,x_test,y_test)
print(mse_test)
mse_test_list.append(mse_test)

line1,=plt.plot(xt,ye,label="Estimated curve")
plt.plot(x,y,'x')
line2,=plt.plot(xt,yt,'r',label="Original curve")
plt.legend(handles=[line1,line2])
#     plt.savefig('curve_fit'+str(order)+'.pdf') # save figure
plt.cla()

p_order=6
line1,=plt.plot(range(1,p_order+1),mse_train_list[0:p_order],label="Training error")
line2,=plt.plot(range(1,p_order+1),mse_test_list[0:p_order],label="Testing error")
plt.legend(handles=[line1,line2])
plt.xlabel('Max degree')
plt.ylabel('MSE')
plt.ylim([0,10])
# plt.savefig('curve_fit_err.pdf') # save figure

64.0703368074
58.1449568138
1.76608000787
2.24223953871
1.76602144107
2.24943481913
1.74206072203
2.80766073752
0.734274250627
44.2056154586
0.575730627164
228.218092695
0.575420075101
194.306626484
0.55113926434
624.354036092
1.96999204674e-10
41349.6855465


Incorporate regularization¶

In [9]:
%matplotlib inline

from sklearn import linear_model
import matplotlib.pyplot as plt

order = 9

# Needed to parse for matplotlib to parse latex input
# plt.clf()
# plt.rc('text', usetex=True)
# plt.rc('font', family='serif')

for alpha in [0.15,0.3,0.5,1,2,4,8]:
x,y=gen_poly_data(10,True,3)
x_train = gen_poly_features(x,order)
y_train = y

xt,yt=gen_poly_data(100,False)
x_test = gen_poly_features(xt,order)
y_test = yt

clf = linear_model.Lasso(alpha=alpha)
clf.fit(x_train.T,y_train)

ws=compute_ridge_weights(x_train,y_train,alpha)
mse_test,yer=compute_mse(ws,x_test,y_test)
# clfr = linear_model.Ridge(alpha=alpha)
# clfr.fit(x_train.T,y_train)

ye = clf.predict(x_test.T)
# yer = clf.predict(x_test.T)

plt.clf()
line1,=plt.plot(xt,ye,label="Lasso")
line3,=plt.plot(xt,yer,label="Ridge regression")
plt.plot(x,y,'x')
line2,=plt.plot(xt,yt,'r',label="Original curve")
plt.legend(handles=[line1,line3,line2])
#     plt.title(r'\lambda = '+str(alpha)) # matplotlib appears to have bug to parse latex input
plt.title('lambda = '+str(alpha))

#     plt.savefig('regularized_fit_'+str(alpha)+'.pdf') # save figure

/home/phsamuel/anaconda3/envs/conda2/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
ConvergenceWarning)

In [ ]: