import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
pd.__version__
import sys
from scipy import stats
%matplotlib inline
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import column, gridplot
from bokeh.models import (
    ColumnDataSource,
    CustomJS,
    Slider,
    TextInput,
    TapTool,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)
from bokeh.plotting import figure, output_file
output_notebook()
plt.subplot(2,1,1)
X1 = np.random.normal(loc=0,scale=1,size=1000)
plt.hist(X1,bins=100);
plt.title('A well-shaped distribution: a Gaussian distribution?')
plt.subplot(2,1,2)
X1 = np.concatenate((np.random.normal(loc=0,scale=1,size=1000) , np.random.normal(loc=10,scale=2,size=1000),np.random.normal(loc=20,scale=1,size=200)))
plt.title('Not a well-shaped distribution: a mixture of three Gaussian distributions?!')
plt.hist(X1,bins=100);
plt.tight_layout()
# X1 = np.random.randint(-1,2,size=400)
X1 = np.concatenate((np.random.normal(loc=0,scale=1,size=1000) , np.random.normal(loc=10,scale=2,size=1000),np.random.normal(loc=20,scale=1,size=200)))
# X1 = np.random.normal(loc=0,scale=1,size=200)
X1 = np.asarray(X1)
a = plt.hist(X1,bins=100);
# a[0] has the counts
print(a[0])
# a[1] has the bin edges
print(a[1])
print(np.sum(a[0]))
(output: 100 bin counts followed by 101 bin edges; the counts sum to 2200.0, matching the 1000 + 1000 + 200 samples)
plt.plot(a[1][:-1],a[0],'.-');
The kernel density estimate of a sample $x_1, \ldots, x_n$ at a point $x$ is

$$\hat{f}_h(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right),$$

where $K$ is the kernel function and $h > 0$ is the bandwidth.
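To make the formula concrete, here is a minimal sketch (an addition; the query point x0 = 0.5 and bandwidth h = 1.0 are illustrative choices) that evaluates the estimate at a single point with plain numpy:

# evaluate the KDE formula at one query point (illustrative x0 and h)
h = 1.0
x0 = 0.5
u = (x0 - X1) / h
fhat_x0 = np.mean(np.exp(-u**2 / 2) / np.sqrt(2*np.pi)) / h
print(fhat_x0)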
# A basic example with few data points
X1 = [-2,-1,0,2,5,6,-2,0]
X1 = np.asarray(X1)
a = plt.hist(X1,bins=20);
def GaussianKernel(u):
    # here we assume a standard normal density: mean zero, variance one
    return np.exp(-1*np.power(u, 2)/2)/(np.sqrt(2*np.pi))
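As a quick sanity check (an addition, not in the original), any kernel should integrate to one, which we can confirm numerically with the trapezoidal rule:

# verify numerically that the Gaussian kernel integrates to 1
u = np.linspace(-8, 8, 2001)
print(np.trapz(GaussianKernel(u), u))  # should print ~1.0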
def Kde_Gaussian(h=3):
    # h is the bandwidth
    mn = np.min(X1) - 3
    mx = np.max(X1) + 3
    # range of the data
    xrng = np.linspace(mn, mx, num=500)
    counts = np.zeros((1, 500))
    # optionally, overlay the normalized histogram of the data:
    # a = plt.hist(X1, bins=len(X1), density=True)
    # plt.close()
    # for i, d in enumerate(a[0]):
    #     plt.plot([a[1][i], a[1][i]], [0, d], '-or', markersize=10)
    # Now we go through the data points one by one. Each observed point is treated
    # as the center of a Gaussian whose standard deviation equals the bandwidth.
    for xi in X1:
        # (x-xi)/h transforms the data to the standard normal scale
        local_counts = [GaussianKernel((x - xi)/h) for x in xrng]
        # dividing by h*len(X1) normalizes the counts; see the formula above
        counts = counts + np.asarray(local_counts)/(h*len(X1))
        plt.plot(xrng, np.asarray(local_counts)/(h*len(X1)), 'r')
    plt.plot(xrng, counts.T, linewidth=4)
Kde_Gaussian(1)
# #ipywidgets
# from ipywidgets import interact, HTML, FloatSlider
# interact(Kde_Gaussian,h=(.1,5,.1));
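As a cross-check (an addition, not in the original), scipy's gaussian_kde should roughly reproduce Kde_Gaussian for a comparable bandwidth; gaussian_kde scales its bandwidth factor by the sample standard deviation, so we divide it out:

# compare the hand-rolled KDE with scipy's implementation (bandwidth ~ 1)
from scipy import stats
grid = np.linspace(np.min(X1) - 3, np.max(X1) + 3, 500)
kde = stats.gaussian_kde(X1.astype(float), bw_method=1.0/np.std(X1, ddof=1))
plt.plot(grid, kde(grid), '--k');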
mn = np.min(X1) - 3
mx = np.max(X1) + 3
# range of the data
xrng = np.linspace(mn, mx, num=500).T[:, np.newaxis]
counts = np.zeros((1, 500)).T
# Bokeh's ColumnDataSource needs all columns to have the same length,
# so we pad the observations out to the length of the grid
observations = np.zeros((1, 500)).T
observations[:len(X1), 0] = X1.T
dlen = np.asarray([len(X1) for i in range(len(xrng))])[:, np.newaxis]
df = pd.DataFrame(data=np.concatenate((xrng, counts, observations, dlen), axis=1),
                  columns=['xrng', 'counts', 'observations', 'dlen'])
for i in range(len(X1)):
    df['local_counts'+str(i)] = np.zeros(500)
h = 1
for i in range(int(dlen[0])):
    local_counts = df['local_counts'+str(i)].values[:]
    for j in range(len(xrng)):
        u = (xrng[j, 0] - observations[i, 0])/h
        local_counts[j] = np.exp(-1*np.power(u, 2)/2)/(np.sqrt(2*np.pi))/(h*int(dlen[0]))
        counts[j] = counts[j] + local_counts[j]
    df['local_counts'+str(i)] = local_counts
df['counts'] = counts.ravel()
source = ColumnDataSource(data=df)
# Note: this interactive cell targets the legacy Bokeh (pre-1.0) API
# (plot_width, legend=..., Slider(callback=...)); a modern sketch follows below.
p = figure(plot_width=600, plot_height=400, title="Kernel Density Estimation")
p.circle(X1, np.zeros(len(X1)), size=10, line_color="navy", fill_color="red",
         fill_alpha=0.99, legend="observations")
for i in range(len(X1)):
    p.line('xrng', 'local_counts'+str(i), alpha=0.6, line_width=1, source=source,
           line_color="red", legend="kernels")
p.line('xrng', 'counts', line_width=4, source=source, line_color="navy", legend="KDE")
def callback1(source=source, window=None):
    data = source.data
    h = cb_obj.value
    dlen = data['dlen']
    xrng = data['xrng']
    counts = data['counts']
    for i in range(len(counts)):
        counts[i] = 0
    observations = data['observations']
    for i in range(dlen[0]):
        local_counts = data['local_counts'+str(i)]
        for j in range(len(xrng)):
            u = (xrng[j] - observations[i])/h
            local_counts[j] = window.Math.exp(-window.Math.pow(u, 2)/2)/(window.Math.sqrt(2*window.Math.PI))/(h*dlen[0])
            counts[j] = counts[j] + local_counts[j]
    source.change.emit()
slider1 = Slider(start=.0001, end=8, value=1, step=.05, title="bandwidth",
                 callback=CustomJS.from_py_func(callback1))
layout = column(slider1, p)
show(layout)
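`CustomJS.from_py_func` was removed in Bokeh 2.0, so on a recent Bokeh this cell needs a plain JavaScript callback attached with `js_on_change`. A rough sketch of the equivalent (assuming the same `source` columns as above):

# a modern-Bokeh sketch of the same slider callback (assumes the source above)
callback = CustomJS(args=dict(source=source), code="""
    const data = source.data;
    const h = cb_obj.value;
    const n = data['dlen'][0];
    const xrng = data['xrng'];
    const counts = data['counts'];
    counts.fill(0);
    for (let i = 0; i < n; i++) {
        const lc = data['local_counts' + i];
        for (let j = 0; j < xrng.length; j++) {
            const u = (xrng[j] - data['observations'][i]) / h;
            lc[j] = Math.exp(-u*u/2) / Math.sqrt(2*Math.PI) / (h*n);
            counts[j] += lc[j];
        }
    }
    source.change.emit();
""")
slider_modern = Slider(start=.0001, end=8, value=1, step=.05, title="bandwidth")
slider_modern.js_on_change('value', callback)
# show(column(slider_modern, p))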
# A two dimensional example
fig = plt.figure()
d0 = 1.6*np.random.randn(1000,2)
d0[:,0]= d0[:,0] - 3
plt.plot(d0[:,0],d0[:,1],'.b')
d1 = .6*np.random.randn(1000,2)+7.6
plt.plot(d1[:,0],d1[:,1],'.b')
d2 = .8*np.random.randn(1000,2)
d2[:,0]= d2[:,0] + 5
d2[:,1]= d2[:,1] + 1
plt.plot(d2[:,0],d2[:,1],'.b')
d3 = .8*np.random.randn(1000,2)
d3[:,0]= d3[:,0] - 5
d3[:,1]= d3[:,1] + 4
plt.plot(d3[:,0],d3[:,1],'.b')
d4 = 1.8*np.random.randn(1000,2)
d4[:,0]= d4[:,0] - 15
d4[:,1]= d4[:,1] + 14
plt.plot(d4[:,0],d4[:,1],'.b')
d5 = 2.8*np.random.randn(1000,2)
d5[:,0]= d5[:,0] + 25
d5[:,1]= d5[:,1] + 14
plt.plot(d5[:,0],d5[:,1],'.b')
fig.set_size_inches(5,5)
Data = np.concatenate((d0,d1,d2,d3,d4,d5))
# from scipy import stats
# def measure(n):
# "Measurement model, return two coupled measurements."
# m1 = np.random.normal(size=n)
# m2 = np.random.normal(scale=0.5, size=n)
# return m1+m2, m1-m2
# m1, m2 = measure(2000)
m1,m2 = Data[:,0],Data[:,1]
xmin = Data[:,0].min()
xmax = Data[:,0].max()
ymin = Data[:,1].min()
ymax = Data[:,1].max()
Xg, Yg = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
val_grids = Xg
# values = np.vstack([m1, m2])
# values = Data.T
kernel = stats.gaussian_kde(Data[:,0])
Z = kernel(val_grids.ravel())
fig, ax = plt.subplots()
a = ax.hist(Data[:,0], bins=300, density=True);  # `normed` was removed from matplotlib
ax.plot(val_grids.ravel(),Z,linewidth=5);
ax.plot(a[1][:-1],a[0])
plt.show()
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
# values = np.vstack([m1, m2])
values = Data.T
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,)
plt.contour(np.rot90(Z))
# ax.plot(m1, m2, 'k.', markersize=2)
# ax.set_xlim([xmin, xmax])
# ax.set_ylim([ymin, ymax])
fig.set_size_inches(7,7)
plt.show()
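Once the two-dimensional density is learned we can query it directly. For instance, scipy's gaussian_kde exposes integrate_box, which estimates the probability mass inside an axis-aligned rectangle (the bounds below are illustrative choices, not from the original):

# estimate the probability mass in a rectangle (illustrative bounds)
mass = kernel.integrate_box([0, -2], [8, 4])
print('estimated probability mass in the box: {:.3f}'.format(mass))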
The importance of the learned densities
fig = plt.figure(figsize=(10,7))
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
cm = plt.cm.get_cmap('RdYlBu_r')
positions = np.vstack([X.ravel(), Y.ravel()])
# values = np.vstack([m1, m2])
values = Data.T
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
zmn = np.min(Z)
zmx = np.max(Z)
rng = zmx-zmn
Z = Z.ravel()
X = X.ravel()
Y = Y.ravel()
# for i in range(len(X)):
var = (Z-zmn)/float(rng)
# sc = plt.scatter(X, Y, s=10, marker=u'o', facecolor=plt.cm.RdYlBu_r(var), vmin=zmn, vmax=zmx, edgecolor='None')
sc = plt.scatter(X,Y,c = Z,s=20,vmin=zmn,marker='o',edgecolor='None', vmax=zmx, cmap=cm ,alpha=1);
plt.colorbar(sc,ticks=np.round(np.linspace(zmn,zmx,5),decimals=3),shrink=0.6);
sc = plt.scatter(X[3000:3100],Y[3000:3100],c = Z[3000:3100],s=20,vmin=zmn,marker='o',edgecolor='red', vmax=zmx, cmap=cm ,alpha=1);
plt.xlim(xmin,xmax);
plt.ylim(ymin,ymax);
plt.xlabel('X');
plt.ylabel('Y');
# For example, here we fix x to a specific value and would like to know what are the possible y values
x = -5
X, Y = np.mgrid[x:x+1:1j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
# values = np.vstack([m1, m2])
Z = np.reshape(kernel(positions).T, X.shape)
plt.plot(Y.T,Z.T);
plt.ylabel('Relative likelihood');
plt.xlabel('Expected y, given x:{}'.format(x));
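Note that the slice above is proportional to the joint density p(x=-5, y), not yet a proper conditional density; dividing by its integral over y turns it into p(y | x=-5). A minimal sketch (an addition):

# normalize the slice so it integrates to 1 over y, giving p(y | x = -5)
Z_cond = Z.ravel() / np.trapz(Z.ravel(), Y.ravel())
plt.plot(Y.ravel(), Z_cond);
plt.ylabel('p(y | x={})'.format(x));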
N = 50
x1= np.random.normal(loc=0,scale=1,size=N)[:,np.newaxis]
x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis]
y = 3*x1 + np.random.normal(loc=.0, scale=.7, size=N)[:,np.newaxis]
fig = plt.figure(figsize=(7,7))
ax1= plt.subplot(111)
plt.plot(x1,y,'or',markersize=10,alpha=.4 );
The Least Squares Method
The idea underlying the least squares method is the essence of many (if not all) machine learning methods.
An early demonstration of the strength of Gauss' method came when it was used to predict the future location of the newly discovered asteroid Ceres. On 1 January 1801, the Italian astronomer Giuseppe Piazzi discovered Ceres and was able to track its path for 40 days before it was lost in the glare of the sun. Based on these data, astronomers desired to determine the location of Ceres after it emerged from behind the sun **without solving Kepler's complicated nonlinear equations of planetary motion**. The only predictions that successfully allowed Hungarian astronomer Franz Xaver von Zach to relocate Ceres were those performed by the 24-year-old Gauss using least-squares analysis.
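For a straight line there is also a closed-form least squares solution via the normal equations; as a sanity check (an addition to the original), numpy's lstsq recovers the slope and intercept directly:

# closed-form least squares for y = a*x + b (sanity check, not in the original)
X_design = np.hstack([x1, np.ones_like(x1)])   # columns: [x, 1]
w, *_ = np.linalg.lstsq(X_design, y, rcond=None)
print('closed-form a: {:.3f}, b: {:.3f}'.format(w[0, 0], w[1, 0]))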
def linear_regressor(a, b):
    # y_ = a*x + b
    mn = np.min(x1)
    mx = np.max(x1)
    xrng = np.linspace(mn, mx, num=500)
    y_ = [a*x + b for x in xrng]
    fig = plt.figure(figsize=(7,7))
    ax1 = plt.subplot(111)
    plt.plot(x1, y, 'or', markersize=10, alpha=.4)
    plt.xlabel('x1')
    plt.ylabel('y')
    plt.plot(xrng, y_, '-b', linewidth=1)
    # plt.xlim(mn-1, mx+1)
    # plt.ylim(np.min(y)+1, np.max(y)-1)
    # residuals: vertical distance between each observation and the line
    yy = [a*x + b for x in x1]
    [plt.plot([x1[i], x1[i]], [yy[i], y[i]], '-r', linewidth=1) for i in range(len(x1))]
    print('average squared error is {}'.format(np.mean((yy-y)**2)))
linear_regressor(2,0)
average squared error is 1.72286658308
# from ipywidgets import interact, HTML, FloatSlider
# interact(linear_regressor,a=(-3,6,.2),b=(-2,5,.2));
a = np.asarray([1 for i in range(len(x1))])[:, np.newaxis]
b = np.asarray([1 for i in range(len(x1))])[:, np.newaxis]
# xx = np.linspace(mn, mx, num=len(x1))
yy = np.asarray([a[0]*x + b[0] for x in x1])
df = pd.DataFrame(data=np.concatenate((x1, y, yy, a, b), axis=1),
                  columns=['x1', 'y', 'yy', 'a', 'b'])
df = df.sort_values('x1')
source = ColumnDataSource(data=df)
p = figure(plot_width=600, plot_height=400,title="")
p.circle('x1', 'y', size=15,source=source, line_color="navy", fill_color="red", fill_alpha=0.99,legend="observations")
p.segment(x0='x1', y0='y', x1='x1', y1='yy', color='navy', alpha=0.6, line_width=1, source=source, )
p.line('x1','yy',source=source, line_color="navy",alpha=0.99, legend="estimated: y= ax+b")
p.title.text = "average squared error is: " +str(np.mean((yy-y)**2))
p.legend.location = "top_left"
def callback1(source=source, p=p, window=None):
    data = source.data
    f = cb_obj.value
    x1, y = data['x1'], data['y']
    yy = data['yy']
    a = data['a']
    b = data['b']
    s = 0
    for i in range(len(x1)):
        a[i] = f  # update the slope first, then recompute the fitted value
        yy[i] = a[0]*x1[i] + b[0]
        s = s + window.Math.pow(yy[i]-y[i], 2)
    p.title.text = "average squared error is: " + str(s/len(x1))
    source.change.emit()
slider1 = Slider(start=-4, end=4, value=1, step=.1, title="a",
                 callback=CustomJS.from_py_func(callback1))
def callback2(source=source, p=p, window=None):
    data = source.data
    f = cb_obj.value
    x1, y = data['x1'], data['y']
    yy = data['yy']
    a = data['a']
    b = data['b']
    s = 0
    for i in range(len(x1)):
        b[i] = f  # update the intercept first, then recompute the fitted value
        yy[i] = a[0]*x1[i] + b[0]
        s = s + window.Math.pow(yy[i]-y[i], 2)
    p.title.text = "average squared error is: " + str(s/len(x1))
    source.change.emit()
slider2 = Slider(start=-4, end=4, value=1, step=.1, title="b",
                 callback=CustomJS.from_py_func(callback2))
layout = column(slider1,slider2, p)
show(layout)
# Linear regression
A,B =np.mgrid[-6:6:200j,-5:5:200j]
A =A.ravel()
B =B.ravel()
losses = []
r2s = []
for i in range(len(A)):
    r2 = ([A[i]*x + B[i] for x in x1] - y)**2
    r2s.append(r2)
def calculate_loss(r2s, loss='mean'):
    if loss == 'mean':
        return [np.mean(r2) for r2 in r2s]
    if loss == 'variation':
        return [np.std(r2) for r2 in r2s]
    if loss == 'skewness':
        return np.abs([stats.skew(r2) for r2 in r2s])
    if loss == 'kurtosis':
        return np.abs([stats.kurtosis(r2) for r2 in r2s])
losses = calculate_loss(r2s,loss='mean')
lossmn = np.min(losses)
lossmx = np.max(losses)
sc = plt.scatter(A,B,c = losses,s=20,vmin=lossmn,marker='o',edgecolor='None', vmax=lossmx, cmap=cm ,alpha=1);
plt.colorbar(sc,ticks=np.round(np.linspace(lossmn,lossmx,5),decimals=3),shrink=0.6);
plt.xlabel('a values')
plt.ylabel('b values')
plt.title('loss: {}'.format('Mean Squared Error'))
ind_min = np.argmin(losses)
amn = A[ind_min]
bmn = B[ind_min]
print('a*: {} and b*: {} and the minimum loss is {}'.format(amn, bmn, losses[ind_min]))
linear_regressor(amn,bmn)
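The grid minimum should sit close to the analytic optimum; numpy's polyfit provides a quick cross-check (a small addition):

# cross-check the grid-search optimum against the analytic least squares fit
a_fit, b_fit = np.polyfit(x1.ravel(), y.ravel(), 1)
print('np.polyfit a: {:.3f}, b: {:.3f} (grid search: a*={}, b*={})'.format(a_fit, b_fit, amn, bmn))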
N = 400
x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis]
x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis]
y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis]
# x1 = np.random.uniform(size=N)[:,np.newaxis]
# y = np.sin(2*np.pi*x1**3)**3 + .1*np.random.randn(*x1.shape)
y =-.1*x1**3 + 2*x1*x1 + 2*np.sqrt(x1)+ 10*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis]
fig = plt.figure(figsize=(7,7))
ax1= plt.subplot(111)
plt.plot(x1,y,'.r',markersize=5,alpha=1 );
def polynomial_regr(degree=1):
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn import linear_model
    X_tr = x1[:].astype(float)
    y_tr = y[:].astype(float)
    # X_ts = x1[150:]
    # y_ts = y[150:]
    poly = PolynomialFeatures(degree=degree)
    X_tr_ = poly.fit_transform(X_tr)
    # X_ts_ = poly.transform(X_ts)
    regr = linear_model.LinearRegression()
    regr.fit(X_tr_, y_tr)
    y_pred_tr = regr.predict(X_tr_)[:]
    # y_pred_ts = regr.predict(X_ts_)[:]
    # predictions on the training data
    plt.plot(X_tr, y_tr, '.r', markersize=10, alpha=.4)
    plt.plot(X_tr, y_pred_tr, '.b', markersize=10, alpha=.4)
polynomial_regr(degree=2)
from ipywidgets import interact, HTML, FloatSlider
interact(polynomial_regr,degree=(3,150,1));
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
y_preds = pd.DataFrame(data=np.zeros((y.shape[0],0)))
ARE = pd.DataFrame(data=np.zeros((y.shape[0],0)))
MAE = pd.DataFrame(data=np.zeros((1,0)))
training_times = []
max_degree = 40
for degree in range(1, max_degree):
    poly = PolynomialFeatures(degree=degree)
    x1_ = poly.fit_transform(x1)
    t0 = time.time()
    regr = linear_model.LinearRegression()
    regr.fit(x1_, y)
    print('dimension of training data for polynomial degree {} is {}'.format(degree, x1_.shape[1]))
    print('degree {} is done in {} seconds'.format(degree, time.time()-t0))
    y_pred = regr.predict(x1_)[:]
    y_preds['degree-{}'.format(degree)] = y_pred.ravel()
    MAE['degree-{}'.format(degree)] = np.mean(np.abs(y_pred-y))
    ARE['degree-{}'.format(degree)] = (100*np.abs(y_pred-y)/y).ravel()
(output: the feature dimension grows linearly with the degree, from 2 columns at degree 1 up to 40 at degree 39, and every fit completes in under a millisecond)
quality = [1, 5, 10, 15, 20]
percentiles = pd.DataFrame(data=np.zeros((len(quality), ARE.shape[1])), index=quality, columns=ARE.columns)
for percent in quality:
    for m in range(ARE.shape[1]):
        a = ARE.values[:, m]
        n = float(a.shape[0])
        percentiles.loc[percent, ARE.columns[m]] = 100*a[a <= percent].shape[0]/n
        # equivalently: 100*(a <= percent).sum()/float(a.shape[0])
percentiles
med_error = pd.DataFrame(data=ARE.median(axis=0).values[np.newaxis, :].copy(), columns=ARE.columns, index=['median'])
med_error
percentiles = pd.concat((med_error, percentiles))
percentiles = percentiles.round(2)
# percentiles.index.name = 'ARE'
percentiles.T
Median ARE per degree, and the percentage of points whose ARE falls below each threshold:

| | median | ≤1% | ≤5% | ≤10% | ≤15% | ≤20% |
|---|---|---|---|---|---|---|
| degree-1 | 15.56 | 8.25 | 20.00 | 33.25 | 48.25 | 62.75 |
| degree-2 | 9.23 | 9.25 | 30.25 | 53.00 | 69.75 | 81.25 |
| degree-3 | 8.64 | 10.25 | 31.75 | 54.50 | 72.75 | 84.75 |
| degree-4 | 8.48 | 11.00 | 31.25 | 56.00 | 73.00 | 85.00 |
| degree-5 | 8.65 | 9.50 | 31.75 | 55.75 | 73.00 | 84.75 |
| degree-6 | 8.69 | 9.75 | 32.50 | 54.75 | 72.75 | 84.75 |
| degree-7 | 8.74 | 11.25 | 31.00 | 55.75 | 74.00 | 85.00 |
| degree-8 | 8.66 | 11.00 | 32.00 | 55.75 | 74.25 | 84.75 |
| degree-9 | 8.45 | 11.00 | 31.25 | 56.00 | 72.75 | 85.25 |
| degree-10 | 8.62 | 10.50 | 31.25 | 55.75 | 73.50 | 85.00 |
| degree-11 | 8.54 | 10.75 | 31.00 | 55.75 | 72.75 | 85.00 |
| degree-12 | 8.60 | 10.50 | 31.00 | 55.75 | 73.50 | 85.00 |
| degree-13 | 8.70 | 11.00 | 32.00 | 55.25 | 74.75 | 84.75 |
| degree-14 | 8.69 | 10.75 | 31.25 | 55.75 | 74.50 | 84.75 |
| degree-15 | 8.72 | 10.00 | 31.75 | 55.75 | 73.25 | 83.75 |
| degree-16 | 9.00 | 10.75 | 31.25 | 54.25 | 72.25 | 84.75 |
| degree-17 | 8.88 | 10.75 | 32.00 | 55.25 | 72.50 | 85.25 |
| degree-18 | 9.12 | 10.50 | 32.00 | 55.50 | 72.25 | 85.25 |
| degree-19 | 9.06 | 11.00 | 32.25 | 54.75 | 72.25 | 85.25 |
| degree-20 | 9.02 | 12.00 | 31.25 | 54.75 | 73.25 | 85.00 |
| degree-21 | 8.84 | 12.00 | 31.25 | 53.50 | 73.25 | 85.00 |
| degree-22 | 8.90 | 12.00 | 31.75 | 53.25 | 73.25 | 85.00 |
| degree-23 | 8.87 | 11.75 | 32.00 | 54.00 | 73.50 | 84.25 |
| degree-24 | 8.85 | 13.00 | 32.00 | 54.50 | 73.00 | 84.50 |
| degree-25 | 8.88 | 10.75 | 32.00 | 54.25 | 73.00 | 85.00 |
| degree-26 | 8.95 | 11.00 | 32.25 | 55.75 | 73.00 | 85.00 |
| degree-27 | 9.00 | 10.00 | 32.00 | 56.25 | 73.00 | 84.75 |
| degree-28 | 9.04 | 9.75 | 32.75 | 56.00 | 73.00 | 84.75 |
| degree-29 | 8.89 | 10.25 | 31.75 | 55.25 | 72.25 | 83.75 |
| degree-30 | 9.00 | 10.75 | 30.00 | 54.50 | 71.25 | 82.50 |
| degree-31 | 9.18 | 11.25 | 29.25 | 54.00 | 71.25 | 82.25 |
| degree-32 | 9.49 | 12.25 | 29.50 | 51.25 | 70.25 | 80.50 |
| degree-33 | 9.84 | 12.25 | 29.75 | 51.25 | 69.50 | 79.25 |
| degree-34 | 9.78 | 11.00 | 30.25 | 50.50 | 68.50 | 79.00 |
| degree-35 | 10.17 | 10.00 | 30.50 | 49.50 | 66.75 | 77.75 |
| degree-36 | 10.39 | 10.75 | 30.25 | 49.25 | 64.75 | 77.00 |
| degree-37 | 10.37 | 9.50 | 31.00 | 49.50 | 63.75 | 75.50 |
| degree-38 | 10.87 | 9.25 | 31.25 | 48.75 | 63.75 | 73.75 |
| degree-39 | 10.93 | 8.25 | 28.75 | 48.25 | 63.25 | 73.75 |
ax1 = plt.subplot(111)
# plt.title('degree-{} ARE for ts'.format(degree))
plt.plot(percentiles.T['median'].values[1:],'r')
plt.xlabel('polynomial degree')
plt.ylabel('median absolute relative error')
path = './Images/rentalprice.csv'
rental = pd.read_csv(path)
rental.head()
| | Rent | ZIP | Type_Apartment | Type_Duplex | Type_Single house | Type_Studio | Rooms | Year built | Living space | lng | lat |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 645.0 | 5000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1954.0 | 28.0 | 8.041672 | 47.397999 |
| 1 | 1340.0 | 5000 | 1.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1971.0 | 88.0 | 8.057444 | 47.379288 |
| 2 | 1380.0 | 5000 | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 1968.0 | 69.0 | 8.057165 | 47.378022 |
| 3 | 1480.0 | 5000 | 1.0 | 0.0 | 0.0 | 0.0 | 3.5 | 1976.0 | 81.0 | 8.057974 | 47.400780 |
| 4 | 1500.0 | 5000 | 1.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1968.0 | 80.0 | 8.057165 | 47.378022 |
# To separate training and testing data from each other
import random
Target = 0
# Data selection
Mat_all = rental.iloc[:, :].copy()
ind_col = np.arange(Mat_all.shape[1]) != Target
Tr_data_size = int(.6*Mat_all.shape[0])
print(Tr_data_size)
ind_row_train = random.sample(range(Mat_all.shape[0]), Tr_data_size)
ind_row_test = list(range(Mat_all.shape[0]))
for i in ind_row_train:
    ind_row_test.remove(i)
print(len(ind_row_test))
# The target column is always the first one
y_tr = Mat_all.values[ind_row_train].astype(float)
y_tr = y_tr[:, Target].astype(float)
y_ts = Mat_all.values[ind_row_test].astype(float)
y_ts = y_ts[:, Target].astype(float)
X_tr = Mat_all.values[ind_row_train].astype(float)
X_tr = X_tr[:, ind_col].astype(float)
# add a tiny jitter to constant (zero-variance) columns so the design
# matrix does not become singular
std = X_tr.std(axis=0)
randomization = 1e-25*np.random.randn(X_tr.shape[0], 1)
ind_std0 = std == 0
X_tr[:, ind_std0] = X_tr[:, ind_std0] + randomization
X_ts = Mat_all.values[ind_row_test].astype(float)
X_ts = X_ts[:, ind_col].astype(float)
12599 8400
import time
y_preds_tr = pd.DataFrame(data=np.zeros((y_tr.shape[0],0)))
ARE_tr = pd.DataFrame(data=np.zeros((y_tr.shape[0],0)))
MAE_tr = pd.DataFrame(data=np.zeros((1,0)))
y_preds_ts = pd.DataFrame(data=np.zeros((y_ts.shape[0],0)))
ARE_ts = pd.DataFrame(data=np.zeros((y_ts.shape[0],0)))
MAE_ts = pd.DataFrame(data=np.zeros((1,0)))
training_times = []
max_degree = 6
for degree in range(1, max_degree):
    poly = PolynomialFeatures(degree=degree)
    X_tr_ = poly.fit_transform(X_tr)
    # transform (not fit_transform) the test data with the same feature map
    X_ts_ = poly.transform(X_ts)
    t0 = time.time()
    regr = linear_model.LinearRegression()
    regr.fit(X_tr_, y_tr)
    print('dimension of training data for polynomial degree {} is {}'.format(degree, X_tr_.shape[1]))
    training_times.append(time.time()-t0)
    print('degree {} is done in {} seconds'.format(degree, time.time()-t0))
    y_pred_tr = regr.predict(X_tr_)[:]
    y_pred_ts = regr.predict(X_ts_)[:]
    y_preds_tr['degree-{}'.format(degree)] = y_pred_tr
    MAE_tr['degree-{}'.format(degree)] = np.mean(np.abs(y_pred_tr-y_tr))
    ARE_tr['degree-{}'.format(degree)] = 100*np.abs(y_pred_tr-y_tr)/y_tr
    y_preds_ts['degree-{}'.format(degree)] = y_pred_ts
    MAE_ts['degree-{}'.format(degree)] = np.mean(np.abs(y_pred_ts-y_ts))
    ARE_ts['degree-{}'.format(degree)] = 100*np.abs(y_pred_ts-y_ts)/y_ts
plt.plot(range(1, max_degree), training_times)
plt.xlabel('polynomial degree')
plt.ylabel('training time (seconds)')
(output)
degree 1: 11 features, fit in 0.0039 s
degree 2: 66 features, fit in 0.033 s
degree 3: 286 features, fit in 0.23 s
degree 4: 1001 features, fit in 5.1 s
degree 5: 3003 features, fit in 256.8 s
quality = [1, 5, 10, 15, 20]
percentiles = pd.DataFrame(data=np.zeros((len(quality), ARE_tr.shape[1])), index=quality, columns=ARE_tr.columns)
for percent in quality:
    for m in range(ARE_tr.shape[1]):
        a = ARE_tr.values[:, m]
        n = float(a.shape[0])
        percentiles.loc[percent, ARE_tr.columns[m]] = 100*a[a <= percent].shape[0]/n
        # equivalently: 100*(a <= percent).sum()/float(a.shape[0])
percentiles
med_error = pd.DataFrame(data=ARE_tr.median(axis=0).values[np.newaxis, :].copy(), columns=ARE_tr.columns, index=['median'])
med_error
percentiles = pd.concat((med_error, percentiles))
percentiles = percentiles.round(2)
# percentiles.index.name = 'ARE'
percentiles.T
Training-set results:

| | median | ≤1% | ≤5% | ≤10% | ≤15% | ≤20% |
|---|---|---|---|---|---|---|
| degree-1 | 13.89 | 4.08 | 19.76 | 37.39 | 53.09 | 64.87 |
| degree-2 | 13.08 | 4.52 | 21.61 | 40.04 | 55.67 | 67.09 |
| degree-3 | 12.51 | 4.65 | 21.80 | 41.80 | 57.36 | 70.00 |
| degree-4 | 11.14 | 4.94 | 23.95 | 45.65 | 62.54 | 74.66 |
| degree-5 | 10.55 | 5.29 | 25.60 | 47.89 | 64.62 | 76.96 |
quality = [1, 5, 10, 15, 20]
percentiles = pd.DataFrame(data=np.zeros((len(quality), ARE_ts.shape[1])), index=quality, columns=ARE_ts.columns)
for percent in quality:
    for m in range(ARE_ts.shape[1]):
        a = ARE_ts.values[:, m]
        n = float(a.shape[0])
        percentiles.loc[percent, ARE_ts.columns[m]] = 100*a[a <= percent].shape[0]/n
        # equivalently: 100*(a <= percent).sum()/float(a.shape[0])
percentiles
med_error = pd.DataFrame(data=ARE_ts.median(axis=0).values[np.newaxis, :].copy(), columns=ARE_ts.columns, index=['median'])
med_error
percentiles = pd.concat((med_error, percentiles))
percentiles = percentiles.round(2)
# percentiles.index.name = 'ARE'
percentiles.T
Test-set results:

| | median | ≤1% | ≤5% | ≤10% | ≤15% | ≤20% |
|---|---|---|---|---|---|---|
| degree-1 | 14.13 | 3.73 | 19.20 | 37.00 | 52.21 | 64.55 |
| degree-2 | 13.23 | 4.30 | 20.88 | 39.21 | 55.18 | 66.80 |
| degree-3 | 12.63 | 4.21 | 21.92 | 41.27 | 56.63 | 68.65 |
| degree-4 | 11.47 | 4.88 | 24.49 | 45.04 | 60.71 | 72.76 |
| degree-5 | 11.05 | 5.39 | 25.15 | 46.42 | 62.62 | 74.13 |
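Comparing the two tables, the test medians track the training medians closely, which suggests only mild overfitting up to degree 5. A one-liner (a small addition) makes the gap explicit:

# train/test gap in median absolute relative error, per degree
gap = ARE_ts.median(axis=0) - ARE_tr.median(axis=0)
print(gap.round(2))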
N = 500
x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis]
x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis]
y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis]
# y = 2*x1
x1 = np.concatenate((np.random.normal(loc=10,scale=.9,size=N) , np.random.normal(loc=13,scale=.2,size=N),np.random.normal(loc=5,scale=.1,size=N)))[:,np.newaxis]
y =-.1*x1**3 + 2*x1*x1 + 7*np.sin(x1) + 3*np.sqrt(x1)+ 1*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis]
# x_tmp = x1.copy()
# x1 = y
# y = x_tmp
x1 = Data[:,0][:,np.newaxis]
y = Data[:,1][:,np.newaxis]
fig = plt.figure(figsize=(7,7))
ax1= plt.subplot(111)
plt.plot(x1,y,'.r',markersize=5,alpha=1 );
from ipywidgets import interact, HTML, FloatSlider
interact(polynomial_regr,degree=(1,50,1));
Compare this with the kernel density learning we saw above.
N = 1500
x1= np.random.normal(loc=17,scale=5,size=N)[:,np.newaxis]
x2= np.random.normal(loc=0,scale=5,size=N)[:,np.newaxis]
y = 3*x1 + np.random.normal(loc=.0, scale=.4, size=N)[:,np.newaxis]
# y = 2*x1
x1 = np.concatenate((np.random.normal(loc=10,scale=.9,size=N) , np.random.normal(loc=13,scale=.2,size=N),np.random.normal(loc=5,scale=.1,size=N)))[:,np.newaxis]
y =-.1*x1**3 + 2*x1*x1 + 7*np.sin(x1) + 3*np.sqrt(x1)+ 1*np.random.normal(loc=30.0, scale=4.7, size=len(x1))[:,np.newaxis]
y[y<100] =y[y<100] +100
y[x1>12] =y[x1>12] -50
# x_tmp = x1.copy()
# x1 = y
# y = x_tmp
# x1 = Data[:,0][:,np.newaxis]
# y = Data[:,1][:,np.newaxis]
fig = plt.figure(figsize=(7,7))
ax1= plt.subplot(111)
plt.plot(x1,y,'.r',markersize=5,alpha=1 );
# Its use is pretty similar to the previous one.
# Here we use the pyqt_fit library for local polynomial (nonparametric) regression
import pyqt_fit.nonparam_regression as smooth
from pyqt_fit import npr_methods
import time
poly_order=5
y_preds_ = pd.DataFrame(data=np.zeros((y.shape[0],0)))
ARE = pd.DataFrame(data=np.zeros((y.shape[0],0)))
MAE = pd.DataFrame(data=np.zeros((1,0)))
t0 = time.time()
regr = smooth.NonParamRegression(x1.T, y[:, 0], method=npr_methods.LocalPolynomialKernel(q=poly_order))
name = 'Poly-nonparametric-'+str(poly_order)
regr.fit()
y_preds_local = regr(x1.T)[:, np.newaxis]
print('local polynomial with degree {} is done in {} seconds'.format(poly_order, time.time()-t0))
y_preds_['local-degree-{}'.format(poly_order)] = y_preds_local.ravel()
MAE['local-degree-{}'.format(poly_order)] = np.mean(np.abs(y_preds_local-y))
ARE['local-degree-{}'.format(poly_order)] = (100*np.abs(y_preds_local-y)/y).ravel()
t0 = time.time()
poly = PolynomialFeatures(degree=poly_order)  # use poly_order, not the leftover loop variable
x1_ = poly.fit_transform(x1)
regr = linear_model.LinearRegression()
regr.fit(x1_, y)
y_preds_global = regr.predict(x1_)
print('global polynomial with degree {} is done in {} seconds'.format(poly_order, time.time()-t0))
y_preds_['global-degree-{}'.format(poly_order)] = y_preds_global.ravel()
MAE['global-degree-{}'.format(poly_order)] = np.mean(np.abs(y_preds_global-y))
ARE['global-degree-{}'.format(poly_order)] = (100*np.abs(y_preds_global-y)/y).ravel()
fig = plt.figure(figsize=(7,7))  # avoid shadowing bokeh's `figure`
plt.plot(x1, y, '.', alpha=0.1, label='Data')
plt.plot(x1, y_preds_local, '.g', alpha=1, label='Local')
plt.plot(x1, y_preds_global, '.r', alpha=1, label='Global')
plt.legend(bbox_to_anchor=(1.3, 1))
MAE
quality = [1, 5, 10, 15, 20]
percentiles = pd.DataFrame(data=np.zeros((len(quality), ARE.shape[1])), index=quality, columns=ARE.columns)
for percent in quality:
    for m in range(ARE.shape[1]):
        a = ARE.values[:, m]
        n = float(a.shape[0])
        percentiles.loc[percent, ARE.columns[m]] = 100*a[a <= percent].shape[0]/n
        # equivalently: 100*(a <= percent).sum()/float(a.shape[0])
percentiles
med_error = pd.DataFrame(data=ARE.median(axis=0).values[np.newaxis, :].copy(), columns=ARE.columns, index=['median'])
med_error
percentiles = pd.concat((med_error, percentiles))
percentiles = percentiles.round(2)
# percentiles.index.name = 'ARE'
percentiles.T