#!/usr/bin/env python
# coding: utf-8
#
#
# ## Why is gradient descent so bad at optimizing polynomial regression?
#
# Question from Stackexchange:
# https://stats.stackexchange.com/questions/350130/why-is-gradient-descent-so-bad-at-optimizing-polynomial-regression
#
#
#
# ### Linear regression
#
# #### Cost function
# $J(\theta) = \frac{1}{2m}\sum_{i=1}^m(h_\theta(x^{(i)}) - y^{(i)})^2 $
#
# $J(\theta) = \frac{1}{2m}(X\theta - y)^T(X\theta - y) $ (vectorized version)
#
# #### Gradient
# $\frac{\partial J(\theta)}{\partial \theta} = \frac{1}{m}X^T(X\theta - y) $
#
# #### Hessian
# $\frac{\partial^2 J(\theta)}{\partial \theta^2} = \frac{1}{m}X^T X $
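#
# These follow by expanding $J(\theta) = \frac{1}{2m}\left(\theta^T X^T X \theta - 2 y^T X \theta + y^T y\right)$ and differentiating with respect to $\theta$: one derivative gives the gradient, a second gives the constant Hessian.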
#
# ## Polynomial regression
# The design matrix is of the form:
#
# $\mathbf{X} = [\mathbf{1}, \mathbf{x}, \mathbf{x}^2, \mathbf{x}^3, \dots, \mathbf{x}^n]$
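#
# For example, with $n = 2$ and three observations $x_1, x_2, x_3$:
#
# $\mathbf{X} = \begin{bmatrix} 1 & x_1 & x_1^2 \\ 1 & x_2 & x_2^2 \\ 1 & x_3 & x_3^2 \end{bmatrix}$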
#
# ### Libraries
# In[1]:
import numpy as np
import pandas as pd
import scipy.optimize as opt
from sklearn import linear_model
import statsmodels.api as sm
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')
# ### Helper Functions
# In[2]:
def costfunction(theta, X, y):
    '''Least squares cost function in vectorized form'''
    m = np.size(y)
    X = np.asarray(X)
    theta = np.asarray(theta).reshape(-1, 1)
    h = X @ theta
    J = ((1. / (2 * m)) * (h - y).T @ (h - y)).item()
    return J
def gradient_descent(theta, X, y, alpha = 0.0005, num_iters=1000):
    '''Batch gradient descent on the least squares cost, tracking history for plotting'''
    m = np.size(y)
    X = np.asarray(X)
    theta = np.asarray(theta, dtype=float).reshape(-1, 1)
    J_history = np.empty(num_iters)
    count_history = np.empty(num_iters)
    theta_1_hist, theta_2_hist = [], []
    for i in range(num_iters):
        #Gradient step in vectorized form
        h = X @ theta
        theta = theta - alpha * (1/m) * (X.T @ (h - y))
        #Tracker values for plotting
        J_history[i] = costfunction(theta, X, y)
        count_history[i] = i
        theta_1_hist.append(theta[0, 0])
        theta_2_hist.append(theta[1, 0])
    return theta, J_history, count_history, theta_1_hist, theta_2_hist
def grad(theta, X, y):
    '''Gradient of the cost function, flattened for use with scipy.optimize'''
    theta = np.asarray(theta).reshape(-1, 1)
    X = np.asarray(X)
    m = len(y)
    h = X @ theta
    return ((1./m) * (X.T @ (h - y))).flatten()
def polynomial_features(data, deg):
    '''Add columns X2..X<deg> as successive powers of the X1 column'''
    data_copy = data.copy()
    for i in range(1, deg):
        data_copy['X' + str(i + 1)] = data_copy['X' + str(i)] * data_copy['X1']
    return data_copy
def hessian(theta, X, y):
    '''Hessian of the cost function: (1/m) X^T X, constant for linear least squares'''
    X = np.asarray(X)
    m = X.shape[0]
    return (1./m) * (X.T @ X)
# # Comparing results for high-order polynomial regression
# ### Initializing the data
# In[3]:
#Create data from sin function with uniform noise
x = np.linspace(0.1,1,40)
noise = np.random.uniform( size = 40)
y = np.sin(x * 1.5 * np.pi )
y_noise = (y + noise).reshape(-1,1)
y_noise = y_noise - y_noise.mean() #Centering the data
degree = 7
X_d = polynomial_features(pd.DataFrame({'X0':1,'X1': x}),degree)
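# #### Conditioning of the design matrix
# Gradient descent's convergence rate is governed by the condition number of the Hessian $\frac{1}{m}X^T X$. A quick added check of that number for the degree-7 design matrix shows why gradient descent will need a small step size and many iterations here.
# In[ ]:
#Condition number of the (scaled) Gram matrix (1/m) X^T X
cond = np.linalg.cond((1./np.size(y_noise)) * X_d.values.T @ X_d.values)
print('Condition number of (1/m) X^T X: {:.2e}'.format(cond))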
# ### Closed form solution
# In[4]:
def closed_form_solution(X, y):
    return np.linalg.inv(X.T @ X) @ X.T @ y
coefs = closed_form_solution(X_d.values,y_noise)
coefs
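# Explicitly inverting $X^T X$ works, but it is numerically fragile when the polynomial design matrix is ill-conditioned. A minimal sketch of two equivalent, more stable alternatives (solving the normal equations, and least squares directly on $X$):
# In[ ]:
#Numerically safer equivalents of the explicit inverse
coefs_solve = np.linalg.solve(X_d.values.T @ X_d.values, X_d.values.T @ y_noise)
coefs_lstsq, *_ = np.linalg.lstsq(X_d.values, y_noise, rcond=None)
coefs_lstsq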
# ### Numpy only fit
# In[5]:
stepsize = .1
theta_result_1,J_history_1, count_history_1, theta_1_hist, theta_2_hist = gradient_descent(np.zeros((len(X_d.T),1)).reshape(-1,1), X_d,y_noise,alpha = stepsize,num_iters=5000)
display(theta_result_1)
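# As a quick check of how far gradient descent still is from the optimum, compare its final cost against the cost of the closed form solution:
# In[ ]:
#Final gradient descent cost vs. the closed form cost
print('Closed form cost     :', costfunction(coefs, X_d, y_noise))
print('Gradient descent cost:', J_history_1[-1])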
# ### Scipy optimize fit using the first order derivative only
# #### Comment: BFGS does very well but requires adjustment of the options
# In particular, the gradient tolerance (`gtol`) must be made smaller, because the cost function is very flat near the global minimum
# In[6]:
import scipy.optimize as opt
theta_init = np.ones(len(X_d.T))
model_t = opt.minimize(fun = costfunction, x0 = theta_init , args = (X_d, y_noise),
method = 'BFGS', jac = grad, options={'maxiter':1000, 'gtol': 1e-10, 'disp' : True})
model_t.x
# ### Scipy optimize fit using the Hessian matrix
# #### As expected, second order information allows the optimizer to converge much faster
# In[7]:
import scipy.optimize as opt
theta_init = np.ones(len(X_d.T))
model_t = opt.minimize(fun = costfunction, x0 = theta_init , args = (X_d, y_noise),
method = 'dogleg', jac = grad, hess= hessian, options={'maxiter':1000, 'disp':True})
model_t.x
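# Why second order information helps so much here: the cost is quadratic in $\theta$, so the Hessian is constant and a single Newton step $\theta - H^{-1}\nabla J(\theta)$ from any starting point lands on the closed form solution. A minimal sketch of that check, reusing the `grad` and `hessian` helpers:
# In[ ]:
#One Newton step from zero recovers the closed form solution of a quadratic cost
theta0 = np.zeros(len(X_d.T))
newton_step = theta0 - np.linalg.solve(hessian(theta0, X_d, y_noise), grad(theta0, X_d, y_noise))
newton_step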
# ### Sklearn fit
# In[8]:
from sklearn import linear_model
model_d = linear_model.LinearRegression(fit_intercept=False)
model_d.fit(X_d,y_noise)
model_d.coef_
# ### Statsmodel fit
# In[9]:
import statsmodels.api as sm
model_sm = sm.OLS(y_noise, X_d)
res = model_sm.fit()
print(res.summary())
# # Repeating the experiment with 2 polynomial variables and visualizing the results
# Here we will focus on a 2-D design matrix with $x$ and $x^2$ values. The $y$ values have been centered, so we will ignore the constant term and the y-intercept.
# ### Initializing the data
# In[10]:
#Create data from sin function with uniform noise
x = np.linspace(0.1,1,40) #Adjusting the starting point to reduce numerical instability
noise = np.random.uniform( size = 40)
y = np.sin(x * 1.5 * np.pi )
y_noise = (y + noise).reshape(-1,1)
y_noise = y_noise - y_noise.mean() #Centering the data
#2nd order polynomial only
degree = 2
X_d = polynomial_features(pd.DataFrame({'X1': x}),degree)
#Starting point for gradient descent - see later diagrams
initial_theta = np.array([0,-2]).reshape(-1,1)
# In[11]:
X_d = X_d[['X1','X2']]
X_d.head()
# ### Closed form solution
# In[12]:
def closed_form_solution(X, y):
    return np.linalg.inv(X.T @ X) @ X.T @ y
coefs = closed_form_solution(X_d.values,y_noise)
coefs
# ### Numpy only fit
# In[14]:
stepsize = .3
theta_result,J_history, count_history_1, theta_1, theta_2 = gradient_descent(initial_theta,X_d,y_noise,alpha = stepsize,num_iters=10000)
display(theta_result)
# ### Plotting the gradient descent convergence and resulting fits
# In[15]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18,8))
#Looping through different stepsizes
for s in [.001,.01,.1,1]:
    theta_calc,J_history_1, count_history_1, theta_1, theta_2 = gradient_descent(initial_theta, X_d,y_noise,alpha = s,num_iters=5000)
    #Plot gradient descent convergence
    ax1.plot(count_history_1, J_history_1, label = 'Grad. desc. stepsize: {}'.format(s))
    #Plot resulting fits on data
    ax2.plot(x,X_d@theta_calc, label = 'Grad. desc. stepsize: {}'.format(s))
#Adding plot features to the convergence plot
ax1.axhline(costfunction(coefs, X_d, y_noise), linestyle=':', label = 'Closed form minimum')
ax1.set_xlabel('Count')
ax1.set_ylabel('Cost function')
ax1.set_title('Plot of convergence: polynomial regression of degree {}'.format(degree))
ax1.legend(loc = 1)
#Adding plot features to the fit plot
ax2.scatter(x,y_noise, facecolors = 'none', edgecolor = 'darkblue', label = 'f(x) + noise')
ax2.plot(x,X_d@coefs, linestyle=':', label = 'Closed form fit')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Noisy data and gradient descent fits')
ax2.legend()
plt.show()
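# The behaviour of the different step sizes above is governed by the curvature of the cost: for a quadratic cost, gradient descent is stable only for $\alpha < 2/\lambda_{max}$, and the slowest direction contracts by roughly a factor $(1 - \alpha\lambda_{min})$ per iteration, where $\lambda_{max}$ and $\lambda_{min}$ are the largest and smallest eigenvalues of the Hessian $\frac{1}{m}X^T X$. A small added check of these quantities for the 2-column design matrix:
# In[ ]:
#Hessian eigenvalues and the resulting stability limit on the step size
eigvals = np.linalg.eigvalsh(hessian(initial_theta, X_d, y_noise))
print('Hessian eigenvalues:', eigvals)
print('Stability limit on the step size: {:.3f}'.format(2/eigvals.max()))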
# ### Plotting the cost function in 3D
# In[16]:
#Creating the dataset (as previously)
X = X_d.values
#Setup of meshgrid of theta values
T0, T1 = np.meshgrid(np.linspace(0,6,100),np.linspace(0,-8,100))
#Computing the cost function for each theta combination
zs = np.array( [costfunction(np.array([t0,t1]).reshape(-1,1), X, y_noise.reshape(-1,1))
for t0, t1 in zip(np.ravel(T0), np.ravel(T1)) ] )
#Reshaping the cost values
Z = zs.reshape(T0.shape)
#Computing the gradient descent
theta_result,J_history, count_history_1, theta_1, theta_2 = gradient_descent(initial_theta,X,y_noise,alpha = 0.3,num_iters=5000)
#Angles needed for quiver plot
anglesx = np.array(theta_1)[1:] - np.array(theta_1)[:-1]
anglesy = np.array(theta_2)[1:] - np.array(theta_2)[:-1]
get_ipython().run_line_magic('matplotlib', 'inline')
fig = plt.figure(figsize = (16,8))
#Surface plot
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(T0, T1, Z, rstride = 5, cstride = 5, cmap = 'jet', alpha=0.5)
ax.plot(theta_1,theta_2,J_history, marker = '*',markersize = 4, color = 'r', alpha = .2, label = 'Gradient descent')
ax.plot(coefs[0],coefs[1], costfunction(coefs, X, y_noise), marker = '*', color = 'black', markersize = 10, label = 'Closed form minimum')
ax.set_xlabel('theta 1')
ax.set_ylabel('theta 2')
ax.set_zlabel('Cost function')
ax.set_title('Gradient descent: Minimum at {}'.format(theta_result.flatten().round(2)))
ax.view_init(45, -45)
ax.legend()
#Contour plot
ax = fig.add_subplot(1, 2, 2)
ax.contour(T0, T1, Z, 70, cmap = 'jet')
ax.quiver(theta_1[:-1], theta_2[:-1], anglesx, anglesy, scale_units = 'xy', angles = 'xy', scale = 1, color = 'r', alpha = .9)
ax.plot(coefs[0],coefs[1], marker = '*', color = 'black', markersize = 10, label = 'Closed form minimum')
ax.set_xlabel('theta 1')
ax.set_ylabel('theta 2')
ax.set_title('Gradient descent: Minimum at {}'.format(theta_result.flatten().round(2)))
ax.legend()
plt.show()
# ### Scipy optimize fit
# In[17]:
import scipy.optimize as opt
theta_init = np.ones(len(X_d.T))
model_t = opt.minimize(fun = costfunction, x0 = theta_init , args = (X_d, y_noise),
method = 'dogleg', jac = grad, hess= hessian, options={'maxiter':1000})
model_t.x
# ### Sklearn fit
# In[18]:
from sklearn import linear_model
model_d = linear_model.LinearRegression(fit_intercept=False)
model_d.fit(X_d,y_noise)
model_d.coef_
# ### Statsmodel fit
# In[19]:
import statsmodels.api as sm
model_sm = sm.OLS(y_noise, X_d)
res = model_sm.fit()
print(res.summary())