#!/usr/bin/env python
# coding: utf-8

# ### Example of Gradient Descent optimization for simple linear regression (example adapted from "The Hundred-Page Machine Learning Book" by Andriy Burkov).

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.rcParams.update({'font.size': 18})


# Recall the simple linear regression model: $f(x) = wx + b$. We find the values of the parameters $w$ and $b$ by minimizing the mean squared error:
# 
# \begin{equation*}
# l = \frac{1}{N} \sum_{i=1}^N (y_i - (wx_i + b))^2
# \end{equation*}
# 
# For Gradient Descent, we first need to obtain the partial derivatives of this loss function with respect to each parameter:
# 
# \begin{equation*}
# \frac{\partial l}{\partial w} = \frac{1}{N} \sum_{i=1}^N -2x_i(y_i - (wx_i + b))
# \end{equation*}
# 
# \begin{equation*}
# \frac{\partial l}{\partial b} = \frac{1}{N} \sum_{i=1}^N -2(y_i - (wx_i + b))
# \end{equation*}
# 
# Gradient Descent proceeds in epochs. In each epoch, the entire training set is used to update each parameter. We initialize the parameters $w$ and $b$ to 0. In each iteration (epoch), we compute the partial derivatives using the current values of $w$ and $b$. Then we simultaneously update the parameters $w$ and $b$ using these partial derivatives:
# 
# \begin{equation*}
# w = w - \alpha \frac{\partial l}{\partial w}
# \end{equation*}
# 
# \begin{equation*}
# b = b - \alpha \frac{\partial l}{\partial b}
# \end{equation*}
# 
# Here $\alpha$ is the learning rate and controls the size of the update.

# #### Example: Let's look at a specific example of simple linear regression using a real data set about ad spending. Here, in $y = wx + b$, $y$ = Sales (unit product sales) and $x$ = Spending (in millions of dollars). We want to find the optimal values of $w$ and $b$ using Gradient Descent.

# In[2]:


salesDF = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/data.csv")
salesDF.head(10)


# In[3]:


salesDF.shape


# In[4]:


def plot_original_data(salesDF):
    x = salesDF['Spending']
    y = salesDF['Sales']

    plt.scatter(x, y, color='#1f77b4', marker='o')
    plt.xlabel("Spending, M$")
    plt.ylabel("Sales, Units")
    plt.title("Sales as a function of radio ad spending.")
    plt.show()

    # fig1 = plt.gcf()
    # fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    # fig1.savefig('Illustrations/gradient_descent-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    # fig1.savefig('Illustrations/gradient_descent-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)


# In[5]:


plot_original_data(salesDF)


# #### Let's first create a function that updates the values of $w$ and $b$ using the partial derivatives during one epoch (one pass through the training data).

# In[6]:


def update_w_and_b(spendings, sales, w, b, alpha):
    dr_dw = 0.0
    dr_db = 0.0
    N = len(spendings)

    # accumulate the gradient contributions of all training instances
    for i in range(N):
        dr_dw += -2 * spendings[i] * (sales[i] - (w * spendings[i] + b))
        dr_db += -2 * (sales[i] - (w * spendings[i] + b))

    # update w and b
    w = w - (dr_dw / float(N)) * alpha
    b = b - (dr_db / float(N)) * alpha

    return w, b
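# #### As a side note (not part of the original example), the same one-epoch update can be written in vectorized form with NumPy, directly mirroring the two partial-derivative formulas above. The function name `update_w_and_b_vectorized` is our own, and the sketch assumes `spendings` and `sales` can be converted to NumPy arrays.

# In[ ]:


# Illustrative sketch (not in the original notebook): vectorized one-epoch update
def update_w_and_b_vectorized(spendings, sales, w, b, alpha):
    # convert to NumPy arrays so all operations are elementwise
    x = np.asarray(spendings, dtype=float)
    y = np.asarray(sales, dtype=float)
    N = len(x)

    residuals = y - (w * x + b)            # y_i - (w*x_i + b) for all i
    dr_dw = np.sum(-2 * x * residuals)     # dl/dw summed over all instances
    dr_db = np.sum(-2 * residuals)         # dl/db summed over all instances

    # same update rule as update_w_and_b above
    w = w - (dr_dw / N) * alpha
    b = b - (dr_db / N) * alpha
    return w, b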
# #### The loss function below simply returns the mean squared error on the training data based on the current values of $w$ and $b$. This function will be called in each epoch after the parameters are updated so that we can monitor the progress of the gradient descent algorithm.

# In[7]:


def loss(spendings, sales, w, b):
    N = len(spendings)
    total_error = 0.0
    for i in range(N):
        total_error += (sales[i] - (w * spendings[i] + b))**2
    return total_error / N


# #### Finally, let's create a function for training the model using gradient descent. As noted above, the function iterates over many "epochs", in each of which the full training data is used to update the parameters. The output shows the incremental progress towards the final values of $w$ and $b$.

# In[8]:


def train(x, y, w, b, alpha, epochs):
    for e in range(epochs):
        w, b = update_w_and_b(x, y, w, b, alpha)

        # log the progress
        if e % 500 == 0:
            print("epoch: ", str(e), "loss: " + str(loss(x, y, w, b)))
            print("w, b: ", w, b)
            print("\n")

    return w, b


# #### Now let's run gradient descent on our problem using 15,000 epochs (15,000 passes through the training data) and a learning rate of 0.001. Our initial values for $w$ and $b$ can be set randomly or set to zeros.

# In[9]:


# x, y = np.loadtxt("Data/data.csv", delimiter=",", skiprows=1, unpack=True)
x = salesDF['Spending']
y = salesDF['Sales']

w, b = train(x, y, 0.0, 0.0, 0.001, 15000)


# #### Let's now try to use our learned model to predict the Sales value for a given value of Spending:

# In[10]:


print("Final Values: w = ", w, "b = ", b)

def predict(x, w, b):
    return w * x + b

x_new = 23.0
y_new = predict(x_new, w, b)
print("Spending =", x_new, "==> Predicted Sales =", y_new)


# #### Let's compare this to what we get from the scikit-learn LinearRegression model:

# In[11]:


from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

# LinearRegression expects a 2-d array (with features such as Spending in columns).
# We can reshape the 1-d array of x values into a 2-d matrix, in this case with one column.
x = x.values.reshape(-1, 1)

linreg.fit(x, y)

# predict() also expects a 2-d array, so we wrap the new Spending value accordingly
y_new = linreg.predict(np.array([[x_new]]))
print("Spending =", x_new, "==> Predicted Sales =", y_new)


# #### Let's modify the train function to keep track of a few more things and also to create incremental plots of the function $f(x) = wx + b$.

# In[12]:


def train(x, y, w, b, alpha, epochs):
    image_counter = 2
    w_array = np.array([])
    b_array = np.array([])

    for e in range(epochs):
        w, b = update_w_and_b(x, y, w, b, alpha)
        w_array = np.append(w_array, [w])
        b_array = np.append(b_array, [b])

        # log the progress
        if e % 1000 == 0:
            print("epoch: ", str(e), "loss: " + str(loss(x, y, w, b)))
            print("w, b: ", w, b)

            plt.figure(image_counter)
            axes = plt.gca()
            axes.set_xlim([0, 50])
            axes.set_ylim([0, 30])
            plt.scatter(x, y)
            X_plot = np.linspace(0, 50, 50)
            plt.plot(X_plot, X_plot * w + b)
            plt.show()

            # fig1 = plt.gcf()
            # fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
            # fig1.savefig('Illustrations/gradient_descent-' + str(image_counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
            # fig1.savefig('Illustrations/gradient_descent-' + str(image_counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

            image_counter += 1

    return w_array, b_array


# In[13]:


x = salesDF['Spending']
y = salesDF['Sales']

get_ipython().run_line_magic('time', 'w_array, b_array = train(x, y, 0.0, 0.0, 0.001, 15000)')


# In[14]:


epochs = 15000

plt.plot(range(epochs), w_array)
plt.xlabel("Epoch")
plt.ylabel("w")
plt.title("Changes in w across epochs")
plt.show()

plt.plot(range(epochs), b_array)
plt.xlabel("Epoch")
plt.ylabel("b")
plt.title("Changes in b across epochs")
plt.show()
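# #### As an extra (optional) diagnostic, we can also plot the training loss across epochs using the stored `w_array` and `b_array`. The vectorized computation below is a sketch that assumes `x` and `y` can be converted to NumPy arrays; it computes the same mean squared error as the `loss` function, for every stored pair of parameter values at once.

# In[ ]:


# Illustrative sketch (not in the original notebook): loss curve across epochs
x_arr = np.asarray(x, dtype=float)
y_arr = np.asarray(y, dtype=float)

# residuals has one row per epoch and one column per training instance
residuals = y_arr[None, :] - (w_array[:, None] * x_arr[None, :] + b_array[:, None])
loss_array = (residuals ** 2).mean(axis=1)

plt.plot(range(epochs), loss_array)
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.title("Changes in loss across epochs")
plt.show()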
# #### Gradient descent can be computationally expensive because in each epoch all training instances are used to update the parameters. An alternative is to use only a small batch of the training data in each epoch to update the parameters. This is called Stochastic Gradient Descent (SGD), and it has many variants depending on how the training instances are selected in each epoch. In the basic SGD approach, only one instance is used in each epoch (but we may still need to make several passes through the data overall).

# In[15]:


def update_sgd_w_and_b(spendings, sales, w, b, alpha, i):
    # gradient contribution of a single training instance i
    dr_dw = -2 * spendings[i] * (sales[i] - (w * spendings[i] + b))
    dr_db = -2 * (sales[i] - (w * spendings[i] + b))

    # update w and b
    w = w - dr_dw * alpha
    b = b - dr_db * alpha

    return w, b


# In[16]:


def train_sgd(x, y, w, b, alpha, epochs):
    for e in range(epochs):
        i = e % len(x)
        w, b = update_sgd_w_and_b(x, y, w, b, alpha, i)

        # log the progress
        if e % 5000 == 0:
            print("epoch: ", str(e), "loss: " + str(loss(x, y, w, b)))
            print("w, b: ", w, b)
            print("\n")

    return w, b


# In[17]:


x = salesDF['Spending']
y = salesDF['Sales']

# In SGD, we may need to use a much smaller learning rate, since it is applied many more times than in GD.
# We also may need more epochs to compensate for not using all training instances in each epoch.
get_ipython().run_line_magic('time', 'w, b = train_sgd(x, y, 0.0, 0.0, 0.00005, 150000)')

print("Final Values: w = ", w, "b = ", b)
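# #### As a final illustration (not part of the original example), here is a sketch of the mini-batch variant mentioned above: in each epoch, a small random batch of instances is used to update the parameters, with the gradients averaged over the batch. The function name `train_minibatch_sgd`, the batch size of 16, and the example epoch count are arbitrary illustrative choices.

# In[ ]:


# Illustrative sketch (not in the original notebook): mini-batch SGD
def train_minibatch_sgd(x, y, w, b, alpha, epochs, batch_size=16):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    N = len(x)
    rng = np.random.default_rng(0)  # fixed seed so the run is reproducible

    for e in range(epochs):
        # sample a random mini-batch of indices without replacement
        batch = rng.choice(N, size=min(batch_size, N), replace=False)
        xb, yb = x[batch], y[batch]

        # average the gradient contributions over the mini-batch
        residuals = yb - (w * xb + b)
        dr_dw = np.mean(-2 * xb * residuals)
        dr_db = np.mean(-2 * residuals)

        w = w - dr_dw * alpha
        b = b - dr_db * alpha

        # log the progress
        if e % 5000 == 0:
            print("epoch: ", str(e), "loss: " + str(loss(x, y, w, b)))

    return w, b

# For example:
# w_mb, b_mb = train_minibatch_sgd(x, y, 0.0, 0.0, 0.001, 30000)
# print("Final Values: w = ", w_mb, "b = ", b_mb)


# In[ ]: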