# imports for the tutorial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
The main problem in optimization is how to search for the values of the decision variables that minimize the cost/objective function.
Most search schemes are based on the assumption of a unimodal surface; the optimum found in such cases is called a local optimum.
The global optimum is the best among all local optima.
# synthesize a small 1-D regression dataset: y = 5x + Gaussian noise
N = 20
x = np.linspace(1, 20, N)
y = 5 * x + np.random.randn(N)

# Analytical linear-least-squares fit: find the scalar w minimizing
# sum((w*x - y)^2).  Setting the derivative to zero gives
# w = sum(x*y) / sum(x^2).
numerator = np.sum(y * x)
denominator = np.sum(np.square(x))
w = numerator / denominator
print("best w:", w)
best w: 4.973943975346493
def plot_lls_sol(x, y, w, title=""):
    """Scatter the data (x, y) and overlay the fitted line w*x in red.

    Parameters:
        x, y: 1-D arrays of inputs and targets.
        w: scalar slope of the fitted line.
        title: optional figure title.
    """
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x, y, label="y")
    ax.plot(x, w * x, color='r', label="wx")
    ax.legend()
    ax.grid()
    ax.set_xlabel("x")
    ax.set_title(title)
# let's plot
# overlay the analytically fitted line w*x on the raw (x, y) scatter
plot_lls_sol(x, y, w, "Analytical Solution for min(wx-y)^2")
# lls - gradient descent solution
N = 20               # number of samples
num_iterations = 20  # fixed iteration budget
alpha_k = 0.005      # constant step size

# minimize (wx - y)^2 by gradient descent, starting from w = 0
w = 0
for i in range(num_iterations):
    print("iter:", i, " w = ", w)
    # d/dw of mean((w*x - y)^2) = mean(2*w*x^2 - 2*x*y)
    grad = np.sum(2 * w * np.square(x) - 2 * x * y) / N
    w = w - alpha_k * grad
print("best w:", w)
iter: 0 w = 0 iter: 1 w = 7.1376096046222175 iter: 2 w = 4.032749426611554 iter: 3 w = 5.383363604046192 iter: 4 w = 4.795846436862124 iter: 5 w = 5.051416404587194 iter: 6 w = 4.940243468626789 iter: 7 w = 4.988603695769565 iter: 8 w = 4.967566996962457 iter: 9 w = 4.9767179609435495 iter: 10 w = 4.972737291611774 iter: 11 w = 4.974468882771096 iter: 12 w = 4.973715640616791 iter: 13 w = 4.974043300953914 iter: 14 w = 4.973900768707265 iter: 15 w = 4.973962770234557 iter: 16 w = 4.973935799570186 iter: 17 w = 4.973947531809187 iter: 18 w = 4.973942428285222 iter: 19 w = 4.973944648318146 best w: 4.973943682603824
# plot the gradient-descent fit; it should closely match the analytical solution
plot_lls_sol(x, y, w, "Gradient Descent Solution for min(wx-y)^2")
def batch_generator(x, y, batch_size, shuffle=True):
    """
    Yield successive mini-batches from the 1-D dataset (x, y).

    Parameters
    ----------
    x, y : 1-D np.ndarray of equal length.
    batch_size : int
        Number of samples per batch.
    shuffle : bool
        If True, permute the samples first with a fixed seed
        (RandomState(0)) so the shuffle is reproducible across calls.

    Yields
    ------
    (batch_x, batch_y) : pair of np.ndarray of length ``batch_size``;
        the final batch may be shorter when ``len(x) % batch_size != 0``.
    """
    N = len(x)
    if shuffle:
        # reproducible permutation of the sample order
        perm = np.random.RandomState(0).permutation(np.arange(N))
        x = x[perm]
        y = y[perm]
    # slice out fresh copies so callers cannot mutate the source arrays
    for start in range(0, N, batch_size):
        end = start + batch_size
        yield np.array(x[start:end]), np.array(y[start:end])
# mini-batch gradient descent
batch_size = 5
num_batches = N // batch_size
print("total batches:", num_batches)

num_iterations = 20
alpha_k = 0.001
batch_gen = batch_generator(x, y, batch_size, shuffle=True)

# minimize (wx - y)^2, one gradient step per mini-batch, starting from w = 0
w = 0
for i in range(num_iterations):
    for batch_i, (batch_x, batch_y) in enumerate(batch_gen):
        if batch_i % 5 == 0:
            print("iter:", i, "batch:", batch_i, " w = ", w)
        # gradient of the squared error on this batch only
        gradient = np.sum(2 * w * np.square(batch_x) - 2 * batch_x * batch_y) / len(batch_x)
        w = w - alpha_k * gradient
    # the generator is exhausted after one epoch -- build a fresh one
    batch_gen = batch_generator(x, y, batch_size, shuffle=True)
print("best w:", w)
total batches: 4 iter: 0 batch: 0 w = 0 iter: 1 batch: 0 w = 3.714921791540526 iter: 2 batch: 0 w = 4.660336847464463 iter: 3 batch: 0 w = 4.900936698046245 iter: 4 batch: 0 w = 4.962167252538953 iter: 5 batch: 0 w = 4.9777498923220564 iter: 6 batch: 0 w = 4.981715537654223 iter: 7 batch: 0 w = 4.982724759655079 iter: 8 batch: 0 w = 4.982981597814244 iter: 9 batch: 0 w = 4.983046960876035 iter: 10 batch: 0 w = 4.983063595202728 iter: 11 batch: 0 w = 4.983067828493167 iter: 12 batch: 0 w = 4.983068905828502 iter: 13 batch: 0 w = 4.983069180000908 iter: 14 batch: 0 w = 4.983069249775384 iter: 15 batch: 0 w = 4.983069267532377 iter: 16 batch: 0 w = 4.9830692720513765 iter: 17 batch: 0 w = 4.983069273201423 iter: 18 batch: 0 w = 4.983069273494099 iter: 19 batch: 0 w = 4.983069273568582 best w: 4.983069273587538
# let's plot
# visualize the mini-batch GD fit against the data
plot_lls_sol(x, y, w, "Mini-Batch Gradient Descent Solution for min(wx-y)^2")
Method | Accuracy | Update Speed | Memory Usage | Online Learning |
---|---|---|---|---|
Batch Gradient Descent | Good | Slow | High | No |
Stochastic Gradient Descent | Good (with softening) | Fast | Low | Yes |
Mini-Batch Gradient Descent | Good | Medium | Medium | Yes (depends on the MB size) |
* Illustration: <img src="./assets/tut_06_deriv.jpg" style="height:200px">
Find the gradient of $f(x) = \sqrt{x^TQx}$ ($Q$ is positive definite)
$$X^TX \in \mathbb{R}^{L \times L} $$
# let's load the cancer dataset (breast-cancer data; see the column names below)
dataset = pd.read_csv('./datasets/cancer_dataset.csv')
# print the number of rows in the data set
# NOTE(review): computed but never actually printed or used below
number_of_rows = len(dataset)
# reminder, the data looks like this
# show 10 randomly sampled rows
dataset.sample(10)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
187 | 874373 | B | 11.71 | 17.19 | 74.68 | 420.3 | 0.09774 | 0.06141 | 0.03809 | 0.03239 | ... | 21.39 | 84.42 | 521.5 | 0.1323 | 0.1040 | 0.1521 | 0.10990 | 0.2572 | 0.07097 | NaN |
323 | 895100 | M | 20.34 | 21.51 | 135.90 | 1264.0 | 0.11700 | 0.18750 | 0.25650 | 0.15040 | ... | 31.86 | 171.10 | 1938.0 | 0.1592 | 0.4492 | 0.5344 | 0.26850 | 0.5558 | 0.10240 | NaN |
355 | 9010258 | B | 12.56 | 19.07 | 81.92 | 485.8 | 0.08760 | 0.10380 | 0.10300 | 0.04391 | ... | 22.43 | 89.02 | 547.4 | 0.1096 | 0.2002 | 0.2388 | 0.09265 | 0.2121 | 0.07188 | NaN |
82 | 8611555 | M | 25.22 | 24.91 | 171.50 | 1878.0 | 0.10630 | 0.26650 | 0.33390 | 0.18450 | ... | 33.62 | 211.70 | 2562.0 | 0.1573 | 0.6076 | 0.6476 | 0.28670 | 0.2355 | 0.10510 | NaN |
329 | 895633 | M | 16.26 | 21.88 | 107.50 | 826.8 | 0.11650 | 0.12830 | 0.17990 | 0.07981 | ... | 25.21 | 113.70 | 975.2 | 0.1426 | 0.2116 | 0.3344 | 0.10470 | 0.2736 | 0.07953 | NaN |
80 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | ... | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
530 | 91858 | B | 11.75 | 17.56 | 75.89 | 422.9 | 0.10730 | 0.09713 | 0.05282 | 0.04440 | ... | 27.98 | 88.52 | 552.3 | 0.1349 | 0.1854 | 0.1366 | 0.10100 | 0.2478 | 0.07757 | NaN |
486 | 913102 | B | 14.64 | 16.85 | 94.21 | 666.0 | 0.08641 | 0.06698 | 0.05192 | 0.02791 | ... | 25.44 | 106.00 | 831.0 | 0.1142 | 0.2070 | 0.2437 | 0.07828 | 0.2455 | 0.06596 | NaN |
9 | 84501001 | M | 12.46 | 24.04 | 83.97 | 475.9 | 0.11860 | 0.23960 | 0.22730 | 0.08543 | ... | 40.68 | 97.65 | 711.4 | 0.1853 | 1.0580 | 1.1050 | 0.22100 | 0.4366 | 0.20750 | NaN |
48 | 857155 | B | 12.05 | 14.63 | 78.04 | 449.3 | 0.10310 | 0.09092 | 0.06592 | 0.02749 | ... | 20.70 | 89.88 | 582.6 | 0.1494 | 0.2156 | 0.3050 | 0.06548 | 0.2747 | 0.08301 | NaN |
10 rows × 33 columns
def plot_3d(x, y, z):
# 3-D scatter plot of (x, y, z); axis labels are hard-coded for the
# cancer dataset (radius / area / perimeter means).
# NOTE: '%matplotlib notebook' is an IPython magic -- this cell runs only
# inside a Jupyter notebook, not as plain Python.
%matplotlib notebook
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z)
ax.set_xlabel('Radius Mean')
ax.set_ylabel('Area Mean')
ax.set_zlabel('Perimeter Mean')
ax.set_title("Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean")
# let's plot X = [radius, area], y = perimeter
# each column is extracted as an (N, 1) array
xs = dataset[['radius_mean']].values
ys = dataset[['area_mean']].values
zs = dataset[['perimeter_mean']].values
plot_3d(xs, ys, zs)
def plot_3d_lls(x, y, z, lls_sol, title=""):
# 3-D scatter of the true targets z ('Y') and the LLS predictions
# lls_sol ('Xw') over the two features (x, y).
# plot
# NOTE: '%matplotlib notebook' is an IPython magic -- notebook-only.
%matplotlib notebook
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, label='Y')
ax.scatter(x, y, lls_sol, label='Xw')
ax.legend()
ax.set_xlabel('Radius Mean')
ax.set_ylabel('Area Mean')
ax.set_zlabel('Perimeter Mean')
ax.set_title(title)
# multivariate lls - analytical solution
X = dataset[['radius_mean', 'area_mean']].values
Y = dataset[['perimeter_mean']].values
xs = dataset[['radius_mean']].values
ys = dataset[['area_mean']].values
zs = dataset[['perimeter_mean']].values

# Normal equations: w = (X^T X)^{-1} X^T Y.  Solve the linear system
# directly instead of forming the explicit inverse -- np.linalg.solve is
# cheaper and numerically more stable than np.linalg.inv when X^T X is
# ill-conditioned.
w = np.linalg.solve(X.T @ X, X.T @ Y)
lls_sol = X @ w
print("w:")
print(w)
w: [[6.19721311] [0.00676913]]
# plot
# compare the analytical LLS plane's predictions to the true perimeter values
plot_3d_lls(xs, ys, zs, lls_sol, "Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS Analyitical")
If $L = 1000$, we would need to invert a $1000 \times 1000$ matrix, which would take about $10^9$ operations!
# multivariate lls - gradient descent solution
X = dataset[['radius_mean', 'area_mean']].values
Y = dataset[['perimeter_mean']].values

# Standardize features and target (zero mean, unit variance) so one
# step size works for both coordinates.
X = (X - X.mean(axis=0, keepdims=True)) / X.std(axis=0, keepdims=True)
Y = (Y - Y.mean(axis=0, keepdims=True)) / Y.std(axis=0, keepdims=True)

num_iterations = 20
alpha_k = 0.0001
L = X.shape[1]

# start the descent from the zero vector
w = np.zeros((L, 1))
for i in range(num_iterations):
    print("iter:", i, " w = ")
    print(w)
    # gradient of ||Xw - Y||^2 with respect to w
    gradient = 2 * X.T @ X @ w - 2 * X.T @ Y
    w = w - alpha_k * gradient
lls_sol = X @ w
print("w:")
print(w)
# plot
# compare GD predictions (on standardized data) to the standardized targets
plot_3d_lls(X[:,0], X[:, 1], Y, lls_sol, "Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS GD")
def batch_generator(x, y, batch_size, shuffle=True):
    """
    Yield successive mini-batches from the 2-D dataset (x, y).

    Parameters
    ----------
    x : np.ndarray of shape (N, L) -- feature matrix.
    y : np.ndarray of shape (N, 1) -- targets.
    batch_size : int
        Number of samples per batch.
    shuffle : bool
        If True, permute the rows first with a fixed seed
        (RandomState(0)) so the shuffle is reproducible across calls.

    Yields
    ------
    (batch_x, batch_y) : arrays of shape (batch_size, L) and
        (batch_size, 1); the final batch may have fewer rows when
        ``N % batch_size != 0``.
    """
    N, L = x.shape
    if shuffle:
        # reproducible permutation of the row order
        perm = np.random.RandomState(0).permutation(np.arange(N))
        x = x[perm, :]
        y = y[perm, :]
    # slice out fresh copies so callers cannot mutate the source arrays
    for start in range(0, N, batch_size):
        end = start + batch_size
        yield np.array(x[start:end]).reshape(-1, L), np.array(y[start:end]).reshape(-1, 1)
# multivariate mini-batch gradient descent
X = dataset[['radius_mean', 'area_mean']].values
Y = dataset[['perimeter_mean']].values
# Scaling
# standardize features and targets (zero mean, unit variance)
X = (X - X.mean(axis=0, keepdims=True)) / X.std(axis=0, keepdims=True)
Y = (Y - Y.mean(axis=0, keepdims=True)) / Y.std(axis=0, keepdims=True)
N = X.shape[0]
batch_size = 10
num_batches = N // batch_size
print("total batches:", num_batches)
total batches: 56
num_iterations = 10
alpha_k = 0.001
batch_gen = batch_generator(X, Y, batch_size, shuffle=True)

# Number of features.  Define it locally instead of relying on the stale
# `L` left over from the full-batch GD cell above -- running cells out of
# order would otherwise raise NameError (or silently use a wrong value).
L = X.shape[1]
# initialize w
w = np.zeros((L, 1))
for i in range(num_iterations):
    for batch_i, (batch_x, batch_y) in enumerate(batch_gen):
        if batch_i % 50 == 0:
            print("iter:", i, "batch:", batch_i, " w = ")
            print(w)
        # gradient of ||batch_x @ w - batch_y||^2 on this batch
        gradient = 2 * batch_x.T @ batch_x @ w - 2 * batch_x.T @ batch_y
        w = w - alpha_k * gradient
    # the generator is exhausted after one epoch -- build a fresh one
    batch_gen = batch_generator(X, Y, batch_size, shuffle=True)
lls_sol = X @ w
# plot
# visualize mini-batch GD predictions against the standardized targets
plot_3d_lls(X[:,0], X[:, 1], Y, lls_sol, "Breast Cancer - Radius Mean vs. Area Mean vs. Perimeter Mean - LLS Mini-Batch GD")
print("w:")
print(w)
w: [[0.55894282] [0.42792729]]
Maximize $H(P) = -\sum_{i=1}^d p_i \log p_i$ subject to $\sum_{i=1}^d p_i = 1$