#!/usr/bin/env python
# coding: utf-8

# # 10 - Perceptron Training
#
# by [Fabio A. González](http://dis.unal.edu.co/~fgonza/), Universidad Nacional de Colombia
#
# version 1.0, June 2018
#
# ## Part of the class [Applied Deep Learning](https://github.com/albahnsen/AppliedDeepLearningClass)
#
# This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US).

# In[41]:


import numpy as np
import pylab as pl
# sklearn.datasets.samples_generator was removed in recent scikit-learn;
# make_blobs is importable directly from sklearn.datasets
from sklearn.datasets import make_blobs

get_ipython().run_line_magic('matplotlib', 'inline')


def plot_data(X, y):
    # Scatter plot of the points in X, colored by their class label in y
    y_unique = np.unique(y)
    colors = pl.cm.rainbow(np.linspace(0.0, 1.0, y_unique.size))
    for this_y, color in zip(y_unique, colors):
        this_X = X[y == this_y]
        pl.scatter(this_X[:, 0], this_X[:, 1], c=color,
                   alpha=0.5, edgecolor='k', label="Class %s" % this_y)
    pl.legend(loc="best")
    pl.title("Data")


def plot_decision_region(X, pred_fun):
    # Evaluate pred_fun on a 30x30 grid covering the data (plus a 5% margin)
    # and draw the resulting decision region as filled contours
    min_x, max_x = np.min(X[:, 0]), np.max(X[:, 0])
    min_y, max_y = np.min(X[:, 1]), np.max(X[:, 1])
    range_x = max_x - min_x
    range_y = max_y - min_y
    min_x -= range_x * 0.05
    max_x += range_x * 0.05
    min_y -= range_y * 0.05
    max_y += range_y * 0.05
    x_vals = np.linspace(min_x, max_x, 30)
    y_vals = np.linspace(min_y, max_y, 30)
    XX, YY = np.meshgrid(x_vals, y_vals)
    grid_r, grid_c = XX.shape
    ZZ = np.zeros((grid_r, grid_c))
    for i in range(grid_r):
        for j in range(grid_c):
            ZZ[i, j] = pred_fun(XX[i, j], YY[i, j])
    pl.contourf(XX, YY, ZZ, 30, cmap=pl.cm.coolwarm, vmin=-1, vmax=2)
    pl.colorbar()
    pl.xlabel("x")
    pl.ylabel("y")


# ## Two Class Classification Problem

# In[61]:


X, Y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=0)
pl.figure(figsize=(8, 6))
plot_data(X, Y)


# ## How to solve it?
#
# * We need to design a prediction function $f:\mathbb{R}^{2}\rightarrow\mathbb{R}$ such that:
#
# $$\textrm{Prediction}(x)=\begin{cases}
# C_{1} & \mbox{if }f(x)\ge \theta\\
# C_{2} & \mbox{if }f(x)<\theta
# \end{cases}$$
#
# * Here we will model $f$ as a logistic model with parameters $w$ and $w_0$:
#
# $$f_w(x) = P(C_1|x)= \sigma(wx+w_0)$$
#
# where
#
# $$\sigma(x) = \frac{1}{1+e^{-x}}$$

# ## Perceptron
#
# This model corresponds to a perceptron or logistic regression model:
#
# $$f_w(x) = P(C_1|x)= \sigma(wx+w_0)$$
#
# where
#
# $$\sigma(x) = \frac{1}{1+e^{-x}}$$

# In[43]:


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def predict(w, x):
    # w[0] is the bias term w_0; w[1:] holds the input weights
    a = np.dot(w[1:], x) + w[0]
    z = sigmoid(a)
    return z


# ## Learning as optimization
#
# * General optimization problem:
# $$\min_{f\in H}L(f,D)$$
# * Hypothesis space:
# $$H=\{f_w(x)=\sigma(wx+w_0) \mid w\in\mathbb{R}^n, w_0\in\mathbb{R}\}$$
# * Cross-entropy loss function:
# $$L(f,D)=\sum_{(x_{i},r_{i})\in D} -r_i\log(f(x_i)) -(1-r_i)\log(1 - f(x_i))$$
# * It corresponds to the negative log-likelihood of the probabilistic model represented by $f$ given the data $D$.
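# Before minimizing the loss it is worth seeing what the model computes. The
# cell below visualizes the decision region induced by `predict` for an
# arbitrary, hand-picked weight vector; the values in `w_demo` are illustrative
# only and are not learned from the data.

# In[ ]:


# Arbitrary, untrained weights: w_demo[0] is the bias, w_demo[1:] the input weights
w_demo = np.array([1.0, -1.0, 1.0])
pl.figure(figsize=(8, 6))
plot_decision_region(X, lambda x1, x2: predict(w_demo, np.array([x1, x2])))
plot_data(X, Y)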
# ## Calculating the cross entropy loss

# In[44]:


def xentropy_loss(w, x, y):
    f = predict(w, x)
    return -y * np.log(f) - (1 - y) * np.log(1 - f)


def batch_loss(loss_fun, w, X, Y):
    # Sum of the per-example loss over the whole dataset
    n = X.shape[0]
    tot_loss = 0
    for i in range(n):
        tot_loss += loss_fun(w, X[i], Y[i])
    return tot_loss


# In[45]:


def plot_loss(loss):
    # Evaluate loss(w1, w2) on a 30x30 grid of weight values and draw the
    # loss surface as filled contours (the bias is fixed by the caller)
    w1_vals = np.linspace(-2, 2, 30)
    w2_vals = np.linspace(-2, 2, 30)
    W1, W2 = np.meshgrid(w1_vals, w2_vals)
    grid_r, grid_c = W1.shape
    ZZ = np.zeros((grid_r, grid_c))
    for i in range(grid_r):
        for j in range(grid_c):
            ZZ[i, j] = loss(W1[i, j], W2[i, j])
    pl.contourf(W1, W2, ZZ, 30, cmap=pl.cm.jet)
    pl.colorbar()
    pl.xlabel("w1")
    pl.ylabel("w2")


def bloss_xe(w1, w2):
    # Batch cross-entropy loss as a function of (w1, w2), with the bias fixed to 1
    w = np.array([1, w1, w2])
    return batch_loss(xentropy_loss, w, X, Y)


# In[46]:


plot_loss(bloss_xe)


# ## Other loss functions
#
# * There are several alternative loss functions.
#
# * $L_1$ loss:
#
# $$ L_1(f, D) =\sum_{(x_{i},r_{i})\in D} |r_i - f(x_i)| $$
#
# * $L_2$ loss:
#
# $$ L_2(f, D) =\sum_{(x_{i},r_{i})\in D} \frac{1}{2}(r_i - f(x_i))^2 $$

# In[47]:


def l1_loss(w, x, y):
    return np.abs(y - predict(w, x))


def loss1(w1, w2):
    w = np.array([1, w1, w2])
    return batch_loss(l1_loss, w, X, Y)


def l2_loss(w, x, y):
    # The 1/2 factor is conventional: it simplifies the gradient
    return (y - predict(w, x)) ** 2 / 2


def loss2(w1, w2):
    w = np.array([1, w1, w2])
    return batch_loss(l2_loss, w, X, Y)


# In[48]:


pl.figure(figsize=(15, 5))
pl.subplot(1, 3, 1); plot_loss(bloss_xe); pl.title("XE loss")
pl.subplot(1, 3, 2); pl.title("L1 loss"); plot_loss(loss1)
pl.subplot(1, 3, 3); pl.title("L2 loss"); plot_loss(loss2)


# ## How to solve the learning problem?
#
# * There are different optimization strategies:
#     * Linear optimization
#     * Convex optimization
#     * Non-linear optimization
#     * Combinatorial optimization
# * There is no single optimization strategy that works for every problem (the "no free lunch" theorem).

# ## How to solve the learning problem? (cont.)
#
# * Trade-offs:
#     * Global optimum guarantee
#     * Simplicity of the method
#     * Easy parameter tuning
#     * Scalability
#     * Potential for parallelization
# * In machine learning, preferences change over time.
# * Nowadays, scalable and easily parallelizable strategies are preferred, even at the expense of guaranteed optimality.

# ## Gradient descent
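#
# The loss can be minimized by repeatedly stepping in the direction of the
# negative gradient. For $f_w(x)=\sigma(wx+w_0)$ the gradient of the
# cross-entropy loss with respect to $(w_0, w)$ works out to
# $\sum_i (f_w(x_i)-r_i)\,(1, x_i)$. The cell below is a minimal sketch of
# batch gradient descent using this gradient; the helper name `gd_train`, the
# learning rate `alpha`, and the iteration count are illustrative choices.

# In[ ]:


def gd_train(X, Y, alpha=0.1, num_iters=200):
    n, d = X.shape
    w = np.zeros(d + 1)                      # w[0] is the bias w_0
    losses = []
    for it in range(num_iters):
        grad = np.zeros(d + 1)
        for i in range(n):
            err = predict(w, X[i]) - Y[i]    # f_w(x_i) - r_i
            grad[0] += err                   # gradient w.r.t. the bias
            grad[1:] += err * X[i]           # gradient w.r.t. the weights
        w -= alpha * grad / n                # step along the average gradient
        losses.append(batch_loss(xentropy_loss, w, X, Y))
    return w, losses


w_gd, losses = gd_train(X, Y)

# Training curve and the decision region of the learned model
pl.figure(figsize=(15, 5))
pl.subplot(1, 2, 1)
pl.plot(losses)
pl.xlabel("iteration")
pl.ylabel("cross-entropy loss")
pl.subplot(1, 2, 2)
plot_decision_region(X, lambda x1, x2: predict(w_gd, np.array([x1, x2])))
plot_data(X, Y)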