Credits -- This IPython Notebook is based on the excellent CS231n tutorial and accompanying IPython Notebook from Stanford University.
# Import the required modules
%pylab inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
Populating the interactive namespace from numpy and matplotlib
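Note: `%pylab inline` pulls NumPy and matplotlib into the interactive namespace (as the message above shows). If you want to run this code outside IPython, the explicit equivalent is roughly the following (a sketch, not part of the original notebook):
# Explicit imports equivalent to the %pylab magic (assumed setup)
import numpy as np
import matplotlib.pyplot as plt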
# Generate the training set
np.random.seed(0)
N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in range(K):
    ix = range(N*j, N*(j+1))
    r = np.linspace(0.0, 1, N) # radius
    t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.2 # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])
[Figure: scatter plot of the three-class spiral training data, colored by label]
# Initialize the weights and biases
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1, K))
# Number of examples
num_examples = X.shape[0]
# alpha is the learning rate
alpha = 1e-0
# lambda is the regularization coefficient.
# NOTE: since lambda is a reserved word in Python,
# we use the name lambdaa instead.
lambdaa = 1e-3
The class scores are a linear function of the input:

$f = XW + b$

where $X$ has shape (num_examples, $D$) and $W$ has shape ($D$, $K$), so the scores $f$ have shape (num_examples, $K$). The softmax function converts the scores into normalized probabilities, and the loss for example $i$ is the negative log probability assigned to its correct class $y_i$:

$p_k = \frac{e^{f_k}}{ \sum_j e^{f_j} } \hspace{1in} L_i =-\log\left(p_{y_i}\right)$

Differentiating this loss with respect to the scores gives a remarkably simple gradient:
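To make the formulas concrete, here is a tiny numerical check on one hypothetical score vector (illustrative values only, not from the training set):
# Sanity check of the softmax + cross-entropy formulas (hypothetical scores)
f = np.array([3.0, 1.0, 0.2]) # unnormalized class scores for one example
p = np.exp(f)/np.sum(np.exp(f)) # softmax: p is approx [0.836, 0.113, 0.051]
print(-np.log(p[0])) # loss if the true class is 0: approx 0.18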
$\frac{\partial L_i }{ \partial f_k } = p_k - \mathbb{1}(y_i = k)$
Equivalently: $\frac{\partial L_i }{ \partial f_k } = p_k$ when $y_i \neq k$, and $p_k - 1$ when $y_i = k$.
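This gradient expression is easy to verify numerically on a single example (a sketch with hypothetical values; `y_i` and `L` are names introduced here, not from the original):
# Numerical check of dL_i/df_k = p_k - 1(y_i = k) on one example
f = np.array([1.0, -2.0, 0.5]) # hypothetical scores for one example
y_i = 0                        # assumed true class
def L(f):
    p = np.exp(f)/np.sum(np.exp(f))
    return -np.log(p[y_i])
p = np.exp(f)/np.sum(np.exp(f))
analytic = p.copy()
analytic[y_i] -= 1             # p_k - 1(y_i = k)
eps = 1e-5
numeric = np.array([(L(f + eps*np.eye(3)[k]) - L(f - eps*np.eye(3)[k]))/(2*eps)
                    for k in range(3)])
print(analytic, numeric)       # the two should agree to ~1e-9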
for i in range(300):
    # Calculate the scores
    scores = np.dot(X, W) + b
    # Calculate the normalized probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores/np.sum(exp_scores, axis=1, keepdims=True)
    # Negative log probabilities (the correct-class entries give the loss)
    log_probs = -np.log(probs)
    # Calculate the data loss
    data_loss = np.sum(log_probs[range(num_examples), y])/num_examples
    # Calculate the regularization loss
    reg_loss = .5*lambdaa*np.sum(W*W)
    # Get the total loss
    loss = data_loss + reg_loss
    # Print the total loss every 30 iterations
    if i % 30 == 0:
        print("Loss at iteration {} is {}".format(i, loss))
    # Calculate the gradients for backpropagation
    dscores = probs
    dscores[range(num_examples), y] -= 1
    dscores /= num_examples
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
    # Regularize the weight gradient
    dW += lambdaa * W
    # No need to regularize the bias gradient
    # Update the weights and biases
    W -= alpha*dW
    b -= alpha*db
Loss at iteration 0 is 1.09691944027
Loss at iteration 30 is 0.822352268534
Loss at iteration 60 is 0.794682543764
Loss at iteration 90 is 0.78872623605
Loss at iteration 120 is 0.787049195401
Loss at iteration 150 is 0.786513941749
Loss at iteration 180 is 0.786331349242
Loss at iteration 210 is 0.786266671779
Loss at iteration 240 is 0.786243250465
Loss at iteration 270 is 0.786234655744
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print("training accuracy: {:.2f}".format(np.mean(predicted_class == y)))
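Beyond the single accuracy number, a per-class confusion matrix is a quick extra sanity check (a sketch; `conf_mat` is a name introduced here, not from the original):
# Confusion matrix: rows are true classes, columns are predicted classes
conf_mat = np.zeros((K, K), dtype=int)
for true_c, pred_c in zip(y, predicted_class):
    conf_mat[true_c, pred_c] += 1
print(conf_mat) # diagonal entries are the correctly classified counts per class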
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_linear.png')
[Figure: learned linear decision boundaries overlaid on the spiral data]