Very helpful tutorials can be found at the PyTorch tutorials site.

In [1]:
import torch
import numpy as np
In [2]:
torch.__version__
Out[2]:
'1.4.0'
In [3]:
import numpy as np
import torch

print(torch.__version__)
import time

import matplotlib.pyplot as plt
1.4.0

torch.tensor

The torch package contains classes and functions that are very similar to numpy's. It also provides functions to convert data back and forth between numpy arrays and torch tensors.
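For example, many numpy calls have direct torch counterparts. A small sketch (not one of the original cells, variable names are hypothetical):

# Sketch: numpy-style calls and their torch counterparts.
np_a = np.arange(6).reshape(2, 3).astype(np.float32)
t_a = torch.arange(6).reshape(2, 3).float()
print(np_a.mean(axis=0))      # per-column means as a numpy array
print(t_a.mean(dim=0))        # the same values as a torch tensor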

In [4]:
data = [[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]
data
Out[4]:
[[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]
In [5]:
a = np.array(data)
a
Out[5]:
array([[0.1, 0.2, 0.3],
       [1.1, 1.2, 1.3],
       [2.1, 2.2, 2.3]])
In [6]:
type(a)
Out[6]:
numpy.ndarray
In [7]:
 a.dtype
Out[7]:
dtype('float64')
In [8]:
data
Out[8]:
[[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]
In [9]:
b = torch.tensor(data)
b
Out[9]:
tensor([[0.1000, 0.2000, 0.3000],
        [1.1000, 1.2000, 1.3000],
        [2.1000, 2.2000, 2.3000]])
In [10]:
type(b), b.dtype
Out[10]:
(torch.Tensor, torch.float32)
In [11]:
a
Out[11]:
array([[0.1, 0.2, 0.3],
       [1.1, 1.2, 1.3],
       [2.1, 2.2, 2.3]])
In [12]:
c = torch.from_numpy(a.copy())
c
Out[12]:
tensor([[0.1000, 0.2000, 0.3000],
        [1.1000, 1.2000, 1.3000],
        [2.1000, 2.2000, 2.3000]], dtype=torch.float64)
In [13]:
a[0,0]=42
a
Out[13]:
array([[42. ,  0.2,  0.3],
       [ 1.1,  1.2,  1.3],
       [ 2.1,  2.2,  2.3]])
In [14]:
c
Out[14]:
tensor([[0.1000, 0.2000, 0.3000],
        [1.1000, 1.2000, 1.3000],
        [2.1000, 2.2000, 2.3000]], dtype=torch.float64)
In [15]:
type(c), c.dtype
Out[15]:
(torch.Tensor, torch.float64)
In [16]:
d = torch.as_tensor(a)
d
Out[16]:
tensor([[42.0000,  0.2000,  0.3000],
        [ 1.1000,  1.2000,  1.3000],
        [ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [17]:
type(d), d.dtype
Out[17]:
(torch.Tensor, torch.float64)
In [18]:
e = d.numpy()
e
Out[18]:
array([[42. ,  0.2,  0.3],
       [ 1.1,  1.2,  1.3],
       [ 2.1,  2.2,  2.3]])
In [19]:
type(e), e.dtype
Out[19]:
(numpy.ndarray, dtype('float64'))

torch.tensor copies the data. torch.from_numpy and torch.as_tensor do not copy the data; they share memory with the numpy array (torch.as_tensor copies only when a dtype or device conversion is needed).
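As a minimal check of this (a sketch with hypothetical names, not one of the original cells), modify the numpy array in place; only the tensors built with torch.from_numpy and torch.as_tensor reflect the change.

# Sketch: memory sharing vs. copying.
arr = np.zeros(3)
copied = torch.tensor(arr)       # copies the data
shared_fn = torch.from_numpy(arr)  # shares memory with arr
shared_at = torch.as_tensor(arr)   # also shares memory here (same dtype and device, so no copy)

arr[0] = 99.0
print(copied)     # tensor([0., 0., 0.], dtype=torch.float64) -- unchanged
print(shared_fn)  # tensor([99., 0., 0.], dtype=torch.float64)
print(shared_at)  # tensor([99., 0., 0.], dtype=torch.float64)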

In [20]:
a
Out[20]:
array([[42. ,  0.2,  0.3],
       [ 1.1,  1.2,  1.3],
       [ 2.1,  2.2,  2.3]])
In [21]:
b = torch.from_numpy(a.copy())
b
Out[21]:
tensor([[42.0000,  0.2000,  0.3000],
        [ 1.1000,  1.2000,  1.3000],
        [ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [22]:
a[0,0] = 42.42
a
Out[22]:
array([[42.42,  0.2 ,  0.3 ],
       [ 1.1 ,  1.2 ,  1.3 ],
       [ 2.1 ,  2.2 ,  2.3 ]])
In [23]:
b
Out[23]:
tensor([[42.0000,  0.2000,  0.3000],
        [ 1.1000,  1.2000,  1.3000],
        [ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [24]:
b = torch.tensor(a)
b
Out[24]:
tensor([[42.4200,  0.2000,  0.3000],
        [ 1.1000,  1.2000,  1.3000],
        [ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [25]:
a[0,0] = 12345.0
a
Out[25]:
array([[1.2345e+04, 2.0000e-01, 3.0000e-01],
       [1.1000e+00, 1.2000e+00, 1.3000e+00],
       [2.1000e+00, 2.2000e+00, 2.3000e+00]])
In [26]:
b
Out[26]:
tensor([[42.4200,  0.2000,  0.3000],
        [ 1.1000,  1.2000,  1.3000],
        [ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)

We can even use @ for matrix multiplication as we do in numpy.

In [27]:
a = np.random.uniform(-0.1, 0.1, size=(10, 5))
b = np.random.uniform(-0.1, 0.1, size=(5, 20))
c = a @ b
c.shape
Out[27]:
(10, 20)
In [28]:
# torch.random.uniform?
In [29]:
(torch.rand(size=(10,5)) - 0.5) / 0.5
Out[29]:
tensor([[ 0.2311,  0.7379, -0.0668,  0.7922, -0.8873],
        [ 0.1337, -0.5429,  0.7502, -0.9066,  0.3532],
        [-0.6071, -0.4208, -0.4587,  0.2937,  0.9302],
        [ 0.7569, -0.3356, -0.0300,  0.8086, -0.9949],
        [-0.8975,  0.7496,  0.7958, -0.2087,  0.2589],
        [-0.4293,  0.0736,  0.5758,  0.7266,  0.3107],
        [-0.1806, -0.8293, -0.4742, -0.4344,  0.7010],
        [ 0.8711,  0.7319, -0.9603, -0.6414,  0.9676],
        [ 0.6352, -0.3494, -0.4513, -0.5610,  0.7582],
        [-0.8493,  0.0818, -0.9738, -0.3958, -0.4724]])
In [30]:
at = (torch.rand(size=(10, 5)) - 0.5) * 0.2
bt = (torch.rand(size=(5, 20)) - 0.5) * 0.2
ct = at @ bt
ct.shape
Out[30]:
torch.Size([10, 20])
In [31]:
a = np.random.uniform(-0.1, 0.1, size=(10, 5))
b = np.random.uniform(-0.1, 0.1, size=(5, 20))
a = torch.from_numpy(a)
b = torch.from_numpy(b)
c = a @ b
c.shape
Out[31]:
torch.Size([10, 20])

Automatic Gradients using autograd

What is the derivative of $\sin(x)$?

In [32]:
plt.figure(figsize=(10, 5))
x = np.linspace(-2*np.pi, 2*np.pi, 100)
y = np.sin(x)
dy = np.cos(x)
plt.plot(x, y)
plt.plot(x, dy)
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$',));
In [33]:
xt = torch.from_numpy(x)
xt
Out[33]:
tensor([-6.2832, -6.1563, -6.0293, -5.9024, -5.7755, -5.6485, -5.5216, -5.3947,
        -5.2677, -5.1408, -5.0139, -4.8869, -4.7600, -4.6331, -4.5061, -4.3792,
        -4.2523, -4.1253, -3.9984, -3.8715, -3.7445, -3.6176, -3.4907, -3.3637,
        -3.2368, -3.1099, -2.9829, -2.8560, -2.7291, -2.6021, -2.4752, -2.3483,
        -2.2213, -2.0944, -1.9675, -1.8405, -1.7136, -1.5867, -1.4597, -1.3328,
        -1.2059, -1.0789, -0.9520, -0.8251, -0.6981, -0.5712, -0.4443, -0.3173,
        -0.1904, -0.0635,  0.0635,  0.1904,  0.3173,  0.4443,  0.5712,  0.6981,
         0.8251,  0.9520,  1.0789,  1.2059,  1.3328,  1.4597,  1.5867,  1.7136,
         1.8405,  1.9675,  2.0944,  2.2213,  2.3483,  2.4752,  2.6021,  2.7291,
         2.8560,  2.9829,  3.1099,  3.2368,  3.3637,  3.4907,  3.6176,  3.7445,
         3.8715,  3.9984,  4.1253,  4.2523,  4.3792,  4.5061,  4.6331,  4.7600,
         4.8869,  5.0139,  5.1408,  5.2677,  5.3947,  5.5216,  5.6485,  5.7755,
         5.9024,  6.0293,  6.1563,  6.2832], dtype=torch.float64)
In [34]:
xt.requires_grad
Out[34]:
False
In [35]:
xt.requires_grad_(True)
Out[35]:
tensor([-6.2832, -6.1563, -6.0293, -5.9024, -5.7755, -5.6485, -5.5216, -5.3947,
        -5.2677, -5.1408, -5.0139, -4.8869, -4.7600, -4.6331, -4.5061, -4.3792,
        -4.2523, -4.1253, -3.9984, -3.8715, -3.7445, -3.6176, -3.4907, -3.3637,
        -3.2368, -3.1099, -2.9829, -2.8560, -2.7291, -2.6021, -2.4752, -2.3483,
        -2.2213, -2.0944, -1.9675, -1.8405, -1.7136, -1.5867, -1.4597, -1.3328,
        -1.2059, -1.0789, -0.9520, -0.8251, -0.6981, -0.5712, -0.4443, -0.3173,
        -0.1904, -0.0635,  0.0635,  0.1904,  0.3173,  0.4443,  0.5712,  0.6981,
         0.8251,  0.9520,  1.0789,  1.2059,  1.3328,  1.4597,  1.5867,  1.7136,
         1.8405,  1.9675,  2.0944,  2.2213,  2.3483,  2.4752,  2.6021,  2.7291,
         2.8560,  2.9829,  3.1099,  3.2368,  3.3637,  3.4907,  3.6176,  3.7445,
         3.8715,  3.9984,  4.1253,  4.2523,  4.3792,  4.5061,  4.6331,  4.7600,
         4.8869,  5.0139,  5.1408,  5.2677,  5.3947,  5.5216,  5.6485,  5.7755,
         5.9024,  6.0293,  6.1563,  6.2832], dtype=torch.float64,
       requires_grad=True)
In [36]:
xt.requires_grad
Out[36]:
True

Now we can define our $\sin$ function.

In [37]:
# y = torch.sin(x)
# y
In [38]:
yt = torch.sin(xt)
yt
Out[38]:
tensor([ 2.4493e-16,  1.2659e-01,  2.5115e-01,  3.7166e-01,  4.8620e-01,
         5.9291e-01,  6.9008e-01,  7.7615e-01,  8.4973e-01,  9.0963e-01,
         9.5490e-01,  9.8481e-01,  9.9887e-01,  9.9685e-01,  9.7880e-01,
         9.4500e-01,  8.9599e-01,  8.3257e-01,  7.5575e-01,  6.6677e-01,
         5.6706e-01,  4.5823e-01,  3.4202e-01,  2.2031e-01,  9.5056e-02,
        -3.1728e-02, -1.5800e-01, -2.8173e-01, -4.0093e-01, -5.1368e-01,
        -6.1816e-01, -7.1269e-01, -7.9576e-01, -8.6603e-01, -9.2235e-01,
        -9.6384e-01, -9.8982e-01, -9.9987e-01, -9.9384e-01, -9.7181e-01,
        -9.3415e-01, -8.8145e-01, -8.1458e-01, -7.3459e-01, -6.4279e-01,
        -5.4064e-01, -4.2979e-01, -3.1203e-01, -1.8925e-01, -6.3424e-02,
         6.3424e-02,  1.8925e-01,  3.1203e-01,  4.2979e-01,  5.4064e-01,
         6.4279e-01,  7.3459e-01,  8.1458e-01,  8.8145e-01,  9.3415e-01,
         9.7181e-01,  9.9384e-01,  9.9987e-01,  9.8982e-01,  9.6384e-01,
         9.2235e-01,  8.6603e-01,  7.9576e-01,  7.1269e-01,  6.1816e-01,
         5.1368e-01,  4.0093e-01,  2.8173e-01,  1.5800e-01,  3.1728e-02,
        -9.5056e-02, -2.2031e-01, -3.4202e-01, -4.5823e-01, -5.6706e-01,
        -6.6677e-01, -7.5575e-01, -8.3257e-01, -8.9599e-01, -9.4500e-01,
        -9.7880e-01, -9.9685e-01, -9.9887e-01, -9.8481e-01, -9.5490e-01,
        -9.0963e-01, -8.4973e-01, -7.7615e-01, -6.9008e-01, -5.9291e-01,
        -4.8620e-01, -3.7166e-01, -2.5115e-01, -1.2659e-01, -2.4493e-16],
       dtype=torch.float64, grad_fn=<SinBackward>)
In [39]:
xt.shape
Out[39]:
torch.Size([100])
In [40]:
yt.backward(torch.ones(100))
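Since yt is a vector rather than a scalar, backward needs a gradient argument with the same shape as yt; passing a vector of ones makes each entry of xt.grad hold $\frac{d \sin(x_i)}{d x_i} = \cos(x_i)$. An equivalent sketch (not one of the original cells) reduces the output to a scalar first:

# Sketch: a scalar output needs no gradient argument; xt.grad again receives cos(x) values.
# (If run right after the cell above, the new gradients would be added to the existing ones --
#  exactly the accumulation issue discussed further below.)
torch.sin(xt).sum().backward()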
In [41]:
xt.grad
Out[41]:
tensor([ 1.0000,  0.9920,  0.9679,  0.9284,  0.8738,  0.8053,  0.7237,  0.6306,
         0.5272,  0.4154,  0.2969,  0.1736,  0.0476, -0.0792, -0.2048, -0.3271,
        -0.4441, -0.5539, -0.6549, -0.7453, -0.8237, -0.8888, -0.9397, -0.9754,
        -0.9955, -0.9995, -0.9874, -0.9595, -0.9161, -0.8580, -0.7861, -0.7015,
        -0.6056, -0.5000, -0.3863, -0.2665, -0.1423, -0.0159,  0.1108,  0.2358,
         0.3569,  0.4723,  0.5801,  0.6785,  0.7660,  0.8413,  0.9029,  0.9501,
         0.9819,  0.9980,  0.9980,  0.9819,  0.9501,  0.9029,  0.8413,  0.7660,
         0.6785,  0.5801,  0.4723,  0.3569,  0.2358,  0.1108, -0.0159, -0.1423,
        -0.2665, -0.3863, -0.5000, -0.6056, -0.7015, -0.7861, -0.8580, -0.9161,
        -0.9595, -0.9874, -0.9995, -0.9955, -0.9754, -0.9397, -0.8888, -0.8237,
        -0.7453, -0.6549, -0.5539, -0.4441, -0.3271, -0.2048, -0.0792,  0.0476,
         0.1736,  0.2969,  0.4154,  0.5272,  0.6306,  0.7237,  0.8053,  0.8738,
         0.9284,  0.9679,  0.9920,  1.0000], dtype=torch.float64)
In [42]:
plt.figure(figsize=(10, 5))
plt.plot(xt.detach(), yt.detach())
plt.plot(xt.detach(), xt.grad)
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$'));
In [43]:
# yt.backward(torch.ones(100))
In [44]:
xt = torch.from_numpy(x)
xt.requires_grad_(True)

yt = torch.sin(xt)
yt.backward(torch.ones(100))

xt.grad
Out[44]:
tensor([ 1.0000,  0.9920,  0.9679,  0.9284,  0.8738,  0.8053,  0.7237,  0.6306,
         0.5272,  0.4154,  0.2969,  0.1736,  0.0476, -0.0792, -0.2048, -0.3271,
        -0.4441, -0.5539, -0.6549, -0.7453, -0.8237, -0.8888, -0.9397, -0.9754,
        -0.9955, -0.9995, -0.9874, -0.9595, -0.9161, -0.8580, -0.7861, -0.7015,
        -0.6056, -0.5000, -0.3863, -0.2665, -0.1423, -0.0159,  0.1108,  0.2358,
         0.3569,  0.4723,  0.5801,  0.6785,  0.7660,  0.8413,  0.9029,  0.9501,
         0.9819,  0.9980,  0.9980,  0.9819,  0.9501,  0.9029,  0.8413,  0.7660,
         0.6785,  0.5801,  0.4723,  0.3569,  0.2358,  0.1108, -0.0159, -0.1423,
        -0.2665, -0.3863, -0.5000, -0.6056, -0.7015, -0.7861, -0.8580, -0.9161,
        -0.9595, -0.9874, -0.9995, -0.9955, -0.9754, -0.9397, -0.8888, -0.8237,
        -0.7453, -0.6549, -0.5539, -0.4441, -0.3271, -0.2048, -0.0792,  0.0476,
         0.1736,  0.2969,  0.4154,  0.5272,  0.6306,  0.7237,  0.8053,  0.8738,
         0.9284,  0.9679,  0.9920,  1.0000], dtype=torch.float64)
In [45]:
yt = torch.sin(xt)
yt.backward(torch.ones(100))

xt.grad
Out[45]:
tensor([ 2.0000,  1.9839,  1.9359,  1.8567,  1.7477,  1.6105,  1.4475,  1.2611,
         1.0545,  0.8308,  0.5938,  0.3473,  0.0952, -0.1585, -0.4096, -0.6541,
        -0.8881, -1.1078, -1.3097, -1.4905, -1.6474, -1.7777, -1.8794, -1.9509,
        -1.9909, -1.9990, -1.9749, -1.9190, -1.8322, -1.7160, -1.5721, -1.4029,
        -1.2112, -1.0000, -0.7727, -0.5329, -0.2846, -0.0317,  0.2217,  0.4715,
         0.7138,  0.9445,  1.1601,  1.3570,  1.5321,  1.6825,  1.8059,  1.9001,
         1.9639,  1.9960,  1.9960,  1.9639,  1.9001,  1.8059,  1.6825,  1.5321,
         1.3570,  1.1601,  0.9445,  0.7138,  0.4715,  0.2217, -0.0317, -0.2846,
        -0.5329, -0.7727, -1.0000, -1.2112, -1.4029, -1.5721, -1.7160, -1.8322,
        -1.9190, -1.9749, -1.9990, -1.9909, -1.9509, -1.8794, -1.7777, -1.6474,
        -1.4905, -1.3097, -1.1078, -0.8881, -0.6541, -0.4096, -0.1585,  0.0952,
         0.3473,  0.5938,  0.8308,  1.0545,  1.2611,  1.4475,  1.6105,  1.7477,
         1.8567,  1.9359,  1.9839,  2.0000], dtype=torch.float64)
In [46]:
plt.figure(figsize=(10, 5))
plt.plot(xt.detach(), yt.detach())
plt.plot(xt.detach(), xt.grad)
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$'));

The plot above shows that the magnitude of our derivative is twice what it should be. This is because backward adds the new gradient values to any values already stored in .grad. We must explicitly zero out the gradient before each backward call.

In [47]:
xt.grad.zero_()
for i in range(10):
    yt = torch.sin(xt)
    yt.backward(torch.ones(100))
    print(xt.grad[0])
tensor(1., dtype=torch.float64)
tensor(2., dtype=torch.float64)
tensor(3., dtype=torch.float64)
tensor(4., dtype=torch.float64)
tensor(5., dtype=torch.float64)
tensor(6., dtype=torch.float64)
tensor(7., dtype=torch.float64)
tensor(8., dtype=torch.float64)
tensor(9., dtype=torch.float64)
tensor(10., dtype=torch.float64)
In [48]:
for i in range(10):
    xt.grad.zero_()
    yt = torch.sin(xt)
    yt.backward(torch.ones(100))
    print(xt.grad[0])
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)

Train a linear model of one variable using SGD in Pytorch

First in numpy.
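As a reminder of where the weight update in the next cell comes from: with $Y = X_1 W$, the mean squared error over $N$ samples and $K$ outputs and its gradient with respect to $W$ are

$$\text{MSE}(W) = \frac{1}{N K} \sum_{n,k} (T_{nk} - Y_{nk})^2, \qquad \frac{\partial\, \text{MSE}}{\partial W} = -\frac{2}{N K}\, X_1^\top (T - Y).$$

The code below drops the constant factor from the gradient: the $1/(N K)$ is folded into learning_rate = 0.01 / (n_samples * n_outputs) and the 2 is absorbed into the 0.01.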

In [49]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2

n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

W = np.zeros((2, 1))

for epoch in range(100):
    
    X1 = np.insert(X, 0, 1, 1)
    Y = X1 @ W
    
    mse = ((T - Y) ** 2).mean()  # not used
    
    gradient = - X1.T @ (T - Y)
    
    # print(gradient.shape, W.shape)
    W -= learning_rate * gradient

plt.plot(X, T, 'o-', label='T')
plt.plot(X, Y, 'o-', label='Y')
plt.legend();
In [50]:
mse
Out[50]:
81.19527161526112

Now in torch.

In [51]:
# torch.insert?
In [52]:
X.T
Out[52]:
array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
In [53]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X1 = torch.from_numpy(np.insert(X, 0, 1, 1))
X = torch.from_numpy(X)

T = torch.from_numpy(T)

W = torch.zeros((2, 1))

for epoch in range(100):
    
    Y = X1 @ W
    
    mse = ((T - Y) ** 2).mean()  # not used
    
    gradient = - X1.T @ (T - Y)
    W -= learning_rate * gradient

plt.plot(X, T, 'o-', label='T')
plt.plot(X, Y, 'o-', label='Y')
plt.legend();
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-53-70e16d53136a> in <module>
     15 for epoch in range(100):
     16 
---> 17     Y = X1 @ W
     18 
     19     mse = ((T - Y) ** 2).mean()  # not used

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'mat2' in call to _th_mm
In [54]:
X1.dtype
Out[54]:
torch.int64
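The error arises because np.insert kept X1 as 64-bit integers while W was created as float32, and torch requires matching dtypes for matrix multiplication. Casting X1 (or building W with the matching dtype) resolves it; a minimal sketch, assuming the X1 and W defined above:

# Sketch: match dtypes before the matrix multiply.
print((X1.float() @ W).dtype)   # torch.float32, as used in the next cell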
In [55]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X1 = torch.from_numpy(np.insert(X, 0, 1, 1)).float()
X = torch.from_numpy(X)

T = torch.from_numpy(T)

W = torch.zeros((2, 1))

for epoch in range(1000):
    
    Y = X1 @ W
    
    mse = ((T - Y) ** 2).mean()  # not used
        
    gradient = - X1.T @ (T - Y)
    W -= learning_rate * gradient

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();
In [56]:
mse
Out[56]:
tensor(52.9749)

So why are we using torch? So far it looks just like our numpy code.

Let's take advantage of autograd!!

In [57]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X1 = torch.from_numpy(np.insert(X, 0, 1, 1)).float()
X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

W = torch.zeros((2, 1), requires_grad=True)

for epoch in range(1000):

    Y = X1 @ W
    
    mse = ((T - Y)**2).mean()
    
    mse.backward()                                         #  NEW
    
    with torch.no_grad():                                  # NEW
        W -= learning_rate * W.grad
        W.grad.zero_()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();
In [58]:
mse
Out[58]:
tensor(68.8780, grad_fn=<MeanBackward0>)

Using Predefined Optimizers

In [59]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X1 = torch.from_numpy(np.insert(X, 0, 1, 1)).float()
X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

W = torch.zeros((2, 1), requires_grad=True)

optimizer = torch.optim.SGD([W], lr=learning_rate)             # NEW

for epoch in range(100):

    Y = X1 @ W
    
    mse = ((T - Y)**2).mean()
    mse.backward()
    
    optimizer.step()                                           # NEW
    optimizer.zero_grad()                                      # NEW
    
plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();
In [60]:
# torch.optim.

Using Predefined Loss Functions

In [61]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X1 = torch.from_numpy(np.insert(X, 0, 1, 1)).float()
X = torch.from_numpy(X)
T = torch.from_numpy(T)

W = torch.zeros((2, 1), requires_grad=True)

optimizer = torch.optim.SGD([W], lr=learning_rate)

mse_func = torch.nn.MSELoss()                                       # NEW

for epoch in range(100):

    Y = X1 @ W
    
    mse = mse_func(T, Y)                                            # NEW
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()
    
plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();

Using torch.nn module

The torch.nn module includes classes that simplify the construction of our models. It will not look simpler for our linear model, but it will for multilayered models.
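For reference, torch.nn.Linear(n_inputs, n_outputs) computes $Y = X W^\top + b$ with its own weight and bias parameters, so it plays the role of the hand-built X1 @ W above, with the bias handled internally instead of by an inserted column of ones. A minimal sketch checking this (hypothetical names, not one of the original cells):

# Sketch: a Linear layer is just a matrix multiply plus a bias.
lin = torch.nn.Linear(3, 2)
Xs = torch.rand(4, 3)
print(torch.allclose(lin(Xs), Xs @ lin.weight.T + lin.bias))   # True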

In [62]:
# torch.nn.Linear?
In [63]:
n_inputs = 1
n_outputs = 1

model = torch.nn.Sequential(torch.nn.Linear(n_inputs, n_outputs))
model
Out[63]:
Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
)
In [64]:
list(model)
Out[64]:
[Linear(in_features=1, out_features=1, bias=True)]
In [65]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape     
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(torch.nn.Linear(n_inputs, n_outputs))   # NEW

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

for epoch in range(100):

    Y = model(X)                                                 # NEW
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()
    
plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();

Now, add a hidden layer, or two.

In [66]:
list(model.parameters())
Out[66]:
[Parameter containing:
 tensor([[6.9597]], requires_grad=True),
 Parameter containing:
 tensor([0.6875], requires_grad=True)]
In [69]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]
n_hiddens = [30, 10]                                                      # NEW

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(                                              # NEW
    torch.nn.Linear(n_inputs, n_hiddens[0]),                              # NEW
    torch.nn.Tanh(),                                                      # NEW
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),                          # NEW
    torch.nn.Tanh(),                                                      # NEW
    torch.nn.Linear(n_hiddens[1], n_outputs))                             # NEW

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

for epoch in range(1000):

    Y = model(X)
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()
    
plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();

Maybe Adam will do better.

In [68]:
# torch.optim.Adam?
# torch.nn.Linear?
In [70]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]
n_hiddens = [10, 10]

learning_rate = 0.5 / (n_samples * n_outputs)                              # Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)         # NEW
mse_func = torch.nn.MSELoss()

for epoch in range(1000):

    Y = model(X)
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()
    
plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();

Now for some Speed with a GPU

It is trivial to move data and computations to a GPU with pytorch.

First let's time 1000 matrix multiplications on our CPU.

In [71]:
import time

n = 1000
a = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)
b = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)

start_time = time.time()

for i in range(1000):
    c = a @ b
    
elapsed_time = time.time() - start_time

print(f'Took {elapsed_time} seconds')
c.shape
Took 3.761833429336548 seconds
Out[71]:
(1000, 1000)

Repeat this on the GPU, after we first check that this machine has a GPU.

In [72]:
torch.cuda.is_available()
Out[72]:
True
In [73]:
!nvidia-smi
Tue Sep 29 15:33:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 166...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8     8W /  N/A |    312MiB /  5944MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1243      G   /usr/lib/xorg/Xorg                            45MiB |
|    0      1847      G   /usr/lib/xorg/Xorg                            96MiB |
|    0      1983      G   /usr/bin/gnome-shell                          88MiB |
|    0      2469      G   ...quest-channel-token=6014427668002915158     3MiB |
|    0      2737      G   ...AAAAAAAAAAAACAAAAAAAAAA= --shared-files    63MiB |
+-----------------------------------------------------------------------------+
In [79]:
n = 1000
a = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)
b = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)

at = torch.from_numpy(a)
bt = torch.from_numpy(b)
# at = (torch.rand(size=(n, n)) - 0.5) * 0.2
# bt = (torch.rand(size=(n, n)) - 0.5) * 0.2

# ct = torch.zeros((n, n))

start_time = time.time()
at = at.to('cuda')  ## Don't forget these assignments.  at.to('cuda') does not change at
bt = bt.to('cuda')
print(f'Took {time.time() - start_time} to move data to GPU')

start_time = time.time()

for i in range(1000):
    ct = at @ bt

ct = ct.to('cpu')
elapsed_time = time.time() - start_time

print(f'Took {elapsed_time} seconds to multiply matrices')
ct.shape
Took 0.0015783309936523438 to move data to GPU
Took 0.5512185096740723 seconds to multiply matrices
Out[79]:
torch.Size([1000, 1000])

If you have a machine with a GPU, you might find a function like the following useful. This works for my System76 laptop. However, I am not using it in this notebook.

In [80]:
import subprocess

def use_gpu(use=True):
    if use:
        subprocess.run(['system76-power', 'graphics', 'power', 'on'])
        subprocess.run(['sudo', 'modprobe', 'nvidia'])
    else:
        subprocess.run(['sudo', 'rmmod', 'nvidia'])
        subprocess.run(['system76-power', 'graphics', 'off'])
        
# use_gpu()  #  if running on my system76 laptop

torch.cuda.is_available()
Out[80]:
True

Now let's compare the speed of our torch.nn model on more data, without and with the GPU. Our data and neural network all use torch.Tensors, with the computations performed on the CPU or the GPU depending on the value of use_gpu.
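A common alternative to the explicit use_gpu flag used below is to choose the device once and move everything with .to(device). A sketch with hypothetical names, not one of the cells below:

# Sketch: pick the device once and move data and model to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_d = torch.nn.Linear(1, 1).to(device)   # modules are moved in place by .to
X_d = torch.rand(100, 1).to(device)          # tensors: .to returns a new tensor, so assign it
Y_d = model_d(X_d)                           # runs on the GPU if one is available
print(Y_d.device)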

In [81]:
torch.cuda.is_available()
Out[81]:
True
In [87]:
use_gpu = False

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape 
n_outputs = T.shape[1]

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

if use_gpu:
    start_time = time.time()
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')
    print(f'Moving data and model to GPU took {time.time() - start_time:.2f} seconds.')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X)
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()

elapsed_time = time.time() - start_time
print(f"Training took {elapsed_time:.2f} seconds with {'GPU' if use_gpu else 'CPU'}")

plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();
Training took 7.68 seconds with CPU
In [88]:
use_gpu = True

if use_gpu:
    print('Running on the GPU')
else:
    print('Running on the CPU')

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape 
n_outputs = T.shape[1]

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

if use_gpu:
    start_time = time.time()
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')
    print(f'Moving data and model to GPU took {time.time() - start_time:.2f} seconds.')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X) 
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()

elapsed_time = time.time() - start_time
print(f"Training took {elapsed_time:.2f} seconds with {'GPU' if use_gpu else 'CPU'}")
    
plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();
Running on the GPU
Moving data and model to GPU took 0.01 seconds.
Training took 1.33 seconds with GPU

The torch.nn.Module.forward function

We just saw how to implement the neural network using torch.nn.Sequential as a combination of Linear and Tanh layers. The forward calculation for such a neural network is implicitly defined this way.

Alternatively, we can define a new class that extends torch.nn.Module and define the forward function explicitly.

In [89]:
use_gpu = False
if use_gpu:
    print('Running on the GPU')
else:
    print('Running on the CPU')

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape 
n_outputs = T.shape[1]


X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

class NNet(torch.nn.Module):
    
    def __init__(self, n_inputs, n_hiddens_list, n_outputs):
        super().__init__()                          # call parent class (torch.nn.Module) constructor
            
        self.hidden_layers = torch.nn.ModuleList()  # necessary for model.to('cuda')
        for nh in n_hiddens_list:
            self.hidden_layers.append( torch.nn.Sequential(
                torch.nn.Linear(n_inputs, nh),
                torch.nn.Tanh()))
            
            n_inputs = nh
        self.output_layer = torch.nn.Linear(n_inputs, n_outputs)
            
    def forward(self, X):
        Y = X
        for hidden_layer in self.hidden_layers:
            Y = hidden_layer(Y)
        Y = self.output_layer(Y)
        return Y

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

model = NNet(n_inputs, n_hiddens, n_outputs)

if use_gpu:
    start_time = time.time()
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')
    print(f'Moving data and model to GPU took {time.time() - start_time:.2f} seconds.')

print('Starting the training')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X) 
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()

elapsed_time = time.time() - start_time
print(f"Training took {elapsed_time:.2f} seconds with {'GPU' if use_gpu else 'CPU'}")
    
plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();
Running on the CPU
Starting the training
Training took 7.60 seconds with CPU
In [90]:
use_gpu = True

if use_gpu:
    print('Running on the GPU')
else:
    print('Running on the CPU')

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape 
n_outputs = T.shape[1]


X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

class NNet(torch.nn.Module):
    
    def __init__(self, n_inputs, n_hiddens_list, n_outputs):
        super().__init__()                          # call parent class (torch.nn.Module) constructor
            
        self.hidden_layers = torch.nn.ModuleList()  # necessary for model.to('cuda')
        for nh in n_hiddens_list:
            self.hidden_layers.append( torch.nn.Sequential(
                torch.nn.Linear(n_inputs, nh),
                torch.nn.Tanh()))
            
            n_inputs = nh
        self.output_layer = torch.nn.Linear(n_inputs, n_outputs)
            
    def forward(self, X):
        Y = X
        for hidden_layer in self.hidden_layers:
            Y = hidden_layer(Y)
        Y = self.output_layer(Y)
        return Y

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

model = NNet(n_inputs, n_hiddens, n_outputs)

if use_gpu:
    start_time = time.time()
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')
    print(f'Moving data and model to GPU took {time.time() - start_time:.2f} seconds.')

print('Starting the training')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X) 
    
    mse = mse_func(T, Y)
    mse.backward()
    
    optimizer.step() 
    optimizer.zero_grad()

elapsed_time = time.time() - start_time
print(f"Training took {elapsed_time:.2f} seconds with {'GPU' if use_gpu else 'CPU'}")
    
plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();
Running on the GPU
Moving data and model to GPU took 0.02 seconds.
Starting the training
Training took 1.33 seconds with GPU
In [ ]: