Very helpful tutorials can be found on the official PyTorch tutorials site.

In [2]:
import numpy as np
import torch

print(torch.__version__)
import time

import matplotlib.pyplot as plt

1.3.1


# torch.tensor

The torch package contains classes and functions that are very similar to numpy's. It also provides functions to convert back and forth between numpy arrays and torch tensors.

In [98]:
data = [[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]
data

Out[98]:
[[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]
In [99]:
a = np.array(data)
a

Out[99]:
array([[0.1, 0.2, 0.3],
[1.1, 1.2, 1.3],
[2.1, 2.2, 2.3]])
In [100]:
type(a), a.dtype

Out[100]:
(numpy.ndarray, dtype('float64'))
In [101]:
b = torch.tensor(data)
b

Out[101]:
tensor([[0.1000, 0.2000, 0.3000],
[1.1000, 1.2000, 1.3000],
[2.1000, 2.2000, 2.3000]])
In [102]:
type(b), b.dtype

Out[102]:
(torch.Tensor, torch.float32)
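
Note that torch.tensor inferred torch.float32 here, because torch's default floating-point dtype is float32, while numpy defaulted to float64 above. A particular dtype can be requested explicitly; a minimal sketch (the name b64 is just for illustration):

b64 = torch.tensor(data, dtype=torch.float64)
b64.dtype    # torch.float64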
In [103]:
c = torch.from_numpy(a)
c

Out[103]:
tensor([[0.1000, 0.2000, 0.3000],
[1.1000, 1.2000, 1.3000],
[2.1000, 2.2000, 2.3000]], dtype=torch.float64)
In [104]:
type(c), c.dtype

Out[104]:
(torch.Tensor, torch.float64)
In [105]:
d = torch.as_tensor(a)
d

Out[105]:
tensor([[0.1000, 0.2000, 0.3000],
[1.1000, 1.2000, 1.3000],
[2.1000, 2.2000, 2.3000]], dtype=torch.float64)
In [106]:
type(d), d.dtype

Out[106]:
(torch.Tensor, torch.float64)
In [107]:
e = d.numpy()
e

Out[107]:
array([[0.1, 0.2, 0.3],
[1.1, 1.2, 1.3],
[2.1, 2.2, 2.3]])
In [108]:
type(e), e.dtype

Out[108]:
(numpy.ndarray, dtype('float64'))

torch.tensor copies the data. torch.from_numpy and torch.as_tensor do not copy the data.

In [109]:
a

Out[109]:
array([[0.1, 0.2, 0.3],
[1.1, 1.2, 1.3],
[2.1, 2.2, 2.3]])
In [110]:
b = torch.from_numpy(a)
b

Out[110]:
tensor([[0.1000, 0.2000, 0.3000],
[1.1000, 1.2000, 1.3000],
[2.1000, 2.2000, 2.3000]], dtype=torch.float64)
In [111]:
a[0,0] = 42.42
a

Out[111]:
array([[42.42,  0.2 ,  0.3 ],
[ 1.1 ,  1.2 ,  1.3 ],
[ 2.1 ,  2.2 ,  2.3 ]])
In [112]:
b

Out[112]:
tensor([[42.4200,  0.2000,  0.3000],
[ 1.1000,  1.2000,  1.3000],
[ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [113]:
b = torch.tensor(a)
b

Out[113]:
tensor([[42.4200,  0.2000,  0.3000],
[ 1.1000,  1.2000,  1.3000],
[ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
In [114]:
a[0,0] = 12345.0
a

Out[114]:
array([[1.2345e+04, 2.0000e-01, 3.0000e-01],
[1.1000e+00, 1.2000e+00, 1.3000e+00],
[2.1000e+00, 2.2000e+00, 2.3000e+00]])
In [115]:
b

Out[115]:
tensor([[42.4200,  0.2000,  0.3000],
[ 1.1000,  1.2000,  1.3000],
[ 2.1000,  2.2000,  2.3000]], dtype=torch.float64)
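
The same sharing holds for torch.as_tensor: changes made to the numpy array show up in the tensor it returned. A quick check, reusing a and d from the cells above (a sketch):

a[0, 0] = -1.0
d[0, 0]    # tensor(-1., dtype=torch.float64), so d sees the change made through a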

We can even use @ for matrix multiplication, as we do in numpy.

In [116]:
a = np.random.uniform(-0.1, 0.1, size=(10, 5))
b = np.random.uniform(-0.1, 0.1, size=(5, 20))
c = a @ b
c.shape

Out[116]:
(10, 20)
In [117]:
# torch.rand?

In [118]:
at = (torch.rand(size=(10, 5)) - 0.5) * 0.1
bt = (torch.rand(size=(5, 20)) - 0.5) * 0.1
ct = at @ bt
ct.shape

Out[118]:
torch.Size([10, 20])
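
The @ operator is shorthand for torch.matmul, so the same product can be computed explicitly. A quick check with the tensors above (a sketch):

ct2 = torch.matmul(at, bt)
torch.allclose(ct, ct2)    # True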

What is the derivative of $\sin(x)$?

In [119]:
plt.figure(figsize=(10, 5))
x = np.linspace(-2*np.pi, 2*np.pi, 100)
y = np.sin(x)
dy = np.cos(x)
plt.plot(x, y)
plt.plot(x, dy)
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$',));

In [120]:
xt = torch.from_numpy(x)
xt

Out[120]:
tensor([-6.2832, -6.1563, -6.0293, -5.9024, -5.7755, -5.6485, -5.5216, -5.3947,
-5.2677, -5.1408, -5.0139, -4.8869, -4.7600, -4.6331, -4.5061, -4.3792,
-4.2523, -4.1253, -3.9984, -3.8715, -3.7445, -3.6176, -3.4907, -3.3637,
-3.2368, -3.1099, -2.9829, -2.8560, -2.7291, -2.6021, -2.4752, -2.3483,
-2.2213, -2.0944, -1.9675, -1.8405, -1.7136, -1.5867, -1.4597, -1.3328,
-1.2059, -1.0789, -0.9520, -0.8251, -0.6981, -0.5712, -0.4443, -0.3173,
-0.1904, -0.0635,  0.0635,  0.1904,  0.3173,  0.4443,  0.5712,  0.6981,
0.8251,  0.9520,  1.0789,  1.2059,  1.3328,  1.4597,  1.5867,  1.7136,
1.8405,  1.9675,  2.0944,  2.2213,  2.3483,  2.4752,  2.6021,  2.7291,
2.8560,  2.9829,  3.1099,  3.2368,  3.3637,  3.4907,  3.6176,  3.7445,
3.8715,  3.9984,  4.1253,  4.2523,  4.3792,  4.5061,  4.6331,  4.7600,
4.8869,  5.0139,  5.1408,  5.2677,  5.3947,  5.5216,  5.6485,  5.7755,
5.9024,  6.0293,  6.1563,  6.2832], dtype=torch.float64)
In [121]:
xt.requires_grad

Out[121]:
False
In [122]:
xt.requires_grad_(True)

Out[122]:
tensor([-6.2832, -6.1563, -6.0293, -5.9024, -5.7755, -5.6485, -5.5216, -5.3947,
-5.2677, -5.1408, -5.0139, -4.8869, -4.7600, -4.6331, -4.5061, -4.3792,
-4.2523, -4.1253, -3.9984, -3.8715, -3.7445, -3.6176, -3.4907, -3.3637,
-3.2368, -3.1099, -2.9829, -2.8560, -2.7291, -2.6021, -2.4752, -2.3483,
-2.2213, -2.0944, -1.9675, -1.8405, -1.7136, -1.5867, -1.4597, -1.3328,
-1.2059, -1.0789, -0.9520, -0.8251, -0.6981, -0.5712, -0.4443, -0.3173,
-0.1904, -0.0635,  0.0635,  0.1904,  0.3173,  0.4443,  0.5712,  0.6981,
0.8251,  0.9520,  1.0789,  1.2059,  1.3328,  1.4597,  1.5867,  1.7136,
1.8405,  1.9675,  2.0944,  2.2213,  2.3483,  2.4752,  2.6021,  2.7291,
2.8560,  2.9829,  3.1099,  3.2368,  3.3637,  3.4907,  3.6176,  3.7445,
3.8715,  3.9984,  4.1253,  4.2523,  4.3792,  4.5061,  4.6331,  4.7600,
4.8869,  5.0139,  5.1408,  5.2677,  5.3947,  5.5216,  5.6485,  5.7755,
5.9024,  6.0293,  6.1563,  6.2832], dtype=torch.float64,
requires_grad=True)
In [123]:
xt.requires_grad

Out[123]:
True
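
Instead of calling requires_grad_ afterwards, a floating-point tensor can be created with gradient tracking turned on from the start; a small sketch:

w = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
w.requires_grad    # True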

Now we can define our $\sin$ function.

In [124]:
# y = torch.sin(x)
# y

In [125]:
yt = torch.sin(xt)
yt

Out[125]:
tensor([ 2.4493e-16,  1.2659e-01,  2.5115e-01,  3.7166e-01,  4.8620e-01,
5.9291e-01,  6.9008e-01,  7.7615e-01,  8.4973e-01,  9.0963e-01,
9.5490e-01,  9.8481e-01,  9.9887e-01,  9.9685e-01,  9.7880e-01,
9.4500e-01,  8.9599e-01,  8.3257e-01,  7.5575e-01,  6.6677e-01,
5.6706e-01,  4.5823e-01,  3.4202e-01,  2.2031e-01,  9.5056e-02,
-3.1728e-02, -1.5800e-01, -2.8173e-01, -4.0093e-01, -5.1368e-01,
-6.1816e-01, -7.1269e-01, -7.9576e-01, -8.6603e-01, -9.2235e-01,
-9.6384e-01, -9.8982e-01, -9.9987e-01, -9.9384e-01, -9.7181e-01,
-9.3415e-01, -8.8145e-01, -8.1458e-01, -7.3459e-01, -6.4279e-01,
-5.4064e-01, -4.2979e-01, -3.1203e-01, -1.8925e-01, -6.3424e-02,
6.3424e-02,  1.8925e-01,  3.1203e-01,  4.2979e-01,  5.4064e-01,
6.4279e-01,  7.3459e-01,  8.1458e-01,  8.8145e-01,  9.3415e-01,
9.7181e-01,  9.9384e-01,  9.9987e-01,  9.8982e-01,  9.6384e-01,
9.2235e-01,  8.6603e-01,  7.9576e-01,  7.1269e-01,  6.1816e-01,
5.1368e-01,  4.0093e-01,  2.8173e-01,  1.5800e-01,  3.1728e-02,
-9.5056e-02, -2.2031e-01, -3.4202e-01, -4.5823e-01, -5.6706e-01,
-6.6677e-01, -7.5575e-01, -8.3257e-01, -8.9599e-01, -9.4500e-01,
-9.7880e-01, -9.9685e-01, -9.9887e-01, -9.8481e-01, -9.5490e-01,
-9.0963e-01, -8.4973e-01, -7.7615e-01, -6.9008e-01, -5.9291e-01,
-4.8620e-01, -3.7166e-01, -2.5115e-01, -1.2659e-01, -2.4493e-16],
dtype=torch.float64, grad_fn=<SinBackward>)
In [126]:
xt.shape

Out[126]:
torch.Size([100])
In [127]:
# yt is not a scalar, so backward needs a gradient argument; ones(100) gives d(sum(sin(x)))/dx = cos(x)
yt.backward(torch.ones(100))

In [128]:
xt.grad

Out[128]:
tensor([ 1.0000,  0.9920,  0.9679,  0.9284,  0.8738,  0.8053,  0.7237,  0.6306,
0.5272,  0.4154,  0.2969,  0.1736,  0.0476, -0.0792, -0.2048, -0.3271,
-0.4441, -0.5539, -0.6549, -0.7453, -0.8237, -0.8888, -0.9397, -0.9754,
-0.9955, -0.9995, -0.9874, -0.9595, -0.9161, -0.8580, -0.7861, -0.7015,
-0.6056, -0.5000, -0.3863, -0.2665, -0.1423, -0.0159,  0.1108,  0.2358,
0.3569,  0.4723,  0.5801,  0.6785,  0.7660,  0.8413,  0.9029,  0.9501,
0.9819,  0.9980,  0.9980,  0.9819,  0.9501,  0.9029,  0.8413,  0.7660,
0.6785,  0.5801,  0.4723,  0.3569,  0.2358,  0.1108, -0.0159, -0.1423,
-0.2665, -0.3863, -0.5000, -0.6056, -0.7015, -0.7861, -0.8580, -0.9161,
-0.9595, -0.9874, -0.9995, -0.9955, -0.9754, -0.9397, -0.8888, -0.8237,
-0.7453, -0.6549, -0.5539, -0.4441, -0.3271, -0.2048, -0.0792,  0.0476,
0.1736,  0.2969,  0.4154,  0.5272,  0.6306,  0.7237,  0.8053,  0.8738,
0.9284,  0.9679,  0.9920,  1.0000], dtype=torch.float64)
In [129]:
plt.figure(figsize=(10, 5))
plt.plot(xt.detach(), yt.detach())
plt.plot(xt.detach(), xt.grad)   # the gradient computed by backward, which should match cos(x)
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$'));

In [130]:
# yt.backward(torch.ones(100))

In [131]:
yt = torch.sin(xt)
yt.backward(torch.ones(100))
xt.grad   # note the values below are now twice cos(x)

Out[131]:
tensor([ 2.0000,  1.9839,  1.9359,  1.8567,  1.7477,  1.6105,  1.4475,  1.2611,
1.0545,  0.8308,  0.5938,  0.3473,  0.0952, -0.1585, -0.4096, -0.6541,
-0.8881, -1.1078, -1.3097, -1.4905, -1.6474, -1.7777, -1.8794, -1.9509,
-1.9909, -1.9990, -1.9749, -1.9190, -1.8322, -1.7160, -1.5721, -1.4029,
-1.2112, -1.0000, -0.7727, -0.5329, -0.2846, -0.0317,  0.2217,  0.4715,
0.7138,  0.9445,  1.1601,  1.3570,  1.5321,  1.6825,  1.8059,  1.9001,
1.9639,  1.9960,  1.9960,  1.9639,  1.9001,  1.8059,  1.6825,  1.5321,
1.3570,  1.1601,  0.9445,  0.7138,  0.4715,  0.2217, -0.0317, -0.2846,
-0.5329, -0.7727, -1.0000, -1.2112, -1.4029, -1.5721, -1.7160, -1.8322,
-1.9190, -1.9749, -1.9990, -1.9909, -1.9509, -1.8794, -1.7777, -1.6474,
-1.4905, -1.3097, -1.1078, -0.8881, -0.6541, -0.4096, -0.1585,  0.0952,
0.3473,  0.5938,  0.8308,  1.0545,  1.2611,  1.4475,  1.6105,  1.7477,
1.8567,  1.9359,  1.9839,  2.0000], dtype=torch.float64)
In [132]:
plt.figure(figsize=(10, 5))
plt.plot(xt.detach(), yt.detach())
plt.plot(xt.detach(), xt.grad)   # plot the stored gradient as well
plt.legend(('$\sin(x)$', '$\\frac{d \sin(x)}{dx} = \cos(x)$'));


The plot above shows that the magnitude of our derivative is twice what it should be. This is because backward adds new gradient values to the previously stored ones. We must explicitly zero out the gradient first.
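
The accumulation is easy to see with a scalar example (a small sketch, separate from the cells below):

w = torch.tensor(3.0, requires_grad=True)
(w * w).backward()
print(w.grad)      # tensor(6.), since d(w^2)/dw = 2w
(w * w).backward()
print(w.grad)      # tensor(12.), the new gradient was added to the stored one
w.grad.zero_()     # reset before the next backward call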

In [133]:
xt.grad.zero_()
for i in range(10):
    yt = torch.sin(xt)
    yt.backward(torch.ones(100))
    print(xt.grad[0])   # print one gradient value (assumed print statement); it grows because the gradient is only zeroed once, above

tensor(1., dtype=torch.float64)
tensor(2., dtype=torch.float64)
tensor(3., dtype=torch.float64)
tensor(4., dtype=torch.float64)
tensor(5., dtype=torch.float64)
tensor(6., dtype=torch.float64)
tensor(7., dtype=torch.float64)
tensor(8., dtype=torch.float64)
tensor(9., dtype=torch.float64)
tensor(10., dtype=torch.float64)

In [134]:
for i in range(10):
    xt.grad.zero_()     # zero the gradient each pass (assumed; the constant values below require it)
    yt = torch.sin(xt)
    yt.backward(torch.ones(100))
    print(xt.grad[0])   # assumed print, as above

tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)
tensor(1., dtype=torch.float64)


# Train linear model of one variable using SGD in Pytorch

## First in numpy.

In [135]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2

n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

W = np.zeros((2, 1))

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = ((T - Y) ** 2).mean()  # not used

    gradient = - X.T @ (T - Y)
    gradient_bias = - (T - Y).sum(axis=0)   # bias gradient (assumed; needed for the update)

    # SGD update of bias and weights (assumed update step)
    W[0:1, :] -= learning_rate * gradient_bias
    W[1:, :] -= learning_rate * gradient

plt.plot(X, T, 'o-', label='T')
plt.plot(X, Y, 'o-', label='Y')
plt.legend();


## Now in torch.

In [136]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X)
T = torch.from_numpy(T)

W = torch.zeros((2, 1))

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = ((T - Y) ** 2).mean()  # not used

    gradient = - X.T @ (T - Y)

plt.plot(X, T, 'o-', label='T')
plt.plot(X, Y, 'o-', label='Y')
plt.legend();

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-136-5038be964fda> in <module>
13 for epoch in range(100):
14
---> 15     Y = X @ W[1:, :] + W[0:1, :]
16
17     mse = ((T - Y) ** 2).mean()  # not used

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'mat2' in call to _th_mm
In [137]:
X.dtype

Out[137]:
torch.int64
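
np.arange returns integers, which torch.from_numpy faithfully keeps as torch.int64, while W is float32, hence the mismatch. The next cell fixes this by converting the tensor with .float(); another option (a sketch, using a new name Xf) is to build the numpy array with a floating-point dtype in the first place:

Xf = np.arange(10, dtype=np.float32).reshape((-1, 1))
torch.from_numpy(Xf).dtype    # torch.float32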
In [138]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()  ## ADDED .float()
T = torch.from_numpy(T)

W = torch.zeros((2, 1))

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = ((T - Y) ** 2).mean()  # not used

    gradient = - X.T @ (T - Y)
    gradient_bias = - (T - Y).sum(0)   # assumed, as in the numpy version

    # SGD update (assumed update step)
    W[0:1, :] -= learning_rate * gradient_bias
    W[1:, :] -= learning_rate * gradient

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


So why are we using torch? So far it looks just like our numpy code.

## Let's take advantage of autograd!!

In [139]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

W = torch.zeros((2, 1), requires_grad=True)  ## assumed: W must require gradients for autograd to track it

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = ((T - Y)**2).mean()

    mse.backward()  ##  NEW

    # assumed SGD update, using the gradient that autograd stored in W.grad
    with torch.no_grad():
        W -= learning_rate * W.grad
        W.grad.zero_()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


## Using Predefined Optimizers

In [140]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

optimizer = torch.optim.SGD([W], lr=learning_rate)   ## NEW

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = ((T - Y)**2).mean()

    optimizer.zero_grad()   # zero the accumulated gradients, as in the sin example above
    mse.backward()

    optimizer.step()        ## NEW

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();

In [141]:
# torch.optim.


## Using Predefined Loss Functions

In [142]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples = X.shape[0]
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

optimizer = torch.optim.SGD([W], lr=learning_rate)

mse_func = torch.nn.MSELoss()  ## NEW

for epoch in range(100):

    Y = X @ W[1:, :] + W[0:1, :]

    mse = mse_func(T, Y)    ## NEW

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


## Using torch.nn module

The torch.nn module includes classes that simplify the construction of our models. It will not look simpler for our linear model, but it will for multilayered models.

In [143]:
n_inputs = 1
n_outputs = 1

model = torch.nn.Sequential(torch.nn.Linear(n_inputs, n_outputs))
model

Out[143]:
Sequential(
(0): Linear(in_features=1, out_features=1, bias=True)
)
In [144]:
list(model.parameters())

Out[144]:
[Parameter containing:
 tensor([[...]], requires_grad=True),
 Parameter containing:
 tensor([-0.2438], requires_grad=True)]
In [145]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape  ## NEW, added n_inputs
n_outputs = T.shape[1]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(torch.nn.Linear(n_inputs, n_outputs))

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

for epoch in range(100):

    Y = model(X)            ## NEW

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


Now, add a hidden layer, or two.

In [146]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape  ## NEW, added n_inputs
n_outputs = T.shape[1]
n_hiddens = [10, 10]

learning_rate = 0.01 / (n_samples * n_outputs)

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()

for epoch in range(1000):

    Y = model(X)            ## NEW

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


In [147]:
# torch.optim.Adam?
# torch.nn.Linear?

In [148]:
X = np.arange(10).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]
n_hiddens = [10, 10]

learning_rate = 0.5 / (n_samples * n_outputs)  ## Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T)

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  ## assumed: an optimizer for this new model (Adam, hinted in the cell above)

mse_func = torch.nn.MSELoss()

for epoch in range(1000):

    Y = model(X)

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

plt.plot(X.detach(), T, 'o-', label='T')
plt.plot(X.detach(), Y.detach(), 'o-', label='Y')
plt.legend();


# Now for some Speed with a GPU

It is trivial to move data and operations to a GPU with PyTorch.
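
A common pattern (a sketch; not one of the timing cells below) is to pick the device once and move tensors and models to it with .to():

device = 'cuda' if torch.cuda.is_available() else 'cpu'
x = torch.rand(4, 3).to(device)   # a copy of x on the GPU, if one is available
# a model is moved the same way:  model.to(device)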

In [149]:
import time

n = 1000
a = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)
b = np.random.uniform(-0.1, 0.1, size=(n, n)).astype(np.float32)

start_time = time.time()
for i in range(1000):
    c = a @ b
elapsed_time = time.time() - start_time

print(f'Took {elapsed_time} seconds')
c.shape

Took 4.328633069992065 seconds

Out[149]:
(1000, 1000)
In [150]:
torch.cuda.is_available()

Out[150]:
True
In [151]:
n = 1000
at = (torch.rand(size=(n, n)) - 0.5) * 0.2
bt = (torch.rand(size=(n, n)) - 0.5) * 0.2

# ct = torch.zeros((n, n))

start_time = time.time()
at = at.to('cuda')  ## Don't forget these assignments.  at.to('cuda') does not change at
bt = bt.to('cuda')

start_time = time.time()

for i in range(10000):
    ct = at @ bt

ct = ct.to('cpu')
elapsed_time = time.time() - start_time

print(f'Took {elapsed_time} seconds')
ct.shape

Took 2.464926242828369 seconds

Out[151]:
torch.Size([1000, 1000])
In [152]:
import subprocess

def use_gpu(use=True):
    if use:
        subprocess.run(['system76-power', 'graphics', 'power', 'on'])
        subprocess.run(['sudo', 'modprobe', 'nvidia'])
    else:
        subprocess.run(['sudo', 'rmmod', 'nvidia'])
        subprocess.run(['system76-power', 'graphics', 'off'])

# use_gpu()  #  if running on my system76 laptop

torch.cuda.is_available()

Out[152]:
True

Now let's compare speed of our torch.nn model on more data, without and with the GPU.

In [153]:
use_gpu = False

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

if use_gpu:
    print('Moving data and model to GPU')
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  ## assumed optimizer for this model
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X)

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

elapsed_time = time.time() - start_time
print(f'Training took {elapsed_time:.2f} seconds.')
if use_gpu:
    print('   with GPU')

plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();

Training took 9.45 seconds.

In [154]:
use_gpu = True

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hiddens[0]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[0], n_hiddens[1]),
    torch.nn.Tanh(),
    torch.nn.Linear(n_hiddens[1], n_outputs))

if use_gpu:
    print('Moving data and model to GPU')
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  ## assumed optimizer for this model
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X)

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

elapsed_time = time.time() - start_time
print(f'Training took {elapsed_time:.2f} seconds.')
if use_gpu:
    print('   with GPU')

plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();

Moving data and model to GPU
Training took 2.52 seconds.
with GPU


# The torch.nn.Module.forward function

We just saw how to implement the neural network using torch.nn.Sequential as a combination of Linear and Tanh layers. With Sequential, the forward calculation is defined implicitly by the order of the layers.

Alternatively, we can define a new class that extends torch.nn.Module and define the forward function explicitly.

In [3]:
use_gpu = True

n_samples = 10000
X = np.linspace(0, 10, n_samples).reshape((-1, 1))
T = X ** 2
n_samples, n_inputs = X.shape
n_outputs = T.shape[1]

X = torch.from_numpy(X).float()
T = torch.from_numpy(T).float()

class NNet(torch.nn.Module):

    def __init__(self, n_inputs, n_hiddens_list, n_outputs):
        super().__init__()  # call parent class (torch.nn.Module) constructor

        self.hidden_layers = torch.nn.ModuleList()  # necessary for model.to('cuda')
        for nh in n_hiddens_list:
            self.hidden_layers.append(torch.nn.Sequential(
                torch.nn.Linear(n_inputs, nh),
                torch.nn.Tanh()))
            n_inputs = nh   # the next layer's input size is this layer's output size
        self.output_layer = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, X):
        Y = X
        for hidden_layer in self.hidden_layers:
            Y = hidden_layer(Y)
        Y = self.output_layer(Y)
        return Y

n_hiddens = [100, 100]

learning_rate = 0.1 #  / (n_samples * n_outputs)  ## Larger learning rate

model = NNet(n_inputs, n_hiddens, n_outputs)

if use_gpu:
    print('Moving data and model to GPU')
    X = X.to('cuda')
    T = T.to('cuda')
    model.to('cuda')   # or   model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  ## assumed optimizer for this model
mse_func = torch.nn.MSELoss()

start_time = time.time()

for epoch in range(1000):

    Y = model(X)

    mse = mse_func(T, Y)

    optimizer.zero_grad()
    mse.backward()

    optimizer.step()

elapsed_time = time.time() - start_time
print(f'Training took {elapsed_time:.2f} seconds.')
if use_gpu:
    print('   with GPU')

plt.plot(X.cpu().detach(), T.cpu(), 'o-', label='T')
plt.plot(X.cpu().detach(), Y.cpu().detach(), 'o-', label='Y')
plt.legend();

Moving data and model to GPU
Training took 2.71 seconds.
with GPU
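
Note that we call model(X) rather than model.forward(X): torch.nn.Module.__call__ runs forward for us, along with any registered hooks. A quick CPU sketch using the NNet class defined above:

net = NNet(1, [10], 1)
Y = net(torch.rand(5, 1))   # __call__ invokes forward
Y.shape                     # torch.Size([5, 1])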

In [ ]: