# Core dependencies for the whole notebook.
import torch
import torchvision
from torch.autograd import Variable
import numpy as np
print(torch.__version__)
# Notebook shell escape: installs the ipdb debugger into the kernel's environment.
!pip install ipdb
import ipdb
PyTorch allows you to dynamically define computational graphs. This is done by operating on `Variable`s, which wrap PyTorch's `Tensor` objects. (Since PyTorch 0.4, `Variable` has been merged into `Tensor`: setting `requires_grad=True` on a tensor is enough, as the code below does.)
Here is an example, where we work with the function
# f(x) = x^2 + 2x + 6
def f(x):
    """Evaluate the quadratic f(x) = x**2 + 2*x + 6 (so f'(x) = 2*x + 2)."""
    return x ** 2 + 2 * x + 6
# Evaluate f at x = 4.0 and backprop to get df/dx.
np_x = np.array([4.0])
x = torch.from_numpy(np_x).requires_grad_(True)
y = f(x)
print(y)
y.backward()
x.grad  # bare expression: the notebook displays df/dx = 2*4 + 2 = 10
# Repeat at x = 5.0 -- the computation graph is rebuilt on each call to f.
np_x = np.array([5.0])
x = torch.from_numpy(np_x).requires_grad_(True)
y = f(x)
y.backward()
x.grad  # displays df/dx = 2*5 + 2 = 12
Unlike Tensorflow, we can define the graph on the fly. That is why it is more convenient to define a function in Python: we call the function as part of constructing the graph.
Let's now create a simple linear function for classifying MNIST digits. Material is lifted from: https://github.com/fastai/fastai_old/blob/master/dev_nb/001a_nn_basics.ipynb
import matplotlib.pyplot as plt
import math
%matplotlib inline
from torchvision import datasets, transforms
# Download MNIST once into ./data and reuse it for both splits (the original
# pointed the test split at '../data', scattering the download across two dirs).
mnist_train = datasets.MNIST('data', train=True, download=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST('data', train=False, download=True,
                            transform=transforms.ToTensor())
print(mnist_train)
i = 732  # try different indices
# Each dataset item is an (image_tensor, int_label) pair -- unpack it directly.
img, label = mnist_train[i]
print("Label: ", label)
plt.imshow(img.reshape((28,28)), cmap = plt.cm.gray)
plt.grid(None)
Pytorch's DataLoader is responsible for managing batches. You can create a DataLoader from any Dataset. DataLoader makes it easier to iterate over batches (it can shuffle and give you the next batch)
from torch.utils.data import DataLoader
train_dl = DataLoader(mnist_train, batch_size=100)
# DataLoader iterators no longer expose a `.next()` method (removed in
# PyTorch 1.x); use the builtin next() on the iterator instead.
images, labels = next(iter(train_dl))
# Tile the first batch into a 10-wide image grid, shaped (C, H, W).
viz = torchvision.utils.make_grid(images, nrow=10, padding = 2).numpy()
fig, ax = plt.subplots(figsize= (8,8))
ax.imshow(np.transpose(viz, (1,2,0)))  # matplotlib expects (H, W, C)
ax.grid(None)
Thanks to PyTorch's ability to calculate gradients automatically, we can use any standard Python function (or callable object) as a model! So let's just write a plain matrix multiplication and broadcasted addition to create a simple linear model. We also need an activation function, so we'll write log_softmax and use it. Remember: although PyTorch provides lots of pre-written loss functions, activation functions, and so forth, you can easily write your own using plain python. PyTorch will even create fast GPU or vectorized CPU code for your function automatically.
def log_softmax(x):
    """Log-softmax over the last dimension.

    Uses `logsumexp`, which subtracts the row max internally, so large logits
    cannot overflow `exp` the way the naive
    `x - x.exp().sum(-1).log().unsqueeze(-1)` formulation does.
    """
    return x - x.logsumexp(dim=-1, keepdim=True)
def model(xb, weights, bias):
    """Linear classifier: affine map of the batch followed by log-softmax."""
    logits = xb @ weights + bias
    return log_softmax(logits)
def nll(input, target):
    """Negative log-likelihood of `target` under log-probabilities `input`.

    `input` is a (batch, classes) tensor of log-probs; `target` holds the
    integer class index for each row.
    """
    picked = input[range(target.shape[0]), target]
    return -picked.mean()
def accuracy(out, yb):
    """Fraction of rows whose argmax over class scores equals the label."""
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()
    return hits.mean()
loss_func = nll
In the above, the '@' is syntactic sugar for the matrix multiply operation. We will call our function on one batch of data (in this case, 128 images). This is one forward pass. Note that our predictions won't be any better than random at this stage, since we start with random weights.
lr = 0.1           # SGD step size
epochs = 1
bs = 128           # training batch size
print_every = 100  # log accuracy every 100 batches
in_shape = 784     # 28*28 flattened pixels
out_shape = 10     # one score per digit class
train_dl = DataLoader(mnist_train, batch_size=bs)
test_dl = DataLoader(mnist_test, batch_size = 100)
# Initialize weights
# Scale by 1/sqrt(fan_in) so initial activations keep roughly unit variance.
weights = torch.randn(in_shape, out_shape) / math.sqrt(in_shape)
weights.requires_grad_()  # start tracking gradients only after the init math
bias = torch.zeros(out_shape, requires_grad=True)
# Manual training loop: forward, NLL loss, backward, hand-rolled SGD update.
# (Indentation reconstructed -- the exported source had lost it.)
for epoch in range(epochs):
    for i, (xb, yb) in enumerate(train_dl):
        xb = xb.view(xb.size(0), -1)  # flatten each image to a 784-vector
        # Evaluate training accuracy before the update
        if i % print_every == 0:
            print("Batch: ", i)
            print("Train acc on curr batch: ", accuracy(model(xb, weights, bias), yb).item())
        # Forward pass
        pred = model(xb, weights, bias)
        loss = loss_func(pred, yb)
        # Backward pass
        loss.backward()
        # Step the parameters without recording the ops, then clear the
        # accumulated gradients so they don't leak into the next batch.
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()
        # Evaluate training accuracy on the same batch after the update
        if i % print_every == 0:
            print("Train acc on curr batch (post-update): ", accuracy(model(xb, weights, bias), yb).item())
The above training loop is a bit clunky and error-prone, so we'll now introduce more built-in PyTorch functionality. We first introduce a helper function for evaluating neural networks.
def get_test_stat(model, dl, device):
    """Return (mean loss, mean accuracy) of `model` over every batch in `dl`.

    Relies on the module-level `loss_fn` and `accuracy`. Temporarily switches
    the model to eval mode and restores train mode before returning.
    """
    model.eval()
    cum_loss, cum_acc, n_seen = 0.0, 0.0, 0
    # Evaluation only -- no need to build the autograd graph.
    with torch.no_grad():
        for xb, yb in dl:
            xb = xb.to(device)
            yb = yb.to(device)
            xb = xb.view(xb.size(0), -1)
            y_pred = model(xb)
            # Weight each batch by its size so a short final batch is not
            # over-counted.
            cum_loss += loss_fn(y_pred, yb).item() * len(yb)
            cum_acc += accuracy(y_pred, yb).item() * len(yb)
            n_seen += len(yb)
    model.train()
    # Divide by the actual number of examples seen instead of the
    # hard-coded 10000, so the helper works for any dataset size.
    return cum_loss / n_seen, cum_acc / n_seen
learning_rate = 1e-2
epochs = 2
dim_x = 784    # flattened 28x28 input
dim_h = 100    # hidden-layer width
dim_out = 10   # one logit per digit class
# Two-layer fully connected network: 784 -> 100 -> ReLU -> 10.
model = torch.nn.Sequential(
torch.nn.Linear(dim_x, dim_h),
torch.nn.ReLU(),
torch.nn.Linear(dim_h, dim_out),
)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# create datasets and data loader
# Use the same 'data' root for both splits (the original pointed the test
# split at '../data', duplicating the download in a second directory).
mnist_train = datasets.MNIST('data', train=True, download=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST('data', train=False, download=True,
                            transform=transforms.ToTensor())
train_dl = DataLoader(mnist_train, batch_size=bs)
test_dl = DataLoader(mnist_test, batch_size=100)
# Using GPUs in PyTorch is pretty straightforward
use_cuda = torch.cuda.is_available()  # defined on both paths (was CUDA-only before)
if use_cuda:
    print("Using cuda")
    device = torch.device("cuda")
else:
    # Use a torch.device here too, matching the CUDA branch (it was a bare
    # string before, which works but is inconsistent).
    device = torch.device("cpu")
model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()  # takes raw logits plus integer labels
model.train()
# Training loop using nn.Module + optimizer instead of manual updates.
# (Indentation reconstructed -- the exported source had lost it.)
for epoch in range(epochs):
    print(epoch)
    for i, (xb, yb) in enumerate(train_dl):
        xb = xb.to(device)
        yb = yb.to(device)
        xb = xb.view(xb.size(0), -1)  # flatten each image to a 784-vector
        # Forward pass
        y_pred = model(xb)
        loss = loss_fn(y_pred, yb)
        # Backward pass
        model.zero_grad() # Zero out the previous gradient computation
        loss.backward() # Compute the gradient
        optimizer.step() # Use the gradient information to make a step
    # Evaluate on the held-out set once per epoch.
    test_loss, test_acc = get_test_stat(model, test_dl, device)
    print("Test loss: {} Test acc: {}".format(test_loss, test_acc))
# We can access the weights of our neural network
model[0].weight  # bare expression: displayed by the notebook
# We can look at the weights going into hidden unit index 10
# (the eleventh unit, counting from zero)
weights_ten = model[0].weight.data.cpu().numpy()[10, :]
print(weights_ten.shape)
fig, ax = plt.subplots()
ax.imshow(weights_ten.reshape((28,28)), cmap=plt.cm.coolwarm)
ax.grid(None)
To showcase the power of PyTorch dynamic graphs, we will implement a very strange model: a fully-connected ReLU network that on each forward pass randomly chooses a number between 1 and 4 and has that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.
By Justin Johnson https://github.com/jcjohnson/pytorch-examples/blob/master/nn/dynamic_net.py
import random
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x, verbose=False):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use
        normal Python control-flow operators like loops or conditional statements
        when defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from
        Lua Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)  # clamp(min=0) == ReLU
        n_layers = random.randint(0, 3)
        if verbose:
            print("The number of layers for this run is", n_layers)
        for _ in range(n_layers):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 10, 1
# Create random Tensors to hold inputs and outputs (plain tensors -- the old
# Variable wrapper is no longer needed, and targets never need gradients).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)
# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# `size_average=False` is deprecated; reduction='sum' is the modern spelling.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(50):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)
    # Compute and print loss (`.data.item()` is legacy; `.item()` suffices)
    loss = criterion(y_pred, y)
    print(t, loss.item())
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
We will finish with an example on CIFAR10, highlighting the importance of applying transformations to your inputs. Example is lifted from:
https://github.com/uoguelph-mlrg/Cutout/blob/master/train.py
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Small LeNet-style CNN for 32x32 RGB CIFAR10 images (10 output logits)."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 120)
        self.fc3 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32 -> 28 (conv) -> 14 (pool)
        x = self.pool(F.relu(self.conv2(x)))  # 14 -> 10 (conv) -> 5 (pool)
        x = x.view(-1, 16 * 5 * 5)            # flatten for the linear layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)                       # raw logits (CrossEntropyLoss input)
        return x
We will experiment with data normalization and data augmentation.
def get_data(data_normalize=False, data_augment=False):
    """Build CIFAR10 train/test DataLoaders.

    data_normalize: append per-channel mean/std normalization to both splits.
    data_augment:   prepend random crop + horizontal flip to the train split.
    """
    train_transform = transforms.Compose([])
    test_transform = transforms.Compose([])
    if data_augment:
        train_transform.transforms.append(transforms.RandomCrop(32, padding=4))
        train_transform.transforms.append(transforms.RandomHorizontalFlip())
    # ToTensor must always run (outside the augment branch), or the pipeline
    # would emit PIL images when augmentation is off.
    train_transform.transforms.append(transforms.ToTensor())
    test_transform.transforms.append(transforms.ToTensor())
    if data_normalize:
        # CIFAR10 channel statistics, given in 0-255 units, rescaled to [0, 1].
        normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                         std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        train_transform.transforms.append(normalize)
        test_transform.transforms.append(normalize)
    train_dataset = datasets.CIFAR10(root='data/',
                                     train=True,
                                     transform=train_transform,
                                     download=True)
    test_dataset = datasets.CIFAR10(root='data/',
                                    train=False,
                                    transform=test_transform,
                                    download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=128,
                                               shuffle=True,
                                               num_workers=2)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=128,
                                              shuffle=False,
                                              num_workers=2)
    return train_loader, test_loader
def test(net, loader):
net.eval() # Change model to 'eval' mode (BN uses moving mean/var).
correct = 0.
total = 0.
for images, labels in loader:
with torch.no_grad():
pred = net(images)
pred = torch.max(pred.data, 1)[1]
total += labels.size(0)
correct += (pred == labels).sum().item()
val_acc = correct / total
net.train()
return val_acc
def train_model(train_loader, test_loader, epochs=5):
    """Train a fresh Net on `train_loader` for `epochs`.

    Returns (train_accs, test_accs): one accuracy per epoch each. Train
    accuracy is the running average over the epoch's batches; test accuracy
    comes from `test()` once per epoch.
    """
    net = Net()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    train_accs = []
    test_accs = []
    net.train()
    for epoch in range(epochs):
        print(epoch)
        xentropy_loss_avg = 0.
        correct = 0.
        total = 0.
        for i, (images, labels) in enumerate(train_loader):
            net.zero_grad()
            pred = net(images)
            xentropy_loss = criterion(pred, labels)
            xentropy_loss.backward()
            optimizer.step()
            xentropy_loss_avg += xentropy_loss.item()
            # Calculate running average of accuracy
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels.data).sum().item()
            # Renamed from `accuracy` to avoid shadowing the module-level helper.
            train_acc = correct / total
        test_acc = test(net, test_loader)
        print("Test acc: ", test_acc)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
# Experiment 1: raw pixels, no normalization.
train_loader, test_loader = get_data(data_augment=False, data_normalize=False)
train_accs, test_accs = train_model(train_loader, test_loader, epochs=3)
# Experiment 2: identical training, but with per-channel input normalization.
train_loader, test_loader = get_data(data_augment=False, data_normalize=True)
normalize_train_accs, normalize_test_accs = train_model(train_loader, test_loader, epochs=3)
# Plot train-accuracy curves for both runs.
fig, ax = plt.subplots()
epochs = 3
ax.plot(range(epochs), train_accs, c="blue", label="no input normalization")
ax.plot(range(epochs), normalize_train_accs, c="red", label="input normalization")
ax.legend()
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Train Accuracy")
fig.show()
# Plot test-accuracy curves for both runs.
fig, ax = plt.subplots()
epochs = 3
ax.plot(range(epochs), test_accs, c="blue", label="no input normalization")
ax.plot(range(epochs), normalize_test_accs, c="red", label="input normalization")
ax.legend()
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Test Accuracy")
fig.show()