# Core dependencies for the whole notebook.
import torch
import torchvision
from torch.autograd import Variable
import numpy as np
print(torch.__version__)
# Notebook shell escape: installs the ipdb debugger into the kernel's environment.
!pip install ipdb
import ipdb
PyTorch allows you to dynamically define computational graphs. This is done by operating on `Variable`s, which wrap PyTorch's `Tensor` objects. (Since PyTorch 0.4, `Variable` has been merged into `Tensor`: setting `requires_grad=True` on a tensor is enough, as the code below does.)
Here is an example, where we work with the function
# f(x) = x^2 + 2x + 6
def f(x):
    """Evaluate the quadratic f(x) = x**2 + 2*x + 6 (so f'(x) = 2*x + 2)."""
    return x ** 2 + 2 * x + 6
# Evaluate f at x = 4.0 and backprop to get df/dx.
np_x = np.array([4.0])
x = torch.from_numpy(np_x).requires_grad_(True)
y = f(x)
print(y)
y.backward()
x.grad  # bare expression: the notebook displays df/dx = 2*4 + 2 = 10
# Repeat at x = 5.0 -- the computation graph is rebuilt on each call to f.
np_x = np.array([5.0])
x = torch.from_numpy(np_x).requires_grad_(True)
y = f(x)
y.backward()
x.grad  # displays df/dx = 2*5 + 2 = 12
Unlike Tensorflow, we can define the graph on the fly. That is why it is more convenient to define a function in Python: we call the function as part of constructing the graph.
Let's now create a simple linear function for classifying MNIST digits. Material is lifted from: https://github.com/fastai/fastai_old/blob/master/dev_nb/001a_nn_basics.ipynb
import matplotlib.pyplot as plt
import math
%matplotlib inline
from torchvision import datasets, transforms
# Download MNIST once into ./data and reuse it for both splits (the original
# pointed the test split at '../data', scattering the download across two dirs).
mnist_train = datasets.MNIST('data', train=True, download=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST('data', train=False, download=True,
                            transform=transforms.ToTensor())
print(mnist_train)
i = 732  # try different indices
# Each dataset item is an (image_tensor, int_label) pair -- unpack it directly.
img, label = mnist_train[i]
print("Label: ", label)
plt.imshow(img.reshape((28,28)), cmap = plt.cm.gray)
plt.grid(None)
Pytorch's DataLoader is responsible for managing batches. You can create a DataLoader from any Dataset. DataLoader makes it easier to iterate over batches (it can shuffle and give you the next batch)
from torch.utils.data import DataLoader
train_dl = DataLoader(mnist_train, batch_size=100)
# DataLoader iterators no longer expose a `.next()` method (removed in
# PyTorch 1.x); use the builtin next() on the iterator instead.
images, labels = next(iter(train_dl))
# Tile the first batch into a 10-wide image grid, shaped (C, H, W).
viz = torchvision.utils.make_grid(images, nrow=10, padding = 2).numpy()
fig, ax = plt.subplots(figsize= (8,8))
ax.imshow(np.transpose(viz, (1,2,0)))  # matplotlib expects (H, W, C)
ax.grid(None)
Thanks to PyTorch's ability to calculate gradients automatically, we can use any standard Python function (or callable object) as a model! So let's just write a plain matrix multiplication and broadcasted addition to create a simple linear model. We also need an activation function, so we'll write log_softmax and use it. Remember: although PyTorch provides lots of pre-written loss functions, activation functions, and so forth, you can easily write your own using plain python. PyTorch will even create fast GPU or vectorized CPU code for your function automatically.
def log_softmax(x):
    """Log-softmax over the last dimension.

    Uses `logsumexp`, which subtracts the row max internally, so large logits
    cannot overflow `exp` the way the naive
    `x - x.exp().sum(-1).log().unsqueeze(-1)` formulation does.
    """
    return x - x.logsumexp(dim=-1, keepdim=True)
def model(xb, weights, bias):
    """Linear classifier: affine map of the batch followed by log-softmax."""
    logits = xb @ weights + bias
    return log_softmax(logits)
def nll(input, target):
    """Negative log-likelihood of `target` under log-probabilities `input`.

    `input` is a (batch, classes) tensor of log-probs; `target` holds the
    integer class index for each row.
    """
    picked = input[range(target.shape[0]), target]
    return -picked.mean()
def accuracy(out, yb):
    """Fraction of rows whose argmax over class scores equals the label."""
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()
    return hits.mean()
loss_func = nll
In the above, the '@' is syntactic sugar for the matrix multiply operation. We will call our function on one batch of data (in this case, 128 images). This is one forward pass. Note that our predictions won't be any better than random at this stage, since we start with random weights.
lr = 0.1           # SGD step size
epochs = 1
bs = 128           # training batch size
print_every = 100  # log accuracy every 100 batches
in_shape = 784     # 28*28 flattened pixels
out_shape = 10     # one score per digit class
train_dl = DataLoader(mnist_train, batch_size=bs)
test_dl = DataLoader(mnist_test, batch_size = 100)
# Initialize weights
# Scale by 1/sqrt(fan_in) so initial activations keep roughly unit variance.
weights = torch.randn(in_shape, out_shape) / math.sqrt(in_shape)
weights.requires_grad_()  # start tracking gradients only after the init math
bias = torch.zeros(out_shape, requires_grad=True)
# Manual training loop: forward, NLL loss, backward, hand-rolled SGD update.
# (Indentation reconstructed -- the exported source had lost it.)
for epoch in range(epochs):
    for i, (xb, yb) in enumerate(train_dl):
        xb = xb.view(xb.size(0), -1)  # flatten each image to a 784-vector
        # Evaluate training accuracy before the update
        if i % print_every == 0:
            print("Batch: ", i)
            print("Train acc on curr batch: ", accuracy(model(xb, weights, bias), yb).item())
        # Forward pass
        pred = model(xb, weights, bias)
        loss = loss_func(pred, yb)
        # Backward pass
        loss.backward()
        # Step the parameters without recording the ops, then clear the
        # accumulated gradients so they don't leak into the next batch.
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()
        # Evaluate training accuracy on the same batch after the update
        if i % print_every == 0:
            print("Train acc on curr batch (post-update): ", accuracy(model(xb, weights, bias), yb).item())
The above training loop is a bit clunky and error-prone, so we'll now introduce more built-in PyTorch functionality. We first introduce a helper function for evaluating neural networks.
def get_test_stat(model, dl, device):
    """Return (mean loss, mean accuracy) of `model` over every batch in `dl`.

    Relies on the module-level `loss_fn` and `accuracy`. Temporarily switches
    the model to eval mode and restores train mode before returning.
    """
    model.eval()
    cum_loss, cum_acc, n_seen = 0.0, 0.0, 0
    # Evaluation only -- no need to build the autograd graph.
    with torch.no_grad():
        for xb, yb in dl:
            xb = xb.to(device)
            yb = yb.to(device)
            xb = xb.view(xb.size(0), -1)
            y_pred = model(xb)
            # Weight each batch by its size so a short final batch is not
            # over-counted.
            cum_loss += loss_fn(y_pred, yb).item() * len(yb)
            cum_acc += accuracy(y_pred, yb).item() * len(yb)
            n_seen += len(yb)
    model.train()
    # Divide by the actual number of examples seen instead of the
    # hard-coded 10000, so the helper works for any dataset size.
    return cum_loss / n_seen, cum_acc / n_seen
learning_rate = 1e-2
epochs = 2
dim_x = 784    # flattened 28x28 input
dim_h = 100    # hidden-layer width
dim_out = 10   # one logit per digit class
# Two-layer fully connected network: 784 -> 100 -> ReLU -> 10.
model = torch.nn.Sequential(
torch.nn.Linear(dim_x, dim_h),
torch.nn.ReLU(),
torch.nn.Linear(dim_h, dim_out),
)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# create datasets and data loader
# Use the same 'data' root for both splits (the original pointed the test
# split at '../data', duplicating the download in a second directory).
mnist_train = datasets.MNIST('data', train=True, download=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST('data', train=False, download=True,
                            transform=transforms.ToTensor())
train_dl = DataLoader(mnist_train, batch_size=bs)
test_dl = DataLoader(mnist_test, batch_size=100)
# Using GPUs in PyTorch is pretty straightforward
use_cuda = torch.cuda.is_available()  # defined on both paths (was CUDA-only before)
if use_cuda:
    print("Using cuda")
    device = torch.device("cuda")
else:
    # Use a torch.device here too, matching the CUDA branch (it was a bare
    # string before, which works but is inconsistent).
    device = torch.device("cpu")
model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()  # takes raw logits plus integer labels
model.train()
# Training loop using nn.Module + optimizer instead of manual updates.
# (Indentation reconstructed -- the exported source had lost it.)
for epoch in range(epochs):
    print(epoch)
    for i, (xb, yb) in enumerate(train_dl):
        xb = xb.to(device)
        yb = yb.to(device)
        xb = xb.view(xb.size(0), -1)  # flatten each image to a 784-vector
        # Forward pass
        y_pred = model(xb)
        loss = loss_fn(y_pred, yb)
        # Backward pass
        model.zero_grad() # Zero out the previous gradient computation
        loss.backward() # Compute the gradient
        optimizer.step() # Use the gradient information to make a step
    # Evaluate on the held-out set once per epoch.
    test_loss, test_acc = get_test_stat(model, test_dl, device)
    print("Test loss: {} Test acc: {}".format(test_loss, test_acc))
# We can access the weights of our neural network
model[0].weight  # bare expression: displayed by the notebook
# We can look at the weights going into hidden unit index 10
# (the eleventh unit, counting from zero)
weights_ten = model[0].weight.data.cpu().numpy()[10, :]
print(weights_ten.shape)
fig, ax = plt.subplots()
ax.imshow(weights_ten.reshape((28,28)), cmap=plt.cm.coolwarm)
ax.grid(None)
To showcase the power of PyTorch dynamic graphs, we will implement a very strange model: a fully-connected ReLU network that on each forward pass randomly chooses a number between 1 and 4 and has that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.
By Justin Johnson https://github.com/jcjohnson/pytorch-examples/blob/master/nn/dynamic_net.py
import random
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x, verbose=False):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use
        normal Python control-flow operators like loops or conditional statements
        when defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from
        Lua Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)  # clamp(min=0) == ReLU
        n_layers = random.randint(0, 3)
        if verbose:
            print("The number of layers for this run is", n_layers)
        for _ in range(n_layers):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 10, 1
# Create random Tensors to hold inputs and outputs (plain tensors -- the old
# Variable wrapper is no longer needed, and targets never need gradients).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)
# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# `size_average=False` is deprecated; reduction='sum' is the modern spelling.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(50):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)
    # Compute and print loss (`.data.item()` is legacy; `.item()` suffices)
    loss = criterion(y_pred, y)
    print(t, loss.item())
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
We will finish with an example on CIFAR10, highlighting the importance of applying transformations to your inputs. Example is lifted from:
https://github.com/uoguelph-mlrg/Cutout/blob/master/train.py
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Small LeNet-style CNN for 32x32 RGB CIFAR10 images (10 output logits)."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 120)
        self.fc3 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32 -> 28 (conv) -> 14 (pool)
        x = self.pool(F.relu(self.conv2(x)))  # 14 -> 10 (conv) -> 5 (pool)
        x = x.view(-1, 16 * 5 * 5)            # flatten for the linear layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)                       # raw logits (CrossEntropyLoss input)
        return x
We will experiment with data normalization and data augmentation.
def get_data(data_normalize=False, data_augment=False):
    """Build CIFAR10 train/test DataLoaders.

    data_normalize: append per-channel mean/std normalization to both splits.
    data_augment:   prepend random crop + horizontal flip to the train split.
    """
    train_transform = transforms.Compose([])
    test_transform = transforms.Compose([])
    if data_augment:
        train_transform.transforms.append(transforms.RandomCrop(32, padding=4))
        train_transform.transforms.append(transforms.RandomHorizontalFlip())
    # ToTensor must always run (outside the augment branch), or the pipeline
    # would emit PIL images when augmentation is off.
    train_transform.transforms.append(transforms.ToTensor())
    test_transform.transforms.append(transforms.ToTensor())
    if data_normalize:
        # CIFAR10 channel statistics, given in 0-255 units, rescaled to [0, 1].
        normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                         std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        train_transform.transforms.append(normalize)
        test_transform.transforms.append(normalize)
    train_dataset = datasets.CIFAR10(root='data/',
                                     train=True,
                                     transform=train_transform,
                                     download=True)
    test_dataset = datasets.CIFAR10(root='data/',
                                    train=False,
                                    transform=test_transform,
                                    download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=128,
                                               shuffle=True,
                                               num_workers=2)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=128,
                                              shuffle=False,
                                              num_workers=2)
    return train_loader, test_loader
def test(net, loader):
net.eval() # Change model to 'eval' mode (BN uses moving mean/var).
correct = 0.
total = 0.
for images, labels in loader:
with torch.no_grad():
pred = net(images)
pred = torch.max(pred.data, 1)[1]
total += labels.size(0)
correct += (pred == labels).sum().item()
val_acc = correct / total
net.train()
return val_acc
def train_model(train_loader, test_loader, epochs=5):
    """Train a fresh Net on `train_loader` for `epochs`.

    Returns (train_accs, test_accs): one accuracy per epoch each. Train
    accuracy is the running average over the epoch's batches; test accuracy
    comes from `test()` once per epoch.
    """
    net = Net()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    train_accs = []
    test_accs = []
    net.train()
    for epoch in range(epochs):
        print(epoch)
        xentropy_loss_avg = 0.
        correct = 0.
        total = 0.
        for i, (images, labels) in enumerate(train_loader):
            net.zero_grad()
            pred = net(images)
            xentropy_loss = criterion(pred, labels)
            xentropy_loss.backward()
            optimizer.step()
            xentropy_loss_avg += xentropy_loss.item()
            # Calculate running average of accuracy
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels.data).sum().item()
            # Renamed from `accuracy` to avoid shadowing the module-level helper.
            train_acc = correct / total
        test_acc = test(net, test_loader)
        print("Test acc: ", test_acc)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
# Experiment 1: raw pixels, no normalization.
train_loader, test_loader = get_data(data_augment=False, data_normalize=False)
train_accs, test_accs = train_model(train_loader, test_loader, epochs=3)
# Experiment 2: identical training, but with per-channel input normalization.
train_loader, test_loader = get_data(data_augment=False, data_normalize=True)
normalize_train_accs, normalize_test_accs = train_model(train_loader, test_loader, epochs=3)
# Plot train-accuracy curves for both runs.
fig, ax = plt.subplots()
epochs = 3
ax.plot(range(epochs), train_accs, c="blue", label="no input normalization")
ax.plot(range(epochs), normalize_train_accs, c="red", label="input normalization")
ax.legend()
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Train Accuracy")
fig.show()
# Plot test-accuracy curves for both runs.
fig, ax = plt.subplots()
epochs = 3
ax.plot(range(epochs), test_accs, c="blue", label="no input normalization")
ax.plot(range(epochs), normalize_test_accs, c="red", label="input normalization")
ax.legend()
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Test Accuracy")
fig.show()