Calculate uncertainty with dropout-based methods on one-dimensional source and target.
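For reference, with dropout kept active at test time, $T$ stochastic forward passes produce logits $z_t$ and log-scales $s_t$; the quantities computed below (mirroring the `calc_uncertainty` function defined later) are

$$\hat{p} = \frac{1}{T}\sum_{t=1}^{T}\sigma(z_t), \qquad \text{epistemic} = \operatorname{Var}_t\!\left[\sigma(z_t)\right], \qquad \text{aleatory} = \frac{1}{T}\sum_{t=1}^{T} e^{s_t},$$

$$\text{entropy} = -\hat{p}\log_2 \hat{p} - (1-\hat{p})\log_2(1-\hat{p}).$$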
from typing import Callable, List, Optional, Tuple, Union
from glob import glob
import os
import random
import sys
import warnings
gpu_id = 1
os.environ["CUDA_VISIBLE_DEVICES"] = f'{gpu_id}'
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
Support in-notebook plotting
%matplotlib inline
Report versions
print('numpy version: {}'.format(np.__version__))
from matplotlib import __version__ as mplver
print('matplotlib version: {}'.format(mplver))
print(f'pytorch version: {torch.__version__}')
numpy version: 1.17.2
matplotlib version: 3.1.3
pytorch version: 1.5.0
pv = sys.version_info
print('python version: {}.{}.{}'.format(pv.major, pv.minor, pv.micro))
python version: 3.7.7
Reload packages whose content has changed (useful for package development)
%load_ext autoreload
%autoreload 2
Check GPU(s)
!nvidia-smi | head -n 4
Sat Jun 20 15:38:32 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 430.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
Set seeds for better reproducibility
seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
Make up some toy data for our classification experiments, i.e., a mixture of two Gaussians.
class ToyClassificationData(Dataset):
    def __init__(self, n:int, x_locs:Tuple[float,float]=(-1.,1.),
                 noise_std:float=0.1, dist:Callable=torch.distributions.Normal):
        self.n = n
        self.x_locs = x_locs
        self.noise_std = noise_std
        self.x0_dist = dist(x_locs[0], noise_std)
        self.x1_dist = dist(x_locs[1], noise_std)
        x0 = self.x0_dist.sample((n//2,1))
        y0 = torch.zeros((n//2,1))
        x1 = self.x1_dist.sample((n//2,1))
        y1 = torch.ones((n//2,1))
        self.x = torch.cat((x0, x1))
        self.y = torch.cat((y0, y1))

    def __len__(self):
        return self.x.size(0)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])
dim = 1
N = 2**10
train_dist = torch.distributions.Normal
valid_dist = torch.distributions.Normal
xt_locs, xv_locs = (-1.,1.), (-1., 1.)
train_std, valid_std = 0.5, 0.5
print(f'N = {N}')
train_ds_params = dict(x_locs=xt_locs, noise_std=train_std, dist=train_dist)
valid_ds_params = dict(x_locs=xv_locs, noise_std=valid_std, dist=valid_dist)
train_ds = ToyClassificationData(N, **train_ds_params)
valid_ds = ToyClassificationData(N//2, **valid_ds_params)
N = 1024
plot_data = N <= 2**12 and dim == 1
if plot_data:
    ix0 = train_ds.y[:,0] == 0
    ix1 = train_ds.y[:,0] == 1
    f, ax = plt.subplots(1,1)
    ax.hist(train_ds.x[ix0,0], bins=20, label='0', alpha=0.7)
    ax.hist(train_ds.x[ix1,0], bins=20, label='1', alpha=0.7)
    ax.legend()
    ax.set_title('Training data');
activation = nn.ReLU
class UncertainLinear(nn.Module):
    def __init__(self, dim:int, mid:int, n_layers:int=1, dropout_rate:float=0.2,
                 mid_bias:bool=True, out_bias:bool=False):
        super().__init__()
        self.p = dropout_rate
        layers = [nn.Sequential(nn.Linear(dim, mid, bias=mid_bias), activation())]
        for _ in range(n_layers-1):
            layers.append(nn.Sequential(nn.Linear(mid,mid,bias=mid_bias), activation()))
        self.m = nn.ModuleList(layers)
        self.out = nn.Linear(mid, 1, bias=out_bias)  # logit head
        self.var = nn.Linear(mid, 1, bias=out_bias)  # log-scale (aleatory) head

    def forward(self, x:torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        for m in self.m:
            # training=True keeps dropout active at inference time (MC dropout)
            x = F.dropout(m(x), p=self.p, training=True)
        return self.out(x), self.var(x)
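Note that `forward` calls `F.dropout` with `training=True`, so dropout stays active even under `model.eval()`; the disagreement between repeated passes is what drives the epistemic estimate later. A quick sanity check (a throwaway model with made-up sizes, just for illustration):
# sanity check: dropout stays stochastic even under eval()
_m = UncertainLinear(dim=1, mid=64)
_m.eval()
_x = torch.full((4, 1), 0.5)
_o1, _ = _m(_x)
_o2, _ = _m(_x)
assert not torch.allclose(_o1, _o2)  # MC-dropout passes (almost surely) differ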
class BCELoss(nn.Module):
    """ use just BCE loss with the UncertainLinear network (ignores the variance head) """
    def forward(self, out:Tuple[torch.Tensor,torch.Tensor], y:torch.Tensor) -> torch.Tensor:
        logit, _ = out
        loss = F.binary_cross_entropy_with_logits(logit, y)
        return loss
class ExtendedBCELoss(nn.Module):
    """ use modified BCE loss for variance estimation with the UncertainLinear network """
    def forward(self, out:Tuple[torch.Tensor,torch.Tensor], y:torch.Tensor, n_samp:int=10) -> torch.Tensor:
        logit, sigma = out
        # corrupt the logits with noise scaled by the predicted log-scale
        dist = torch.distributions.Normal(logit, torch.exp(sigma))
        mc_logs = dist.rsample((n_samp,))
        loss = 0.
        for mc_log in mc_logs:
            loss += F.binary_cross_entropy_with_logits(mc_log, y)
        loss /= n_samp
        return loss
def calc_uncertainty(logits:torch.Tensor, sigmas:torch.Tensor, eps:float=1e-6) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """ calculate epistemic, entropy, and aleatory uncertainty quantities """
    probits = torch.sigmoid(logits)
    epistemic = probits.var(dim=0, unbiased=True)  # variance over MC samples
    probit = probits.mean(dim=0)
    entropy = -1 * (probit * (probit + eps).log2() + ((1 - probit) * (1 - probit + eps).log2()))
    aleatory = torch.exp(sigmas).mean(dim=0)       # mean predicted noise scale
    return epistemic, entropy, aleatory
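For orientation, `calc_uncertainty` expects its inputs stacked along a leading MC-sample dimension, i.e., shape `(n_samp, batch, 1)`, and reduces over that dimension. A hypothetical shape check:
# hypothetical shape check: inputs are (n_samp, batch, 1), outputs (batch, 1)
_logits = torch.randn(25, 16, 1)
_sigmas = torch.randn(25, 16, 1)
_ep, _en, _al = calc_uncertainty(_logits, _sigmas)
assert _ep.shape == _en.shape == _al.shape == (16, 1)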
def predict(model, x:torch.Tensor, n_samp:int=25):
    """ return predictions of the model and the predicted model uncertainties """
    out = [model.forward(x) for _ in range(n_samp)]
    logits = torch.stack([o[0] for o in out]).detach().cpu()
    sigmas = torch.stack([o[1] for o in out]).detach().cpu()
    ep, en, al = calc_uncertainty(logits, sigmas)
    return (torch.mean(logits, dim=0), torch.mean(sigmas, dim=0), ep, en, al)
def get_metrics(model, x:torch.Tensor, y:torch.Tensor, n_samp:int, eps:float):
    """ get uncertainties and other metrics during training for analysis """
    state = model.training
    model.eval()
    with torch.no_grad():
        out, s, ep, en, al = predict(model, x, n_samp)
        loss = criterion((out, s), y.detach().cpu())  # uses the global criterion
        sb = ep / (al + eps)  # scibilic uncertainty
        eu, nu, au = ep.numpy().mean(), en.numpy().mean(), al.numpy().mean()
        su = sb.numpy().mean()
    model.train(state)
    return loss, (out, ep, en, al, sb), eu, nu, au, su
Set training and model hyperparameters
# system setup
use_jit = False
load_model = False
# logging setup
log_rate = 100 # print losses and uncertainty measures every log_rate epoch
unc_rate = 5 # get uncertainties in training every unc_rate iteration
version = 'toy_classification_v1' # naming scheme of model to load
# model, optimizer, loss, and training parameters
batch_size = 64
n_jobs = 0
n_epochs = 200
n_samp = 5 # number of samples to take in validation
model_kwargs = dict(dim=dim, mid=1024, n_layers=1, dropout_rate=0.25, mid_bias=True, out_bias=False)
use_adam = True
optim_kwargs = dict(lr=0.001, betas=(0.8,0.99), weight_decay=1e-6) if use_adam else \
dict(lr=0.001, momentum=0., weight_decay=1e-6)
use_scheduler = False
scheduler_kwargs = dict(step_size=1000, gamma=0.5)
extended = True # use modified loss for variance estimation
eps = 1e-6 # add this value to denominator in scibilic uncertainty calc
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
num_workers=n_jobs, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size,
num_workers=n_jobs, pin_memory=True)
assert torch.cuda.is_available()
device = torch.device('cuda')
torch.backends.cudnn.benchmark = True
model = UncertainLinear(**model_kwargs)
if use_jit: model = torch.jit.script(model)
def num_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters in model: {num_params(model)}')
Number of trainable parameters in model: 4096
if load_model:
    if use_jit:
        model = torch.jit.load(f'trained_jit_{version}.pth')
    else:
        state_dict = torch.load(f'trained_{version}.pth')
        model.load_state_dict(state_dict);
model.cuda(device=device)
optim_cls = torch.optim.AdamW if use_adam else torch.optim.SGD
optimizer = optim_cls(model.parameters(), **optim_kwargs)
if use_scheduler:
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_kwargs)
criterion = ExtendedBCELoss() if extended else BCELoss()
train_losses, valid_losses = [], []
t_ep_unc, t_en_unc, t_al_unc, t_sb_unc = [], [], [], []
v_ep_unc, v_en_unc, v_al_unc, v_sb_unc = [], [], [], []
n_batches = len(train_loader)
for t in range(1, n_epochs+1):
    # training
    t_losses, t_ep, t_en, t_al, t_sb = [], [], [], [], []
    model.train()
    for i, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        t_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if i % unc_rate == 0:
            _, _, ep, en, al, sb = get_metrics(model, x, y, n_samp, eps)
            t_ep.append(ep); t_en.append(en)
            t_al.append(al); t_sb.append(sb)
    train_losses.append(t_losses)
    t_ep_unc.append(t_ep); t_en_unc.append(t_en)
    t_al_unc.append(t_al); t_sb_unc.append(t_sb)
    # validation
    v_losses, v_ep, v_en, v_al, v_sb = [], [], [], [], []
    model.eval()
    with torch.no_grad():
        for i, (x, y) in enumerate(valid_loader):
            x = x.to(device)
            loss, out, ep, en, al, sb = get_metrics(model, x, y, n_samp, eps)
            v_losses.append(loss.item())
            v_ep.append(ep); v_en.append(en)
            v_al.append(al); v_sb.append(sb)
    valid_losses.append(v_losses)
    v_ep_unc.append(v_ep); v_en_unc.append(v_en)
    v_al_unc.append(v_al); v_sb_unc.append(v_sb)
    if not np.all(np.isfinite(t_losses)):
        raise RuntimeError('NaN or Inf in training loss, cannot recover. Exiting.')
    if t % log_rate == 0:
        log = (f'Epoch: {t} - TL: {np.mean(t_losses):.2e}, VL: {np.mean(v_losses):.2e}, '
               f'tEU: {np.mean(t_ep):.2e}, vEU: {np.mean(v_ep):.2e}, '
               f'tAU: {np.mean(t_al):.2e}, vAU: {np.mean(v_al):.2e}')
        print(log)
    if use_scheduler: scheduler.step()
Epoch: 100 - TL: 6.64e-02, VL: 4.52e-02, tEU: 1.02e-04, vEU: 1.25e-04, tAU: 3.35e-03, vAU: 2.84e-03
Epoch: 200 - TL: 6.55e-02, VL: 4.36e-02, tEU: 1.22e-04, vEU: 9.21e-05, tAU: 2.05e-03, vAU: 2.06e-03
Create tidy datasets from training and validation losses for plotting purposes.
def tidy_losses(train:List[List[float]], valid:List[List[float]]) -> pd.DataFrame:
    out = {'epoch': [], 'type': [], 'value': [], 'phase': []}
    for i, (tl,vl) in enumerate(zip(train,valid),1):
        for tli in tl:
            out['epoch'].append(i)
            out['type'].append('loss')
            out['value'].append(tli)
            out['phase'].append('train')
        for vli in vl:
            out['epoch'].append(i)
            out['type'].append('loss')
            out['value'].append(vli)
            out['phase'].append('valid')
    return pd.DataFrame(out)
def tidy_uncertainty(ep:Tuple[List,List], al:Tuple[List,List], sb:Tuple[List,List]) -> pd.DataFrame:
    """ each argument is a (train, valid) pair of per-epoch lists of uncertainty values """
    out = {'epoch': [], 'type': [], 'value': [], 'phase': []}
    for i, (epi, ali, sbi) in enumerate(zip(ep, al, sb)):
        phase = 'train' if i == 0 else 'valid'
        for j, (epij,alij,sbij) in enumerate(zip(epi,ali,sbi),1):
            for epijk in epij:
                out['epoch'].append(j)
                out['type'].append('epistemic')
                out['value'].append(epijk)
                out['phase'].append(phase)
            for alijk in alij:
                out['epoch'].append(j)
                out['type'].append('aleatory')
                out['value'].append(alijk)
                out['phase'].append(phase)
    return pd.DataFrame(out)
losses = tidy_losses(train_losses, valid_losses)
uncert = tidy_uncertainty((t_ep_unc, v_ep_unc),
(t_al_unc, v_al_unc),
(t_sb_unc, v_sb_unc))
# reuse `version` (set above) as the naming scheme for saved artifacts
load_tidy = False
if load_tidy:
    losses = pd.read_csv(f'losses_{version}.csv')
    uncert = pd.read_csv(f'uncert_{version}.csv')
f, ax1 = plt.subplots(1,1,figsize=(12, 8),sharey=True)
sns.lineplot(x='epoch',y='value',hue='phase',data=losses,ci='sd',ax=ax1,lw=3);
ax1.set_title('Losses');
ax1.set_ylim((0.,0.1))
f.savefig(f'losses_{version}.pdf')
The validation output is averaged over `n_samp = 5` MC samples before the loss is computed, which is why it is lower than the training loss (which is computed per forward pass, without averaging).
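This is Jensen's inequality at work: BCE-with-logits is convex in the logit, so the loss of the averaged logit is bounded above by the average of the per-sample losses. A minimal numeric sketch with made-up logits:
# Jensen's inequality: loss of the mean logit <= mean of per-sample losses
_y = torch.ones(1)
_zs = torch.tensor([3.0, -1.0])  # two hypothetical MC-dropout logits for one example
_per_sample = torch.stack([F.binary_cross_entropy_with_logits(z.view(1), _y) for z in _zs]).mean()
_averaged = F.binary_cross_entropy_with_logits(_zs.mean().view(1), _y)
assert _averaged <= _per_sample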
use_log = False
save_plots = False
f, ax1 = plt.subplots(1,1,figsize=(12,8),sharey=True)
if use_log: ax1.set(yscale='log')
sns.lineplot(x='epoch',y='value',hue='type',style='phase',ci='sd',data=uncert,ax=ax1,lw=3);
ax1.set_title('Uncertainty');
ax1.set_ylim((-0.01, 0.2))
if save_plots: f.savefig(f'uncert_{version}.pdf')
It is not immediately clear to me why the aleatory uncertainty jumps around so much. My guess is that the loss function is fairly sensitive because of the trade-off between the standard BCE term and the variance term.
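For reference, the loss that `ExtendedBCELoss` implements (a learned loss-attenuation construction in the style of Kendall and Gal) is

$$\mathcal{L} = \frac{1}{T}\sum_{t=1}^{T} \text{BCE}\!\left(z + \epsilon_t\, e^{s},\; y\right), \qquad \epsilon_t \sim \mathcal{N}(0, 1),$$

where $z$ and $s$ are the outputs of the logit and log-scale heads and $T$ is `n_samp`. The same $s$ head both feeds the aleatory estimate and perturbs every MC sample of the loss, so small changes to it move both terms at once.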
save_tidy = False
if save_tidy:
    losses.to_csv(f'losses_{version}.csv')
    uncert.to_csv(f'uncert_{version}.csv')
save_model = False
if save_model:
    if use_jit:
        torch.jit.save(model, f'trained_jit_{version}.pth')
    else:
        torch.save(model.state_dict(), f'trained_{version}.pth')
xt, yt = train_ds.x, train_ds.y
xv, yv = valid_ds.x, valid_ds.y
xtn, ytn = xt.numpy(), yt.numpy()
xvn, yvn = xv.numpy(), yv.numpy()
def list_to_np(lst:List[torch.Tensor]) -> List[np.ndarray]:
    return list(map(lambda x: x.detach().cpu().numpy(), lst))
model.eval()
with torch.no_grad():
    logittn, sigmatn, etn, ntn, atn = list_to_np(predict(model, xt.to(device), 50))
    logitvn, sigmavn, evn, nvn, avn = list_to_np(predict(model, xv.to(device), 50))
fig,(ax1,ax2,ax3,ax4) = plt.subplots(1,4,figsize=(15,5),sharex=True,sharey=True)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ax1.hist(train_ds.x[train_ds.y[:,0] == 0,0], bins=20, label='0', alpha=0.7, density=True)
    ax1.hist(train_ds.x[train_ds.y[:,0] == 1,0], bins=20, label='1', alpha=0.7, density=True)
    ax2.hist(valid_ds.x[valid_ds.y[:,0] == 0,0], bins=20, label='0', alpha=0.7, density=True)
    ax2.hist(valid_ds.x[valid_ds.y[:,0] == 1,0], bins=20, label='1', alpha=0.7, density=True)
    ax3.hist(train_ds.x[logittn[:,0] < 0.,0], bins=20, label='0', alpha=0.7, density=True)
    ax3.hist(train_ds.x[logittn[:,0] >= 0.,0], bins=20, label='1', alpha=0.7, density=True)
    ax4.hist(valid_ds.x[logitvn[:,0] < 0.,0], bins=20, label='0', alpha=0.7, density=True)
    ax4.hist(valid_ds.x[logitvn[:,0] >= 0.,0], bins=20, label='1', alpha=0.7, density=True)
ax4.legend()
ax1.set_title('Training data');
ax2.set_title('Validation data');
ax3.set_title('Predicted training');
ax4.set_title('Predicted validation');
if plot_data:
    fig, (ax1,ax2,ax3) = plt.subplots(1,3,figsize=(16,6))
    x_start = 2.
    x_range = (-x_start, x_start)
    xs = torch.linspace(*x_range, 100).unsqueeze(1)
    with torch.no_grad():
        ys, _, ep, en, al = list_to_np(predict(model, xs.to(device), 1000))
    x, y, ep, en, al = xs[:,0], ys[:,0], ep[:,0], en[:,0], al[:,0]
    sc = ep / (al + eps)  # scibilic uncertainty
    ixs0 = train_ds.y[:,0] == 0
    ixs1 = train_ds.y[:,0] == 1
    pal = sns.color_palette()
    ax1_2 = ax1.twinx()
    ax1_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax1_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax1.plot(x, ep, label='Ep.', lw=3, color=pal[0])
    ax1.set_title('Epistemic')
    ax1.set_ylim((0.,None))
    ax2_2 = ax2.twinx()
    ax2_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax2_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax2.plot(x, al, label='Al.', lw=3, color=pal[1])
    ax2.set_title('Aleatory')
    ax2.set_ylim((0.,None))
    ax3_2 = ax3.twinx()
    ax3_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax3_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax3.plot(x, en, label='En.', lw=3, color=pal[2])
    ax3.set_title('Entropy')
    ax3.set_ylim((0.,None))
    fig.tight_layout()
We can see the epistemic and aleatory uncertainty (and entropy) plotted over histograms of the true data. Epistemic uncertainty is highly correlated with aleatory uncertainty and entropy (although the aleatory curve is wider). In fact, this behavior is expected under these circumstances. Because we are fitting a hyperplane to the data, the output probability does not return to 0.5 far to the right or left of the in-distribution data. Aleatory uncertainty is high in the overlap region because high predicted noise there lowers the loss.
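As a sanity check on where the uncertainty should be high: for this equal-weight, equal-variance two-Gaussian mixture, the Bayes posterior has a closed form, so the true label entropy can be computed directly (a minimal sketch using the notebook's own μ = ∓1, σ = 0.5 settings):
# closed-form Bayes posterior for the equal-weight two-Gaussian mixture:
# p(y=1|x) = sigmoid(log N(x; 1, 0.5) - log N(x; -1, 0.5))
_xs = torch.linspace(-2., 2., 100)
_d0 = torch.distributions.Normal(-1., 0.5)
_d1 = torch.distributions.Normal(1., 0.5)
_p1 = torch.sigmoid(_d1.log_prob(_xs) - _d0.log_prob(_xs))
_true_entropy = -(_p1 * _p1.log2() + (1 - _p1) * (1 - _p1).log2())
# _true_entropy peaks at x = 0 and decays toward zero away from the overlap,
# matching where the learned aleatory uncertainty and entropy are high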
This example is too trivial to show useful differences between epistemic and aleatory uncertainty for classification tasks, but it may help build intuition for what is going on, as well as serve as a reference for implementation.