Calculate uncertainty with dropout-based methods on one-dimensional source and target.
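For reference, with dropout kept active at test time, $T$ stochastic forward passes produce logits $z_t$ and log-scales $s_t$; the quantities computed below (mirroring the `calc_uncertainty` function defined later) are

$$\hat{p} = \frac{1}{T}\sum_{t=1}^{T}\sigma(z_t), \qquad \text{epistemic} = \operatorname{Var}_t\!\left[\sigma(z_t)\right], \qquad \text{aleatory} = \frac{1}{T}\sum_{t=1}^{T} e^{s_t},$$

$$\text{entropy} = -\hat{p}\log_2 \hat{p} - (1-\hat{p})\log_2(1-\hat{p}).$$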
from typing import Callable, List, Optional, Tuple, Union
from glob import glob
import os
import random
import sys
import warnings
gpu_id = 1
os.environ["CUDA_VISIBLE_DEVICES"] = f'{gpu_id}'
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
Support in-notebook plotting
%matplotlib inline
Report versions
print('numpy version: {}'.format(np.__version__))
from matplotlib import __version__ as mplver
print('matplotlib version: {}'.format(mplver))
print(f'pytorch version: {torch.__version__}')
numpy version: 1.17.2
matplotlib version: 3.1.3
pytorch version: 1.5.0
pv = sys.version_info
print('python version: {}.{}.{}'.format(pv.major, pv.minor, pv.micro))
python version: 3.7.7
Reload packages whose content has changed (useful for package development)
%load_ext autoreload
%autoreload 2
Check GPU(s)
!nvidia-smi | head -n 4
Sat Jun 20 15:38:32 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 430.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
Set seeds for better reproducibility
seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
Make up some toy data for our classification experiments, i.e., a mixture of two Gaussians.
class ToyClassificationData(Dataset):
    def __init__(self, n:int, x_locs:Tuple[float,float]=(-1.,1.),
                 noise_std:float=0.1, dist:Callable=torch.distributions.Normal):
        self.n = n
        self.x_locs = x_locs
        self.noise_std = noise_std
        self.x0_dist = dist(x_locs[0], noise_std)
        self.x1_dist = dist(x_locs[1], noise_std)
        x0 = self.x0_dist.sample((n//2,1))
        y0 = torch.zeros((n//2,1))
        x1 = self.x1_dist.sample((n//2,1))
        y1 = torch.ones((n//2,1))
        self.x = torch.cat((x0, x1))
        self.y = torch.cat((y0, y1))

    def __len__(self):
        return self.x.size(0)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])
dim = 1
N = 2**10
train_dist = torch.distributions.Normal
valid_dist = torch.distributions.Normal
xt_locs, xv_locs = (-1.,1.), (-1., 1.)
train_std, valid_std = 0.5, 0.5
print(f'N = {N}')
train_ds_params = dict(x_locs=xt_locs, noise_std=train_std, dist=train_dist)
valid_ds_params = dict(x_locs=xv_locs, noise_std=valid_std, dist=valid_dist)
train_ds = ToyClassificationData(N, **train_ds_params)
valid_ds = ToyClassificationData(N//2, **valid_ds_params)
N = 1024
plot_data = N <= 2**12 and dim == 1
if plot_data:
    ix0 = train_ds.y[:,0] == 0
    ix1 = train_ds.y[:,0] == 1
    f, ax = plt.subplots(1,1)
    ax.hist(train_ds.x[ix0,0], bins=20, label='0', alpha=0.7)
    ax.hist(train_ds.x[ix1,0], bins=20, label='1', alpha=0.7)
    ax.legend()
    ax.set_title('Training data');
activation = nn.ReLU
class UncertainLinear(nn.Module):
    def __init__(self, dim:int, mid:int, n_layers:int=1, dropout_rate:float=0.2,
                 mid_bias:bool=True, out_bias:bool=False):
        super().__init__()
        self.p = dropout_rate
        layers = [nn.Sequential(nn.Linear(dim, mid, bias=mid_bias), activation())]
        for _ in range(n_layers-1):
            layers.append(nn.Sequential(nn.Linear(mid,mid,bias=mid_bias), activation()))
        self.m = nn.ModuleList(layers)
        self.out = nn.Linear(mid, 1, bias=out_bias)  # logit head
        self.var = nn.Linear(mid, 1, bias=out_bias)  # log-scale (aleatory) head

    def forward(self, x:torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        for m in self.m:
            # training=True keeps dropout active at inference time (MC dropout)
            x = F.dropout(m(x), p=self.p, training=True)
        return self.out(x), self.var(x)
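Note that `forward` calls `F.dropout` with `training=True`, so dropout stays active even under `model.eval()`; the disagreement between repeated passes is what drives the epistemic estimate later. A quick sanity check (a throwaway model with made-up sizes, just for illustration):
# sanity check: dropout stays stochastic even under eval()
_m = UncertainLinear(dim=1, mid=64)
_m.eval()
_x = torch.full((4, 1), 0.5)
_o1, _ = _m(_x)
_o2, _ = _m(_x)
assert not torch.allclose(_o1, _o2)  # MC-dropout passes (almost surely) differ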
class BCELoss(nn.Module):
    """ use just BCE loss with the UncertainLinear network (ignores the variance head) """
    def forward(self, out:Tuple[torch.Tensor,torch.Tensor], y:torch.Tensor) -> torch.Tensor:
        logit, _ = out
        loss = F.binary_cross_entropy_with_logits(logit, y)
        return loss
class ExtendedBCELoss(nn.Module):
    """ use modified BCE loss for variance estimation with the UncertainLinear network """
    def forward(self, out:Tuple[torch.Tensor,torch.Tensor], y:torch.Tensor, n_samp:int=10) -> torch.Tensor:
        logit, sigma = out
        # corrupt the logits with noise scaled by the predicted log-scale
        dist = torch.distributions.Normal(logit, torch.exp(sigma))
        mc_logs = dist.rsample((n_samp,))
        loss = 0.
        for mc_log in mc_logs:
            loss += F.binary_cross_entropy_with_logits(mc_log, y)
        loss /= n_samp
        return loss
def calc_uncertainty(logits:torch.Tensor, sigmas:torch.Tensor, eps:float=1e-6) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """ calculate epistemic, entropy, and aleatory uncertainty quantities """
    probits = torch.sigmoid(logits)
    epistemic = probits.var(dim=0, unbiased=True)  # variance over MC samples
    probit = probits.mean(dim=0)
    entropy = -1 * (probit * (probit + eps).log2() + ((1 - probit) * (1 - probit + eps).log2()))
    aleatory = torch.exp(sigmas).mean(dim=0)       # mean predicted noise scale
    return epistemic, entropy, aleatory
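For orientation, `calc_uncertainty` expects its inputs stacked along a leading MC-sample dimension, i.e., shape `(n_samp, batch, 1)`, and reduces over that dimension. A hypothetical shape check:
# hypothetical shape check: inputs are (n_samp, batch, 1), outputs (batch, 1)
_logits = torch.randn(25, 16, 1)
_sigmas = torch.randn(25, 16, 1)
_ep, _en, _al = calc_uncertainty(_logits, _sigmas)
assert _ep.shape == _en.shape == _al.shape == (16, 1)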
def predict(model, x:torch.Tensor, n_samp:int=25):
    """ return predictions of the model and the predicted model uncertainties """
    out = [model.forward(x) for _ in range(n_samp)]
    logits = torch.stack([o[0] for o in out]).detach().cpu()
    sigmas = torch.stack([o[1] for o in out]).detach().cpu()
    ep, en, al = calc_uncertainty(logits, sigmas)
    return (torch.mean(logits, dim=0), torch.mean(sigmas, dim=0), ep, en, al)
def get_metrics(model, x:torch.Tensor, y:torch.Tensor, n_samp:int, eps:float):
    """ get uncertainties and other metrics during training for analysis """
    state = model.training
    model.eval()
    with torch.no_grad():
        out, s, ep, en, al = predict(model, x, n_samp)
        loss = criterion((out, s), y.detach().cpu())  # uses the global criterion
        sb = ep / (al + eps)  # scibilic uncertainty
        eu, nu, au = ep.numpy().mean(), en.numpy().mean(), al.numpy().mean()
        su = sb.numpy().mean()
    model.train(state)
    return loss, (out, ep, en, al, sb), eu, nu, au, su
Set training and model hyperparameters
# system setup
use_jit = False
load_model = False
# logging setup
log_rate = 100 # print losses and uncertainty measures every log_rate epoch
unc_rate = 5 # get uncertainties in training every unc_rate iteration
version = 'toy_classification_v1' # naming scheme of model to load
# model, optimizer, loss, and training parameters
batch_size = 64
n_jobs = 0
n_epochs = 200
n_samp = 5 # number of samples to take in validation
model_kwargs = dict(dim=dim, mid=1024, n_layers=1, dropout_rate=0.25, mid_bias=True, out_bias=False)
use_adam = True
optim_kwargs = dict(lr=0.001, betas=(0.8,0.99), weight_decay=1e-6) if use_adam else \
dict(lr=0.001, momentum=0., weight_decay=1e-6)
use_scheduler = False
scheduler_kwargs = dict(step_size=1000, gamma=0.5)
extended = True # use modified loss for variance estimation
eps = 1e-6 # add this value to denominator in scibilic uncertainty calc
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
num_workers=n_jobs, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size,
num_workers=n_jobs, pin_memory=True)
assert torch.cuda.is_available()
device = torch.device('cuda')
torch.backends.cudnn.benchmark = True
model = UncertainLinear(**model_kwargs)
if use_jit: model = torch.jit.script(model)
def num_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters in model: {num_params(model)}')
Number of trainable parameters in model: 4096
if load_model:
    if use_jit:
        model = torch.jit.load(f'trained_jit_{version}.pth')
    else:
        state_dict = torch.load(f'trained_{version}.pth')
        model.load_state_dict(state_dict);
model.cuda(device=device)
optim_cls = torch.optim.AdamW if use_adam else torch.optim.SGD
optimizer = optim_cls(model.parameters(), **optim_kwargs)
if use_scheduler:
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_kwargs)
criterion = ExtendedBCELoss() if extended else BCELoss()
train_losses, valid_losses = [], []
t_ep_unc, t_en_unc, t_al_unc, t_sb_unc = [], [], [], []
v_ep_unc, v_en_unc, v_al_unc, v_sb_unc = [], [], [], []
n_batches = len(train_loader)
for t in range(1, n_epochs+1):
    # training
    t_losses, t_ep, t_en, t_al, t_sb = [], [], [], [], []
    model.train()
    for i, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        t_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if i % unc_rate == 0:
            _, _, ep, en, al, sb = get_metrics(model, x, y, n_samp, eps)
            t_ep.append(ep); t_en.append(en)
            t_al.append(al); t_sb.append(sb)
    train_losses.append(t_losses)
    t_ep_unc.append(t_ep); t_en_unc.append(t_en)
    t_al_unc.append(t_al); t_sb_unc.append(t_sb)
    # validation
    v_losses, v_ep, v_en, v_al, v_sb = [], [], [], [], []
    model.eval()
    with torch.no_grad():
        for i, (x, y) in enumerate(valid_loader):
            x = x.to(device)
            loss, out, ep, en, al, sb = get_metrics(model, x, y, n_samp, eps)
            v_losses.append(loss.item())
            v_ep.append(ep); v_en.append(en)
            v_al.append(al); v_sb.append(sb)
    valid_losses.append(v_losses)
    v_ep_unc.append(v_ep); v_en_unc.append(v_en)
    v_al_unc.append(v_al); v_sb_unc.append(v_sb)
    if not np.all(np.isfinite(t_losses)):
        raise RuntimeError('NaN or Inf in training loss, cannot recover. Exiting.')
    if t % log_rate == 0:
        log = (f'Epoch: {t} - TL: {np.mean(t_losses):.2e}, VL: {np.mean(v_losses):.2e}, '
               f'tEU: {np.mean(t_ep):.2e}, vEU: {np.mean(v_ep):.2e}, '
               f'tAU: {np.mean(t_al):.2e}, vAU: {np.mean(v_al):.2e}')
        print(log)
    if use_scheduler: scheduler.step()
Epoch: 100 - TL: 6.64e-02, VL: 4.52e-02, tEU: 1.02e-04, vEU: 1.25e-04, tAU: 3.35e-03, vAU: 2.84e-03
Epoch: 200 - TL: 6.55e-02, VL: 4.36e-02, tEU: 1.22e-04, vEU: 9.21e-05, tAU: 2.05e-03, vAU: 2.06e-03
Create tidy datasets from training and validation losses for plotting purposes.
def tidy_losses(train:List[List[float]], valid:List[List[float]]) -> pd.DataFrame:
    out = {'epoch': [], 'type': [], 'value': [], 'phase': []}
    for i, (tl,vl) in enumerate(zip(train,valid),1):
        for tli in tl:
            out['epoch'].append(i)
            out['type'].append('loss')
            out['value'].append(tli)
            out['phase'].append('train')
        for vli in vl:
            out['epoch'].append(i)
            out['type'].append('loss')
            out['value'].append(vli)
            out['phase'].append('valid')
    return pd.DataFrame(out)
def tidy_uncertainty(ep:Tuple[List,List], al:Tuple[List,List], sb:Tuple[List,List]) -> pd.DataFrame:
    """ each argument is a (train, valid) pair of per-epoch lists of uncertainty values """
    out = {'epoch': [], 'type': [], 'value': [], 'phase': []}
    for i, (epi, ali, sbi) in enumerate(zip(ep, al, sb)):
        phase = 'train' if i == 0 else 'valid'
        for j, (epij,alij,sbij) in enumerate(zip(epi,ali,sbi),1):
            for epijk in epij:
                out['epoch'].append(j)
                out['type'].append('epistemic')
                out['value'].append(epijk)
                out['phase'].append(phase)
            for alijk in alij:
                out['epoch'].append(j)
                out['type'].append('aleatory')
                out['value'].append(alijk)
                out['phase'].append(phase)
    return pd.DataFrame(out)
losses = tidy_losses(train_losses, valid_losses)
uncert = tidy_uncertainty((t_ep_unc, v_ep_unc),
(t_al_unc, v_al_unc),
(t_sb_unc, v_sb_unc))
# reuse `version` (set above) as the naming scheme for saved artifacts
load_tidy = False
if load_tidy:
    losses = pd.read_csv(f'losses_{version}.csv')
    uncert = pd.read_csv(f'uncert_{version}.csv')
f, ax1 = plt.subplots(1,1,figsize=(12, 8),sharey=True)
sns.lineplot(x='epoch',y='value',hue='phase',data=losses,ci='sd',ax=ax1,lw=3);
ax1.set_title('Losses');
ax1.set_ylim((0.,0.1))
f.savefig(f'losses_{version}.pdf')
The validation output is averaged over `n_samp = 5` MC samples before the loss is computed, which is why it is lower than the training loss (which is computed per forward pass, without averaging).
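This is Jensen's inequality at work: BCE-with-logits is convex in the logit, so the loss of the averaged logit is bounded above by the average of the per-sample losses. A minimal numeric sketch with made-up logits:
# Jensen's inequality: loss of the mean logit <= mean of per-sample losses
_y = torch.ones(1)
_zs = torch.tensor([3.0, -1.0])  # two hypothetical MC-dropout logits for one example
_per_sample = torch.stack([F.binary_cross_entropy_with_logits(z.view(1), _y) for z in _zs]).mean()
_averaged = F.binary_cross_entropy_with_logits(_zs.mean().view(1), _y)
assert _averaged <= _per_sample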
use_log = False
save_plots = False
f, ax1 = plt.subplots(1,1,figsize=(12,8),sharey=True)
if use_log: ax1.set(yscale='log')
sns.lineplot(x='epoch',y='value',hue='type',style='phase',ci='sd',data=uncert,ax=ax1,lw=3);
ax1.set_title('Uncertainty');
ax1.set_ylim((-0.01, 0.2))
if save_plots: f.savefig(f'uncert_{version}.pdf')
It is not immediately clear to me why the aleatory uncertainty jumps around so much. My guess is that the loss function is fairly sensitive because of the trade-off between the standard BCE term and the variance term.
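For reference, the loss that `ExtendedBCELoss` implements (a learned loss-attenuation construction in the style of Kendall and Gal) is

$$\mathcal{L} = \frac{1}{T}\sum_{t=1}^{T} \text{BCE}\!\left(z + \epsilon_t\, e^{s},\; y\right), \qquad \epsilon_t \sim \mathcal{N}(0, 1),$$

where $z$ and $s$ are the outputs of the logit and log-scale heads and $T$ is `n_samp`. The same $s$ head both feeds the aleatory estimate and perturbs every MC sample of the loss, so small changes to it move both terms at once.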
save_tidy = False
if save_tidy:
    losses.to_csv(f'losses_{version}.csv')
    uncert.to_csv(f'uncert_{version}.csv')
save_model = False
if save_model:
    if use_jit:
        torch.jit.save(model, f'trained_jit_{version}.pth')
    else:
        torch.save(model.state_dict(), f'trained_{version}.pth')
xt, yt = train_ds.x, train_ds.y
xv, yv = valid_ds.x, valid_ds.y
xtn, ytn = xt.numpy(), yt.numpy()
xvn, yvn = xv.numpy(), yv.numpy()
def list_to_np(lst:List[torch.Tensor]) -> List[np.ndarray]:
    return list(map(lambda x: x.detach().cpu().numpy(), lst))
model.eval()
with torch.no_grad():
    logittn, sigmatn, etn, ntn, atn = list_to_np(predict(model, xt.to(device), 50))
    logitvn, sigmavn, evn, nvn, avn = list_to_np(predict(model, xv.to(device), 50))
fig,(ax1,ax2,ax3,ax4) = plt.subplots(1,4,figsize=(15,5),sharex=True,sharey=True)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ax1.hist(train_ds.x[train_ds.y[:,0] == 0,0], bins=20, label='0', alpha=0.7, density=True)
    ax1.hist(train_ds.x[train_ds.y[:,0] == 1,0], bins=20, label='1', alpha=0.7, density=True)
    ax2.hist(valid_ds.x[valid_ds.y[:,0] == 0,0], bins=20, label='0', alpha=0.7, density=True)
    ax2.hist(valid_ds.x[valid_ds.y[:,0] == 1,0], bins=20, label='1', alpha=0.7, density=True)
    ax3.hist(train_ds.x[logittn[:,0] < 0.,0], bins=20, label='0', alpha=0.7, density=True)
    ax3.hist(train_ds.x[logittn[:,0] >= 0.,0], bins=20, label='1', alpha=0.7, density=True)
    ax4.hist(valid_ds.x[logitvn[:,0] < 0.,0], bins=20, label='0', alpha=0.7, density=True)
    ax4.hist(valid_ds.x[logitvn[:,0] >= 0.,0], bins=20, label='1', alpha=0.7, density=True)
ax4.legend()
ax1.set_title('Training data');
ax2.set_title('Validation data');
ax3.set_title('Predicted training');
ax4.set_title('Predicted validation');
if plot_data:
    fig, (ax1,ax2,ax3) = plt.subplots(1,3,figsize=(16,6))
    x_start = 2.
    x_range = (-x_start, x_start)
    xs = torch.linspace(*x_range, 100).unsqueeze(1)
    with torch.no_grad():
        ys, _, ep, en, al = list_to_np(predict(model, xs.to(device), 1000))
    x, y, ep, en, al = xs[:,0], ys[:,0], ep[:,0], en[:,0], al[:,0]
    sc = ep / (al + eps)  # scibilic uncertainty
    ixs0 = train_ds.y[:,0] == 0
    ixs1 = train_ds.y[:,0] == 1
    pal = sns.color_palette()
    ax1_2 = ax1.twinx()
    ax1_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax1_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax1.plot(x, ep, label='Ep.', lw=3, color=pal[0])
    ax1.set_title('Epistemic')
    ax1.set_ylim((0.,None))
    ax2_2 = ax2.twinx()
    ax2_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax2_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax2.plot(x, al, label='Al.', lw=3, color=pal[1])
    ax2.set_title('Aleatory')
    ax2.set_ylim((0.,None))
    ax3_2 = ax3.twinx()
    ax3_2.hist(train_ds.x[ixs0,0], bins=20, label='0', alpha=0.7, density=True)
    ax3_2.hist(train_ds.x[ixs1,0], bins=20, label='1', alpha=0.7, density=True)
    ax3.plot(x, en, label='En.', lw=3, color=pal[2])
    ax3.set_title('Entropy')
    ax3.set_ylim((0.,None))
    fig.tight_layout()
We can see the epistemic and aleatory uncertainty (and entropy) plotted over histograms of the true data. Epistemic uncertainty is highly correlated with aleatory uncertainty and entropy (although the aleatory curve is wider). In fact, this behavior is expected under these circumstances. Because we are fitting a hyperplane to the data, the output probability does not return to 0.5 far to the right or left of the in-distribution data. Aleatory uncertainty is high in the overlap region because high predicted noise there lowers the loss.
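As a sanity check on where the uncertainty should be high: for this equal-weight, equal-variance two-Gaussian mixture, the Bayes posterior has a closed form, so the true label entropy can be computed directly (a minimal sketch using the notebook's own μ = ∓1, σ = 0.5 settings):
# closed-form Bayes posterior for the equal-weight two-Gaussian mixture:
# p(y=1|x) = sigmoid(log N(x; 1, 0.5) - log N(x; -1, 0.5))
_xs = torch.linspace(-2., 2., 100)
_d0 = torch.distributions.Normal(-1., 0.5)
_d1 = torch.distributions.Normal(1., 0.5)
_p1 = torch.sigmoid(_d1.log_prob(_xs) - _d0.log_prob(_xs))
_true_entropy = -(_p1 * _p1.log2() + (1 - _p1) * (1 - _p1).log2())
# _true_entropy peaks at x = 0 and decays toward zero away from the overlap,
# matching where the learned aleatory uncertainty and entropy are high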
This example is too trivial to show useful differences between epistemic and aleatory uncertainty for classification tasks, but it may help build intuition for what is going on, as well as serve as a reference for implementation.