%pylab inline
Populating the interactive namespace from numpy and matplotlib
import torch
from torch import nn
from torchmore import flex, layers
Goodfellow, Ian J., et al. "Multi-digit number recognition from street view imagery using deep convolutional neural networks." arXiv preprint arXiv:1312.6082 (2013).
Jaderberg, Max, et al. "Deep structured output learning for unconstrained text recognition." arXiv preprint arXiv:1412.5903 (2014).
Historically, LSTM came first, but we're going to start off with convolutional networks analogous to object recognition networks.
Structure:
def make_model():
    """Sketch: conv feature extractor, sum out the height axis, 1x1 conv to classes.

    NOTE(review): relies on `convolutional_layers` and `num_classes` defined
    elsewhere in the notebook.
    """
    modules = [
        # BDHW
        *convolutional_layers(),
        # BDHW, now reduce along the vertical
        layers.Fun(lambda x: x.sum(2)),
        # BDW
        layers.Conv1d(num_classes, 1),
    ]
    return nn.Sequential(*modules)
For CTC training, a target string such as "ABC"
is replaced by the regular expression /_+A+_+B+_+C+_+/ (blanks and repeats allowed between characters).
Identical to traditional HMM training in speech recognition:
cctc2
# With the cctc2 library, we can make the alignment explicit:
def train_batch(input, target):
    """One training step with an explicit cctc2 alignment and MSE loss.

    NOTE(review): relies on notebook globals `optimizer`, `model`, `cctc2`,
    and `mse_loss`.
    """
    optimizer.zero_grad()
    output = model(input)
    # Align the target sequence to the network output, then regress onto it.
    aligned = cctc2.align(output, target)
    loss = mse_loss(aligned, output)
    loss.backward()
    optimizer.step()
CTCLoss
# nn.CTCLoss in PyTorch obscures what's going on (the alignment is implicit).
ctc_loss = nn.CTCLoss()


def train_batch(input, target):
    """One training step using PyTorch's built-in CTC loss.

    NOTE(review): relies on notebook globals `optimizer` and `model`.
    """
    optimizer.zero_grad()
    output = model(input)
    loss = ctc_loss(output, target)
    loss.backward()
    optimizer.step()
def make_model():
    """Convolutional layers, vertical sum-reduction (BDHW -> BDW), 1x1 classifier."""
    reduce_vertical = layers.Fun(lambda x: x.sum(2))
    head = layers.Conv1d(num_classes, 1)
    return nn.Sequential(*convolutional_layers(), reduce_vertical, head)
def train_batch(input, target):
    """Single optimization step: forward pass, CTC loss, backward pass, update."""
    optimizer.zero_grad()
    loss = ctc_loss(model(input), target)
    loss.backward()
    optimizer.step()
def conv2d(d, r=3, stride=1, repeat=1):
    """Generate `repeat` Conv2d -> BatchNorm2d -> ReLU blocks as a flat list.

    Args:
        d: number of output channels (input channels inferred by `flex`).
        r: square kernel size; padding of r//2 preserves spatial size at stride 1.
        stride: stride of each convolution.
        repeat: how many conv/BN/ReLU triples to emit.

    Returns:
        A list of modules, suitable for unpacking into nn.Sequential.

    Note: the original docstring claimed "optional maxpool", but this function
    performs no pooling — see `conv2mp` for the pooling variant.
    """
    result = []
    for _ in range(repeat):
        result += [
            flex.Conv2d(d, r, padding=(r // 2, r // 2), stride=stride),
            flex.BatchNorm2d(),
            nn.ReLU(),
        ]
    return result
def conv2mp(d, r=3, mp=2, repeat=1):
    """Conv/BN/ReLU stack (see `conv2d`) followed by optional max pooling.

    `mp` is the MaxPool2d kernel/stride argument; pass None to skip pooling.
    """
    modules = conv2d(d, r, repeat=repeat)
    if mp is not None:
        modules.append(nn.MaxPool2d(mp))
    return modules
def project_and_conv1d(d, noutput, r=5):
    """Collapse BDHW -> BDW by max over height, run two 1d convs, emit BLD.

    `d` is the hidden width of the first 1d conv, `noutput` the number of
    output classes, `r` the 1d kernel size.
    """
    project_height = layers.Fun("lambda x: x.max(2)[0]")  # max over the H axis
    return [
        project_height,
        flex.Conv1d(d, r, padding=r // 2),
        flex.BatchNorm1d(),
        nn.ReLU(),
        flex.Conv1d(noutput, 1),
        layers.Reorder("BDL", "BLD"),
    ]
class Additive(nn.Module):
    """Apply several modules to the same input and sum their outputs.

    Optionally applies `post` (e.g. an activation) to the sum. Used to build
    residual blocks: Additive(nn.Identity(), conv_branch).
    """

    def __init__(self, *args, post=None):
        super().__init__()
        self.sub = nn.ModuleList(args)
        # Bug fix: the original wrote `self.post = None`, silently discarding
        # the `post` argument so it was never applied in forward().
        self.post = post

    def forward(self, x):
        y = self.sub[0](x)
        for f in self.sub[1:]:
            y = y + f(x)
        if self.post is not None:
            y = self.post(y)
        return y
def make_vgg_model(noutput=53):
    """VGG-style OCR model: three conv+maxpool stages, one conv stage, 1d head."""
    stages = [layers.Input("BDHW", sizes=[None, 1, None, None])]
    for depth in (100, 200, 300):
        stages += conv2mp(depth, 3, 2, repeat=2)
    stages += conv2d(400, 3, repeat=2)
    stages += project_and_conv1d(800, noutput)
    return nn.Sequential(*stages)
# Smoke test: output shape is (batch, length, classes) for a 60x400 input.
make_vgg_model()(torch.rand(1, 1, 60, 400)).shape
torch.Size([1, 50, 53])
def ResnetBlock(d, r=3):
    """Residual block: identity summed with conv-BN-ReLU-conv-BN at fixed depth d."""
    branch = nn.Sequential(
        nn.Conv2d(d, d, r, padding=r // 2),
        nn.BatchNorm2d(d),
        nn.ReLU(),
        nn.Conv2d(d, d, r, padding=r // 2),
        nn.BatchNorm2d(d),
    )
    return Additive(nn.Identity(), branch)
def resnet_blocks(n, d, r=3):
    """Return a list of n ResnetBlocks of depth d and kernel size r."""
    blocks = []
    for _ in range(n):
        blocks.append(ResnetBlock(d, r))
    return blocks
def make_resnet_model(noutput=53):
    """Resnet-style OCR model: residual stages separated by conv+maxpool downsampling."""
    stages = [layers.Input("BDHW", sizes=[None, 1, None, None])]
    stages += conv2mp(64, 3, (2, 1))  # pool height only at the first stage
    for depth, next_depth in ((64, 128), (128, 256), (256, 512)):
        stages += resnet_blocks(5, depth)
        stages += conv2mp(next_depth, 3, 2)
    stages += resnet_blocks(5, 512)
    stages += project_and_conv1d(800, noutput)
    return nn.Sequential(*stages)
# Smoke test: same (batch, length, classes) shape as the VGG model.
make_resnet_model()(torch.rand(1, 1, 60, 400)).shape
torch.Size([1, 50, 53])
Problem:
Solutions:
FractionalMaxPool2d
def conv2fmp(d, r=3, fmp=(0.7, 0.85), repeat=1):
    """Conv/BN/ReLU stack (see `conv2d`) followed by optional fractional max pooling.

    `fmp` is the (height, width) output ratio for FractionalMaxPool2d; pass
    None to skip pooling. Fractional pooling allows gentler-than-2x
    downsampling per stage.
    """
    result = conv2d(d, r, repeat=repeat)
    if fmp is not None:
        result += [nn.FractionalMaxPool2d(3, output_ratio=fmp)]
    return result
def make_fmp_model(noutput=53):
    """OCR model using fractional max pooling between successive conv stages."""
    stages = [layers.Input("BDHW", sizes=[None, 1, None, None])]
    for depth in (50, 100, 150, 200, 250, 300):
        stages += conv2fmp(depth, 3, (0.7, 0.9))
    stages += project_and_conv1d(800, noutput)
    return nn.Sequential(*stages)
# Smoke test: fractional pooling shrinks width more gently (length 210 vs 50).
make_fmp_model()(torch.rand(1, 1, 60, 400)).shape
torch.Size([1, 210, 53])
interpolate:
`F.interpolate` rescales an image and supports backward(); combining
MaxPool2d downsampling with interpolate
# interpolate gives a simple multiscale analysis.
import torch.nn.functional as F


def make_interpolating_model(noutput=53):
    """Conv+pool stages, then upsample the sequence back by the total pool factor.

    NOTE(review): uses `layers.Fun_` while the other models use `layers.Fun` —
    confirm the trailing-underscore variant is intended in torchmore.
    """
    return nn.Sequential(
        layers.Input("BDHW", sizes=[None, 1, None, None]),
        *conv2mp(50, 3),
        *conv2mp(100, 3),
        *conv2mp(150, 3),
        *conv2mp(200, 3),
        # Four 2x maxpools shrink W by 16; interpolate restores the length.
        layers.Fun_(lambda x: F.interpolate(x, scale_factor=16)),
        *project_and_conv1d(800, noutput),
    )
# Smoke test: interpolation restores the full input width (length 400).
make_interpolating_model()(torch.rand(1, 1, 60, 400)).shape
torch.Size([1, 400, 53])
An alternative to interpolate:
ConvTranspose1d.
ConvTranspose2d / ConvTranspose1d
# ConvTranspose fills in higher resolutions with "templates".
def make_ct_model(noutput=53, ct=1):
    """Conv+pool stages, sum over height, then `ct` ConvTranspose1d upsamplings.

    Each transposed conv (stride 2) roughly doubles the sequence length;
    ct=0 keeps the fully pooled resolution.
    """
    return nn.Sequential(
        layers.Input("BDHW", sizes=[None, 1, None, None]),
        *conv2mp(50, 3),
        *conv2mp(100, 3),
        *conv2mp(150, 3),
        *conv2mp(200, 3),
        layers.Fun("lambda x: x.sum(2)"),  # BDHW -> BDW
        *[flex.ConvTranspose1d(800, 1, stride=2)] * ct,
        flex.Conv1d(noutput, 7, padding=3),
    )
# Each ConvTranspose1d roughly doubles the output length (ct=1 vs ct=0).
print(make_ct_model()(torch.rand(1, 1, 60, 400)).shape)
print(make_ct_model(ct=0)(torch.rand(1, 1, 60, 400)).shape)
torch.Size([1, 53, 49]) torch.Size([1, 53, 25])
But: