Notebook

In [40]:

from  __future__ import print_function
import torch

In [41]:

# make a simple 5x4 matrix

x = torch.Tensor(5,4)
print(x)

1.00000e-19 *
  0.0000  1.0842  0.0000  1.0842
  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000
[torch.FloatTensor of size 5x4]

In [42]:

# Randdomly initialized matrix
x = torch.rand(5,4)
print(x)

 0.8279  0.2494  0.9360  0.4095
 0.4523  0.0623  0.6327  0.8333
 0.7154  0.2328  0.7027  0.5982
 0.8434  0.7973  0.8574  0.6481
 0.4558  0.9925  0.9888  0.5318
[torch.FloatTensor of size 5x4]

In [43]:

print(x.size())
print("torch.Size is  however a tuple ")

torch.Size([5, 4])
torch.Size is  however a tuple

In [44]:

y = torch.rand(5, 4)
print(x + y)

 1.3527  0.2853  1.8305  1.0669
 1.3825  0.8608  0.7175  1.4620
 0.9138  1.2007  1.4080  1.1309
 1.6843  1.1873  1.5777  1.0847
 0.4859  1.1659  1.1593  1.1505
[torch.FloatTensor of size 5x4]

In [45]:

print(torch.add(x, y))

 1.3527  0.2853  1.8305  1.0669
 1.3825  0.8608  0.7175  1.4620
 0.9138  1.2007  1.4080  1.1309
 1.6843  1.1873  1.5777  1.0847
 0.4859  1.1659  1.1593  1.1505
[torch.FloatTensor of size 5x4]

In [46]:

result = torch.Tensor(5, 4)
torch.add(x, y, out=result)
print(result)

 1.3527  0.2853  1.8305  1.0669
 1.3825  0.8608  0.7175  1.4620
 0.9138  1.2007  1.4080  1.1309
 1.6843  1.1873  1.5777  1.0847
 0.4859  1.1659  1.1593  1.1505
[torch.FloatTensor of size 5x4]

In [47]:

print(x[:, 1])

 0.2494
 0.0623
 0.2328
 0.7973
 0.9925
[torch.FloatTensor of size 5]

In [48]:

# Converting torch Tensor into a numpy Array
a = torch.ones(10)
a

Out[48]:

 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
[torch.FloatTensor of size 10]

In [49]:

b = a.numpy()
print(b)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

In [50]:

# Watch the numpy array
a.add_(1)
print(a)
print(b)

 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
[torch.FloatTensor of size 10]

[ 2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]

In [51]:

# Doing the opposite; convertihng numpy Array into a torch Tensor
import numpy as np
a = np.ones(10)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

[ 2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]

 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
[torch.DoubleTensor of size 10]

In [52]:

if torch.cuda.is_available():
    x = x.cuda()
    y = y.cuda()
    x + y

A bit on Autograd¶

In [53]:

from torch.autograd import Variable

In [54]:

x = Variable(torch.ones(3,3), requires_grad=True)
print(x)

Variable containing:
 1  1  1
 1  1  1
 1  1  1
[torch.FloatTensor of size 3x3]

In [55]:

y = x + 2
print(y)

Variable containing:
 3  3  3
 3  3  3
 3  3  3
[torch.FloatTensor of size 3x3]

In [56]:

print(y.grad_fn)
#why does this fail?

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-56-faedf5ea10b6> in <module>()
----> 1 print(y.grad_fn)

~/anaconda/lib/python3.6/site-packages/torch/autograd/variable.py in __getattr__(self, name)
     61         if name in self._fallthrough_methods:
     62             return getattr(self.data, name)
---> 63         raise AttributeError(name)
     64 
     65     def __getitem__(self, key):

AttributeError: grad_fn

In [57]:

z = y*y*3
out = z.mean()
print(z, out)

Variable containing:
 27  27  27
 27  27  27
 27  27  27
[torch.FloatTensor of size 3x3]
 Variable containing:
 27
[torch.FloatTensor of size 1]

Gradients¶

Backprop with out.backward is same as out.backward(torch.Tensor([1,0]))

In [58]:

out.backward()

In [59]:

print(x.grad)

Variable containing:
 2  2  2
 2  2  2
 2  2  2
[torch.FloatTensor of size 3x3]

In [60]:

x = torch.randn(5)
x = Variable(x, requires_grad=True)
y = x*2
while y.data.norm()<1000:
    y = y*2
print(y)

Variable containing:
-1358.7740
  731.0142
  699.5322
 -524.8378
 -568.4971
[torch.FloatTensor of size 5]

In [61]:

gradients = torch.FloatTensor([0.1, 1.0, 0.0001])
y.backward(gradients)
print(x.grad)

Variable containing:
  51.2000
 512.0000
   0.0512
[torch.FloatTensor of size 3]

In [62]:

help(Variable)

Help on class Variable in module torch.autograd.variable:

class Variable(torch._C._VariableBase)
 |  Wraps a tensor and records the operations applied to it.
 |  
 |  Variable is a thin wrapper around a Tensor object, that also holds
 |  the gradient w.r.t. to it, and a reference to a function that created it.
 |  This reference allows retracing the whole chain of operations that
 |  created the data. If the Variable has been created by the user, its creator
 |  will be ``None`` and we call such objects *leaf* Variables.
 |  
 |  Since autograd only supports scalar valued function differentiation, grad
 |  size always matches the data size. Also, grad is normally only allocated
 |  for leaf variables, and will be always zero otherwise.
 |  
 |  Attributes:
 |      data: Wrapped tensor of any type.
 |      grad: Variable holding the gradient of type and location matching
 |          the ``.data``.  This attribute is lazily allocated and can't
 |          be reassigned.
 |      requires_grad: Boolean indicating whether the Variable has been
 |          created by a subgraph containing any Variable, that requires it.
 |          See :ref:`excluding-subgraphs` for more details.
 |          Can be changed only on leaf Variables.
 |      volatile: Boolean indicating that the Variable should be used in
 |          inference mode, i.e. don't save the history. See
 |          :ref:`excluding-subgraphs` for more details.
 |          Can be changed only on leaf Variables.
 |      creator: Function of which the variable was an output. For leaf
 |          (user created) variables it's ``None``. Read-only attribute.
 |  
 |  Parameters:
 |      data (any tensor class): Tensor to wrap.
 |      requires_grad (bool): Value of the requires_grad flag. **Keyword only.**
 |      volatile (bool): Value of the volatile flag. **Keyword only.**
 |  
 |  Method resolution order:
 |      Variable
 |      torch._C._VariableBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __add__(self, other)
 |  
 |  __deepcopy__(self, memo)
 |  
 |  __div__(self, other)
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __ge__(self, other)
 |      Return self>=value.
 |  
 |  __getattr__(self, name)
 |  
 |  __getitem__(self, key)
 |  
 |  __gt__(self, other)
 |      Return self>value.
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __iadd__(self, other)
 |  
 |  __idiv__(self, other)
 |  
 |  __imul__(self, other)
 |  
 |  __ipow__(self, other)
 |  
 |  __isub__(self, other)
 |  
 |  __iter__(self)
 |  
 |  __le__(self, other)
 |      Return self<=value.
 |  
 |  __len__(self)
 |  
 |  __lt__(self, other)
 |      Return self<value.
 |  
 |  __matmul__(self, other)
 |  
 |  __mod__(self, other)
 |  
 |  __mul__(self, other)
 |  
 |  __ne__(self, other)
 |      Return self!=value.
 |  
 |  __neg__(self)
 |  
 |  __pow__(self, other)
 |  
 |  __radd__ = __add__(self, other)
 |  
 |  __rdiv__(self, other)
 |  
 |  __reduce_ex__(self, proto)
 |      helper for pickle
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __rmul__ = __mul__(self, other)
 |  
 |  __rpow__(self, other)
 |  
 |  __rsub__(self, other)
 |  
 |  __rtruediv__ = __rdiv__(self, other)
 |  
 |  __setitem__(self, key, value)
 |  
 |  __setstate__(self, state)
 |  
 |  __sub__(self, other)
 |  
 |  __truediv__ = __div__(self, other)
 |  
 |  abs(self)
 |  
 |  acos(self)
 |  
 |  add(self, other)
 |  
 |  add_(self, other)
 |  
 |  addbmm(self, *args)
 |  
 |  addbmm_(self, *args)
 |  
 |  addcdiv(self, *args)
 |  
 |  addcmul(self, *args)
 |  
 |  addmm(self, *args)
 |  
 |  addmm_(self, *args)
 |  
 |  addmv(self, *args)
 |  
 |  addmv_(self, *args)
 |  
 |  addr(self, *args)
 |  
 |  addr_(self, *args)
 |  
 |  asin(self)
 |  
 |  atan(self)
 |  
 |  backward(self, gradient=None, retain_variables=False)
 |      Computes the gradient of current variable w.r.t. graph leaves.
 |      
 |      The graph is differentiated using the chain rule. If the variable is
 |      non-scalar (i.e. its data has more than one element) and requires
 |      gradient, the function additionaly requires specifying ``gradient``.
 |      It should be a tensor of matching type and location, that contains
 |      the gradient of the differentiated function w.r.t. ``self``.
 |      
 |      This function accumulates gradients in the leaves - you might need to zero
 |      them before calling it.
 |      
 |      Arguments:
 |          gradient (Tensor): Gradient of the differentiated function
 |              w.r.t. the data. Required only if the data has more than one
 |              element. Type and location should match these of ``self.data``.
 |          retain_variables (bool): If ``True``, buffers necessary for computing
 |              gradients won't be freed after use. It is only necessary to
 |              specify ``True`` if you want to differentiate some subgraph multiple
 |              times (in some cases it will be much more efficient to use
 |              `autograd.backward`).
 |  
 |  baddbmm(self, *args)
 |  
 |  baddbmm_(self, *args)
 |  
 |  bernoulli(self)
 |  
 |  bmm(self, batch)
 |  
 |  byte(self)
 |  
 |  ceil(self)
 |  
 |  char(self)
 |  
 |  chunk(self, num_chunks, dim=0)
 |  
 |  clamp(self, min=None, max=None)
 |  
 |  clone(self)
 |  
 |  contiguous(self)
 |  
 |  cos(self)
 |  
 |  cosh(self)
 |  
 |  cpu(self)
 |  
 |  cross(self, other, dim=-1)
 |  
 |  cuda(self, device_id=None, async=False)
 |  
 |  cumsum(self, dim)
 |  
 |  detach(self)
 |      Returns a new Variable, detached from the current graph.
 |      
 |      Result will never require gradient. If the input is volatile, the output
 |      will be volatile too.
 |      
 |      .. note::
 |      
 |        Returned Variable uses the same data tensor, as the original one, and
 |        in-place modifications on either of them will be seen, and may trigger
 |        errors in correctness checks.
 |  
 |  detach_(self)
 |      Detaches the Variable from the graph that created it, making it a leaf.
 |  
 |  diag(self, diagonal_idx=0)
 |  
 |  dist(self, tensor, p=2)
 |  
 |  div(self, other)
 |  
 |  div_(self, other)
 |  
 |  dot(self, other)
 |  
 |  double(self)
 |  
 |  eq(self, other)
 |  
 |  exp(self)
 |  
 |  exp_(self)
 |  
 |  expand(self, *sizes)
 |  
 |  expand_as(self, tensor)
 |  
 |  float(self)
 |  
 |  floor(self)
 |  
 |  fmod(self, value)
 |  
 |  frac(self)
 |  
 |  gather(self, dim, index)
 |  
 |  ge(self, other)
 |  
 |  ger(self, vector)
 |  
 |  gt(self, other)
 |  
 |  half(self)
 |  
 |  index_add(self, dim, index, tensor)
 |  
 |  index_add_(self, dim, index, tensor)
 |  
 |  index_copy(self, dim, index, tensor)
 |  
 |  index_copy_(self, dim, index, tensor)
 |  
 |  index_fill(self, dim, index, value)
 |  
 |  index_fill_(self, dim, index, value)
 |  
 |  index_select(self, dim, index)
 |  
 |  int(self)
 |  
 |  is_same_size(self, other_var)
 |  
 |  kthvalue(self, dim)
 |  
 |  le(self, other)
 |  
 |  lerp(self, tensor, weight)
 |  
 |  log(self)
 |  
 |  log1p(self)
 |  
 |  long(self)
 |  
 |  lt(self, other)
 |  
 |  masked_copy(self, mask, variable)
 |  
 |  masked_copy_(self, mask, variable)
 |  
 |  masked_fill(self, mask, value)
 |  
 |  masked_fill_(self, mask, value)
 |  
 |  masked_select(self, mask)
 |  
 |  max(self, dim=None)
 |  
 |  mean(self, dim=None)
 |  
 |  median(self, dim)
 |  
 |  min(self, dim=None)
 |  
 |  mm(self, matrix)
 |  
 |  mode(self, dim)
 |  
 |  mul(self, other)
 |  
 |  mul_(self, other)
 |  
 |  multinomial(self, num_samples=1, with_replacement=False)
 |  
 |  mv(self, vector)
 |  
 |  narrow(self, dim, start_index, length)
 |  
 |  ne(self, other)
 |  
 |  neg(self)
 |  
 |  neg_(self)
 |  
 |  norm(self, p=2, dim=None)
 |  
 |  permute(self, *permutation)
 |  
 |  pow(self, other)
 |  
 |  prod(self, dim=None)
 |  
 |  reciprocal(self)
 |  
 |  register_hook(self, hook)
 |      Registers a backward hook.
 |      
 |      The hook will be called every time a gradient with respect to the
 |      variable is computed. The hook should have the following signature::
 |      
 |          hook(grad) -> Variable or None
 |      
 |      The hook should not modify its argument, but it can optionally return
 |      a new gradient which will be used in place of :attr:`grad`.
 |      
 |      This function returns a handle with a method ``handle.remove()``
 |      that removes the hook from the module.
 |      
 |      Example:
 |          >>> v = Variable(torch.Tensor([0, 0, 0]), requires_grad=True)
 |          >>> h = v.register_hook(lambda grad: grad * 2)  # double the gradient
 |          >>> v.backward(torch.Tensor([1, 1, 1]))
 |          >>> v.grad.data
 |           2
 |           2
 |           2
 |          [torch.FloatTensor of size 3]
 |          >>> h.remove()  # removes the hook
 |  
 |  reinforce(self, reward)
 |      Registers a reward obtained as a result of a stochastic process.
 |      
 |      Differentiating stochastic nodes requires providing them with reward
 |      value. If your graph contains any stochastic operations, you should
 |      call this function on their outputs. Otherwise an error will be raised.
 |      
 |      Parameters:
 |          reward(Tensor): Tensor with per-element rewards. It has to match
 |              the device location and shape of Variable's data.
 |  
 |  remainder(self, value)
 |  
 |  renorm(self, p, dim, maxnorm)
 |  
 |  repeat(self, *repeats)
 |  
 |  resize(self, *sizes)
 |  
 |  resize_as(self, variable)
 |  
 |  round(self)
 |  
 |  rsqrt(self)
 |  
 |  scatter(self, dim, index, source)
 |  
 |  scatter_(self, dim, index, source)
 |  
 |  select(self, dim, _index)
 |  
 |  short(self)
 |  
 |  sigmoid(self)
 |  
 |  sigmoid_(self)
 |  
 |  sign(self)
 |  
 |  sin(self)
 |  
 |  sinh(self)
 |  
 |  sort(self, dim=None, descending=False)
 |  
 |  split(self, split_size, dim=0)
 |  
 |  sqrt(self)
 |  
 |  squeeze(self, dim=None)
 |  
 |  std(self, dim=None, unbiased=True)
 |  
 |  sub(self, other)
 |  
 |  sub_(self, other)
 |  
 |  sum(self, dim=None)
 |  
 |  t(self)
 |  
 |  tan(self)
 |  
 |  tanh(self)
 |  
 |  tanh_(self)
 |  
 |  topk(self, k, dim=None, largest=True, sorted=True)
 |  
 |  trace(self)
 |  
 |  transpose(self, dim1, dim2)
 |  
 |  tril(self, diagonal_idx=0)
 |  
 |  triu(self, diagonal_idx=0)
 |  
 |  trunc(self)
 |  
 |  type(self, t)
 |  
 |  type_as(self, t)
 |  
 |  unsqueeze(self, dim)
 |  
 |  var(self, dim=None, unbiased=True)
 |  
 |  view(self, *sizes)
 |  
 |  view_as(self, tensor)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from torch._C._VariableBase:
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from torch._C._VariableBase:
 |  
 |  creator
 |  
 |  data
 |  
 |  grad
 |  
 |  output_nr
 |  
 |  requires_grad
 |  
 |  volatile

A bit on NN¶

In [102]:

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as fxnl

class MyNN(nn.Module):
    def __init__(self):
        super(MyNN, self).__init__()
        self.conv1 = nn.Conv2d(1,6,5)
        self.conv2 = nn.Conv2d(6,16,5)
        # An affine operation y = Wx+b
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,10)
        
    def forward(self, x):
        # max pooling over a 2x2 window
        x = fxnl.max_pool2d(fxnl.relu(self.conv1(x)), (2,2))
        x= fxnl.max_pool2d(fxnl.relu(self.conv2(x)), 2)
        x= x.view(-1, self.num_flat_features(x))
        x = fxnl.relu(self.fc1(x))
        x = fxnl.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        # all dimensions except the batch dimension
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = MyNN()
print(net)          

MyNN (
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear (400 -> 120)
  (fc2): Linear (120 -> 84)
  (fc3): Linear (84 -> 10)
)

Notes¶

All you need to do is define the forward function and backward function (this is where the gradients - you know the curvatures , or the steepest path to local minima are computed) is automatically defined for us by using autograd. We can use any of the Tensor ops in the forward function

In [111]:

params = list(net.parameters())
print(len(params))
print(params[0].size())

10
torch.Size([6, 1, 5, 5])

In [112]:

# Input to the forward is autograd.Variable as is the output

input = Variable(torch.randn(1, 1, 32, 32))
output = net(input)
print(output)

Variable containing:
-0.0525  0.1221 -0.0390  0.0300  0.0151  0.0359 -0.0839 -0.1024 -0.1495 -0.0095
[torch.FloatTensor of size 1x10]

In [113]:

net.zero_grad()
output.backward(torch.randn(1, 10), retain_variables=True)
print("First backward of ", output)

First backward of  Variable containing:
-0.0525  0.1221 -0.0390  0.0300  0.0151  0.0359 -0.0839 -0.1024 -0.1495 -0.0095
[torch.FloatTensor of size 1x10]

Summarizing¶

torch.Tensor: We create a multi-dimensional array with torch.Tensor
autograd.Variable: W2 wrapped a Tensor and recorded the history of ops applied to it.
nn.Module - We played with the Neural Network Module
nn.Parameter - This parameter is automatically registered as a parameter with helpers for moving then to gpu, loading etc
autograd.Function = This implemented forward and backward definitions of an autograd op.

So, all in all we defined a neural network AND we processed inputs and called it backward.