from __future__ import print_function
import torch
# make a simple 5x4 matrix
x = torch.Tensor(5,4)
print(x)
1.00000e-19 * 0.0000 1.0842 0.0000 1.0842 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 [torch.FloatTensor of size 5x4]
# Randdomly initialized matrix
x = torch.rand(5,4)
print(x)
0.8279 0.2494 0.9360 0.4095 0.4523 0.0623 0.6327 0.8333 0.7154 0.2328 0.7027 0.5982 0.8434 0.7973 0.8574 0.6481 0.4558 0.9925 0.9888 0.5318 [torch.FloatTensor of size 5x4]
print(x.size())
print("torch.Size is however a tuple ")
torch.Size([5, 4]) torch.Size is however a tuple
y = torch.rand(5, 4)
print(x + y)
1.3527 0.2853 1.8305 1.0669 1.3825 0.8608 0.7175 1.4620 0.9138 1.2007 1.4080 1.1309 1.6843 1.1873 1.5777 1.0847 0.4859 1.1659 1.1593 1.1505 [torch.FloatTensor of size 5x4]
print(torch.add(x, y))
1.3527 0.2853 1.8305 1.0669 1.3825 0.8608 0.7175 1.4620 0.9138 1.2007 1.4080 1.1309 1.6843 1.1873 1.5777 1.0847 0.4859 1.1659 1.1593 1.1505 [torch.FloatTensor of size 5x4]
result = torch.Tensor(5, 4)
torch.add(x, y, out=result)
print(result)
1.3527 0.2853 1.8305 1.0669 1.3825 0.8608 0.7175 1.4620 0.9138 1.2007 1.4080 1.1309 1.6843 1.1873 1.5777 1.0847 0.4859 1.1659 1.1593 1.1505 [torch.FloatTensor of size 5x4]
print(x[:, 1])
0.2494 0.0623 0.2328 0.7973 0.9925 [torch.FloatTensor of size 5]
# Converting torch Tensor into a numpy Array
a = torch.ones(10)
a
1 1 1 1 1 1 1 1 1 1 [torch.FloatTensor of size 10]
b = a.numpy()
print(b)
[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
# Watch the numpy array
a.add_(1)
print(a)
print(b)
2 2 2 2 2 2 2 2 2 2 [torch.FloatTensor of size 10] [ 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
# Doing the opposite; convertihng numpy Array into a torch Tensor
import numpy as np
a = np.ones(10)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)
[ 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.] 2 2 2 2 2 2 2 2 2 2 [torch.DoubleTensor of size 10]
if torch.cuda.is_available():
x = x.cuda()
y = y.cuda()
x + y
from torch.autograd import Variable
x = Variable(torch.ones(3,3), requires_grad=True)
print(x)
Variable containing: 1 1 1 1 1 1 1 1 1 [torch.FloatTensor of size 3x3]
y = x + 2
print(y)
Variable containing: 3 3 3 3 3 3 3 3 3 [torch.FloatTensor of size 3x3]
print(y.grad_fn)
#why does this fail?
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-56-faedf5ea10b6> in <module>() ----> 1 print(y.grad_fn) ~/anaconda/lib/python3.6/site-packages/torch/autograd/variable.py in __getattr__(self, name) 61 if name in self._fallthrough_methods: 62 return getattr(self.data, name) ---> 63 raise AttributeError(name) 64 65 def __getitem__(self, key): AttributeError: grad_fn
z = y*y*3
out = z.mean()
print(z, out)
Variable containing: 27 27 27 27 27 27 27 27 27 [torch.FloatTensor of size 3x3] Variable containing: 27 [torch.FloatTensor of size 1]
Backprop with out.backward is same as out.backward(torch.Tensor([1,0]))
out.backward()
print(x.grad)
Variable containing: 2 2 2 2 2 2 2 2 2 [torch.FloatTensor of size 3x3]
x = torch.randn(5)
x = Variable(x, requires_grad=True)
y = x*2
while y.data.norm()<1000:
y = y*2
print(y)
Variable containing: -1358.7740 731.0142 699.5322 -524.8378 -568.4971 [torch.FloatTensor of size 5]
gradients = torch.FloatTensor([0.1, 1.0, 0.0001])
y.backward(gradients)
print(x.grad)
Variable containing: 51.2000 512.0000 0.0512 [torch.FloatTensor of size 3]
help(Variable)
Help on class Variable in module torch.autograd.variable: class Variable(torch._C._VariableBase) | Wraps a tensor and records the operations applied to it. | | Variable is a thin wrapper around a Tensor object, that also holds | the gradient w.r.t. to it, and a reference to a function that created it. | This reference allows retracing the whole chain of operations that | created the data. If the Variable has been created by the user, its creator | will be ``None`` and we call such objects *leaf* Variables. | | Since autograd only supports scalar valued function differentiation, grad | size always matches the data size. Also, grad is normally only allocated | for leaf variables, and will be always zero otherwise. | | Attributes: | data: Wrapped tensor of any type. | grad: Variable holding the gradient of type and location matching | the ``.data``. This attribute is lazily allocated and can't | be reassigned. | requires_grad: Boolean indicating whether the Variable has been | created by a subgraph containing any Variable, that requires it. | See :ref:`excluding-subgraphs` for more details. | Can be changed only on leaf Variables. | volatile: Boolean indicating that the Variable should be used in | inference mode, i.e. don't save the history. See | :ref:`excluding-subgraphs` for more details. | Can be changed only on leaf Variables. | creator: Function of which the variable was an output. For leaf | (user created) variables it's ``None``. Read-only attribute. | | Parameters: | data (any tensor class): Tensor to wrap. | requires_grad (bool): Value of the requires_grad flag. **Keyword only.** | volatile (bool): Value of the volatile flag. **Keyword only.** | | Method resolution order: | Variable | torch._C._VariableBase | builtins.object | | Methods defined here: | | __add__(self, other) | | __deepcopy__(self, memo) | | __div__(self, other) | | __eq__(self, other) | Return self==value. | | __ge__(self, other) | Return self>=value. | | __getattr__(self, name) | | __getitem__(self, key) | | __gt__(self, other) | Return self>value. | | __hash__(self) | Return hash(self). | | __iadd__(self, other) | | __idiv__(self, other) | | __imul__(self, other) | | __ipow__(self, other) | | __isub__(self, other) | | __iter__(self) | | __le__(self, other) | Return self<=value. | | __len__(self) | | __lt__(self, other) | Return self<value. | | __matmul__(self, other) | | __mod__(self, other) | | __mul__(self, other) | | __ne__(self, other) | Return self!=value. | | __neg__(self) | | __pow__(self, other) | | __radd__ = __add__(self, other) | | __rdiv__(self, other) | | __reduce_ex__(self, proto) | helper for pickle | | __repr__(self) | Return repr(self). | | __rmul__ = __mul__(self, other) | | __rpow__(self, other) | | __rsub__(self, other) | | __rtruediv__ = __rdiv__(self, other) | | __setitem__(self, key, value) | | __setstate__(self, state) | | __sub__(self, other) | | __truediv__ = __div__(self, other) | | abs(self) | | acos(self) | | add(self, other) | | add_(self, other) | | addbmm(self, *args) | | addbmm_(self, *args) | | addcdiv(self, *args) | | addcmul(self, *args) | | addmm(self, *args) | | addmm_(self, *args) | | addmv(self, *args) | | addmv_(self, *args) | | addr(self, *args) | | addr_(self, *args) | | asin(self) | | atan(self) | | backward(self, gradient=None, retain_variables=False) | Computes the gradient of current variable w.r.t. graph leaves. | | The graph is differentiated using the chain rule. If the variable is | non-scalar (i.e. its data has more than one element) and requires | gradient, the function additionaly requires specifying ``gradient``. | It should be a tensor of matching type and location, that contains | the gradient of the differentiated function w.r.t. ``self``. | | This function accumulates gradients in the leaves - you might need to zero | them before calling it. | | Arguments: | gradient (Tensor): Gradient of the differentiated function | w.r.t. the data. Required only if the data has more than one | element. Type and location should match these of ``self.data``. | retain_variables (bool): If ``True``, buffers necessary for computing | gradients won't be freed after use. It is only necessary to | specify ``True`` if you want to differentiate some subgraph multiple | times (in some cases it will be much more efficient to use | `autograd.backward`). | | baddbmm(self, *args) | | baddbmm_(self, *args) | | bernoulli(self) | | bmm(self, batch) | | byte(self) | | ceil(self) | | char(self) | | chunk(self, num_chunks, dim=0) | | clamp(self, min=None, max=None) | | clone(self) | | contiguous(self) | | cos(self) | | cosh(self) | | cpu(self) | | cross(self, other, dim=-1) | | cuda(self, device_id=None, async=False) | | cumsum(self, dim) | | detach(self) | Returns a new Variable, detached from the current graph. | | Result will never require gradient. If the input is volatile, the output | will be volatile too. | | .. note:: | | Returned Variable uses the same data tensor, as the original one, and | in-place modifications on either of them will be seen, and may trigger | errors in correctness checks. | | detach_(self) | Detaches the Variable from the graph that created it, making it a leaf. | | diag(self, diagonal_idx=0) | | dist(self, tensor, p=2) | | div(self, other) | | div_(self, other) | | dot(self, other) | | double(self) | | eq(self, other) | | exp(self) | | exp_(self) | | expand(self, *sizes) | | expand_as(self, tensor) | | float(self) | | floor(self) | | fmod(self, value) | | frac(self) | | gather(self, dim, index) | | ge(self, other) | | ger(self, vector) | | gt(self, other) | | half(self) | | index_add(self, dim, index, tensor) | | index_add_(self, dim, index, tensor) | | index_copy(self, dim, index, tensor) | | index_copy_(self, dim, index, tensor) | | index_fill(self, dim, index, value) | | index_fill_(self, dim, index, value) | | index_select(self, dim, index) | | int(self) | | is_same_size(self, other_var) | | kthvalue(self, dim) | | le(self, other) | | lerp(self, tensor, weight) | | log(self) | | log1p(self) | | long(self) | | lt(self, other) | | masked_copy(self, mask, variable) | | masked_copy_(self, mask, variable) | | masked_fill(self, mask, value) | | masked_fill_(self, mask, value) | | masked_select(self, mask) | | max(self, dim=None) | | mean(self, dim=None) | | median(self, dim) | | min(self, dim=None) | | mm(self, matrix) | | mode(self, dim) | | mul(self, other) | | mul_(self, other) | | multinomial(self, num_samples=1, with_replacement=False) | | mv(self, vector) | | narrow(self, dim, start_index, length) | | ne(self, other) | | neg(self) | | neg_(self) | | norm(self, p=2, dim=None) | | permute(self, *permutation) | | pow(self, other) | | prod(self, dim=None) | | reciprocal(self) | | register_hook(self, hook) | Registers a backward hook. | | The hook will be called every time a gradient with respect to the | variable is computed. The hook should have the following signature:: | | hook(grad) -> Variable or None | | The hook should not modify its argument, but it can optionally return | a new gradient which will be used in place of :attr:`grad`. | | This function returns a handle with a method ``handle.remove()`` | that removes the hook from the module. | | Example: | >>> v = Variable(torch.Tensor([0, 0, 0]), requires_grad=True) | >>> h = v.register_hook(lambda grad: grad * 2) # double the gradient | >>> v.backward(torch.Tensor([1, 1, 1])) | >>> v.grad.data | 2 | 2 | 2 | [torch.FloatTensor of size 3] | >>> h.remove() # removes the hook | | reinforce(self, reward) | Registers a reward obtained as a result of a stochastic process. | | Differentiating stochastic nodes requires providing them with reward | value. If your graph contains any stochastic operations, you should | call this function on their outputs. Otherwise an error will be raised. | | Parameters: | reward(Tensor): Tensor with per-element rewards. It has to match | the device location and shape of Variable's data. | | remainder(self, value) | | renorm(self, p, dim, maxnorm) | | repeat(self, *repeats) | | resize(self, *sizes) | | resize_as(self, variable) | | round(self) | | rsqrt(self) | | scatter(self, dim, index, source) | | scatter_(self, dim, index, source) | | select(self, dim, _index) | | short(self) | | sigmoid(self) | | sigmoid_(self) | | sign(self) | | sin(self) | | sinh(self) | | sort(self, dim=None, descending=False) | | split(self, split_size, dim=0) | | sqrt(self) | | squeeze(self, dim=None) | | std(self, dim=None, unbiased=True) | | sub(self, other) | | sub_(self, other) | | sum(self, dim=None) | | t(self) | | tan(self) | | tanh(self) | | tanh_(self) | | topk(self, k, dim=None, largest=True, sorted=True) | | trace(self) | | transpose(self, dim1, dim2) | | tril(self, diagonal_idx=0) | | triu(self, diagonal_idx=0) | | trunc(self) | | type(self, t) | | type_as(self, t) | | unsqueeze(self, dim) | | var(self, dim=None, unbiased=True) | | view(self, *sizes) | | view_as(self, tensor) | | ---------------------------------------------------------------------- | Data descriptors defined here: | | __dict__ | dictionary for instance variables (if defined) | | __weakref__ | list of weak references to the object (if defined) | | ---------------------------------------------------------------------- | Methods inherited from torch._C._VariableBase: | | __init__(self, /, *args, **kwargs) | Initialize self. See help(type(self)) for accurate signature. | | __new__(*args, **kwargs) from builtins.type | Create and return a new object. See help(type) for accurate signature. | | ---------------------------------------------------------------------- | Data descriptors inherited from torch._C._VariableBase: | | creator | | data | | grad | | output_nr | | requires_grad | | volatile
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as fxnl
class MyNN(nn.Module):
def __init__(self):
super(MyNN, self).__init__()
self.conv1 = nn.Conv2d(1,6,5)
self.conv2 = nn.Conv2d(6,16,5)
# An affine operation y = Wx+b
self.fc1 = nn.Linear(16*5*5, 120)
self.fc2 = nn.Linear(120,84)
self.fc3 = nn.Linear(84,10)
def forward(self, x):
# max pooling over a 2x2 window
x = fxnl.max_pool2d(fxnl.relu(self.conv1(x)), (2,2))
x= fxnl.max_pool2d(fxnl.relu(self.conv2(x)), 2)
x= x.view(-1, self.num_flat_features(x))
x = fxnl.relu(self.fc1(x))
x = fxnl.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
# all dimensions except the batch dimension
size = x.size()[1:]
num_features = 1
for s in size:
num_features *= s
return num_features
net = MyNN()
print(net)
MyNN ( (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1)) (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1)) (fc1): Linear (400 -> 120) (fc2): Linear (120 -> 84) (fc3): Linear (84 -> 10) )
All you need to do is define the forward function and backward function (this is where the gradients - you know the curvatures , or the steepest path to local minima are computed) is automatically defined for us by using autograd. We can use any of the Tensor ops in the forward function
params = list(net.parameters())
print(len(params))
print(params[0].size())
10 torch.Size([6, 1, 5, 5])
# Input to the forward is autograd.Variable as is the output
input = Variable(torch.randn(1, 1, 32, 32))
output = net(input)
print(output)
Variable containing: -0.0525 0.1221 -0.0390 0.0300 0.0151 0.0359 -0.0839 -0.1024 -0.1495 -0.0095 [torch.FloatTensor of size 1x10]
net.zero_grad()
output.backward(torch.randn(1, 10), retain_variables=True)
print("First backward of ", output)
First backward of Variable containing: -0.0525 0.1221 -0.0390 0.0300 0.0151 0.0359 -0.0839 -0.1024 -0.1495 -0.0095 [torch.FloatTensor of size 1x10]
So, all in all we defined a neural network AND we processed inputs and called it backward.