#!/usr/bin/env python
# coding: utf-8

# # Optimising functions
# 
# Now for something a bit different.
# PyTorch is a tensor processing library and, whilst it has a focus on neural networks, it can also be used for more standard function optimisation.
# In this example we will use torchbearer to minimise a simple function.
# 
# **Note**: The easiest way to use this tutorial is as a Colab notebook, which allows you to dive in with no setup. We recommend you enable a free GPU with
# 
# > **Runtime** → **Change runtime type** → **Hardware Accelerator: GPU**
# 
# ## Install Torchbearer
# 
# First we install torchbearer if needed.

# In[1]:


try:
    import torchbearer
except ImportError:
    get_ipython().system('pip install -q torchbearer')
    import torchbearer

print(torchbearer.__version__)


# ## A Simple Function
# 
# For this task, we are going to try to optimise the following:
# 
# \begin{equation}
# \min_{\textbf{x}} (\textbf{x}[0] - 5)^2 + \textbf{x}[1]^2 + (\textbf{x}[2] - 1)^2\; ,
# \end{equation}
# which has a minimum at $\textbf{x}=[5, 0, 1]$.
# 
# ### The Model
# 
# First we need to create something that looks very similar to a neural network model, but whose purpose is to minimise our function.
# We store the current estimate of the minimum as parameters in the model (so that PyTorch optimisers can find and optimise them) and we return the function value from the forward method.

# In[2]:


import torch
import torch.nn as nn

ESTIMATE = torchbearer.state_key('est')


class Net(nn.Module):
    def __init__(self, x):
        super().__init__()
        self.pars = nn.Parameter(x)

    def f(self):
        """
        Function to be minimised:
        f(x) = (x[0] - 5)^2 + x[1]^2 + (x[2] - 1)^2
        Solution: x = [5, 0, 1]
        """
        out = torch.zeros_like(self.pars)
        out[0] = self.pars[0] - 5
        out[1] = self.pars[1]
        out[2] = self.pars[2] - 1
        return torch.sum(out ** 2)

    def forward(self, _, state):
        state[ESTIMATE] = self.pars.detach().unsqueeze(1)
        return self.f()


# ### The Loss
# 
# For function minimisation we have an analogue to neural network losses: we minimise the value of the function under the current estimate of the minimum.
# Note that, since we are using a standard loss signature, torchbearer passes it both the network output and the "label" (which is of no use here).

# In[3]:


def loss(y_pred, y_true):
    return y_pred


# ### Optimising
# 
# We need two more things before we can start optimising with torchbearer.
# We need our initial guess, which we've set to [2.0, 1.0, 10.0], and we need to tell torchbearer how "long" an epoch is, i.e. how many optimisation steps we want for each epoch.
# For our simple function, we can complete the optimisation in a single epoch, but for more complex optimisations we might want to run multiple epochs and include TensorBoard logging and perhaps learning rate annealing to find the final solution (a sketch of the annealing idea appears at the end of this notebook).

# In[4]:


p = torch.tensor([2.0, 1.0, 10.0])
training_steps = 5000


# The learning rate chosen for this example is very low and we could get convergence much faster with a larger rate; however, the low rate allows us to watch the convergence in real time.
# We define the model and optimiser in the standard way.

# In[5]:


model = Net(p)
optim = torch.optim.SGD(model.parameters(), lr=0.001)
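
# As a quick sanity check (this cell is an addition to the tutorial, not part of the original), we can
# confirm that the objective defined in `Net.f` really is zero at the known minimum $\textbf{x}=[5, 0, 1]$,
# so we know what value the optimiser should converge towards.

# In[ ]:


with torch.no_grad():
    check = Net(torch.tensor([5.0, 0.0, 1.0]))
    print(check.f())  # should print tensor(0.)
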
# Finally, we run the optimisation on the GPU (if one is available) and print the final estimate of the minimum.

# In[6]:


device = 'cuda' if torch.cuda.is_available() else 'cpu'

trial = torchbearer.Trial(model, optim, loss, [torchbearer.metrics.running_mean(ESTIMATE, dim=1), 'loss'])
trial.for_train_steps(training_steps).to(device)
trial.run()
print(list(model.parameters())[0].data)


# ## Visualising Himmelblau's Function
# 
# We'll now have a go at a more complex example with multiple optima, which we'll also visualise: [Himmelblau's function](https://en.wikipedia.org/wiki/Himmelblau%27s_function). It is defined as:
# 
# \begin{equation}
# f(x, y) = (x^2 + y - 11)^2 + (x + y^2 - 7)^2\; ,
# \end{equation}
# and has minima at
# \begin{equation}
# f(3, 2) = f(-2.805118, 3.131312) = f(-3.779310, -3.283186) = f(3.584428, -1.848126) = 0\; .
# \end{equation}
# 
# Let's first define the function

# In[7]:


himm = lambda x, y: (x ** 2 + y - 11) ** 2 + (x + y ** 2 - 7) ** 2


# and plot its surface

# In[8]:


import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LogNorm

xmin, xmax, xstep = -5, 5, .2
ymin, ymax, ystep = -5, 5, .2

x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep), np.arange(ymin, ymax + ystep, ystep))
z = himm(x, y)

fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

ax.plot_surface(x, y, z, norm=LogNorm(), rstride=1, cstride=1, edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$')
ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

plt.show()


# That looks about right. Now we'd like to view the optimisation paths of a few different optimisers.
# 
# ### The Model
# 
# Since we already have the function definition, we can define our model simply:

# In[9]:


import torch
import torch.nn as nn

ESTIMATE = torchbearer.state_key('est')


class Net(nn.Module):
    def __init__(self, x):
        super().__init__()
        self.pars = nn.Parameter(x)

    def forward(self, _, state):
        state[ESTIMATE] = self.pars.detach()
        return himm(self.pars[0], self.pars[1])


# ### Visualising
# 
# We now wish to create a visualisation. There are a few ways to do this; we'll use a couple of callback functions, combined in a [`CallbackList`](https://torchbearer.readthedocs.io/en/latest/code/callbacks.html#torchbearer.callbacks.callbacks.CallbackList).

# In[10]:


import torchbearer
from torchbearer import callbacks


def make_callback(ax, color, label):
    xs = []
    ys = []

    @callbacks.on_step_training
    def log_estimate(state):
        est = state[torchbearer.METRICS][ESTIMATE].cpu().numpy()
        xs.append(est[0])
        ys.append(est[1])

    @callbacks.on_end
    def plot_history(state):
        ax.plot(xs, ys, color=color, label=label, linewidth=2)

    return callbacks.CallbackList([log_estimate, plot_history])


# This callback factory allows us to create an axis (2D this time) and subsequently plot the optimisation path on it for a series of optimisers.
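
# Before moving on to compare optimisers, a quick check (again an addition here, not part of the original
# notebook) that the four minima listed above do evaluate to (approximately) zero under our `himm` definition:

# In[ ]:


minima = [(3.0, 2.0), (-2.805118, 3.131312), (-3.779310, -3.283186), (3.584428, -1.848126)]
for mx, my in minima:
    print('f({:+.6f}, {:+.6f}) = {:.6f}'.format(mx, my, himm(mx, my)))  # all approximately zero
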
# ### Optimising
# 
# Let's see how it looks for a few different optimisers from a range of starting points.

# In[11]:


import numpy as np
import torchbearer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

xmin, xmax, xstep = -5, 5, .2
ymin, ymax, ystep = -5, 5, .2

x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep), np.arange(ymin, ymax + ystep, ystep))
z = himm(x, y)

fig, ax = plt.subplots(figsize=(8, 8))
ax.contourf(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.gray)


def make_plots(p):
    def run(optimiser, color, label):
        model = Net(p)
        optim = optimiser(model.parameters())
        training_steps = 50

        trial = torchbearer.Trial(model, optim, loss, [ESTIMATE],
                                  callbacks=[make_callback(ax, color, label)], verbose=-1)
        trial.for_train_steps(training_steps).to(device)
        trial.run()

    cmap = plt.cm.viridis
    run(lambda param: torch.optim.SGD(param, lr=0.01), cmap(0), 'SGD')
    run(lambda param: torch.optim.Adam(param, lr=0.1), cmap(0.25), 'Adam')
    run(lambda param: torch.optim.Adam(param, lr=0.1, amsgrad=True), cmap(0.5), 'AMSGrad')
    run(lambda param: torch.optim.RMSprop(param, lr=0.1), cmap(0.75), 'RMSProp')
    run(lambda param: torch.optim.Adamax(param, lr=0.1), cmap(1.0), 'Adamax')


make_plots(torch.tensor([0.0, 0.0]))
plt.legend()

for x in np.arange(-1.0, 1.0, 0.5):
    for y in np.arange(-2.0, 1.0, 0.5):
        make_plots(torch.tensor([x, y]))

ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

fig.savefig('optimisers.png', bbox_inches='tight')
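
# As an optional extension (a sketch, not part of the original tutorial), the learning rate annealing
# mentioned at the start could be added with one of torchbearer's scheduler callbacks. The cell below
# assumes that `callbacks.StepLR` (torchbearer's wrapper around `torch.optim.lr_scheduler.StepLR`) and its
# `step_on_batch` argument are available in your torchbearer version; it anneals the SGD learning rate while
# optimising Himmelblau's function from a single starting point.

# In[ ]:


annealed_model = Net(torch.tensor([0.0, 0.0]))
annealed_optim = torch.optim.SGD(annealed_model.parameters(), lr=0.01)

# Assumed API: decay the learning rate by a factor of 2 every 100 training steps.
annealing = callbacks.StepLR(step_size=100, gamma=0.5, step_on_batch=True)

trial = torchbearer.Trial(annealed_model, annealed_optim, loss, ['loss'], callbacks=[annealing])
trial.for_train_steps(500).to(device)
trial.run()
print(list(annealed_model.parameters())[0].data)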