#!/usr/bin/env python
# coding: utf-8

# # Implementing a Declarative Node using the `ddn.basic.node` Module
#
# In this notebook we demonstrate how to implement a declarative node using the `ddn.basic.node` module. This will allow us to explore the behavior of the node and solve simple bi-level optimization problems. For more sophisticated problems, and for integration into large deep learning models, use the modules in the `ddn.pytorch` package instead.
#
# We consider the problem of minimizing the KL-divergence between the input $x$ and output $y$ subject to the output forming a valid probability vector (i.e., the elements of $y$ being positive and summing to one). We will assume strictly positive $x$. The problem can be written formally as
#
# $$
# \begin{array}{rll}
# y =& \text{argmin}_u & - \sum_{i=1}^{n} x_i \log u_i \\
# & \text{subject to} & \sum_{i=1}^{n} u_i = 1
# \end{array}
# $$
#
# where the positivity constraint on $y$ is automatically satisfied by the domain of the log function.
#
# A nice feature of this problem is that we can solve it in closed-form as
#
# $$
# y = \frac{1}{\sum_{i=1}^{n} x_i} x.
# $$
#
# However, we will only use this for verification and pretend for now that we do not have a closed-form solution. Instead we will make use of the `scipy.optimize` module to solve the problem via an iterative method. Deriving our deep declarative node from the `LinEqConstDeclarativeNode` class, we will need to implement two functions: the `objective` function and the `solve` function (the `constraint` and `gradient` functions are implemented for us).

# In[1]:


import numpy as np
import scipy.optimize as opt

import sys
sys.path.append("../")
from ddn.basic.node import *

import warnings
warnings.filterwarnings('ignore')

# create the example node
class MinKLNode(LinEqConstDeclarativeNode):
    def __init__(self, n):
        # Here we establish the linear equality constraint, Au = b. Since we want the sum of the
        # u_i to equal one we set A to be the all-ones row vector and b to be the scalar 1.
        super().__init__(n, n, np.ones((1, n)), np.ones((1, 1)))

    def objective(self, x, u):
        return -1.0 * np.dot(x, np.log(u))

    def solve(self, x):
        # Solve the constrained optimization problem using scipy's built-in minimize function. Here we
        # initialize the solver at the uniform distribution.
        u0 = np.ones((self.dim_y,)) / self.dim_y
        result = opt.minimize(lambda u: self.objective(x, u), u0,
            constraints={'type': 'eq', 'fun': lambda u: (np.dot(self.A, u) - self.b)[0]})

        # The solve function must always return two arguments, the solution and context (i.e., cached values needed
        # for computing the gradient). In the case of linearly constrained problems we do not need the dual solution
        # in computing the gradient so we return None for context.
        return result.x, None


# In[2]:


# test the node
node = MinKLNode(5)
x = np.random.random(5)
print("Input: {}".format(x))
print("Expected output: {}".format(x / np.sum(x)))
y, _ = node.solve(x)
print("Actual output: {}".format(y))
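# Since we do have the closed-form solution $y = \frac{1}{\sum_{i=1}^{n} x_i} x$, we can also sanity-check the Jacobian returned by `node.gradient` against the analytical Jacobian $\text{D}y(x) = \frac{1}{\sum_{i=1}^{n} x_i} \left( I - y \mathbf{1}^\top \right)$. The cell below is just a quick illustrative check; the agreement is only approximate since `solve` returns an iteratively computed solution.

# In[ ]:


# compare the node's gradient with the analytical Jacobian of the closed-form solution
Dy_node = node.gradient(x, y)
Dy_closed_form = (np.eye(node.dim_y) - np.outer(y, np.ones(node.dim_y))) / np.sum(x)
print("Max. abs. difference: {}".format(np.max(np.abs(Dy_node - Dy_closed_form))))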
# We now plot the function and gradient, sweeping the first component of the input $x_1$ from 0.1 to 10.0 while holding the other elements of $x$ constant.

# In[3]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

x_data = np.linspace(0.1, 10.0, 100)
y_data = []
Dy_data = []
for x1 in x_data:
    x[0] = x1
    y, _ = node.solve(x)
    y_data.append(y)
    Dy_data.append(node.gradient(x, y)[:, 0])

fig = plt.figure()
plt.subplot(2, 1, 1)
plt.plot(x_data, y_data)
plt.ylabel(r"$y$")
plt.subplot(2, 1, 2)
plt.plot(x_data, Dy_data)
plt.xlabel(r"$x_1$"); plt.ylabel(r"$Dy_{:,1}$")
plt.show()


# ### Bi-level optimization
#
# Now let's see whether we can use the node within a bi-level optimization problem. We will attempt to learn an input $x$ that results in an output $y$ with smallest squared norm. Moreover, we will regularize the norm of $x$ to be close to 10. Given our understanding of the KL-divergence, this should learn a vector $x$ that is a constant multiple of the ones vector (i.e., all elements of $x$ should be the same). The gradient of the upper-level objective with respect to $x$ is computed via the chain rule, $\text{D}J(x) = \text{D}_X J(x, y) + \text{D}_Y J(x, y) \, \text{D}y(x)$, where $\text{D}y(x)$ is supplied by the declarative node's `gradient` function. Let's see what happens.

# In[4]:


from autograd import grad

# define the upper-level objective
def JofXandY(x, y):
    """Computes our upper-level objective given both x and y."""
    return np.dot(y, y) + np.power(np.sqrt(np.dot(x, x)) - 10.0, 2.0)

def JofX(x):
    """Computes our upper-level objective given x and with a y that minimizes the lower-level objective."""
    y, ctx = node.solve(x)
    return JofXandY(x, y)

def dJofX(x):
    """Computes the gradient of the upper-level objective with respect to x."""
    Jx = grad(JofXandY, 0)
    Jy = grad(JofXandY, 1)
    y, ctx = node.solve(x)
    return Jx(x, y) + np.dot(Jy(x, y), node.gradient(x, y, ctx))

# solve using L-BFGS
x0 = np.random.random(node.dim_x)
history = [JofX(x0)]
result = opt.minimize(JofX, x0, args=(), method='L-BFGS-B', jac=dJofX,
    options={'maxiter': 100, 'disp': False},
    bounds=[(1.0e-6, None) for xk in x0],
    callback=lambda xk: history.append(JofX(xk)))

x = result.x
y, _ = node.solve(x)
print("Found x = {} with norm {:0.2f}".format(x, np.sqrt(np.dot(x, x))))
print("Results in y = {}".format(y))

fig = plt.figure()
plt.semilogy(history)
plt.ylabel("upper-level objective (log-scale)"); plt.xlabel("iteration")
plt.show()
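# As a rough sanity check on the gradient handed to L-BFGS, the cell below compares `dJofX` against a finite-difference approximation computed with `scipy.optimize.approx_fprime` at a fresh, strictly positive test point. This is an illustrative check only: the agreement is approximate since the lower-level problem is itself solved numerically and the finite-difference step size is chosen somewhat arbitrarily.

# In[ ]:


# compare the analytic bi-level gradient with a finite-difference approximation
x_test = np.random.random(node.dim_x) + 0.1
dJ_analytic = dJofX(x_test)
dJ_numeric = opt.approx_fprime(x_test, JofX, 1.0e-5)
print("Max. abs. difference: {}".format(np.max(np.abs(dJ_analytic - dJ_numeric))))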
# ## Multiple Equality Constraints
#
# We can also solve problems with non-linear constraints. If there is just one constraint use `EqConstDeclarativeNode` as the base class for implementing the node. Otherwise use `MultiEqConstDeclarativeNode` when there is more than one (non-linear) equality constraint. Consider the following problem with $x, y \in \mathbb{R}^3$.
#
# $$
# \begin{array}{rll}
# y =& \text{argmin}_u & \sum_{i=1}^{3} x_i u_i^{2} \\
# & \text{subject to} & \sum_{i=1}^{2} u_i^2 = 1 \\
# & & \sum_{i=1}^{3} u_i = 0
# \end{array}
# $$
#
# We need to implement three functions: the `objective`, `constraint` and `solve` functions.

# In[5]:


# create the example node
# by Suikei Wong (2020)

class MinMulNode(MultiEqConstDeclarativeNode):
    def __init__(self):
        super().__init__(3, 3)

    def objective(self, x, u):
        return np.dot(x, u ** 2)

    def constraint(self, x, u):
        """Return 2-vector, one element for each constraint."""
        return np.array([u[0] ** 2 + u[1] ** 2 - 1, u[0] + u[1] + u[2]])

    def solve(self, x):
        # Solve the constrained optimization problem using scipy's built-in minimize function.
        con1 = {'type': 'eq', 'fun': lambda u: u[0] ** 2 + u[1] ** 2 - 1}
        con2 = {'type': 'eq', 'fun': lambda u: u[0] + u[1] + u[2]}
        cons = [con1, con2]

        # Initialize the solver at (sin 30, cos 30, -sin 30 - cos 30), which is a feasible point.
        u0 = np.array([1/2, np.sqrt(3)/2, -(1 + np.sqrt(3))/2])
        result = opt.minimize(lambda u: self.objective(x, u), u0, constraints=cons)

        # The solve function must always return two arguments, the solution and context (i.e., cached values needed
        # for computing the gradient). Here we do not need the dual solution in computing the gradient
        # so we return None for context.
        return result.x, None


# In[6]:


# test the node
mul_node = MinMulNode()
x = np.random.random(mul_node.dim_x)
print("Input: {}".format(x))
y, _ = mul_node.solve(x)
print("Actual output: {}".format(y))
print("Objective: {}".format(mul_node.objective(x, y)))
print("Constraints: {}".format(mul_node.constraint(x, y)))


# In[7]:


# plot the function and gradients
x_data = np.linspace(0.1, 10.0, 100)
y_data = []
Dy_data = []
for x1 in x_data:
    x[0] = x1
    y, _ = mul_node.solve(x)
    y_data.append(y)
    Dy_data.append(mul_node.gradient(x, y)[:, 0])

fig = plt.figure()
plt.subplot(2, 1, 1)
plt.plot(x_data, y_data)
plt.ylabel(r"$y$")
plt.subplot(2, 1, 2)
plt.plot(x_data, Dy_data)
plt.xlabel(r"$x_1$"); plt.ylabel(r"$Dy_{:,1}$")
plt.show()


# ### Equality and Inequality Constraints
#
# We now consider a problem with multiple equality and inequality constraints,
#
# $$
# \begin{array}{rll}
# y =& \text{argmin}_u & \sum_{i=1}^{3} x_i u_i^{2} \\
# & \text{subject to} & \sum_{i=1}^{2} u_i^2 = 1 \\
# & & \sum_{i=1}^{3} u_i = 0 \\
# & & u_1 - u_2 \leq 0
# \end{array}
# $$
#
# We will construct the problem by deriving from the `GeneralConstDeclarativeNode` class from `ddn.basic.node`.

# In[8]:


# An example of a simple general declarative node
# by Suikei Wong (2020)

class SimpleGeneralNode(GeneralConstDeclarativeNode):
    def __init__(self):
        super().__init__(3, 3)

    def objective(self, x, u):
        return np.dot(x, u ** 2)

    def eq_constraints(self, x, u):
        return np.array([u[0] ** 2 + u[1] ** 2 - 1, u[0] + u[1] + u[2]])

    def ineq_constraints(self, x, u):
        return np.array([u[0] - u[1]])

    def solve(self, x):
        # Solve the constrained optimization problem using scipy's built-in minimize function.
        # Note that scipy's 'ineq' constraints are of the form fun(u) >= 0, so the constraint
        # u_1 - u_2 <= 0 becomes u[1] - u[0] >= 0.
        con1 = {'type': 'eq', 'fun': lambda u: u[0] ** 2 + u[1] ** 2 - 1}
        con2 = {'type': 'eq', 'fun': lambda u: u[0] + u[1] + u[2]}
        con3 = {'type': 'ineq', 'fun': lambda u: u[1] - u[0]}
        cons = [con1, con2, con3]

        # Initialize the solver at (sin 30, cos 30, -sin 30 - cos 30), which is a feasible point.
        u0 = np.array([1/2, np.sqrt(3)/2, -(1 + np.sqrt(3))/2])
        result = opt.minimize(lambda u: self.objective(x, u), u0, method='SLSQP', constraints=cons)

        # The solve function must always return two arguments, the solution and context (i.e., cached values needed
        # for computing the gradient). Here we do not need the dual solution in computing the gradient
        # so we return None for context.
        return result.x, None


# In[9]:


# test the node
gen_node = SimpleGeneralNode()
x = np.random.random(gen_node.dim_x)
print("Input: {}".format(x))
y, _ = gen_node.solve(x)
print("Actual output: {}".format(y))
print("Objective: {}".format(gen_node.objective(x, y)))
print("Eq. Constraints: {}".format(gen_node.eq_constraints(x, y)))
print("Ineq. Constraints: {}".format(gen_node.ineq_constraints(x, y)))
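# For the general node we have no closed-form solution, but we can still spot-check its gradient with central finite differences on `solve`. The cell below is an illustrative sketch only: the comparison is meaningful only where the solution map is differentiable at $x$ (e.g., the inequality constraint is not weakly active) and the accuracy is limited by the solver tolerance and the chosen step size.

# In[ ]:


# central finite-difference approximation of Dy for the general node (illustrative check)
eps = 1.0e-4
Dy_fd = np.zeros_like(gen_node.gradient(x, y))
for j in range(gen_node.dim_x):
    x_p, x_m = x.copy(), x.copy()
    x_p[j] += eps
    x_m[j] -= eps
    y_p, _ = gen_node.solve(x_p)
    y_m, _ = gen_node.solve(x_m)
    Dy_fd[:, j] = (y_p - y_m) / (2.0 * eps)
print("Max. abs. difference: {}".format(np.max(np.abs(gen_node.gradient(x, y) - Dy_fd))))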
Consts: {}".format(gen_node.ineq_constraints(x, y))) # In[10]: x_data = np.linspace(0.1, 10.0, 100) y_data = [] Dy_data = [] for x[0] in x_data: y, _ = gen_node.solve(x) y_data.append(y) Dy_data.append(gen_node.gradient(x, y)[:,0]) fig = plt.figure() plt.subplot(2, 1, 1) plt.plot(x_data, y_data) plt.ylabel(r"$y$") plt.subplot(2, 1, 2) plt.plot(x_data, Dy_data) plt.xlabel(r"$x_1$"); plt.ylabel(r"$Dy_{:,1}$") plt.show() # In[ ]: