"""
At the moment of writing (06.06.2019) the only build that supports tensorflow probability is the tensorflow 
nightly build so we will use that to install tensorflow 2.0 and tensorflow probability.
"""
# Install tensorflow 2.0 and tensorflow probability from the nightly build
!pip install --upgrade tf-nightly-2.0-preview tfp-nightly

# Imports

import tensorflow as tf
import tensorflow_probability as tfp

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import numpy as np

# plt axis colors setup
plt.rc_context({'axes.edgecolor':'orange', 'xtick.color':'red', 'ytick.color':'red', 'text.color':'orange'})

"""
In the same way, no matter how many base 2 digits you’re willing to use, the decimal value 0.1 cannot be represented 
exactly as a base 2 fraction. In base 2, 1/10 is the infinitely repeating fraction

0.0001100110011001100110011001100110011001100110011...

One illusion may beget another. For example, since 0.1 is not exactly 1/10, summing three values of 0.1 may not 
yield exactly 0.3, either:

Also, since the 0.1 cannot get any closer to the exact value of 1/10 and 0.3 cannot get any closer to the exact 
value of 3/10, then pre-rounding with round() function cannot help:

Though the numbers cannot be made closer to their intended exact values, the round() function can be useful for 
post-rounding so that results with inexact values become comparable to one another:

Binary floating-point arithmetic holds many surprises like this.
"""

x = 0.1
y = 0.2

print("x + y =  {}".format(x + y))
print("Rounded x + y: {}".format(round(x + y, 1)))
print("Check if .1 + .1 +.1 == .3: {}".format(1 + .1 + .1 == .3))
print("What if we pre round before adding: {}".format(round(.1, 1) + round(.1, 1) + round(.1, 1) == round(.3, 1)))
print("What if we post round after adding: {}".format(round(.1 + .1 + .1, 10) == round(.3, 10)))

def softmax(x, solve=False):
    """Softmax implementation"""
    if solve:
        z = x-max(x)
    else:
        z = x
    numerator = tf.math.exp(z)
    denominator = tf.math.reduce_sum(numerator)
    
    return tf.divide(numerator, denominator)


# Underflow example
"""
If c is very negative, exp(c) will underflow, meaning the denominator will become 0
"""

underflow = tf.constant([-12345, -67890, -99999999], dtype=tf.float32)
print("Softmax Underflow {}".format(softmax(underflow, solve=False)))

# Overflow example
"""
When c is very large and positive, exp(c) will overflow and the expression ends up being undefined
"""

overflow = tf.constant([12345, 67890, 99999999], dtype=tf.float32)
print("Softmax Overflow {}".format(softmax(overflow, solve=False)))

# Solution
"""
Both of these can be solved by evaluating softmax(z) where z = x - max_i x_i.
This works because subtracting max results in the largest argument to exp being 0, getting rid of overflow
and atleast one term in the denominator has a value of 1, which gets rid of underflow

Compare the overflow and underflow examples
"""

underflow = tf.constant([-12345, -67890, -99999999], dtype=tf.float32)
print("Underflow Solved: {}".format(softmax(underflow, solve=True)))

overflow = tf.constant([12345, 67890, 99999999], dtype=tf.float32)
print("Overflow Solved: {}".format(softmax(overflow, solve=True)))


# compare the solution with the tensorflow softmax implementation
underflow_softmax_tf = tf.nn.softmax(underflow, axis=None)
overflow_softmax_tf = tf.nn.softmax(overflow, axis=None)

print("Tensorflow Softmax Underflow: {} \nTensorflow Softmax Overflow: {}".format(underflow_softmax_tf, overflow_softmax_tf))

A = tf.constant([[4.1, 2.8], [9.7, 6.6]], dtype=tf.float32)
b = tf.constant([[4.1], [9.7]], dtype=tf.float32)
print("Matrix A: \n{}\n".format(A))

# solve for x, from Ax=b, x = A^(-1) b
x = tf.linalg.matmul(tf.linalg.inv(A), b)
print("Value of x: \n{}\n".format(x))

# Now lets see what happens if we add 0.01 to the first component of b
b2 = tf.constant([[4.11], [9.7]], dtype=tf.float32)

# We can also use tf.linalg.solve to solve a systems of linear equations
x_solve = tf.linalg.solve(A, b2)
print("Solving for new x: \n{}".format(x_solve))
print("We can see how the solution changes dramatically for a small increase in value\n")

# let's now calculate the condition number for matrix A using ||A|| * ||A^-1||
condition_A = tf.math.multiply(tf.norm(A), tf.norm(tf.linalg.inv(A)))
print("Condition Number of A: {}".format(condition_A))


"""
Let's see how these look like by plotting these. Note that I am using numpy for these plots but we will start 
using tensorflow in the upcoming sections. Also, taking the derivatives and most optimizations are 
easily done with packages like numpy, scipy or sympy since these are dedicated scientific libraries
whereas tensorflow is a machine learning framework but I am working on bringing a scientific extension to the 
tensorflow, please see (https://github.com/mukeshmithrakumar/scientific). 
This is work in progress and contributions are welcome.

"""

polynomial = np.poly1d([2,-4,-28,62,122,-256,-196,140,392,240,72])             # Create a one-dimensional polynomial class
polynomial_root = polynomial.deriv().r                                         # Return a derivative of this polynomial with the roots
r_crit = polynomial_root[polynomial_root.imag==0].real                         # Return the real part of the complex argument
test = polynomial.deriv(2)(r_crit) 

# local minima 
x_min = r_crit[test>0]
y_min = polynomial(x_min)
plt.plot(x_min, y_min, 'o')
plt.text(1.5, 200, 'local minima')

# local maxima 
x_max = r_crit[test<0]
y_max = polynomial(x_max)
plt.plot(x_max, y_max, 'o', color='r')
plt.text(0.5, 800, 'local maxima')

#global maxima
plt.text(-1.5, 1600, 'global maxima')

# global minima 
xc = np.arange(-2.5, 2.6, 0.02)
yc = polynomial(xc)
plt.plot( xc, yc)
plt.xlim([-2.5,2.5])
plt.text(-1, 400, 'global minima')


# Saddle Point
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
plot_args = {'rstride': 1, 'cstride': 1, 'cmap':"Blues_r",
             'linewidth': 0.4, 'antialiased': True,
             'vmin': -1, 'vmax': 1}

x, y = np.mgrid[-1:1:31j, -1:1:31j]
z = x**2 - y**2
ax.plot_surface(x, y, z, **plot_args)
ax.view_init(azim=-60, elev=30)
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.set_zlim(-1, 1)
plt.xticks([-1, -0.5, 0, 0.5, 1], [r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"])
plt.yticks([-1, -0.5, 0, 0.5, 1],[r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"])
ax.set_zticks([-1, -0.5, 0, 0.5, 1])
ax.set_zticklabels([r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"])
ax.scatter([0], [0], [.3], s=100, color = 'r', marker="o")
plt.title("Saddle Point")
plt.show()

"""
Lets say we have a function z = f(x, y) = 2x - y: The partial derivatives of this equation is:
dz_dx = 2, dz_dy = -1. Let's see how we can calculate this in tensorflow using GradientTape.

GradientTape records operations for automatic differentiation and we use it with persistent=True 
to allow multiple calls to the gradient(). Note that since we are using constants, we need to 
ensure that these are being traced when we do the GradientTape, hence we use a watch method.
But when we start working with variables, we don't need to have a watch.
"""

x = tf.constant(1.0)
y = tf.constant(2.0)

with tf.GradientTape(persistent=True) as g:
    g.watch(y)
    g.watch(x)
    out = tf.subtract(2*x, y)
dz_dx = g.gradient(out, x)
dz_dy = g.gradient(out, y)
del g  # Drop the reference to the tape

print("Partial Derivative of dz_dx: {} \nPartial Derivative of dz_dy: {} ".format(dz_dx, dz_dy))

"""
f(x,y) = x^2 + y^2 + xsiny + ysinx
f'(x, y) = (2 x + y cos(x) + sin(y)) dx + (2 y + x cos(y) + sin(x)) dy

If you know chain rule, this can be easily differentiated but if you don't, you don't have to worry, tensorflow
will take care of any kind of function for you using Gradient Tape

"""
def f(x, y):
    return tf.pow(x, 2) + tf.pow(y, 2) + tf.multiply(x, tf.math.sin(y)) + tf.multiply(y, tf.math.sin(x))

def grad(x, y):
    with tf.GradientTape() as t:
        t.watch(x)
        out = f(x, y)
    return t.gradient(out, x)


a = 0.1 # learning rate
x0 = tf.constant(1.0)
y0 = tf.constant(1.0) 
print("Starting x value: {}".format(x0))

update = []
for i in range(41):
    x0 -= a * grad(x0, y0)
    update.append(x0)
    plt.plot(i, update[-1], color='r', marker='.')
    if i%5 == 0:
        print("Iteration: {} x: {}".format(i, x0))
plt.grid()

"""
Using Tensorflow GradientTape, we can easily find the Jacobian of any vector using "jacobian(target, sources)".
In this example for f(x) = x^2 we will look at the Jacobian when x = [1.0, 2.0]
"""

with tf.GradientTape() as g:
    x  = tf.constant([1.0, 2.0])
    g.watch(x)
    y = x * x
jacobian = g.jacobian(y, x)

print("Jacobian \n{}".format(jacobian))
del g  # Drop the reference to the tape

"""
We will use tf.GradientTape() to calculate the second derivative of the function 
f(x) = x^3
f'(x) = 3x^2
f''(x) = 6x
"""

x = tf.Variable(1.0)

with tf.GradientTape() as t:
    with tf.GradientTape() as t2:
        y = x * x * x
    dy_dx = t2.gradient(y, x)
d2y_dx2 = t.gradient(dy_dx, x)

print("First Derivative of f(x): {} \nSecond Derivative of f(x): {}".format(dy_dx, d2y_dx2))
del t, t2  # Drop the reference to the tape

# Plot where second derivative is zero

x0 = tf.range(-15., 15., 1.)
x = tf.Variable(x0)

def f(x): 
    return tf.pow(x, 2)

def second_derivative(x):
    """
    Note here since x is a variable we are not calling the watch method. 
    Since we are taking the second derivative, we need to call the GradientTape twice and use the 
    gradient() method to obtain the gradients.
    """
    with tf.GradientTape() as t:
        with tf.GradientTape() as t2:
            out = f(x)
        dy_dx = t2.gradient(out, x)
    d2y_dx2 = t.gradient(dy_dx, x)
    return d2y_dx2

# Original Function f(x) = x^2
plt.plot(x0, f(x0), color='dodgerblue')

# Second Derivative of f(x)
d2 = second_derivative(x)
plt.plot(x0, d2, color='#FF9A13')
plt.grid()

# Plot where second derivative is negative

x0 = tf.range(-10., 10., .5)
x = tf.Variable(x0)

def f(x): 
    return -(tf.pow(x, 4))

def second_derivative(x):
    with tf.GradientTape() as t:
        with tf.GradientTape() as t2:
            out = f(x)
        dy_dx = t2.gradient(out, x)
    d2y_dx2 = t.gradient(dy_dx, x)
    return d2y_dx2

# Original Function f(x): -x^4
plt.plot(x0, f(x0), color='dodgerblue')

# Second Derivative of f(x)
d3 = second_derivative(x)
plt.plot(x0, d3, color='#FF9A13')
plt.grid()

# Plot where second derivative is positive

x0 = tf.range(-10., 10., .5)
x = tf.Variable(x0)

def f(x): 
    return (tf.pow(x, 4))

def second_derivative(x):
    with tf.GradientTape() as t:
        with tf.GradientTape() as t2:
            out = f(x)
        dy_dx = t2.gradient(out, x)
    d2y_dx2 = t.gradient(dy_dx, x)
    return d2y_dx2

# Original Function f(x): x^4
plt.plot(x0, f(x0), color='dodgerblue')

# Second Derivative of f(x)
d3 = second_derivative(x)
plt.plot(x0, d3, color='#FF9A13')
plt.grid()

"""
NOTE: As of 06.14.2019 tf.hessians doesn't work with tf eager mode and the tf.GradientTape doesn't
have a method hessians. There is a bug report open in GitHub 
(https://github.com/tensorflow/tensorflow/issues/29781) so as soon as that is resolved the following
code will work and I will fix this cell.

>>> f = 100*(y - x**2)**2 + (1 - x)**2
>>> x = tf.Variable([1., 1.])

>>> hessian_matrix = tf.hessians(f, x)
>>> print(hessian_matrix)

>>> with tf.GradientTape() as t:
... out = f
>>> hessian = t.hessians(out, x)

But for now, we will look into another way we can get the Hessian using the
equivalency of the Hessian being the gradient of the Jacobian for f(x,y)= x^3 - 2xy + y^6 and for 
points x = 1, y = 2, but note that this will return the sum of each column, the final 
Hessian Matrix will be:

[[ 6., -2.  ],
 [-2., -480.]]

I specifically took this example from Khan academy so if you want to see how the derivation is calculated, 
take a look at: 
(https://www.khanacademy.org/math/multivariable-calculus/applications-of-multivariable-derivatives/quadratic-approximations/a/the-hessian)

"""

def f(x): return tf.pow(x[0], 3) - 2*x[0]*x[1] - tf.pow(x[1], 6)


x = tf.Variable([1., 2.], dtype=tf.float32)

with tf.GradientTape() as h:
    with tf.GradientTape() as g:
        out = f(x)
    jacobian = g.jacobian(out, x)                      # Calls the Jacobian method
hessian = h.gradient(jacobian, x)                      # We take the gradient of the Jacobian to get the Hessian Matrix
    
print("Jacobian: \n{} \nHessian: \n{}".format(jacobian, hessian))

# Eigenvalue and Eigenvectors for the Hessian matrix

hessian = tf.constant([[ 6., -2.], [-2., -480.]])
eigenvalues, eigenvectors = tf.linalg.eigh(hessian)

print("Eigenvalue for the Hessian: \n{} \n\nEigenvectors for the Hessian: \n{}".format(eigenvalues, eigenvectors))

"""
Let's see how Newton's method can be used to solve a quadratic function by applying it once.
We start with a quadratic function x^2 + 5y^2 and when you differentiate it, you end up with 2x + 10y
which would result in a Hessian of [[2 0], [0, 10]] because the derivative of 2x + 10y is 2 and 10.
"""

def quad(x):
    return ((x[1:])**2.0 + 5*(x[:-1])**2.0)

def quad_grad(x,y):
    return (tf.Variable([2.0*x, 10.0*y]))


# Create x and y values
x = tf.linspace(-20.0, 20.0, 100)
y = tf.linspace(-20.0, 20.0, 100)

# Broadcasts parameters for evaluation on an N-D grid
X, Y = tf.meshgrid(x, y)

# Reshape X and Y to pack along first dim (row wise) and apply the quadratic function and reshape it to get the original dimension
Z = tf.reshape(quad(tf.stack([tf.reshape(X, [10000,]), tf.reshape(Y, [10000,])], axis=0)), [100, 100])

# Take the inverse of the Hessian: (1/2, 1/10)
H_inv = - tf.constant([[0.5, 0], [0, 0.1]])

plt.figure(figsize=(12,4))
plt.subplot(121)

plt.contour(X,Y,Z);
plt.title("Steepest Descent");
step = -0.25
X0 = 10.0
Y0 = 1.0

# Here we calculate the Gradient of our function and take the dot product between the gradient and the Hessian inverse
N_grad = tf.tensordot(H_inv, quad_grad(X0,Y0), axes=1)

sgrad = step*quad_grad(X0,Y0)
plt.quiver(X0, Y0, sgrad[0], sgrad[1], color='red',angles='xy',scale_units='xy',scale=1);
X1 = X0 + sgrad[0]
Y1 = Y0 + sgrad[1]

sgrad = step*quad_grad(X1,Y1)
plt.quiver(X1.numpy(), Y1.numpy(), sgrad[0].numpy(), sgrad[1].numpy(), color='green', angles='xy', scale_units='xy', scale=1);
X2 = X1 + sgrad[0]
Y2 = Y1 + sgrad[1]

sgrad = step*quad_grad(X2,Y2)
plt.quiver(X2.numpy(), Y2.numpy(), sgrad[0].numpy(), sgrad[1].numpy(), color='purple',angles='xy',scale_units='xy',scale=1);


plt.subplot(122)
plt.contour(X,Y,Z);
plt.title("Newton's Method")
plt.quiver(X0, Y0, N_grad[0], N_grad[1], color='purple',angles='xy',scale_units='xy',scale=1);


"""
Let's see how this looks by plotting a function f(x) = −(2xy+2x−x^2−2y^2) with constraints 
x^3−y = 0 and y−(x−1)^4−2≥ 0 with bounds 0.5, 1.5, 1.5, 2.5. The solution will be where the
two constraints meet inside the bound.
"""

def f(x):
    return -(2*x[0]*x[1] + 2*x[0] - x[0]**2 - 2*x[1]**2)

x = np.linspace(0, 3, 100)
y = np.linspace(0, 3, 100)
X, Y = np.meshgrid(x, y)
Z = f(np.vstack([X.ravel(), Y.ravel()])).reshape((100,100))
plt.contour(X, Y, Z, np.arange(-1.99,10, 1), cmap='jet');
plt.plot(x, x**3, 'k:', linewidth=2, color='r')                   # constraint 1: x^3−y = 0
plt.plot(x, (x-1)**4+2, 'k:', linewidth=2, color='b')             # constraint 2: y−(x−1)^4−2≥ 0
plt.fill([0.5,0.5,1.5,1.5], [2.5,1.5,1.5,2.5], alpha=0.3)         # bounds 0.5, 1.5, 1.5, 2.5
plt.axis([0,3,0,3])

"""
There are two ways to solve the above system. Direct method and iterative method.
We start with the iterative method. The optimal solution is where the gradient becomes zero
therefore it is at x = (A^T b)*(A^T A)^{-1} and this is what we will be calculating
"""

# Generate random x and y data
x_vals = np.linspace(0., 10., 100)
y_vals = x_vals + np.random.normal(loc=0, scale=1, size=100)

x_vals_column = np.transpose(np.matrix(x_vals))
ones_column = tf.ones([100, 1], dtype=tf.float32)

# tensorflow needs its data types in float so we cast the dtypes to float
A_tensor = tf.dtypes.cast(tf.concat([x_vals_column, ones_column], axis=1), tf.float32)
Y_tensor = tf.dtypes.cast(tf.reshape(tf.transpose(y_vals), [100, 1]), tf.float32)

# Iterative method
tA_A = tf.matmul(A_tensor, A_tensor, transpose_a=True)         # We calculate A^T A
tA_A_inv = tf.linalg.inv(tA_A)                                 # And take the inverse of it (A^TA)^{-1}
product = tf.matmul(tA_A_inv, A_tensor, transpose_b=True)      # Then multiply it with A to yield (A^TA)^{-1} A
A_eval = tf.matmul(product, Y_tensor)                          # Finally we find (A^TA)^{-1}*A*b


m_slope = A_eval[0][0]
b_intercept = A_eval[1][0]
print('slope (m): ' + str(m_slope))
print('intercept (b): ' + str(b_intercept))

# Now for each x_val we find the best fit line
best_fit = []
for i in x_vals:
    best_fit.append(m_slope * i + b_intercept)

plt.plot(x_vals, y_vals, 'o', label='Data')
plt.plot(x_vals, best_fit, 'r-', label='Least square fit', linewidth=3)
plt.legend(loc='upper left')
plt.show()

"""
Next, we try the direct method for another dataset using tensorflows least squares class for function
f(x) = 1 + 2x + 3x^2
"""

# define true model parameters
x = np.linspace(-1, 1, 100)
a, b, c = 1, 2, 3
y_exact = a + b * x + c * x**2

# simulate noisy data points
m = 100
X = 1 - 2 * np.random.rand(m)
Y = tf.reshape(tf.convert_to_tensor(a + b * X + c * X**2 + np.random.randn(m)), [100, 1])
A = tf.stack([X**0, X**1, X**2], axis=1)

# Solving directly using tensorflow's least sqaures. 
sol = tf.linalg.lstsq(A, Y)

y_fit = sol[0] + sol[1] * x + sol[2] * x**2
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(X, Y, 'go', alpha=0.5, label='Simulated data')
ax.plot(x, y_exact, 'k', lw=2, label='True value $y = 1 + 2x + 3x^2$')
ax.plot(x, y_fit, 'b', lw=2, label='Least square fit')
ax.set_xlabel(r"$x$", fontsize=18)
ax.set_ylabel(r"$y$", fontsize=18)
ax.legend(loc=2);

"""
Let' take an example: Maximize f(x,y,z)=xy+yz subject to the constraints x+2y=6 and x−3z=0.
We start by setting up the equation:

F(x,y,z,λ,α) = xy + yz − λ(x+2y−6) + α(x-3z)

Now set the partial derivatives to zero and solve the following set of equations:

y − λ − α = 0
x + z − 2λ = 0
y + 3α = 0
x + 2y −6 = 0
x − 3z = 0

which is a linear equation in  x,y,z,λ,μ, Now this can be put into the matrix equation:

[0, 1, 0, -1, -1] [x] = [0]
[1, 0, 1, -2,  0] [y] = [0]
[0, 1, 0,  0,  3] [z] = [0]
[1, 2, 0,  0,  0] [λ] = [6]
[1, 0,-3,  0,  0] [μ] = [0]

"""

matrix = tf.constant([
    [0, 1, 0, -1, -1],
    [1, 0, 1, -2, 0],
    [0, 1, 0, 0, 3],
    [1, 2, 0, 0, 0],
    [1, 0,-3, 0, 0]], dtype=tf.float32)

rhs = tf.constant([[0],[0],[0],[6],[0]], dtype=tf.float32)
solve = tf.linalg.solve(matrix, rhs)

print("Solving the constrained optimization using Lagrange multipliers yield x: \n{}".format(solve))

# Moore-penrose pseudoinverse solution to the above problem
mpp_solution = tfp.math.pinv(matrix)

print(mpp_solution)