""" At the moment of writing (06.06.2019) the only build that supports tensorflow probability is the tensorflow nightly build so we will use that to install tensorflow 2.0 and tensorflow probability. """ # Install tensorflow 2.0 and tensorflow probability from the nightly build !pip install --upgrade tf-nightly-2.0-preview tfp-nightly # Imports import tensorflow as tf import tensorflow_probability as tfp from matplotlib import pyplot as plt from mpl_toolkits.mplot3d import Axes3D import numpy as np # plt axis colors setup plt.rc_context({'axes.edgecolor':'orange', 'xtick.color':'red', 'ytick.color':'red', 'text.color':'orange'}) """ In the same way, no matter how many base 2 digits you’re willing to use, the decimal value 0.1 cannot be represented exactly as a base 2 fraction. In base 2, 1/10 is the infinitely repeating fraction 0.0001100110011001100110011001100110011001100110011... One illusion may beget another. For example, since 0.1 is not exactly 1/10, summing three values of 0.1 may not yield exactly 0.3, either: Also, since the 0.1 cannot get any closer to the exact value of 1/10 and 0.3 cannot get any closer to the exact value of 3/10, then pre-rounding with round() function cannot help: Though the numbers cannot be made closer to their intended exact values, the round() function can be useful for post-rounding so that results with inexact values become comparable to one another: Binary floating-point arithmetic holds many surprises like this. """ x = 0.1 y = 0.2 print("x + y = {}".format(x + y)) print("Rounded x + y: {}".format(round(x + y, 1))) print("Check if .1 + .1 +.1 == .3: {}".format(1 + .1 + .1 == .3)) print("What if we pre round before adding: {}".format(round(.1, 1) + round(.1, 1) + round(.1, 1) == round(.3, 1))) print("What if we post round after adding: {}".format(round(.1 + .1 + .1, 10) == round(.3, 10))) def softmax(x, solve=False): """Softmax implementation""" if solve: z = x-max(x) else: z = x numerator = tf.math.exp(z) denominator = tf.math.reduce_sum(numerator) return tf.divide(numerator, denominator) # Underflow example """ If c is very negative, exp(c) will underflow, meaning the denominator will become 0 """ underflow = tf.constant([-12345, -67890, -99999999], dtype=tf.float32) print("Softmax Underflow {}".format(softmax(underflow, solve=False))) # Overflow example """ When c is very large and positive, exp(c) will overflow and the expression ends up being undefined """ overflow = tf.constant([12345, 67890, 99999999], dtype=tf.float32) print("Softmax Overflow {}".format(softmax(overflow, solve=False))) # Solution """ Both of these can be solved by evaluating softmax(z) where z = x - max_i x_i. This works because subtracting max results in the largest argument to exp being 0, getting rid of overflow and atleast one term in the denominator has a value of 1, which gets rid of underflow Compare the overflow and underflow examples """ underflow = tf.constant([-12345, -67890, -99999999], dtype=tf.float32) print("Underflow Solved: {}".format(softmax(underflow, solve=True))) overflow = tf.constant([12345, 67890, 99999999], dtype=tf.float32) print("Overflow Solved: {}".format(softmax(overflow, solve=True))) # compare the solution with the tensorflow softmax implementation underflow_softmax_tf = tf.nn.softmax(underflow, axis=None) overflow_softmax_tf = tf.nn.softmax(overflow, axis=None) print("Tensorflow Softmax Underflow: {} \nTensorflow Softmax Overflow: {}".format(underflow_softmax_tf, overflow_softmax_tf)) A = tf.constant([[4.1, 2.8], [9.7, 6.6]], dtype=tf.float32) b = tf.constant([[4.1], [9.7]], dtype=tf.float32) print("Matrix A: \n{}\n".format(A)) # solve for x, from Ax=b, x = A^(-1) b x = tf.linalg.matmul(tf.linalg.inv(A), b) print("Value of x: \n{}\n".format(x)) # Now lets see what happens if we add 0.01 to the first component of b b2 = tf.constant([[4.11], [9.7]], dtype=tf.float32) # We can also use tf.linalg.solve to solve a systems of linear equations x_solve = tf.linalg.solve(A, b2) print("Solving for new x: \n{}".format(x_solve)) print("We can see how the solution changes dramatically for a small increase in value\n") # let's now calculate the condition number for matrix A using ||A|| * ||A^-1|| condition_A = tf.math.multiply(tf.norm(A), tf.norm(tf.linalg.inv(A))) print("Condition Number of A: {}".format(condition_A)) """ Let's see how these look like by plotting these. Note that I am using numpy for these plots but we will start using tensorflow in the upcoming sections. Also, taking the derivatives and most optimizations are easily done with packages like numpy, scipy or sympy since these are dedicated scientific libraries whereas tensorflow is a machine learning framework but I am working on bringing a scientific extension to the tensorflow, please see (https://github.com/mukeshmithrakumar/scientific). This is work in progress and contributions are welcome. """ polynomial = np.poly1d([2,-4,-28,62,122,-256,-196,140,392,240,72]) # Create a one-dimensional polynomial class polynomial_root = polynomial.deriv().r # Return a derivative of this polynomial with the roots r_crit = polynomial_root[polynomial_root.imag==0].real # Return the real part of the complex argument test = polynomial.deriv(2)(r_crit) # local minima x_min = r_crit[test>0] y_min = polynomial(x_min) plt.plot(x_min, y_min, 'o') plt.text(1.5, 200, 'local minima') # local maxima x_max = r_crit[test<0] y_max = polynomial(x_max) plt.plot(x_max, y_max, 'o', color='r') plt.text(0.5, 800, 'local maxima') #global maxima plt.text(-1.5, 1600, 'global maxima') # global minima xc = np.arange(-2.5, 2.6, 0.02) yc = polynomial(xc) plt.plot( xc, yc) plt.xlim([-2.5,2.5]) plt.text(-1, 400, 'global minima') # Saddle Point fig = plt.figure() ax = fig.add_subplot(111, projection='3d') plot_args = {'rstride': 1, 'cstride': 1, 'cmap':"Blues_r", 'linewidth': 0.4, 'antialiased': True, 'vmin': -1, 'vmax': 1} x, y = np.mgrid[-1:1:31j, -1:1:31j] z = x**2 - y**2 ax.plot_surface(x, y, z, **plot_args) ax.view_init(azim=-60, elev=30) ax.set_xlim(-1, 1) ax.set_ylim(-1, 1) ax.set_zlim(-1, 1) plt.xticks([-1, -0.5, 0, 0.5, 1], [r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"]) plt.yticks([-1, -0.5, 0, 0.5, 1],[r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"]) ax.set_zticks([-1, -0.5, 0, 0.5, 1]) ax.set_zticklabels([r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"]) ax.scatter([0], [0], [.3], s=100, color = 'r', marker="o") plt.title("Saddle Point") plt.show() """ Lets say we have a function z = f(x, y) = 2x - y: The partial derivatives of this equation is: dz_dx = 2, dz_dy = -1. Let's see how we can calculate this in tensorflow using GradientTape. GradientTape records operations for automatic differentiation and we use it with persistent=True to allow multiple calls to the gradient(). Note that since we are using constants, we need to ensure that these are being traced when we do the GradientTape, hence we use a watch method. But when we start working with variables, we don't need to have a watch. """ x = tf.constant(1.0) y = tf.constant(2.0) with tf.GradientTape(persistent=True) as g: g.watch(y) g.watch(x) out = tf.subtract(2*x, y) dz_dx = g.gradient(out, x) dz_dy = g.gradient(out, y) del g # Drop the reference to the tape print("Partial Derivative of dz_dx: {} \nPartial Derivative of dz_dy: {} ".format(dz_dx, dz_dy)) """ f(x,y) = x^2 + y^2 + xsiny + ysinx f'(x, y) = (2 x + y cos(x) + sin(y)) dx + (2 y + x cos(y) + sin(x)) dy If you know chain rule, this can be easily differentiated but if you don't, you don't have to worry, tensorflow will take care of any kind of function for you using Gradient Tape """ def f(x, y): return tf.pow(x, 2) + tf.pow(y, 2) + tf.multiply(x, tf.math.sin(y)) + tf.multiply(y, tf.math.sin(x)) def grad(x, y): with tf.GradientTape() as t: t.watch(x) out = f(x, y) return t.gradient(out, x) a = 0.1 # learning rate x0 = tf.constant(1.0) y0 = tf.constant(1.0) print("Starting x value: {}".format(x0)) update = [] for i in range(41): x0 -= a * grad(x0, y0) update.append(x0) plt.plot(i, update[-1], color='r', marker='.') if i%5 == 0: print("Iteration: {} x: {}".format(i, x0)) plt.grid() """ Using Tensorflow GradientTape, we can easily find the Jacobian of any vector using "jacobian(target, sources)". In this example for f(x) = x^2 we will look at the Jacobian when x = [1.0, 2.0] """ with tf.GradientTape() as g: x = tf.constant([1.0, 2.0]) g.watch(x) y = x * x jacobian = g.jacobian(y, x) print("Jacobian \n{}".format(jacobian)) del g # Drop the reference to the tape """ We will use tf.GradientTape() to calculate the second derivative of the function f(x) = x^3 f'(x) = 3x^2 f''(x) = 6x """ x = tf.Variable(1.0) with tf.GradientTape() as t: with tf.GradientTape() as t2: y = x * x * x dy_dx = t2.gradient(y, x) d2y_dx2 = t.gradient(dy_dx, x) print("First Derivative of f(x): {} \nSecond Derivative of f(x): {}".format(dy_dx, d2y_dx2)) del t, t2 # Drop the reference to the tape # Plot where second derivative is zero x0 = tf.range(-15., 15., 1.) x = tf.Variable(x0) def f(x): return tf.pow(x, 2) def second_derivative(x): """ Note here since x is a variable we are not calling the watch method. Since we are taking the second derivative, we need to call the GradientTape twice and use the gradient() method to obtain the gradients. """ with tf.GradientTape() as t: with tf.GradientTape() as t2: out = f(x) dy_dx = t2.gradient(out, x) d2y_dx2 = t.gradient(dy_dx, x) return d2y_dx2 # Original Function f(x) = x^2 plt.plot(x0, f(x0), color='dodgerblue') # Second Derivative of f(x) d2 = second_derivative(x) plt.plot(x0, d2, color='#FF9A13') plt.grid() # Plot where second derivative is negative x0 = tf.range(-10., 10., .5) x = tf.Variable(x0) def f(x): return -(tf.pow(x, 4)) def second_derivative(x): with tf.GradientTape() as t: with tf.GradientTape() as t2: out = f(x) dy_dx = t2.gradient(out, x) d2y_dx2 = t.gradient(dy_dx, x) return d2y_dx2 # Original Function f(x): -x^4 plt.plot(x0, f(x0), color='dodgerblue') # Second Derivative of f(x) d3 = second_derivative(x) plt.plot(x0, d3, color='#FF9A13') plt.grid() # Plot where second derivative is positive x0 = tf.range(-10., 10., .5) x = tf.Variable(x0) def f(x): return (tf.pow(x, 4)) def second_derivative(x): with tf.GradientTape() as t: with tf.GradientTape() as t2: out = f(x) dy_dx = t2.gradient(out, x) d2y_dx2 = t.gradient(dy_dx, x) return d2y_dx2 # Original Function f(x): x^4 plt.plot(x0, f(x0), color='dodgerblue') # Second Derivative of f(x) d3 = second_derivative(x) plt.plot(x0, d3, color='#FF9A13') plt.grid() """ NOTE: As of 06.14.2019 tf.hessians doesn't work with tf eager mode and the tf.GradientTape doesn't have a method hessians. There is a bug report open in GitHub (https://github.com/tensorflow/tensorflow/issues/29781) so as soon as that is resolved the following code will work and I will fix this cell. >>> f = 100*(y - x**2)**2 + (1 - x)**2 >>> x = tf.Variable([1., 1.]) >>> hessian_matrix = tf.hessians(f, x) >>> print(hessian_matrix) >>> with tf.GradientTape() as t: ... out = f >>> hessian = t.hessians(out, x) But for now, we will look into another way we can get the Hessian using the equivalency of the Hessian being the gradient of the Jacobian for f(x,y)= x^3 - 2xy + y^6 and for points x = 1, y = 2, but note that this will return the sum of each column, the final Hessian Matrix will be: [[ 6., -2. ], [-2., -480.]] I specifically took this example from Khan academy so if you want to see how the derivation is calculated, take a look at: (https://www.khanacademy.org/math/multivariable-calculus/applications-of-multivariable-derivatives/quadratic-approximations/a/the-hessian) """ def f(x): return tf.pow(x[0], 3) - 2*x[0]*x[1] - tf.pow(x[1], 6) x = tf.Variable([1., 2.], dtype=tf.float32) with tf.GradientTape() as h: with tf.GradientTape() as g: out = f(x) jacobian = g.jacobian(out, x) # Calls the Jacobian method hessian = h.gradient(jacobian, x) # We take the gradient of the Jacobian to get the Hessian Matrix print("Jacobian: \n{} \nHessian: \n{}".format(jacobian, hessian)) # Eigenvalue and Eigenvectors for the Hessian matrix hessian = tf.constant([[ 6., -2.], [-2., -480.]]) eigenvalues, eigenvectors = tf.linalg.eigh(hessian) print("Eigenvalue for the Hessian: \n{} \n\nEigenvectors for the Hessian: \n{}".format(eigenvalues, eigenvectors)) """ Let's see how Newton's method can be used to solve a quadratic function by applying it once. We start with a quadratic function x^2 + 5y^2 and when you differentiate it, you end up with 2x + 10y which would result in a Hessian of [[2 0], [0, 10]] because the derivative of 2x + 10y is 2 and 10. """ def quad(x): return ((x[1:])**2.0 + 5*(x[:-1])**2.0) def quad_grad(x,y): return (tf.Variable([2.0*x, 10.0*y])) # Create x and y values x = tf.linspace(-20.0, 20.0, 100) y = tf.linspace(-20.0, 20.0, 100) # Broadcasts parameters for evaluation on an N-D grid X, Y = tf.meshgrid(x, y) # Reshape X and Y to pack along first dim (row wise) and apply the quadratic function and reshape it to get the original dimension Z = tf.reshape(quad(tf.stack([tf.reshape(X, [10000,]), tf.reshape(Y, [10000,])], axis=0)), [100, 100]) # Take the inverse of the Hessian: (1/2, 1/10) H_inv = - tf.constant([[0.5, 0], [0, 0.1]]) plt.figure(figsize=(12,4)) plt.subplot(121) plt.contour(X,Y,Z); plt.title("Steepest Descent"); step = -0.25 X0 = 10.0 Y0 = 1.0 # Here we calculate the Gradient of our function and take the dot product between the gradient and the Hessian inverse N_grad = tf.tensordot(H_inv, quad_grad(X0,Y0), axes=1) sgrad = step*quad_grad(X0,Y0) plt.quiver(X0, Y0, sgrad[0], sgrad[1], color='red',angles='xy',scale_units='xy',scale=1); X1 = X0 + sgrad[0] Y1 = Y0 + sgrad[1] sgrad = step*quad_grad(X1,Y1) plt.quiver(X1.numpy(), Y1.numpy(), sgrad[0].numpy(), sgrad[1].numpy(), color='green', angles='xy', scale_units='xy', scale=1); X2 = X1 + sgrad[0] Y2 = Y1 + sgrad[1] sgrad = step*quad_grad(X2,Y2) plt.quiver(X2.numpy(), Y2.numpy(), sgrad[0].numpy(), sgrad[1].numpy(), color='purple',angles='xy',scale_units='xy',scale=1); plt.subplot(122) plt.contour(X,Y,Z); plt.title("Newton's Method") plt.quiver(X0, Y0, N_grad[0], N_grad[1], color='purple',angles='xy',scale_units='xy',scale=1); """ Let's see how this looks by plotting a function f(x) = −(2xy+2x−x^2−2y^2) with constraints x^3−y = 0 and y−(x−1)^4−2≥ 0 with bounds 0.5, 1.5, 1.5, 2.5. The solution will be where the two constraints meet inside the bound. """ def f(x): return -(2*x[0]*x[1] + 2*x[0] - x[0]**2 - 2*x[1]**2) x = np.linspace(0, 3, 100) y = np.linspace(0, 3, 100) X, Y = np.meshgrid(x, y) Z = f(np.vstack([X.ravel(), Y.ravel()])).reshape((100,100)) plt.contour(X, Y, Z, np.arange(-1.99,10, 1), cmap='jet'); plt.plot(x, x**3, 'k:', linewidth=2, color='r') # constraint 1: x^3−y = 0 plt.plot(x, (x-1)**4+2, 'k:', linewidth=2, color='b') # constraint 2: y−(x−1)^4−2≥ 0 plt.fill([0.5,0.5,1.5,1.5], [2.5,1.5,1.5,2.5], alpha=0.3) # bounds 0.5, 1.5, 1.5, 2.5 plt.axis([0,3,0,3]) """ There are two ways to solve the above system. Direct method and iterative method. We start with the iterative method. The optimal solution is where the gradient becomes zero therefore it is at x = (A^T b)*(A^T A)^{-1} and this is what we will be calculating """ # Generate random x and y data x_vals = np.linspace(0., 10., 100) y_vals = x_vals + np.random.normal(loc=0, scale=1, size=100) x_vals_column = np.transpose(np.matrix(x_vals)) ones_column = tf.ones([100, 1], dtype=tf.float32) # tensorflow needs its data types in float so we cast the dtypes to float A_tensor = tf.dtypes.cast(tf.concat([x_vals_column, ones_column], axis=1), tf.float32) Y_tensor = tf.dtypes.cast(tf.reshape(tf.transpose(y_vals), [100, 1]), tf.float32) # Iterative method tA_A = tf.matmul(A_tensor, A_tensor, transpose_a=True) # We calculate A^T A tA_A_inv = tf.linalg.inv(tA_A) # And take the inverse of it (A^TA)^{-1} product = tf.matmul(tA_A_inv, A_tensor, transpose_b=True) # Then multiply it with A to yield (A^TA)^{-1} A A_eval = tf.matmul(product, Y_tensor) # Finally we find (A^TA)^{-1}*A*b m_slope = A_eval[0][0] b_intercept = A_eval[1][0] print('slope (m): ' + str(m_slope)) print('intercept (b): ' + str(b_intercept)) # Now for each x_val we find the best fit line best_fit = [] for i in x_vals: best_fit.append(m_slope * i + b_intercept) plt.plot(x_vals, y_vals, 'o', label='Data') plt.plot(x_vals, best_fit, 'r-', label='Least square fit', linewidth=3) plt.legend(loc='upper left') plt.show() """ Next, we try the direct method for another dataset using tensorflows least squares class for function f(x) = 1 + 2x + 3x^2 """ # define true model parameters x = np.linspace(-1, 1, 100) a, b, c = 1, 2, 3 y_exact = a + b * x + c * x**2 # simulate noisy data points m = 100 X = 1 - 2 * np.random.rand(m) Y = tf.reshape(tf.convert_to_tensor(a + b * X + c * X**2 + np.random.randn(m)), [100, 1]) A = tf.stack([X**0, X**1, X**2], axis=1) # Solving directly using tensorflow's least sqaures. sol = tf.linalg.lstsq(A, Y) y_fit = sol[0] + sol[1] * x + sol[2] * x**2 fig, ax = plt.subplots(figsize=(12, 4)) ax.plot(X, Y, 'go', alpha=0.5, label='Simulated data') ax.plot(x, y_exact, 'k', lw=2, label='True value $y = 1 + 2x + 3x^2$') ax.plot(x, y_fit, 'b', lw=2, label='Least square fit') ax.set_xlabel(r"$x$", fontsize=18) ax.set_ylabel(r"$y$", fontsize=18) ax.legend(loc=2); """ Let' take an example: Maximize f(x,y,z)=xy+yz subject to the constraints x+2y=6 and x−3z=0. We start by setting up the equation: F(x,y,z,λ,α) = xy + yz − λ(x+2y−6) + α(x-3z) Now set the partial derivatives to zero and solve the following set of equations: y − λ − α = 0 x + z − 2λ = 0 y + 3α = 0 x + 2y −6 = 0 x − 3z = 0 which is a linear equation in x,y,z,λ,μ, Now this can be put into the matrix equation: [0, 1, 0, -1, -1] [x] = [0] [1, 0, 1, -2, 0] [y] = [0] [0, 1, 0, 0, 3] [z] = [0] [1, 2, 0, 0, 0] [λ] = [6] [1, 0,-3, 0, 0] [μ] = [0] """ matrix = tf.constant([ [0, 1, 0, -1, -1], [1, 0, 1, -2, 0], [0, 1, 0, 0, 3], [1, 2, 0, 0, 0], [1, 0,-3, 0, 0]], dtype=tf.float32) rhs = tf.constant([[0],[0],[0],[6],[0]], dtype=tf.float32) solve = tf.linalg.solve(matrix, rhs) print("Solving the constrained optimization using Lagrange multipliers yield x: \n{}".format(solve)) # Moore-penrose pseudoinverse solution to the above problem mpp_solution = tfp.math.pinv(matrix) print(mpp_solution)