#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# Authors: Joseph Salmon
#          Mathurin Massias
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state

# dirty local imports:
import sys
sys.path.append("./../../../")
from share_code.utils import (my_saving_display, make_huber, make_abs,
                              make_median, make_bisquare)

sns.set_palette("colorblind")

get_ipython().run_line_magic('matplotlib', 'notebook')


# In[ ]:


dirname = "../prebuiltimages/"
imageformat = ".pdf"  # should be .pdf or .png

# some colors I'll use
brown = (0.64, 0.16, 0.16)
purple = (148. / 255, 0, 211. / 255)
plt.close("all")

saving = False  # True


# In[ ]:


color_blind_list = sns.color_palette("GnBu_d", 8)
c1 = color_blind_list[0]
c2 = color_blind_list[1]
c3 = color_blind_list[2]


# In[ ]:


def gradient_descent(func, func_prime, steps, theta_init, n_iter=10):
    """Run gradient descent and record the iterates and objective values."""
    if not isinstance(steps, np.ndarray):
        steps = np.repeat(steps, n_iter)
    theta_init = np.atleast_1d(theta_init).astype(np.float64)
    all_thetas = np.zeros([n_iter, theta_init.shape[0]])
    all_objs = np.zeros([n_iter])
    theta = theta_init.copy()
    for it in range(n_iter):
        theta -= steps[it] * func_prime(theta)
        all_thetas[it] = theta
        all_objs[it] = func(theta)
    return all_thetas, all_objs


def weiszfeld(X, theta_init=3., n_iter=10):
    """Compute the geometric median of the points in X with Weiszfeld's algorithm.

    X : array, shape (n_samples, n_features)
        Array of points of which the median must be computed.
    """
    if X.ndim == 1:
        # hopefully it's not a single data point being passed
        X = X[:, None]
    theta_init = np.atleast_1d(theta_init).astype(np.float64)
    if theta_init.shape[0] != X.shape[1]:
        raise ValueError("theta_init has shape %s, expected %s" %
                         (theta_init.shape[0], X.shape[1]))
    theta = theta_init.copy()
    all_thetas = np.zeros([n_iter, theta_init.shape[0]])
    all_thetas[0] = theta_init
    all_objs = np.zeros([n_iter])
    all_objs[0] = np.linalg.norm(X - theta[None, :], axis=1).sum()
    for it in range(1, n_iter):
        w = 1. / (np.linalg.norm(X - theta[None, :], axis=1) + 1e-16)
        theta = np.sum(X * w[:, None], axis=0) / np.sum(w)
        all_thetas[it] = theta
        all_objs[it] = np.linalg.norm(X - theta[None, :], axis=1).sum()
    return all_thetas, all_objs


def mm_visu(func, func_prime, theta, x, step):
    """Quadratic majorizer of func at theta, evaluated on the grid x."""
    nabla = func_prime(theta)
    z = 1. / (2. * step) * (theta - step * nabla - x) ** 2 + \
        func(theta) - 1. / 2. * step * nabla ** 2
    return z


def min_gd(func, func_prime, theta, x, step):
    """Minimum value of the quadratic majorizer of func at theta."""
    nabla = func_prime(theta)
    z = func(theta) - 1. / 2. * step * nabla ** 2
    return z
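# In[ ]:


# Quick sanity check of the `gradient_descent` helper (this cell is not in the
# original notebook; the names `quad`, `quad_prime`, `thetas_check` and
# `objs_check` are ours). On f(theta) = theta^2 / 2 with a constant step of 1,
# a single gradient step lands exactly on the minimizer 0.

def quad(theta):
    return 0.5 * np.sum(theta ** 2)


def quad_prime(theta):
    return theta


thetas_check, objs_check = gradient_descent(quad, quad_prime, steps=1.,
                                            theta_init=3., n_iter=5)
print(thetas_check.ravel())  # expected: zeros from the first iterate on
print(objs_check)            # expected: zeros as well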
# In[ ]:


def my_plot(func, func_prime, x, theta_init, n_iter, step, saving):
    """Plot and save the gradient descent iterates for the absolute value function."""
    iterates_list, _ = gradient_descent(
        func, func_prime, step, theta_init=theta_init, n_iter=n_iter)
    ylims = [-2, 5]
    xlims = [-5, 5]
    mm = False
    name = r'$|\cdot|$'
    figname = "abs_step{:.1f}".format(step).replace(".", "pt")

    fig1, ax = plt.subplots(figsize=(7, 7))
    ax.set_xlim(*xlims)
    ax.set_ylim(*ylims)
    ax.plot(x, func(x), c=c3, lw=1.7, label=name)
    ax.set_title(r"Gradient descent for $|\cdot|$ function; stepsize: $\alpha={:.1f}$".format(step))
    my_saving_display(fig1, dirname, figname, imageformat, saving)
    plt.legend(loc='lower left')

    myarray = np.asarray(iterates_list[:0 + 1])
    ax.plot(myarray, func(myarray), 'o')
    my_saving_display(fig1, dirname, figname + "_with_init", imageformat,
                      saving)
    plt.legend(loc='lower left')

    for index_to_plot in range(n_iter - 1):
        fig1, ax = plt.subplots(figsize=(7, 7))
        ax.set_title(r"Gradient descent for $|\cdot|$ function; stepsize: $\alpha={:.1f}$".format(step))
        ax.set_xlim(*xlims)
        ax.set_ylim(*ylims)
        if mm:
            z = mm_visu(func, func_prime, iterates_list[index_to_plot], x,
                        step=step)
            ax.plot(x, z, c='purple', lw=1.1,
                    label=r"$g(\theta^{:1}, \cdot)$".format(index_to_plot))
            minima = min_gd(func, func_prime, iterates_list[index_to_plot], x,
                            step=step)
            ax.plot(iterates_list[index_to_plot + 1], minima, 'o', c='purple')
        ax.plot(x, func(x), c=c3, lw=1.7, label=name)
        myarray = np.asarray(iterates_list[:index_to_plot + 1])
        ax.plot(myarray, func(myarray), 'o')
        plt.legend(loc='lower left')
        my_saving_display(fig1, dirname, figname + str(index_to_plot),
                          imageformat, saving)


# # Geometric Median case

# In[ ]:


saving = True  # True

x = np.linspace(-5, 5, 501)
theta_init = 3
n_iter = 5
step = 1.5

func, func_prime = make_abs()
my_plot(func, func_prime, x, theta_init, n_iter, step, saving)


# In[ ]:


rng = check_random_state(24)
n_samples, n_features = 21, 2
X = rng.normal(0, 1, size=(n_samples, n_features))
n_iter = 50
theta_init = np.mean(X, axis=0)


# In[ ]:


if X.shape[1] == 2:
    plt.scatter(X[:, 0], X[:, 1])


# In[ ]:


from scipy.optimize import minimize


def cost_median(theta, X):
    """Sum of Euclidean distances from theta to the rows of X."""
    return np.linalg.norm(X - theta[None, :], axis=1).sum()


# numerical minimization of the cost function, starting with the mean of the dataset:
opt_result = minimize(lambda theta: cost_median(theta, X), x0=theta_init,
                      method='BFGS')
true_median = opt_result["x"]
cost_min = cost_median(true_median, X)
print("scipy's solution: ", true_median)
print("scipy's objective: ", cost_min)


# In[ ]:


weiszfeld_iterates, weiszfeld_objs = weiszfeld(X, theta_init=theta_init,
                                               n_iter=n_iter)
print("our solution: ", weiszfeld_iterates[-1])
print("last objective: ", weiszfeld_objs[-1])


# In[ ]:


n_iter = 100
sqrt_decay_step = 0.5 / np.sqrt(np.arange(n_iter) + 1)
func, func_prime = make_median(X)

_, list_gd_val10pt0_decay = gradient_descent(
    func, func_prime, 10. / X.shape[0] * sqrt_decay_step,
    theta_init=theta_init, n_iter=n_iter)
_, list_gd_val0pt01 = gradient_descent(
    func, func_prime, 0.5 / X.shape[0], theta_init=theta_init, n_iter=n_iter)
list_gd_val10pt0_decay[-10:]


# In[ ]:


fig1 = plt.figure(figsize=(7, 7))
plt.semilogy(list_gd_val10pt0_decay - cost_min,
             label=r"GD $\alpha=\frac{1}{\sqrt{t}}$")
plt.semilogy(list_gd_val0pt01 - cost_min, label=r"GD $\alpha=cst$")
plt.semilogy(weiszfeld_objs - cost_min, label="Weiszfeld")
plt.xlabel("Number of epochs")
plt.ylabel(r"$f(\theta) - f(\theta^{\star})$")
plt.legend()
figname = "WeiszfeldVsGD"
my_saving_display(fig1, dirname, figname, imageformat, saving)
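# In[ ]:


# Optional check (this cell is not in the original notebook): a single Weiszfeld
# update is the weighted mean of the samples, with weights 1 / ||x_i - theta||.
# The names `w_check` and `theta_one_step` are ours; X and theta_init are reused
# from above.

w_check = 1. / (np.linalg.norm(X - theta_init[None, :], axis=1) + 1e-16)
theta_one_step = np.sum(X * w_check[:, None], axis=0) / np.sum(w_check)
print("manual one-step update:    ", theta_one_step)
print("weiszfeld's second iterate:",
      weiszfeld(X, theta_init=theta_init, n_iter=2)[0][1])  # should match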
# # Biweight case

# In[ ]:


def cost_bisquare(theta, X, threshold):
    """Tukey's biweight (bisquare) loss, summed over the rows of X."""
    z = 1 - (1 - (np.linalg.norm(X - theta[None, :], axis=1) / threshold) ** 2) ** 3
    z = np.minimum(1, z)
    return np.sum(z)


# In[ ]:


rng = check_random_state(24)
n_samples, n_features = 10, 1
X = rng.normal(0, 1, size=(n_samples, n_features))
n_iter = 100
threshold = 0.505


# In[ ]:


# theta_init = np.mean(X, axis=0)
theta_init = -0.2 * np.ones(X.shape[1])
print("Initial solution: ", theta_init)
print("Initial objective:", cost_bisquare(theta_init, X, threshold))


# In[ ]:


# numerical minimization of the cost function, starting from theta_init:
opt_result = minimize(lambda theta: cost_bisquare(theta, X, threshold),
                      x0=theta_init, method='BFGS')
scipy_sol = opt_result["x"]
cost_min = cost_bisquare(scipy_sol, X, threshold)
print("scipy's solution: ", scipy_sol)
print("scipy's objective: ", cost_min)


# In[ ]:


func, func_prime, func_weight = make_bisquare(X, threshold)

# In 1D the Lipschitz constant of the gradient is 6, since $\rho''(x) \leq 6$
list_gd_iterate, list_gd_val = gradient_descent(
    func, func_prime, 1 / (6 * X.shape[0]), theta_init=theta_init,
    n_iter=n_iter)
print("GD's solution: ", list_gd_iterate[-1])
print("GD's objective: ", list_gd_val[-1])


# In[ ]:


def mm_bisquare(X, func_weight, theta_init, n_iter=10):
    """Compute the M-estimator (biweight) of the points in X with an MM algorithm.

    X : array, shape (n_samples, n_features)
        Array of points of which the M-estimator must be computed.
    """
    if X.ndim == 1:
        # hopefully it's not a single data point being passed
        X = X[:, None]
    theta_init = np.atleast_1d(theta_init).astype(np.float64)
    if theta_init.shape[0] != X.shape[1]:
        raise ValueError("theta_init has shape %s, expected %s" %
                         (theta_init.shape[0], X.shape[1]))
    theta = theta_init.copy()
    all_thetas = np.zeros([n_iter, theta_init.shape[0]])
    all_thetas[0] = theta_init
    all_objs = np.zeros([n_iter])
    all_objs[0] = cost_bisquare(theta, X, threshold)
    for it in range(1, n_iter):
        w = func_weight(theta)
        if np.linalg.norm(w) > 1e-12:
            theta = np.sum(X * w[:, None], axis=0) / np.sum(w)
        else:
            theta = np.zeros(theta_init.shape)
        all_thetas[it] = theta
        all_objs[it] = cost_bisquare(theta, X, threshold)
    return all_thetas, all_objs


# In[ ]:


mm_iterates, mm_objs = mm_bisquare(X, func_weight, theta_init=theta_init,
                                   n_iter=n_iter)
print("our solution: ", mm_iterates[-1])
print("last objective: ", mm_objs[-1])


# In[ ]:


fig1 = plt.figure(figsize=(7, 7))
plt.semilogy(list_gd_val - cost_min, label=r"GD $\alpha=cst$")
plt.semilogy(mm_objs - cost_min, label="MM")
plt.xlabel("Number of epochs")
plt.ylabel(r"$f(\theta) - f(\theta^{\star})$")
plt.legend()
figname = "MMalgVsGD"
my_saving_display(fig1, dirname, figname, imageformat, saving)


# In[ ]:


cost_x = np.zeros(x.shape)
if n_features == 1:
    for i, valx in enumerate(x):
        theta1 = np.array([valx])
        cost_x[i] = cost_bisquare(theta1, X, threshold)

fig1 = plt.figure()
plt.title("Objective function")
plt.plot(x, cost_x, lw=1.1)
plt.scatter(theta_init, cost_bisquare(theta_init, X, threshold), s=21,
            label='init')
plt.scatter(mm_iterates[-1], cost_bisquare(mm_iterates[-1], X, threshold),
            s=41, label='MM', marker='d')
plt.scatter(scipy_sol, cost_bisquare(scipy_sol, X, threshold), s=21,
            label='BFGS', marker='x')
plt.scatter(list_gd_iterate[-1],
            cost_bisquare(list_gd_iterate[-1], X, threshold), s=41,
            label='GD', marker='+')
plt.legend()
figname = "Bisquare1DLandscape"
my_saving_display(fig1, dirname, figname, imageformat, saving)
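# In[ ]:


# Optional check (this cell is not in the original notebook): an MM scheme should
# yield a non-increasing sequence of objective values; `mm_objs` is reused from
# above, and the tolerance only guards against rounding noise.

print("MM objective non-increasing:", np.all(np.diff(mm_objs) <= 1e-12))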
# In[ ]:


plt.figure()
plt.title("Concavity of $f$ for the bi-square case")
y = 1 - (1 - np.abs(x / threshold)) ** 3
y[np.abs(x) > threshold] = 1
plt.plot(x, y)


# In[ ]:


# TODO: movie for the evolution of the objective function w.r.t. the threshold
# parameter (to show the interest of homotopy methods)


# In[ ]:
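# In[ ]:


# A possible static starting point for the TODO above (this cell is not in the
# original notebook): overlay the 1D objective landscape for a few threshold
# values instead of a movie. The names `thresholds_demo` and `cost_grid` are
# ours; x, X, n_features and cost_bisquare are reused from above.

if n_features == 1:
    fig1 = plt.figure()
    plt.title("Objective landscape for several thresholds")
    thresholds_demo = [0.25, 0.505, 1., 2.]
    for thr in thresholds_demo:
        cost_grid = np.array([cost_bisquare(np.array([valx]), X, thr)
                              for valx in x])
        plt.plot(x, cost_grid, lw=1.1, label="threshold $= {:.3g}$".format(thr))
    plt.legend()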