#!/usr/bin/env python
# coding: utf-8

# # Gini Mean Difference Portfolio Optimization
# 
# In this notebook we show how we can solve a hard problem using some reformulations.
# 
# ## 1. Gini Optimization
# 
# ### 1.1 Original formulation
# 
# The Gini mean difference (GMD) is a measure of dispersion and it was introduced in the context of portfolio optimization by __[Yitzhaki (1982)](https://www.researchgate.net/publication/4900733_Stochastic_Dominance_Mean_Variance_and_Gini%27s_Mean_Difference)__. However, this model is not used by practitioners due to the original formulation having a number of variables that increases proportional to $T(T-1)/2$, where $T$ is the number of observations. The original model is presented as follows:
# 
# $$
# \begin{equation}
# \begin{aligned}
# & \underset{x,\, d}{\text{min}} & & \frac{1}{T(T-1)} \sum^{T}_{i=1} \sum^{T}_{j > i} d_{i,j} \\
# & \text{s.t.} & & \mu x \geq \bar{\mu} \\
# & & &  d_{i,j} \geq (r_{i} -r_{j})x \; ; \; \forall \; i,j =1, \ldots, T \; ; \; i < j \\
# & & &  d_{i,j} \geq -(r_{i} -r_{j})x \\
# & & &  \sum_{i=1}^{N} x_i = 1 \\
# & & &  x_{i} \geq 0 \; ; \; \forall \; i =1, \ldots, N \\
# & & &  d_{i,j} \geq 0 \; ; \; \forall \; i,j =1, \ldots, T \\
# \end{aligned}
# \end{equation}
# $$
# 
# Where $r_{i}$ is the vector of returns in period $i$ and $d$ is an auxiliary variable.
# 
# ### 1.2 Murray's reformulation
# 
# To increase the efficiency of the problem above, __[Murray (2022)](https://github.com/cvxpy/cvxpy/issues/1585)__ proposed the following reformulation:
# 
# $$
# \begin{equation}
# \begin{aligned}
# & \underset{x,\, d}{\text{min}} & & \frac{1}{T(T-1)} \sum^{T}_{i=1} \sum^{T}_{j > i} d_{i,j} \\
# & \text{s.t.} & & \mu x \geq \bar{\mu} \\
# & & & y = r x \\
# & & & d \geq M y \\
# & & & d \geq -M y \\
# & & &  \sum_{i=1}^{N} x_i = 1 \\
# & & &  x_{i} \geq 0 \; ; \; \forall \; i =1, \ldots, N \\
# & & &  d_{i,j} \geq 0 \; ; \; \forall \; i,j =1, \ldots, T \\
# \end{aligned}
# \end{equation}
# $$
# 
# where 
# $$
# M = \begin{bmatrix}
# \left. \begin{matrix}
# -1 & 1 & 0 & 0 & \ldots & 0 & 0\\
# -1 & 0 & 1 & 0 & \ldots & 0 & 0\\
# \vdots & \vdots & \vdots & \vdots & \ddots & \vdots & \vdots \\
# -1 & 0 & 0 & 0 & \ldots & 0 & 1 \\
# \end{matrix} \right \} & T-1\\
# \left. \begin{matrix}
# 0 & -1 & 1 & 0 & \ldots & 0 & 0\\
# 0 &  -1 & 0 & 1 & \ldots & 0 & 0\\
# \vdots & \vdots & \vdots & \vdots & \ddots & \vdots &  \vdots \\
# 0 & -1 & 0 & 0 & \ldots & 0 & 1 \\
# \end{matrix} \right \} & T-2\\          
# \vdots \\
# \underbrace{ \left. \begin{matrix}
# 0 & 0 & 0 & 0 & \ldots & -1 & 1 \\
# \end{matrix} \right \} }_{T} & 1 \\
# \end{bmatrix}
# $$
# 
# This reformulation is more efficient for medium scale problems (T<800).
# 
# ### 1.3 Cajas's reformulation:
# 
# __[Cajas (2021)](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3988927)__ proposed an alternative reformulation based on the ordered weighted averaging (OWA) operator optimization model for monotonic weights proposed by __[Chassein and Goerigk (2015)](https://kluedo.ub.uni-kl.de/frontdoor/deliver/index/docId/3899/file/paper.pdf)__. This formulation works better for large scale problems (T>=800). This formulation is presented as follows:
# 
# $$
# \begin{equation}
# \begin{aligned}
# & \min_{\alpha, \, \beta, \, x, \, y} & & \sum^{T}_{i=1} \alpha_{i} + \beta_{i}  \\
# & \text{s.t.} & & \mu x \geq \bar{\mu} \\
# & & & r x = y \\
# & & & \alpha_{i} + \beta_{j} \geq w_{i} y_{j}  \;\;\;\; \forall \; i,j =1, \ldots, T \\
# & & &  \sum_{i=1}^{N} x_i = 1 \\
# & & &  x_i \geq 0 \; ; \; \forall \; i =1, \ldots, N \\
# \end{aligned}
# \end{equation}
# $$
# 
# where $w_{i} =  2 \left ( \frac{2i - 1 - T}{T(T-1)} \right )$.

# In[1]:


import numpy as np
import pandas as pd
import cvxpy as cp
import mosek
import scipy.stats as st
from timeit import default_timer as timer
from datetime import timedelta
import warnings

warnings.filterwarnings("ignore")

def gini(mu, returns, D, assets, lift=0):
    (T, N) = returns.shape

    d = cp.Variable((int(T * (T - 1) / 2),1))
    w = cp.Variable((N,1))
    constraints = []

    if lift in ['Murray', 'Yitzhaki']:  # use Murray's reformulation
        if lift == 'Murray':    
            ret_w = cp.Variable((T,1))
            constraints.append(ret_w == returns @ w)
            mat = np.zeros((d.shape[0], T))
            """ 
            We need to create a vector that has the following entries:
                ret_w[i] - ret_w[j]
            for j in range(T), for i in range(j+1, T).
            We do this by building a numpy array of mostly 0's and 1's.
            (It would be better to use SciPy sparse matrix objects.)
            """
            ell = 0
            for j in range(T):
                for i in range(j+1, T):
                    # write to mat so that (mat @ ret_w)[ell] == var_i - var_j
                    mat[ell, i] = 1
                    mat[ell, j] = -1
                    ell += 1
            all_pairs_ret_diff = mat @ ret_w
        elif lift == 'Yitzhaki':  # use the original formulation
            all_pairs_ret_diff = D @ w
    
        constraints += [d >= all_pairs_ret_diff,
                        d >= -all_pairs_ret_diff,
                        w >= 0,
                        cp.sum(w) == 1,
                       ]

        risk = cp.sum(d) / ((T - 1) * T)

    elif lift == 'Cajas':
        a = cp.Variable((T,1))
        b = cp.Variable((T,1))
        y = cp.Variable((T,1))

        owa_w = []
        for i in range(1,T+1):
            owa_w.append(2*i - 1 - T)
        owa_w = np.array(owa_w) / (T * (T-1))

        constraints = [returns @ w == y,
                       w >= 0,
                       cp.sum(w) == 1]
   
        for i in range(T):
            constraints += [a[i] + b >= cp.multiply(owa_w[i], y)]

        risk = cp.sum(a + b)

    objective = cp.Minimize(risk * 1000)

    prob = cp.Problem(objective, constraints)

    try:
        prob.solve(solver=cp.MOSEK,
                   mosek_params={'MSK_IPAR_NUM_THREADS': 2},
                   verbose=False,)
        w = pd.DataFrame(w.value)
        w.index = assets
        w = w / w.sum()
    except:
        w = None
    return w


# In[2]:


####################################
# Calculating the Gini Portfolio
# using three formulations
####################################

rs = np.random.RandomState(123)

sizes = [100]
data = {}
weights = {}
lifts = ['Yitzhaki', 'Murray', 'Cajas']
k = 0
T = [200, 500, 700, 1000]

for t in T:
    for n in sizes:
        
        cov = rs.rand(n,n) * 1.5 - 0.5
        cov = cov @ cov.T/1000 + np.diag(rs.rand(n) * 0.7 + 0.3)/1000
        mean = np.zeros(n) + 1/1000

        Y = st.multivariate_normal.rvs(mean=mean, cov=cov, size=t, random_state=rs)
        Y = pd.DataFrame(Y)
        assets = ['Asset '+ str(i) for i in range(1,n+1)]
        mu = Y.mean().to_numpy()
        returns = Y.to_numpy()

        D = np.array([]).reshape(0,len(assets))
        for j in range(0, returns.shape[0]-1):
            D = np.concatenate((D, returns[j+1:] - returns[j,:]), axis=0)

        for lift in lifts:
            name = str(lift) + '-' + str(t) + '-' + str(n)
            data[name] = []
            weights[name] = []
            if t >= 700 and lift == 'Yitzhaki':
                continue
            else: 
                start = timer()
                w = gini(mu, returns, D, assets, lift=lift)
                end = timer()
                data[name].append(timedelta(seconds=end-start).total_seconds())
                weights[name].append(w)
                
            k += 1
            print(name)


# In[3]:


keys = list(data.keys())
for i in keys:
    if len(data[i]) == 0:
        del data[i]

pd.options.display.float_format = '{:.2f}'.format

a = pd.DataFrame(data).T
a.columns = ['Time in Seconds']
display(a)


# As we can see, as the number of observations $T$ increases the formulation proposed by Cajas (2021) becomes more efficient than Yitzhaki's and Murray's formulations.

# In[4]:


b = pd.DataFrame([])
for i in weights.keys():
    if len(weights[i]) == 0:
        continue
    weights[i][0].columns = [i]
    b = pd.concat([b , mu @ weights[i][0]],axis=0)
    
pd.options.display.float_format = '{:.4%}'.format

display(b)


# If we calculate the expected returns for each formulation, we can see that the three models give us the same results, which means that these formulations give us the same solution.

# For more portfolio optimization models and applications, you can see the CVXPY based library __[Riskfolio-Lib](https://github.com/dcajasn/Riskfolio-Lib)__.
# 
# Finally, I hope you liked this example.