Imports

In [1]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2

from IPython.display import display, Image

# import standard libs
from pathlib import Path, PurePath
import sys
import time
import os
import itertools


def get_relative_project_dir(project_repo_name=None, partial=True):
    """Helper to locate the local project directory by walking up from the
    current working directory until the repo name matches."""
    current_working_directory = Path.cwd()
    cwd_parts = current_working_directory.parts
    if partial:
        # accept any ancestor whose name contains the repo name
        while project_repo_name not in cwd_parts[-1]:
            current_working_directory = current_working_directory.parent
            cwd_parts = current_working_directory.parts
    else:
        # require an exact directory-name match
        while cwd_parts[-1] != project_repo_name:
            current_working_directory = current_working_directory.parent
            cwd_parts = current_working_directory.parts
    return current_working_directory

# get project dir
pdir = get_relative_project_dir('mixture_model_trading_public', partial=False)
data_dir = pdir / 'data'
script_dir = pdir / 'scripts' 
sys.path.append(script_dir.as_posix())

# import python scientific stack
import pandas as pd
pd.set_option('display.width', 1000)
#import pandas_datareader as pdr
import numpy as np
import sklearn.mixture as mix
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as stats
from arch.unitroot import KPSS
from numba import jit
import math

# import visual tools
from mpl_toolkits import mplot3d
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import plotnine as pn
import mizani.breaks as mzb
import mizani.formatters as mzf
import seaborn as sns

# import util libs
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from utils import cprint

from dotenv import load_dotenv
load_dotenv()  # to use hidden api key

print()
%watermark -p pandas,pandas_datareader,numpy,sklearn,statsmodels,scipy,matplotlib,seaborn,plotnine
2019-08-13T18:08:14-06:00

CPython 3.7.3
IPython 7.6.1

compiler   : GCC 7.3.0
system     : Linux
release    : 4.19.11-041911-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 12
interpreter: 64bit

pandas 0.24.2
pandas_datareader 0.7.0
numpy 1.16.2
sklearn 0.20.3
statsmodels 0.9.0
scipy 1.2.1
matplotlib 3.0.2
seaborn 0.9.0
plotnine 0.5.1
In [2]:
%load_ext nb_black

sns_params = {
    "axes.grid": True,
    "ytick.left": True,
    "xtick.bottom": True,
    "xtick.major.size": 2,
    "ytick.major.size": 2,
    "font.size": 11,
    "font.weight": "medium",
    "figure.figsize": (10, 7),
    "font.family": "DejaVu Sans Mono",
}

sns.set(context="talk", style="ticks", rc=sns_params)
savefig_kwds = dict(dpi=90, bbox_inches="tight", frameon=True, format="png")
flatui = ["#3498db", "#9b59b6", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", "#f4cae4"]
sns.set_palette(sns.color_palette(flatui, 7))

Get Data

In [3]:
infp = data_dir / "tiingo_etf_returns_ending_2017-12-31.parq"
R = pd.read_parquet(infp).assign(year=lambda df: df.index.year)
cprint(R)
-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
                 SPY       QQQ       TLT       GLD       EFA       EEM  year
date                                                                        
2017-12-22 -0.000262 -0.001143  0.001363  0.005223  0.002571  0.008422  2017
2017-12-26 -0.001197 -0.005416  0.002961  0.006839 -0.000285 -0.000861  2017
2017-12-27  0.000486  0.000128  0.012941  0.003770  0.001853  0.002580  2017
2017-12-28  0.002055  0.001213 -0.000868  0.005060  0.000569  0.006846  2017
2017-12-29 -0.003778 -0.006208  0.001578  0.006491  0.000711  0.004680  2017
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3301 entries, 2004-11-19 to 2017-12-29
Data columns (total 7 columns):
SPY     3301 non-null float64
QQQ     3301 non-null float64
TLT     3301 non-null float64
GLD     3301 non-null float64
EFA     3301 non-null float64
EEM     3301 non-null float64
year    3301 non-null int64
dtypes: float64(6), int64(1)
memory usage: 206.3 KB
None
-------------------------------------------------------------------------------
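For reference, a frame like this could be rebuilt from raw prices. Below is a rough sketch (not run here) using pandas_datareader's Tiingo reader; the TIINGO_API_KEY environment variable and the adjClose pivot are assumptions about the original data pull, not part of this notebook:

# hypothetical sketch: rebuild a log-return frame from Tiingo prices
import pandas_datareader as pdr

etfs = ["SPY", "QQQ", "TLT", "GLD", "EFA", "EEM"]
px = pdr.get_data_tiingo(etfs, api_key=os.getenv("TIINGO_API_KEY"))
# Tiingo returns a (symbol, date) MultiIndex; pivot to date x symbol
close = px["adjClose"].unstack("symbol")
rets = np.log(close / close.shift(1)).dropna()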

Analysis Functions

In [4]:
def to_returns(s):
    """Convert a price series to log returns."""
    return np.log(s / s.shift(1)).dropna()


def to_price_index_exp(df, start=100):
    """Rebuild a price index from log returns via exponentiated cumsum."""
    return start * np.exp(df.cumsum())


def to_price_index_sum(df, start=100):
    """Rebuild an additive index by cumulatively summing returns."""
    return start + df.cumsum()


def cdescribe(x, n_cols=None):
    """Describe `x` with skew and kurtosis appended; if `n_cols` is given,
    describe a random sample of that many columns instead."""
    if n_cols:
        x = x.sample(n_cols, axis=1)
    d = x.describe()
    d.loc["skew"] = x.skew()
    d.loc["kurtosis"] = x.kurtosis()
    return d


def compare_stats(x, y, n_cols=None):
    """Concatenate descriptive stats of real (`x`) and simulated (`y`) data."""
    pd.options.display.float_format = "{:,.4f}".format
    return pd.concat([cdescribe(x), cdescribe(y, n_cols=n_cols)], axis=1)
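A quick round-trip check of the helpers above (illustrative, with made-up prices): to_returns and to_price_index_exp are inverses up to the starting price, since exp(cumsum(ln(P_t / P_{t-1}))) rebuilds P_t / P_0.

# illustrative sanity check: log returns -> exp price index round-trips
prices = pd.Series([100.0, 102.0, 99.0, 101.5])
r = to_returns(prices)                              # ln(P_t / P_{t-1})
rebuilt = to_price_index_exp(r, start=prices.iloc[0])
assert np.allclose(rebuilt.values, prices.iloc[1:].values)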

Plot Functions

In [5]:
def plot_autocorr(s, lags=50, figsize=(12, 8), title=None):
    fig = plt.figure(figsize=figsize)
    layout = 2, 2
    acf_ax = plt.subplot2grid(layout, (0, 0))
    abs_acf_ax = plt.subplot2grid(layout, (0, 1))
    pacf_ax = plt.subplot2grid(layout, (1, 0))
    squared_ax = plt.subplot2grid(layout, (1, 1))

    sm.graphics.tsa.plot_acf(
        s,
        fft=True,
        zero=False,
        lags=lags,
        ax=acf_ax,
        title="Autocorrelation of Returns",
    )

    sm.graphics.tsa.plot_acf(
        s.abs(),
        fft=True,
        zero=False,
        lags=lags,
        ax=abs_acf_ax,
        title="Autocorrelation of Absolute Returns",
    )

    sm.graphics.tsa.plot_pacf(
        s, zero=False, lags=lags, ax=pacf_ax, title="Partial Autocorrelation of Returns"
    )

    sm.graphics.tsa.plot_acf(
        s ** 2,
        fft=True,
        zero=False,
        lags=lags,
        ax=squared_ax,
        title="Autocorrelation of Squared Returns",
    )

    if title:
        fig.suptitle(title, fontweight="demi", fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return


def plot_mean_dist(returns, sim_returns, ax):

    mean_return = returns.mean()

    ax.set_title(f"{returns.name} mean return: {mean_return:.4f}")
    sim_means = sim_returns.mean().squeeze()

    g = sns.distplot(sim_means, kde=False, ax=ax)
    g.axvline(mean_return, color="r")
    return


def plot_std_dist(returns, sim_returns, ax):

    std = returns.std()

    ax.set_title(f"{returns.name} return std: {std:.4f}")
    sim_stds = sim_returns.std().squeeze()

    g = sns.distplot(sim_stds, kde=False, ax=ax)
    g.axvline(std, color="r")
    return


def plot_min_dist(returns, sim_returns, ax):

    min_ = returns.min()

    ax.set_title(f"{returns.name} return min: {min_:.4f}")
    sim_mins = sim_returns.min().squeeze()

    g = sns.distplot(sim_mins, kde=False, ax=ax)
    g.axvline(min_, color="r")
    return


def plot_max_dist(returns, sim_returns, ax):

    max_ = returns.max()

    ax.set_title(f"{returns.name} return max: {max_:.4f}")
    sim_maxs = sim_returns.max().squeeze()

    g = sns.distplot(sim_maxs, kde=False, ax=ax)
    g.axvline(max_, color="r")
    return


def plot_stat_dist(returns, sim_returns, figsize=(12, 8)):
    fig, axes = plt.subplots(2, 2, figsize=figsize)

    returns.name = "real_etf"
    plot_mean_dist(returns, sim_returns, axes[0, 0])
    plot_std_dist(returns, sim_returns, axes[0, 1])
    plot_min_dist(returns, sim_returns, axes[1, 0])
    plot_max_dist(returns, sim_returns, axes[1, 1])

    fig.suptitle(
        f"{returns.name} simulated stat distributions", fontweight="demi", fontsize=16
    )
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return


def plot_autocorr_dist(returns, sim_returns):

    autocorr = returns.autocorr()

    plt.title(f"{returns.name} return autocorr: {autocorr:.4f}")
    sim_autocorrs = sim_returns.apply(pd.Series.autocorr).squeeze()

    g = sns.distplot(sim_autocorrs, kde=False)
    g.axvline(autocorr, color="r")
    return
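plot_autocorr_dist is defined here but not exercised below; a hypothetical call for reference (assumes the R_pct and paths frames built in later cells):

# hypothetical usage: SPY's lag-1 autocorrelation vs. the simulated distribution
plot_autocorr_dist(R_pct["SPY"], paths)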

Scale and Transform Data

In [31]:
from sklearn.preprocessing import PowerTransformer, MaxAbsScaler, RobustScaler

# drop the helper `year` column and scale log returns to percent
R_pct = R.iloc[:, :-1].mul(100)
# robust scaling: center on the median, scale by the 2.5-97.5 percentile range
pt = RobustScaler(
    quantile_range=(2.5, 97.5)
)  # alternative considered: PowerTransformer(method="yeo-johnson", standardize=True)
data = pt.fit_transform(R_pct)
data.shape
Out[31]:
(3301, 6)
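RobustScaler centers each column on its median and scales by the 2.5-97.5 percentile range, which keeps fat-tailed outliers from dominating the mixture fit. The fitted transformer also inverts cleanly, which the sampling step below relies on; a quick illustrative check:

# illustrative: the scaler round-trips, so sampled points can be mapped
# back into percent-return space via inverse_transform
assert np.allclose(pt.inverse_transform(data), R_pct.values)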

Begin Analysis

Find AIC, BIC Minimum

In [32]:
## careful: this can take >6 min
n_components = np.arange(250, 2750, 250)  # candidate mixture sizes: 250, 500, ..., 2500
models = [
    mix.GaussianMixture(n, covariance_type="full", random_state=0) for n in n_components
]
# fit each model once, then score it under both criteria
fitted = [model.fit(data) for model in models]
aics = pd.Series([m.aic(data) for m in fitted], index=n_components, name="aic")
bics = pd.Series([m.bic(data) for m in fitted], index=n_components, name="bic")
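For context, sklearn's aic/bic are simple functions of the total log-likelihood: AIC = 2k - 2 ln L and BIC = k ln(n) - 2 ln L, where k is the number of free mixture parameters. A hand computation for one fitted model (illustrative; _n_parameters is a private sklearn helper):

# illustrative: recover AIC/BIC from the log-likelihood by hand
m = fitted[0]
log_l = m.score(data) * len(data)   # score() is the mean per-sample log-likelihood
k = m._n_parameters()               # free parameters (private sklearn API)
aic_manual = 2 * k - 2 * log_l
bic_manual = k * np.log(len(data)) - 2 * log_l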
In [33]:
fig, ax = plt.subplots()
_ = aics.plot(ax=ax)
ax.axvline(aics.idxmin(), color="red", ls="--", lw=7)

_ = bics.plot(ax=ax)
ax.axvline(bics.idxmin(), color="purple", ls="--", lw=7)
Out[33]:
<matplotlib.lines.Line2D at 0x7fe1ae906e80>

Fit Mixture Model

In [34]:
# refit at the AIC-minimizing component count, with multiple initializations
gmm = mix.GaussianMixture(aics.idxmin(), n_init=3, covariance_type="full")
gmm.fit(data)
print(gmm.converged_)
True
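The fitted mixture's parameters can be inspected directly (illustrative; the leading dimension is the chosen component count, the trailing ones come from the 6 ETF columns):

# illustrative: component weights, means, and full covariance matrices
print(gmm.weights_.shape)        # (n_components,)
print(gmm.means_.shape)          # (n_components, 6)
print(gmm.covariances_.shape)    # (n_components, 6, 6)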

Evaluate Single Path Sample

In [35]:
def sample_path(gmm_model, transformer, n_samples, seed=0):
    """Draw `n_samples` points from the mixture and map them back to
    percent-return space."""
    # gmm was built with random_state=None, so .sample() draws from numpy's
    # global RandomState and seeding here makes the draws reproducible
    np.random.seed(seed)
    data_new = gmm_model.sample(n_samples)[0]  # (samples, labels) -> samples
    # note: sklearn returns samples grouped by component, not time-shuffled
    paths = transformer.inverse_transform(data_new)
    return pd.DataFrame(paths)
In [36]:
N_SAMPLES = len(data)
path_df = sample_path(gmm, pt, N_SAMPLES)

Real data plot

In [37]:
_ = R_pct.cumsum().plot(figsize=(12, 10))

Synthetic data plot

In [38]:
_ = path_df.cumsum().plot(figsize=(12, 10))

Compare histogram density

In [39]:
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(path_df.mean(axis=1), ax=ax, label="sim")
ax.legend()
Out[39]:
<matplotlib.legend.Legend at 0x7fe1ae89e3c8>

Compare descriptive statistics

In [40]:
compare_stats(R_pct, path_df).T
Out[40]:
count mean std min 25% 50% 75% max skew kurtosis
SPY 3,301.0000 0.0326 1.1776 -10.1640 -0.3754 0.0660 0.5336 13.0685 -0.1341 15.2330
QQQ 3,301.0000 0.0452 1.2572 -9.3824 -0.4830 0.0911 0.6567 11.4799 -0.1257 7.8209
TLT 3,301.0000 0.0249 0.8761 -5.1767 -0.4901 0.0553 0.5365 5.0371 -0.0190 1.8800
GLD 3,301.0000 0.0310 1.1991 -9.1905 -0.5424 0.0563 0.6686 10.6974 -0.3076 5.9959
EFA 3,301.0000 0.0209 1.4326 -11.8369 -0.5321 0.0593 0.6787 14.7451 -0.1557 12.5111
EEM 3,301.0000 0.0315 1.9389 -17.6334 -0.8442 0.0965 0.9404 20.5141 0.1936 14.7749
0 3,301.0000 0.0632 1.1487 -7.0912 -0.3447 0.0880 0.5611 13.0614 0.0988 11.4614
1 3,301.0000 0.0696 1.2312 -6.2080 -0.4673 0.0961 0.6987 11.4774 0.1418 5.0027
2 3,301.0000 -0.0293 0.8759 -5.1814 -0.5180 0.0276 0.5067 3.6889 -0.3858 1.9443
3 3,301.0000 0.0225 1.1860 -7.7289 -0.5354 0.0557 0.6449 7.1052 -0.5995 4.9229
4 3,301.0000 0.0644 1.3921 -7.7590 -0.5188 0.0915 0.7217 14.7437 0.3185 9.1792
5 3,301.0000 0.1021 1.9080 -10.1001 -0.7857 0.1329 0.9893 20.5172 0.7456 10.9967

Compare Correlations

Real

In [41]:
sns.clustermap(R_pct.corr())
Out[41]:
<seaborn.matrix.ClusterGrid at 0x7fe1ae7304a8>

Synthetic

In [42]:
sns.clustermap(path_df.corr())
Out[42]:
<seaborn.matrix.ClusterGrid at 0x7fe1ae5977f0>

Plot example stat distributions

In [43]:
plot_stat_dist(R_pct.mean(axis=1), path_df)

Plot autocorrelations

Real

In [44]:
plot_autocorr(R_pct.mean(axis=1), title="Real Mean Returns Autocorrelations")

Synthetic

In [45]:
plot_autocorr(path_df.mean(axis=1), title="Simulated Mean Return Autocorrelations")

Evaluate Many Paths

In [46]:
path_dfs = list()
# 833 draws x 6 assets ~= 5,000 simulated return columns
for i in tqdm(range(int(5000 / 6))):
    path_df = sample_path(gmm, pt, N_SAMPLES, seed=i)
    path_dfs.append(path_df)

paths = pd.concat(path_dfs, axis=1)
paths.columns = np.arange(paths.shape[1])  # replace the repeated 0-5 labels
cprint(paths)
100%|██████████| 833/833 [02:32<00:00,  5.54it/s]
-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
        0       1       2      3       4       5       6       7       8       9     ...    4988    4989    4990    4991    4992    4993   4994   4995    4996    4997
3296  0.5962 -0.4723 -0.8383 1.0700  0.8320  2.7895 -0.9240 -1.0146 -0.2378 -1.6050  ...  0.5360 -1.9595 -1.5479 -1.6083 -0.7385 -0.9492 0.7755 1.0338 -0.0969 -0.5895
3297  0.5958 -0.4853 -0.8314 1.0739  0.8311  2.7969  0.5907 -0.4814 -0.8281  1.0734  ... -0.1942 -0.3932 -1.1604 -0.2333 -0.6888 -0.8453 0.8455 1.1289 -0.2279 -0.6704
3298  0.5957 -0.4846 -0.8351 1.0697  0.8198  2.7986 -0.9117 -0.7246  0.9425  0.6990  ... -0.2949 -0.0912 -1.0195 -0.2119 -0.6356 -0.8945 0.8239 1.2172 -0.1857 -0.8031
3299 -0.7101 -1.0065  0.6090 1.2230 -0.3330 -0.9851 -0.6784 -0.7974  0.8515  1.1884  ... -0.2356 -1.6057 -0.9591 -3.1173 -0.6599 -0.8734 0.7320 1.2673 -0.3875 -0.9484
3300 -0.6280 -0.9970  0.6703 1.2900 -0.2749 -1.0431 -0.8262 -0.7244  0.7384  1.0918  ... -0.2291 -1.6137 -0.9532 -3.1094 -0.9077 -0.7789 0.9012 0.7492 -0.1063 -0.0608

[5 rows x 4998 columns]
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3301 entries, 0 to 3300
Columns: 4998 entries, 0 to 4997
dtypes: float64(4998)
memory usage: 125.9 MB
None
-------------------------------------------------------------------------------

Synthetic data plot

In [47]:
paths.sample(10, axis=1, random_state=0).cumsum().plot(figsize=(12, 10))
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe1ae197780>

Plot Histogram Comparisons

Random selection of 6 Paths

In [48]:
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(paths.sample(6, axis=1, random_state=0).mean(axis=1), ax=ax, label="synth")
ax.legend()
Out[48]:
<matplotlib.legend.Legend at 0x7fe1aeb90b70>