#!/usr/bin/env python
# coding: utf-8
#
# Table of Contents
#
# ## Imports
# In[1]:
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
from IPython.display import display, Image
# import standard libs
from pathlib import Path, PurePath
import sys
import time
import os
import itertools
def get_relative_project_dir(project_repo_name=None, partial=True):
    """Walk up from the current working directory to find the project repo dir.

    Parameters
    ----------
    project_repo_name : str
        Name of the repository directory to locate.
    partial : bool, default True
        If True, match any ancestor whose directory name *contains*
        ``project_repo_name``; if False, require an exact name match.

    Returns
    -------
    pathlib.Path
        The matching ancestor directory (may be the CWD itself).

    Raises
    ------
    ValueError
        If ``project_repo_name`` is empty/None (the original crashed with a
        TypeError under ``partial=True`` in that case).
    FileNotFoundError
        If the filesystem root is reached without a match (the original
        looped forever here, since root.parent == root).
    """
    if not project_repo_name:
        raise ValueError("project_repo_name must be a non-empty string")

    current = Path.cwd()

    def _matches(path):
        # Compare against the directory's own (last) path component only.
        name = path.parts[-1] if path.parts else ""
        return project_repo_name in name if partial else name == project_repo_name

    while not _matches(current):
        if current.parent == current:
            # Hit the filesystem root without finding the repo: fail loudly
            # instead of spinning forever.
            raise FileNotFoundError(
                f"could not locate project dir {project_repo_name!r} above {Path.cwd()}"
            )
        current = current.parent
    return current
# get project dir
# Resolve the repo root so data/ and scripts/ paths work regardless of where
# the notebook is launched from.
pdir = get_relative_project_dir('mixture_model_trading_public', partial=False)
data_dir = pdir / 'data'
script_dir = pdir / 'scripts'
# Make the repo's scripts/ directory importable — presumably the source of
# the `utils.cprint` import further below; verify against repo layout.
sys.path.append(script_dir.as_posix())
# import python scientific stack
import pandas as pd
# Widen console output so wide DataFrames don't wrap.
pd.set_option('display.width', 1000)
#import pandas_datareader as pdr
import numpy as np
import sklearn.mixture as mix
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as stats
from arch.unitroot import KPSS
from numba import jit
import math
# import visual tools
from mpl_toolkits import mplot3d
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
get_ipython().run_line_magic('matplotlib', 'inline')
import plotnine as pn
import mizani.breaks as mzb
import mizani.formatters as mzf
import seaborn as sns
# import util libs
from tqdm import tqdm
import warnings
# NOTE(review): suppresses ALL warnings notebook-wide, including
# deprecation/convergence warnings from sklearn and statsmodels.
warnings.filterwarnings("ignore")
from utils import cprint
from dotenv import load_dotenv
load_dotenv() # to use hidden api key
print()
get_ipython().run_line_magic('watermark', '-p pandas,pandas_datareader,numpy,sklearn,statsmodels,scipy,matplotlib,seaborn,plotnine')
# In[2]:
get_ipython().run_line_magic('load_ext', 'nb_black')
# Seaborn/matplotlib rc overrides applied to every figure in this notebook.
sns_params = {
    "axes.grid": True,
    "ytick.left": True,
    "xtick.bottom": True,
    "xtick.major.size": 2,
    "ytick.major.size": 2,
    "font.size": 11,
    "font.weight": "medium",
    "figure.figsize": (10, 7),
    "font.family": "DejaVu Sans Mono",
}
sns.set(context="talk", style="ticks", rc=sns_params)
# Shared kwargs for fig.savefig calls.
# NOTE(review): `frameon` was removed from savefig in newer matplotlib —
# confirm against the pinned matplotlib version.
savefig_kwds = dict(dpi=90, bbox_inches="tight", frameon=True, format="png")
# Custom 7-color categorical palette.
flatui = ["#3498db", "#9b59b6", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", "#f4cae4"]
sns.set_palette(sns.color_palette(flatui, 7))
# ## Get Data
# In[3]:
# Load ETF return data (parquet) ending 2017-12-31 and append a `year`
# column derived from the DatetimeIndex for later grouping.
infp = Path(data_dir / "tiingo_etf_returns_ending_2017-12-31.parq")
R = pd.read_parquet(infp).assign(year=lambda df: df.index.year)
cprint(R)
# ## Analysis Functions
# In[4]:
def to_returns(s):
    """Transform a price series into continuously-compounded (log) returns.

    The first observation has no predecessor, so it is dropped.
    """
    price_ratio = s / s.shift(1)
    return np.log(price_ratio).dropna()
def to_price_index_exp(df, start=100):
    """Rebuild a price index from log returns: exponentiate the running sum."""
    cumulative = df.cumsum()
    return np.exp(cumulative) * start
def to_price_index_sum(df, start=100):
    """Rebuild an additive price index: running total of returns offset by `start`."""
    return df.cumsum() + start
def cdescribe(x, n_cols=None):
    """Describe `x` with skew and kurtosis appended as extra rows.

    Parameters
    ----------
    x : pd.DataFrame
        Data to summarize, one column per series.
    n_cols : int, optional
        If truthy, summarize a random sample of `n_cols` columns instead of
        all of them. NOTE: the sampling is unseeded, so repeated calls can
        differ.

    Returns
    -------
    pd.DataFrame
        ``describe()`` output plus 'skew' and 'kurtosis' rows.
    """
    # Truthiness check kept from the original: n_cols=0/None both mean
    # "use all columns". The original duplicated the describe/skew/kurtosis
    # logic in both branches; selecting the subset once removes that.
    subset = x if not n_cols else x.sample(n_cols, axis=1)
    d = subset.describe()
    d.loc["skew"] = subset.skew()
    d.loc["kurtosis"] = subset.kurtosis()
    return d
def compare_stats(x, y, n_cols=None):
    """Side-by-side descriptive statistics for real data `x` and simulated `y`.

    `n_cols` is forwarded to `cdescribe` for the simulated data only.
    NOTE: sets the global pandas float display format as a side effect.
    """
    pd.options.display.float_format = "{:,.4f}".format
    return pd.concat([cdescribe(x), cdescribe(y, n_cols=n_cols)], axis=1)
# ## Plot Functions
# In[5]:
def plot_autocorr(s, lags=50, figsize=(12, 8), title=None):
    """Draw a 2x2 grid of autocorrelation diagnostics for a return series.

    Panels: ACF of returns, ACF of absolute returns, PACF of returns, and
    ACF of squared returns (a standard volatility-clustering check).
    """
    fig = plt.figure(figsize=figsize)
    grid_shape = (2, 2)
    axes = {
        pos: plt.subplot2grid(grid_shape, pos)
        for pos in [(0, 0), (0, 1), (1, 0), (1, 1)]
    }
    # ACF panels as (data, grid position, panel title) rows.
    acf_specs = [
        (s, (0, 0), "Autocorrelation of Returns"),
        (s.abs(), (0, 1), "Autocorrelation of Absolute Returns"),
        (s ** 2, (1, 1), "Autocorrelation of Squared Returns"),
    ]
    for data, pos, panel_title in acf_specs:
        sm.graphics.tsa.plot_acf(
            data, fft=True, zero=False, lags=lags, ax=axes[pos], title=panel_title
        )
    sm.graphics.tsa.plot_pacf(
        s,
        zero=False,
        lags=lags,
        ax=axes[(1, 0)],
        title="Partial Autocorrelation of Returns",
    )
    if title:
        fig.suptitle(title, fontweight="demi", fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return
def plot_mean_dist(returns, sim_returns, ax):
    """Histogram of simulated per-path mean returns; red line marks the real mean."""
    mean_return = returns.mean()
    ax.set_title(f"{returns.name} mean return: {mean_return:.4f}")
    sns.distplot(sim_returns.mean().squeeze(), kde=False, ax=ax).axvline(
        mean_return, color="r"
    )
    return
def plot_std_dist(returns, sim_returns, ax):
    """Histogram of simulated per-path return std devs; red line marks the real std."""
    std = returns.std()
    ax.set_title(f"{returns.name} return std: {std:.4f}")
    sns.distplot(sim_returns.std().squeeze(), kde=False, ax=ax).axvline(std, color="r")
    return
def plot_min_dist(returns, sim_returns, ax):
    """Histogram of simulated per-path minimum returns; red line marks the real min."""
    min_ = returns.min()
    ax.set_title(f"{returns.name} return min: {min_:.4f}")
    sns.distplot(sim_returns.min().squeeze(), kde=False, ax=ax).axvline(min_, color="r")
    return
def plot_max_dist(returns, sim_returns, ax):
    """Histogram of simulated per-path maximum returns; red line marks the real max."""
    max_ = returns.max()
    ax.set_title(f"{returns.name} return max: {max_:.4f}")
    sns.distplot(sim_returns.max().squeeze(), kde=False, ax=ax).axvline(max_, color="r")
    return
def plot_stat_dist(returns, sim_returns, figsize=(12, 8)):
    """Draw a 2x2 grid comparing real vs. simulated mean/std/min/max distributions.

    Parameters
    ----------
    returns : pd.Series
        The real return series; it is displayed under the label 'real_etf'.
    sim_returns : pd.DataFrame
        Simulated return paths, one per column.
    figsize : tuple, default (12, 8)

    Bug fix: the original assigned ``returns.name = "real_etf"`` in place,
    silently renaming the caller's Series. ``rename`` gives a local copy
    with the display name, leaving the caller's object untouched.
    """
    fig, axes = plt.subplots(2, 2, figsize=figsize)
    returns = returns.rename("real_etf")  # local rebinding; caller unaffected
    plot_mean_dist(returns, sim_returns, axes[0, 0])
    plot_std_dist(returns, sim_returns, axes[0, 1])
    plot_min_dist(returns, sim_returns, axes[1, 0])
    plot_max_dist(returns, sim_returns, axes[1, 1])
    fig.suptitle(
        f"{returns.name} simulated stat distributions", fontweight="demi", fontsize=16
    )
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return
def plot_autocorr_dist(returns, sim_returns):
    """Histogram of lag-1 autocorrelation across simulated paths; red line = real value."""
    autocorr = returns.autocorr()
    plt.title(f"{returns.name} return autocorr: {autocorr:.4f}")
    per_path = sim_returns.apply(pd.Series.autocorr).squeeze()
    sns.distplot(per_path, kde=False).axvline(autocorr, color="r")
    return
# ## Scale and Transform Data
# In[31]:
from sklearn.preprocessing import PowerTransformer, MaxAbsScaler, RobustScaler
# Drop the trailing `year` helper column (added at load time) and express
# returns in percent.
R_pct = R.iloc[:, :-1].mul(100)
# Robust scaling using the central 95% of the data; a PowerTransformer was
# presumably tried as an alternative (see commented call).
pt = RobustScaler(
    quantile_range=(2.5, 97.5)
)  # PowerTransformer(method="yeo-johnson", standardize=True)
data = pt.fit_transform(R_pct)
data.shape
# ## Begin Analysis
# ### Find AIC, BIC Minimum
# In[32]:
## careful this can take >6 mins
# Grid-search the number of mixture components; fixed random_state makes the
# fits reproducible.
n_components = np.arange(250, 2750, 250)
models = [
    mix.GaussianMixture(n, covariance_type="full", random_state=0) for n in n_components
]
# NOTE(review): each model is refit twice below (once for AIC, once for BIC),
# doubling the runtime; a single fit per model would suffice.
aics = pd.Series(
    [model.fit(data).aic(data) for model in models], index=n_components, name="aic"
)
bics = pd.Series(
    [model.fit(data).bic(data) for model in models], index=n_components, name="bic"
)
# In[33]:
# Plot both criteria with vertical lines at their respective minima.
fig, ax = plt.subplots()
_ = aics.plot(ax=ax)
ax.axvline(aics.idxmin(), color="red", ls="--", lw=7)
_ = bics.plot(ax=ax)
ax.axvline(bics.idxmin(), color="purple", ls="--", lw=7)
# ### Fit Mixture Model
# In[34]:
# Refit at the AIC-optimal component count with multiple initializations.
gmm = mix.GaussianMixture(aics.idxmin(), n_init=3, covariance_type="full")
gmm.fit(data)
print(gmm.converged_)
# ## Evaluate Single Path Sample
# In[35]:
def sample_path(gmm_model, transformer, n_samples, seed=0):
    """Draw `n_samples` observations from a fitted mixture model and map them
    back to the original return scale via the transformer's inverse transform.

    Returns a DataFrame with default integer column labels.
    """
    # Seed numpy's global RNG for reproducibility (assumes the model's
    # sampler draws from the global RNG — confirm for the sklearn version in use).
    np.random.seed(seed)
    samples, _ = gmm_model.sample(n_samples)
    restored = transformer.inverse_transform(samples)
    return pd.DataFrame(restored)
# In[36]:
# Draw one synthetic path of the same length as the real data.
N_SAMPLES = len(data)
path_df = sample_path(gmm, pt, N_SAMPLES)
# ### Real data plot
# In[37]:
_ = R_pct.cumsum().plot(figsize=(12, 10))
# ### Synthetic data plot
# In[38]:
_ = path_df.cumsum().plot(figsize=(12, 10))
# ### Compare histogram density
# In[39]:
# Overlay the cross-sectional mean-return densities for real vs. simulated.
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(path_df.mean(axis=1), ax=ax, label="sim")
ax.legend()
# ### Compare descriptive statistics
# In[40]:
compare_stats(R_pct, path_df).T
# ### Compare Correlations
# #### Real
# In[41]:
sns.clustermap(R_pct.corr())
# #### Synthetic
# In[42]:
sns.clustermap(path_df.corr())
# ### Plot example stat distributions
# In[43]:
plot_stat_dist(R_pct.mean(axis=1), path_df)
# ### Plot autocorrelations
# #### Real
# In[44]:
plot_autocorr(R_pct.mean(axis=1), title="Real Mean Returns Autocorrelations")
# #### Synthetic
# In[45]:
plot_autocorr(path_df.mean(axis=1), title="Simulated Mean Return Autocorrelations")
# ## Evaluate Many Paths
# In[46]:
# Sample many paths (int(5000 / 6) == 833 of them), one seed per path, and
# concatenate them column-wise into a single wide DataFrame.
path_dfs = list()
for i in tqdm(range(int(5000 / 6))):
    path_df = sample_path(gmm, pt, N_SAMPLES, seed=i)
    path_dfs.append(path_df)
paths = pd.concat(path_dfs, axis=1)
# Relabel columns 0..k-1 since every sampled path uses the same column labels.
paths.columns = np.arange(paths.shape[1])
cprint(paths)
# ### Synthetic data plot
# In[47]:
paths.sample(10, axis=1, random_state=0).cumsum().plot(figsize=(12, 10))
# ### Plot Histogram Comparisons
# #### Random selection of 6 Paths
# In[48]:
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(paths.sample(6, axis=1, random_state=0).mean(axis=1), ax=ax, label="synth")
ax.legend()
# #### All Paths
# In[49]:
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(paths.mean(axis=1), ax=ax, label="synth")
ax.legend()
# ### Compare descriptive statistics
# In[50]:
# change the number of columns to see more or use a subset of paths using paths.sample
compare_stats(R_pct, paths.iloc[:, :12], n_cols=10).T
# ### Compare Correlations
# #### Real
# In[51]:
sns.clustermap(R_pct.corr())
# #### Synthetic first 12
# In[52]:
sns.clustermap(paths.iloc[:, :12].corr())
# ### Plot example stat distributions
# In[53]:
plot_stat_dist(R_pct.mean(axis=1), paths)
# ### Plot Autocorrelations
# #### Real
# In[54]:
plot_autocorr(R_pct.mean(axis=1), title="Real Mean Returns Autocorrelations")
# #### Synthetic
# In[55]:
# NOTE(review): this plots `path_df`, which the loop above left bound to the
# LAST sampled path — in a section about many paths, `paths.mean(axis=1)` was
# probably intended; confirm before changing.
plot_autocorr(path_df.mean(axis=1), title="Simulated Mean Return Autocorrelations")
# ## Conclusions
# This is a solid way to generate correlated simulated time series that mimics the real data in many ways. The simulated/synthetic descriptive statistics are very similar to the original data. The histogram of the mean returns are nearly identical, and a plot of the cumulative returns are nearly indistinguishable from real return data.
#
# However there are some limitations. The simulated autocorrelations are too significant and positive in the shorter lags, relative to the real data. Using a random sample of the simulated data can vastly alter the descriptive statistics, the correlations, and the histogram. This seems to be a result of the GMM being fit on a correlated group of data, so that the random samples may overuse a particular version of a return path, which when aggregated together could show extreme kurtosis or a shifted skew.
#
# Another limitation is the computational burden this method requires. If the dataset gets larger than 20K rows the time and memory requirements become prohibitive and may jam your laptop/desktop computer.