%load_ext watermark
%watermark
%load_ext autoreload
%autoreload 2
from IPython.display import display, Image
# import standard libs
from pathlib import Path, PurePath
import sys
import time
import os
import itertools
def get_relative_project_dir(project_repo_name=None, partial=True):
    """Walk up from the current working directory to find the project root.

    Parameters
    ----------
    project_repo_name : str
        Name of the project repository directory to locate.
    partial : bool, default True
        If True, match any ancestor whose directory name *contains*
        ``project_repo_name``; if False, require an exact name match.

    Returns
    -------
    pathlib.Path
        The matching ancestor directory.

    Raises
    ------
    ValueError
        If ``project_repo_name`` is empty/None, or no ancestor matches
        before the filesystem root is reached. (The original looped
        forever here, because ``root.parent == root``.)
    """
    if not project_repo_name:
        raise ValueError("project_repo_name must be a non-empty string")

    def _matches(path):
        # partial -> substring match on the dir name; else exact match
        name = path.parts[-1]
        return (project_repo_name in name) if partial else (name == project_repo_name)

    current_working_directory = Path.cwd()
    while not _matches(current_working_directory):
        parent = current_working_directory.parent
        if parent == current_working_directory:  # hit the filesystem root
            raise ValueError(
                f"could not find a directory matching {project_repo_name!r} "
                "above the current working directory"
            )
        current_working_directory = parent
    return current_working_directory
# get project dir
# Resolve the repo root by walking up from the CWD, then register the repo's
# scripts directory on sys.path so local modules (e.g. `utils`) can import.
pdir = get_relative_project_dir('mixture_model_trading_public', partial=False)
data_dir = pdir / 'data'
script_dir = pdir / 'scripts'
sys.path.append(script_dir.as_posix())
# import python scientific stack
import pandas as pd
pd.set_option('display.width', 1000)
#import pandas_datareader as pdr
import numpy as np
import sklearn.mixture as mix
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as stats
from arch.unitroot import KPSS
from numba import jit
import math
# import visual tools
from mpl_toolkits import mplot3d
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import plotnine as pn
import mizani.breaks as mzb
import mizani.formatters as mzf
import seaborn as sns
# import util libs
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from utils import cprint
from dotenv import load_dotenv
load_dotenv() # to use hidden api key
print()
%watermark -p pandas,pandas_datareader,numpy,sklearn,statsmodels,scipy,matplotlib,seaborn,plotnine
2019-08-13T18:08:14-06:00 CPython 3.7.3 IPython 7.6.1 compiler : GCC 7.3.0 system : Linux release : 4.19.11-041911-generic machine : x86_64 processor : x86_64 CPU cores : 12 interpreter: 64bit pandas 0.24.2 pandas_datareader 0.7.0 numpy 1.16.2 sklearn 0.20.3 statsmodels 0.9.0 scipy 1.2.1 matplotlib 3.0.2 seaborn 0.9.0 plotnine 0.5.1
%load_ext nb_black
# seaborn/matplotlib global styling for every figure in this notebook
sns_params = {
    "axes.grid": True,
    "ytick.left": True,
    "xtick.bottom": True,
    "xtick.major.size": 2,
    "ytick.major.size": 2,
    "font.size": 11,
    "font.weight": "medium",
    "figure.figsize": (10, 7),
    "font.family": "DejaVu Sans Mono",
}
sns.set(context="talk", style="ticks", rc=sns_params)
# common kwargs for fig.savefig calls
savefig_kwds = dict(dpi=90, bbox_inches="tight", frameon=True, format="png")
# custom categorical color palette (7 colors)
flatui = ["#3498db", "#9b59b6", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", "#f4cae4"]
sns.set_palette(sns.color_palette(flatui, 7))
# load daily ETF returns from parquet and tag each row with its calendar year
infp = Path(data_dir / "tiingo_etf_returns_ending_2017-12-31.parq")
R = pd.read_parquet(infp).assign(year=lambda df: df.index.year)
cprint(R)
------------------------------------------------------------------------------- dataframe information ------------------------------------------------------------------------------- SPY QQQ TLT GLD EFA EEM year date 2017-12-22 -0.000262 -0.001143 0.001363 0.005223 0.002571 0.008422 2017 2017-12-26 -0.001197 -0.005416 0.002961 0.006839 -0.000285 -0.000861 2017 2017-12-27 0.000486 0.000128 0.012941 0.003770 0.001853 0.002580 2017 2017-12-28 0.002055 0.001213 -0.000868 0.005060 0.000569 0.006846 2017 2017-12-29 -0.003778 -0.006208 0.001578 0.006491 0.000711 0.004680 2017 -------------------------------------------------- <class 'pandas.core.frame.DataFrame'> DatetimeIndex: 3301 entries, 2004-11-19 to 2017-12-29 Data columns (total 7 columns): SPY 3301 non-null float64 QQQ 3301 non-null float64 TLT 3301 non-null float64 GLD 3301 non-null float64 EFA 3301 non-null float64 EEM 3301 non-null float64 year 3301 non-null int64 dtypes: float64(6), int64(1) memory usage: 206.3 KB None -------------------------------------------------------------------------------
def to_returns(s):
    """Convert a price series to log returns, dropping the leading NaN."""
    lagged = s.shift(1)
    return np.log(s.div(lagged)).dropna()
def to_price_index_exp(df, start=100):
    """Rebuild a price index from log returns: start * exp(cumulative sum)."""
    cumulative = df.cumsum()
    return np.exp(cumulative) * start
def to_price_index_sum(df, start=100):
    """Rebuild an additive price index: start plus the cumulative return sum."""
    return df.cumsum().add(start)
def cdescribe(x, n_cols=None):
    """Summarize a DataFrame with describe() plus skew and kurtosis rows.

    Parameters
    ----------
    x : pd.DataFrame
        Data to summarize.
    n_cols : int, optional
        If given, summarize a random sample of ``n_cols`` columns instead
        of all columns. NOTE: no fixed random_state is used, so the
        sampled columns vary between calls.

    Returns
    -------
    pd.DataFrame
        ``describe()`` output with extra 'skew' and 'kurtosis' rows.
    """
    # Single code path (the original duplicated these four lines in both
    # branches): optionally subsample columns, then summarize once.
    x_ = x if not n_cols else x.sample(n_cols, axis=1)
    d = x_.describe()
    d.loc["skew"] = x_.skew()
    d.loc["kurtosis"] = x_.kurtosis()
    return d
def compare_stats(x, y, n_cols=None):
    """Return side-by-side summary stats for real (x) and simulated (y) data.

    NOTE: sets the global pandas float display format as a side effect.
    """
    pd.options.display.float_format = "{:,.4f}".format
    real_stats = cdescribe(x)
    sim_stats = cdescribe(y, n_cols=n_cols)
    return pd.concat([real_stats, sim_stats], axis=1)
def plot_autocorr(s, lags=50, figsize=(12, 8), title=None):
    """Plot a 2x2 grid of autocorrelation diagnostics for a return series:
    ACF of raw returns, ACF of absolute returns, PACF of raw returns, and
    ACF of squared returns."""
    fig = plt.figure(figsize=figsize)
    grid = 2, 2
    ax_acf = plt.subplot2grid(grid, (0, 0))
    ax_abs = plt.subplot2grid(grid, (0, 1))
    ax_pacf = plt.subplot2grid(grid, (1, 0))
    ax_sq = plt.subplot2grid(grid, (1, 1))

    # shared kwargs for the three ACF panels
    acf_kwargs = dict(fft=True, zero=False, lags=lags)
    sm.graphics.tsa.plot_acf(
        s, ax=ax_acf, title="Autocorrelation of Returns", **acf_kwargs
    )
    sm.graphics.tsa.plot_acf(
        s.abs(), ax=ax_abs, title="Autocorrelation of Absolute Returns", **acf_kwargs
    )
    sm.graphics.tsa.plot_pacf(
        s,
        zero=False,
        lags=lags,
        ax=ax_pacf,
        title="Partial Autocorrelation of Returns",
    )
    sm.graphics.tsa.plot_acf(
        s ** 2, ax=ax_sq, title="Autocorrelation of Squared Returns", **acf_kwargs
    )

    if title:
        fig.suptitle(title, fontweight="demi", fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return
def plot_mean_dist(returns, sim_returns, ax):
    """Histogram of per-path simulated mean returns, with the real series'
    mean marked by a red vertical line."""
    actual = returns.mean()
    ax.set_title(f"{returns.name} mean return: {actual:.4f}")
    simulated = sim_returns.mean().squeeze()
    plot = sns.distplot(simulated, kde=False, ax=ax)
    plot.axvline(actual, color="r")
    return
def plot_std_dist(returns, sim_returns, ax):
    """Histogram of per-path simulated return standard deviations, with the
    real series' std marked by a red vertical line."""
    actual = returns.std()
    ax.set_title(f"{returns.name} return std: {actual:.4f}")
    simulated = sim_returns.std().squeeze()
    plot = sns.distplot(simulated, kde=False, ax=ax)
    plot.axvline(actual, color="r")
    return
def plot_min_dist(returns, sim_returns, ax):
    """Histogram of per-path simulated minimum returns, with the real
    series' minimum marked by a red vertical line."""
    actual = returns.min()
    ax.set_title(f"{returns.name} return min: {actual:.4f}")
    simulated = sim_returns.min().squeeze()
    plot = sns.distplot(simulated, kde=False, ax=ax)
    plot.axvline(actual, color="r")
    return
def plot_max_dist(returns, sim_returns, ax):
    """Histogram of per-path simulated maximum returns, with the real
    series' maximum marked by a red vertical line."""
    actual = returns.max()
    ax.set_title(f"{returns.name} return max: {actual:.4f}")
    simulated = sim_returns.max().squeeze()
    plot = sns.distplot(simulated, kde=False, ax=ax)
    plot.axvline(actual, color="r")
    return
def plot_stat_dist(returns, sim_returns, figsize=(12, 8)):
    """Plot a 2x2 grid comparing real vs simulated mean/std/min/max
    distributions.

    Parameters
    ----------
    returns : pd.Series
        Real return series.
    sim_returns : pd.DataFrame
        Simulated return paths, one column per path.
    figsize : tuple, default (12, 8)
        Figure size.
    """
    # BUGFIX: the original mutated the caller's Series in place by assigning
    # `returns.name` unconditionally. Rename a copy instead, and only default
    # the label when the series is unnamed, so a caller-provided name wins.
    if returns.name is None:
        returns = returns.rename("real_etf")
    fig, axes = plt.subplots(2, 2, figsize=figsize)
    plot_mean_dist(returns, sim_returns, axes[0, 0])
    plot_std_dist(returns, sim_returns, axes[0, 1])
    plot_min_dist(returns, sim_returns, axes[1, 0])
    plot_max_dist(returns, sim_returns, axes[1, 1])
    fig.suptitle(
        f"{returns.name} simulated stat distributions", fontweight="demi", fontsize=16
    )
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return
def plot_autocorr_dist(returns, sim_returns):
    """Histogram of lag-1 autocorrelations across simulated paths, with the
    real series' autocorrelation marked by a red vertical line."""
    actual = returns.autocorr()
    plt.title(f"{returns.name} return autocorr: {actual:.4f}")
    simulated = sim_returns.apply(pd.Series.autocorr).squeeze()
    plot = sns.distplot(simulated, kde=False)
    plot.axvline(actual, color="r")
    return
from sklearn.preprocessing import PowerTransformer, MaxAbsScaler, RobustScaler
# convert log returns to percent and drop the trailing 'year' column
R_pct = R.iloc[:, :-1].mul(100)
# robust scaling limits the influence of fat tails before fitting the GMM;
# quantile_range=(2.5, 97.5) sets the scale from the central 95% of the data
pt = RobustScaler(
    quantile_range=(2.5, 97.5)
)  # PowerTransformer(method="yeo-johnson", standardize=True)
data = pt.fit_transform(R_pct)
data.shape
(3301, 6)
## careful this can take >6 mins
# model selection: fit GMMs with 250..2500 components and compare AIC/BIC
n_components = np.arange(250, 2750, 250)
models = [
    mix.GaussianMixture(n, covariance_type="full", random_state=0) for n in n_components
]
# NOTE(review): each model is fit twice (once for aic, once for bic) —
# wasteful, though deterministic given random_state=0
aics = pd.Series(
    [model.fit(data).aic(data) for model in models], index=n_components, name="aic"
)
bics = pd.Series(
    [model.fit(data).bic(data) for model in models], index=n_components, name="bic"
)
# plot both criteria; dashed vertical lines mark each criterion's minimum
fig, ax = plt.subplots()
_ = aics.plot(ax=ax)
ax.axvline(aics.idxmin(), color="red", ls="--", lw=7)
_ = bics.plot(ax=ax)
ax.axvline(bics.idxmin(), color="purple", ls="--", lw=7)
<matplotlib.lines.Line2D at 0x7fe1ae906e80>
# refit using the AIC-optimal component count, with 3 initializations;
# print whether EM converged
gmm = mix.GaussianMixture(aics.idxmin(), n_init=3, covariance_type="full")
gmm.fit(data)
print(gmm.converged_)
True
def sample_path(gmm_model, transformer, n_samples, seed=0):
    """Draw n_samples points from a fitted GMM and invert the scaling
    transform to map them back into return space."""
    np.random.seed(seed)  # sampling draws from numpy's global RNG
    sampled, _ = gmm_model.sample(n_samples)
    returns = transformer.inverse_transform(sampled)
    return pd.DataFrame(returns)
# simulate one synthetic path with the same length as the real sample
N_SAMPLES = len(data)
path_df = sample_path(gmm, pt, N_SAMPLES)
# compare cumulative returns: real vs simulated
_ = R_pct.cumsum().plot(figsize=(12, 10))
_ = path_df.cumsum().plot(figsize=(12, 10))
# compare the distributions of cross-sectional mean returns
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(path_df.mean(axis=1), ax=ax, label="sim")
ax.legend()
<matplotlib.legend.Legend at 0x7fe1ae89e3c8>
# tabulate real vs simulated descriptive stats side by side
compare_stats(R_pct, path_df).T
count | mean | std | min | 25% | 50% | 75% | max | skew | kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|
SPY | 3,301.0000 | 0.0326 | 1.1776 | -10.1640 | -0.3754 | 0.0660 | 0.5336 | 13.0685 | -0.1341 | 15.2330 |
QQQ | 3,301.0000 | 0.0452 | 1.2572 | -9.3824 | -0.4830 | 0.0911 | 0.6567 | 11.4799 | -0.1257 | 7.8209 |
TLT | 3,301.0000 | 0.0249 | 0.8761 | -5.1767 | -0.4901 | 0.0553 | 0.5365 | 5.0371 | -0.0190 | 1.8800 |
GLD | 3,301.0000 | 0.0310 | 1.1991 | -9.1905 | -0.5424 | 0.0563 | 0.6686 | 10.6974 | -0.3076 | 5.9959 |
EFA | 3,301.0000 | 0.0209 | 1.4326 | -11.8369 | -0.5321 | 0.0593 | 0.6787 | 14.7451 | -0.1557 | 12.5111 |
EEM | 3,301.0000 | 0.0315 | 1.9389 | -17.6334 | -0.8442 | 0.0965 | 0.9404 | 20.5141 | 0.1936 | 14.7749 |
0 | 3,301.0000 | 0.0632 | 1.1487 | -7.0912 | -0.3447 | 0.0880 | 0.5611 | 13.0614 | 0.0988 | 11.4614 |
1 | 3,301.0000 | 0.0696 | 1.2312 | -6.2080 | -0.4673 | 0.0961 | 0.6987 | 11.4774 | 0.1418 | 5.0027 |
2 | 3,301.0000 | -0.0293 | 0.8759 | -5.1814 | -0.5180 | 0.0276 | 0.5067 | 3.6889 | -0.3858 | 1.9443 |
3 | 3,301.0000 | 0.0225 | 1.1860 | -7.7289 | -0.5354 | 0.0557 | 0.6449 | 7.1052 | -0.5995 | 4.9229 |
4 | 3,301.0000 | 0.0644 | 1.3921 | -7.7590 | -0.5188 | 0.0915 | 0.7217 | 14.7437 | 0.3185 | 9.1792 |
5 | 3,301.0000 | 0.1021 | 1.9080 | -10.1001 | -0.7857 | 0.1329 | 0.9893 | 20.5172 | 0.7456 | 10.9967 |
# correlation structure: real returns
sns.clustermap(R_pct.corr())
<seaborn.matrix.ClusterGrid at 0x7fe1ae7304a8>
# correlation structure: simulated returns (should mirror the real one)
sns.clustermap(path_df.corr())
<seaborn.matrix.ClusterGrid at 0x7fe1ae5977f0>
# distributions of summary stats across simulated columns vs the real series
plot_stat_dist(R_pct.mean(axis=1), path_df)
plot_autocorr(R_pct.mean(axis=1), title="Real Mean Returns Autocorrelations")
plot_autocorr(path_df.mean(axis=1), title="Simulated Mean Return Autocorrelations")
# generate ~5000 simulated columns: 833 draws of 6-asset paths, one seed per
# draw so the whole run is reproducible
path_dfs = list()
for i in tqdm(range(int(5000 / 6))):
    path_df = sample_path(gmm, pt, N_SAMPLES, seed=i)
    path_dfs.append(path_df)
paths = pd.concat(path_dfs, axis=1)
# relabel columns 0..N-1 (concat left duplicate 0..5 labels per draw)
paths.columns = np.arange(paths.shape[1])
cprint(paths)
100%|██████████| 833/833 [02:32<00:00, 5.54it/s]
------------------------------------------------------------------------------- dataframe information ------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 ... 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 3296 0.5962 -0.4723 -0.8383 1.0700 0.8320 2.7895 -0.9240 -1.0146 -0.2378 -1.6050 ... 0.5360 -1.9595 -1.5479 -1.6083 -0.7385 -0.9492 0.7755 1.0338 -0.0969 -0.5895 3297 0.5958 -0.4853 -0.8314 1.0739 0.8311 2.7969 0.5907 -0.4814 -0.8281 1.0734 ... -0.1942 -0.3932 -1.1604 -0.2333 -0.6888 -0.8453 0.8455 1.1289 -0.2279 -0.6704 3298 0.5957 -0.4846 -0.8351 1.0697 0.8198 2.7986 -0.9117 -0.7246 0.9425 0.6990 ... -0.2949 -0.0912 -1.0195 -0.2119 -0.6356 -0.8945 0.8239 1.2172 -0.1857 -0.8031 3299 -0.7101 -1.0065 0.6090 1.2230 -0.3330 -0.9851 -0.6784 -0.7974 0.8515 1.1884 ... -0.2356 -1.6057 -0.9591 -3.1173 -0.6599 -0.8734 0.7320 1.2673 -0.3875 -0.9484 3300 -0.6280 -0.9970 0.6703 1.2900 -0.2749 -1.0431 -0.8262 -0.7244 0.7384 1.0918 ... -0.2291 -1.6137 -0.9532 -3.1094 -0.9077 -0.7789 0.9012 0.7492 -0.1063 -0.0608 [5 rows x 4998 columns] -------------------------------------------------- <class 'pandas.core.frame.DataFrame'> RangeIndex: 3301 entries, 0 to 3300 Columns: 4998 entries, 0 to 4997 dtypes: float64(4998) memory usage: 125.9 MB None -------------------------------------------------------------------------------
# plot a random sample of 10 simulated cumulative return paths
paths.sample(10, axis=1, random_state=0).cumsum().plot(figsize=(12, 10))
<matplotlib.axes._subplots.AxesSubplot at 0x7fe1ae197780>
# real mean returns vs a 6-column random sample of simulated paths
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(paths.sample(6, axis=1, random_state=0).mean(axis=1), ax=ax, label="synth")
ax.legend()
<matplotlib.legend.Legend at 0x7fe1aeb90b70>
# real mean returns vs the mean across ALL simulated paths
fig, ax = plt.subplots()
sns.distplot(R_pct.mean(axis=1), ax=ax, label="real")
sns.distplot(paths.mean(axis=1), ax=ax, label="synth")
ax.legend()
<matplotlib.legend.Legend at 0x7fe1aebef2b0>
# change the number of columns to see more or use a subset of paths using paths.sample
compare_stats(R_pct, paths.iloc[:, :12], n_cols=10).T
count | mean | std | min | 25% | 50% | 75% | max | skew | kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|
SPY | 3,301.0000 | 0.0326 | 1.1776 | -10.1640 | -0.3754 | 0.0660 | 0.5336 | 13.0685 | -0.1341 | 15.2330 |
QQQ | 3,301.0000 | 0.0452 | 1.2572 | -9.3824 | -0.4830 | 0.0911 | 0.6567 | 11.4799 | -0.1257 | 7.8209 |
TLT | 3,301.0000 | 0.0249 | 0.8761 | -5.1767 | -0.4901 | 0.0553 | 0.5365 | 5.0371 | -0.0190 | 1.8800 |
GLD | 3,301.0000 | 0.0310 | 1.1991 | -9.1905 | -0.5424 | 0.0563 | 0.6686 | 10.6974 | -0.3076 | 5.9959 |
EFA | 3,301.0000 | 0.0209 | 1.4326 | -11.8369 | -0.5321 | 0.0593 | 0.6787 | 14.7451 | -0.1557 | 12.5111 |
EEM | 3,301.0000 | 0.0315 | 1.9389 | -17.6334 | -0.8442 | 0.0965 | 0.9404 | 20.5141 | 0.1936 | 14.7749 |
0 | 3,301.0000 | 0.0632 | 1.1487 | -7.0912 | -0.3447 | 0.0880 | 0.5611 | 13.0614 | 0.0988 | 11.4614 |
4 | 3,301.0000 | 0.0644 | 1.3921 | -7.7590 | -0.5188 | 0.0915 | 0.7217 | 14.7437 | 0.3185 | 9.1792 |
6 | 3,301.0000 | 0.0270 | 1.1657 | -10.1632 | -0.3976 | 0.0576 | 0.5243 | 11.3837 | -0.5042 | 11.9878 |
3 | 3,301.0000 | 0.0225 | 1.1860 | -7.7289 | -0.5354 | 0.0557 | 0.6449 | 7.1052 | -0.5995 | 4.9229 |
7 | 3,301.0000 | 0.0431 | 1.2467 | -9.3837 | -0.4739 | 0.0885 | 0.6326 | 10.4765 | -0.3149 | 6.4675 |
10 | 3,301.0000 | 0.0138 | 1.4448 | -11.8406 | -0.5621 | 0.0576 | 0.6961 | 12.9054 | -0.6325 | 11.2251 |
11 | 3,301.0000 | 0.0233 | 1.9097 | -17.6366 | -0.8243 | 0.0957 | 0.9375 | 19.0091 | -0.5722 | 13.1705 |
8 | 3,301.0000 | 0.0148 | 0.8954 | -5.1726 | -0.5209 | 0.0492 | 0.5734 | 3.4987 | -0.1846 | 1.4280 |
5 | 3,301.0000 | 0.1021 | 1.9080 | -10.1001 | -0.7857 | 0.1329 | 0.9893 | 20.5172 | 0.7456 | 10.9967 |
1 | 3,301.0000 | 0.0696 | 1.2312 | -6.2080 | -0.4673 | 0.0961 | 0.6987 | 11.4774 | 0.1418 | 5.0027 |
# correlation structure again: real vs the first 12 simulated columns
sns.clustermap(R_pct.corr())
<seaborn.matrix.ClusterGrid at 0x7fe1d405b7b8>
sns.clustermap(paths.iloc[:, :12].corr())
<seaborn.matrix.ClusterGrid at 0x7fe1d4669b70>
# stat distributions across the full simulated set
plot_stat_dist(R_pct.mean(axis=1), paths)
plot_autocorr(R_pct.mean(axis=1), title="Real Mean Returns Autocorrelations")
# NOTE(review): this plots path_df (the last single draw), not `paths` —
# presumably because plot_autocorr expects a single series; confirm intent
plot_autocorr(path_df.mean(axis=1), title="Simulated Mean Return Autocorrelations")
This is a solid way to generate correlated simulated time series that mimics the real data in many ways. The simulated/synthetic descriptive statistics are very similar to those of the original data. The histograms of the mean returns are nearly identical, and plots of the cumulative returns are nearly indistinguishable from real return data.
However, there are some limitations. The simulated autocorrelations are too significant and positive in the shorter lags, relative to the real data. Using a random sample of the simulated data can vastly alter the descriptive statistics, the correlations, and the histogram. This seems to be a result of the GMM being fit on a correlated group of data, so that the random samples may overuse a particular version of a return path, which when aggregated together could show extreme kurtosis or a shifted skew.
Another limitation is the computational burden this method requires. If the dataset gets larger than 20K rows the time and memory requirements become prohibitive and may jam your laptop/desktop computer.