#!/usr/bin/env python
# coding: utf-8
#
Table of Contents
#
# ## Synthetic Data Generation Experiments
#
# In this notebook I experiment with synthetic data generation using block bootstrap methods available from the `arch` package. The basic idea is to compare the aspects of the real series with the bootstrapped versions.
# In[1]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('load_ext', 'watermark')
from pathlib import Path
import time
from pprint import pprint, pformat
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import numba as nb
from arch.bootstrap import CircularBlockBootstrap
# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
sns_params = {
'font.size':9.5,
'font.weight':'medium',
'figure.figsize':(10,7),
}
plt.style.use('seaborn-talk')
plt.style.use('bmh')
#plt.style.use('dark_background')
sns.set_context(sns_params)
savefig_kwds=dict(dpi=300, bbox_inches='tight', frameon=True, format='png')
nanex_colors = ("#f92b20", "#fe701b", "#facd1f", "#d6fd1c", "#65fe1b",
"#1bfe42", "#1cfdb4", "#1fb9fa", "#1e71fb", "#261cfd")
nanex_cmap = mpl.colors.ListedColormap(nanex_colors,name='nanex_cmap')
plt.register_cmap('nanex_cmap', cmap=nanex_cmap)
# import util libs
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from src.tools.pystore_tools import *
from src.tools.utils import *
from src.CONSTANTS import PROJECT_REPO_NAME
pdir = get_relative_project_dir(
project_repo_name=PROJECT_REPO_NAME, partial=False)
data_dir = Path(pdir/'data')
get_ipython().run_line_magic('watermark', '-v -m -g')
print()
get_ipython().run_line_magic('watermark', '--iversions')
# ## Convenience Functions
#
# Below I have aggregated some convenience functions to plot the various statistics such as mean, std, min, max, and autocorrelation structures between the real series and the synthetic. The plots also feature comparisons of the indexed price series among the various realizations.
# In[2]:
def plot_autocorr(s, lags=50, figsize=(10,7), title=None):
fig = plt.figure(figsize=figsize)
layout = 2, 2
acf_ax = plt.subplot2grid(layout, (0, 0))
abs_acf_ax = plt.subplot2grid(layout, (0, 1))
pacf_ax = plt.subplot2grid(layout, (1, 0))
squared_ax = plt.subplot2grid(layout, (1, 1))
sm.graphics.tsa.plot_acf(s, fft=True, zero=False, lags=lags, ax=acf_ax,
title='Autocorrelation of Returns');
sm.graphics.tsa.plot_acf(s.abs(), fft=True, zero=False,
lags=lags, ax=abs_acf_ax,
title='Autocorrelation of Absolute Returns');
sm.graphics.tsa.plot_pacf(s, zero=False, lags=lags, ax=pacf_ax,
title='Partial Autocorrelation of Returns');
sm.graphics.tsa.plot_acf(s**2, fft=True, zero=False,
lags=lags, ax=squared_ax,
title='Autocorrelation of Squared Returns');
if title: fig.suptitle(title, fontweight='demi', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=0.88)
return
def plot_mean_dist(returns, sim_returns, ax):
mean_return = returns.mean()
ax.set_title(f'{returns.name} mean return: {mean_return:.4f}')
sim_means = sim_returns.mean().squeeze()
g = sns.distplot(sim_means, kde=False, ax=ax)
g.axvline(mean_return, color='r')
return
def plot_std_dist(returns, sim_returns, ax):
std = returns.std()
ax.set_title(f'{returns.name} return std: {std:.4f}')
sim_stds = sim_returns.std().squeeze()
g = sns.distplot(sim_stds, kde=False, ax=ax)
g.axvline(std, color='r')
return
def plot_min_dist(returns, sim_returns, ax):
min_ = returns.min()
ax.set_title(f'{returns.name} return min: {min_:.4f}')
sim_mins = sim_returns.min().squeeze()
g = sns.distplot(sim_mins, kde=False, ax=ax)
g.axvline(min_, color='r')
return
def plot_max_dist(returns, sim_returns, ax):
max_ = returns.max()
ax.set_title(f'{returns.name} return max: {max_:.4f}')
sim_maxs = sim_returns.max().squeeze()
g = sns.distplot(sim_maxs, kde=False, ax=ax)
g.axvline(max_, color='r')
return
def plot_autocorr_dist(returns, sim_returns, ax):
autocorr = returns.autocorr()
ax.set_title(f'{returns.name} return autocorr: {autocorr:.4f}')
sim_autocorrs = sim_returns.apply(pd.Series.autocorr).squeeze()
g = sns.distplot(sim_autocorrs, kde=False, ax=ax)
g.axvline(autocorr, color='r')
return
def plot_stat_dist(returns, sim_returns, figsize=(10,7)):
fig = plt.figure(figsize=figsize, constrained_layout=True)
gs = fig.add_gridspec(3, 2)
plot_mean_dist(returns, sim_returns, fig.add_subplot(gs[0,0]))
plot_std_dist(returns, sim_returns, fig.add_subplot(gs[0,1]))
plot_min_dist(returns, sim_returns, fig.add_subplot(gs[1,0]))
plot_max_dist(returns, sim_returns, fig.add_subplot(gs[1,1]))
plot_autocorr_dist(returns, sim_returns, fig.add_subplot(gs[2,:]))
fig.suptitle(f'{returns.name} simulated stat distributions',
fontweight='demi', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=0.88)
return
# In[3]:
def to_returns(s): return np.log(s/s.shift(1)).dropna()
def to_price_index(df, start=100):
return (start * (np.exp(df.cumsum())))
def cdescribe(x, n_cols=None):
if not n_cols:
d = x.describe()
d.loc['skew'] = x.skew()
d.loc['kurtosis'] = x.kurtosis()
return d
else:
x_ = x.sample(n_cols, axis=1)
d = x_.describe()
d.loc['skew'] = x_.skew()
d.loc['kurtosis'] = x_.kurtosis()
return d
def CBB(s, blocksize, N_paths):
sim_returns = []
bs = CircularBlockBootstrap(blocksize, s)
for i, data in enumerate(tqdm(bs.bootstrap(N_paths))):
tmp = data[0][0].reset_index(drop=True)
sim_returns.append(tmp)
simulations = pd.concat(sim_returns, axis=1, ignore_index=True)
return simulations
def compare_stats(x, y, n_cols=None):
pd.options.display.float_format = '{:,.4f}'.format
data = (pd.concat([cdescribe(x), cdescribe(y, n_cols=n_cols)], axis=1))
return data
# In[4]:
def view_all(real, sims, n_cols=20, cmap=None):
plt.set_cmap(cmap)
display(compare_stats(real, sims, n_cols=20))
plot_stat_dist(real, sims)
plot_autocorr(real, title=f'{real.name} Real Returns')
rand_col = np.random.randint(len(sims.columns), size=1)[0]
plot_autocorr(sims[rand_col],
title=f'Simulated Return Path {rand_col}')
return
def plot_realizations(real, sims, start,
n_plot_paths=50, figsize=(10,7), cmap=None):
plt.set_cmap(cmap)
sim_prices = to_price_index(sims, start=100)
fig, ax = plt.subplots(figsize=figsize)
(sim_prices.sample(n_plot_paths, axis=1)
.plot(legend=False, alpha=0.7, lw=1., ax=ax))
(to_price_index(real.reset_index(drop=True))
.plot(legend=True, ax=ax, lw=5, ls='--', color='k'))
plt.title(f'{real.name} {n_plot_paths} simulated price paths')
# ## Get some data.
#
# Below is intraday data from the `iex` trading exchange sampled at a 30 sec resolution, resampled to 1 mean using the mean price over the period. The data covers approximately 3 months ending in December 2018.
# In[5]:
infp = Path(data_dir/'processed'/'etf_symbols_01.parq')
all_df = pd.read_parquet(infp).swaplevel().sort_index()
cprint(all_df)
# In[6]:
prices = all_df['lastSalePrice'].unstack()
cprint(prices)
# ## Extracting SPY sector ETFs
# In[7]:
sector = [x for x in prices.columns if x.startswith('XL')]
sectors = prices[sector].dropna()
cprint(sectors)
# compute the returns.
# In[8]:
sector_returns = to_returns(sectors).dropna()
# ### View example plots with `XLE`
# In[9]:
xle = sector_returns['XLE']
cprint(xle)
# ### Select the bootstrap parameters and run
# In[10]:
N_paths = 1_000
block_size = 100
xle_sims = CBB(xle, blocksize=block_size, N_paths=N_paths)
# In[11]:
view_all(xle, xle_sims, cmap=None)
# In[12]:
plot_realizations(xle, xle_sims, start=100,
n_plot_paths=250, cmap='nanex_cmap')
# ## Correlation Investigation
# ### correlation between sectors
# In[13]:
sns.heatmap(sectors.corr())
# ### correlation between real and synthetic series
# In[14]:
concat = pd.concat([xle.reset_index(drop=True), xle_sims],axis=1)
concat.corrwith(concat['XLE'])
# In[15]:
sns.heatmap(concat.corr())
# ## Generate synthetics for all sectors
# In[16]:
sim_dict = dict()
for sec in sector_returns:
tmp = sector_returns[sec]
N_paths = 2000
block_size = 100
sec_sims = CBB(tmp, blocksize=block_size, N_paths=N_paths)
sim_dict[sec] = sec_sims
view_all(tmp, sec_sims, cmap=None)
plot_realizations(tmp, sec_sims, start=100,
n_plot_paths=100, cmap='nanex_cmap')
# ## Examine correlations across synthetic realizations
#
# In this section I try to select a random sampling of synthetic realizations and blend them together. Once blended I examine the correlation between the different synthetics that were generated using each of the real sector returns.
# In[17]:
n_cols = 10
merged = dict()
for sym in sim_dict.keys():
np.random.seed(0)
tmp = sim_dict[sym]
# pick random set of columns
col_idx = np.random.randint(0, len(tmp.columns)+1, n_cols)
merged[sym] = tmp[col_idx].mean(axis=1)
df = pd.DataFrame(merged)
cprint(df)
display(df.corr())
to_price_index(df).plot()
# In[18]:
to_price_index(sector_returns.reset_index(drop=True)).plot(legend=False)
# ## Regenerate synthetics with larger blocks
# In[19]:
bsize = len(sector_returns)//10
bsize
# In[20]:
sim_dict = dict()
for sec in sector_returns:
tmp = sector_returns[sec]
N_paths = 2000
sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths)
sim_dict[sec] = sec_sims
view_all(tmp, sec_sims, cmap=None)
plot_realizations(tmp, sec_sims, start=100,
n_plot_paths=100, cmap='nanex_cmap')
# In[21]:
n_cols = 2
merged = dict()
for sym in sim_dict.keys():
np.random.seed(0)
tmp = sim_dict[sym]
# pick random set of columns
col_idx = np.random.randint(0, len(tmp.columns)+1, n_cols)
merged[sym] = tmp[col_idx].mean(axis=1)
df = pd.DataFrame(merged)
cprint(df)
display(df.corr())
to_price_index(df).plot()
# In[22]:
sns.heatmap(df.corr())
# ## Regenerate Synthetics with Smaller Blocks
# In[23]:
bsize = len(sector_returns)//1000
bsize
# In[24]:
sim_dict = dict()
for sec in sector_returns:
tmp = sector_returns[sec]
N_paths = 2000
sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths)
sim_dict[sec] = sec_sims
view_all(tmp, sec_sims, cmap=None)
plot_realizations(tmp, sec_sims, start=100,
n_plot_paths=100, cmap='nanex_cmap')
# ## Additional ETF synthetics
# In[25]:
symbols = ['SPY', 'IWM', 'ACWI', 'EEM', 'GLD',
'TLT', 'BND', 'SHY', 'LQD', 'HYG', 'VNQ']
etfs = to_returns(prices[symbols].dropna())
cprint(etfs)
# In[26]:
bsize = 20
sim_dict = dict()
for sec in etfs:
tmp = etfs[sec]
N_paths = 2000
sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths)
sim_dict[sec] = sec_sims
view_all(tmp, sec_sims, cmap=None)
plot_realizations(tmp, sec_sims, start=100,
n_plot_paths=100, cmap='nanex_cmap')
# ### Rolling Volatility
# In[27]:
sim_no = 100
window = 240
for sym in etfs.columns:
fig, ax = plt.subplots(figsize=(10,5))
sim_dict[sym][sim_no].rolling(window).std().plot(ax=ax,
label=f'{sym} sim {sim_no}',alpha=0.75)
etfs[sym].reset_index(drop=True).rolling(window).std().plot(ax=ax, alpha=0.75)
ax.legend()
# In[ ]: