#!/usr/bin/env python # coding: utf-8 #

Table of Contents

#
# ## Synthetic Data Generation Experiments # # In this notebook I experiment with synthetic data generation using block bootstrap methods available from the `arch` package. The basic idea is to compare the aspects of the real series with the bootstrapped versions. # In[1]: get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') get_ipython().run_line_magic('load_ext', 'watermark') from pathlib import Path import time from pprint import pprint, pformat import pandas as pd import numpy as np import scipy.stats as stats import statsmodels.api as sm import numba as nb from arch.bootstrap import CircularBlockBootstrap # import visual tools import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec get_ipython().run_line_magic('matplotlib', 'inline') import seaborn as sns sns_params = { 'font.size':9.5, 'font.weight':'medium', 'figure.figsize':(10,7), } plt.style.use('seaborn-talk') plt.style.use('bmh') #plt.style.use('dark_background') sns.set_context(sns_params) savefig_kwds=dict(dpi=300, bbox_inches='tight', frameon=True, format='png') nanex_colors = ("#f92b20", "#fe701b", "#facd1f", "#d6fd1c", "#65fe1b", "#1bfe42", "#1cfdb4", "#1fb9fa", "#1e71fb", "#261cfd") nanex_cmap = mpl.colors.ListedColormap(nanex_colors,name='nanex_cmap') plt.register_cmap('nanex_cmap', cmap=nanex_cmap) # import util libs from tqdm import tqdm, tqdm_notebook import warnings warnings.filterwarnings("ignore") from src.tools.pystore_tools import * from src.tools.utils import * from src.CONSTANTS import PROJECT_REPO_NAME pdir = get_relative_project_dir( project_repo_name=PROJECT_REPO_NAME, partial=False) data_dir = Path(pdir/'data') get_ipython().run_line_magic('watermark', '-v -m -g') print() get_ipython().run_line_magic('watermark', '--iversions') # ## Convenience Functions # # Below I have aggregated some convenience functions to plot the various statistics such as mean, std, min, max, and autocorrelation structures between the real series and the synthetic. The plots also feature comparisons of the indexed price series among the various realizations. # In[2]: def plot_autocorr(s, lags=50, figsize=(10,7), title=None): fig = plt.figure(figsize=figsize) layout = 2, 2 acf_ax = plt.subplot2grid(layout, (0, 0)) abs_acf_ax = plt.subplot2grid(layout, (0, 1)) pacf_ax = plt.subplot2grid(layout, (1, 0)) squared_ax = plt.subplot2grid(layout, (1, 1)) sm.graphics.tsa.plot_acf(s, fft=True, zero=False, lags=lags, ax=acf_ax, title='Autocorrelation of Returns'); sm.graphics.tsa.plot_acf(s.abs(), fft=True, zero=False, lags=lags, ax=abs_acf_ax, title='Autocorrelation of Absolute Returns'); sm.graphics.tsa.plot_pacf(s, zero=False, lags=lags, ax=pacf_ax, title='Partial Autocorrelation of Returns'); sm.graphics.tsa.plot_acf(s**2, fft=True, zero=False, lags=lags, ax=squared_ax, title='Autocorrelation of Squared Returns'); if title: fig.suptitle(title, fontweight='demi', fontsize=16) fig.tight_layout() fig.subplots_adjust(top=0.88) return def plot_mean_dist(returns, sim_returns, ax): mean_return = returns.mean() ax.set_title(f'{returns.name} mean return: {mean_return:.4f}') sim_means = sim_returns.mean().squeeze() g = sns.distplot(sim_means, kde=False, ax=ax) g.axvline(mean_return, color='r') return def plot_std_dist(returns, sim_returns, ax): std = returns.std() ax.set_title(f'{returns.name} return std: {std:.4f}') sim_stds = sim_returns.std().squeeze() g = sns.distplot(sim_stds, kde=False, ax=ax) g.axvline(std, color='r') return def plot_min_dist(returns, sim_returns, ax): min_ = returns.min() ax.set_title(f'{returns.name} return min: {min_:.4f}') sim_mins = sim_returns.min().squeeze() g = sns.distplot(sim_mins, kde=False, ax=ax) g.axvline(min_, color='r') return def plot_max_dist(returns, sim_returns, ax): max_ = returns.max() ax.set_title(f'{returns.name} return max: {max_:.4f}') sim_maxs = sim_returns.max().squeeze() g = sns.distplot(sim_maxs, kde=False, ax=ax) g.axvline(max_, color='r') return def plot_autocorr_dist(returns, sim_returns, ax): autocorr = returns.autocorr() ax.set_title(f'{returns.name} return autocorr: {autocorr:.4f}') sim_autocorrs = sim_returns.apply(pd.Series.autocorr).squeeze() g = sns.distplot(sim_autocorrs, kde=False, ax=ax) g.axvline(autocorr, color='r') return def plot_stat_dist(returns, sim_returns, figsize=(10,7)): fig = plt.figure(figsize=figsize, constrained_layout=True) gs = fig.add_gridspec(3, 2) plot_mean_dist(returns, sim_returns, fig.add_subplot(gs[0,0])) plot_std_dist(returns, sim_returns, fig.add_subplot(gs[0,1])) plot_min_dist(returns, sim_returns, fig.add_subplot(gs[1,0])) plot_max_dist(returns, sim_returns, fig.add_subplot(gs[1,1])) plot_autocorr_dist(returns, sim_returns, fig.add_subplot(gs[2,:])) fig.suptitle(f'{returns.name} simulated stat distributions', fontweight='demi', fontsize=16) fig.tight_layout() fig.subplots_adjust(top=0.88) return # In[3]: def to_returns(s): return np.log(s/s.shift(1)).dropna() def to_price_index(df, start=100): return (start * (np.exp(df.cumsum()))) def cdescribe(x, n_cols=None): if not n_cols: d = x.describe() d.loc['skew'] = x.skew() d.loc['kurtosis'] = x.kurtosis() return d else: x_ = x.sample(n_cols, axis=1) d = x_.describe() d.loc['skew'] = x_.skew() d.loc['kurtosis'] = x_.kurtosis() return d def CBB(s, blocksize, N_paths): sim_returns = [] bs = CircularBlockBootstrap(blocksize, s) for i, data in enumerate(tqdm(bs.bootstrap(N_paths))): tmp = data[0][0].reset_index(drop=True) sim_returns.append(tmp) simulations = pd.concat(sim_returns, axis=1, ignore_index=True) return simulations def compare_stats(x, y, n_cols=None): pd.options.display.float_format = '{:,.4f}'.format data = (pd.concat([cdescribe(x), cdescribe(y, n_cols=n_cols)], axis=1)) return data # In[4]: def view_all(real, sims, n_cols=20, cmap=None): plt.set_cmap(cmap) display(compare_stats(real, sims, n_cols=20)) plot_stat_dist(real, sims) plot_autocorr(real, title=f'{real.name} Real Returns') rand_col = np.random.randint(len(sims.columns), size=1)[0] plot_autocorr(sims[rand_col], title=f'Simulated Return Path {rand_col}') return def plot_realizations(real, sims, start, n_plot_paths=50, figsize=(10,7), cmap=None): plt.set_cmap(cmap) sim_prices = to_price_index(sims, start=100) fig, ax = plt.subplots(figsize=figsize) (sim_prices.sample(n_plot_paths, axis=1) .plot(legend=False, alpha=0.7, lw=1., ax=ax)) (to_price_index(real.reset_index(drop=True)) .plot(legend=True, ax=ax, lw=5, ls='--', color='k')) plt.title(f'{real.name} {n_plot_paths} simulated price paths') # ## Get some data. # # Below is intraday data from the `iex` trading exchange sampled at a 30 sec resolution, resampled to 1 mean using the mean price over the period. The data covers approximately 3 months ending in December 2018. # In[5]: infp = Path(data_dir/'processed'/'etf_symbols_01.parq') all_df = pd.read_parquet(infp).swaplevel().sort_index() cprint(all_df) # In[6]: prices = all_df['lastSalePrice'].unstack() cprint(prices) # ## Extracting SPY sector ETFs # In[7]: sector = [x for x in prices.columns if x.startswith('XL')] sectors = prices[sector].dropna() cprint(sectors) # compute the returns. # In[8]: sector_returns = to_returns(sectors).dropna() # ### View example plots with `XLE` # In[9]: xle = sector_returns['XLE'] cprint(xle) # ### Select the bootstrap parameters and run # In[10]: N_paths = 1_000 block_size = 100 xle_sims = CBB(xle, blocksize=block_size, N_paths=N_paths) # In[11]: view_all(xle, xle_sims, cmap=None) # In[12]: plot_realizations(xle, xle_sims, start=100, n_plot_paths=250, cmap='nanex_cmap') # ## Correlation Investigation # ### correlation between sectors # In[13]: sns.heatmap(sectors.corr()) # ### correlation between real and synthetic series # In[14]: concat = pd.concat([xle.reset_index(drop=True), xle_sims],axis=1) concat.corrwith(concat['XLE']) # In[15]: sns.heatmap(concat.corr()) # ## Generate synthetics for all sectors # In[16]: sim_dict = dict() for sec in sector_returns: tmp = sector_returns[sec] N_paths = 2000 block_size = 100 sec_sims = CBB(tmp, blocksize=block_size, N_paths=N_paths) sim_dict[sec] = sec_sims view_all(tmp, sec_sims, cmap=None) plot_realizations(tmp, sec_sims, start=100, n_plot_paths=100, cmap='nanex_cmap') # ## Examine correlations across synthetic realizations # # In this section I try to select a random sampling of synthetic realizations and blend them together. Once blended I examine the correlation between the different synthetics that were generated using each of the real sector returns. # In[17]: n_cols = 10 merged = dict() for sym in sim_dict.keys(): np.random.seed(0) tmp = sim_dict[sym] # pick random set of columns col_idx = np.random.randint(0, len(tmp.columns)+1, n_cols) merged[sym] = tmp[col_idx].mean(axis=1) df = pd.DataFrame(merged) cprint(df) display(df.corr()) to_price_index(df).plot() # In[18]: to_price_index(sector_returns.reset_index(drop=True)).plot(legend=False) # ## Regenerate synthetics with larger blocks # In[19]: bsize = len(sector_returns)//10 bsize # In[20]: sim_dict = dict() for sec in sector_returns: tmp = sector_returns[sec] N_paths = 2000 sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths) sim_dict[sec] = sec_sims view_all(tmp, sec_sims, cmap=None) plot_realizations(tmp, sec_sims, start=100, n_plot_paths=100, cmap='nanex_cmap') # In[21]: n_cols = 2 merged = dict() for sym in sim_dict.keys(): np.random.seed(0) tmp = sim_dict[sym] # pick random set of columns col_idx = np.random.randint(0, len(tmp.columns)+1, n_cols) merged[sym] = tmp[col_idx].mean(axis=1) df = pd.DataFrame(merged) cprint(df) display(df.corr()) to_price_index(df).plot() # In[22]: sns.heatmap(df.corr()) # ## Regenerate Synthetics with Smaller Blocks # In[23]: bsize = len(sector_returns)//1000 bsize # In[24]: sim_dict = dict() for sec in sector_returns: tmp = sector_returns[sec] N_paths = 2000 sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths) sim_dict[sec] = sec_sims view_all(tmp, sec_sims, cmap=None) plot_realizations(tmp, sec_sims, start=100, n_plot_paths=100, cmap='nanex_cmap') # ## Additional ETF synthetics # In[25]: symbols = ['SPY', 'IWM', 'ACWI', 'EEM', 'GLD', 'TLT', 'BND', 'SHY', 'LQD', 'HYG', 'VNQ'] etfs = to_returns(prices[symbols].dropna()) cprint(etfs) # In[26]: bsize = 20 sim_dict = dict() for sec in etfs: tmp = etfs[sec] N_paths = 2000 sec_sims = CBB(tmp, blocksize=bsize, N_paths=N_paths) sim_dict[sec] = sec_sims view_all(tmp, sec_sims, cmap=None) plot_realizations(tmp, sec_sims, start=100, n_plot_paths=100, cmap='nanex_cmap') # ### Rolling Volatility # In[27]: sim_no = 100 window = 240 for sym in etfs.columns: fig, ax = plt.subplots(figsize=(10,5)) sim_dict[sym][sim_no].rolling(window).std().plot(ax=ax, label=f'{sym} sim {sim_no}',alpha=0.75) etfs[sym].reset_index(drop=True).rolling(window).std().plot(ax=ax, alpha=0.75) ax.legend() # In[ ]: