%load_ext watermark
%watermark -a 'cs224' -u -d -v -p numpy,pandas,matplotlib,sklearn,h5py,zipline
%matplotlib inline
import numpy as np, scipy, scipy.stats as stats, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import sklearn, sklearn.pipeline, sklearn.model_selection, sklearn.preprocessing, sklearn.linear_model
import re, fnmatch
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
np.set_printoptions(edgeitems=10)
np.set_printoptions(linewidth=1000)
np.set_printoptions(suppress=True)
np.core.arrayprint._line_width = 180
SEED = 42
np.random.seed(SEED)
sns.set()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
import os,sys
path = os.path.realpath(os.path.abspath('') + '/csi_futures_data')
sys.path.append(path)
%load_ext autoreload
%autoreload 1
%aimport csi_futures_data
from IPython.display import display, HTML
from IPython.display import display_html
def display_side_by_side(*args):
html_str=''
for df in args:
if type(df) == np.ndarray:
df = pd.DataFrame(df)
html_str+=df.to_html()
html_str = html_str.replace('table','table style="display:inline"')
# print(html_str)
display_html(html_str,raw=True)
CSS = """
.output {
flex-direction: row;
}
"""
def display_graphs_side_by_side(*args):
html_str='<table><tr>'
for g in args:
html_str += '<td>'
html_str += g._repr_svg_()
html_str += '</td>'
html_str += '</tr></table>'
display_html(html_str,raw=True)
display(HTML("<style>.container { width:70% !important; }</style>"))
# the following list is a hand curated list of futures as a combination of data from:
# - Following the Trend
# - Trading Evolved
# - zipline.finance.constants.ROOT_SYMBOL_TO_ETA
# - zipline.finance.constants.FUTURE_EXCHANGE_FEES_BY_SYMBOL
csi_futures_data_df = csi_futures_data.futures_lookup_.copy()
csi_futures_data_df
# the meta.csv is from the package that Andreas Clenow provides for his random_futures_data: https://www.dropbox.com/s/etocgt9zgeedo22/data.zip?dl=0
futures_meta_df = pd.read_csv('meta.csv', index_col=0)
futures_meta_df
# outdated: http://www.csidata.com/factsheets/factsheet-futures.html
# csidata_futures_fact_sheet_df = pd.read_excel('~/2020-10-13-csidata-futures-fact-sheet.xlsx')
# csidata_futures_fact_sheet_df.drop(csidata_futures_fact_sheet_df.columns[1], axis=1, inplace=True)
# csidata_futures_fact_sheet_df.head()
# New, up to date: http://www.csidata.com/factsheets.php?type=commodity&format=htmltable&exchangeid=
csidata_futures_fact_sheet_df = pd.read_csv('./2020-10-29-commodityfactsheet.csv', parse_dates=['StartDate', 'EndDate'])
csidata_futures_fact_sheet_df = csidata_futures_fact_sheet_df[['UACsiNumber', 'SymbolUA', 'ExchangeSymbol', 'Exchange', 'Name', 'SessionType', 'IsActive', 'TerminalPointValue', 'FullPointValue', 'Currency', 'ContractSize', 'Units', 'MinimumTick', 'TickValue', 'StartDate', 'EndDate',\
'LinkSymbol', 'ConversionFactorCode', 'HasCurrentDayVolume', 'HasCurrentDayOpenInterest', 'HasKnownExpirationDates', 'LastTotalVolume']]
csidata_futures_fact_sheet_df.head()
s1 = set(csi_futures_data_df.root_symbol)
s2 = set(futures_meta_df.root_symbol)
# verify that all symbols that are present in the meta.csv are present in csi_futures_data_df
s2 - s1
My understanding is that the values in csi_futures_data_df
are correct and the values in meta.csv
are incorrect.
ldf = pd.merge(csi_futures_data_df, futures_meta_df, how='left', on=['root_symbol'])
ldf.head()
ldf[(ldf['multiplier_x'] != ldf['multiplier_y']) & ldf['root_symbol'].isin(futures_meta_df['root_symbol'])]
ldf[(ldf['minor_fx_adj_x'] != ldf['minor_fx_adj_y']) & ldf['root_symbol'].isin(futures_meta_df['root_symbol'])]
ldf[(ldf['exchange_x'] != ldf['exchange_y']) & ldf['root_symbol'].isin(futures_meta_df['root_symbol'])]
ldf_ = pd.merge(csi_futures_data_df, csidata_futures_fact_sheet_df, how='left', left_on=['csi_symbol'], right_on=['SymbolUA'])
#ldf_ = ldf_[['csi_symbol', 'root_symbol', 'multiplier', 'minor_fx_adj', 'description', 'exchange', 'sector', 'EXCHANGE', 'EXCHANGE SYMBOL', 'NAME', 'ACTIVE', 'CONTRACT SIZE', 'UNIT OF MEASURE', 'CURRENCY', 'FULL POINT VALUE']]
#ldf_['numeric_full_point_value'] = ldf_['FULL POINT VALUE'].str.extract(r'^(\d+).*$').astype(np.int)
ldf_['calc_poin_value'] = (ldf_['multiplier'] * ldf_['minor_fx_adj'])#.astype(np.int)
ldf_.head()
# This here is the most important as this makes sure that we agree on the point value
# The expected result is that the dataframe has 0 entries
ldf_[ldf_['FullPointValue'] != ldf_['calc_poin_value']]
# In general I tried to use the symbols that represent the combined floor and electronic session
ldf_[~ldf_['SessionType'].str.contains(r'Combined').astype(np.bool)]
list(ldf_[~ldf_['SessionType'].str.contains(r'Combined').astype(np.bool)].loc[:,'csi_symbol'].values)
# The name of the exchange is irrelevant, but just for the sake of showing the differences
ldf_[ldf_['exchange'] != ldf_['Exchange']].loc[:,['csi_symbol', 'root_symbol', 'multiplier', 'minor_fx_adj', 'description', 'sector', 'exchange', 'Exchange']]
# All futures trade in USD
ldf_[ldf_['Currency'] != 'USD']
# XXX This here is also very important as this makes sure that we agree on the minor_fx_adj
# Only for "Class III Milk" the pattern is a bit different, but this USD/CWT should also mean that minor_fx_adj is 0.01, e.g. this should be correct.
ldf_['factsheet_minor_fx_adj'] = -1.0
selector = ldf_['Units'].str.startswith('cents').astype(np.bool)
ldf_.loc[selector, 'factsheet_minor_fx_adj'] = 0.01
ldf_.loc[~selector, 'factsheet_minor_fx_adj'] = 1.0
ldf_[ldf_['factsheet_minor_fx_adj'] != ldf_['minor_fx_adj']]
# make sure that all contracts are still trading
ldf_[ldf_['LastTotalVolume'] < 1.0]
# make sure that all contracts are still trading
ldf_[ldf_['EndDate'] != pd.Timestamp('2020-10-28')]
# not all contracts existed at the beginning of the time winow
ldf_[ldf_['StartDate'] > pd.Timestamp('2000-12-31')]
# Some indices are the continuation of others
csidata_futures_fact_sheet_df[csidata_futures_fact_sheet_df['SymbolUA'].isin(['LRC', 'LKD', 'LKR'])]
# Some indices are the continuation of others
csidata_futures_fact_sheet_df[csidata_futures_fact_sheet_df['SymbolUA'].isin(['RB2', 'HU2', 'HU'])]
# Some indices are the continuation of others
csidata_futures_fact_sheet_df[csidata_futures_fact_sheet_df['SymbolUA'].isin(['YM', 'DJ'])]
# Verify that all the data files are available
ldf['cash_file_available'] = False
ldf['first_future_year'] = ''
ldf['first_future_month'] = ''
ldf['last_future_year'] = ''
ldf['last_future_month'] = ''
for idx, row in ldf_.iterrows():
csi_symbol = row['csi_symbol']
csi_file_symbol = csi_symbol.ljust(3, '_')
dir_path = csi_futures_data.data_path3 + '/' + csi_symbol
file_name_list = sorted(fnmatch.filter(os.listdir(dir_path), '*.CSV'))
cash_file_name = file_name_list[0]
r = re.match(r'^{}(\d\d\d\d)(.).*$'.format(csi_file_symbol), cash_file_name)
if not r:
raise RuntimeError('cash_file_name does not match file pattern')
cash_file_available = r.group(2) == '$'
ldf_.loc[idx, 'cash_file_available'] = cash_file_available
if cash_file_available:
futures_file_name_list_start = 1
else:
futures_file_name_list_start = 0
futures_file_name_list = file_name_list[futures_file_name_list_start:]
first_future = futures_file_name_list[0]
r = re.match(r'^{}(\d\d\d\d)(.).*$'.format(csi_file_symbol), first_future)
if not r:
raise RuntimeError('first_future does not match file pattern')
first_future_year = r.group(1)
first_future_month = r.group(2)
last_future = futures_file_name_list[-1]
r = re.match(r'^{}(\d\d\d\d)(.).*$'.format(csi_file_symbol), last_future)
if not r:
raise RuntimeError('last_future does not match file pattern')
last_future_year = r.group(1)
last_future_month = r.group(2)
ldf_.loc[idx, 'first_future_year'] = first_future_year
ldf_.loc[idx, 'first_future_month'] = first_future_month
ldf_.loc[idx, 'last_future_year'] = last_future_year
ldf_.loc[idx, 'last_future_month'] = last_future_month
ldf2 = ldf_[['csi_symbol','root_symbol','multiplier','minor_fx_adj','description','sector', 'ExchangeSymbol','Exchange','Name','SessionType','FullPointValue','Currency','ContractSize','Units','StartDate','EndDate','LastTotalVolume','calc_poin_value',
'factsheet_minor_fx_adj', 'cash_file_available','first_future_year','first_future_month','last_future_year','last_future_month']]
ldf2
# See above: not all contracts started at the beginning of the time window at 2000
# ldf2[ldf2['first_future_year'] > '2000']
# ldf2[ldf2['last_future_year'] < '2020']
# ldf2[ldf2['StartDate'] > pd.Timestamp('2000-12-31')]
#ldf_.to_excel('futures_portfolio_overview.xlsx')
ldf = csi_futures_data.futures_markets_lookup.copy()
ldf['any_market'] = ldf['trend_following_markets'] | ldf['time_return_markets'] | ldf['counter_trend_markets'] | ldf['curve_trading_markets']
ldf.head()
# ldf.to_excel('futures_markets_lookup.xlsx')
# I checked manually: the following are all not traded any more. They were part of the Quantopian futures offering in 2018.
ldf[~ldf['is_in_bundle']]
# The zipline.finance.constants.ROOT_SYMBOL_TO_ETA and zipline.finance.constants.FUTURE_EXCHANGE_FEES_BY_SYMBOL variables agree with each other
ldf[ldf['ROOT_SYMBOL_TO_ETA'] != ldf['FUTURE_EXCHANGE_FEES_BY_SYMBOL']]
# See above: these are the contracts that are not traded any longer
ldf[pd.isnull(ldf['root_symbol'])]
# The following are the contracts that are mentioned in Andreas' books, but were not part of the Quantopian futures offering
ldf[~pd.isnull(ldf['root_symbol']) & ~ldf['ROOT_SYMBOL_TO_ETA']]
# Get all the symbols that Andreas used in his trend_following_markets strategy
tfm1 = csi_futures_data.get_bundle_market_symbols('trend_following_markets')
tfm1
tfm2 = csi_futures_data.trend_following_markets
tfm2
set(tfm1) - set(tfm2)
set(tfm2) - set(tfm1)