Even the simplest NNs can be useful. This notebook demonstrates a simple approach to building a predictive trading model.
-Dynamically load daily price history from Yahoo / Google Finance
-Apply some feature engineering
-Train XGBoost and NN Models to predict the direction of the next day's price move
-Re-Train based on the most important Feature Columns
-Predict using the trained models and evaluate the financial performance
-Do Basic Feature Engineering in Pandas
-Use XGBoost to detect most important features
-Stop Training a Model when the metrics fail to improve, and re-set model to what it was after the best epoch
-Visualise Model input and outputs
-Plot loss and accuracy charts
-Plot some cool charts like Auto Correlation & Scatter Matrix
-Make Money applying NN to Finance !
import numpy as np
from datetime import datetime
import operator
import pandas as pd
from pandas.tools.plotting import autocorrelation_plot
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from matplotlib.pyplot import legend
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Reshape, Dropout, Convolution2D, MaxPooling2D, LSTM
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.linear_model import LinearRegression
from keras.utils import np_utils
import itertools
import copy
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas_datareader as pdr
from IPython import display
# ---------------------------------------------------------------------------
# Run configuration (module-level switches read by the functions below)
# ---------------------------------------------------------------------------
useGoogle = True    # True -> Google Finance, False -> Yahoo Finance
INST = '^GSPC'      # Financial instrument code, e.g. IBM, F, FB
useLSTM = False     # Use the LSTM model instead of the Dense NN
plotall = True      # Display all the charts
epochs = 1000       # Max number of epochs to run
waitEpochs = 100    # Max epochs to wait for an improvement before stopping

# ---------------------------------------------------------------------------
# Globals that are created later by the functions / run cells
# ---------------------------------------------------------------------------
# rates        DataFrame of raw price data
# AA           feature-engineered DataFrame
# x, xi, xt    np arrays for x; xi is the train set, xt is the test set
# y, yi, yt    np arrays for y; yi is the train set, yt is the test set
# split        array index that separates the train set from the test set
# inputCols    list of columns from AA that will be used to build x
# m            current model
# window       LSTM window (set to 0 in the run cell)
# offset       ignore the first n rows of x (set to 0 in the run cell)
# posVect      position predictions: 1 = "LONG", -1 = "SHORT"
# predictions  raw model predictions
# pnlVect      profit & loss vector
def GetData():
    """Load the daily price history for ``INST`` into the global ``rates``.

    Prices are requested from 1990-01-01 up to the current time, from Google
    when ``useGoogle`` is true and from Yahoo otherwise.  Columns are renamed
    so that downstream code can rely on ``OPEN``, ``HIGH``, ``LOW`` and
    ``CLOSE`` (``CLOSE`` is the adjusted close), missing volumes become 0 and
    the date index is turned into a regular column via ``reset_index``.
    """
    global rates
    start = datetime(1990, 1, 1)
    end = datetime.now()
    if useGoogle:
        # NOTE(review): Google support in pandas_datareader may no longer be
        # available in current releases -- confirm before relying on it.
        rates = pdr.get_data_google(INST, start, end)
        # Google does not have an adjusted price for corporate actions, so
        # the raw close stands in for it.
        rates['Adj Close'] = rates['Close']
    else:
        rates = pdr.get_data_yahoo(INST, start, end)
    rates.rename(
        columns={'Open': 'OPEN', 'High': 'HIGH', 'Low': 'LOW', 'Adj Close': 'CLOSE'},
        inplace=True,
    )
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form operates on a temporary
    # under pandas Copy-on-Write and would leave the NaNs in `rates`.
    rates['Volume'] = rates['Volume'].fillna(0)
    rates = rates.reset_index()
def Start(inst):
    """Select instrument ``inst``, download its prices and optionally chart them.

    Sets the global ``INST``, calls ``GetData()`` and, when ``plotall`` is
    enabled, plots the ``CLOSE`` column against ``Date`` with the instrument
    code as the title.
    """
    global INST
    INST = inst
    GetData()
    if not plotall:
        return
    _, axes = plt.subplots(1, 1)
    axes.plot(rates['Date'], rates['CLOSE'])
    axes.set_title(inst)
    plt.show()
def PlotAutoCorrel():
    """Show the autocorrelation plot of the raw close and a describe() summary."""
    plt.figure()
    autocorrelation_plot(rates['CLOSE'], label=INST)
    plt.show()
    print("Describing " + INST)
    summary = rates.describe()
    display.display(summary)
def divMax(data, column):  # scale
    """Scale ``data[column]`` in place by dividing it by its maximum value.

    ``Series.max()`` is used rather than the builtin ``max()``: the builtin
    compares element by element, so a leading NaN is returned as the
    "maximum" and turns the whole column into NaN, whereas ``Series.max()``
    skips missing values.
    """
    data[column] = data[column] / data[column].max()
def mavg(data, column, periods):  # moving average
    """Add ``<column>mavg<periods>``: the rolling mean of ``column``.

    The first ``periods - 1`` rows are NaN because the window is not yet full.
    """
    target = column + "mavg" + str(periods)
    data[target] = data[column].rolling(periods).mean()
def logColumn(data, column):  # log
    """Replace ``data[column]`` in place with its natural logarithm."""
    original = data[column]
    data[column] = np.log(original)
def pct(data, column):  # % change
    """Add ``<column>pct``: the one-row change of ``column`` as a fraction.

    NOTE: the change is divided by the *current* value, not the previous
    one, so the result differs from ``Series.pct_change()``.  The first row
    is NaN because it has no predecessor.
    """
    current = data[column]
    previous = current.shift()
    data[column + "pct"] = (current - previous) / current
def mom(data, column, MomPeriodOffset, iqrWindow=3):  # Momentum
    """Add ``<column>mom<MomPeriodOffset>``: a volatility-scaled momentum.

    For each row ``i`` the change ``x[i] - x[i - MomPeriodOffset]`` is divided
    by the interquartile range (75th minus 25th percentile) of the
    ``iqrWindow`` values preceding row ``i``.  Dividing by the IQR keeps the
    feature from taking very large or very small values.

    Args:
        data: DataFrame that holds ``column``; it is modified in place.
        column: name of the source column.
        MomPeriodOffset: look-back distance, in rows, of the price change.
        iqrWindow: number of preceding rows used for the IQR (default 3,
            the value that used to be hard-coded).

    Rows that lack either the look-back value or a full IQR window are left
    at 0, as are rows whose IQR is (numerically) zero.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    x = data[column].values
    res = np.zeros(len(data))
    # Start only once both x[i - MomPeriodOffset] and the whole IQR window
    # exist; otherwise the slice below is empty or wraps around via negative
    # indices.  For MomPeriodOffset >= iqrWindow this equals the original
    # `i > MomPeriodOffset - 1` condition.
    first = max(MomPeriodOffset, iqrWindow, 0)
    for i in range(first, len(x)):
        iqr = np.subtract(*np.percentile(x[(i - iqrWindow):i], [75, 25]))
        if iqr <= 0.000000001:
            res[i] = 0.0  # flat window: avoid dividing by ~0
        else:
            res[i] = (x[i] - x[i - MomPeriodOffset]) / iqr
    data[column + "mom" + str(MomPeriodOffset)] = res
def bolWidth(data, column, windowsize):  # Bollinger Band Width
    """Add ``<column>bolW<windowsize>``: the relative Bollinger band width.

    For each row ``i > windowsize`` the mean and (population) standard
    deviation of ``x[i - windowsize - 1:i]`` are computed -- that is the
    ``windowsize + 1`` values *before* row ``i`` -- and the width is
    ``(upper - lower) / mean`` with bands at mean +/- 2 standard deviations.
    Earlier rows are left at 0.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    x = data[column].values
    res = np.zeros(len(data))
    for i in range(max(windowsize + 1, 0), len(x)):
        history = x[i - windowsize - 1:i]
        centre = np.mean(history)
        spread = np.std(history)
        bolup = centre + 2 * spread
        boldown = centre - 2 * spread
        res[i] = (bolup - boldown) / centre
    data[column + "bolW" + str(windowsize)] = res
def TR(data):  # True Range
    """Add a ``TR`` column: the true range of each bar.

    The true range of row ``i`` is the largest of ``HIGH - LOW``,
    ``|HIGH - previous CLOSE|`` and ``|LOW - previous CLOSE|``.  Row 0 has
    no previous close and stays at 0.

    The original loop started at ``i > 1`` and therefore also left row 1 at
    0 even though its previous close exists; that off-by-one is fixed here.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    high = data['HIGH'].values
    low = data['LOW'].values
    close = data['CLOSE'].values
    res = np.zeros(len(data))
    if len(res) > 1:
        prevClose = close[:-1]
        # Element-wise maximum of the three candidate ranges (NaN propagates,
        # as it did with np.amax in the per-row loop).
        res[1:] = np.maximum.reduce([
            high[1:] - low[1:],
            np.abs(high[1:] - prevClose),
            np.abs(low[1:] - prevClose),
        ])
    data['TR'] = res
def ATR(data, window):  # Average True Range
    """Add ``ATR<window>``: the exponentially weighted mean of ``TR``.

    Uses ``span=window`` and ``min_periods=window``, so the first
    ``window - 1`` rows are NaN.  Requires the ``TR`` column to exist.
    """
    smoother = data['TR'].ewm(span=window, min_periods=window)
    data["ATR" + str(window)] = smoother.mean()
def LR(y):  # Linear Regression
    """Fit a straight line through ``y`` and extrapolate one step ahead.

    The regressor is the position ``0 .. len(y) - 1``; the function returns
    ``LinearRegression.predict`` evaluated at position ``len(y)``, i.e. the
    array holding the forecast for the next point.
    """
    # A plain 2-D ndarray column replaces np.matrix, which NumPy discourages
    # and which current scikit-learn releases refuse as estimator input.
    X = np.arange(len(y)).reshape(-1, 1)
    reg = LinearRegression()
    reg.fit(X, y)
    nextPos = np.array([len(y)]).reshape(-1, 1)
    return reg.predict(nextPos)
def TSF(data, column, window):
    """Add ``TSF<window>``: a time-series forecast based on linear regression.

    For each row ``i > window`` a line is fitted (via ``LR``) to
    ``x[i - (window - 1):i]`` -- the ``window - 1`` values preceding row
    ``i`` -- and its one-step-ahead extrapolation is stored.  Earlier rows
    are left at 0.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    x = data[column].values
    res = np.zeros(len(data))
    for i in range(max(window + 1, 0), len(data)):
        forecast = LR(x[i - (window - 1):i])
        # LR returns the prediction wrapped in an array; take the scalar
        # explicitly because assigning a 1-element array to a scalar slot is
        # deprecated in NumPy.
        res[i] = np.ravel(forecast)[0]
    data["TSF" + str(window)] = res
def norm(data, column, window):  # rolling normalisation function
    """Replace ``data[column]`` with a rolling range-normalised version.

    For each row ``i > window`` the value becomes
    ``(x[i] - mean(sub)) / (max(sub) - min(sub))`` where ``sub`` is the
    ``window`` values preceding row ``i`` (row ``i`` itself is excluded).
    NaNs in the input are treated as 0.  Rows ``0 .. window`` are set to 0,
    and so is any row whose look-back window is flat (max == min), which
    would otherwise produce a 0/0 NaN or an infinity.
    """
    # Work on a private float copy: the NaN replacement below must not write
    # through to the DataFrame's buffer (which may also be read-only).
    # This also replaces `as_matrix()`, removed in pandas 1.0.
    x = np.array(data[column], dtype=float)
    x[np.isnan(x)] = 0
    res = np.zeros(len(data))
    for i in range(max(window + 1, 0), len(x)):
        sub = x[i - window:i]
        spread = np.amax(sub) - np.amin(sub)
        if spread > 0:
            res[i] = (x[i] - np.mean(sub)) / spread
            # alternative scaling: (x[i] - mean) / np.std(sub)
    data[column] = res
def objective(data):  # used if we want to have a regressor rather than a classifier
    """Add an ``objective`` column: the next row's ATR-scaled change in CLOSE.

    ``objective[i] = (CLOSE[i + 1] - CLOSE[i]) / ATR100[i + 1]``.  The last
    row has no successor and is left at 0.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    close = data['CLOSE'].values
    atr = data['ATR100'].values
    res = np.zeros(len(data))
    if len(res) > 1:
        # Vectorised form of the original per-row loop.
        res[:-1] = (close[1:] - close[:-1]) / atr[1:]
    data['objective'] = res
def Except(full_list, excludes):  # list "except" function
    """Lazily yield the items of ``full_list`` that are not in ``excludes``.

    Returns a generator; the original order of ``full_list`` is preserved.
    """
    banned = frozenset(excludes)
    return (item for item in full_list if item not in banned)
def PrepData(XColumns=""):
    """Build the feature-engineered frame ``AA`` from the raw ``rates``.

    Sets the globals ``AA`` (features), ``inputCols`` (model input columns)
    and ``split`` (index of the first test row, at 80% of the data).

    Args:
        XColumns: columns to use as model inputs.  An empty value (the
            default ``""``, ``None`` or an empty sequence) selects every
            column of ``AA`` except ``Date`` and ``CLOSEpct``.

    The order of the feature calls matters: ``pct`` must see the raw close
    before ``norm`` overwrites the price columns, and the column order
    determines the order of ``inputCols``.
    """
    global AA
    global split
    global inputCols
    AA = rates.copy()
    pct(AA, 'CLOSE')
    for priceCol in ('CLOSE', 'OPEN', 'HIGH', 'LOW'):
        norm(AA, priceCol, 100)
    for lag in (3, 5, 10, 30, 100):
        mom(AA, 'CLOSE', lag)
    TR(AA)
    for span in (7, 10, 20, 100):
        ATR(AA, span)
    bolWidth(AA, 'CLOSE', 20)
    TSF(AA, 'CLOSE', 10)
    for periods in (10, 30, 100, 200):
        mavg(AA, 'CLOSE', periods)
    # Drop the first 200 rows (indicator warm-up).  reset_index() returns a
    # new frame, so the column assignments below do not write into a slice
    # of the previous one (which triggers SettingWithCopy warnings).
    AA = AA.loc[200:].reset_index(drop=True)
    AA['ATR10v20'] = AA['ATR10'] / AA['ATR20']
    AA['ATR10v100'] = AA['ATR10'] / AA['ATR100']
    AA['DeltaBolW20'] = AA['CLOSEbolW20'].diff()
    if XColumns is None or len(XColumns) == 0:
        # Default: every column except Date and CLOSEpct.
        inputCols = list(Except(AA.columns.tolist(), ['Date', 'CLOSEpct']))
    else:
        inputCols = XColumns
    # The price columns were already normalised above; normalise the rest.
    for c in Except(inputCols, ['OPEN', 'HIGH', 'LOW', 'CLOSE']):
        norm(AA, c, 200)
    objective(AA)  # add the objective column
    split = int(len(AA) * .8)
def PlotAutoCorrelAfter():
    """Chart the processed ``CLOSE`` column of ``AA`` and its autocorrelation.

    The first 201 rows are skipped before plotting.
    """
    processed = AA['CLOSE'][201:]
    plt.plot(processed)
    plt.show()
    autocorrelation_plot(processed, label=INST)
    plt.show()
def GetFeatureImportance():
    """Fit an XGBoost classifier on the training rows and rank the inputs.

    The label is 1 when the next row's ``CLOSEpct`` is positive and 0
    otherwise.  Prints the accuracy on the test rows and returns the
    classifier's ``feature_importances_`` (one value per entry of
    ``inputCols``, in the same order).
    """
    model = XGBClassifier()
    x = AA[inputCols]
    # Tomorrow's close move: 0 for down (or unchanged), 1 for up.
    y = np.sign(AA['CLOSEpct'].shift(-1))
    y[y < 0] = 0
    # Positional slicing: `.loc[:split]` is end-inclusive and put row `split`
    # into both the training and the test set.
    xi = x.iloc[:split]
    yi = y.iloc[:split]
    # NOTE(review): unlike TrainModel, the first 201 rows are included here.
    model.fit(xi, yi)
    featureImp = model.feature_importances_
    if plotall:
        xgb.plot_importance(model)
        plt.show()
    # Rows split .. len-2; the last row has no next-day label.
    testRows = slice(split, len(y) - 1)
    preds = model.predict(x.iloc[testRows])
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    print("XGBoost Accuracy is : " + str(accuracy_score(preds, y.iloc[testRows].values)))
    return featureImp
def MakeModel(cols):  # number of cols
    """Build and compile the dense binary classifier.

    Architecture: Dense(cols, relu) -> BatchNormalization -> Dropout(0.2)
    -> Dense(1, sigmoid), compiled with binary cross-entropy, RMSprop
    (lr=0.001) and the accuracy metric.
    """
    stack = [
        Dense(cols, input_dim=cols, kernel_initializer="normal", activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1, kernel_initializer="normal", activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(lr=0.001),
        metrics=['accuracy'],
    )
    return net
def MakeLSTMModel(window, cols):
    """Build and compile the LSTM binary classifier.

    Architecture: LSTM(100) over inputs of shape ``(window, cols)`` followed
    by Dense(1, sigmoid), compiled with binary cross-entropy, RMSprop
    (lr=0.001) and the accuracy metric.
    """
    net = Sequential()
    for layer in (
        LSTM(100, input_shape=(window, cols)),
        Dense(1, kernel_initializer="normal", activation='sigmoid'),
    ):
        net.add(layer)
    net.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(lr=0.001),
        metrics=['accuracy'],
    )
    return net
def TrainModel(cols):
    """Train the NN (dense or LSTM, per ``useLSTM``) on ``AA[inputCols]``.

    Sets the globals ``x``, ``y`` (full inputs / labels) and ``xt``, ``yt``
    (test slice).  Training uses rows ``201 .. split - 1``; the test slice
    ``split .. len - 3`` doubles as the validation set that drives early
    stopping and checkpointing.

    Args:
        cols: number of input columns (width of the first layer).

    Returns:
        ``(model, history)`` where the model carries the weights of the
        epoch with the best ``val_acc`` and ``history`` is the object
        returned by ``fit``.
    """
    global yt
    global xt
    global x
    global y
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    x = AA[inputCols].values
    # Tomorrow's close price change: 0 for down (or unchanged), 1 for up.
    y = np.sign(AA['CLOSEpct'].shift(-1).values)
    y[y < 0] = 0
    filepath = "weights-improvement.hdf5"
    if useLSTM:
        # x, y = SetLSTMInputs()
        x = np.reshape(x, (x.shape[0], 1, x.shape[1]))  # add another dimension
        m = MakeLSTMModel(window, cols)
    else:
        m = MakeModel(cols)
    xi = x[201:split]
    xt = x[split:len(x) - 2]
    yi = y[201:split]
    yt = y[split:len(y) - 2]
    # NOTE(review): the validation data is the test set itself, so the
    # checkpoint is selected on the same rows that are evaluated afterwards.
    checkpoints = [
        # Keep the weights of the best epoch on disk ...
        ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max'),
        # ... and stop once val_acc has not improved for waitEpochs epochs.
        EarlyStopping(monitor='val_acc', patience=waitEpochs, verbose=0),
    ]
    hist = m.fit(
        xi, yi,
        validation_data=(xt, yt),
        epochs=epochs,
        verbose=0,
        callbacks=checkpoints,
    )
    # Re-set the model to what it was after the best epoch.
    m.load_weights(filepath)
    return m, hist
def SetLSTMInputs():
    """Cut the global ``x`` / ``y`` into windowed samples for an LSTM.

    For each ``k`` in ``0 .. len(x) - offset - 1`` the sample is the slice
    ``x[offset - window + k : offset + k]`` and its label is
    ``y[offset + k]``.  Returns ``(samples, labels)`` as numpy arrays.
    """
    positions = range(len(x) - offset)
    samples = [x[(offset - window) + k:offset + k] for k in positions]
    labels = [y[offset + k] for k in positions]
    return np.array(samples), np.array(labels)
def GetNNAccuracy(history):
    """Predict on the test set and report accuracy at several confidence levels.

    Stores the raw predictions of the global model ``m`` on ``xt`` in the
    global ``predictions``.  When ``plotall`` is set, the training /
    validation accuracy and loss curves from ``history`` are charted.  Then,
    for each signal threshold, only predictions further than the threshold
    away from 0.5 are kept, rounded to 0 / 1 and scored against ``yt``.
    """
    global predictions
    predictions = m.predict(xt)

    def _chartCurves(trainKey, testKey, title, yLabel):
        # One train-vs-test line chart per metric recorded during training.
        plt.plot(history.history[trainKey])
        plt.plot(history.history[testKey])
        plt.title(title)
        plt.ylabel(yLabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    if plotall:
        _chartCurves('acc', 'val_acc', 'model accuracy', 'accuracy')
        # summarize history for loss
        _chartCurves('loss', 'val_loss', 'model loss', 'loss')

    actual = np.expand_dims(yt, axis=1)
    for margin in (0, 0.05, 0.1, 0.2, 0.3, 0.4):
        print("")
        print("Signal Threshold " + str(margin))
        # Keep only the rows where the model is confident in either direction.
        confident = (predictions > (0.5 + margin)) | (predictions < (0.5 - margin))
        chosen = predictions[confident]  # boolean indexing yields a copy
        print("Matching Rows " + str(len(chosen)))
        chosen[chosen > 0.5] = 1
        chosen[chosen < 0.5] = 0
        print("NN Accuracy " + str(accuracy_score(chosen, actual[confident])))
def CheckPerformance(preds, thresh, holdPrevPos=True):
    """Turn predictions into positions and compute the resulting P&L.

    A prediction above ``0.5 + thresh`` goes long (+1), one below
    ``0.5 - thresh`` goes short (-1).  When neither fires the previous
    position is carried over if ``holdPrevPos`` is true, otherwise the
    position is flat (0).  Each position earns the *next* row's
    ``CLOSEpct``, looked up at ``offset + split + i`` in ``AA``.

    Side effects: sets the globals ``predictions``, ``posVect`` and
    ``pnlVect``.

    Returns:
        ``(cumulative P&L array, position array)``.
    """
    global posVect
    global predictions
    global pnlVect
    period = len(preds)
    predictions = preds
    pnlVect = np.zeros(period)
    posVect = np.zeros(period)
    # Build the shifted return series once; the original recreated it on
    # every loop iteration, making the loop quadratic in the row count.
    nextReturns = AA['CLOSEpct'].shift(-1)
    base = offset + split
    for i in range(period):
        if preds[i] > (0.5 + thresh):
            posVect[i] = 1
        if preds[i] < (0.5 - thresh):
            posVect[i] = -1
        # Carry the previous position only when one exists; at i == 0 the
        # original read posVect[-1] (the still-zero last element).
        if posVect[i] == 0 and holdPrevPos and i > 0:
            posVect[i] = posVect[i - 1]
        pnlVect[i] = posVect[i] * nextReturns[base + i]
    return np.cumsum(pnlVect), posVect
def ChartNNPerformance(thresh, holdPrevPos=True):
    """Chart the model's cumulative P&L against holding the instrument.

    Args:
        thresh: signal threshold passed to ``CheckPerformance``.
        holdPrevPos: passed to ``CheckPerformance``.
    """
    pnl, positions = CheckPerformance(predictions, thresh, holdPrevPos)
    plt.plot(pnl)
    # Benchmark: shift first, then accumulate, so the buy-and-hold curve
    # earns exactly the same next-day returns as the model positions.  The
    # original accumulated first and shifted afterwards, which added the
    # split day's own return to every benchmark point.
    benchmark = AA['CLOSEpct'].shift(-1).iloc[offset + split:].cumsum()
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    plt.plot(benchmark.values)
    legend(["Model perf"] + [INST + " perf"], loc=2)
    plt.show()
def ChartCrossCorrel():
    """Show a scatter matrix (KDE on the diagonal) of the model input columns."""
    inputs = AA[inputCols]
    series = []
    for name in inputs.columns:
        series.append(inputs[name])
    panel = pd.concat(series, axis=1)
    scatter_matrix(panel, figsize=(20, 20), diagonal='kde')
    plt.show()
def ExamineInputs():
    """Chart the last 200 rows of every input column and the latest positions.

    Reads the globals ``inputCols``, ``AA`` and ``posVect`` (the latter is
    set by ``CheckPerformance``).
    """
    print("Charts for last 200 days")
    print("")
    print("Input Colums")
    print(inputCols)
    recentRows = AA.tail(200)
    for c in inputCols:
        plt.plot(recentRows[c])
        # NOTE(review): the original indentation was lost; one chart per
        # column is assumed here -- confirm against the notebook.
        plt.show()
    print("predicted positions")
    recentPos = posVect[-200:]
    # Size the x axis from the data: with fewer than 200 predictions a fixed
    # range(200) does not match the number of bars and plt.bar fails.
    plt.bar(range(len(recentPos)), recentPos)
    plt.show()
def RunFullAnalysis(StockCode='^GSPC'):
    """Run the whole pipeline for one instrument.

    Downloads the data, builds the features, ranks them with XGBoost, trains
    the NN on all input columns (stored in the global ``m``), reports its
    accuracy and P&L, and finally rebuilds the features restricted to the
    ``bestnCols`` most important columns.
    """
    global m
    bestnCols = 6
    Start(StockCode)
    if plotall:
        PlotAutoCorrel()
    PrepData()
    if plotall:
        PlotAutoCorrelAfter()
    featureImp = GetFeatureImportance()
    m, history = TrainModel(len(inputCols))
    GetNNAccuracy(history)
    ChartNNPerformance(0.0)

    # Pair every input column with its importance and keep the top ones.
    importanceByCol = dict(zip(inputCols, featureImp))
    ranked = sorted(importanceByCol.items(), key=operator.itemgetter(1), reverse=True)
    MostImportantCols = [name for name, _ in ranked[:bestnCols]]
    print(MostImportantCols)

    PrepData(MostImportantCols)
    if plotall:
        ChartCrossCorrel()
    # Retraining on the restricted columns is currently disabled:
    #res = TrainModel(bestnCols)
    #m=res[0]
    #print "Using Restricted Columns"
    #GetNNAccuracy(res[1])
    #ChartNNPerformance(0.0)
def RunSingleModelWithFixedColumns(StockCode, Columns):
    """Train and evaluate one model on an explicit list of input columns.

    Downloads ``StockCode``, builds the features restricted to ``Columns``,
    trains the NN (stored in the global ``m``) and reports accuracy and P&L.
    """
    global m
    Start(StockCode)
    PrepData(Columns)
    m, history = TrainModel(len(Columns))
    GetNNAccuracy(history)
    ChartNNPerformance(0.0)
# ---- Run: dense network on IBM ---------------------------------------------
useLSTM = False     # dense NN rather than the LSTM
plotall = True      # show every chart
epochs = 1000       # max number of training epochs
waitEpochs = 100    # early-stopping patience
window = 0          # LSTM window (unused for the dense model)
offset = 0          # ignore the first n rows of x
RunFullAnalysis(StockCode='IBM')  # Dense network
ExamineInputs()
Describing IBM
 | OPEN | HIGH | LOW | Close | Volume | CLOSE |
---|---|---|---|---|---|---|
count | 4000.000000 | 4000.000000 | 4000.000000 | 4000.000000 | 4.000000e+03 | 4000.000000 |
mean | 129.121212 | 130.221605 | 128.128470 | 129.205377 | 6.149336e+06 | 129.205377 |
std | 41.941123 | 42.078383 | 41.826448 | 41.961273 | 3.288254e+06 | 41.961273 |
min | 55.070000 | 56.700000 | 54.010000 | 55.070000 | 0.000000e+00 | 55.070000 |
25% | 90.222500 | 91.142500 | 89.495000 | 90.342500 | 3.962703e+06 | 90.342500 |
50% | 121.780000 | 123.245000 | 120.735000 | 121.860000 | 5.382808e+06 | 121.860000 |
75% | 165.255000 | 166.685000 | 164.107500 | 165.505000 | 7.498475e+06 | 165.505000 |
max | 215.380000 | 215.900000 | 214.300000 | 215.800000 | 4.038760e+07 | 215.800000 |
XGBoost Accuracy is : 0.499341238472
Signal Threshold 0 Matching Rows 758 NN Accuracy 0.519788918206 Signal Threshold 0.05 Matching Rows 278 NN Accuracy 0.510791366906 Signal Threshold 0.1 Matching Rows 65 NN Accuracy 0.415384615385 Signal Threshold 0.2 Matching Rows 8 NN Accuracy 0.25 Signal Threshold 0.3 Matching Rows 3 NN Accuracy 0.333333333333 Signal Threshold 0.4 Matching Rows 2 NN Accuracy 0.5
['CLOSEmom5', 'TR', 'Volume', 'ATR100', 'CLOSEmom3', 'CLOSEmom100']
Charts for last 200 days Input Colums ['CLOSEmom5', 'TR', 'Volume', 'ATR100', 'CLOSEmom3', 'CLOSEmom100']
predicted positions
#ChartNNPerformance(0.0,True)
#useLSTM=True
#plotall=True
#epochs=20
#waitEpochs=10
#window=1
#offset=0
#RunFullAnalysis(StockCode='NYSE:CAT')
#ExamineInputs()
#useLSTM=False
#plotall=False
#epochs=5000
#waitEpochs=500
#window=0
#offset=0
#RunSingleModelWithFixedColumns(StockCode='NYSE:CAT', Columns=['ATR10v100', 'DeltaBolW20', 'TSF10', 'ATR7', 'CLOSEmom3', 'CLOSEmavg100'])
#ExamineInputs()
#useLSTM=True
#plotall=True
#epochs=200
#waitEpochs=100
#window=1
#offset=0
#RunSingleModelWithFixedColumns(StockCode='NYSE:CAT', Columns=['CLOSEbolW20', 'CLOSEmom3', 'TSF10', 'ATR7', 'CLOSE'])
#ExamineInputs()