#!/usr/bin/env python
# coding: utf-8

# # Equity premium prediction with Python and Keras
#
# * An exploratory data analysis and a demo of classification using a feedforward neural network, written in Python with the [Keras](https://keras.io/) machine learning framework
# * Using the [dataset](http://www.hec.unil.ch/agoyal/docs/PredictorData2016.xlsx) from [Prof. Amit Goyal](http://www.hec.unil.ch/agoyal/), we attempt to predict quarterly equity outperformance from fundamental data such as interest rates and valuation ratios
# * We train a binary classifier to predict whether next quarter's equity premium will be above or below its long-term average

# In[1]:

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from pprint import pprint
import time
import copy

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras import optimizers
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import model_from_json

import matplotlib.pyplot as plt

# use a fixed seed for reproducibility
# seed = np.random.randint(10000, size=1)[0]
seed = 6882
print(seed)
np.random.seed(seed)


# In[2]:

############################################################
# 1. load data
############################################################

# load dataset
print("Loading data...")
dataframe = pd.read_csv("EqPremClass.csv")
dataframe


# In[3]:

# pandas DataFrame -> numpy ndarray
dataset = dataframe.values
del dataframe


# In[4]:

num_obs, num_features = dataset.shape
num_features -= 1
num_labels = 1
print("Observations: %d\nFeatures: %d" % (num_obs, num_features))

# last column is the target
y = dataset[:, num_features].astype(float)
print("Histogram: check all 0s and 1s, no -1s etc.")
pprint(np.histogram(y))

# omit 1st id column
X_raw = dataset[:, 1:num_features].astype(float)
num_features -= 1
del dataset

# standardize features
# not strictly necessary for a neural net, but it may speed convergence and lets PCA work sensibly
# min_max_scaler = preprocessing.MinMaxScaler()
# X = min_max_scaler.fit_transform(X_raw)
X = preprocessing.scale(X_raw)
pd.DataFrame(X)


# In[5]:

print("Split into training, xval, test")

# split into training, xval, test: 60/20/20
X_bigtrain, X_test, y_bigtrain, y_test = train_test_split(X, y, test_size=0.2)

print("Split into train, xval")
# taking 0.25 of the remaining 80% gives the 20% xval slice
X_train, X_xval, y_train, y_xval = train_test_split(X_bigtrain, y_bigtrain, test_size=0.25)

print("Training set")
print(X_train.shape)
pprint(np.histogram(y_train))

print("Xval set")
print(X_xval.shape)
pprint(np.histogram(y_xval))

print("Test set")
print(X_test.shape)
pprint(np.histogram(y_test))
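

# In[ ]:

# Added sketch (not part of the original run): the histograms above are there to check that each
# split keeps a similar 0/1 balance. If the classes were noticeably imbalanced, the same splits
# could preserve class proportions explicitly via the `stratify` argument, e.g.:
#
# X_bigtrain, X_test, y_bigtrain, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=seed)
# X_train, X_xval, y_train, y_xval = train_test_split(
#     X_bigtrain, y_bigtrain, test_size=0.25, stratify=y_bigtrain, random_state=seed)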


# In[6]:

# principal component analysis
# scree chart to see how much variation is explained by how many principal components
# PCA components can be used for dimensionality reduction when we have too many or collinear columns,
# which can speed things up or sometimes give a better result
# we won't do that here; this is merely exploratory, to understand the data and confirm it is scaled
pca = PCA(n_components=num_features)
pca.fit(X_train)

# the amount of variance that each principal component explains
var = pca.explained_variance_ratio_

# cumulative variance explained
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
# print(var1)

get_ipython().run_line_magic('matplotlib', 'inline')
plt.plot(var1)
print(var1)

# looks like ~10 orthogonal PCA components explain > 80% of the variation
num_pca_components = 15
pca = PCA(n_components=num_pca_components)
pca.fit(X_train)
# transform the same scaled features the PCA was fitted on
# (these PCA-reduced sets are kept for reference; the models below use the full feature set)
pca_train = pca.transform(X_train)
pca_bigtrain = pca.transform(X_bigtrain)
pca_xval = pca.transform(X_xval)
pca_test = pca.transform(X_test)


# In[7]:

# function to instantiate a Keras feed-forward neural network model
def declare_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                  reg_penalty=0.0001, activation='relu'):
    # create model
    model = Sequential()
    # 1 hidden layer of size hidden_layer_size, with the specified L1 regularization and activation
    model.add(Dense(hidden_layer_size,
                    input_dim=num_components,
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=regularizers.l1(reg_penalty),
                    activation=activation))
    # 1 dropout layer
    model.add(Dropout(dropout))
    # send outputs to a sigmoid layer for binary classification
    model.add(Dense(1,
                    activation='sigmoid',
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=regularizers.l1(reg_penalty)))
    return model


def create_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                 reg_penalty=0.0001, activation='relu'):
    model = declare_model(num_components=num_components,
                          hidden_layer_size=hidden_layer_size,
                          dropout=dropout,
                          reg_penalty=reg_penalty,
                          activation=activation)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
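

# In[ ]:

# Illustrative sketch (not used in the rest of the notebook): the PCA-reduced features built above
# could be fed to the same model factory for dimensionality reduction; the epoch count here is an
# arbitrary example value.
pca_model = create_model(num_components=num_pca_components)
pca_fit = pca_model.fit(pca_train, y_train,
                        validation_data=(pca_xval, y_xval),
                        epochs=100,
                        batch_size=pca_train.shape[0],
                        verbose=False)
print("PCA-feature model: final xval accuracy %.3f" % pca_fit.history['val_acc'][-1])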


# In[8]:

# run this a couple of different ways
# 1st, run a simple model
ac = 'sigmoid'  # sigmoid activation
hl = 8          # 8 hidden units
dr = 0          # no dropout
rp = 0          # no regularization
model = create_model(num_components=num_features,
                     hidden_layer_size=hl,
                     dropout=dr,
                     reg_penalty=rp,
                     activation=ac)

# recompile with SGD, which converges more slowly than Adam but gives more readable progress plots
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.SGD(lr=1, clipnorm=1.),
              metrics=['accuracy'])

# fit model on training data
print('%s Training...' % time.strftime("%H:%M:%S"))
fit = model.fit(X_train, y_train,
                validation_data=(X_xval, y_xval),
                epochs=1000,
                batch_size=X_train.shape[0],  # small data: full-batch gradient descent, not minibatch SGD
                verbose=False)

# see how the model does on the training data
# predict probabilities
y_train_prob = model.predict(X_train)


# select the threshold that maximizes accuracy
def selectThresholdAcc(logits, labels, beta=1):
    # return the threshold that yields the best accuracy
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    accuracies = [sklearn.metrics.accuracy_score(logits >= thresh, labels) for thresh in thresholds]
    best_index = np.argmax(accuracies)
    best_threshold = thresholds[best_index]
    best_score = accuracies[best_index]
    return (best_threshold, best_score)


thresh, score = selectThresholdAcc(y_train_prob, y_train)
# predict True if predicted probability >= threshold
y_train_pred = y_train_prob >= thresh

# show training accuracy and F1
# (see https://en.wikipedia.org/wiki/F1_score , https://en.wikipedia.org/wiki/Receiver_operating_characteristic )
print("%s Train Accuracy %.3f, Train F1 %.3f" % (time.strftime("%H:%M:%S"),
                                                 sklearn.metrics.accuracy_score(y_train_pred, y_train),
                                                 sklearn.metrics.f1_score(y_train_pred, y_train)))

# show training set confusion matrix
# True negative    False negative
# False positive   True positive
print("%s Confusion matrix (train):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_train_pred, y_train))

# same in the cross-validation set
y_xval_prob = model.predict(X_xval)
thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
y_xval_pred = y_xval_prob >= thresh
print("%s Xval Accuracy %.3f, Xval F1 %.3f" % (time.strftime("%H:%M:%S"),
                                               sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
                                               sklearn.metrics.f1_score(y_xval_pred, y_xval)))
print("%s Confusion matrix (xval):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_xval_pred, y_xval))

# same in the test set
# note we use the threshold selected on the xval set
y_test_prob = model.predict(X_test)
y_test_pred = y_test_prob >= thresh
print("%s Test Accuracy %.3f, Test F1 %.3f" % (time.strftime("%H:%M:%S"),
                                               sklearn.metrics.accuracy_score(y_test_pred, y_test),
                                               sklearn.metrics.f1_score(y_test_pred, y_test)))
print("%s Confusion matrix (test):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_test_pred, y_test))


# In[9]:

# plot the path of the loss function over the first 200 epochs
# xval loss, proxy for total error
total_loss = np.array(fit.history['val_loss'][:200])
# training loss, proxy for bias error
bias = np.array(fit.history['loss'][:200])
# difference, proxy for variance error
variance = total_loss - bias

plt.plot(total_loss)
plt.ylabel('Total loss')
plt.xlabel('Epoch')
plt.show()

plt.plot(bias)
plt.ylabel('Training loss (proxy for bias)')
plt.xlabel('Epoch')
plt.show()

plt.plot(variance)
plt.ylabel('Difference (proxy for variance)')
plt.xlabel('Epoch')
plt.show()

# note that training loss (bias) declines continuously, while total loss declines, reaches a minimum,
# then climbs as the model overfits and variance increases
# http://scott.fortmann-roe.com/docs/BiasVariance.html
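

# In[ ]:

# Illustrative sketch (an alternative, not the approach used below): the "stop when xval loss
# bottoms out" idea visible in the plots above can also be delegated to a Keras EarlyStopping
# callback; the patience value here is an arbitrary example.
from keras.callbacks import EarlyStopping

es_model = create_model(num_components=num_features, hidden_layer_size=hl,
                        dropout=dr, reg_penalty=rp, activation=ac)
es_fit = es_model.fit(X_train, y_train,
                      validation_data=(X_xval, y_xval),
                      epochs=1000,
                      batch_size=X_train.shape[0],
                      callbacks=[EarlyStopping(monitor='val_loss', patience=50, verbose=1)],
                      verbose=False)
print("Stopped after %d epochs" % len(es_fit.history['val_loss']))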


# In[10]:

# plot the ROC curve
# at the bottom left we classify everything as negative: no true positives, no false positives
# as we lower the threshold at which we classify positive, we pick up some true positives and some false positives
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# the AUC of a coin flip is 0.5, so an area under the curve (AUC) of ~0.5 is not good,
# but the highest- and lowest-probability predictions still have some predictive value
(fpr, tpr, thresholds) = sklearn.metrics.roc_curve(y_xval, y_xval_prob)
roc_auc = sklearn.metrics.auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (AUC = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()


# In[11]:

# 2nd, use Keras's scikit-learn wrapper with k-fold cross-validation and grid search
# this will take some time ~ note the timestamps in the output on an AWS p2.xlarge
print('%s Starting' % time.strftime("%H:%M:%S"))

estimator = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# hyperparameter options to try
hidden_layer_hp = [8, 16, 32]
dropout_hp = [0, 0.333, 0.5]
reg_penalty_hp = [0, 0.0001, 0.0003, 0.001, 0.003]
activation_hp = ['relu', 'sigmoid']

param_grid = dict(hidden_layer_size=hidden_layer_hp,
                  dropout=dropout_hp,
                  reg_penalty=reg_penalty_hp,
                  activation=activation_hp,
                  )
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=kfold)

print('%s Starting grid search' % time.strftime("%H:%M:%S"))
classifier = grid.fit(X_bigtrain, y_bigtrain)
print('%s Finishing' % time.strftime("%H:%M:%S"))

# summarize xval results
print("Best Xval: %f using %s" % (classifier.best_score_, classifier.best_params_))
means = classifier.cv_results_['mean_test_score']
stds = classifier.cv_results_['std_test_score']
params = classifier.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# evaluate with the test set
print("Evaluate performance in test set")
y_test_pred = classifier.predict(X_test)
confusion_matrix = sklearn.metrics.confusion_matrix(y_test_pred, y_test)
print(confusion_matrix)
print("Test Accuracy %.3f" % sklearn.metrics.accuracy_score(y_test_pred, y_test))
print("Test F1 %.3f" % sklearn.metrics.f1_score(y_test_pred, y_test))
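

# In[ ]:

# Convenience view (added): rank the grid-search configurations by mean cross-validated score.
# Nothing new is computed here; it is the same cv_results_ printed above, shown as a sorted table.
cv_results = pd.DataFrame(classifier.cv_results_)
cv_results = cv_results.sort_values('mean_test_score', ascending=False)
print(cv_results[['params', 'mean_test_score', 'std_test_score']].head(10))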


# In[11]:

# Finally, roll our own grid search
# this gives more fine-grained control, such as a custom metric and threshold

# define some custom metrics
import keras.backend as K


def recall(y_true, y_pred):
    # return a Keras tensor for recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    # return a Keras tensor for precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # if there are no true positives, fix the F score at 0, as sklearn does
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def f_score(y_true, y_pred):
    beta = 1  # can adjust to penalize false positives/negatives differently
    return fbeta_score(y_true, y_pred, beta=beta)


def selectThresholdF1(logits, labels, beta=1):
    # return the threshold and F-score that yield the best F-score
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    bb = beta ** 2
    f1_scores = (1 + bb) * precision * recall / (bb * precision + recall)
    f1_scores = np.nan_to_num(f1_scores)
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]
    best_score = f1_scores[best_index]
    return (best_threshold, best_score)


def selectThresholdAcc(logits, labels, beta=1):
    # return the threshold that yields the best accuracy
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    accuracies = [sklearn.metrics.accuracy_score(logits >= thresh, labels) for thresh in thresholds]
    best_index = np.argmax(accuracies)
    best_threshold = thresholds[best_index]
    best_score = accuracies[best_index]
    return (best_threshold, best_score)


def selectThresholdTest(logits, labels, beta=1):
    # show all thresholds and the resulting F1 and accuracy
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    bb = beta ** 2
    f1_scores = (1 + bb) * precision * recall / (bb * precision + recall)
    f1_scores = np.nan_to_num(f1_scores)
    for thresh in thresholds:
        labels_pred = logits >= thresh
        f_test = sklearn.metrics.f1_score(labels_pred, labels)
        acc_test = sklearn.metrics.accuracy_score(labels_pred, labels)
        print("Threshold %f, f1 %f, accuracy %f" % (thresh, f_test, acc_test))
        print(sklearn.metrics.confusion_matrix(labels_pred, labels))
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]
    best_score = f1_scores[best_index]
    return (best_threshold, best_score)


# In[12]:

# same model factory as above, but compile with the custom F-score metric
def create_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                 reg_penalty=0.0001, activation='relu'):
    model = declare_model(num_components=num_components,
                          hidden_layer_size=hidden_layer_size,
                          dropout=dropout,
                          reg_penalty=reg_penalty,
                          activation=activation)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f_score])
    return model
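

# In[ ]:

# Quick sanity check (illustrative, with made-up toy labels): the F-beta formula used above,
# (1 + beta^2) * P * R / (beta^2 * P + R), should agree with sklearn's f1_score when beta = 1.
y_true_toy = np.array([1, 0, 1, 1, 0, 1])
y_pred_toy = np.array([1, 0, 0, 1, 1, 1])
p_toy = sklearn.metrics.precision_score(y_true_toy, y_pred_toy)  # 3 TP / 4 predicted positives = 0.75
r_toy = sklearn.metrics.recall_score(y_true_toy, y_pred_toy)     # 3 TP / 4 actual positives    = 0.75
print("manual F1:  %.3f" % (2 * p_toy * r_toy / (p_toy + r_toy)))
print("sklearn F1: %.3f" % sklearn.metrics.f1_score(y_true_toy, y_pred_toy))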


# In[13]:

# delete old saved model files
get_ipython().system('rm model*.json')
get_ipython().system('rm model*.h5')


# In[14]:

# hyperparameter options to evaluate
hidden_layer_hp = [4, 8, 16, 32]
dropout_hp = [0, 0.333, 0.5]
reg_penalty_hp = [0, 0.0001, 0.0003, 0.001, 0.003]
activation_hp = ['relu', 'sigmoid']

fscores = {}

for dr in dropout_hp:
    for ac in activation_hp:
        for hl in hidden_layer_hp:
            for rp in reg_penalty_hp:
                # print("\n %s\n" % (time.strftime("%H:%M:%S")))
                model = create_model(num_components=num_features,
                                     hidden_layer_size=hl,
                                     dropout=dr,
                                     reg_penalty=rp,
                                     activation=ac)
                weight_snapshots = []  # keep a copy of the weights at each epoch so we can restore the best one
                losses = []
                epochs = 1000  # increase this if xval loss doesn't always reach a minimum within this many epochs
                print('%s Starting' % time.strftime("%H:%M:%S"))
                for i in range(epochs):
                    fit = model.fit(X_train, y_train,
                                    validation_data=(X_xval, y_xval),
                                    epochs=1,
                                    batch_size=X_train.shape[0],  # small data: full batch, no minibatch SGD
                                    verbose=False)
                    train_loss = fit.history['loss'][-1]
                    train_acc = fit.history['acc'][-1]
                    train_f_score = fit.history['f_score'][-1]
                    current_loss = fit.history['val_loss'][-1]
                    current_acc = fit.history['val_acc'][-1]
                    current_f_score = fit.history['val_f_score'][-1]
                    if i % 100 == 0:
                        print('%s epoch %d of %d Train loss: %.4f Train f_score %.4f Xval loss: %.4f Xval f_score %.4f' %
                              (time.strftime("%H:%M:%S"), i, epochs,
                               train_loss, train_f_score, current_loss, current_f_score))
                    losses.append(current_loss)
                    weight_snapshots.append(model.get_weights())
                    bestloss_index = np.argmin(losses)
                    bestloss_value = losses[bestloss_index]
                    # stop if xval loss rises by 10% from its best value
                    if current_loss / bestloss_value > 1.1:
                        break

                # keep the weights from the epoch with the best xval loss
                print("Best Xval loss epoch %d, value %f" % (bestloss_index, bestloss_value))
                model.set_weights(weight_snapshots[bestloss_index])

                # evaluate the model
                print("NN units %d" % hl)
                print("Reg_penalty %.8f" % rp)
                print("Dropout %.4f" % dr)
                print("Activation %s" % ac)

                y_train_prob = model.predict(X_train)
                thresh, score = selectThresholdAcc(y_train_prob, y_train)
                y_train_pred = y_train_prob >= thresh
                print("Final Train Accuracy %.3f, Train F1 %.3f" %
                      (sklearn.metrics.accuracy_score(y_train_pred, y_train),
                       sklearn.metrics.f1_score(y_train_pred, y_train)))
                print(sklearn.metrics.confusion_matrix(y_train_pred, y_train))

                y_xval_prob = model.predict(X_xval)
                thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
                y_xval_pred = y_xval_prob >= thresh
                print("Final Xval Accuracy %.3f, Xval F1 %.3f" %
                      (sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
                       sklearn.metrics.f1_score(y_xval_pred, y_xval)))
                confusion_matrix = sklearn.metrics.confusion_matrix(y_xval_pred, y_xval)
                print(confusion_matrix)
                true_negative = confusion_matrix[0][0]
                false_negative = confusion_matrix[0][1]
                false_positive = confusion_matrix[1][0]
                true_positive = confusion_matrix[1][1]

                fscores[(hl, rp, dr, ac)] = score

                # save model to disk
                modelname = "model_%s_%d_%.3f_%.6f" % (ac, hl, dr, rp)
                model.save("%s.h5" % modelname)
                model.save_weights("%s_weights.h5" % modelname)
                with open("%s.json" % modelname, "w") as fjson:
                    fjson.write(model.to_json())

print('%s Finishing' % time.strftime("%H:%M:%S"))


# In[15]:

# visualize the evaluations in a grid to pick a model
f, subplots = plt.subplots(len(dropout_hp), len(activation_hp))
outstr = ""
rownum = 0
for dr in dropout_hp:
    colnum = 0
    for ac in activation_hp:
        matrix1 = []
        for hl in hidden_layer_hp:
            row = []
            for reg_penalty in reg_penalty_hp:
                try:
                    row.append(fscores[(hl, reg_penalty, dr, ac)])
                except KeyError:
                    row.append(0.00)
            matrix1.append(row)
        outstr += "\n\nDropout %f Activation: %s \n" % (dr, ac)
        outstr += repr(np.array(matrix1))
        outstr += "\n\nMax: %f\n" % (np.max(np.array(matrix1)))
        outstr += "Avg: %f\n" % (np.average(np.array(matrix1)))
        outstr += "Col Avg: %s\n" % repr(np.average(np.array(matrix1), axis=0))
        outstr += "Row Avg: %s\n" % repr(np.average(np.array(matrix1), axis=1))
        subplots[rownum, colnum].matshow(matrix1, cmap='hot', vmin=0.5, vmax=0.7, interpolation='nearest')
        subplots[rownum, colnum].set_title('%s %f' % (ac, dr))
        colnum += 1
    rownum += 1
plt.show()
print(outstr)


# In[18]:

# pick this model and load it from file
ac = 'sigmoid'
hl = 8
dr = 0.333
rp = 0.001
modelname = "model_%s_%d_%.3f_%.6f" % (ac, hl, dr, rp)

# keras.models.load_model("%s.h5" % modelname) fails here because the model was compiled with the
# custom f_score metric, so we rebuild the architecture from JSON and load the weights instead
with open("%s.json" % modelname, 'r') as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
model.load_weights("%s_weights.h5" % modelname)
print("Loaded model from disk")
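

# In[ ]:

# Alternative (untested sketch): keras.models.load_model can typically deserialize the .h5 file
# saved above if the custom metric is supplied through custom_objects, e.g.:
#
# from keras.models import load_model
# model = load_model("%s.h5" % modelname, custom_objects={'f_score': f_score})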


# In[19]:

# re-evaluate on the xval set to confirm, and pick a threshold value
y_xval_prob = model.predict(X_xval)
thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
y_xval_pred = y_xval_prob >= thresh
print("Final Xval Accuracy %.3f, Xval F1 %.3f, f_score %.3f" %
      (sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
       sklearn.metrics.f1_score(y_xval_pred, y_xval),
       score))
confusion_matrix = sklearn.metrics.confusion_matrix(y_xval_pred, y_xval)
print(confusion_matrix)
false_positive = confusion_matrix[1][0]
false_negative = confusion_matrix[0][1]
true_positive = confusion_matrix[1][1]


# In[20]:

# evaluate on the test set
# xval is not a good measure of expected performance, since we used it to pick the threshold;
# we also tested many models on xval and picked the best, which is probably a model that got lucky on xval
y_test_prob = model.predict(X_test)
y_test_pred = y_test_prob >= thresh
print("Test Accuracy %.3f, Test F1 %.3f" %
      (sklearn.metrics.accuracy_score(y_test_pred, y_test),
       sklearn.metrics.f1_score(y_test_pred, y_test)))
print(sklearn.metrics.confusion_matrix(y_test_pred, y_test))


# In[21]:

# save the final model to disk
model.save("finalmodel.h5")
model.save_weights("modelweights.h5")
with open("finalmodel.json", "w") as fjson:
    fjson.write(model.to_json())


# In[ ]:
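# Illustrative sketch: reloading the saved artifacts later for inference would look roughly like
# this (architecture from JSON, weights from HDF5, then predict); X_test stands in here for
# whatever new feature rows arrive at prediction time.
with open("finalmodel.json") as json_file:
    reloaded_model = model_from_json(json_file.read())
reloaded_model.load_weights("modelweights.h5")
print(reloaded_model.predict(X_test)[:5])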