#!/usr/bin/env python
# coding: utf-8

# # Equity premium prediction with Python and Keras
#
# * An exploratory data analysis and a demo of classification using a feedforward neural network, written in Python with the [Keras](https://keras.io/) machine learning framework
# * Using the [dataset](http://www.hec.unil.ch/agoyal/docs/PredictorData2016.xlsx) from [Prof. Amit Goyal](http://www.hec.unil.ch/agoyal/), we attempt to predict quarterly equity outperformance from fundamental data such as interest rates and valuation ratios
# * We train a binary classifier to predict whether next quarter's equity premium will be above or below its long-term average

# In[1]:

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from pprint import pprint
import time
import copy

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras import optimizers
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import model_from_json

import matplotlib.pyplot as plt

# use a fixed seed for reproducibility
# seed = np.random.randint(10000, size=1)[0]
seed = 6882
print(seed)
np.random.seed(seed)


# In[2]:

############################################################
# 1. load data
############################################################

# load dataset
print("Loading data...")
dataframe = pd.read_csv("EqPremClass.csv")
dataframe


# In[3]:

# pandas DataFrame -> numpy ndarray
dataset = dataframe.values
del dataframe


# In[4]:

num_obs, num_features = dataset.shape
num_features -= 1
num_labels = 1
print("Observations: %d\nFeatures: %d" % (num_obs, num_features))

# last column is the target
y = dataset[:, num_features].astype(float)
print("Histogram: check all 0s and 1s, no -1s etc.")
pprint(np.histogram(y))

# omit 1st id column
X_raw = dataset[:, 1:num_features].astype(float)
num_features -= 1
del dataset

# standardize features
# not strictly necessary for a neural net, but it may speed convergence and lets PCA work sensibly
# min_max_scaler = preprocessing.MinMaxScaler()
# X = min_max_scaler.fit_transform(X_raw)
X = preprocessing.scale(X_raw)
pd.DataFrame(X)


# In[5]:

print("Split into training, xval, test")

# split into training, xval, test: 60/20/20
X_bigtrain, X_test, y_bigtrain, y_test = train_test_split(X, y, test_size=0.2)

print("Split into train, xval")
# taking 0.25 of the remaining 80% gives the 20% xval slice
X_train, X_xval, y_train, y_xval = train_test_split(X_bigtrain, y_bigtrain, test_size=0.25)

print("Training set")
print(X_train.shape)
pprint(np.histogram(y_train))

print("Xval set")
print(X_xval.shape)
pprint(np.histogram(y_xval))

print("Test set")
print(X_test.shape)
pprint(np.histogram(y_test))
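

# In[ ]:

# Added sketch (not part of the original run): the histograms above are there to check that each
# split keeps a similar 0/1 balance. If the classes were noticeably imbalanced, the same splits
# could preserve class proportions explicitly via the `stratify` argument, e.g.:
#
# X_bigtrain, X_test, y_bigtrain, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=seed)
# X_train, X_xval, y_train, y_xval = train_test_split(
#     X_bigtrain, y_bigtrain, test_size=0.25, stratify=y_bigtrain, random_state=seed)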


# In[6]:

# principal component analysis
# scree chart to see how much variation is explained by how many principal components
# PCA components can be used for dimensionality reduction when we have too many or collinear columns,
# which can speed things up or sometimes give a better result
# we won't do that here; this is merely exploratory, to understand the data and confirm it is scaled
pca = PCA(n_components=num_features)
pca.fit(X_train)

# the amount of variance that each principal component explains
var = pca.explained_variance_ratio_

# cumulative variance explained
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
# print(var1)

get_ipython().run_line_magic('matplotlib', 'inline')
plt.plot(var1)
print(var1)

# looks like ~10 orthogonal PCA components explain > 80% of the variation
num_pca_components = 15
pca = PCA(n_components=num_pca_components)
pca.fit(X_train)
# transform the same scaled features the PCA was fitted on
# (these PCA-reduced sets are kept for reference; the models below use the full feature set)
pca_train = pca.transform(X_train)
pca_bigtrain = pca.transform(X_bigtrain)
pca_xval = pca.transform(X_xval)
pca_test = pca.transform(X_test)


# In[7]:

# function to instantiate a Keras feed-forward neural network model
def declare_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                  reg_penalty=0.0001, activation='relu'):
    # create model
    model = Sequential()
    # 1 hidden layer of size hidden_layer_size, with the specified L1 regularization and activation
    model.add(Dense(hidden_layer_size,
                    input_dim=num_components,
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=regularizers.l1(reg_penalty),
                    activation=activation))
    # 1 dropout layer
    model.add(Dropout(dropout))
    # send outputs to a sigmoid layer for binary classification
    model.add(Dense(1,
                    activation='sigmoid',
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=regularizers.l1(reg_penalty)))
    return model


def create_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                 reg_penalty=0.0001, activation='relu'):
    model = declare_model(num_components=num_components,
                          hidden_layer_size=hidden_layer_size,
                          dropout=dropout,
                          reg_penalty=reg_penalty,
                          activation=activation)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
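

# In[ ]:

# Illustrative sketch (not used in the rest of the notebook): the PCA-reduced features built above
# could be fed to the same model factory for dimensionality reduction; the epoch count here is an
# arbitrary example value.
pca_model = create_model(num_components=num_pca_components)
pca_fit = pca_model.fit(pca_train, y_train,
                        validation_data=(pca_xval, y_xval),
                        epochs=100,
                        batch_size=pca_train.shape[0],
                        verbose=False)
print("PCA-feature model: final xval accuracy %.3f" % pca_fit.history['val_acc'][-1])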


# In[8]:

# run this a couple of different ways
# 1st, run a simple model
ac = 'sigmoid'  # sigmoid activation
hl = 8          # 8 hidden units
dr = 0          # no dropout
rp = 0          # no regularization
model = create_model(num_components=num_features,
                     hidden_layer_size=hl,
                     dropout=dr,
                     reg_penalty=rp,
                     activation=ac)

# recompile with SGD, which converges more slowly than Adam but gives more readable progress plots
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.SGD(lr=1, clipnorm=1.),
              metrics=['accuracy'])

# fit model on training data
print('%s Training...' % time.strftime("%H:%M:%S"))
fit = model.fit(X_train, y_train,
                validation_data=(X_xval, y_xval),
                epochs=1000,
                batch_size=X_train.shape[0],  # small data: full-batch gradient descent, not minibatch SGD
                verbose=False)

# see how the model does on the training data
# predict probabilities
y_train_prob = model.predict(X_train)


# select the threshold that maximizes accuracy
def selectThresholdAcc(logits, labels, beta=1):
    # return the threshold that yields the best accuracy
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    accuracies = [sklearn.metrics.accuracy_score(logits >= thresh, labels) for thresh in thresholds]
    best_index = np.argmax(accuracies)
    best_threshold = thresholds[best_index]
    best_score = accuracies[best_index]
    return (best_threshold, best_score)


thresh, score = selectThresholdAcc(y_train_prob, y_train)
# predict True if predicted probability >= threshold
y_train_pred = y_train_prob >= thresh

# show training accuracy and F1
# (see https://en.wikipedia.org/wiki/F1_score , https://en.wikipedia.org/wiki/Receiver_operating_characteristic )
print("%s Train Accuracy %.3f, Train F1 %.3f" % (time.strftime("%H:%M:%S"),
                                                 sklearn.metrics.accuracy_score(y_train_pred, y_train),
                                                 sklearn.metrics.f1_score(y_train_pred, y_train)))

# show training set confusion matrix
# True negative    False negative
# False positive   True positive
print("%s Confusion matrix (train):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_train_pred, y_train))

# same in the cross-validation set
y_xval_prob = model.predict(X_xval)
thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
y_xval_pred = y_xval_prob >= thresh
print("%s Xval Accuracy %.3f, Xval F1 %.3f" % (time.strftime("%H:%M:%S"),
                                               sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
                                               sklearn.metrics.f1_score(y_xval_pred, y_xval)))
print("%s Confusion matrix (xval):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_xval_pred, y_xval))

# same in the test set
# note we use the threshold selected on the xval set
y_test_prob = model.predict(X_test)
y_test_pred = y_test_prob >= thresh
print("%s Test Accuracy %.3f, Test F1 %.3f" % (time.strftime("%H:%M:%S"),
                                               sklearn.metrics.accuracy_score(y_test_pred, y_test),
                                               sklearn.metrics.f1_score(y_test_pred, y_test)))
print("%s Confusion matrix (test):" % time.strftime("%H:%M:%S"))
print(sklearn.metrics.confusion_matrix(y_test_pred, y_test))


# In[9]:

# plot the path of the loss function over the first 200 epochs
# xval loss, proxy for total error
total_loss = np.array(fit.history['val_loss'][:200])
# training loss, proxy for bias error
bias = np.array(fit.history['loss'][:200])
# difference, proxy for variance error
variance = total_loss - bias

plt.plot(total_loss)
plt.ylabel('Total loss')
plt.xlabel('Epoch')
plt.show()

plt.plot(bias)
plt.ylabel('Training loss (proxy for bias)')
plt.xlabel('Epoch')
plt.show()

plt.plot(variance)
plt.ylabel('Difference (proxy for variance)')
plt.xlabel('Epoch')
plt.show()

# note that training loss (bias) declines continuously, while total loss declines, reaches a minimum,
# then climbs as the model overfits and variance increases
# http://scott.fortmann-roe.com/docs/BiasVariance.html
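

# In[ ]:

# Illustrative sketch (an alternative, not the approach used below): the "stop when xval loss
# bottoms out" idea visible in the plots above can also be delegated to a Keras EarlyStopping
# callback; the patience value here is an arbitrary example.
from keras.callbacks import EarlyStopping

es_model = create_model(num_components=num_features, hidden_layer_size=hl,
                        dropout=dr, reg_penalty=rp, activation=ac)
es_fit = es_model.fit(X_train, y_train,
                      validation_data=(X_xval, y_xval),
                      epochs=1000,
                      batch_size=X_train.shape[0],
                      callbacks=[EarlyStopping(monitor='val_loss', patience=50, verbose=1)],
                      verbose=False)
print("Stopped after %d epochs" % len(es_fit.history['val_loss']))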


# In[10]:

# plot the ROC curve
# at the bottom left we classify everything as negative: no true positives, no false positives
# as we lower the threshold at which we classify positive, we pick up some true positives and some false positives
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# the AUC of a coin flip is 0.5, so an area under the curve (AUC) of ~0.5 is not good,
# but the highest- and lowest-probability predictions still have some predictive value
(fpr, tpr, thresholds) = sklearn.metrics.roc_curve(y_xval, y_xval_prob)
roc_auc = sklearn.metrics.auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (AUC = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()


# In[11]:

# 2nd, use Keras's scikit-learn wrapper with k-fold cross-validation and grid search
# this will take some time ~ note the timestamps in the output on an AWS p2.xlarge
print('%s Starting' % time.strftime("%H:%M:%S"))

estimator = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# hyperparameter options to try
hidden_layer_hp = [8, 16, 32]
dropout_hp = [0, 0.333, 0.5]
reg_penalty_hp = [0, 0.0001, 0.0003, 0.001, 0.003]
activation_hp = ['relu', 'sigmoid']

param_grid = dict(hidden_layer_size=hidden_layer_hp,
                  dropout=dropout_hp,
                  reg_penalty=reg_penalty_hp,
                  activation=activation_hp,
                  )
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=kfold)

print('%s Starting grid search' % time.strftime("%H:%M:%S"))
classifier = grid.fit(X_bigtrain, y_bigtrain)
print('%s Finishing' % time.strftime("%H:%M:%S"))

# summarize xval results
print("Best Xval: %f using %s" % (classifier.best_score_, classifier.best_params_))
means = classifier.cv_results_['mean_test_score']
stds = classifier.cv_results_['std_test_score']
params = classifier.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# evaluate with the test set
print("Evaluate performance in test set")
y_test_pred = classifier.predict(X_test)
confusion_matrix = sklearn.metrics.confusion_matrix(y_test_pred, y_test)
print(confusion_matrix)
print("Test Accuracy %.3f" % sklearn.metrics.accuracy_score(y_test_pred, y_test))
print("Test F1 %.3f" % sklearn.metrics.f1_score(y_test_pred, y_test))
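

# In[ ]:

# Convenience view (added): rank the grid-search configurations by mean cross-validated score.
# Nothing new is computed here; it is the same cv_results_ printed above, shown as a sorted table.
cv_results = pd.DataFrame(classifier.cv_results_)
cv_results = cv_results.sort_values('mean_test_score', ascending=False)
print(cv_results[['params', 'mean_test_score', 'std_test_score']].head(10))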


# In[11]:

# Finally, roll our own grid search
# this gives more fine-grained control, such as a custom metric and threshold

# define some custom metrics
import keras.backend as K


def recall(y_true, y_pred):
    # return a Keras tensor for recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    # return a Keras tensor for precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # if there are no true positives, fix the F score at 0, as sklearn does
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def f_score(y_true, y_pred):
    beta = 1  # can adjust to penalize false positives/negatives differently
    return fbeta_score(y_true, y_pred, beta=beta)


def selectThresholdF1(logits, labels, beta=1):
    # return the threshold and F-score that yield the best F-score
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    bb = beta ** 2
    f1_scores = (1 + bb) * precision * recall / (bb * precision + recall)
    f1_scores = np.nan_to_num(f1_scores)
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]
    best_score = f1_scores[best_index]
    return (best_threshold, best_score)


def selectThresholdAcc(logits, labels, beta=1):
    # return the threshold that yields the best accuracy
    # predict True if probability >= threshold
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    accuracies = [sklearn.metrics.accuracy_score(logits >= thresh, labels) for thresh in thresholds]
    best_index = np.argmax(accuracies)
    best_threshold = thresholds[best_index]
    best_score = accuracies[best_index]
    return (best_threshold, best_score)


def selectThresholdTest(logits, labels, beta=1):
    # show all thresholds and the resulting F1 and accuracy
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, logits)
    bb = beta ** 2
    f1_scores = (1 + bb) * precision * recall / (bb * precision + recall)
    f1_scores = np.nan_to_num(f1_scores)
    for thresh in thresholds:
        labels_pred = logits >= thresh
        f_test = sklearn.metrics.f1_score(labels_pred, labels)
        acc_test = sklearn.metrics.accuracy_score(labels_pred, labels)
        print("Threshold %f, f1 %f, accuracy %f" % (thresh, f_test, acc_test))
        print(sklearn.metrics.confusion_matrix(labels_pred, labels))
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]
    best_score = f1_scores[best_index]
    return (best_threshold, best_score)


# In[12]:

# same model factory as above, but compile with the custom F-score metric
def create_model(num_components=num_features, hidden_layer_size=30, dropout=(1.0 / 3.0),
                 reg_penalty=0.0001, activation='relu'):
    model = declare_model(num_components=num_components,
                          hidden_layer_size=hidden_layer_size,
                          dropout=dropout,
                          reg_penalty=reg_penalty,
                          activation=activation)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f_score])
    return model
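

# In[ ]:

# Quick sanity check (illustrative, with made-up toy labels): the F-beta formula used above,
# (1 + beta^2) * P * R / (beta^2 * P + R), should agree with sklearn's f1_score when beta = 1.
y_true_toy = np.array([1, 0, 1, 1, 0, 1])
y_pred_toy = np.array([1, 0, 0, 1, 1, 1])
p_toy = sklearn.metrics.precision_score(y_true_toy, y_pred_toy)  # 3 TP / 4 predicted positives = 0.75
r_toy = sklearn.metrics.recall_score(y_true_toy, y_pred_toy)     # 3 TP / 4 actual positives    = 0.75
print("manual F1:  %.3f" % (2 * p_toy * r_toy / (p_toy + r_toy)))
print("sklearn F1: %.3f" % sklearn.metrics.f1_score(y_true_toy, y_pred_toy))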


# In[13]:

# delete old saved model files
get_ipython().system('rm model*.json')
get_ipython().system('rm model*.h5')


# In[14]:

# hyperparameter options to evaluate
hidden_layer_hp = [4, 8, 16, 32]
dropout_hp = [0, 0.333, 0.5]
reg_penalty_hp = [0, 0.0001, 0.0003, 0.001, 0.003]
activation_hp = ['relu', 'sigmoid']

fscores = {}

for dr in dropout_hp:
    for ac in activation_hp:
        for hl in hidden_layer_hp:
            for rp in reg_penalty_hp:
                # print("\n %s\n" % (time.strftime("%H:%M:%S")))
                model = create_model(num_components=num_features,
                                     hidden_layer_size=hl,
                                     dropout=dr,
                                     reg_penalty=rp,
                                     activation=ac)
                weight_snapshots = []  # keep a copy of the weights at each epoch so we can restore the best one
                losses = []
                epochs = 1000  # increase this if xval loss doesn't always reach a minimum within this many epochs
                print('%s Starting' % time.strftime("%H:%M:%S"))
                for i in range(epochs):
                    fit = model.fit(X_train, y_train,
                                    validation_data=(X_xval, y_xval),
                                    epochs=1,
                                    batch_size=X_train.shape[0],  # small data: full batch, no minibatch SGD
                                    verbose=False)
                    train_loss = fit.history['loss'][-1]
                    train_acc = fit.history['acc'][-1]
                    train_f_score = fit.history['f_score'][-1]
                    current_loss = fit.history['val_loss'][-1]
                    current_acc = fit.history['val_acc'][-1]
                    current_f_score = fit.history['val_f_score'][-1]
                    if i % 100 == 0:
                        print('%s epoch %d of %d Train loss: %.4f Train f_score %.4f Xval loss: %.4f Xval f_score %.4f' %
                              (time.strftime("%H:%M:%S"), i, epochs,
                               train_loss, train_f_score, current_loss, current_f_score))
                    losses.append(current_loss)
                    weight_snapshots.append(model.get_weights())
                    bestloss_index = np.argmin(losses)
                    bestloss_value = losses[bestloss_index]
                    # stop if xval loss rises by 10% from its best value
                    if current_loss / bestloss_value > 1.1:
                        break

                # keep the weights from the epoch with the best xval loss
                print("Best Xval loss epoch %d, value %f" % (bestloss_index, bestloss_value))
                model.set_weights(weight_snapshots[bestloss_index])

                # evaluate the model
                print("NN units %d" % hl)
                print("Reg_penalty %.8f" % rp)
                print("Dropout %.4f" % dr)
                print("Activation %s" % ac)

                y_train_prob = model.predict(X_train)
                thresh, score = selectThresholdAcc(y_train_prob, y_train)
                y_train_pred = y_train_prob >= thresh
                print("Final Train Accuracy %.3f, Train F1 %.3f" %
                      (sklearn.metrics.accuracy_score(y_train_pred, y_train),
                       sklearn.metrics.f1_score(y_train_pred, y_train)))
                print(sklearn.metrics.confusion_matrix(y_train_pred, y_train))

                y_xval_prob = model.predict(X_xval)
                thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
                y_xval_pred = y_xval_prob >= thresh
                print("Final Xval Accuracy %.3f, Xval F1 %.3f" %
                      (sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
                       sklearn.metrics.f1_score(y_xval_pred, y_xval)))
                confusion_matrix = sklearn.metrics.confusion_matrix(y_xval_pred, y_xval)
                print(confusion_matrix)
                true_negative = confusion_matrix[0][0]
                false_negative = confusion_matrix[0][1]
                false_positive = confusion_matrix[1][0]
                true_positive = confusion_matrix[1][1]

                fscores[(hl, rp, dr, ac)] = score

                # save model to disk
                modelname = "model_%s_%d_%.3f_%.6f" % (ac, hl, dr, rp)
                model.save("%s.h5" % modelname)
                model.save_weights("%s_weights.h5" % modelname)
                with open("%s.json" % modelname, "w") as fjson:
                    fjson.write(model.to_json())

print('%s Finishing' % time.strftime("%H:%M:%S"))


# In[15]:

# visualize the evaluations in a grid to pick a model
f, subplots = plt.subplots(len(dropout_hp), len(activation_hp))
outstr = ""
rownum = 0
for dr in dropout_hp:
    colnum = 0
    for ac in activation_hp:
        matrix1 = []
        for hl in hidden_layer_hp:
            row = []
            for reg_penalty in reg_penalty_hp:
                try:
                    row.append(fscores[(hl, reg_penalty, dr, ac)])
                except KeyError:
                    row.append(0.00)
            matrix1.append(row)
        outstr += "\n\nDropout %f Activation: %s \n" % (dr, ac)
        outstr += repr(np.array(matrix1))
        outstr += "\n\nMax: %f\n" % (np.max(np.array(matrix1)))
        outstr += "Avg: %f\n" % (np.average(np.array(matrix1)))
        outstr += "Col Avg: %s\n" % repr(np.average(np.array(matrix1), axis=0))
        outstr += "Row Avg: %s\n" % repr(np.average(np.array(matrix1), axis=1))
        subplots[rownum, colnum].matshow(matrix1, cmap='hot', vmin=0.5, vmax=0.7, interpolation='nearest')
        subplots[rownum, colnum].set_title('%s %f' % (ac, dr))
        colnum += 1
    rownum += 1
plt.show()
print(outstr)


# In[18]:

# pick this model and load it from file
ac = 'sigmoid'
hl = 8
dr = 0.333
rp = 0.001
modelname = "model_%s_%d_%.3f_%.6f" % (ac, hl, dr, rp)

# keras.models.load_model("%s.h5" % modelname) fails here because the model was compiled with the
# custom f_score metric, so we rebuild the architecture from JSON and load the weights instead
with open("%s.json" % modelname, 'r') as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
model.load_weights("%s_weights.h5" % modelname)
print("Loaded model from disk")
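

# In[ ]:

# Alternative (untested sketch): keras.models.load_model can typically deserialize the .h5 file
# saved above if the custom metric is supplied through custom_objects, e.g.:
#
# from keras.models import load_model
# model = load_model("%s.h5" % modelname, custom_objects={'f_score': f_score})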


# In[19]:

# re-evaluate on the xval set to confirm, and pick a threshold value
y_xval_prob = model.predict(X_xval)
thresh, score = selectThresholdAcc(y_xval_prob, y_xval)
y_xval_pred = y_xval_prob >= thresh
print("Final Xval Accuracy %.3f, Xval F1 %.3f, f_score %.3f" %
      (sklearn.metrics.accuracy_score(y_xval_pred, y_xval),
       sklearn.metrics.f1_score(y_xval_pred, y_xval),
       score))
confusion_matrix = sklearn.metrics.confusion_matrix(y_xval_pred, y_xval)
print(confusion_matrix)
false_positive = confusion_matrix[1][0]
false_negative = confusion_matrix[0][1]
true_positive = confusion_matrix[1][1]


# In[20]:

# evaluate on the test set
# xval is not a good measure of expected performance, since we used it to pick the threshold;
# we also tested many models on xval and picked the best, which is probably a model that got lucky on xval
y_test_prob = model.predict(X_test)
y_test_pred = y_test_prob >= thresh
print("Test Accuracy %.3f, Test F1 %.3f" %
      (sklearn.metrics.accuracy_score(y_test_pred, y_test),
       sklearn.metrics.f1_score(y_test_pred, y_test)))
print(sklearn.metrics.confusion_matrix(y_test_pred, y_test))


# In[21]:

# save the final model to disk
model.save("finalmodel.h5")
model.save_weights("modelweights.h5")
with open("finalmodel.json", "w") as fjson:
    fjson.write(model.to_json())


# In[ ]:
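# Illustrative sketch: reloading the saved artifacts later for inference would look roughly like
# this (architecture from JSON, weights from HDF5, then predict); X_test stands in here for
# whatever new feature rows arrive at prediction time.
with open("finalmodel.json") as json_file:
    reloaded_model = model_from_json(json_file.read())
reloaded_model.load_weights("modelweights.h5")
print(reloaded_model.predict(X_test)[:5])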