import numpy as np
import pandas as pd
from keras import models
from keras import layers
from keras import metrics
from keras import regularizers
import matplotlib.pyplot as plt

# Running locally
# df_train_orig = pd.read_csv('data/criminal_train.csv')
# df_test_orig = pd.read_csv('data/criminal_test.csv')

from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

import io
df_train_orig = pd.read_csv(io.StringIO(uploaded['criminal_train.csv'].decode('utf-8')))
df_test_orig = pd.read_csv(io.StringIO(uploaded['criminal_test.csv'].decode('utf-8')))

# Take a copy so we don't have to keep reloading the data while exploring it
import copy
df_train = copy.deepcopy(df_train_orig)
df_test = copy.deepcopy(df_test_orig)

# Mark the test rows with a sentinel label so the two sets can be processed together
df_test['Criminal'] = -1
df_both = pd.concat([df_train, df_test])
df_both.head()

print(df_both.shape)

to_dump = ['IIHHSIZ2', 'IIKI17_2', 'IIHH65_2', 'PRXRETRY', 'HLNVCOST', 'HLNVOFFR',
           'HLNVREF', 'HLNVNEED', 'HLNVSOR', 'IIMCDCHP', 'IIMEDICR', 'IICHMPUS',
           'IIPRVHLT', 'IIOTHHLT', 'IIINSUR4', 'IIFAMSOC', 'IIFAMSSI', 'IIFSTAMP',
           'IIFAMPMT', 'IIFAMSVC', 'IIWELMOS', 'IIPINC3', 'IIFAMIN3', 'ANALWT_C',
           'VESTR', 'VEREP']
two_is_zero = ['IRMCDCHP', 'IRMEDICR', 'IRCHMPUS', 'IRPRVHLT', 'IROTHHLT', 'ANYHLTI2',
               'IRINSUR4', 'OTHINS', 'IRFAMSOC', 'IRFAMSSI', 'IRFSTAMP', 'IRFAMPMT',
               'IRFAMSVC', 'GOVTPROG']
is_numeric = ['HLCNOTMO', 'IRWELMOS']

# Dump the unneeded fields
all_cols = list(df_both)
for col in all_cols:
    if col in to_dump:
        df_both = df_both.drop(col, axis=1)

for col in df_both:
    if col != 'Criminal':
        # A lot of fields share the same 'no answer' codes, so map them all to zero
        df_both[col] = df_both[col].replace(-1, 0)   # Null
        df_both[col] = df_both[col].replace(85, 0)   # BAD DATA Logically assigned
        df_both[col] = df_both[col].replace(94, 0)   # DON'T KNOW
        df_both[col] = df_both[col].replace(97, 0)   # REFUSED
        df_both[col] = df_both[col].replace(98, 0)   # BLANK (NO ANSWER)
        df_both[col] = df_both[col].replace(99, 0)   # LEGITIMATE SKIP (Respondent answering)
        # Set the 'no' answers to zero
        if col in two_is_zero:
            df_both[col] = df_both[col].replace(2, 0)
        # Mark the fields as categorical (do I need to do this?)
        if col not in is_numeric:
            df_both[col] = df_both[col].astype('category')
        else:
            # Standardise the numeric fields to zero mean and unit variance
            mean = df_both[col].mean(axis=0)
            df_both[col] -= mean
            std = df_both[col].std(axis=0)
            df_both[col] /= std

df_both.Criminal.value_counts()

# List of columns before OHE
df_both.columns[1:-1]

# Hold back the numeric fields from OHE
cat_cols = set(df_both.columns[1:-1])
for col in is_numeric:
    cat_cols.remove(col)

# Actually do the OHE (PERID and the Criminal column are not encoded)
df_ohe = pd.get_dummies(df_both, columns=cat_cols)

# Put all the data back together with the new OHE fields
df_ohe['PERID'] = df_both.PERID
df_ohe['Criminal'] = df_both.Criminal
for col in is_numeric:
    df_ohe[col] = df_both[col]

print(df_ohe.shape)
df_ohe.columns

# Split back into train and test using the sentinel label
df_train = df_ohe[df_ohe['Criminal'] != -1]
df_test = df_ohe[df_ohe['Criminal'] == -1]
df_train_labels = df_train['Criminal']

# Drop the PERID field as we shouldn't predict from it
# Also drop the Criminal field else our model would just use it!
df_train = df_train.drop(['PERID', 'Criminal'], axis=1)
print(df_train.shape)

# Remove the Criminal column from the test set
df_test = df_test.drop(['Criminal'], axis=1)

print(df_train.shape)
print(df_test.shape)

# Get the base arrays using the values from the dataframes
labels = df_train_labels.values
data = df_train.values

# TODO: do I need to do this step?
# Convert from ints to floats
labels = labels.astype(np.float32, copy=False)
data = data.astype(np.float32, copy=False)

print(data.shape)
print(labels.shape)

from sklearn.model_selection import train_test_split
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.20, random_state=42)

# We need this to handle the unbalanced data
# https://github.com/keras-team/keras/issues/1875
# TODO: I don't think I need to do this
def get_class_weights(y, smooth_factor=0):
    """
    Returns the weight for each class based on the frequencies of the samples.

    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    from collections import Counter
    counter = Counter(y)
    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p
    majority = max(counter.values())
    return {cls: float(majority) / count for cls, count in counter.items()}

class_weights = get_class_weights(labels)
class_weights

# Used as the evaluation metric.
# Taken from older Keras source; it was removed from Keras because computing it
# per batch can be misleading, so we use huge batches to compensate.
import keras.backend as K

def matthews_correlation(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())

model = models.Sequential()
# The input size must match the number of feature columns after OHE (148 here)
model.add(layers.Dense(128, activation='relu', input_shape=(148,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=[matthews_correlation])

history = model.fit(data_train, labels_train,
                    epochs=85,
                    class_weight=class_weights,
                    batch_size=10000,
                    verbose=0,
                    validation_data=(data_test, labels_test))
print('done')

matt = history.history['matthews_correlation']
val_matt = history.history['val_matthews_correlation']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(matt) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, matt, 'bo', label='Training MCC')
plt.plot(epochs, val_matt, 'b', label='Validation MCC')
plt.title('Training and validation MCC')
plt.xlabel('Epochs')
plt.ylabel('MCC')
plt.legend()
plt.show()

def smooth_curve(points, factor=0.9):
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points

smooth_history = smooth_curve(val_matt, factor=0.9)

plt.plot(range(1, len(smooth_history) + 1), smooth_history)
plt.title('Smoothed validation MCC by epoch')
plt.xlabel('Epochs')
plt.ylabel('Validation MCC')
plt.show()
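# Not in the original notebook: because the Keras metric above is computed per batch,
# one way to sanity-check the score is to recompute MCC over the whole hold-out split
# with scikit-learn (already available, since train_test_split comes from sklearn).
# This is a minimal sketch; the 0.5 threshold mirrors split_val() used below.
from sklearn.metrics import matthews_corrcoef
holdout_preds = (model.predict(data_test) > 0.5).astype(int).ravel()
print('Hold-out MCC:', matthews_corrcoef(labels_test, holdout_preds))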
# Evaluate on the full training data
test_loss, test_acc = model.evaluate(data, labels)
test_acc

new_predict = copy.deepcopy(df_test)

# Drop the PERID column before predicting
vals = df_test.drop(['PERID'], axis=1).values

# Run the model on the test data
predictions = model.predict(vals)

def split_val(v):
    """Threshold a predicted probability at 0.5."""
    if v > 0.5:
        return 1
    return 0

# Apply the function to each row and create a new predictions column
new_predict['predictions'] = [split_val(x) for x in predictions]

df_predict = new_predict[['PERID', 'predictions']]
df_predict = df_predict.rename(columns={'predictions': 'Criminal'})
df_predict.head()

# The submitted file should have just the PERID and Criminal columns
# Export our predictions to csv
df_predict.to_csv('predictions.csv', sep=',', index=False)

# Download the csv file
files.download('predictions.csv')