import numpy as np
import pandas as pd
from keras import models
from keras import layers
from keras import metrics
from keras import regularizers
import matplotlib.pyplot as plt

# Running locally
# df_train_orig = pd.read_csv('data/criminal_train.csv')
# df_test_orig = pd.read_csv('data/criminal_test.csv')

from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

import io
df_train_orig = pd.read_csv(io.StringIO(uploaded['criminal_train.csv'].decode('utf-8')))
df_test_orig = pd.read_csv(io.StringIO(uploaded['criminal_test.csv'].decode('utf-8')))

# Take a copy so we don't have to keep reloading the data while exploring it
import copy
df_train = copy.deepcopy(df_train_orig)
df_test = copy.deepcopy(df_test_orig)

# Mark the test rows with a sentinel label so the two sets can be processed together
df_test['Criminal'] = -1
df_both = pd.concat([df_train, df_test])
df_both.head()

print(df_both.shape)

to_dump = ['IIHHSIZ2', 'IIKI17_2', 'IIHH65_2', 'PRXRETRY', 'HLNVCOST', 'HLNVOFFR',
           'HLNVREF', 'HLNVNEED', 'HLNVSOR', 'IIMCDCHP', 'IIMEDICR', 'IICHMPUS',
           'IIPRVHLT', 'IIOTHHLT', 'IIINSUR4', 'IIFAMSOC', 'IIFAMSSI', 'IIFSTAMP',
           'IIFAMPMT', 'IIFAMSVC', 'IIWELMOS', 'IIPINC3', 'IIFAMIN3', 'ANALWT_C',
           'VESTR', 'VEREP']
two_is_zero = ['IRMCDCHP', 'IRMEDICR', 'IRCHMPUS', 'IRPRVHLT', 'IROTHHLT', 'ANYHLTI2',
               'IRINSUR4', 'OTHINS', 'IRFAMSOC', 'IRFAMSSI', 'IRFSTAMP', 'IRFAMPMT',
               'IRFAMSVC', 'GOVTPROG']
is_numeric = ['HLCNOTMO', 'IRWELMOS']

# Dump the unneeded fields
all_cols = list(df_both)
for col in all_cols:
    if col in to_dump:
        df_both = df_both.drop(col, axis=1)

for col in df_both:
    if col != 'Criminal':
        # A lot of fields share the same 'no answer' codes, so map them all to zero
        df_both[col] = df_both[col].replace(-1, 0)   # Null
        df_both[col] = df_both[col].replace(85, 0)   # BAD DATA Logically assigned
        df_both[col] = df_both[col].replace(94, 0)   # DON'T KNOW
        df_both[col] = df_both[col].replace(97, 0)   # REFUSED
        df_both[col] = df_both[col].replace(98, 0)   # BLANK (NO ANSWER)
        df_both[col] = df_both[col].replace(99, 0)   # LEGITIMATE SKIP (Respondent answering)
        # Set the 'no' answers to zero
        if col in two_is_zero:
            df_both[col] = df_both[col].replace(2, 0)
        # Mark the fields as categorical (do I need to do this?)
        if col not in is_numeric:
            df_both[col] = df_both[col].astype('category')
        else:
            # Standardise the numeric fields to zero mean and unit variance
            mean = df_both[col].mean(axis=0)
            df_both[col] -= mean
            std = df_both[col].std(axis=0)
            df_both[col] /= std

df_both.Criminal.value_counts()

# List of columns before OHE
df_both.columns[1:-1]

# Hold back the numeric fields from OHE
cat_cols = set(df_both.columns[1:-1])
for col in is_numeric:
    cat_cols.remove(col)

# Actually do the OHE (PERID and the Criminal column are not encoded)
df_ohe = pd.get_dummies(df_both, columns=cat_cols)

# Put all the data back together with the new OHE fields
df_ohe['PERID'] = df_both.PERID
df_ohe['Criminal'] = df_both.Criminal
for col in is_numeric:
    df_ohe[col] = df_both[col]

print(df_ohe.shape)
df_ohe.columns

# Split back into train and test using the sentinel label
df_train = df_ohe[df_ohe['Criminal'] != -1]
df_test = df_ohe[df_ohe['Criminal'] == -1]
df_train_labels = df_train['Criminal']

# Drop the PERID field as we shouldn't predict from it
# Also drop the Criminal field else our model would just use it!
df_train = df_train.drop(['PERID', 'Criminal'], axis=1)
print(df_train.shape)

# Remove the Criminal column from the test set
df_test = df_test.drop(['Criminal'], axis=1)

print(df_train.shape)
print(df_test.shape)

# Get the base arrays using the values from the dataframes
labels = df_train_labels.values
data = df_train.values

# TODO: do I need to do this step?
# Convert from ints to floats
labels = labels.astype(np.float32, copy=False)
data = data.astype(np.float32, copy=False)

print(data.shape)
print(labels.shape)

from sklearn.model_selection import train_test_split
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.20, random_state=42)

# We need this to handle the unbalanced data
# https://github.com/keras-team/keras/issues/1875
# TODO: I don't think I need to do this
def get_class_weights(y, smooth_factor=0):
    """
    Returns the weight for each class based on the frequencies of the samples.

    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    from collections import Counter
    counter = Counter(y)
    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p
    majority = max(counter.values())
    return {cls: float(majority) / count for cls, count in counter.items()}

class_weights = get_class_weights(labels)
class_weights

# Used as the evaluation metric.
# Taken from older Keras source; it was removed from Keras because computing it
# per batch can be misleading, so we use huge batches to compensate.
import keras.backend as K

def matthews_correlation(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())

model = models.Sequential()
# The input size must match the number of feature columns after OHE (148 here)
model.add(layers.Dense(128, activation='relu', input_shape=(148,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=[matthews_correlation])

history = model.fit(data_train, labels_train,
                    epochs=85,
                    class_weight=class_weights,
                    batch_size=10000,
                    verbose=0,
                    validation_data=(data_test, labels_test))
print('done')

matt = history.history['matthews_correlation']
val_matt = history.history['val_matthews_correlation']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(matt) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, matt, 'bo', label='Training MCC')
plt.plot(epochs, val_matt, 'b', label='Validation MCC')
plt.title('Training and validation MCC')
plt.xlabel('Epochs')
plt.ylabel('MCC')
plt.legend()
plt.show()

def smooth_curve(points, factor=0.9):
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points

smooth_history = smooth_curve(val_matt, factor=0.9)

plt.plot(range(1, len(smooth_history) + 1), smooth_history)
plt.title('Smoothed validation MCC by epoch')
plt.xlabel('Epochs')
plt.ylabel('Validation MCC')
plt.show()
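# Not in the original notebook: because the Keras metric above is computed per batch,
# one way to sanity-check the score is to recompute MCC over the whole hold-out split
# with scikit-learn (already available, since train_test_split comes from sklearn).
# This is a minimal sketch; the 0.5 threshold mirrors split_val() used below.
from sklearn.metrics import matthews_corrcoef
holdout_preds = (model.predict(data_test) > 0.5).astype(int).ravel()
print('Hold-out MCC:', matthews_corrcoef(labels_test, holdout_preds))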
# Evaluate on the full training data
test_loss, test_acc = model.evaluate(data, labels)
test_acc

new_predict = copy.deepcopy(df_test)

# Drop the PERID column before predicting
vals = df_test.drop(['PERID'], axis=1).values

# Run the model on the test data
predictions = model.predict(vals)

def split_val(v):
    """Threshold a predicted probability at 0.5."""
    if v > 0.5:
        return 1
    return 0

# Apply the function to each row and create a new predictions column
new_predict['predictions'] = [split_val(x) for x in predictions]

df_predict = new_predict[['PERID', 'predictions']]
df_predict = df_predict.rename(columns={'predictions': 'Criminal'})
df_predict.head()

# The submitted file should have just the PERID and Criminal columns
# Export our predictions to csv
df_predict.to_csv('predictions.csv', sep=',', index=False)

# Download the csv file
files.download('predictions.csv')