Credit Card Data
Neural Networks
Model Evaluation
This notebook relies on the following imports and setting. We will load new functions and libraries in context to make clear what we are using them for.
# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # this is to clear the warnings from this page, usually we should leave them on
# Plot settings
sns.set_context('notebook') # optimise figures for notebook display
sns.set_style('ticks') # set default plot style
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours) # set custom color scheme
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)
# Methods
from sklearn.linear_model import LinearRegression
# Model selection and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Data processing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
data=pd.read_csv('Datasets/Credit.csv', index_col='Obs')
data.head(10)
Income | Limit | Rating | Cards | Age | Education | Gender | Student | Married | Ethnicity | Balance | |
---|---|---|---|---|---|---|---|---|---|---|---|
Obs | |||||||||||
1 | 14.891 | 3606 | 283 | 2 | 34 | 11 | Male | No | Yes | Caucasian | 333 |
2 | 106.025 | 6645 | 483 | 3 | 82 | 15 | Female | Yes | Yes | Asian | 903 |
3 | 104.593 | 7075 | 514 | 4 | 71 | 11 | Male | No | No | Asian | 580 |
4 | 148.924 | 9504 | 681 | 3 | 36 | 11 | Female | No | No | Asian | 964 |
5 | 55.882 | 4897 | 357 | 2 | 68 | 16 | Male | No | Yes | Caucasian | 331 |
6 | 80.180 | 8047 | 569 | 4 | 77 | 10 | Male | No | No | Caucasian | 1151 |
7 | 20.996 | 3388 | 259 | 2 | 37 | 12 | Female | No | No | African American | 203 |
8 | 71.408 | 7114 | 512 | 2 | 87 | 9 | Male | No | No | Asian | 872 |
9 | 15.125 | 3300 | 266 | 5 | 66 | 13 | Female | No | No | Caucasian | 279 |
10 | 71.061 | 6819 | 491 | 3 | 41 | 19 | Female | Yes | Yes | African American | 1350 |
from sklearn.model_selection import train_test_split
# Randomly split indexes
index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=10)
# Write training and test sets
train = data.loc[index_train,:].copy()
test = data.loc[index_test,:].copy()
def prepare_data(df):
df['Male']=(df['Gender'] ==' Male').astype(int) # create dummy variable for gender
df['Student']=(df['Student'] =='Yes').astype(int)
df['Married']=(df['Married'] =='Yes').astype(int)
df['Caucasian']=(df['Ethnicity'] =='Caucasian').astype(int)
df['Asian']=(df['Ethnicity'] =='Asian').astype(int)
df=df.loc[:, df.dtypes!='object'] # discards the columns that are not numerical
df=df.drop('Rating', axis=1) # collinear with limit
return df
train = prepare_data(train)
test = prepare_data(test)
train.head()
Income | Limit | Cards | Age | Education | Student | Married | Balance | Male | Caucasian | Asian | |
---|---|---|---|---|---|---|---|---|---|---|---|
Obs | |||||||||||
400 | 18.701 | 5524 | 5 | 64 | 7 | 0 | 0 | 966 | 0 | 0 | 1 |
26 | 14.090 | 4323 | 5 | 25 | 16 | 0 | 1 | 671 | 0 | 0 | 0 |
280 | 54.319 | 3063 | 3 | 59 | 8 | 1 | 0 | 269 | 0 | 1 | 0 |
261 | 67.937 | 5184 | 4 | 63 | 12 | 0 | 1 | 345 | 1 | 0 | 1 |
131 | 23.793 | 3821 | 4 | 56 | 12 | 1 | 1 | 868 | 0 | 0 | 0 |
# Construting response vector and design matrix (matrix of predictor values)
response = 'Balance'
predictors = list(train.columns.values)
predictors.remove(response)
y_train = train[response].copy()
y_test = test[response].copy()
X_train=train[predictors].copy() # selects the variables in the predictor list
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(test[predictors].values)
X_test.shape
(120, 10)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.wrappers.scikit_learn import KerasRegressor
Using TensorFlow backend.
slp = Sequential()
slp.add(Dense(24, input_dim=10, init='uniform', activation='relu'))
slp.add(Dense(1))
slp.compile(loss='mse', optimizer='adam')
slp.fit(X_train, y_train, epochs=15000, verbose=0)
<keras.callbacks.History at 0x249c412a780>
def build_model():
model = Sequential()
model.add(Dense(24, input_dim=10, init='uniform', activation='relu'))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
return model
estimator = KerasRegressor(build_fn=build_model, epochs=1000, verbose=0)
cross_val_score(estimator, X_train, y_train, cv=5, scoring = 'neg_mean_squared_error')
array([-16313.7029001 , -15914.1558259 , -14988.69404694, -29202.73997519, -16337.20212685])
nn = Sequential()
nn.add(Dense(24, input_dim=10, init='uniform', activation='relu'))
nn.add(Dropout(0.2)) # adding 20% dropout to the hidden layers
nn.add(Dense(1))
nn.compile(loss='mse', optimizer='adam')
nn.fit(X_train, y_train, epochs=100, verbose=0) # few epochs just for illustration
<keras.callbacks.History at 0x249c6af3f98>
# Benchmark
ols = LinearRegression()
ols.fit(X_train, y_train)
# Initialise table
columns=['RMSE', 'R-Squared', 'MAE']
rows=['Linear Regression', 'Neural ']
results =pd.DataFrame(0.0, columns=columns, index=rows)
# List algorithms
methods = [ols, slp]
# Computer test predictions and metrics
for i, method in enumerate(methods):
y_pred = method.predict(X_test)
results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))
results.iloc[i, 1] = r2_score(y_test, y_pred)
results.iloc[i, 2] = mean_absolute_error(y_test, y_pred)
results.round(2)
RMSE | R-Squared | MAE | |
---|---|---|---|
Linear Regression | 97.19 | 0.96 | 80.03 |
Neural | 11.50 | 1.00 | 8.45 |