Work Analytics Data
Neural Networks
Model Evaluation
This notebook relies on the following libraries and settings.
# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): blanket-ignoring ALL warnings also hides deprecation and
# convergence warnings from sklearn/keras — consider filtering selectively.
warnings.filterwarnings('ignore')
# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
# Two 10-colour palettes; only `colours` is applied below — `crayon` is
# defined but not referenced anywhere in the visible code.
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, average_precision_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
We use the Human Resources Analytics data available from Kaggle Datasets.
**Business objective:** To predict which employees will leave the company.
# Load the HR attrition dataset; the column labelled 'sales' actually
# holds the employee's role, so give it a clearer name.
data = pd.read_csv('Datasets/HR.csv').rename(columns={'sales': 'role'})
data.head()
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | role | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
# Target column and the raw (pre-encoding) predictor list; the predictor
# list is rebuilt below once the categorical columns are dummy-encoded.
response='left'
predictors = list(data.columns.values)
predictors.remove(response)
# Stratifying on the response keeps the leaver/stayer ratio identical in
# both subsets.  NOTE(review): train_size=0.2 puts only 20% of the rows in
# the TRAINING set (2999 rows, matching X_train.shape below) — confirm this
# is intentional and not a swapped test_size.
index_train, index_test = train_test_split(np.array(data.index), stratify=data[response], train_size=0.2, random_state=5)
train = data.loc[index_train,].copy()
test = data.loc[index_test,:].copy()
y_train = train[response]
y_test = test[response]
Before estimating the models, we need to convert the categorical variables into binary variables.
# One-hot encode the two categorical columns, dropping the first level of
# each to avoid perfect multicollinearity.  'role' keeps its raw level
# names as column labels; 'salary' levels get a 'salary_' prefix.
for column, prefix in (('role', None), ('salary', 'salary')):
    indicator_cols = pd.get_dummies(data[column], prefix=prefix, drop_first=True)
    data = data.join(indicator_cols).drop(column, axis=1)
We then update our list of predictors accordingly and construct the train and test design matrices.
# Rebuild train/test from the encoded frame (the earlier split predates the
# dummy columns) and refresh the predictor list so it includes them.
train = data.loc[index_train,].copy()
test = data.loc[index_test,:].copy()
predictors = list(train.columns.values)
predictors.remove(response)
X_train = data.loc[index_train, predictors].copy()
# Standardise features.  Fitting the scaler on the training rows only
# prevents information from the test set leaking into the transform.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(test[predictors].values)
X_train.shape
(2999, 18)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
Using TensorFlow backend.
# Two-hidden-layer MLP (128 -> 64 -> 1) with a sigmoid output for the
# binary 'left' target.
mlp = Sequential()
# 'init' is the Keras 1 argument name; it was removed in Keras 2 in favour
# of 'kernel_initializer'.
mlp.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))
mlp.add(Dense(64, kernel_initializer='uniform', activation='relu'))
mlp.add(Dense(1, activation='sigmoid'))
mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
mlp.fit(X_train, y_train, epochs=500, verbose=0)
<keras.callbacks.History at 0x22f1e4cb898>
# Same architecture with 50% dropout after the first hidden layer to
# regularise the network.
model = Sequential()
# 'init' is the Keras 1 argument name; it was removed in Keras 2 in favour
# of 'kernel_initializer'.
model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=500, verbose=0)
<keras.callbacks.History at 0x22f1ed35e80>
We estimate a logistic regression as a benchmark.
# Benchmark: logistic regression with effectively no regularisation
# (C=1e5 makes the default L2 penalty negligible).
logit = LogisticRegression(C=1e5)
logit.fit(X_train, y_train)
LogisticRegression(C=100000.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Compare the three fitted models on the held-out test set.
# NOTE(review): the loop body below lost its indentation in the notebook
# export; restored here so the cell is runnable.
columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic regression', 'Neural Net', 'Neural Net (dropout)']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [logit, mlp, model]

for i, method in enumerate(methods):
    # Keras models' predict() outputs P(left=1) directly (as an (n, 1)
    # array — ravel to 1-D); sklearn's predict() returns hard labels, so
    # probabilities come from predict_proba() instead.  Use identity
    # ('is not'), not '!=', to test which estimator we are holding.
    if method is not logit:
        y_prob = method.predict(X_test).ravel()
        y_pred = (y_prob > 0.5).astype(int)
    else:
        y_pred = method.predict(X_test)
        y_prob = method.predict_proba(X_test)[:, 1]

    confusion = confusion_matrix(y_test, y_pred)
    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])  # sensitivity (recall)
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])  # specificity
    results.iloc[i, 3] = roc_auc_score(y_test, y_prob)
    results.iloc[i, 4] = precision_score(y_test, y_pred)

results.round(3)
Error rate | Sensitivity | Specificity | AUC | Precision | |
---|---|---|---|---|---|
Logistic regression | 0.225 | 0.279 | 0.930 | 0.822 | 0.556 |
Neural Net | 0.039 | 0.922 | 0.973 | 0.972 | 0.915 |
Neural Net (dropout) | 0.034 | 0.913 | 0.982 | 0.974 | 0.941 |