In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# Source CSV for the credit-card default dataset, read directly from GitHub
DATA_URL = 'https://raw.githubusercontent.com/fclesio/learning-space/master/Datasets/02%20-%20Classification/default_credit_card.csv'

df = pd.read_csv(DATA_URL)

# Preview the first three rows
df.head(3)
Out[1]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEFAULT
0 1 20000 2 2 1 24 2 2 -1 -1 ... 0 0 0 0 689 0 0 0 0 1
1 2 120000 2 2 2 26 -1 2 0 0 ... 3272 3455 3261 0 1000 1000 1000 0 2000 1
2 3 90000 2 2 2 34 0 0 0 0 ... 14331 14948 15549 1518 1500 1000 1000 1000 5000 0

3 rows × 25 columns

In [2]:
def save_model(model, model_name):
    '''Pickle ``model`` to the file ``<model_name>.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (in this notebook, a fitted estimator).
    model_name : str
        Base name (or path) of the output file, without the ``.pkl``
        extension; the extension is appended here.

    Returns
    -------
    None
    '''
    # The context manager flushes and closes the file even when pickling
    # raises, instead of leaving the handle open until garbage collection.
    with open(model_name + '.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
In [3]:
# Features used to train the classifier
X = df[[
    'LIMIT_BAL',
    'SEX',
    'AGE',
    'PAY_0',
    'PAY_2',
    'PAY_3',
    'BILL_AMT1',
    'BILL_AMT2',
    'PAY_AMT1',
]]

# Labels
y = df['DEFAULT']

# 70% training and 30% test (test_size=0.3).
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [4]:
# Fit a small random forest on the training split, then predict the test split
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters gathered in one place so they are easy to read and tweak
rf_params = dict(
    n_estimators=10,
    max_leaf_nodes=5,
    random_state=42,
    verbose=0,
    max_depth=5,
    min_samples_leaf=100,
    n_jobs=-1,
)

# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

y_pred = model.predict(X_test)
In [5]:
def get_results(y_test, y_pred):
    '''Print the accuracy score and the number of predictions per class.

    y_test holds the true labels and y_pred the predicted labels.
    Both results are printed; nothing is returned.
    '''
    from sklearn.metrics import accuracy_score

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # One row per prediction, in a single column named 'status'
    predictions = pd.DataFrame(y_pred)
    predictions.columns = ['status']

    # Number of predictions that fell into each class
    class_counts = predictions.groupby(by=['status']).size()
    print(class_counts)
In [6]:
# Accuracy and predicted-class counts for the model trained above
get_results(y_test, y_pred)
Accuracy: 0.8231111111111111
status
0    8051
1     949
dtype: int64
In [7]:
# Serialize the trained model to 'model_rf.pkl' (save_model appends the '.pkl' extension)
save_model(model=model, model_name='model_rf')
In [8]:
# Delete the in-memory model so the following cells can only use the copy reloaded from disk
del model 
In [9]:
# Load the model back from the pickle file.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# unpickle files from a trusted source.
# The context manager closes the file handle once loading is done.
with open('model_rf.pkl', 'rb') as model_file:
    model_rf_reload_pkl = pickle.load(model_file)
In [10]:
# Display the class labels stored on the reloaded model (its classes_ attribute)
model_rf_reload_pkl.classes_
Out[10]:
array([0, 1])
In [11]:
# Score predictions made by the reloaded model (not the stale y_pred produced
# by the deleted in-memory model) to confirm the pickle round trip preserved it.
y_pred_reloaded = model_rf_reload_pkl.predict(X_test)
get_results(y_test, y_pred_reloaded)
Accuracy: 0.8231111111111111
status
0    8051
1     949
dtype: int64
In [12]:
# Tamper with the reloaded model: overwrite its class labels so that both entries are 1
model_rf_reload_pkl.classes_ = np.array([1, 1])
In [13]:
# Quick check: display the overwritten class labels
model_rf_reload_pkl.classes_
Out[13]:
array([1, 1])
In [14]:
# Predict again with the tampered model (this overwrites the earlier y_pred)
y_pred = model_rf_reload_pkl.predict(X_test)
In [15]:
# Score the tampered model's predictions; compare with the accuracy and class counts reported earlier
get_results(y_test, y_pred)
Accuracy: 0.2167777777777778
status
1    9000
dtype: int64
In [ ]:
# References (Model Creation)

# https://medium.com/@garg.mohit851/random-forest-visualization-3f76cdf6456f  
# https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/  
# http://www.agcross.com/2015/02/random-forests-scikit-learn/