import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
# Download the credit-card default dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='https://raw.githubusercontent.com/fclesio/learning-space/master/Datasets/02%20-%20Classification/default_credit_card.csv')
# Peek at the first three rows (only rendered when run interactively).
df.head(n=3)
def save_model(model, model_name):
    '''Persist ``model`` as a pickle file named ``<model_name>.pkl``.

    Args:
        model: Any picklable object (in this script, a fitted estimator).
        model_name: Output path without extension; ``'.pkl'`` is appended.

    Returns:
        None. The pickle is written to disk as a side effect.
    '''
    # The context manager flushes and closes the handle even if pickling
    # raises, instead of leaving the open file to the garbage collector.
    with open(model_name + '.pkl', 'wb') as pkl_file:
        pickle.dump(model, pkl_file)
# Features
X = df[[
    'LIMIT_BAL',
    'SEX',
    'AGE',
    'PAY_0',
    'PAY_2',
    'PAY_3',
    'BILL_AMT1',
    'BILL_AMT2',
    'PAY_AMT1',
]]

# Labels
y = df['DEFAULT']

# 70% training and 30% test (test_size=0.3). The split is seeded with the same
# value as the model so the printed metrics are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Train the model and predict
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=10,
    max_leaf_nodes=5,
    random_state=42,
    verbose=0,
    max_depth=5,
    min_samples_leaf=100,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Predictions of the freshly trained model on the held-out rows
y_pred = model.predict(X_test)
def get_results(y_test, y_pred):
    '''Print the accuracy score and how many predictions fell in each class.

    Args:
        y_test: True labels, passed straight to ``accuracy_score``.
        y_pred: Predicted labels; also tabulated per distinct value.

    Returns:
        None. Both results are written to standard output.
    '''
    from sklearn.metrics import accuracy_score

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Wrap the predictions in a single-column frame so they can be grouped.
    predicted = pd.DataFrame(y_pred)
    predicted.columns = ['status']
    class_counts = predicted.groupby(by=['status']).size()
    print(class_counts)
# Accuracy and predicted-class distribution of the freshly trained model
get_results(y_test, y_pred)

# Serialize model to 'model_rf.pkl'
save_model(model=model, model_name='model_rf')

# Drop the in-memory model so that only the pickled copy is used from here on
del model

# Load model from Pickle. The context manager closes the file handle right
# after loading instead of leaving it open.
# NOTE: unpickling executes whatever the file contains; only load pickle files
# from a trusted source.
with open('model_rf.pkl', 'rb') as pkl_file:
    model_rf_reload_pkl = pickle.load(pkl_file)

# Displays prediction classes (bare expression: only rendered interactively)
model_rf_reload_pkl.classes_

# Baseline before tampering: y_pred still holds the original model's predictions
get_results(y_test, y_pred)

# Change the classes for the model only to 1
model_rf_reload_pkl.classes_ = np.array([1, 1])

# Quick check (bare expression: only rendered interactively)
model_rf_reload_pkl.classes_

# Call predict from the tampered model
y_pred = model_rf_reload_pkl.predict(X_test)

# Check results with the tampered model
get_results(y_test, y_pred)
# References (Model Creation)
# https://medium.com/@garg.mohit851/random-forest-visualization-3f76cdf6456f
# https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/
# http://www.agcross.com/2015/02/random-forests-scikit-learn/