Metis Data Science Bootcamp, New York City, Feb 2015
© Frederik Durant
A bank wants to run a targeted marketing campaign to sell a term deposit product.
Given a certain campaign budget:
Which customers should it target?
In what order?
Is it wise to spend the complete campaign budget?
The bank has a database of 41K past customer call records, indicating successful and failed sales of this product.
maximally use D3.js and interactive visualizations
use a shared MySQL database as the data source
Chakarin Vajiramedhin and Anirut Suebsing: Feature Selection with Data Balancing for Prediction of Bank Telemarketing
Sérgio Moro, Paulo Cortez and Paulo Rita: A Data-Driven Approach to Predict the Success of Bank Telemarketing
We use the bank-additional.zip data set, unchanged. For pedagogical reasons, we were asked to first load the data in a MySQL database running on a virtual server at DigitalOcean. This involved writing a DB schema as well as a load command in SQL.
Let's check if the data are readable from there:
import pymysql
import credentials
db = pymysql.connect(host=credentials.host,
user=credentials.user,
passwd=credentials.password,
db='bank')
cursor = db.cursor()
cursor.execute("SELECT * FROM bank_full LIMIT 3")
for i in cursor.fetchall():
print i
cursor.close()
db.close()
(56, 'housemaid', 'married', 'basic.4y', 'no', 'no', 'no', 'telephone', 'may', 'mon', 261, 1, 999, 0, 'nonexistent', 1.1, 93.994, -36.4, 4.857, 5191.0, 'no') (57, 'services', 'married', 'high.school', 'unknown', 'no', 'no', 'telephone', 'may', 'mon', 149, 1, 999, 0, 'nonexistent', 1.1, 93.994, -36.4, 4.857, 5191.0, 'no') (37, 'services', 'married', 'high.school', 'no', 'yes', 'no', 'telephone', 'may', 'mon', 226, 1, 999, 0, 'nonexistent', 1.1, 93.994, -36.4, 4.857, 5191.0, 'no')
Now that we know the database connection is up and running, let's read our data directly into a DataFrame:
import pymysql
import credentials
import pandas.io.sql as psql
db = pymysql.connect(host=credentials.host,
user=credentials.user,
passwd=credentials.password,
db='bank')
query = "SELECT * FROM bank_full;"
bankDF = psql.read_sql(query, con=db)
db.close()
print "Number of database records read: %d" % len(bankDF)
#print bankDF.columns
bankDF.head()
Number of database records read: 41188
age | job | marital | education | bank_def | housing | loan | contact | contact_month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
5 rows × 21 columns
Most of our models can't handle the categorical fields, so our next step is to convert them to dummy numerical variables.
# Build a fully numerical copy of the data:
#  - binarize the target y (yes -> 1, no -> 0)
#  - drop fields whose values are unknown before the call is made
#  - one-hot encode ("dummify") every categorical field, dropping one
#    redundant dummy column per field where possible
import pandas as pd
import numpy as np
tmpDF = bankDF.copy()
# Binary target: 1 = customer subscribed to the term deposit, 0 = did not
tmpDF.y.replace(to_replace=['yes','no'], value=[1,0], inplace=True)
# Delete the field(s) unknown before the call is made
# ('duration' is only known once the call has happened, per the note above,
# so it cannot be used as a predictor)
nonAvailableFields = ['duration']
for field in nonAvailableFields:
    tmpDF.drop(field, axis=1, inplace=True)
# Key: categorical field name
# Value: list of abbreviation and post-dummification redundant field
#        (None = keep all dummy columns for that field)
categoricalFields = {'job':['job','unknown'],
                     'marital':['mar','unknown'],
                     'education':['ed','unknown'],
                     'bank_def':['def','unknown'],
                     'housing':['h','unknown'],
                     'loan':['loan','unknown'],
                     'contact':['cntct','telephone'],
                     'contact_month':['cntct',None], # Because 'jan' and 'feb' are missing in the data, despite being plausible
                     'day_of_week':['dow','fri'], # Even though 'sat' and 'sun' are missing, these are not plausible values
                     'poutcome':['pout','nonexistent'],
                     }
# Convert all categorical fields into dummy numerical ones
# AND delete one dummified field because it contains redundant information
for (categoricalField, value) in categoricalFields.iteritems():
    (prefix, dummifiedRedundantField) = value
    tmpDF = pd.concat([tmpDF, pd.get_dummies(tmpDF[categoricalField], prefix=prefix)], axis=1)
    if dummifiedRedundantField:
        tmpDF.drop("%s_%s" % (prefix, dummifiedRedundantField), axis=1, inplace=True)
# of course, also delete the original categorical fields
for (categoricalField,_) in categoricalFields.iteritems():
    tmpDF.drop(categoricalField, axis=1, inplace=True)
bankDF2 = tmpDF.copy()
#print bankDF2.columns
bankDF2.head()
age | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | ... | ed_illiterate | ed_professional.course | ed_university.degree | loan_no | loan_yes | def_no | def_yes | mar_divorced | mar_married | mar_single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 57 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 37 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
3 | 40 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | 56 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
5 rows × 54 columns
# Set up the modeling stage shared by all model types below:
# feature matrix, target vector, feature scaling, scorer and CV generator.
%matplotlib inline
import sys
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from matplotlib import pyplot as plt
from math import log10
from time import sleep
X_values = bankDF2.copy()
X_values.drop('y', axis=1, inplace=True)
y_values = bankDF2['y']
# Scale the X values (zero mean, unit variance); distance-based and
# regularized models below expect comparable feature scales
X_values_scaled = preprocessing.scale(X_values.astype(float), copy=False)
scorerType = 'roc_auc'
nrCrossValidationFolds=10
# We MUST shuffle, because the data seem to be somehow ordered by date,
# resulting in grouped rows of similar socio-economic value combinations!!!
cvGenerator = KFold(len(X_values), n_folds=nrCrossValidationFolds, shuffle=True)
# Best hyperparameters found by the experiments below (None = not yet run)
bestLogCValue = None
bestKValue = None
bestNrEst = None
bestMaxFeat = None
print "%d columns:" % len(X_values.columns), [str(c) for c in X_values.columns]
print "%d values:" % len(X_values_scaled[0]), X_values_scaled[0]
53 columns: ['age', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'cntct_cellular', 'pout_failure', 'pout_success', 'dow_mon', 'dow_thu', 'dow_tue', 'dow_wed', 'cntct_apr', 'cntct_aug', 'cntct_dec', 'cntct_jul', 'cntct_jun', 'cntct_mar', 'cntct_may', 'cntct_nov', 'cntct_oct', 'cntct_sep', 'h_no', 'h_yes', 'ed_basic.4y', 'ed_basic.6y', 'ed_basic.9y', 'ed_high.school', 'ed_illiterate', 'ed_professional.course', 'ed_university.degree', 'loan_no', 'loan_yes', 'def_no', 'def_yes', 'mar_divorced', 'mar_married', 'mar_single'] 53 values: [ 1.53303429 -0.56592197 0.1954139 -0.34949428 0.64809227 0.72272247 0.88644656 0.71245988 0.33167991 -0.58202282 -0.53831699 -0.19143021 6.15277204 -0.2764353 -0.2087573 -0.18903213 -0.3265564 -0.1473267 -0.44244927 -0.15887166 -1.31826996 -0.3392905 -0.1857 1.95899952 -0.51458089 -0.49439422 -0.4960667 -0.26127446 -0.42007603 -0.06662113 -0.45925282 -0.38504233 -0.11590677 1.41115463 -0.33253245 -0.13319736 -0.11846175 1.10081447 -1.04887691 2.97708361 -0.24274754 -0.41474269 -0.54809999 -0.0209096 -0.38191849 -0.64753149 0.46173139 -0.42287213 0.51371278 -0.00853476 -0.35509663 0.80763764 -0.62493754]
from sklearn.linear_model import LogisticRegression
cValues = [pow(10,(x-10)) for x in range(20)]
#print "cValues = ", cValues
# selects weights inversely proportional to class frequencies in the training set
# See http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#classWeight = 'auto'
classWeight = None
estimators = []
scoreList = []
for i in range(len(cValues)):
c = cValues[i]
print >> sys.stderr, "(%d/%d) Building logres model for C value %1.15f ..." % (i+1,len(cValues),c),
estimator = LogisticRegression(C=c,class_weight=classWeight)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
estimators.append(estimator)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best logres model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestCValue = cValues[bestModelIndex]
bestLogCValue = log10(bestCValue)
print "C value: ", bestCValue
print "log10(C) value: ", bestLogCValue
fig = plt.figure()
plt.plot([log10(c) for c in cValues],[x[0] for x in scoreList], 'o-'?, color='g', label='mean')
plt.fill_between([log10(c) for c in cValues], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith logres model using various C values")
plt.xlabel("log10(C)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
(1/20) Building logres model for C value 0.000000000100000 ... applying it ... done (2/20) Building logres model for C value 0.000000001000000 ... applying it ... done (3/20) Building logres model for C value 0.000000010000000 ... applying it ... done (4/20) Building logres model for C value 0.000000100000000 ... applying it ... done (5/20) Building logres model for C value 0.000001000000000 ... applying it ... done (6/20) Building logres model for C value 0.000010000000000 ... applying it ... done (7/20) Building logres model for C value 0.000100000000000 ... applying it ... done (8/20) Building logres model for C value 0.001000000000000 ... applying it ... done (9/20) Building logres model for C value 0.010000000000000 ... applying it ... done (10/20) Building logres model for C value 0.100000000000000 ... applying it ... done (11/20) Building logres model for C value 1.000000000000000 ... applying it ... done (12/20) Building logres model for C value 10.000000000000000 ... applying it ... done (13/20) Building logres model for C value 100.000000000000000 ... applying it ... done (14/20) Building logres model for C value 1000.000000000000000 ... applying it ... done (15/20) Building logres model for C value 10000.000000000000000 ... applying it ... done (16/20) Building logres model for C value 100000.000000000000000 ... applying it ... done (17/20) Building logres model for C value 1000000.000000000000000 ... applying it ... done (18/20) Building logres model for C value 10000000.000000000000000 ... applying it ... done (19/20) Building logres model for C value 100000000.000000000000000 ... applying it ... done (20/20) Building logres model for C value 1000000000.000000000000000 ... applying it ...
scoreList = [(0.76949171906441904, 0.014599343487451131), (0.77559084839878234, 0.016680091940452973), (0.77558984089571026, 0.016669564577923329), (0.77562960117842694, 0.016669783129555374), (0.77599334757276783, 0.016712834033065491), (0.77865570872903878, 0.017084773628861349), (0.78577478466100215, 0.018246266290173292), (0.78995141066814933, 0.018194446599428572), (0.79057860690470305, 0.018057597021882185), (0.79183208796194027, 0.017781797514409677), (0.79161619019773755, 0.017933680674930202), (0.79154411612201447, 0.017980246276912498), (0.79153911447470549, 0.017983683123000981), (0.79153754742257032, 0.017984857467606537), (0.79153744595998676, 0.017984201985282833), (0.7915375656267658, 0.017984565896524217), (0.79153753794691761, 0.017985059650217641), (0.79153750778929299, 0.017984629877046337), (0.79153774266192811, 0.017984639119523849), (0.79153767677216846, 0.017984260022198047)] best logres model has: roc_auc score: 79.18% (+/- 1.78%) C value: 0.1 log10(C) value: -1.0
done
from sklearn.neighbors import KNeighborsClassifier
kValues = range(1,31)
#kValues = range(25,31) # In order to gain some processing time
estimators = []
scoreList = []
for i in range(len(kValues)):
k = kValues[i]
print >> sys.stderr, "(%d/%d) Building %d nearest neighbor model ..." % (i+1,len(kValues),k),
estimator = KNeighborsClassifier(n_neighbors=k)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
estimators.append(estimator)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best KNN model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestKValue = kValues[bestModelIndex]
print "K value: ", bestKValue
fig = plt.figure()
plt.plot([k for k in kValues],[x[0] for x in scoreList], 'o-'?, color='g', label='mean')
plt.fill_between([k for k in kValues], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith K nearest neighbor model using various K values")
plt.xlabel("K")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
(1/30) Building 1 nearest neighbor model ... applying it ... done (2/30) Building 2 nearest neighbor model ... applying it ... done (3/30) Building 3 nearest neighbor model ... applying it ... done (4/30) Building 4 nearest neighbor model ... applying it ... done (5/30) Building 5 nearest neighbor model ... applying it ... done (6/30) Building 6 nearest neighbor model ... applying it ... done (7/30) Building 7 nearest neighbor model ... applying it ... done (8/30) Building 8 nearest neighbor model ... applying it ... done (9/30) Building 9 nearest neighbor model ... applying it ... done (10/30) Building 10 nearest neighbor model ... applying it ... done (11/30) Building 11 nearest neighbor model ... applying it ... done (12/30) Building 12 nearest neighbor model ... applying it ... done (13/30) Building 13 nearest neighbor model ... applying it ... done (14/30) Building 14 nearest neighbor model ... applying it ... done (15/30) Building 15 nearest neighbor model ... applying it ... done (16/30) Building 16 nearest neighbor model ... applying it ... done (17/30) Building 17 nearest neighbor model ... applying it ... done (18/30) Building 18 nearest neighbor model ... applying it ... done (19/30) Building 19 nearest neighbor model ... applying it ... done (20/30) Building 20 nearest neighbor model ... applying it ... done (21/30) Building 21 nearest neighbor model ... applying it ... done (22/30) Building 22 nearest neighbor model ... applying it ... done (23/30) Building 23 nearest neighbor model ... applying it ... done (24/30) Building 24 nearest neighbor model ... applying it ... done (25/30) Building 25 nearest neighbor model ... applying it ... done (26/30) Building 26 nearest neighbor model ... applying it ... done (27/30) Building 27 nearest neighbor model ... applying it ... done (28/30) Building 28 nearest neighbor model ... applying it ... done (29/30) Building 29 nearest neighbor model ... applying it ... 
done (30/30) Building 30 nearest neighbor model ... applying it ...
scoreList = [(0.61686248330254301, 0.0099981437838863804), (0.66913727936584078, 0.012600681503597767), (0.69785444267494223, 0.015687283948569192), (0.71515560365473918, 0.014818131237886285), (0.72425501559430283, 0.016845354516585), (0.72990286059285414, 0.016340911616737445), (0.7335368606945174, 0.017178034713099678), (0.73889135940768047, 0.017060886428724224), (0.74271904008057421, 0.016881761130210747), (0.744583148459389, 0.016796792570554754), (0.7469328112206256, 0.015702096606024066), (0.750637147503316, 0.014530303246797251), (0.75219250752037103, 0.013864254687591369), (0.75461714652447021, 0.015282407989463983), (0.75545206758029426, 0.015942693661563846), (0.75551633518904637, 0.016526714751929922), (0.75591405491617258, 0.01668222353026268), (0.75747196161339547, 0.015971081677249973), (0.75862862772102124, 0.015872595350427531), (0.75970608453744715, 0.015652587074260597), (0.76058305590137931, 0.016045794415517858), (0.76124442173950257, 0.016485012188106888), (0.76147056688636927, 0.017197212057929758), (0.76201029439038481, 0.018035220232425729), (0.76287690000962471, 0.018679844986823683), (0.76398680319965329, 0.019022717786763548), (0.76543345107674055, 0.019625045715179531), (0.76668763728866463, 0.019383585645356617), (0.76706779020246529, 0.019217174327706695), (0.76746990845193763, 0.019087388760932786)] best KNN model has: roc_auc score: 76.75% (+/- 1.91%) K value: 30
done
from sklearn.ensemble import RandomForestClassifier
import itertools
from math import log
nrEstimators = [pow(2,x) for x in range(7,9)]
maxFeatures = [pow(2,x) for x in range(1,int(log(len(X_values.columns.values.tolist()))+2))]
rfConfigurations = list(itertools.product(*[nrEstimators,maxFeatures]))
print "configurations: ", rfConfigurations
scoreList = []
for i in range(len(rfConfigurations)):
nrEst, maxFeat = rfConfigurations[i]
print >> sys.stderr, "(%d/%d) Building random forest model with %d estimators and maximum %d features ..." % (i+1,len(rfConfigurations),nrEst,maxFeat),
estimator = RandomForestClassifier(n_estimators=nrEst, max_features=maxFeat)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best Random Forest model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestNrEst, bestMaxFeat = rfConfigurations[bestModelIndex]
print "n_estimators: ", bestNrEst
print "max_features: ", bestMaxFeat
fig = plt.figure()
plt.plot([conf for conf in range(1,len(rfConfigurations)+1)],[x[0] for x in scoreList], 'o-'?, color='g', label='mean')
plt.fill_between([conf for conf in range(1,len(rfConfigurations)+1)], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith Random Forest model using various configurations")
plt.xlabel("Configuration ID (nr estimators, max features)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
(1/8) Building random forest model with 128 estimators and maximum 2 features ... applying it ... done (2/8) Building random forest model with 128 estimators and maximum 4 features ... applying it ... done (3/8) Building random forest model with 128 estimators and maximum 8 features ... applying it ... done (4/8) Building random forest model with 128 estimators and maximum 16 features ... applying it ... done (5/8) Building random forest model with 256 estimators and maximum 2 features ... applying it ... done (6/8) Building random forest model with 256 estimators and maximum 4 features ... applying it ... done (7/8) Building random forest model with 256 estimators and maximum 8 features ... applying it ... done (8/8) Building random forest model with 256 estimators and maximum 16 features ... applying it ...
configurations: [(128, 2), (128, 4), (128, 8), (128, 16), (256, 2), (256, 4), (256, 8), (256, 16)] scoreList = [(0.76768945798327159, 0.018147151669550964), (0.77388895837664984, 0.01721065749733788), (0.77624845940828902, 0.016912286996300936), (0.77509714548313668, 0.017077118005886713), (0.77165542125390119, 0.017431478360104655), (0.77424811572276253, 0.016397927989300697), (0.7761854927025631, 0.015843373377702776), (0.77620158285384833, 0.018285582468099634)] best Random Forest model has: roc_auc score: 77.62% (+/- 1.69%) n_estimators: 128 max_features: 8
done
# Compare the tuned model types (plus a Gaussian NB and a stratified-dummy
# baseline) fold by fold, using the shared CV generator.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier

# After-the-fact hardcoded optimizations, in case we want to skip the
# experimentation phase (e.g. during a presentation).
# Fix: compare against None with 'is', not '=='
if bestLogCValue is None:
    bestLogCValue = -1
if bestKValue is None:
    bestKValue = 30
if bestNrEst is None or bestMaxFeat is None:
    bestNrEst = 256
    bestMaxFeat = 8
# (removed a redundant 'models = {}' that was immediately overwritten)
models = {'logres with log10(C)=%d' % bestLogCValue : LogisticRegression(C=pow(10,bestLogCValue)), # Takes about 1 second elapsed time
          'knn with K=%d' % bestKValue : KNeighborsClassifier(n_neighbors=bestKValue), # Takes about 12 seconds elapsed time
          'gaussianNB': GaussianNB(),
          'baseline' : DummyClassifier(strategy='stratified')
#          'randomforest with nrEst=%d and maxFeat=%d' % (bestNrEst,bestMaxFeat): RandomForestClassifier(n_estimators=bestNrEst, max_features=bestMaxFeat) # Takes about 5 seconds elapsed time
         }
fig = plt.figure(1,(9,6))
plt.title("%d-fold cross-validation %s scores for various model types" % (nrCrossValidationFolds, scorerType))
plt.xlabel("Fold #")
plt.ylabel(scorerType)
plt.grid()
for modelName, model in models.items():
    print >> sys.stderr, "Building %s model ..." % modelName,
    print >> sys.stderr, "applying it ...",
    # NOTE(review): this scores on the UNSCALED X_values, whereas the tuning
    # runs above used X_values_scaled — confirm whether that is intentional
    # (distance-based KNN in particular is sensitive to feature scaling).
    scores = cross_val_score(model, X_values, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1, verbose=10)
    print >> sys.stderr, "done"
    plt.plot(range(1,nrCrossValidationFolds+1), scores, 'o-', label="%s (%2.2f%% +/- %2.2f%%)" % (modelName, scores.mean() * 100, scores.std() * 100))
plt.legend(loc='best')
plt.show()
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.748927 - 3.7s
Building knn with K=30 model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 3.7s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.758574 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769673 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.771936 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.806344 - 3.6s
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 7.4s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 18.2s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.759772 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.792484 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773067 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.779205 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.782687 - 3.7s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 29.2s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 36.6s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.730469 - 0.1s
done Building gaussianNB model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.1s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.757479 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769577 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773234 - 0.0s [CV] no parameters to be set .........................................
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.1s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 0.3s
[CV] ................ no parameters to be set, score=0.790082 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769992 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.782535 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773197 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.756975 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.783625 - 0.0s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 0.5s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.5s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.748908 - 0.4s
done Building logres with log10(C)=-1 model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.4s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.779050 - 0.5s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.790301 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.786824 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.816228 - 0.4s
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.9s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 2.1s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.785703 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.801108 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.785443 - 0.3s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.777990 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.794478 - 0.4s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 3.1s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 3.8s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.508652 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.494918 - 0.0s
done Building baseline model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.0s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.0s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.485547 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.495306 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.499960 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.506277 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.494437 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.514219 - 0.0s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 0.1s [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 0.2s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.503079 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.503250 - 0.0s
[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.2s finished done
# Hold out a random 10% test set (no cross-validation here) and collect
# class-probability predictions from every model that supports them.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import mpld3
mpld3.enable_notebook()
import sys
from matplotlib import pyplot as plt
import re
from math import log
import json
import copy

# We don't use cross validation here, but will generate a random test set instead
X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size=0.1, random_state=0)
finalPredictions = {}
for modelName, model in models.items():
    fittedModel = model.fit(X_train, y_train)
    if not hasattr(model, "predict_proba"):
        continue  # some models cannot emit class probabilities
    print >> sys.stderr, "Predicting probabilities for %s model ..." % modelName,
    finalPredictions[modelName] = {'train': model.predict_proba(X_train),
                                   'test': model.predict_proba(X_test)}
    print >> sys.stderr, "done"
def plotROCCurve(figSize=7):
    """Plot interactive (mpld3) ROC curves for every model in finalPredictions,
    evaluated on the held-out test set (module globals finalPredictions, y_test).

    Each curve point gets an HTML tooltip with its FPR, TPR and threshold.

    Parameters:
        figSize: width and height of the (square) figure, in inches.
    Returns:
        dict mapping short model name -> {'fpr', 'tpr', 'thresholds',
        'roc_auc'}, suitable for JSON export (e.g. for a D3.js front end).
    """
    # Define some CSS to control our custom tooltip labels
    css = """
table {
  border-collapse: collapse;
}
th {
  color: #ffffff;
  background-color: #000000;
}
td {
  padding: 2px;
  background-color: #cccccc;
}
table, th, td {
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""
    jsonROCData = {}
    rocCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    for modelName, probs in finalPredictions.iteritems():
        # First word of the model name, e.g. 'knn with K=30' -> 'knn'
        # (raw string for the regex, and use the 'probs' loop variable
        # instead of re-indexing finalPredictions)
        modelNameShort = re.split(r"\s+", modelName)[0]
        # Probability of the positive class (column 1 of predict_proba)
        y_probs = [x[1] for x in probs['test']]
        fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                         y_score=y_probs, pos_label=1)
        roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs)
        jsonROCData[modelNameShort] = {'fpr': list(fpr),
                                       'tpr': list(tpr),
                                       'thresholds': list(thresholds),
                                       'roc_auc': roc_auc}
        points = plt.plot(fpr, tpr, 'x-', label="%s (AUC = %1.2f%%)" % (modelName, roc_auc*100))
        labels = ["<table><th colspan='2'>%s</th><tr><td>FPR</td><td>%0.1f%%</td></tr><tr><td>TPR</td><td>%0.1f%%</td></tr><tr><td>threshold</td><td>%0.4f</td></tr></table>" % (modelNameShort, fpr[i]*100, tpr[i]*100, thresholds[i]) for i in range(len(fpr))]
        mpld3.plugins.connect(rocCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("ROC curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("False Positive Rate (FP/FP+TN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("True Positive Rate (TP/TP+FN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc="best")
    plt.show()
    return jsonROCData
# Export data to JSON file for visualization in D3.js or similar
jsonROCData = plotROCCurve(figSize=9)
with open('d3/ROCCurve.json', 'w') as outfile:
json.dump(jsonROCData, outfile)
# Interactive notebook widget: let the user vary the ROC figure size.
# NOTE(review): IPython.html.widgets moved to the separate `ipywidgets`
# package in later IPython/Jupyter versions — update the import if upgrading.
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotROCCurve(figSize=9), "plotROCCurveBanking.html", figid="ROC_Curve_Banking")
interact(plotROCCurve, figSize=(5,10))
<function __main__.plotROCCurve>
import numpy as np
import mpld3
mpld3.enable_notebook()
import re
from math import log
def sortTestSetByProbability(y_test, y_probs):
    """Pair each predicted probability with its true label and original index,
    sorted by probability in descending order.

    Ties keep their original relative order (Python's sort is stable), so the
    output matches a stable index sort on probability.

    Returns:
        List of (probability, true_label, original_index) tuples.
    """
    triples = [(prob, y_test[idx], idx) for idx, prob in enumerate(y_probs)]
    triples.sort(key=lambda triple: triple[0], reverse=True)
    return triples
def calculateLift(sortedProbsAndTrueLabels, nrPercentiles=10, posLabel=1):
    """Compute cumulative and per-bucket lift over equal-sized score buckets.

    Args:
        sortedProbsAndTrueLabels: tuples whose second element is the true
            label, already sorted by predicted probability (descending).
        nrPercentiles: number of (near-)equal buckets to split the list into.
        posLabel: the value that marks a positive instance.

    Returns:
        List of (fractionOfPopulation, cumulativeFractionOfPositives,
        cumulativeLift, localLift) tuples; a leading (0, 0, 0, 0) origin
        point is prepended for plotting.

    Raises:
        ZeroDivisionError: if the input contains no positive instances.
    """
    labelsOnly = [pair[1] for pair in sortedProbsAndTrueLabels]
    totalPositives = sum(1 for label in labelsOnly if label == posLabel)
    buckets = [chunk.tolist()
               for chunk in np.array_split(np.array(labelsOnly), nrPercentiles)]
    curve = [(0.0, 0.0, 0.0, 0.0)]
    cumulativeFraction = 0.0
    for bucketIndex, bucket in enumerate(buckets):
        positivesHere = sum(1 for label in bucket if label == posLabel)
        fractionHere = float(positivesHere) / float(totalPositives)
        populationFraction = float(bucketIndex + 1) / float(nrPercentiles)
        cumulativeFraction += fractionHere
        curve.append((populationFraction,
                      cumulativeFraction,
                      cumulativeFraction / populationFraction,
                      fractionHere / (1.0 / float(nrPercentiles))))
    return curve
def plotLiftCurve(numberOfPercentiles=10, figSize=5):
    """Draw an interactive (mpld3) lift curve for every model in the
    module-level `finalPredictions` dict and return the plotted series.

    Depends on module-level names: finalPredictions, y_test, plt, mpld3,
    re, log, sortTestSetByProbability, calculateLift.
    NOTE(review): Python 2 style (`dict.iteritems`) — needs updating for Python 3.

    Returns:
        dict keyed by short model name; each value holds the
        JSON-serializable lists 'xPerc', 'yPerc', 'cumLift', 'localLift'.
    """
    # Define some CSS to control our custom labels
    css = """
table {
border-collapse: collapse;
}
th {
color: #ffffff;
background-color: #000000;
}
td {
padding: 2px;
background-color: #cccccc;
}
table, th, td {
font-family:Arial, Helvetica, sans-serif;
border: 1px solid black;
text-align: right;
}
"""
    jsonLiftData = {}
    liftCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    # Lift-curve y axis is a fixed 0-100 percentage scale.
    ax.set_autoscaley_on(False)
    ax.set_ylim([0,100])
    for modelName, probs in finalPredictions.iteritems():
        # First whitespace-separated token serves as the short model name.
        modelNameShort = re.split("\s+", modelName)[0]
        # Probability of the positive class for each test instance.
        y_probs = [x[1] for x in finalPredictions[modelName]['test']]
        sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
        lifts = calculateLift(sortedProbsAndTrueLabels, nrPercentiles=numberOfPercentiles, posLabel=1)
        # print modelName, lifts
        xPerc = [x[0]*100 for x in lifts]
        yPerc = [x[1]*100 for x in lifts]
        # Collect the plotted values so they can later be exported as JSON.
        jsonLiftData[modelNameShort] = {}
        jsonLiftData[modelNameShort]['xPerc'] = [x for x in xPerc]
        jsonLiftData[modelNameShort]['yPerc'] = [y for y in yPerc]
        jsonLiftData[modelNameShort]['cumLift'] = [l[2] for l in lifts]
        jsonLiftData[modelNameShort]['localLift'] = [l[3] for l in lifts]
        points = ax.plot(xPerc, yPerc, 'o-', label="%s" % modelNameShort)
        # One HTML tooltip per point: population %, positives captured %, lifts.
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>yields</td><td>%2.1f%%</td></tr><tr><td>cum. lift</td><td>%1.1f</td></tr><tr><td>local lift</td><td>%1.1f</td></tr></table>" % (modelNameShort, xPerc[i],yPerc[i], lifts[i][2], lifts[i][3]) for i in range(len(yPerc))]
        mpld3.plugins.connect(liftCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    # Font sizes and padding scale gently (logarithmically) with figure size.
    ax.set_title("Lift curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("Percentage of positive instances targeted", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc=4)
    plt.show()
    return jsonLiftData
# Persist the lift curve points for the D3.js visualization.
jsonLiftData = plotLiftCurve(numberOfPercentiles=100, figSize=9)
with open('d3/LiftCurve.json', 'w') as liftJsonFile:
    liftJsonFile.write(json.dumps(jsonLiftData))
# Interactive notebook widget: vary the percentile count and figure size.
# NOTE(review): IPython.html.widgets moved to the separate `ipywidgets`
# package in later IPython/Jupyter versions — update the import if upgrading.
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotLiftCurve(numberOfPercentiles=10, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
interact(plotLiftCurve, numberOfPercentiles=(5, 20), figSize=(5,10))
<function __main__.plotLiftCurve>
From the given case description, we don't know exactly how much a single contact costs, nor how much revenue a successful contact would earn. However, for any assumed combination of cost and revenue, we can draw a profit curve.
def calculateProfit(sortedProbsAndTrueLabels,
                    nrPercentiles=20,
                    posLabel=1,
                    avgCostPerContact=10,
                    avgRevenuePerContact=50
                    ):
    """Compute cumulative and per-percentile campaign profit.

    Args:
        sortedProbsAndTrueLabels: list of (probability, true label, ...)
            tuples, sorted in descending order of probability; only the
            second element (the true label) is used here.
        nrPercentiles: number of (near-)equal buckets to split the list into.
        posLabel: label value that marks a successful sale.
        avgCostPerContact: cost incurred for every contacted customer (>= 0).
        avgRevenuePerContact: revenue earned per positive contact (>= 0).

    Returns:
        List of tuples
        (fractionOfPopulationContacted,
         cumulativeRevenue, cumulativeCost, cumulativeProfit,
         revenueThisPercentile, costThisPercentile, profitThisPercentile);
        a leading all-zero origin tuple is prepended for plotting.
    """
    assert(avgCostPerContact >= 0.0), "avgCostPerContact is expressed as a positive float"
    assert(avgRevenuePerContact >= 0.0), "avgRevenuePerContact is expressed as a positive float"
    # Split the labels (still in descending-probability order) into buckets.
    percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array([x[1] for x in sortedProbsAndTrueLabels]), nrPercentiles)]
    result = [(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
    cumulativeCost = 0.0
    cumulativeRevenue = 0.0
    cumulativeProfit = 0.0
    for c, percentile in enumerate(percentileSplitOfInstances):
        # An empty bucket means nrPercentiles exceeds the number of instances.
        assert(len(percentile) > 0)
        nrPositivesInPercentile = len([x for x in percentile if x == posLabel])
        nrNegativesInPercentile = len(percentile) - nrPositivesInPercentile
        p = float(c + 1) / float(nrPercentiles)
        # Every contact costs money; only positive contacts generate revenue.
        costThisPercentile = avgCostPerContact * (nrPositivesInPercentile + nrNegativesInPercentile)
        revenueThisPercentile = avgRevenuePerContact * nrPositivesInPercentile
        profitThisPercentile = revenueThisPercentile - costThisPercentile
        cumulativeRevenue += revenueThisPercentile
        cumulativeCost += costThisPercentile
        cumulativeProfit += profitThisPercentile
        result.append((p, cumulativeRevenue, cumulativeCost, cumulativeProfit, revenueThisPercentile, costThisPercentile, profitThisPercentile))
    return result
def plotProfitCurve(numberOfPercentiles=10, figSize=7, avgCostPerContact=15, avgRevenuePerContact=75, yUnit=1000.0):
    """Draw an interactive (mpld3) profit curve for every model in the
    module-level `finalPredictions` dict and return the plotted series.

    Depends on module-level names: finalPredictions, y_test, plt, mpld3,
    re, log, sortTestSetByProbability, calculateProfit.
    NOTE(review): Python 2 style (`dict.iteritems`) — needs updating for Python 3.

    Args:
        numberOfPercentiles: bucket count for calculateProfit; cast to int
            so interact() sliders may pass floats.
        figSize: figure width/height in inches (also cast to int).
        avgCostPerContact: assumed cost per contacted customer (USD).
        avgRevenuePerContact: assumed revenue per successful sale (USD).
        yUnit: divisor for the y axis (e.g. 1000.0 plots profit in kUSD).

    Returns:
        dict keyed by short model name with the plotted/exported series.
    """
    # Define some CSS to control our custom labels
    css = """
table {
border-collapse: collapse;
}
th {
color: #ffffff;
background-color: #000000;
}
td {
padding: 2px;
background-color: #cccccc;
}
table, th, td {
font-family:Arial, Helvetica, sans-serif;
border: 1px solid black;
text-align: right;
}
"""
    jsonProfitData = {}
    profitCurveFigure, ax = plt.subplots(1, figsize=(int(figSize),int(figSize)))
    ax.grid(True, alpha=0.3)
    # Unlike the lift curve, profit can be negative, so autoscale the y axis.
    ax.set_autoscaley_on(True)
    for modelName, probs in finalPredictions.iteritems():
        # First whitespace-separated token serves as the short model name.
        modelNameShort = re.split("\s+", modelName)[0]
        # Probability of the positive class for each test instance.
        y_probs = [x[1] for x in finalPredictions[modelName]['test']]
        sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
        profits = calculateProfit(sortedProbsAndTrueLabels,
                                  avgCostPerContact=avgCostPerContact,
                                  avgRevenuePerContact=avgRevenuePerContact,
                                  nrPercentiles=int(numberOfPercentiles),
                                  posLabel=1)
        xPerc = [x[0]*100 for x in profits]
        # Cumulative profit, scaled to the requested y unit.
        yProfit = [x[3]/yUnit for x in profits]
        jsonProfitData[modelNameShort] = {}
        # NOTE(review): exporting the full sorted test set can make the JSON
        # payload large — confirm the D3 front-end actually needs it.
        jsonProfitData[modelNameShort]['sortedProbsAndTrueLabels'] = sortedProbsAndTrueLabels
        jsonProfitData[modelNameShort]['xPerc'] = [x for x in xPerc]
        jsonProfitData[modelNameShort]['yProfit'] = [y for y in yProfit]
        jsonProfitData[modelNameShort]['cumProfit'] = [p[3] for p in profits]
        jsonProfitData[modelNameShort]['intervalProfit'] = [p[6] for p in profits]
        points = ax.plot(xPerc, yProfit, 'o-', label="%s" % modelNameShort)
        # One HTML tooltip per point: population %, cumulative and interval profit.
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>cum. profit (USD)</td><td>%.0f</td></tr><tr><td>interval profit (USD)</td><td>%.0f</td></tr></table>" % (modelNameShort, xPerc[i],profits[i][3], profits[i][6]) for i in range(len(yProfit))]
        mpld3.plugins.connect(profitCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
        # print "%s: " % modelNameShort, profits
    # Font sizes and padding scale gently (logarithmically) with figure size.
    ax.set_title("Profit curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(int(figSize)))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    ax.set_ylabel("Profit (in %d USD)" % int(yUnit), labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    plt.legend(loc=1)
    plt.show()
    return jsonProfitData
# Persist the profit curve (yUnit=1, i.e. absolute USD) for the D3.js visualization.
jsonProfitData = plotProfitCurve(avgCostPerContact=15, avgRevenuePerContact=50, numberOfPercentiles=100, figSize=9, yUnit=1)
with open('d3/ProfitCurve.json', 'w') as profitJsonFile:
    profitJsonFile.write(json.dumps(jsonProfitData))
# Interactive notebook widget: vary percentiles, figure size, cost and revenue.
# NOTE(review): IPython.html.widgets moved to the separate `ipywidgets`
# package in later IPython/Jupyter versions — update the import if upgrading.
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotProfitCurve(avgCostPerContact=10, avgRevenuePerContact=50, numberOfPercentiles=20, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
interact(plotProfitCurve, numberOfPercentiles=(10,50), figSize=(5,10), avgCostPerContact=(10,50), avgRevenuePerContact=(20,100))
<function __main__.plotProfitCurve>