import pymysql import credentials db = pymysql.connect(host=credentials.host, user=credentials.user, passwd=credentials.password, db='bank') cursor = db.cursor() cursor.execute("SELECT * FROM bank_full LIMIT 3") for i in cursor.fetchall(): print i cursor.close() db.close() import pymysql import credentials import pandas.io.sql as psql db = pymysql.connect(host=credentials.host, user=credentials.user, passwd=credentials.password, db='bank') query = "SELECT * FROM bank_full;" bankDF = psql.read_sql(query, con=db) db.close() print "Number of database records read: %d" % len(bankDF) #print bankDF.columns bankDF.head() import pandas as pd import numpy as np tmpDF = bankDF.copy() tmpDF.y.replace(to_replace=['yes','no'], value=[1,0], inplace=True) # Delete the field(s) unknown before the call is made nonAvailableFields = ['duration'] for field in nonAvailableFields: tmpDF.drop(field, axis=1, inplace=True) # Key: categorical field name # Value: list of abbreviation and post-dummification redundant field categoricalFields = {'job':['job','unknown'], 'marital':['mar','unknown'], 'education':['ed','unknown'], 'bank_def':['def','unknown'], 'housing':['h','unknown'], 'loan':['loan','unknown'], 'contact':['cntct','telephone'], 'contact_month':['cntct',None], # Because 'jan' and 'feb' are missing in the data, despite being plausible 'day_of_week':['dow','fri'], # Even though 'sat' and 'sun' are missing, these are not plausible values 'poutcome':['pout','nonexistent'], } # Convert all categorical fields in dummy numerical ones # AND delete one dummified field because it contains redundant information for (categoricalField, value) in categoricalFields.iteritems(): (prefix, dummifiedRedundantField) = value tmpDF = pd.concat([tmpDF, pd.get_dummies(tmpDF[categoricalField], prefix=prefix)], axis=1) if dummifiedRedundantField: tmpDF.drop("%s_%s" % (prefix, dummifiedRedundantField), axis=1, inplace=True) # of course, also delete the categorical fields for (categoricalField,_) in categoricalFields.iteritems(): tmpDF.drop(categoricalField, axis=1, inplace=True) bankDF2 = tmpDF.copy() #print bankDF2.columns bankDF2.head() %matplotlib inline import sys from sklearn.cross_validation import cross_val_score, KFold from sklearn.metrics import roc_auc_score from sklearn import preprocessing from matplotlib import pyplot as plt from math import log10 from time import sleep X_values = bankDF2.copy() X_values.drop('y', axis=1, inplace=True) y_values = bankDF2['y'] # Scale the X values X_values_scaled = preprocessing.scale(X_values.astype(float), copy=False) scorerType = 'roc_auc' nrCrossValidationFolds=10 # We MUST shuffle, because the data seem to be somehow ordered by date, # resulting in grouped rows of similar socio-economic value combinations!!! cvGenerator = KFold(len(X_values), n_folds=nrCrossValidationFolds, shuffle=True) bestLogCValue = None bestKValue = None bestNrEst = None bestMaxFeat = None print "%d columns:" % len(X_values.columns), [str(c) for c in X_values.columns] print "%d values:" % len(X_values_scaled[0]), X_values_scaled[0] from sklearn.linear_model import LogisticRegression cValues = [pow(10,(x-10)) for x in range(20)] #print "cValues = ", cValues # selects weights inversely proportional to class frequencies in the training set # See http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html #classWeight = 'auto' classWeight = None estimators = [] scoreList = [] for i in range(len(cValues)): c = cValues[i] print >> sys.stderr, "(%d/%d) Building logres model for C value %1.15f ..." % (i+1,len(cValues),c), estimator = LogisticRegression(C=c,class_weight=classWeight) print >> sys.stderr, "applying it ...", scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1) estimators.append(estimator) scoreList.append((scores.mean(),scores.std())) print >> sys.stderr, "done" sleep(1) print "scoreList = ", scoreList meanScores = [x[0] for x in scoreList] bestModelIndex = meanScores.index(max(meanScores)) print "best logres model has:" print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100) bestCValue = cValues[bestModelIndex] bestLogCValue = log10(bestCValue) print "C value: ", bestCValue print "log10(C) value: ", bestLogCValue fig = plt.figure() plt.plot([log10(c) for c in cValues],[x[0] for x in scoreList], 'o-', color='g', label='mean') plt.fill_between([log10(c) for c in cValues], [x[0]-x[1] for x in scoreList], [x[0]+x[1] for x in scoreList], alpha=0.1, color="g", label='standard deviation') plt.title("Classification score on subscription prediction\nwith logres model using various C values") plt.xlabel("log10(C)") plt.ylabel(scorerType) plt.legend(loc='best') plt.show() from sklearn.neighbors import KNeighborsClassifier kValues = range(1,31) #kValues = range(25,31) # In order to gain some processing time estimators = [] scoreList = [] for i in range(len(kValues)): k = kValues[i] print >> sys.stderr, "(%d/%d) Building %d nearest neighbor model ..." % (i+1,len(kValues),k), estimator = KNeighborsClassifier(n_neighbors=k) print >> sys.stderr, "applying it ...", scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1) estimators.append(estimator) scoreList.append((scores.mean(),scores.std())) print >> sys.stderr, "done" sleep(1) print "scoreList = ", scoreList meanScores = [x[0] for x in scoreList] bestModelIndex = meanScores.index(max(meanScores)) print "best KNN model has:" print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100) bestKValue = kValues[bestModelIndex] print "K value: ", bestKValue fig = plt.figure() plt.plot([k for k in kValues],[x[0] for x in scoreList], 'o-', color='g', label='mean') plt.fill_between([k for k in kValues], [x[0]-x[1] for x in scoreList], [x[0]+x[1] for x in scoreList], alpha=0.1, color="g", label='standard deviation') plt.title("Classification score on subscription prediction\nwith K nearest neighbor model using various K values") plt.xlabel("K") plt.ylabel(scorerType) plt.legend(loc='best') plt.show() from sklearn.ensemble import RandomForestClassifier import itertools from math import log nrEstimators = [pow(2,x) for x in range(7,9)] maxFeatures = [pow(2,x) for x in range(1,int(log(len(X_values.columns.values.tolist()))+2))] rfConfigurations = list(itertools.product(*[nrEstimators,maxFeatures])) print "configurations: ", rfConfigurations scoreList = [] for i in range(len(rfConfigurations)): nrEst, maxFeat = rfConfigurations[i] print >> sys.stderr, "(%d/%d) Building random forest model with %d estimators and maximum %d features ..." % (i+1,len(rfConfigurations),nrEst,maxFeat), estimator = RandomForestClassifier(n_estimators=nrEst, max_features=maxFeat) print >> sys.stderr, "applying it ...", scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1) scoreList.append((scores.mean(),scores.std())) print >> sys.stderr, "done" sleep(1) print "scoreList = ", scoreList meanScores = [x[0] for x in scoreList] bestModelIndex = meanScores.index(max(meanScores)) print "best Random Forest model has:" print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100) bestNrEst, bestMaxFeat = rfConfigurations[bestModelIndex] print "n_estimators: ", bestNrEst print "max_features: ", bestMaxFeat fig = plt.figure() plt.plot([conf for conf in range(1,len(rfConfigurations)+1)],[x[0] for x in scoreList], 'o-', color='g', label='mean') plt.fill_between([conf for conf in range(1,len(rfConfigurations)+1)], [x[0]-x[1] for x in scoreList], [x[0]+x[1] for x in scoreList], alpha=0.1, color="g", label='standard deviation') plt.title("Classification score on subscription prediction\nwith Random Forest model using various configurations") plt.xlabel("Configuration ID (nr estimators, max features)") plt.ylabel(scorerType) plt.legend(loc='best') plt.show() from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB from sklearn.dummy import DummyClassifier # After-the-fact hardcoded optimizations, in case we want to skip the experimentation phase # (e.g. during a presentation) if bestLogCValue == None: bestLogCValue = -1 if bestKValue == None: bestKValue = 30 if bestNrEst == None or bestMaxFeat == None: bestNrEst = 256 bestMaxFeat = 8 models = {} models = {'logres with log10(C)=%d' % bestLogCValue : LogisticRegression(C=pow(10,bestLogCValue)), # Takes about 1 second elapsed time 'knn with K=%d' % bestKValue : KNeighborsClassifier(n_neighbors=bestKValue), # Takes about 12 seconds elapsed time 'gaussianNB': GaussianNB(), 'baseline' : DummyClassifier(strategy='stratified') # 'randomforest with nrEst=%d and maxFeat=%d' % (bestNrEst,bestMaxFeat): RandomForestClassifier(n_estimators=bestNrEst, max_features=bestMaxFeat) # Takes about 5 seconds elapsed time } fig = plt.figure(1,(9,6)) plt.title("%d-fold cross-validation %s scores for various model types" % (nrCrossValidationFolds, scorerType)) plt.xlabel("Fold #") plt.ylabel(scorerType) plt.grid() for modelName, model in models.items(): print >> sys.stderr, "Building %s model ..." % modelName, print >> sys.stderr, "applying it ...", scores = cross_val_score(model, X_values, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1, verbose=10) print >> sys.stderr, "done" plt.plot(range(1,nrCrossValidationFolds+1), scores, 'o-', label="%s (%2.2f%% +/- %2.2f%%)" % (modelName, scores.mean() * 100, scores.std() * 100)) plt.legend(loc='best') plt.show() from sklearn.cross_validation import train_test_split from sklearn.metrics import roc_curve, roc_auc_score import mpld3 mpld3.enable_notebook() import sys from matplotlib import pyplot as plt import re from math import log import json import copy # We don't use cross validation here, but will generate a random test set instead X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size=0.1, random_state=0) finalPredictions = {} for modelName, model in models.items(): fittedModel = model.fit(X_train, y_train) if hasattr(model, "predict_proba"): print >> sys.stderr, "Predicting probabilities for %s model ..." % modelName, finalPredictions[modelName] = {'train': model.predict_proba(X_train), 'test': model.predict_proba(X_test)} print >> sys.stderr, "done" def plotROCCurve(figSize=7): # Define some CSS to control our custom labels css = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #000000; } td { padding: 2px; background-color: #cccccc; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ jsonROCData = {} rocCurveFigure, ax = plt.subplots(figsize=(figSize,figSize)) ax.grid(True, alpha=0.3) for modelName, probs in finalPredictions.iteritems(): modelNameShort = re.split("\s+", modelName)[0] # if modelNameShort not in ['knn','baseline']: # continue y_probs = [x[1] for x in finalPredictions[modelName]['test']] fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_probs, pos_label=1) # print "%s: fpr (size = %d)" % (modelNameShort, len(fpr)), fpr # print "tpr (size = %d)" % len(tpr), tpr # print "thresholds (size = %d)" % len(thresholds), thresholds roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs) jsonROCData[modelNameShort] = {} jsonROCData[modelNameShort]['fpr'] = [x for x in fpr] jsonROCData[modelNameShort]['tpr'] = [ x for x in tpr] jsonROCData[modelNameShort]['thresholds'] = [x for x in thresholds] jsonROCData[modelNameShort]['roc_auc'] = roc_auc points = plt.plot(fpr, tpr, 'x-', label="%s (AUC = %1.2f%%)" % (modelName, roc_auc*100)) labels = ["
%s
FPR%0.1f%%
TPR%0.1f%%
threshold%0.4f
" % (modelNameShort, fpr[i]*100, tpr[i]*100, thresholds[i]) for i in range(len(fpr))] mpld3.plugins.connect(rocCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css)) ax.set_title("ROC curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize)) ax.set_xlabel("False Positive Rate (FP/FP+TN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize)) ax.set_ylabel("True Positive Rate (TP/TP+FN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize)) plt.legend(loc="best") plt.show() return jsonROCData # Export data to JSON file for visualization in D3.js or similar jsonROCData = plotROCCurve(figSize=9) with open('d3/ROCCurve.json', 'w') as outfile: json.dump(jsonROCData, outfile) from IPython.html.widgets import interact, fixed #mpld3.save_html(plotROCCurve(figSize=9), "plotROCCurveBanking.html", figid="ROC_Curve_Banking") interact(plotROCCurve, figSize=(5,10)) import numpy as np import mpld3 mpld3.enable_notebook() import re from math import log def sortTestSetByProbability(y_test, y_probs): sortedIndexList = sorted(range(len(y_probs)), key=lambda k: y_probs[k], reverse=True) return [(y_probs[i], y_test[i], i) for i in sortedIndexList] def calculateLift(sortedProbsAndTrueLabels, nrPercentiles=10, posLabel=1): ''' Input is a list of tuples containing (prob, true label), sorted in descending order per prob Output is a a list of nrPercentiles tuples containing, for each percentile: (percentOfTargetPopulationInThisPercentile, percentOfPositivePopulationInThisPercentile, cumulativeLift, localLift ) ''' numberOfPositiveInstances = len([x for x in sortedProbsAndTrueLabels if x[1] == posLabel]) percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array([x[1] for x in sortedProbsAndTrueLabels]), nrPercentiles)] # print "len(percentileSplitOfInstances) = ", len(percentileSplitOfInstances) # print "percentileSplitOfInstances[0:2] = ", percentileSplitOfInstances[0:2] result = [(0.0,0.0,0.0,0.0)] cumulativePercentage = 0.0 cumulativeLift = 0 localLift = 0 for c in range(len(percentileSplitOfInstances)): percentile = percentileSplitOfInstances[c] nrPositivesInPercentile = len([x for x in percentile if x == posLabel]) p = float(c+1)/float(nrPercentiles) cumulativePercentage += float(nrPositivesInPercentile)/float(numberOfPositiveInstances) cumulativeLift = cumulativePercentage/p localLift = (float(nrPositivesInPercentile)/float(numberOfPositiveInstances))/(1.0/float(nrPercentiles)) result.append((p,cumulativePercentage,cumulativeLift, localLift)) return result def plotLiftCurve(numberOfPercentiles=10, figSize=5): # Define some CSS to control our custom labels css = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #000000; } td { padding: 2px; background-color: #cccccc; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ jsonLiftData = {} liftCurveFigure, ax = plt.subplots(figsize=(figSize,figSize)) ax.grid(True, alpha=0.3) ax.set_autoscaley_on(False) ax.set_ylim([0,100]) for modelName, probs in finalPredictions.iteritems(): modelNameShort = re.split("\s+", modelName)[0] y_probs = [x[1] for x in finalPredictions[modelName]['test']] sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs) lifts = calculateLift(sortedProbsAndTrueLabels, nrPercentiles=numberOfPercentiles, posLabel=1) # print modelName, lifts xPerc = [x[0]*100 for x in lifts] yPerc = [x[1]*100 for x in lifts] jsonLiftData[modelNameShort] = {} jsonLiftData[modelNameShort]['xPerc'] = [x for x in xPerc] jsonLiftData[modelNameShort]['yPerc'] = [y for y in yPerc] jsonLiftData[modelNameShort]['cumLift'] = [l[2] for l in lifts] jsonLiftData[modelNameShort]['localLift'] = [l[3] for l in lifts] points = ax.plot(xPerc, yPerc, 'o-', label="%s" % modelNameShort) labels = ["
%s
top-scored%2.1f%%
yields%2.1f%%
cum. lift%1.1f
local lift%1.1f
" % (modelNameShort, xPerc[i],yPerc[i], lifts[i][2], lifts[i][3]) for i in range(len(yPerc))] mpld3.plugins.connect(liftCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css)) ax.set_title("Lift curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize)) ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(figSize), fontsize=12 + log(figSize)) ax.set_ylabel("Percentage of positive instances targeted", labelpad=15 + log(figSize), fontsize=12 + log(figSize)) plt.legend(loc=4) plt.show() return jsonLiftData # Export data to JSON file for visualization in D3.js or similar jsonLiftData = plotLiftCurve(numberOfPercentiles=100, figSize=9) with open('d3/LiftCurve.json', 'w') as outfile: json.dump(jsonLiftData, outfile) from IPython.html.widgets import interact, fixed #mpld3.save_html(plotLiftCurve(numberOfPercentiles=10, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking") interact(plotLiftCurve, numberOfPercentiles=(5, 20), figSize=(5,10)) def calculateProfit(sortedProbsAndTrueLabels, nrPercentiles=20, posLabel=1, avgCostPerContact=10, avgRevenuePerContact=50 ): ''' Input is a list of tuples containing (prob, avg cost per contact, avg revenue per contact, number of percentiles, true label), sorted in descending order per prob Output is a list containing tuples of the form (percentOfTargetPopulationInThisPercentile, cumulative revenue, cumulative cost, cumulative profit, revenue for this percentile, cost for this percentile, profit for this percentile) ''' assert(avgCostPerContact >= 0.0), "avgCostPerContact is expressed as a positive float" assert(avgRevenuePerContact >= 0.0), "avgRevenuePerContact is expressed as a positive float" numberOfPositiveInstances = len([x for x in sortedProbsAndTrueLabels if x[1] == posLabel]) percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array([x[1] for x in sortedProbsAndTrueLabels]), nrPercentiles)] # print "len(percentileSplitOfInstances) = ", len(percentileSplitOfInstances) # print "percentileSplitOfInstances[0:2] = ", percentileSplitOfInstances[0:2] result = [(0.0,0.0,0.0,0.0,0.0,0.0,0.0)] cumulativeCost = 0.0 cumulativeRevenue = 0.0 cumulativeProfit = 0.0 for c in range(len(percentileSplitOfInstances)): percentile = percentileSplitOfInstances[c] nrPositivesInPercentile = len([x for x in percentile if x == posLabel]) nrNegativesInPercentile = len(percentile) - nrPositivesInPercentile assert(nrPositivesInPercentile + nrNegativesInPercentile == len(percentile)) assert(len(percentile) > 0) p = float(c+1)/float(nrPercentiles) costThisPercentile = avgCostPerContact * (nrPositivesInPercentile + nrNegativesInPercentile) revenueThisPercentile = avgRevenuePerContact * nrPositivesInPercentile profitThisPercentile = revenueThisPercentile - costThisPercentile cumulativeRevenue = cumulativeRevenue + revenueThisPercentile cumulativeCost = cumulativeCost + costThisPercentile cumulativeProfit = cumulativeProfit + profitThisPercentile result.append((p, cumulativeRevenue, cumulativeCost, cumulativeProfit, revenueThisPercentile, costThisPercentile, profitThisPercentile)) return result def plotProfitCurve(numberOfPercentiles=10, figSize=7, avgCostPerContact=15, avgRevenuePerContact=75, yUnit=1000.0): # Define some CSS to control our custom labels css = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #000000; } td { padding: 2px; background-color: #cccccc; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ jsonProfitData = {} profitCurveFigure, ax = plt.subplots(1, figsize=(int(figSize),int(figSize))) ax.grid(True, alpha=0.3) ax.set_autoscaley_on(True) for modelName, probs in finalPredictions.iteritems(): modelNameShort = re.split("\s+", modelName)[0] y_probs = [x[1] for x in finalPredictions[modelName]['test']] sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs) profits = calculateProfit(sortedProbsAndTrueLabels, avgCostPerContact=avgCostPerContact, avgRevenuePerContact=avgRevenuePerContact, nrPercentiles=int(numberOfPercentiles), posLabel=1) xPerc = [x[0]*100 for x in profits] yProfit = [x[3]/yUnit for x in profits] jsonProfitData[modelNameShort] = {} jsonProfitData[modelNameShort]['sortedProbsAndTrueLabels'] = sortedProbsAndTrueLabels jsonProfitData[modelNameShort]['xPerc'] = [x for x in xPerc] jsonProfitData[modelNameShort]['yProfit'] = [y for y in yProfit] jsonProfitData[modelNameShort]['cumProfit'] = [p[3] for p in profits] jsonProfitData[modelNameShort]['intervalProfit'] = [p[6] for p in profits] points = ax.plot(xPerc, yProfit, 'o-', label="%s" % modelNameShort) labels = ["
%s
top-scored%2.1f%%
cum. profit (USD)%.0f
interval profit (USD)%.0f
" % (modelNameShort, xPerc[i],profits[i][3], profits[i][6]) for i in range(len(yProfit))] mpld3.plugins.connect(profitCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css)) # print "%s: " % modelNameShort, profits ax.set_title("Profit curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(int(figSize))) ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize))) ax.set_ylabel("Profit (in %d USD)" % int(yUnit), labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize))) plt.legend(loc=1) plt.show() return jsonProfitData # Export data to JSON file for visualization in D3.js or similar jsonProfitData = plotProfitCurve(avgCostPerContact=15, avgRevenuePerContact=50, numberOfPercentiles=100, figSize=9, yUnit=1) with open('d3/ProfitCurve.json', 'w') as outfile: json.dump(jsonProfitData, outfile) from IPython.html.widgets import interact, fixed #mpld3.save_html(plotProfitCurve(avgCostPerContact=10, avgRevenuePerContact=50, numberOfPercentiles=20, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking") interact(plotProfitCurve, numberOfPercentiles=(10,50), figSize=(5,10), avgCostPerContact=(10,50), avgRevenuePerContact=(20,100))