import pymysql
import credentials
db = pymysql.connect(host=credentials.host,
user=credentials.user,
passwd=credentials.password,
db='bank')
cursor = db.cursor()
cursor.execute("SELECT * FROM bank_full LIMIT 3")
for i in cursor.fetchall():
print i
cursor.close()
db.close()
import pymysql
import credentials
import pandas.io.sql as psql
db = pymysql.connect(host=credentials.host,
user=credentials.user,
passwd=credentials.password,
db='bank')
query = "SELECT * FROM bank_full;"
bankDF = psql.read_sql(query, con=db)
db.close()
print "Number of database records read: %d" % len(bankDF)
#print bankDF.columns
bankDF.head()
import pandas as pd
import numpy as np
tmpDF = bankDF.copy()
tmpDF.y.replace(to_replace=['yes','no'], value=[1,0], inplace=True)
# Delete the field(s) unknown before the call is made
nonAvailableFields = ['duration']
for field in nonAvailableFields:
tmpDF.drop(field, axis=1, inplace=True)
# Key: categorical field name
# Value: list of abbreviation and post-dummification redundant field
categoricalFields = {'job':['job','unknown'],
'marital':['mar','unknown'],
'education':['ed','unknown'],
'bank_def':['def','unknown'],
'housing':['h','unknown'],
'loan':['loan','unknown'],
'contact':['cntct','telephone'],
'contact_month':['cntct',None], # Because 'jan' and 'feb' are missing in the data, despite being plausible
'day_of_week':['dow','fri'], # Even though 'sat' and 'sun' are missing, these are not plausible values
'poutcome':['pout','nonexistent'],
}
# Convert all categorical fields in dummy numerical ones
# AND delete one dummified field because it contains redundant information
for (categoricalField, value) in categoricalFields.iteritems():
(prefix, dummifiedRedundantField) = value
tmpDF = pd.concat([tmpDF, pd.get_dummies(tmpDF[categoricalField], prefix=prefix)], axis=1)
if dummifiedRedundantField:
tmpDF.drop("%s_%s" % (prefix, dummifiedRedundantField), axis=1, inplace=True)
# of course, also delete the categorical fields
for (categoricalField,_) in categoricalFields.iteritems():
tmpDF.drop(categoricalField, axis=1, inplace=True)
bankDF2 = tmpDF.copy()
#print bankDF2.columns
bankDF2.head()
%matplotlib inline
import sys
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from matplotlib import pyplot as plt
from math import log10
from time import sleep
X_values = bankDF2.copy()
X_values.drop('y', axis=1, inplace=True)
y_values = bankDF2['y']
# Scale the X values
X_values_scaled = preprocessing.scale(X_values.astype(float), copy=False)
scorerType = 'roc_auc'
nrCrossValidationFolds=10
# We MUST shuffle, because the data seem to be somehow ordered by date,
# resulting in grouped rows of similar socio-economic value combinations!!!
cvGenerator = KFold(len(X_values), n_folds=nrCrossValidationFolds, shuffle=True)
bestLogCValue = None
bestKValue = None
bestNrEst = None
bestMaxFeat = None
print "%d columns:" % len(X_values.columns), [str(c) for c in X_values.columns]
print "%d values:" % len(X_values_scaled[0]), X_values_scaled[0]
from sklearn.linear_model import LogisticRegression
cValues = [pow(10,(x-10)) for x in range(20)]
#print "cValues = ", cValues
# selects weights inversely proportional to class frequencies in the training set
# See http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#classWeight = 'auto'
classWeight = None
estimators = []
scoreList = []
for i in range(len(cValues)):
c = cValues[i]
print >> sys.stderr, "(%d/%d) Building logres model for C value %1.15f ..." % (i+1,len(cValues),c),
estimator = LogisticRegression(C=c,class_weight=classWeight)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
estimators.append(estimator)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best logres model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestCValue = cValues[bestModelIndex]
bestLogCValue = log10(bestCValue)
print "C value: ", bestCValue
print "log10(C) value: ", bestLogCValue
fig = plt.figure()
plt.plot([log10(c) for c in cValues],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([log10(c) for c in cValues], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith logres model using various C values")
plt.xlabel("log10(C)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
from sklearn.neighbors import KNeighborsClassifier
kValues = range(1,31)
#kValues = range(25,31) # In order to gain some processing time
estimators = []
scoreList = []
for i in range(len(kValues)):
k = kValues[i]
print >> sys.stderr, "(%d/%d) Building %d nearest neighbor model ..." % (i+1,len(kValues),k),
estimator = KNeighborsClassifier(n_neighbors=k)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
estimators.append(estimator)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best KNN model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestKValue = kValues[bestModelIndex]
print "K value: ", bestKValue
fig = plt.figure()
plt.plot([k for k in kValues],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([k for k in kValues], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith K nearest neighbor model using various K values")
plt.xlabel("K")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
from sklearn.ensemble import RandomForestClassifier
import itertools
from math import log
nrEstimators = [pow(2,x) for x in range(7,9)]
maxFeatures = [pow(2,x) for x in range(1,int(log(len(X_values.columns.values.tolist()))+2))]
rfConfigurations = list(itertools.product(*[nrEstimators,maxFeatures]))
print "configurations: ", rfConfigurations
scoreList = []
for i in range(len(rfConfigurations)):
nrEst, maxFeat = rfConfigurations[i]
print >> sys.stderr, "(%d/%d) Building random forest model with %d estimators and maximum %d features ..." % (i+1,len(rfConfigurations),nrEst,maxFeat),
estimator = RandomForestClassifier(n_estimators=nrEst, max_features=maxFeat)
print >> sys.stderr, "applying it ...",
scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
scoreList.append((scores.mean(),scores.std()))
print >> sys.stderr, "done"
sleep(1)
print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]
bestModelIndex = meanScores.index(max(meanScores))
print "best Random Forest model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestNrEst, bestMaxFeat = rfConfigurations[bestModelIndex]
print "n_estimators: ", bestNrEst
print "max_features: ", bestMaxFeat
fig = plt.figure()
plt.plot([conf for conf in range(1,len(rfConfigurations)+1)],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([conf for conf in range(1,len(rfConfigurations)+1)], [x[0]-x[1] for x in scoreList],
[x[0]+x[1] for x in scoreList], alpha=0.1,
color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith Random Forest model using various configurations")
plt.xlabel("Configuration ID (nr estimators, max features)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
# After-the-fact hardcoded optimizations, in case we want to skip the experimentation phase
# (e.g. during a presentation)
if bestLogCValue == None:
bestLogCValue = -1
if bestKValue == None:
bestKValue = 30
if bestNrEst == None or bestMaxFeat == None:
bestNrEst = 256
bestMaxFeat = 8
models = {}
models = {'logres with log10(C)=%d' % bestLogCValue : LogisticRegression(C=pow(10,bestLogCValue)), # Takes about 1 second elapsed time
'knn with K=%d' % bestKValue : KNeighborsClassifier(n_neighbors=bestKValue), # Takes about 12 seconds elapsed time
'gaussianNB': GaussianNB(),
'baseline' : DummyClassifier(strategy='stratified')
# 'randomforest with nrEst=%d and maxFeat=%d' % (bestNrEst,bestMaxFeat): RandomForestClassifier(n_estimators=bestNrEst, max_features=bestMaxFeat) # Takes about 5 seconds elapsed time
}
fig = plt.figure(1,(9,6))
plt.title("%d-fold cross-validation %s scores for various model types" % (nrCrossValidationFolds, scorerType))
plt.xlabel("Fold #")
plt.ylabel(scorerType)
plt.grid()
for modelName, model in models.items():
print >> sys.stderr, "Building %s model ..." % modelName,
print >> sys.stderr, "applying it ...",
scores = cross_val_score(model, X_values, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1, verbose=10)
print >> sys.stderr, "done"
plt.plot(range(1,nrCrossValidationFolds+1), scores, 'o-', label="%s (%2.2f%% +/- %2.2f%%)" % (modelName, scores.mean() * 100, scores.std() * 100))
plt.legend(loc='best')
plt.show()
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import mpld3
mpld3.enable_notebook()
import sys
from matplotlib import pyplot as plt
import re
from math import log
import json
import copy
# We don't use cross validation here, but will generate a random test set instead
X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size=0.1, random_state=0)
finalPredictions = {}
for modelName, model in models.items():
fittedModel = model.fit(X_train, y_train)
if hasattr(model, "predict_proba"):
print >> sys.stderr, "Predicting probabilities for %s model ..." % modelName,
finalPredictions[modelName] = {'train': model.predict_proba(X_train),
'test': model.predict_proba(X_test)}
print >> sys.stderr, "done"
def plotROCCurve(figSize=7):
# Define some CSS to control our custom labels
css = """
table {
border-collapse: collapse;
}
th {
color: #ffffff;
background-color: #000000;
}
td {
padding: 2px;
background-color: #cccccc;
}
table, th, td {
font-family:Arial, Helvetica, sans-serif;
border: 1px solid black;
text-align: right;
}
"""
jsonROCData = {}
rocCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
ax.grid(True, alpha=0.3)
for modelName, probs in finalPredictions.iteritems():
modelNameShort = re.split("\s+", modelName)[0]
# if modelNameShort not in ['knn','baseline']:
# continue
y_probs = [x[1] for x in finalPredictions[modelName]['test']]
fpr, tpr, thresholds = roc_curve(y_true=y_test,
y_score=y_probs, pos_label=1)
# print "%s: fpr (size = %d)" % (modelNameShort, len(fpr)), fpr
# print "tpr (size = %d)" % len(tpr), tpr
# print "thresholds (size = %d)" % len(thresholds), thresholds
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs)
jsonROCData[modelNameShort] = {}
jsonROCData[modelNameShort]['fpr'] = [x for x in fpr]
jsonROCData[modelNameShort]['tpr'] = [ x for x in tpr]
jsonROCData[modelNameShort]['thresholds'] = [x for x in thresholds]
jsonROCData[modelNameShort]['roc_auc'] = roc_auc
points = plt.plot(fpr, tpr, 'x-', label="%s (AUC = %1.2f%%)" % (modelName, roc_auc*100))
labels = ["
%s | FPR | %0.1f%% |
TPR | %0.1f%% |
threshold | %0.4f |
" % (modelNameShort, fpr[i]*100, tpr[i]*100, thresholds[i]) for i in range(len(fpr))]
mpld3.plugins.connect(rocCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
ax.set_title("ROC curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
ax.set_xlabel("False Positive Rate (FP/FP+TN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
ax.set_ylabel("True Positive Rate (TP/TP+FN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
plt.legend(loc="best")
plt.show()
return jsonROCData
# Export data to JSON file for visualization in D3.js or similar
jsonROCData = plotROCCurve(figSize=9)
with open('d3/ROCCurve.json', 'w') as outfile:
json.dump(jsonROCData, outfile)
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotROCCurve(figSize=9), "plotROCCurveBanking.html", figid="ROC_Curve_Banking")
interact(plotROCCurve, figSize=(5,10))
import numpy as np
import mpld3
mpld3.enable_notebook()
import re
from math import log
def sortTestSetByProbability(y_test, y_probs):
sortedIndexList = sorted(range(len(y_probs)), key=lambda k: y_probs[k], reverse=True)
return [(y_probs[i], y_test[i], i) for i in sortedIndexList]
def calculateLift(sortedProbsAndTrueLabels, nrPercentiles=10, posLabel=1):
''' Input is a list of tuples containing (prob, true label), sorted in descending order per prob
Output is a a list of nrPercentiles tuples containing, for each percentile:
(percentOfTargetPopulationInThisPercentile,
percentOfPositivePopulationInThisPercentile,
cumulativeLift,
localLift
)
'''
numberOfPositiveInstances = len([x for x in sortedProbsAndTrueLabels if x[1] == posLabel])
percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array([x[1] for x in sortedProbsAndTrueLabels]), nrPercentiles)]
# print "len(percentileSplitOfInstances) = ", len(percentileSplitOfInstances)
# print "percentileSplitOfInstances[0:2] = ", percentileSplitOfInstances[0:2]
result = [(0.0,0.0,0.0,0.0)]
cumulativePercentage = 0.0
cumulativeLift = 0
localLift = 0
for c in range(len(percentileSplitOfInstances)):
percentile = percentileSplitOfInstances[c]
nrPositivesInPercentile = len([x for x in percentile if x == posLabel])
p = float(c+1)/float(nrPercentiles)
cumulativePercentage += float(nrPositivesInPercentile)/float(numberOfPositiveInstances)
cumulativeLift = cumulativePercentage/p
localLift = (float(nrPositivesInPercentile)/float(numberOfPositiveInstances))/(1.0/float(nrPercentiles))
result.append((p,cumulativePercentage,cumulativeLift, localLift))
return result
def plotLiftCurve(numberOfPercentiles=10, figSize=5):
# Define some CSS to control our custom labels
css = """
table {
border-collapse: collapse;
}
th {
color: #ffffff;
background-color: #000000;
}
td {
padding: 2px;
background-color: #cccccc;
}
table, th, td {
font-family:Arial, Helvetica, sans-serif;
border: 1px solid black;
text-align: right;
}
"""
jsonLiftData = {}
liftCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
ax.grid(True, alpha=0.3)
ax.set_autoscaley_on(False)
ax.set_ylim([0,100])
for modelName, probs in finalPredictions.iteritems():
modelNameShort = re.split("\s+", modelName)[0]
y_probs = [x[1] for x in finalPredictions[modelName]['test']]
sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
lifts = calculateLift(sortedProbsAndTrueLabels, nrPercentiles=numberOfPercentiles, posLabel=1)
# print modelName, lifts
xPerc = [x[0]*100 for x in lifts]
yPerc = [x[1]*100 for x in lifts]
jsonLiftData[modelNameShort] = {}
jsonLiftData[modelNameShort]['xPerc'] = [x for x in xPerc]
jsonLiftData[modelNameShort]['yPerc'] = [y for y in yPerc]
jsonLiftData[modelNameShort]['cumLift'] = [l[2] for l in lifts]
jsonLiftData[modelNameShort]['localLift'] = [l[3] for l in lifts]
points = ax.plot(xPerc, yPerc, 'o-', label="%s" % modelNameShort)
labels = ["%s | top-scored | %2.1f%% |
yields | %2.1f%% |
cum. lift | %1.1f |
local lift | %1.1f |
" % (modelNameShort, xPerc[i],yPerc[i], lifts[i][2], lifts[i][3]) for i in range(len(yPerc))]
mpld3.plugins.connect(liftCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
ax.set_title("Lift curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
ax.set_ylabel("Percentage of positive instances targeted", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
plt.legend(loc=4)
plt.show()
return jsonLiftData
# Export data to JSON file for visualization in D3.js or similar
jsonLiftData = plotLiftCurve(numberOfPercentiles=100, figSize=9)
with open('d3/LiftCurve.json', 'w') as outfile:
json.dump(jsonLiftData, outfile)
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotLiftCurve(numberOfPercentiles=10, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
interact(plotLiftCurve, numberOfPercentiles=(5, 20), figSize=(5,10))
def calculateProfit(sortedProbsAndTrueLabels,
nrPercentiles=20,
posLabel=1,
avgCostPerContact=10,
avgRevenuePerContact=50
):
''' Input is a list of tuples containing (prob, avg cost per contact, avg revenue per contact,
number of percentiles, true label), sorted in descending order per prob
Output is a list containing tuples of the form
(percentOfTargetPopulationInThisPercentile,
cumulative revenue,
cumulative cost,
cumulative profit,
revenue for this percentile,
cost for this percentile,
profit for this percentile)
'''
assert(avgCostPerContact >= 0.0), "avgCostPerContact is expressed as a positive float"
assert(avgRevenuePerContact >= 0.0), "avgRevenuePerContact is expressed as a positive float"
numberOfPositiveInstances = len([x for x in sortedProbsAndTrueLabels if x[1] == posLabel])
percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array([x[1] for x in sortedProbsAndTrueLabels]), nrPercentiles)]
# print "len(percentileSplitOfInstances) = ", len(percentileSplitOfInstances)
# print "percentileSplitOfInstances[0:2] = ", percentileSplitOfInstances[0:2]
result = [(0.0,0.0,0.0,0.0,0.0,0.0,0.0)]
cumulativeCost = 0.0
cumulativeRevenue = 0.0
cumulativeProfit = 0.0
for c in range(len(percentileSplitOfInstances)):
percentile = percentileSplitOfInstances[c]
nrPositivesInPercentile = len([x for x in percentile if x == posLabel])
nrNegativesInPercentile = len(percentile) - nrPositivesInPercentile
assert(nrPositivesInPercentile + nrNegativesInPercentile == len(percentile))
assert(len(percentile) > 0)
p = float(c+1)/float(nrPercentiles)
costThisPercentile = avgCostPerContact * (nrPositivesInPercentile + nrNegativesInPercentile)
revenueThisPercentile = avgRevenuePerContact * nrPositivesInPercentile
profitThisPercentile = revenueThisPercentile - costThisPercentile
cumulativeRevenue = cumulativeRevenue + revenueThisPercentile
cumulativeCost = cumulativeCost + costThisPercentile
cumulativeProfit = cumulativeProfit + profitThisPercentile
result.append((p, cumulativeRevenue, cumulativeCost, cumulativeProfit, revenueThisPercentile, costThisPercentile, profitThisPercentile))
return result
def plotProfitCurve(numberOfPercentiles=10, figSize=7, avgCostPerContact=15, avgRevenuePerContact=75, yUnit=1000.0):
# Define some CSS to control our custom labels
css = """
table {
border-collapse: collapse;
}
th {
color: #ffffff;
background-color: #000000;
}
td {
padding: 2px;
background-color: #cccccc;
}
table, th, td {
font-family:Arial, Helvetica, sans-serif;
border: 1px solid black;
text-align: right;
}
"""
jsonProfitData = {}
profitCurveFigure, ax = plt.subplots(1, figsize=(int(figSize),int(figSize)))
ax.grid(True, alpha=0.3)
ax.set_autoscaley_on(True)
for modelName, probs in finalPredictions.iteritems():
modelNameShort = re.split("\s+", modelName)[0]
y_probs = [x[1] for x in finalPredictions[modelName]['test']]
sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
profits = calculateProfit(sortedProbsAndTrueLabels,
avgCostPerContact=avgCostPerContact,
avgRevenuePerContact=avgRevenuePerContact,
nrPercentiles=int(numberOfPercentiles),
posLabel=1)
xPerc = [x[0]*100 for x in profits]
yProfit = [x[3]/yUnit for x in profits]
jsonProfitData[modelNameShort] = {}
jsonProfitData[modelNameShort]['sortedProbsAndTrueLabels'] = sortedProbsAndTrueLabels
jsonProfitData[modelNameShort]['xPerc'] = [x for x in xPerc]
jsonProfitData[modelNameShort]['yProfit'] = [y for y in yProfit]
jsonProfitData[modelNameShort]['cumProfit'] = [p[3] for p in profits]
jsonProfitData[modelNameShort]['intervalProfit'] = [p[6] for p in profits]
points = ax.plot(xPerc, yProfit, 'o-', label="%s" % modelNameShort)
labels = ["%s | top-scored | %2.1f%% |
cum. profit (USD) | %.0f |
interval profit (USD) | %.0f |
" % (modelNameShort, xPerc[i],profits[i][3], profits[i][6]) for i in range(len(yProfit))]
mpld3.plugins.connect(profitCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
# print "%s: " % modelNameShort, profits
ax.set_title("Profit curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(int(figSize)))
ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
ax.set_ylabel("Profit (in %d USD)" % int(yUnit), labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
plt.legend(loc=1)
plt.show()
return jsonProfitData
# Export data to JSON file for visualization in D3.js or similar
jsonProfitData = plotProfitCurve(avgCostPerContact=15, avgRevenuePerContact=50, numberOfPercentiles=100, figSize=9, yUnit=1)
with open('d3/ProfitCurve.json', 'w') as outfile:
json.dump(jsonProfitData, outfile)
from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotProfitCurve(avgCostPerContact=10, avgRevenuePerContact=50, numberOfPercentiles=20, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
interact(plotProfitCurve, numberOfPercentiles=(10,50), figSize=(5,10), avgCostPerContact=(10,50), avgRevenuePerContact=(20,100))