import pymysql
import credentials

db = pymysql.connect(host=credentials.host,
                     user=credentials.user,
                     passwd=credentials.password,
                     db='bank')

cursor = db.cursor()

cursor.execute("SELECT * FROM bank_full LIMIT 3")
for i in cursor.fetchall():
    print i

cursor.close()
db.close()

import pymysql
import credentials
import pandas.io.sql as psql
db = pymysql.connect(host=credentials.host,
                     user=credentials.user,
                     passwd=credentials.password,
                     db='bank')
query = "SELECT * FROM bank_full;"
bankDF = psql.read_sql(query, con=db)
db.close()
print "Number of database records read: %d" % len(bankDF)
#print bankDF.columns
bankDF.head()

import pandas as pd
import numpy as np
# Work on a copy so the frame read from the database stays untouched.
tmpDF = bankDF.copy()
# Encode the target: 'yes' -> 1, 'no' -> 0.  The result is assigned back
# explicitly; an in-place replace on a selected column (tmpDF.y.replace(...,
# inplace=True)) may operate on a temporary copy and leave tmpDF unchanged.
tmpDF['y'] = tmpDF['y'].replace(to_replace=['yes','no'], value=[1,0])

# Delete the field(s) unknown before the call is made
nonAvailableFields = ['duration']
tmpDF = tmpDF.drop(nonAvailableFields, axis=1)

# Key: categorical field name
# Value: list of abbreviation and post-dummification redundant field
# NOTE(review): 'contact_month' shares the 'cntct' prefix with 'contact', so
# month dummies are named cntct_<month>.  Looks like a copy/paste leftover;
# confirm before renaming because it changes the resulting column names.
categoricalFields = {'job':['job','unknown'],
                     'marital':['mar','unknown'],
                     'education':['ed','unknown'],
                     'bank_def':['def','unknown'],
                     'housing':['h','unknown'],
                     'loan':['loan','unknown'],
                     'contact':['cntct','telephone'],
                     'contact_month':['cntct',None], # Because 'jan' and 'feb' are missing in the data, despite being plausible
                     'day_of_week':['dow','fri'], # Even though 'sat' and 'sun' are missing, these are not plausible values
                     'poutcome':['pout','nonexistent'],
                           }
# Replace every categorical field by its dummy (0/1) columns, leaving out one
# dummy per field because it only carries redundant information.
for categoricalField, (prefix, dummifiedRedundantField) in categoricalFields.items():
    dummies = pd.get_dummies(tmpDF[categoricalField], prefix=prefix)
    if dummifiedRedundantField:
        dummies = dummies.drop("%s_%s" % (prefix, dummifiedRedundantField), axis=1)
    # Drop the original categorical column and append its dummies at the end.
    tmpDF = pd.concat([tmpDF.drop(categoricalField, axis=1), dummies], axis=1)

bankDF2 = tmpDF.copy()
#print bankDF2.columns
bankDF2.head()

%matplotlib inline
import sys
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from matplotlib import pyplot as plt
from math import log10
from time import sleep

X_values = bankDF2.copy()
X_values.drop('y', axis=1, inplace=True)
y_values = bankDF2['y']

# Scale the X values
X_values_scaled = preprocessing.scale(X_values.astype(float), copy=False)

scorerType = 'roc_auc'

nrCrossValidationFolds=10
# We MUST shuffle, because the data seem to be somehow ordered by date,
# resulting in grouped rows of similar socio-economic value combinations!!!
cvGenerator = KFold(len(X_values), n_folds=nrCrossValidationFolds, shuffle=True)  

bestLogCValue = None
bestKValue = None
bestNrEst = None
bestMaxFeat = None

print "%d columns:" % len(X_values.columns), [str(c) for c in X_values.columns]
print "%d values:" % len(X_values_scaled[0]), X_values_scaled[0]

from sklearn.linear_model import LogisticRegression

cValues = [pow(10,(x-10)) for x in range(20)]
#print "cValues = ", cValues 

# selects weights inversely proportional to class frequencies in the training set
# See http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#classWeight = 'auto'
classWeight = None

estimators = []
scoreList = []
for i in range(len(cValues)):
    c = cValues[i]
    print >> sys.stderr, "(%d/%d) Building logres model for C value %1.15f ..." % (i+1,len(cValues),c),
    estimator = LogisticRegression(C=c,class_weight=classWeight)
    print >> sys.stderr, "applying it ...",
    scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
    estimators.append(estimator)
    scoreList.append((scores.mean(),scores.std()))
    print >> sys.stderr, "done"
sleep(1)

print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]

bestModelIndex = meanScores.index(max(meanScores))
print "best logres model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestCValue = cValues[bestModelIndex]
bestLogCValue = log10(bestCValue)
print "C value: ", bestCValue
print "log10(C) value: ", bestLogCValue

fig = plt.figure()
plt.plot([log10(c) for c in cValues],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([log10(c) for c in cValues], [x[0]-x[1] for x in scoreList],
                 [x[0]+x[1] for x in scoreList], alpha=0.1,
                 color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith logres model using various C values")
plt.xlabel("log10(C)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()

from sklearn.neighbors import KNeighborsClassifier

kValues = range(1,31)
#kValues = range(25,31) # In order to gain some processing time

estimators = []
scoreList = []
for i in range(len(kValues)):
    k = kValues[i]
    print >> sys.stderr, "(%d/%d) Building %d nearest neighbor model ..." % (i+1,len(kValues),k),
    estimator = KNeighborsClassifier(n_neighbors=k)
    print >> sys.stderr, "applying it ...",
    scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
    estimators.append(estimator)
    scoreList.append((scores.mean(),scores.std()))
    print >> sys.stderr, "done"
sleep(1)

print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]

bestModelIndex = meanScores.index(max(meanScores))
print "best KNN model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestKValue = kValues[bestModelIndex]
print "K value: ", bestKValue

fig = plt.figure()
plt.plot([k for k in kValues],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([k for k in kValues], [x[0]-x[1] for x in scoreList],
                 [x[0]+x[1] for x in scoreList], alpha=0.1,
                 color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith K nearest neighbor model using various K values")
plt.xlabel("K")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()

from sklearn.ensemble import RandomForestClassifier
import itertools
from math import log

nrEstimators = [pow(2,x) for x in range(7,9)]
maxFeatures = [pow(2,x) for x in range(1,int(log(len(X_values.columns.values.tolist()))+2))]

rfConfigurations = list(itertools.product(*[nrEstimators,maxFeatures]))
print "configurations: ", rfConfigurations

scoreList = []
for i in range(len(rfConfigurations)):
    nrEst, maxFeat = rfConfigurations[i]
    print >> sys.stderr, "(%d/%d) Building random forest model with %d estimators and maximum %d features ..." % (i+1,len(rfConfigurations),nrEst,maxFeat),
    estimator = RandomForestClassifier(n_estimators=nrEst, max_features=maxFeat)
    print >> sys.stderr, "applying it ...",
    scores = cross_val_score(estimator, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1)
    scoreList.append((scores.mean(),scores.std()))
    print >> sys.stderr, "done"
sleep(1)

print "scoreList = ", scoreList
meanScores = [x[0] for x in scoreList]

bestModelIndex = meanScores.index(max(meanScores))
print "best Random Forest model has:"
print "%s score: %2.2f%% (+/- %2.2f%%)" % (scorerType, scoreList[bestModelIndex][0] * 100, scoreList[bestModelIndex][1] * 100)
bestNrEst, bestMaxFeat = rfConfigurations[bestModelIndex]
print "n_estimators: ", bestNrEst
print "max_features: ", bestMaxFeat

fig = plt.figure()
plt.plot([conf for conf in range(1,len(rfConfigurations)+1)],[x[0] for x in scoreList], 'o-', color='g', label='mean')
plt.fill_between([conf for conf in range(1,len(rfConfigurations)+1)], [x[0]-x[1] for x in scoreList],
                 [x[0]+x[1] for x in scoreList], alpha=0.1,
                 color="g", label='standard deviation')
plt.title("Classification score on subscription prediction\nwith Random Forest model using various configurations")
plt.xlabel("Configuration ID (nr estimators, max features)")
plt.ylabel(scorerType)
plt.legend(loc='best')
plt.show()

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier

# After-the-fact hardcoded optimizations, in case we want to skip the experimentation phase
# (e.g. during a presentation)
if bestLogCValue is None:
    bestLogCValue = -1
if bestKValue is None:
    bestKValue = 30
if bestNrEst is None or bestMaxFeat is None:
    bestNrEst = 256
    bestMaxFeat = 8

# Final candidates, keyed by a display name whose first word is the short
# model name.
models = {'logres with log10(C)=%d' % bestLogCValue : LogisticRegression(C=pow(10,bestLogCValue)), # Takes about 1 second elapsed time
          'knn with K=%d' % bestKValue : KNeighborsClassifier(n_neighbors=bestKValue), # Takes about 12 seconds elapsed time
          'gaussianNB': GaussianNB(),
          'baseline' : DummyClassifier(strategy='stratified')
#          'randomforest with nrEst=%d and maxFeat=%d' % (bestNrEst,bestMaxFeat): RandomForestClassifier(n_estimators=bestNrEst, max_features=bestMaxFeat) # Takes about 5 seconds elapsed time
          }

fig = plt.figure(1,(9,6))
plt.title("%d-fold cross-validation %s scores for various model types" % (nrCrossValidationFolds, scorerType))
plt.xlabel("Fold #")
plt.ylabel(scorerType)
plt.grid()

# One line per model: its score on each cross-validation fold.
for modelName, model in models.items():
    print >> sys.stderr, "Building %s model ..." % modelName,
    print >> sys.stderr, "applying it ...",
    # Evaluate on the standardised features: C and K above were selected on
    # X_values_scaled, and both logistic regression and KNN are sensitive to
    # feature scale, so scoring them on the raw X_values would not reflect
    # the tuned models.
    scores = cross_val_score(model, X_values_scaled, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1, verbose=10)
    print >> sys.stderr, "done"
    plt.plot(range(1,nrCrossValidationFolds+1), scores, 'o-', label="%s (%2.2f%% +/- %2.2f%%)" % (modelName, scores.mean() * 100, scores.std() * 100))

plt.legend(loc='best')
plt.show()

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import mpld3
mpld3.enable_notebook()
import sys
from matplotlib import pyplot as plt
import re
from math import log
import json
import copy

# We don't use cross validation here, but will generate a random test set instead.
# The split is taken from the standardised features, because the model
# hyper-parameters were selected on X_values_scaled and logistic regression
# and KNN are sensitive to feature scale.
X_train, X_test, y_train, y_test = train_test_split(X_values_scaled, y_values, test_size=0.1, random_state=0)

# modelName -> {'train': class probabilities, 'test': class probabilities}
finalPredictions = {}

for modelName, model in models.items():
    fittedModel = model.fit(X_train, y_train)

    # Only models that can produce class probabilities are kept for plotting.
    if hasattr(model, "predict_proba"):
        print >> sys.stderr, "Predicting probabilities for %s model ..." % modelName,
        finalPredictions[modelName] = {'train': model.predict_proba(X_train),
                                       'test': model.predict_proba(X_test)}
        print >> sys.stderr, "done"
        
def plotROCCurve(figSize=7):
    """Plot the test-set ROC curve of every model in finalPredictions.

    Reads the module-level finalPredictions and y_test.  Each curve point
    gets an mpld3 HTML tooltip showing FPR, TPR and threshold.

    figSize: width and height of the (square) figure.

    Returns a dict keyed by the first word of each model name, holding the
    lists 'fpr', 'tpr', 'thresholds' and the scalar 'roc_auc'.
    """
    # Define some CSS to control our custom labels
    css = """
table {
  border-collapse: collapse;
}
th {
  color: #ffffff;
  background-color: #000000;
}
td {
  padding: 2px;
  background-color: #cccccc;
}
table, th, td {
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""

    jsonROCData = {}
    rocCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    # .items() works on both Python 2 and 3 and matches the other loops over
    # model dictionaries in this file.
    for modelName, predictions in finalPredictions.items():
        modelNameShort = re.split(r"\s+", modelName)[0]
        # Column 1 of the predict_proba output is used as the score of
        # class 1 (the label passed as pos_label below).
        y_probs = [x[1] for x in predictions['test']]
        fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                         y_score=y_probs, pos_label=1)
        roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs)
        jsonROCData[modelNameShort] = {}
        jsonROCData[modelNameShort]['fpr'] = list(fpr)
        jsonROCData[modelNameShort]['tpr'] = list(tpr)
        jsonROCData[modelNameShort]['thresholds'] = list(thresholds)
        jsonROCData[modelNameShort]['roc_auc'] = roc_auc
        # Draw on the axes created above, like the lift and profit plots do,
        # instead of relying on pyplot's implicit current axes.
        points = ax.plot(fpr, tpr, 'x-', label="%s (AUC = %1.2f%%)" % (modelName, roc_auc*100))
        labels = ["<table><th colspan='2'>%s</th><tr><td>FPR</td><td>%0.1f%%</td></tr><tr><td>TPR</td><td>%0.1f%%</td></tr><tr><td>threshold</td><td>%0.4f</td></tr></table>" % (modelNameShort, fpr[i]*100, tpr[i]*100, thresholds[i]) for i in range(len(fpr))]
        mpld3.plugins.connect(rocCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("ROC curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("False Positive Rate (FP/FP+TN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("True Positive Rate (TP/TP+FN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc="best")
    plt.show()
    return jsonROCData

# Export data to JSON file for visualization in D3.js or similar
# Note: plotROCCurve() also draws the figure as a side effect, and open()
# does not create the 'd3' directory -- it has to exist already.
jsonROCData = plotROCCurve(figSize=9)
with open('d3/ROCCurve.json', 'w') as outfile:
    json.dump(jsonROCData, outfile)

from IPython.html.widgets import interact, fixed

#mpld3.save_html(plotROCCurve(figSize=9), "plotROCCurveBanking.html", figid="ROC_Curve_Banking")
# Interactive version of the plot; figSize is offered as the range 5..10
# (presumably rendered as an integer slider -- see the IPython widgets docs).
interact(plotROCCurve, figSize=(5,10))

import numpy as np
import mpld3
mpld3.enable_notebook()
import re
from math import log

def sortTestSetByProbability(y_test, y_probs):
    """Pair each probability with its true label, highest probability first.

    y_test:  true labels, in the same order as y_probs (list, numpy array or
             pandas Series).
    y_probs: predicted probabilities, one per test instance.

    Returns a list of (probability, true label, original position) tuples
    sorted by decreasing probability; ties keep their original order.
    """
    # Materialise the labels so they are always read by position.  Indexing a
    # pandas Series with an int is label-based, which raises KeyError or
    # returns the wrong row once the index has been shuffled by a train/test
    # split.
    trueLabels = list(y_test)
    sortedIndexList = sorted(range(len(y_probs)), key=lambda k: y_probs[k], reverse=True)
    return [(y_probs[i], trueLabels[i], i) for i in sortedIndexList]

def calculateLift(sortedProbsAndTrueLabels, nrPercentiles=10, posLabel=1):
    ''' Compute cumulative and local lift per percentile bucket.

        Input is a list of tuples whose first two items are (prob, true label),
        sorted in descending order per prob; any further items are ignored.
        The labels are cut into nrPercentiles consecutive buckets of (nearly)
        equal size.

        Output is a list of nrPercentiles + 1 tuples: a leading all-zero entry
        (the origin of the curve) followed by, for each bucket:
        (fractionOfTargetPopulationUpToThisBucket,
         cumulativeFractionOfPositivesUpToThisBucket,
         cumulativeLift,
         localLift
         )
        When the input contains no positive instance at all, the positive
        fractions and both lifts are reported as 0.0 instead of dividing by
        zero.
    '''
    trueLabels = [x[1] for x in sortedProbsAndTrueLabels]
    numberOfPositiveInstances = len([label for label in trueLabels if label == posLabel])
    percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array(trueLabels), nrPercentiles)]
    result = [(0.0,0.0,0.0,0.0)]
    cumulativePercentage = 0.0
    for c, percentile in enumerate(percentileSplitOfInstances):
        nrPositivesInPercentile = len([x for x in percentile if x == posLabel])
        # Fraction of the population covered once this bucket is included.
        p = float(c+1)/float(nrPercentiles)
        if numberOfPositiveInstances:
            shareOfPositives = float(nrPositivesInPercentile)/float(numberOfPositiveInstances)
        else:
            # No positives anywhere: there is nothing to capture.
            shareOfPositives = 0.0
        cumulativePercentage += shareOfPositives
        cumulativeLift = cumulativePercentage/p
        # Relative to the share a bucket of this size would capture at random.
        localLift = shareOfPositives/(1.0/float(nrPercentiles))
        result.append((p,cumulativePercentage,cumulativeLift, localLift))
    return result

def plotLiftCurve(numberOfPercentiles=10, figSize=5):
    """Plot the cumulative gains ("lift") curve of every model in finalPredictions.

    Reads the module-level finalPredictions and y_test.  Each curve point
    gets an mpld3 HTML tooltip showing the targeted share, the yield and the
    cumulative and local lift.

    numberOfPercentiles: number of buckets the scored test set is cut into.
    figSize:             width and height of the (square) figure.

    Returns a dict keyed by the first word of each model name, holding the
    lists 'xPerc', 'yPerc', 'cumLift' and 'localLift'.
    """
    # Define some CSS to control our custom labels
    css = """
table {
  border-collapse: collapse;
}
th {
  color: #ffffff;
  background-color: #000000;
}
td {
  padding: 2px;
  background-color: #cccccc;
}
table, th, td {
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""
    jsonLiftData = {}
    liftCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    # The y axis is a percentage, so pin it to 0..100.
    ax.set_autoscaley_on(False)
    ax.set_ylim([0,100])
    # .items() works on both Python 2 and 3 and matches the other loops over
    # model dictionaries in this file.
    for modelName, predictions in finalPredictions.items():
        modelNameShort = re.split(r"\s+", modelName)[0]
        # Column 1 of the predict_proba output is used as the score of
        # class 1 (the label passed as posLabel below).
        y_probs = [x[1] for x in predictions['test']]
        sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
        lifts = calculateLift(sortedProbsAndTrueLabels, nrPercentiles=numberOfPercentiles, posLabel=1)
        xPerc = [x[0]*100 for x in lifts]
        yPerc = [x[1]*100 for x in lifts]
        jsonLiftData[modelNameShort] = {}
        jsonLiftData[modelNameShort]['xPerc'] = list(xPerc)
        jsonLiftData[modelNameShort]['yPerc'] = list(yPerc)
        jsonLiftData[modelNameShort]['cumLift'] = [l[2] for l in lifts]
        jsonLiftData[modelNameShort]['localLift'] = [l[3] for l in lifts]
        points = ax.plot(xPerc, yPerc, 'o-', label="%s" % modelNameShort)
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>yields</td><td>%2.1f%%</td></tr><tr><td>cum. lift</td><td>%1.1f</td></tr><tr><td>local lift</td><td>%1.1f</td></tr></table>" % (modelNameShort, xPerc[i],yPerc[i], lifts[i][2], lifts[i][3]) for i in range(len(yPerc))]
        mpld3.plugins.connect(liftCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("Lift curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("Percentage of positive instances targeted", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc=4)
    plt.show()
    return jsonLiftData

# Export data to JSON file for visualization in D3.js or similar
# Note: plotLiftCurve() also draws the figure as a side effect, and open()
# does not create the 'd3' directory -- it has to exist already.
jsonLiftData = plotLiftCurve(numberOfPercentiles=100, figSize=9)
with open('d3/LiftCurve.json', 'w') as outfile:
    json.dump(jsonLiftData, outfile)

from IPython.html.widgets import interact, fixed

#mpld3.save_html(plotLiftCurve(numberOfPercentiles=10, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
# Interactive version of the plot; each (min, max) tuple gives the range
# offered for that argument (presumably rendered as an integer slider --
# see the IPython widgets docs).
interact(plotLiftCurve, numberOfPercentiles=(5, 20), figSize=(5,10))

def calculateProfit(sortedProbsAndTrueLabels,
                    nrPercentiles=20,
                    posLabel=1,
                    avgCostPerContact=10,
                    avgRevenuePerContact=50
                    ):
    ''' Compute revenue, cost and profit per percentile bucket.

        sortedProbsAndTrueLabels is a list of tuples whose first two items are
        (prob, true label), sorted in descending order per prob.  The labels
        are cut into nrPercentiles consecutive buckets.  Every instance in a
        bucket costs avgCostPerContact; every instance whose label equals
        posLabel additionally brings in avgRevenuePerContact.

        Output is a list starting with an all-zero tuple, followed by one
        tuple per bucket of the form
        (fraction of the target population up to this bucket,
         cumulative revenue,
         cumulative cost,
         cumulative profit,
         revenue for this bucket,
         cost for this bucket,
         profit for this bucket)
    '''
    assert(avgCostPerContact >= 0.0), "avgCostPerContact is expressed as a positive float"
    assert(avgRevenuePerContact >= 0.0), "avgRevenuePerContact is expressed as a positive float"
    trueLabels = np.array([entry[1] for entry in sortedProbsAndTrueLabels])
    buckets = [chunk.tolist() for chunk in np.array_split(trueLabels, nrPercentiles)]

    # The curve starts at the origin: nobody contacted, nothing earned or spent.
    result = [(0.0,0.0,0.0,0.0,0.0,0.0,0.0)]
    cumulativeRevenue = cumulativeCost = cumulativeProfit = 0.0
    for bucketNr, bucket in enumerate(buckets, 1):
        # Every bucket must hold at least one instance.
        assert(len(bucket) > 0)
        nrPositives = sum(1 for label in bucket if label == posLabel)
        fractionContacted = float(bucketNr)/float(nrPercentiles)
        bucketCost = avgCostPerContact * len(bucket)
        bucketRevenue = avgRevenuePerContact * nrPositives
        bucketProfit = bucketRevenue - bucketCost
        cumulativeRevenue += bucketRevenue
        cumulativeCost += bucketCost
        cumulativeProfit += bucketProfit
        result.append((fractionContacted, cumulativeRevenue, cumulativeCost, cumulativeProfit,
                       bucketRevenue, bucketCost, bucketProfit))
    return result

def plotProfitCurve(numberOfPercentiles=10, figSize=7, avgCostPerContact=15, avgRevenuePerContact=75, yUnit=1000.0):
    """Plot the cumulative profit curve of every model in finalPredictions.

    Reads the module-level finalPredictions and y_test.  Each curve point
    gets an mpld3 HTML tooltip showing the targeted share and the cumulative
    and per-interval profit.

    numberOfPercentiles:  number of buckets the scored test set is cut into
                          (converted with int()).
    figSize:              width and height of the (square) figure (converted
                          with int()).
    avgCostPerContact:    cost charged for every contacted instance.
    avgRevenuePerContact: revenue earned for every positive instance.
    yUnit:                divisor applied to the plotted profit values.

    Returns a dict keyed by the first word of each model name, holding
    'sortedProbsAndTrueLabels', 'xPerc', 'yProfit', 'cumProfit' and
    'intervalProfit'.
    """
    # Define some CSS to control our custom labels
    css = """
table {
  border-collapse: collapse;
}
th {
  color: #ffffff;
  background-color: #000000;
}
td {
  padding: 2px;
  background-color: #cccccc;
}
table, th, td {
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""
    jsonProfitData = {}
    profitCurveFigure, ax = plt.subplots(1, figsize=(int(figSize),int(figSize)))
    ax.grid(True, alpha=0.3)
    ax.set_autoscaley_on(True)
    # .items() works on both Python 2 and 3 and matches the other loops over
    # model dictionaries in this file.
    for modelName, predictions in finalPredictions.items():
        modelNameShort = re.split(r"\s+", modelName)[0]
        # Column 1 of the predict_proba output is used as the score of
        # class 1 (the label passed as posLabel below).
        y_probs = [x[1] for x in predictions['test']]
        sortedProbsAndTrueLabels = sortTestSetByProbability(y_test,y_probs)
        profits = calculateProfit(sortedProbsAndTrueLabels,
                                  avgCostPerContact=avgCostPerContact,
                                  avgRevenuePerContact=avgRevenuePerContact,
                                  nrPercentiles=int(numberOfPercentiles),
                                  posLabel=1)
        xPerc = [x[0]*100 for x in profits]
        # Only the plotted values are scaled by yUnit; the exported and
        # tooltip profits stay unscaled.
        yProfit = [x[3]/yUnit for x in profits]
        jsonProfitData[modelNameShort] = {}
        jsonProfitData[modelNameShort]['sortedProbsAndTrueLabels'] = sortedProbsAndTrueLabels
        jsonProfitData[modelNameShort]['xPerc'] = list(xPerc)
        jsonProfitData[modelNameShort]['yProfit'] = list(yProfit)
        jsonProfitData[modelNameShort]['cumProfit'] = [p[3] for p in profits]
        jsonProfitData[modelNameShort]['intervalProfit'] = [p[6] for p in profits]
        points = ax.plot(xPerc, yProfit, 'o-', label="%s" % modelNameShort)
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>cum. profit (USD)</td><td>%.0f</td></tr><tr><td>interval profit (USD)</td><td>%.0f</td></tr></table>" % (modelNameShort, xPerc[i],profits[i][3], profits[i][6]) for i in range(len(yProfit))]
        mpld3.plugins.connect(profitCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("Profit curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(int(figSize)))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    ax.set_ylabel("Profit (in %d USD)" % int(yUnit), labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    plt.legend(loc=1)
    plt.show()
    return jsonProfitData

# Export data to JSON file for visualization in D3.js or similar
# Note: plotProfitCurve() also draws the figure as a side effect, and open()
# does not create the 'd3' directory -- it has to exist already.
# yUnit=1 keeps the plotted profit values unscaled.
jsonProfitData = plotProfitCurve(avgCostPerContact=15, avgRevenuePerContact=50, numberOfPercentiles=100, figSize=9, yUnit=1)
with open('d3/ProfitCurve.json', 'w') as outfile:
    json.dump(jsonProfitData, outfile)

from IPython.html.widgets import interact, fixed

#mpld3.save_html(plotProfitCurve(avgCostPerContact=10, avgRevenuePerContact=50, numberOfPercentiles=20, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
# Interactive version of the plot; each (min, max) tuple gives the range
# offered for that argument (presumably rendered as an integer slider --
# see the IPython widgets docs).  yUnit is not exposed, so it keeps its default.
interact(plotProfitCurve, numberOfPercentiles=(10,50), figSize=(5,10), avgCostPerContact=(10,50), avgRevenuePerContact=(20,100))