bank-additional.csv
is already in our repo, so there is no need to download the data from the UCI websiteimport pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bank-additional.csv'
bank = pd.read_csv(url, sep=';')
bank.head()
age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | blue-collar | married | basic.9y | no | yes | no | cellular | may | fri | ... | 2 | 999 | 0 | nonexistent | -1.8 | 92.893 | -46.2 | 1.313 | 5099.1 | no |
1 | 39 | services | single | high.school | no | no | no | telephone | may | fri | ... | 4 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.855 | 5191.0 | no |
2 | 25 | services | married | high.school | no | yes | no | telephone | jun | wed | ... | 1 | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.962 | 5228.1 | no |
3 | 38 | services | married | basic.9y | no | unknown | unknown | telephone | jun | fri | ... | 3 | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.959 | 5228.1 | no |
4 | 47 | admin. | married | university.degree | no | yes | no | cellular | nov | mon | ... | 1 | 999 | 0 | nonexistent | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | no |
5 rows × 21 columns
# list all columns (for reference)
bank.columns
Index([u'age', u'job', u'marital', u'education', u'default', u'housing', u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign', u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx', u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y'], dtype='object')
# convert the response to numeric values and store as a new column
bank['outcome'] = bank.y.map({'no':0, 'yes':1})
%matplotlib inline
# probably not a great feature
bank.boxplot(column='age', by='outcome')
<matplotlib.axes._subplots.AxesSubplot at 0xc1c3160>
# looks like a useful feature
bank.groupby('job').outcome.mean()
job admin. 0.131423 blue-collar 0.069005 entrepreneur 0.054054 housemaid 0.100000 management 0.092593 retired 0.228916 self-employed 0.081761 services 0.089059 student 0.231707 technician 0.115774 unemployed 0.171171 unknown 0.102564 Name: outcome, dtype: float64
# create job_dummies (we will add it to the bank DataFrame later)
job_dummies = pd.get_dummies(bank.job, prefix='job')
job_dummies.drop(job_dummies.columns[0], axis=1, inplace=True)
# looks like a useful feature
bank.groupby('default').outcome.mean()
default no 0.121267 unknown 0.061021 yes 0.000000 Name: outcome, dtype: float64
# but only one person in the dataset has a status of yes
bank.default.value_counts()
no 3315 unknown 803 yes 1 dtype: int64
# so, let's treat this as a 2-class feature rather than a 3-class feature
bank['default'] = bank.default.map({'no':0, 'unknown':1, 'yes':1})
# looks like a useful feature
bank.groupby('contact').outcome.mean()
contact cellular 0.141403 telephone 0.051806 Name: outcome, dtype: float64
# convert the feature to numeric values
bank['contact'] = bank.contact.map({'cellular':0, 'telephone':1})
# looks like a useful feature at first glance
bank.groupby('month').outcome.mean()
month apr 0.167442 aug 0.100629 dec 0.545455 jul 0.082982 jun 0.128302 mar 0.583333 may 0.065312 nov 0.096413 oct 0.362319 sep 0.406250 Name: outcome, dtype: float64
# but, it looks like their success rate is actually just correlated with number of calls
# thus, the month feature is unlikely to generalize
bank.groupby('month').outcome.agg(['count', 'mean']).sort('count')
count | mean | |
---|---|---|
month | ||
dec | 22 | 0.545455 |
mar | 48 | 0.583333 |
sep | 64 | 0.406250 |
oct | 69 | 0.362319 |
apr | 215 | 0.167442 |
nov | 446 | 0.096413 |
jun | 530 | 0.128302 |
aug | 636 | 0.100629 |
jul | 711 | 0.082982 |
may | 1378 | 0.065312 |
# looks like an excellent feature, but you can't know the duration of a call beforehand, thus it can't be used in your model
bank.boxplot(column='duration', by='outcome')
<matplotlib.axes._subplots.AxesSubplot at 0xc32e438>
# looks like a useful feature
bank.groupby('previous').outcome.mean()
previous 0 0.082884 1 0.208421 2 0.410256 3 0.600000 4 0.714286 5 1.000000 6 0.500000 Name: outcome, dtype: float64
# looks like a useful feature
bank.groupby('poutcome').outcome.mean()
poutcome failure 0.147577 nonexistent 0.082884 success 0.647887 Name: outcome, dtype: float64
# create poutcome_dummies
poutcome_dummies = pd.get_dummies(bank.poutcome, prefix='poutcome')
poutcome_dummies.drop(poutcome_dummies.columns[0], axis=1, inplace=True)
# concatenate bank DataFrame with job_dummies and poutcome_dummies
bank = pd.concat([bank, job_dummies, poutcome_dummies], axis=1)
# looks like an excellent feature
bank.boxplot(column='euribor3m', by='outcome')
<matplotlib.axes._subplots.AxesSubplot at 0xc553668>
# new list of columns (including dummy columns)
bank.columns
Index([u'age', u'job', u'marital', u'education', u'default', u'housing', u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign', u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx', u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y', u'outcome', u'job_blue-collar', u'job_entrepreneur', u'job_housemaid', u'job_management', u'job_retired', u'job_self-employed', u'job_services', u'job_student', u'job_technician', u'job_unemployed', u'job_unknown', u'poutcome_nonexistent', u'poutcome_success'], dtype='object')
# create X (including 13 dummy columns)
feature_cols = ['default', 'contact', 'previous', 'euribor3m'] + list(bank.columns[-13:])
X = bank[feature_cols]
# create y
y = bank.outcome
# calculate cross-validated AUC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
logreg = LogisticRegression(C=1e9)
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()
0.75566564072331199