US Census Income Dataset

Data Exploration

In [76]:
# The CSV file has no header row, so specify the column names manually

cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
        'hours_per_week', 'country']
target = 'income'
In [77]:
# Read data

import pandas as pd
import numpy as np

data = pd.read_csv('data.csv', skipinitialspace=True, header=None,
                   names=cols + ['income'], na_values=['?'])
In [78]:
print(data.shape)
(48842, 15)
In [79]:
data.head()
Out[79]:
age workclass fnlwgt education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [80]:
data.describe()
Out[80]:
age fnlwgt education_num capital_gain capital_loss hours_per_week
count 48842.000000 4.884200e+04 48842.000000 48842.000000 48842.000000 48842.000000
mean 38.643585 1.896641e+05 10.078089 1079.067626 87.502314 40.422382
std 13.710510 1.056040e+05 2.570973 7452.019058 403.004552 12.391444
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 1.000000
25% 28.000000 1.175505e+05 9.000000 0.000000 0.000000 40.000000
50% 37.000000 1.781445e+05 10.000000 0.000000 0.000000 40.000000
75% 48.000000 2.376420e+05 12.000000 0.000000 0.000000 45.000000
max 90.000000 1.490400e+06 16.000000 99999.000000 4356.000000 99.000000
In [81]:
# Check for duplicate values
data[data.duplicated()].count()
Out[81]:
age               52
workclass         48
fnlwgt            52
education         52
education_num     52
marital_status    52
occupation        48
relationship      52
race              52
sex               52
capital_gain      52
capital_loss      52
hours_per_week    52
country           51
income            52
dtype: int64
In [82]:
# Remove duplicate values
data.drop_duplicates(inplace=True)
In [83]:
# Class distribution
data['income'].value_counts() / data['income'].count()
Out[83]:
<=50K    0.760586
>50K     0.239414
Name: income, dtype: float64
In [84]:
# Find missing value count
print(data.shape[0] - data.count())
age                  0
workclass         2795
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        2805
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
country            856
income               0
dtype: int64
In [85]:
print("Number of people with:")
print("* no occupation or workclass:",
      data[data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* an occupation but no workclass:",
      data[~data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* no occupation but a workclass:",
      data[data.occupation.isnull() & ~data.workclass.isnull()].shape[0])
Number of people with:
* no occupation or workclass: 2795
* an occupation but no workclass: 0
* no occupation but a workclass: 10

We can infer that missing values in occupation and workclass are related. A null workclass implies a null occupation. The converse is mostly true but not always.

In particular, there are 10 cases where occupation is null but workclass is not. Let's see what those cases are.

In [86]:
print(data[data.occupation.isnull() & ~data.workclass.isnull()][['occupation', 'workclass']])
      occupation     workclass
5361         NaN  Never-worked
10845        NaN  Never-worked
14772        NaN  Never-worked
20337        NaN  Never-worked
23232        NaN  Never-worked
32304        NaN  Never-worked
32314        NaN  Never-worked
41346        NaN  Never-worked
44168        NaN  Never-worked
46459        NaN  Never-worked
In [87]:
print(data[data['workclass'] == 'Never-worked'][['occupation', 'workclass']])
      occupation     workclass
5361         NaN  Never-worked
10845        NaN  Never-worked
14772        NaN  Never-worked
20337        NaN  Never-worked
23232        NaN  Never-worked
32304        NaN  Never-worked
32314        NaN  Never-worked
41346        NaN  Never-worked
44168        NaN  Never-worked
46459        NaN  Never-worked

We see that occupation is null and workclass is not null if and only if workclass is Never-worked.

Since we now know the occupation for these people, we can replace these null values by 'none'.

In [88]:
data.loc[data['workclass'] == 'Never-worked', 'occupation'] = 'none'

Data Cleaning

In [89]:
# Remove missing values

data.dropna(inplace=True)

# Remove redundant attributes

del data['education']
# use education_num instead

# Merge capital_loss and capital_gain

data['capital_profit'] = data['capital_gain'] - data['capital_loss']
del data['capital_gain']
del data['capital_loss']

# Transform income, sex and relationship

data['is_male'] = data['sex'].map({'Male': 1, 'Female': 0})
def to_spouse(s):
    return 'Spouse' if s in ('Husband', 'Wife') else s
data['relationship'] = data['relationship'].map(to_spouse)
del data['sex']
data['is_rich'] = data['income'].map({'<=50K': 0, '>50K': 1})
del data['income']

data.head()
Out[89]:
age workclass fnlwgt education_num marital_status occupation relationship race hours_per_week country capital_profit is_male is_rich
0 39 State-gov 77516 13 Never-married Adm-clerical Not-in-family White 40 United-States 2174 1 0
1 50 Self-emp-not-inc 83311 13 Married-civ-spouse Exec-managerial Spouse White 13 United-States 0 1 0
2 38 Private 215646 9 Divorced Handlers-cleaners Not-in-family White 40 United-States 0 1 0
3 53 Private 234721 7 Married-civ-spouse Handlers-cleaners Spouse Black 40 United-States 0 1 0
4 28 Private 338409 13 Married-civ-spouse Prof-specialty Spouse Black 40 Cuba 0 0 0
In [90]:
# Standardize categorical variables

import re
def stdize_name(s):
    return re.sub('[^0-9a-zA-Z]+', '_', s).lower()

multi_cat_cols = ['workclass', 'marital_status', 'occupation',
                  'relationship', 'race', 'country']
for col in multi_cat_cols:
    data[col] = data[col].map(stdize_name)

data.head()
Out[90]:
age workclass fnlwgt education_num marital_status occupation relationship race hours_per_week country capital_profit is_male is_rich
0 39 state_gov 77516 13 never_married adm_clerical not_in_family white 40 united_states 2174 1 0
1 50 self_emp_not_inc 83311 13 married_civ_spouse exec_managerial spouse white 13 united_states 0 1 0
2 38 private 215646 9 divorced handlers_cleaners not_in_family white 40 united_states 0 1 0
3 53 private 234721 7 married_civ_spouse handlers_cleaners spouse black 40 united_states 0 1 0
4 28 private 338409 13 married_civ_spouse prof_specialty spouse black 40 cuba 0 0 0
In [91]:
# One-hot encode

ohdata = pd.DataFrame(data[['age', 'fnlwgt', 'education_num', 'hours_per_week',
                'capital_profit', 'is_male']])

for col in multi_cat_cols:
    values = data[col].unique()
    for value in values:
        ohdata[col + '_' + value] = data[col].map(lambda x: 1 if x == value else 0)

ohdata['is_rich'] = data['is_rich']
ohdata.head()
Out[91]:
age fnlwgt education_num hours_per_week capital_profit is_male workclass_state_gov workclass_self_emp_not_inc workclass_private workclass_federal_gov ... country_scotland country_trinadad_tobago country_greece country_nicaragua country_vietnam country_hong country_ireland country_hungary country_holand_netherlands is_rich
0 39 77516 13 40 2174 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 50 83311 13 13 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 38 215646 9 40 0 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 53 234721 7 40 0 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 28 338409 13 40 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 88 columns
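
For reference, pandas can produce an equivalent one-hot encoding in a single call with get_dummies; a minimal sketch, not used in this notebook (the column names match the loop above because the categories were already standardized, though column order and dtypes differ slightly):

ohdata_alt = pd.get_dummies(data, columns=multi_cat_cols)
# Each column in multi_cat_cols becomes a set of 0/1 indicator columns,
# e.g. 'workclass' -> 'workclass_private', 'workclass_state_gov', ...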

Classification

Decision Tree

In [92]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

def my_cross_val_score(clf, X, y):
    fold_generator = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)
    return cross_val_score(clf, X, y, cv=fold_generator, n_jobs=-1)

def print_cv_accuracy(clf, X, y):
    accs = my_cross_val_score(clf, X, y)
    print("{}-fold CV accuracy: {} % ± {} %".format(5,
        100 * accs.mean(), 100 * accs.std()))

def my_tts_score(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                            test_size=0.2, random_state=3)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

def print_tts_accuracy(clf, X, y):
    acc = my_tts_score(clf, X, y)
    print("TTS accuracy = {} %".format(acc))
In [93]:
y = ohdata['is_rich']
X = ohdata[ohdata.columns[:-1]]

from sklearn.tree import DecisionTreeClassifier

print_cv_accuracy(DecisionTreeClassifier(), X, y)
5-fold CV accuracy: 81.14640971213582 % ± 0.22269343742340633 %
In [94]:
dtree = DecisionTreeClassifier()
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count
Out[94]:
(56, 13453)

By default, scikit-learn grows completely unpruned trees. Let's try pre-pruning. (Scikit-learn doesn't yet support post-pruning.)

We will build decision trees for a range of gini impurity thresholds and plot how the accuracy varies. This will help us find the optimal amount of pruning: enough for the tree to generalize and avoid overfitting, but not so much that it loses the complexity it needs.

We will also check whether certain features can be eliminated. From the feature descriptions, we suspect that fnlwgt and relationship may not be appropriate attributes for classification.

We'll do the pre-pruning and the feature selection at the same time.
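
As an aside, the threshold sweep below could also be run as a grid search for a single feature set; a minimal sketch, assuming the same min_impurity_split parameter used in this notebook (it was later deprecated and then removed in favor of min_impurity_decrease in newer scikit-learn releases):

from sklearn.model_selection import GridSearchCV

# Sketch: search a handful of impurity thresholds with 5-fold CV (accuracy scoring).
param_grid = {'min_impurity_split': [0.30, 0.32, 0.34, 0.36, 0.38]}
search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)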

In [208]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [171]:
# Get different features

cols = [list(ohdata.columns[:-1]) for i in range(4)]
cols[1].remove('fnlwgt')
cols[3].remove('fnlwgt')
for value in data.relationship.unique():
    cols[2].remove('relationship_' + value)
    cols[3].remove('relationship_' + value)
Xs = [X[cols[i]] for i in range(4)]
In [176]:
sns.reset_orig()
In [173]:
primary_colors = ['blue', 'red', 'green', 'darkmagenta']
secondary_colors = ['cyan', 'yellow', 'lightgreen', 'magenta']

def plot_acc_vs_impurity(Xs, minimp, maximp, intervals=10, stddev_factor=1):
    # Plot mean CV accuracy against the impurity threshold, for intervals + 1
    # evenly spaced thresholds from minimp to maximp (inclusive).
    impurities = [minimp + (maximp - minimp) * x / intervals
                  for x in range(intervals + 1)]
    for i, X in enumerate(Xs):
        maccs = []
        maccsa = []
        maccsb = []
        for impurity in impurities:
            dtree = DecisionTreeClassifier(min_impurity_split=impurity)
            accs = my_cross_val_score(dtree, X, y)
            maccs.append(accs.mean())
            maccsa.append(accs.mean() + accs.std() * stddev_factor)
            maccsb.append(accs.mean() - accs.std() * stddev_factor)
        print(i)  # progress indicator: index of the feature set being evaluated
        plt.plot(impurities, maccs, primary_colors[i])
        line, = plt.plot(impurities, maccsa, secondary_colors[i])
        plt.setp(line, linestyle='dashed')
        line, = plt.plot(impurities, maccsb, secondary_colors[i])
        plt.setp(line, linestyle='dashed')

Now we'll plot impurity threshold vs. accuracy for all four feature sets and see which gives the highest accuracy. We'll also plot the standard deviation envelopes of the accuracies as dashed lines in a lighter shade.

Here is the legend:

  • blue - don't remove any features.
  • red - remove fnlwgt.
  • green - remove relationship_*.
  • dark magenta - remove both.
In [177]:
plot_acc_vs_impurity(Xs, 0, 0.4, 15)
0
1
2
3

We observe that removing fnlwgt consistently leads to better results, and removing both is almost always best. However, the difference in accuracy is small compared to the standard deviation, so removing these attributes is neither clearly beneficial nor entirely futile.

So we'll just remove fnlwgt and work with the rest of the dataset from now on.

In [188]:
X = Xs[1]

With some trial-and-error, we reach this graph:

In [139]:
plot_acc_vs_impurity(Xs, 0.33, 0.36405, 15, 0.5)
0
1
2
3

It can be observed that the best value of the gini threshold is around 0.36.

In [189]:
min_impurity = 0.362
In [194]:
print_cv_accuracy(DecisionTreeClassifier(), X, y)
5-fold CV accuracy: 82.06043696207871 % ± 0.4544900108918053 %
In [193]:
print_cv_accuracy(DecisionTreeClassifier(min_impurity_split=min_impurity), X, y)
5-fold CV accuracy: 84.28462961115588 % ± 0.26754219466852136 %
In [146]:
dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count
Out[146]:
(50, 3977)
In [197]:
# Important features

def print_importance_data(col_names, clf, multi_cat_cols):
    fi = sorted(zip(col_names, clf.feature_importances_),
        key=(lambda x: x[1]), reverse=True)
    for i, feature_importance in enumerate(fi[:10]):
        feature, importance = feature_importance
        print('{}. {}:\t{}'.format(i + 1, feature.ljust(20), importance))
        #print('"{}",'.format(feature))
    print()
    print("Contribution of 10 most important features:",
        sum([x[1] for x in fi[:10]]))
    print()
    for col in multi_cat_cols:
        print("Contribution of {}: {}".format(col,
            sum([x[1] for x in fi if x[0].startswith(col)])))

dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
print_importance_data(X.columns, dtree, multi_cat_cols)
1. relationship_spouse :	0.3923705247185067
2. education_num       :	0.1954857338380243
3. capital_profit      :	0.14421800888283753
4. age                 :	0.09488034673168079
5. hours_per_week      :	0.0568920613684147
6. occupation_exec_managerial:	0.009780342719594739
7. is_male             :	0.00973332071558035
8. workclass_self_emp_not_inc:	0.008733986317293108
9. occupation_prof_specialty:	0.007061736902897098
10. workclass_private   :	0.006292473084223405

Contribution of 10 most important features: 0.925448535279

Contribution of workclass: 0.030517863897657312
Contribution of marital_status: 0.0
Contribution of occupation: 0.05348753968646164
Contribution of relationship: 0.3923705247185067
Contribution of race: 0.011036308367738718
Contribution of country: 0.011378291793097876

Ensemble Classifiers

In [200]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf_list = [ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier]
clf_to_name = {clf_type: clf_type.__name__[:-len('Classifier')]
             for clf_type in clf_list}
name_to_accs = {clf_name: [] for clf_name in clf_to_name.values()}
ns = [10, 40, 100]

for n in ns:
    for clf_type, clf_name in clf_to_name.items():
        clf = clf_type(n_estimators=n)
        accs = my_cross_val_score(clf, X, y)
        name_to_accs[clf_name].append(100 * accs.mean())

pd.DataFrame(name_to_accs, index=ns)
Out[200]:
AdaBoost ExtraTrees RandomForest
10 84.835671 82.399027 83.799936
40 85.585917 82.582726 84.191658
100 86.086087 82.662379 84.289037
In [201]:
# Find feature importances

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)
1. age                 :	0.23305355529735855
2. capital_profit      :	0.14523852278454322
3. education_num       :	0.13031006001210027
4. hours_per_week      :	0.11357968849076419
5. marital_status_married_civ_spouse:	0.06838558228936456
6. relationship_spouse :	0.06110261758208114
7. marital_status_never_married:	0.022509801267175354
8. occupation_exec_managerial:	0.02015727195879177
9. is_male             :	0.01757340850957281
10. occupation_prof_specialty:	0.01569921585932996

Contribution of 10 most important features: 0.827609724051

Contribution of workclass: 0.03644562231521494
Contribution of marital_status: 0.10218251322325284
Contribution of occupation: 0.08715169852717543
Contribution of relationship: 0.0950343530796143
Contribution of race: 0.014603455499365158
Contribution of country: 0.024827122261038316
In [202]:
# Stats about random forest

depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)
Average depth: 54.11
Average nodes: 15528.66
In [203]:
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)
1. capital_profit      :	0.54
2. education_num       :	0.09
3. age                 :	0.08
4. hours_per_week      :	0.03
5. occupation_other_service:	0.02
6. relationship_not_in_family:	0.02
7. is_male             :	0.01
8. workclass_self_emp_not_inc:	0.01
9. workclass_federal_gov:	0.01
10. workclass_self_emp_inc:	0.01

Contribution of 10 most important features: 0.82

Contribution of workclass: 0.03
Contribution of marital_status: 0.03
Contribution of occupation: 0.10999999999999999
Contribution of relationship: 0.03
Contribution of race: 0.01
Contribution of country: 0.04
In [204]:
# Stats about AdaBoost

depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)
Average depth: 1.0
Average nodes: 3.0
In [205]:
print(clf.estimator_weights_)
print(clf.estimator_errors_)
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[ 0.24791413  0.31866208  0.40381495  0.40799864  0.45967988  0.41367252
  0.46037128  0.46599978  0.45934324  0.4838581   0.45498294  0.47854952
  0.45846815  0.47023524  0.49454372  0.48934183  0.491371    0.4912983
  0.49415499  0.48888573  0.4815515   0.48720579  0.49140986  0.48781078
  0.47444548  0.466832    0.49155993  0.49084052  0.47422785  0.48462521
  0.49122316  0.49443611  0.49120826  0.4910786   0.48164671  0.48266937
  0.48448382  0.49111735  0.48036037  0.48889572  0.49453881  0.48937479
  0.49624664  0.48010991  0.48347391  0.49632824  0.49484594  0.49252391
  0.49256823  0.49043583  0.49584646  0.48507159  0.49377603  0.49866603
  0.49572367  0.48867822  0.49604731  0.49448118  0.49208686  0.4927865
  0.49678316  0.49384536  0.49487045  0.49447138  0.49506091  0.49627152
  0.4903995   0.48619787  0.49832104  0.49647675  0.4985203   0.49889239
  0.49901818  0.49598562  0.49651015  0.49462048  0.49392358  0.49309441
  0.49273712  0.49369469  0.49416888  0.49482084  0.49484625  0.49420075
  0.49372265  0.49447959  0.4950211   0.4993276   0.49850616  0.49780793
  0.49790584  0.49478888  0.49743099  0.4955966   0.49778702  0.49478478
  0.49907324  0.49462968  0.49742011  0.49456702]

We find that several of the most important features are consistently the same (although not in the same order), independent of the classifier used to find them.

Binning

We need to bin the data to apply association rule mining and naive Bayes. Let's analyze the distribution of the continuous variables so that we can bin them appropriately.

In [151]:
def get_rich_and_poor(data, condition):
    if condition is None:
        rich_people = data[data['is_rich'] == 1]
        poor_people = data[data['is_rich'] == 0]
    else:
        rich_people = data[condition & (data['is_rich'] == 1)]
        poor_people = data[condition & (data['is_rich'] == 0)]
    return (rich_people, poor_people)

def draw_stacked_hist(data, bins, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    n, bins, patches = plt.hist([poor_people[attribute], rich_people[attribute]],
             bins=bins, stacked=True)
    _ = plt.setp(patches[0], color='blue')
    _ = plt.setp(patches[1], color='gold')
In [104]:
draw_stacked_hist(data, 50, 'capital_profit', data['capital_profit'] != 0)
In [105]:
draw_stacked_hist(data, 100, 'capital_profit',
    (data['capital_profit'] != 0) & (data['capital_profit'] < 20000))
In [106]:
# Manually bin capital_profit

data['capital_profit_binned'] = 0
data.loc[data['capital_profit'] == 0, 'capital_profit_binned'] = 1
data.loc[(data['capital_profit'] > 0) & (data['capital_profit'] < 6000),
         'capital_profit_binned'] = 2
data.loc[(data['capital_profit'] >= 6000) & (data['capital_profit'] < 12000),
        'capital_profit_binned'] = 3
data.loc[(data['capital_profit'] >= 12000) & (data['capital_profit'] < 18000),
        'capital_profit_binned'] = 4
data.loc[data['capital_profit'] >= 18000, 'capital_profit_binned'] = 5

data.head()
Out[106]:
age workclass fnlwgt education_num marital_status occupation relationship race hours_per_week country capital_profit is_male is_rich capital_profit_binned
0 39 state_gov 77516 13 never_married adm_clerical not_in_family white 40 united_states 2174 1 0 2
1 50 self_emp_not_inc 83311 13 married_civ_spouse exec_managerial spouse white 13 united_states 0 1 0 1
2 38 private 215646 9 divorced handlers_cleaners not_in_family white 40 united_states 0 1 0 1
3 53 private 234721 7 married_civ_spouse handlers_cleaners spouse black 40 united_states 0 1 0 1
4 28 private 338409 13 married_civ_spouse prof_specialty spouse black 40 cuba 0 0 0 1
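
For reference, the same bins can be written more compactly with pd.cut; a minimal sketch, not part of the pipeline (half-integer edges isolate the zero bin, which works because capital_profit is integer-valued):

edges = [-np.inf, -0.5, 0.5, 6000, 12000, 18000, np.inf]
cp_binned_alt = pd.cut(data['capital_profit'], bins=edges,
                       labels=[0, 1, 2, 3, 4, 5], right=False).astype(int)
# Should agree with the manual binning above: bin 0 for net losses,
# 1 for zero, and 2-5 for increasing gains.
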
In [107]:
def draw_bar_chart(data, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    vc_rich = rich_people[attribute].value_counts()
    vc_poor = poor_people[attribute].value_counts()
    min_val, max_val = data[attribute].min(), data[attribute].max()
    size = max_val - min_val + 1
    rich_arr = np.array([0] * size)
    poor_arr = np.array([0] * size)

    for val, count in vc_rich.to_dict().items():
        rich_arr[val - min_val] = count
    for val, count in vc_poor.to_dict().items():
        poor_arr[val - min_val] = count
    y_pos = np.arange(min_val, max_val + 1)

    df = pd.DataFrame(np.array(np.transpose([poor_arr, rich_arr])),
                        columns=['poor', 'rich'], index=y_pos)
    df.plot(kind='bar', stacked=True, color=['blue', 'gold'])
In [108]:
draw_bar_chart(data, 'age')
In [109]:
# Bin age on a square-root scale, so bins get wider with increasing age
data['age_binned'] = np.sqrt(data['age']).astype(int)
In [110]:
draw_bar_chart(data, 'age_binned')
In [111]:
draw_bar_chart(data, 'hours_per_week')
In [169]:
data[(data['hours_per_week'] == 40) & (data['is_rich'] == 1)].shape[0]
Out[169]:
4661
In [167]:
data[(data['hours_per_week'] == 40) & (data['is_rich'] == 0)].shape[0]
Out[167]:
16674
In [114]:
draw_bar_chart(data[data['hours_per_week'] != 40], 'hours_per_week')
In [115]:
# Manually bin hours_per_week

data['hours_per_week_binned'] = 0 # Part-time
data.loc[(data['hours_per_week'] >= 25) & (data['hours_per_week'] < 40),
         'hours_per_week_binned'] = 1 # Full-time
data.loc[data['hours_per_week'] == 40, 'hours_per_week_binned'] = 2 # Mode
data.loc[(data['hours_per_week'] > 40) & (data['hours_per_week'] < 60),
         'hours_per_week_binned'] = 3 # Over-time
data.loc[data['hours_per_week'] >= 60, 'hours_per_week_binned'] = 4 # Too-much
In [116]:
# Make Binned dataframe

cat_cols = ['age_binned', 'workclass', 'education_num', 'marital_status',
            'hours_per_week_binned', 'occupation', 'relationship', 'race',
            'is_male', 'country', 'capital_profit_binned', 'is_rich']
bindata = pd.DataFrame(data[cat_cols], index=data.index)
bindata.head()
Out[116]:
age_binned workclass education_num marital_status hours_per_week_binned occupation relationship race is_male country capital_profit_binned is_rich
0 6 state_gov 13 never_married 2 adm_clerical not_in_family white 1 united_states 2 0
1 7 self_emp_not_inc 13 married_civ_spouse 0 exec_managerial spouse white 1 united_states 1 0
2 6 private 9 divorced 2 handlers_cleaners not_in_family white 1 united_states 1 0
3 7 private 7 married_civ_spouse 2 handlers_cleaners spouse black 1 united_states 1 0
4 5 private 13 married_civ_spouse 2 prof_specialty spouse black 0 cuba 1 0
In [117]:
# Write binned data to a CSV file

# bindata.to_csv('binned.csv', index=False)

Normalize Data

We'll normalize the continuous attributes using (x - mean) / std (z-score standardization).
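
Equivalently, scikit-learn's StandardScaler applies the same z-score transform; a minimal sketch, applied to a copy of ohdata rather than the normdata frame built below (StandardScaler divides by the population standard deviation while pandas' .std() uses the sample standard deviation, so results differ very slightly):

from sklearn.preprocessing import StandardScaler

num_cols = ['age', 'fnlwgt', 'education_num', 'hours_per_week', 'capital_profit']
scaled = ohdata.copy()
scaled[num_cols] = StandardScaler().fit_transform(scaled[num_cols])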

In [152]:
data.columns
Out[152]:
Index(['age', 'workclass', 'fnlwgt', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'hours_per_week', 'country',
       'capital_profit', 'is_male', 'is_rich', 'capital_profit_binned',
       'age_binned', 'hours_per_week_binned'],
      dtype='object')
In [153]:
normdata = pd.DataFrame(ohdata, copy=True)
normdata.head()
Out[153]:
age fnlwgt education_num hours_per_week capital_profit is_male workclass_state_gov workclass_self_emp_not_inc workclass_private workclass_federal_gov ... country_scotland country_trinadad_tobago country_greece country_nicaragua country_vietnam country_hong country_ireland country_hungary country_holand_netherlands is_rich
0 39 77516 13 40 2174 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 50 83311 13 13 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 38 215646 9 40 0 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 53 234721 7 40 0 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 28 338409 13 40 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 88 columns

In [154]:
normslice = normdata.loc[:, normdata.columns[:5]]
normdata.loc[:, normdata.columns[:5]] = (normslice - normslice.mean()) / normslice.std()
normdata.head()
Out[154]:
age fnlwgt education_num hours_per_week capital_profit is_male workclass_state_gov workclass_self_emp_not_inc workclass_private workclass_federal_gov ... country_scotland country_trinadad_tobago country_greece country_nicaragua country_vietnam country_hong country_ireland country_hungary country_holand_netherlands is_rich
0 0.033893 -1.062264 1.129065 -0.078261 0.154027 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0.866159 -1.007413 1.129065 -2.326545 -0.134557 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 -0.041768 0.245164 -0.438392 -0.078261 -0.134557 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1.093141 0.425712 -1.222120 -0.078261 -0.134557 1 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 -0.798374 1.407139 1.129065 -0.078261 -0.134557 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 88 columns

K Nearest Neighbors

In [155]:
Xs = []
Xs.append(normdata[normdata.columns[:-1]])
Xs.append(normdata[list(normdata.columns[:1]) + list(normdata.columns[2:-1])])
X = Xs[0]

y = normdata[normdata.columns[-1]]
In [156]:
# Plot k vs accuracy

from sklearn.neighbors import KNeighborsClassifier

def plot_knn(X, y, k_range, std_factor=1):
    maccs = []
    maccsa = []
    maccsb = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        knn.fit(X, y)
        accs = my_cross_val_score(knn, X, y)
        maccs.append(accs.mean())
        maccsa.append(accs.mean() - std_factor * accs.std())
        maccsb.append(accs.mean() + std_factor * accs.std())
    plt.plot(k_range, maccs)
    plt.plot(k_range, maccsa)
    plt.plot(k_range, maccsb)
In [157]:
plot_knn(X, y, range(1, 11))
In [206]:
plot_knn(X, y, range(10, 37, 2))
In [207]:
print_cv_accuracy(KNeighborsClassifier(n_neighbors=34), X, y)
5-fold CV accuracy: 84.34214927948078 % ± 0.11558247726631235 %