#!/usr/bin/env python
# coding: utf-8

# # US Census Income Dataset
#
# Dataset: http://archive.ics.uci.edu/ml/datasets/Census+Income

# ## Data Exploration

# In[76]:


# This CSV file doesn't have a header row, so specify column names manually
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'country']
target = 'income'


# In[77]:


# Read data
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv', skipinitialspace=True, header=None,
                   names=cols + ['income'], na_values=['?'])


# In[78]:


print(data.shape)


# In[79]:


data.head()


# In[80]:


data.describe()


# In[81]:


# Check for duplicate rows
data[data.duplicated()].count()


# In[82]:


# Remove duplicate rows
data.drop_duplicates(inplace=True)


# In[83]:


# Class distribution
data['income'].value_counts() / data['income'].count()


# In[84]:


# Count missing values per column
print(data.shape[0] - data.count())


# In[85]:


print("Number of people with:")
print("* no occupation or workclass:",
      data[data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* an occupation but no workclass:",
      data[~data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* no occupation but a workclass:",
      data[data.occupation.isnull() & ~data.workclass.isnull()].shape[0])


# We can infer that missing values in occupation and workclass are related:
# a null workclass implies a null occupation.
# The converse is mostly true, but not always.
#
# In particular, there are 10 cases where occupation is null but workclass is not.
# Let's see what those cases are.

# In[86]:


print(data[data.occupation.isnull() & ~data.workclass.isnull()][['occupation', 'workclass']])


# In[87]:


print(data[data['workclass'] == 'Never-worked'][['occupation', 'workclass']])


# We see that occupation is null and workclass is not null if and only if
# workclass is `Never-worked`.
#
# Since we now know the occupation for these people,
# we can replace these null values with 'none'.
# In[88]:


data.loc[data['workclass'] == 'Never-worked', 'occupation'] = 'none'


# ### Data cleaning

# In[89]:


# Remove rows with missing values
data.dropna(inplace=True)

# Remove redundant attributes
del data['education']  # use education_num instead

# Merge capital_loss and capital_gain into a single signed attribute
data['capital_profit'] = data['capital_gain'] - data['capital_loss']
del data['capital_gain']
del data['capital_loss']

# Transform income, sex and relationship
data['is_male'] = data['sex'].map({'Male': 1, 'Female': 0})

def to_spouse(s):
    return 'Spouse' if s in ('Husband', 'Wife') else s

data['relationship'] = data['relationship'].map(to_spouse)
del data['sex']
data['is_rich'] = data['income'].map({'<=50K': 0, '>50K': 1})
del data['income']

data.head()


# In[90]:


# Standardize categorical values into lowercase identifiers
import re

def stdize_name(s):
    return re.sub('[^0-9a-zA-Z]+', '_', s).lower()

multi_cat_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'country']
for col in multi_cat_cols:
    data[col] = data[col].map(stdize_name)

data.head()


# In[91]:


# One-hot encode the multi-valued categorical attributes
ohdata = pd.DataFrame(data[['age', 'fnlwgt', 'education_num', 'hours_per_week',
                            'capital_profit', 'is_male']])
for col in multi_cat_cols:
    values = data[col].unique()
    for value in values:
        ohdata[col + '_' + value] = data[col].map(lambda x: 1 if x == value else 0)
ohdata['is_rich'] = data['is_rich']
ohdata.head()


# ## Classification

# ### Decision Tree

# In[92]:


from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

def my_cross_val_score(clf, X, y):
    fold_generator = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)
    return cross_val_score(clf, X, y, cv=fold_generator, n_jobs=-1)

def print_cv_accuracy(clf, X, y):
    accs = my_cross_val_score(clf, X, y)
    print("{}-fold CV accuracy: {:.2f} % ± {:.2f} %".format(5, 100 * accs.mean(), 100 * accs.std()))

def my_tts_score(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

def print_tts_accuracy(clf, X, y):
    acc = my_tts_score(clf, X, y)
    print("TTS accuracy = {:.2f} %".format(100 * acc))


# In[93]:


y = ohdata['is_rich']
X = ohdata[ohdata.columns[:-1]]

from sklearn.tree import DecisionTreeClassifier
print_cv_accuracy(DecisionTreeClassifier(), X, y)


# In[94]:


dtree = DecisionTreeClassifier()
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count


# By default, scikit-learn grows completely unpruned trees.
# Let's try pre-pruning. (Scikit-learn doesn't yet support post-pruning.)
#
# We will build decision trees for a range of gini impurity thresholds
# and plot how the cross-validated accuracy varies.
# This will help us find an optimally pruned tree:
# one pruned enough to generalize and avoid overfitting,
# but not so pruned that it loses the complexity it needs.
#
# We will also check whether certain features can be eliminated.
# From the feature descriptions, we guess that `fnlwgt` and `relationship`
# might not be appropriate attributes for classification.
#
# We'll do the pre-pruning and the feature selection at the same time.
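# Quick reference (a minimal sketch, not part of the pipeline above): the gini
# impurity of a node with class proportions p_k is 1 - sum(p_k^2), and
# `min_impurity_split` stops splitting nodes whose impurity falls below the
# chosen threshold. For example, the impurity of the root node is:

# In[ ]:


p = y.value_counts(normalize=True)  # class proportions of is_rich
print(1 - (p ** 2).sum())           # gini impurity of the root node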
# In[208]:


import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')


# In[171]:


# Build four candidate feature sets:
# 0: all features, 1: without fnlwgt, 2: without relationship_*, 3: without both
cols = [list(ohdata.columns[:-1]) for i in range(4)]
cols[1].remove('fnlwgt')
cols[3].remove('fnlwgt')
for value in data.relationship.unique():
    cols[2].remove('relationship_' + value)
    cols[3].remove('relationship_' + value)
Xs = [X[cols[i]] for i in range(4)]


# In[176]:


sns.reset_orig()


# In[173]:


primary_colors = ['blue', 'red', 'green', 'darkmagenta']
secondary_colors = ['cyan', 'yellow', 'lightgreen', 'magenta']

def plot_acc_vs_impurity(Xs, minimp, maximp, intervals=10, stddev_factor=1):
    # Plot accuracy vs impurity threshold at (intervals + 1) equally spaced
    # thresholds from minimp to maximp, for each feature set in Xs.
    impurities = [minimp + (maximp - minimp) * x / intervals for x in range(intervals + 1)]
    for i, X in enumerate(Xs):
        maccs = []
        maccsa = []
        maccsb = []
        for impurity in impurities:
            dtree = DecisionTreeClassifier(min_impurity_split=impurity)
            accs = my_cross_val_score(dtree, X, y)
            maccs.append(accs.mean())
            maccsa.append(accs.mean() + accs.std() * stddev_factor)
            maccsb.append(accs.mean() - accs.std() * stddev_factor)
        print(i)
        plt.plot(impurities, maccs, primary_colors[i])
        line, = plt.plot(impurities, maccsa, secondary_colors[i])
        plt.setp(line, linestyle='dashed')
        line, = plt.plot(impurities, maccsb, secondary_colors[i])
        plt.setp(line, linestyle='dashed')


# Now we'll plot impurity threshold vs accuracy for all four feature sets and see which gives higher accuracy.
# We'll also plot the standard-deviation envelopes of the accuracies in a lighter shade using dashed lines.
#
# Here is the legend:
#
# * blue - don't remove any features.
# * red - remove `fnlwgt`.
# * green - remove `relationship_*`.
# * magenta - remove both.

# In[177]:


plot_acc_vs_impurity(Xs, 0, 0.4, 15)


# We observe that removing `fnlwgt` always leads to better results,
# and removing both is almost always best.
# However, the differences in accuracy are small compared to the standard deviation,
# so removing these attributes is neither clearly useful nor futile.
#
# We'll simply remove `fnlwgt` and work with the rest of the dataset from now on.

# In[188]:


X = Xs[1]


# With some trial and error, we arrive at this graph:

# In[139]:


plot_acc_vs_impurity(Xs, 0.33, 0.36405, 15, 0.5)


# We observe that the best value of the gini threshold is around 0.36.

# In[189]:


min_impurity = 0.362


# In[194]:


print_cv_accuracy(DecisionTreeClassifier(), X, y)


# In[193]:


print_cv_accuracy(DecisionTreeClassifier(min_impurity_split=min_impurity), X, y)


# In[146]:


dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count
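# Side check (a minimal hedged sketch, not in the original evaluation): since the
# classes are imbalanced, overall accuracy can hide per-class behaviour, so we
# look at a hold-out confusion matrix for the pruned tree.

# In[ ]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
pruned_tree = DecisionTreeClassifier(min_impurity_split=min_impurity)
pruned_tree.fit(X_train, y_train)
print(metrics.confusion_matrix(y_test, pruned_tree.predict(X_test)))  # rows: true class, cols: predicted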
# In[197]:


# Important features
def print_importance_data(col_names, clf, multi_cat_cols):
    fi = sorted(zip(col_names, clf.feature_importances_), key=(lambda x: x[1]), reverse=True)
    for i, feature_importance in enumerate(fi[:10]):
        feature, importance = feature_importance
        print('{}. {}:\t{}'.format(i + 1, feature.ljust(20), importance))
        #print('"{}",'.format(feature))
    print()
    print("Contribution of 10 most important features:", sum([x[1] for x in fi[:10]]))
    print()
    for col in multi_cat_cols:
        print("Contribution of {}: {}".format(col, sum([x[1] for x in fi if x[0].startswith(col)])))

dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
print_importance_data(X.columns, dtree, multi_cat_cols)


# ### Ensemble Classifiers

# In[200]:


from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf_list = [ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier]
clf_to_name = {clf_type: clf_type.__name__[:-len('Classifier')] for clf_type in clf_list}
name_to_accs = {clf_name: [] for clf_name in clf_to_name.values()}
ns = [10, 40, 100]
for n in ns:
    for clf_type, clf_name in clf_to_name.items():
        clf = clf_type(n_estimators=n)
        accs = my_cross_val_score(clf, X, y)
        name_to_accs[clf_name].append(100 * accs.mean())
pd.DataFrame(name_to_accs, index=ns)


# In[201]:


# Find feature importances according to a random forest
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)


# In[202]:


# Stats about the random forest
depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)


# In[203]:


clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)


# In[204]:


# Stats about AdaBoost
depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)


# In[205]:


print(clf.estimator_weights_)
print(clf.estimator_errors_)


# We find that some of the most important features are always the same
# (although not in the same order), independent of the classifier used to find them.

# ## Binning
#
# We need to bin the data to apply association rule mining and naive Bayes.
# Let's analyze the distribution of the continuous variables so that we can bin them appropriately.
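# Aside (a hedged alternative, not used below): pandas can also bin automatically
# with `pd.cut` / `pd.qcut`. The bins below are chosen manually by inspecting the
# plots; a quartile-based baseline for `age` would look like this:

# In[ ]:


age_qbins = pd.qcut(data['age'], q=4, labels=False)  # quartile bins labelled 0..3 (illustration only)
age_qbins.value_counts().sort_index()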
# In[151]:


def get_rich_and_poor(data, condition):
    if condition is None:
        rich_people = data[data['is_rich'] == 1]
        poor_people = data[data['is_rich'] == 0]
    else:
        rich_people = data[condition & (data['is_rich'] == 1)]
        poor_people = data[condition & (data['is_rich'] == 0)]
    return (rich_people, poor_people)

def draw_stacked_hist(data, bins, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    n, bins, patches = plt.hist([poor_people[attribute], rich_people[attribute]],
                                bins=bins, stacked=True)
    _ = plt.setp(patches[0], color='blue')
    _ = plt.setp(patches[1], color='gold')


# In[104]:


draw_stacked_hist(data, 50, 'capital_profit', data['capital_profit'] != 0)


# In[105]:


draw_stacked_hist(data, 100, 'capital_profit',
                  (data['capital_profit'] != 0) & (data['capital_profit'] < 20000))


# In[106]:


# Manually bin capital_profit
data['capital_profit_binned'] = 0
data.loc[data['capital_profit'] == 0, 'capital_profit_binned'] = 1
data.loc[(data['capital_profit'] > 0) & (data['capital_profit'] < 6000), 'capital_profit_binned'] = 2
data.loc[(data['capital_profit'] >= 6000) & (data['capital_profit'] < 12000), 'capital_profit_binned'] = 3
data.loc[(data['capital_profit'] >= 12000) & (data['capital_profit'] < 18000), 'capital_profit_binned'] = 4
data.loc[data['capital_profit'] >= 18000, 'capital_profit_binned'] = 5
data.head()


# In[107]:


def draw_bar_chart(data, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    vc_rich = rich_people[attribute].value_counts()
    vc_poor = poor_people[attribute].value_counts()
    min_val, max_val = data[attribute].min(), data[attribute].max()
    size = max_val - min_val + 1
    rich_arr = np.array([0] * size)
    poor_arr = np.array([0] * size)
    for value, count in vc_rich.to_dict().items():
        rich_arr[value - min_val] = count
    for value, count in vc_poor.to_dict().items():
        poor_arr[value - min_val] = count
    y_pos = np.arange(min_val, max_val + 1)
    df = pd.DataFrame(np.array(np.transpose([poor_arr, rich_arr])),
                      columns=['poor', 'rich'], index=y_pos)
    df.plot(kind='bar', stacked=True, color=['blue', 'gold'])


# In[108]:


draw_bar_chart(data, 'age')


# In[109]:


data['age_binned'] = np.sqrt(data['age']).astype(int)


# In[110]:


draw_bar_chart(data, 'age_binned')


# In[111]:


draw_bar_chart(data, 'hours_per_week')


# In[169]:


data[(data['hours_per_week'] == 40) & (data['is_rich'] == 1)].shape[0]


# In[167]:


data[(data['hours_per_week'] == 40) & (data['is_rich'] == 0)].shape[0]


# In[114]:


draw_bar_chart(data[data['hours_per_week'] != 40], 'hours_per_week')


# In[115]:


# Manually bin hours_per_week
data['hours_per_week_binned'] = 0  # Part-time
data.loc[(data['hours_per_week'] >= 25) & (data['hours_per_week'] < 40), 'hours_per_week_binned'] = 1  # Full-time
data.loc[data['hours_per_week'] == 40, 'hours_per_week_binned'] = 2  # Mode
data.loc[(data['hours_per_week'] > 40) & (data['hours_per_week'] < 60), 'hours_per_week_binned'] = 3  # Over-time
data.loc[data['hours_per_week'] >= 60, 'hours_per_week_binned'] = 4  # Too-much


# In[116]:


# Make a binned dataframe
cat_cols = ['age_binned', 'workclass', 'education_num', 'marital_status',
            'hours_per_week_binned', 'occupation', 'relationship', 'race',
            'is_male', 'country', 'capital_profit_binned', 'is_rich']
bindata = pd.DataFrame(data[cat_cols], index=data.index)
bindata.head()


# In[117]:


# Write binned data to a CSV file
# bindata.to_csv('binned.csv', index=False)


# ### Normalize data
#
# We'll normalize the continuous attributes using `(x - mean) / std`.
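# The next cells standardize with pandas. For reference, a hedged equivalent using
# scikit-learn's `StandardScaler` (which divides by the population rather than the
# sample standard deviation, so values differ very slightly):

# In[ ]:


from sklearn.preprocessing import StandardScaler

scaled_continuous = StandardScaler().fit_transform(ohdata[ohdata.columns[:5]])
scaled_continuous[:3]  # first three standardized rows of the continuous columns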
# In[152]:


data.columns


# In[153]:


normdata = pd.DataFrame(ohdata, copy=True)
normdata.head()


# In[154]:


# Standardize the five continuous columns
normslice = normdata.loc[:, normdata.columns[:5]]
normdata.loc[:, normdata.columns[:5]] = (normslice - normslice.mean()) / normslice.std()
normdata.head()


# ### K Nearest Neighbors

# In[155]:


Xs = []
Xs.append(normdata[normdata.columns[:-1]])                                      # all features
Xs.append(normdata[list(normdata.columns[:1]) + list(normdata.columns[2:-1])])  # without fnlwgt
X = Xs[0]
y = normdata[normdata.columns[-1]]


# In[156]:


# Plot k vs accuracy
from sklearn.neighbors import KNeighborsClassifier

def plot_knn(X, y, k_range, std_factor=1):
    maccs = []
    maccsa = []
    maccsb = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        knn.fit(X, y)
        accs = my_cross_val_score(knn, X, y)
        maccs.append(accs.mean())
        maccsa.append(accs.mean() - std_factor * accs.std())
        maccsb.append(accs.mean() + std_factor * accs.std())
    plt.plot(k_range, maccs)
    plt.plot(k_range, maccsa)
    plt.plot(k_range, maccsb)


# In[157]:


plot_knn(X, y, range(1, 11))


# In[206]:


plot_knn(X, y, range(10, 37, 2))


# In[207]:


print_cv_accuracy(KNeighborsClassifier(n_neighbors=34), X, y)


# In[ ]:
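# Footnote (a hedged sketch, not used above): the same scan over k can be automated
# with scikit-learn's `GridSearchCV`, which cross-validates every candidate value
# and reports the best one.

# In[ ]:


from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(10, 37, 2))}  # same k values as the plot above
search = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_grid, cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)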