#!/usr/bin/env python
# coding: utf-8

# # US Census Income Dataset
#
# Dataset: http://archive.ics.uci.edu/ml/datasets/Census+Income

# ## Data Exploration

# In[76]:


# This CSV file doesn't have a header row, so specify column names manually
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'country']
target = 'income'


# In[77]:


# Read data
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv', skipinitialspace=True, header=None,
                   names=cols + ['income'], na_values=['?'])


# In[78]:


print(data.shape)


# In[79]:


data.head()


# In[80]:


data.describe()


# In[81]:


# Check for duplicate rows
data[data.duplicated()].count()


# In[82]:


# Remove duplicate rows
data.drop_duplicates(inplace=True)


# In[83]:


# Class distribution
data['income'].value_counts() / data['income'].count()


# In[84]:


# Count missing values per column
print(data.shape[0] - data.count())


# In[85]:


print("Number of people with:")
print("* no occupation or workclass:",
      data[data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* an occupation but no workclass:",
      data[~data.occupation.isnull() & data.workclass.isnull()].shape[0])
print("* no occupation but a workclass:",
      data[data.occupation.isnull() & ~data.workclass.isnull()].shape[0])


# We can infer that missing values in occupation and workclass are related:
# a null workclass implies a null occupation.
# The converse is mostly true, but not always.
#
# In particular, there are 10 cases where occupation is null but workclass is not.
# Let's see what those cases are.

# In[86]:


print(data[data.occupation.isnull() & ~data.workclass.isnull()][['occupation', 'workclass']])


# In[87]:


print(data[data['workclass'] == 'Never-worked'][['occupation', 'workclass']])


# We see that occupation is null and workclass is not null if and only if
# workclass is `Never-worked`.
#
# Since we now know the occupation for these people,
# we can replace these null values with 'none'.
# In[88]:


data.loc[data['workclass'] == 'Never-worked', 'occupation'] = 'none'


# ### Data cleaning

# In[89]:


# Remove rows with missing values
data.dropna(inplace=True)

# Remove redundant attributes
del data['education']  # use education_num instead

# Merge capital_loss and capital_gain into a single signed attribute
data['capital_profit'] = data['capital_gain'] - data['capital_loss']
del data['capital_gain']
del data['capital_loss']

# Transform income, sex and relationship
data['is_male'] = data['sex'].map({'Male': 1, 'Female': 0})

def to_spouse(s):
    return 'Spouse' if s in ('Husband', 'Wife') else s

data['relationship'] = data['relationship'].map(to_spouse)
del data['sex']
data['is_rich'] = data['income'].map({'<=50K': 0, '>50K': 1})
del data['income']

data.head()


# In[90]:


# Standardize categorical values into lowercase identifiers
import re

def stdize_name(s):
    return re.sub('[^0-9a-zA-Z]+', '_', s).lower()

multi_cat_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'country']
for col in multi_cat_cols:
    data[col] = data[col].map(stdize_name)

data.head()


# In[91]:


# One-hot encode the multi-valued categorical attributes
ohdata = pd.DataFrame(data[['age', 'fnlwgt', 'education_num', 'hours_per_week',
                            'capital_profit', 'is_male']])
for col in multi_cat_cols:
    values = data[col].unique()
    for value in values:
        ohdata[col + '_' + value] = data[col].map(lambda x: 1 if x == value else 0)
ohdata['is_rich'] = data['is_rich']
ohdata.head()


# ## Classification

# ### Decision Tree

# In[92]:


from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

def my_cross_val_score(clf, X, y):
    fold_generator = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)
    return cross_val_score(clf, X, y, cv=fold_generator, n_jobs=-1)

def print_cv_accuracy(clf, X, y):
    accs = my_cross_val_score(clf, X, y)
    print("{}-fold CV accuracy: {:.2f} % ± {:.2f} %".format(5, 100 * accs.mean(), 100 * accs.std()))

def my_tts_score(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

def print_tts_accuracy(clf, X, y):
    acc = my_tts_score(clf, X, y)
    print("TTS accuracy = {:.2f} %".format(100 * acc))


# In[93]:


y = ohdata['is_rich']
X = ohdata[ohdata.columns[:-1]]

from sklearn.tree import DecisionTreeClassifier
print_cv_accuracy(DecisionTreeClassifier(), X, y)


# In[94]:


dtree = DecisionTreeClassifier()
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count


# By default, scikit-learn grows completely unpruned trees.
# Let's try pre-pruning. (Scikit-learn doesn't yet support post-pruning.)
#
# We will build decision trees for a range of gini impurity thresholds
# and plot how the cross-validated accuracy varies.
# This will help us find an optimally pruned tree:
# one pruned enough to generalize and avoid overfitting,
# but not so pruned that it loses the complexity it needs.
#
# We will also check whether certain features can be eliminated.
# From the feature descriptions, we guess that `fnlwgt` and `relationship`
# might not be appropriate attributes for classification.
#
# We'll do the pre-pruning and the feature selection at the same time.
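# Quick reference (a minimal sketch, not part of the pipeline above): the gini
# impurity of a node with class proportions p_k is 1 - sum(p_k^2), and
# `min_impurity_split` stops splitting nodes whose impurity falls below the
# chosen threshold. For example, the impurity of the root node is:

# In[ ]:


p = y.value_counts(normalize=True)  # class proportions of is_rich
print(1 - (p ** 2).sum())           # gini impurity of the root node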
# In[208]:


import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')


# In[171]:


# Build four candidate feature sets:
# 0: all features, 1: without fnlwgt, 2: without relationship_*, 3: without both
cols = [list(ohdata.columns[:-1]) for i in range(4)]
cols[1].remove('fnlwgt')
cols[3].remove('fnlwgt')
for value in data.relationship.unique():
    cols[2].remove('relationship_' + value)
    cols[3].remove('relationship_' + value)
Xs = [X[cols[i]] for i in range(4)]


# In[176]:


sns.reset_orig()


# In[173]:


primary_colors = ['blue', 'red', 'green', 'darkmagenta']
secondary_colors = ['cyan', 'yellow', 'lightgreen', 'magenta']

def plot_acc_vs_impurity(Xs, minimp, maximp, intervals=10, stddev_factor=1):
    # Plot accuracy vs impurity threshold at (intervals + 1) equally spaced
    # thresholds from minimp to maximp, for each feature set in Xs.
    impurities = [minimp + (maximp - minimp) * x / intervals for x in range(intervals + 1)]
    for i, X in enumerate(Xs):
        maccs = []
        maccsa = []
        maccsb = []
        for impurity in impurities:
            dtree = DecisionTreeClassifier(min_impurity_split=impurity)
            accs = my_cross_val_score(dtree, X, y)
            maccs.append(accs.mean())
            maccsa.append(accs.mean() + accs.std() * stddev_factor)
            maccsb.append(accs.mean() - accs.std() * stddev_factor)
        print(i)
        plt.plot(impurities, maccs, primary_colors[i])
        line, = plt.plot(impurities, maccsa, secondary_colors[i])
        plt.setp(line, linestyle='dashed')
        line, = plt.plot(impurities, maccsb, secondary_colors[i])
        plt.setp(line, linestyle='dashed')


# Now we'll plot impurity threshold vs accuracy for all four feature sets and see which gives higher accuracy.
# We'll also plot the standard-deviation envelopes of the accuracies in a lighter shade using dashed lines.
#
# Here is the legend:
#
# * blue - don't remove any features.
# * red - remove `fnlwgt`.
# * green - remove `relationship_*`.
# * magenta - remove both.

# In[177]:


plot_acc_vs_impurity(Xs, 0, 0.4, 15)


# We observe that removing `fnlwgt` always leads to better results,
# and removing both is almost always best.
# However, the differences in accuracy are small compared to the standard deviation,
# so removing these attributes is neither clearly useful nor futile.
#
# We'll simply remove `fnlwgt` and work with the rest of the dataset from now on.

# In[188]:


X = Xs[1]


# With some trial and error, we arrive at this graph:

# In[139]:


plot_acc_vs_impurity(Xs, 0.33, 0.36405, 15, 0.5)


# We observe that the best value of the gini threshold is around 0.36.

# In[189]:


min_impurity = 0.362


# In[194]:


print_cv_accuracy(DecisionTreeClassifier(), X, y)


# In[193]:


print_cv_accuracy(DecisionTreeClassifier(min_impurity_split=min_impurity), X, y)


# In[146]:


dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
dtree.tree_.max_depth, dtree.tree_.node_count
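# Side check (a minimal hedged sketch, not in the original evaluation): since the
# classes are imbalanced, overall accuracy can hide per-class behaviour, so we
# look at a hold-out confusion matrix for the pruned tree.

# In[ ]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
pruned_tree = DecisionTreeClassifier(min_impurity_split=min_impurity)
pruned_tree.fit(X_train, y_train)
print(metrics.confusion_matrix(y_test, pruned_tree.predict(X_test)))  # rows: true class, cols: predicted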
# In[197]:


# Important features
def print_importance_data(col_names, clf, multi_cat_cols):
    fi = sorted(zip(col_names, clf.feature_importances_), key=(lambda x: x[1]), reverse=True)
    for i, feature_importance in enumerate(fi[:10]):
        feature, importance = feature_importance
        print('{}. {}:\t{}'.format(i + 1, feature.ljust(20), importance))
        #print('"{}",'.format(feature))
    print()
    print("Contribution of 10 most important features:", sum([x[1] for x in fi[:10]]))
    print()
    for col in multi_cat_cols:
        print("Contribution of {}: {}".format(col, sum([x[1] for x in fi if x[0].startswith(col)])))

dtree = DecisionTreeClassifier(min_impurity_split=min_impurity)
dtree.fit(X, y)
print_importance_data(X.columns, dtree, multi_cat_cols)


# ### Ensemble Classifiers

# In[200]:


from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf_list = [ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier]
clf_to_name = {clf_type: clf_type.__name__[:-len('Classifier')] for clf_type in clf_list}
name_to_accs = {clf_name: [] for clf_name in clf_to_name.values()}
ns = [10, 40, 100]
for n in ns:
    for clf_type, clf_name in clf_to_name.items():
        clf = clf_type(n_estimators=n)
        accs = my_cross_val_score(clf, X, y)
        name_to_accs[clf_name].append(100 * accs.mean())
pd.DataFrame(name_to_accs, index=ns)


# In[201]:


# Find feature importances according to a random forest
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)


# In[202]:


# Stats about the random forest
depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)


# In[203]:


clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X, y)
print_importance_data(X.columns, clf, multi_cat_cols)


# In[204]:


# Stats about AdaBoost
depth_sum = 0
node_count_sum = 0
for est in clf.estimators_:
    depth_sum += est.tree_.max_depth
    node_count_sum += est.tree_.node_count
print("Average depth:", depth_sum / 100)
print("Average nodes:", node_count_sum / 100)


# In[205]:


print(clf.estimator_weights_)
print(clf.estimator_errors_)


# We find that some of the most important features are always the same
# (although not in the same order), independent of the classifier used to find them.

# ## Binning
#
# We need to bin the data to apply association rule mining and naive Bayes.
# Let's analyze the distribution of the continuous variables so that we can bin them appropriately.
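# Aside (a hedged alternative, not used below): pandas can also bin automatically
# with `pd.cut` / `pd.qcut`. The bins below are chosen manually by inspecting the
# plots; a quartile-based baseline for `age` would look like this:

# In[ ]:


age_qbins = pd.qcut(data['age'], q=4, labels=False)  # quartile bins labelled 0..3 (illustration only)
age_qbins.value_counts().sort_index()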
# In[151]:


def get_rich_and_poor(data, condition):
    if condition is None:
        rich_people = data[data['is_rich'] == 1]
        poor_people = data[data['is_rich'] == 0]
    else:
        rich_people = data[condition & (data['is_rich'] == 1)]
        poor_people = data[condition & (data['is_rich'] == 0)]
    return (rich_people, poor_people)

def draw_stacked_hist(data, bins, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    n, bins, patches = plt.hist([poor_people[attribute], rich_people[attribute]],
                                bins=bins, stacked=True)
    _ = plt.setp(patches[0], color='blue')
    _ = plt.setp(patches[1], color='gold')


# In[104]:


draw_stacked_hist(data, 50, 'capital_profit', data['capital_profit'] != 0)


# In[105]:


draw_stacked_hist(data, 100, 'capital_profit',
                  (data['capital_profit'] != 0) & (data['capital_profit'] < 20000))


# In[106]:


# Manually bin capital_profit
data['capital_profit_binned'] = 0
data.loc[data['capital_profit'] == 0, 'capital_profit_binned'] = 1
data.loc[(data['capital_profit'] > 0) & (data['capital_profit'] < 6000), 'capital_profit_binned'] = 2
data.loc[(data['capital_profit'] >= 6000) & (data['capital_profit'] < 12000), 'capital_profit_binned'] = 3
data.loc[(data['capital_profit'] >= 12000) & (data['capital_profit'] < 18000), 'capital_profit_binned'] = 4
data.loc[data['capital_profit'] >= 18000, 'capital_profit_binned'] = 5
data.head()


# In[107]:


def draw_bar_chart(data, attribute, condition=None):
    rich_people, poor_people = get_rich_and_poor(data, condition)
    vc_rich = rich_people[attribute].value_counts()
    vc_poor = poor_people[attribute].value_counts()
    min_val, max_val = data[attribute].min(), data[attribute].max()
    size = max_val - min_val + 1
    rich_arr = np.array([0] * size)
    poor_arr = np.array([0] * size)
    for value, count in vc_rich.to_dict().items():
        rich_arr[value - min_val] = count
    for value, count in vc_poor.to_dict().items():
        poor_arr[value - min_val] = count
    y_pos = np.arange(min_val, max_val + 1)
    df = pd.DataFrame(np.array(np.transpose([poor_arr, rich_arr])),
                      columns=['poor', 'rich'], index=y_pos)
    df.plot(kind='bar', stacked=True, color=['blue', 'gold'])


# In[108]:


draw_bar_chart(data, 'age')


# In[109]:


data['age_binned'] = np.sqrt(data['age']).astype(int)


# In[110]:


draw_bar_chart(data, 'age_binned')


# In[111]:


draw_bar_chart(data, 'hours_per_week')


# In[169]:


data[(data['hours_per_week'] == 40) & (data['is_rich'] == 1)].shape[0]


# In[167]:


data[(data['hours_per_week'] == 40) & (data['is_rich'] == 0)].shape[0]


# In[114]:


draw_bar_chart(data[data['hours_per_week'] != 40], 'hours_per_week')


# In[115]:


# Manually bin hours_per_week
data['hours_per_week_binned'] = 0  # Part-time
data.loc[(data['hours_per_week'] >= 25) & (data['hours_per_week'] < 40), 'hours_per_week_binned'] = 1  # Full-time
data.loc[data['hours_per_week'] == 40, 'hours_per_week_binned'] = 2  # Mode
data.loc[(data['hours_per_week'] > 40) & (data['hours_per_week'] < 60), 'hours_per_week_binned'] = 3  # Over-time
data.loc[data['hours_per_week'] >= 60, 'hours_per_week_binned'] = 4  # Too-much


# In[116]:


# Make a binned dataframe
cat_cols = ['age_binned', 'workclass', 'education_num', 'marital_status',
            'hours_per_week_binned', 'occupation', 'relationship', 'race',
            'is_male', 'country', 'capital_profit_binned', 'is_rich']
bindata = pd.DataFrame(data[cat_cols], index=data.index)
bindata.head()


# In[117]:


# Write binned data to a CSV file
# bindata.to_csv('binned.csv', index=False)


# ### Normalize data
#
# We'll normalize the continuous attributes using `(x - mean) / std`.
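# The next cells standardize with pandas. For reference, a hedged equivalent using
# scikit-learn's `StandardScaler` (which divides by the population rather than the
# sample standard deviation, so values differ very slightly):

# In[ ]:


from sklearn.preprocessing import StandardScaler

scaled_continuous = StandardScaler().fit_transform(ohdata[ohdata.columns[:5]])
scaled_continuous[:3]  # first three standardized rows of the continuous columns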
# In[152]:


data.columns


# In[153]:


normdata = pd.DataFrame(ohdata, copy=True)
normdata.head()


# In[154]:


# Standardize the five continuous columns
normslice = normdata.loc[:, normdata.columns[:5]]
normdata.loc[:, normdata.columns[:5]] = (normslice - normslice.mean()) / normslice.std()
normdata.head()


# ### K Nearest Neighbors

# In[155]:


Xs = []
Xs.append(normdata[normdata.columns[:-1]])                                      # all features
Xs.append(normdata[list(normdata.columns[:1]) + list(normdata.columns[2:-1])])  # without fnlwgt
X = Xs[0]
y = normdata[normdata.columns[-1]]


# In[156]:


# Plot k vs accuracy
from sklearn.neighbors import KNeighborsClassifier

def plot_knn(X, y, k_range, std_factor=1):
    maccs = []
    maccsa = []
    maccsb = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        knn.fit(X, y)
        accs = my_cross_val_score(knn, X, y)
        maccs.append(accs.mean())
        maccsa.append(accs.mean() - std_factor * accs.std())
        maccsb.append(accs.mean() + std_factor * accs.std())
    plt.plot(k_range, maccs)
    plt.plot(k_range, maccsa)
    plt.plot(k_range, maccsb)


# In[157]:


plot_knn(X, y, range(1, 11))


# In[206]:


plot_knn(X, y, range(10, 37, 2))


# In[207]:


print_cv_accuracy(KNeighborsClassifier(n_neighbors=34), X, y)


# In[ ]:
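# Footnote (a hedged sketch, not used above): the same scan over k can be automated
# with scikit-learn's `GridSearchCV`, which cross-validates every candidate value
# and reports the best one.

# In[ ]:


from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(10, 37, 2))}  # same k values as the plot above
search = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_grid, cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)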