In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
from pprint import pprint
In [3]:
import seaborn
seaborn.set()
In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Hypotheses

Factors that might influence income level:

  1. Age
  2. Income
  3. Profession
  4. Marital Status
  5. Urban / Rural
  6. Expenses
  7. Education
  8. Total Work Experience
In [5]:
test.head()
Out[5]:
ID Age Workclass Education Marital.Status Occupation Relationship Race Sex Hours.Per.Week Native.Country
0 32562 25 Private 11th Never-married Machine-op-inspct Own-child Black Male 40 United-States
1 32563 38 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male 50 United-States
2 32564 28 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male 40 United-States
3 32565 44 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male 40 United-States
4 32566 18 NaN Some-college Never-married NaN Own-child White Female 30 United-States
In [6]:
train.head()
Out[6]:
ID Age Workclass Education Marital.Status Occupation Relationship Race Sex Hours.Per.Week Native.Country Income.Group
0 1 39 State-gov Bachelors Never-married Adm-clerical Not-in-family White Male 40 United-States <=50K
1 2 50 Self-emp-not-inc Bachelors Married-civ-spouse Exec-managerial Husband White Male 13 United-States <=50K
2 3 38 Private HS-grad Divorced Handlers-cleaners Not-in-family White Male 40 United-States <=50K
3 4 53 Private 11th Married-civ-spouse Handlers-cleaners Husband Black Male 40 United-States <=50K
4 5 28 Private Bachelors Married-civ-spouse Prof-specialty Wife Black Female 40 Cuba <=50K

Missing Value Treatment

In [7]:
train.apply(lambda x: sum(x.isnull()))
Out[7]:
ID                   0
Age                  0
Workclass         1836
Education            0
Marital.Status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Hours.Per.Week       0
Native.Country     583
Income.Group         0
dtype: int64
In [8]:
test.apply(lambda x: sum(x.isnull()))
Out[8]:
ID                  0
Age                 0
Workclass         963
Education           0
Marital.Status      0
Occupation        966
Relationship        0
Race                0
Sex                 0
Hours.Per.Week      0
Native.Country    274
dtype: int64
In [9]:
from scipy.stats import mode

Workclass

In [10]:
mode(train.Workclass)[0][0]
Out[10]:
'Private'
In [11]:
# Impute missing Workclass values with the mode
train.loc[train.Workclass.isnull(), 'Workclass'] = 'Private'
In [12]:
mode(test.Workclass)[0][0]
Out[12]:
'Private'
In [13]:
# Impute missing Workclass values in the test set with the mode
test.loc[test.Workclass.isnull(), 'Workclass'] = 'Private'

Occupation

In [14]:
mode(train.Occupation)[0][0]
Out[14]:
'Prof-specialty'
In [15]:
mode(test.Occupation)[0][0]
Out[15]:
'Prof-specialty'
In [16]:
# Impute missing Occupation values with the mode
train.loc[train.Occupation.isnull(), 'Occupation'] = 'Prof-specialty'
In [17]:
# Impute missing Occupation values in the test set with the mode
test.loc[test.Occupation.isnull(), 'Occupation'] = 'Prof-specialty'

Native Country

In [18]:
mode(train['Native.Country'])[0][0]
Out[18]:
'United-States'
In [19]:
mode(test['Native.Country'])[0][0]
Out[19]:
'United-States'
In [20]:
# Impute missing Native.Country values with the mode in both sets
train.loc[train['Native.Country'].isnull(), 'Native.Country'] = 'United-States'
test.loc[test['Native.Country'].isnull(), 'Native.Country'] = 'United-States'
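
Since all three columns receive the same treatment, the imputation could equally be done in one short pass with fillna, which also avoids writing into a slice (a sketch equivalent to the cells above):

    # Fill each column's missing values with its most frequent level
    for df in (train, test):
        for col in ['Workclass', 'Occupation', 'Native.Country']:
            df[col] = df[col].fillna(df[col].mode()[0])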

Univariate Analysis

Preprocessing Categorical Variables

Age

In [21]:
plt.hist(train.Age, bins = 100)
plt.show()
plt.hist(test.Age, bins = 100)
plt.show()
In [22]:
train.Age.describe()
Out[22]:
count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: Age, dtype: float64
In [23]:
test.Age.describe()
Out[23]:
count    16281.000000
mean        38.767459
std         13.849187
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: Age, dtype: float64

25% of the population is between 17 and 28 years old, another 25% is between 28 and 37, and 75% is 48 or younger. The age distributions of train and test are nearly identical.


Working Class

In [24]:
train.Workclass.describe()
Out[24]:
count       32561
unique          8
top       Private
freq        24532
Name: Workclass, dtype: object
In [25]:
test.Workclass.describe()
Out[25]:
count       16281
unique          8
top       Private
freq        12173
Name: Workclass, dtype: object
In [26]:
unique_working_classes = list(train.Workclass.unique())
unique_working_classes
Out[26]:
['State-gov',
 'Self-emp-not-inc',
 'Private',
 'Federal-gov',
 'Local-gov',
 'Self-emp-inc',
 'Without-pay',
 'Never-worked']
In [27]:
from sklearn.preprocessing import LabelEncoder
In [28]:
number = LabelEncoder()
train.Workclass = number.fit_transform(train.Workclass.astype('str'))
test.Workclass = number.fit_transform(test.Workclass.astype('str'))
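
A caveat on this encoding pattern, which is reused for the remaining categoricals: fitting a separate LabelEncoder on train and on test only produces matching integer codes if both sets contain exactly the same categories. That happens to hold for Workclass (the test.head() output later shows the same codes as the train decoder), but not for Native.Country, where test is missing one country and every later code shifts. A safer pattern is to fit a single encoder on the pooled column and reuse it for both sets, roughly as follows (a sketch of an alternative to the cell above, not what was run here):

    # Fit one encoder on the pooled categories so train and test share codes
    pooled = pd.concat([train.Workclass, test.Workclass]).astype('str')
    number.fit(pooled)
    train['Workclass'] = number.transform(train.Workclass.astype('str'))
    test['Workclass'] = number.transform(test.Workclass.astype('str'))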
In [29]:
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train.Workclass, bins = len(unique_working_classes))
plt.show()

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test.Workclass, bins = len(unique_working_classes))
plt.show()
In [30]:
train.Workclass.unique()
Out[30]:
array([6, 5, 3, 0, 1, 4, 7, 2])
In [31]:
# Workclass decoder: unique() preserves first-appearance order, so the new codes
# and the original labels line up position by position
workclass_decoder = dict(zip(train.Workclass.unique(), unique_working_classes))
workclass_decoder
Out[31]:
{0: 'Federal-gov',
 1: 'Local-gov',
 2: 'Never-worked',
 3: 'Private',
 4: 'Self-emp-inc',
 5: 'Self-emp-not-inc',
 6: 'State-gov',
 7: 'Without-pay'}
In [32]:
# # Combine bins based on some common sense
# new_workclass_encoder = {0:0,1:0,2:1,3:2,4:2,5:2,6:0,7:1}

# new_workclass_train = [new_workclass_encoder[i] for i in train.Workclass]
# train.Workclass = new_workclass_train

# new_workclass_test = [new_workclass_encoder[i] for i in test.Workclass]
# test.Workclass = new_workclass_test

# new_workclass_decoder = {0:'Government',1:'Never Worked / Without Pay / Information NA', 2:'Private'}
In [33]:
# new_workclass_decoder
In [34]:
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(train.Workclass, bins = 3)

# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(test.Workclass, bins = 3)

Education

In [35]:
education_levels_train = train.Education.unique()
print education_levels_train
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th' '1st-4th'
 'Preschool' '12th']
In [36]:
train.Education = number.fit_transform(train.Education)
new_education_levels = train.Education.unique()
education_levels_decoder = dict(zip(education_levels_train, new_education_levels))
pprint(education_levels_decoder)
plt.hist(train.Education, bins = len(new_education_levels))
plt.show()

test.Education = number.fit_transform(test.Education)
plt.hist(test.Education, bins = len(new_education_levels))
plt.show()
{'10th': 0,
 '11th': 1,
 '12th': 2,
 '1st-4th': 3,
 '5th-6th': 4,
 '7th-8th': 5,
 '9th': 6,
 'Assoc-acdm': 7,
 'Assoc-voc': 8,
 'Bachelors': 9,
 'Doctorate': 10,
 'HS-grad': 11,
 'Masters': 12,
 'Preschool': 13,
 'Prof-school': 14,
 'Some-college': 15}

It would make sense to club these 16 levels into fewer, more meaningful bins, as sketched in the commented-out cells below.

In [38]:
# new_education_levels_encoder = {0:1,1:1,2:2,3:0,4:0,5:0,6:0, 7:5, 8:5, 9:3, 10:7, 11:2, 12:4, 13:0, 14:6, 15:3}

# new_education_levels_train = [new_education_levels_encoder[i] for i in train.Education]
# train.Education = new_education_levels_train

# new_education_levels_test = [new_education_levels_encoder[i] for i in test.Education]
# test.Education = new_education_levels_test

# new_education_levels_decoder = {0:'< 9th', 1:'High School', 2:'High School Grad', 3:'Bachelors', 4:'Masters', 5:'Assoc-acdm / Assoc-voc', 6:'Prof School', 7:'Doctorate'}
In [39]:
# plt.hist(train.Education, bins = len(new_education_levels_decoder))
# plt.show()
# plt.hist(test.Education, bins = len(new_education_levels_decoder))
# plt.show()
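
If the coarser education grouping is revived, Series.map offers a more compact route than the list comprehensions in the commented-out cell above (a sketch reusing the same hypothetical bin mapping):

    # Map the 16 LabelEncoder codes onto the coarser education bins
    new_education_levels_encoder = {0: 1, 1: 1, 2: 2, 3: 0, 4: 0, 5: 0, 6: 0, 7: 5,
                                    8: 5, 9: 3, 10: 7, 11: 2, 12: 4, 13: 0, 14: 6, 15: 3}
    train['Education'] = train['Education'].map(new_education_levels_encoder)
    test['Education'] = test['Education'].map(new_education_levels_encoder)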

Income Group

In [40]:
# Rows for which income is above $50K
print train[train['Income.Group'] != '<=50K'].index
Int64Index([    7,     8,     9,    10,    11,    14,    19,    20,    25,
               27,
            ...
            32530, 32532, 32533, 32536, 32538, 32539, 32545, 32554, 32557,
            32560],
           dtype='int64', length=7841)
In [41]:
# Encode the target: 1 for income above 50K, 0 otherwise
temp = [0]*len(train)
for i in train[train['Income.Group'] != '<=50K'].index:
    temp[i] = 1
train['Income.Group'] = temp
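
The same binary target can also be built in a single vectorized step (an equivalent sketch):

    # 1 if income is above 50K, 0 otherwise
    train['Income.Group'] = (train['Income.Group'] != '<=50K').astype(int)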
In [42]:
train.head()
Out[42]:
ID Age Workclass Education Marital.Status Occupation Relationship Race Sex Hours.Per.Week Native.Country Income.Group
0 1 39 6 9 Never-married Adm-clerical Not-in-family White Male 40 United-States 0
1 2 50 5 9 Married-civ-spouse Exec-managerial Husband White Male 13 United-States 0
2 3 38 3 11 Divorced Handlers-cleaners Not-in-family White Male 40 United-States 0
3 4 53 3 1 Married-civ-spouse Handlers-cleaners Husband Black Male 40 United-States 0
4 5 28 3 9 Married-civ-spouse Prof-specialty Wife Black Female 40 Cuba 0
In [43]:
train.describe()
Out[43]:
ID Age Workclass Education Hours.Per.Week Income.Group
count 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000
mean 16281.000000 38.581647 3.094438 10.298210 40.437456 0.240810
std 9399.695394 13.640433 1.107194 3.870264 12.347429 0.427581
min 1.000000 17.000000 0.000000 0.000000 1.000000 0.000000
25% 8141.000000 28.000000 3.000000 9.000000 40.000000 0.000000
50% 16281.000000 37.000000 3.000000 11.000000 40.000000 0.000000
75% 24421.000000 48.000000 3.000000 12.000000 45.000000 0.000000
max 32561.000000 90.000000 7.000000 15.000000 99.000000 1.000000

Marital Status

In [44]:
train['Marital.Status'].describe()
Out[44]:
count                  32561
unique                     7
top       Married-civ-spouse
freq                   14976
Name: Marital.Status, dtype: object
In [45]:
test['Marital.Status'].describe()
Out[45]:
count                  16281
unique                     7
top       Married-civ-spouse
freq                    7403
Name: Marital.Status, dtype: object
In [46]:
marital_status_levels= train['Marital.Status'].unique()
# Encode marital status variable
print marital_status_levels
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
In [47]:
train['Marital.Status'] = number.fit_transform(train['Marital.Status'])
test['Marital.Status'] = number.fit_transform(test['Marital.Status'])
In [48]:
new_marital_status_levels = train['Marital.Status'].unique()
In [49]:
new_marital_status_levels
Out[49]:
array([4, 2, 0, 3, 5, 1, 6])
In [50]:
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train['Marital.Status'], bins = len(new_marital_status_levels))

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test['Marital.Status'], bins = len(new_marital_status_levels))
Out[50]:
(array([ 2190.,    14.,  7403.,   210.,  5434.,   505.,   525.]),
 array([ 0.        ,  0.85714286,  1.71428571,  2.57142857,  3.42857143,
         4.28571429,  5.14285714,  6.        ]),
 <a list of 7 Patch objects>)
In [51]:
marital_level_status_decoder = dict(zip(new_marital_status_levels, marital_status_levels))
print marital_level_status_decoder
{0: 'Divorced', 1: 'Married-AF-spouse', 2: 'Married-civ-spouse', 3: 'Married-spouse-absent', 4: 'Never-married', 5: 'Separated', 6: 'Widowed'}

Occupation

In [52]:
occupation_levels = train.Occupation.unique()
print occupation_levels
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']
In [53]:
train.Occupation = number.fit_transform(train.Occupation)
test.Occupation = number.fit_transform(test.Occupation)
new_occupation_levels = train.Occupation.unique()
new_occupation_levels
Out[53]:
array([ 0,  3,  5,  9,  7, 11,  2, 13,  4,  6, 12, 10,  1,  8])
In [54]:
new_occupation_levels_decoder = dict(zip(new_occupation_levels,occupation_levels))
new_occupation_levels_decoder
Out[54]:
{0: 'Adm-clerical',
 1: 'Armed-Forces',
 2: 'Craft-repair',
 3: 'Exec-managerial',
 4: 'Farming-fishing',
 5: 'Handlers-cleaners',
 6: 'Machine-op-inspct',
 7: 'Other-service',
 8: 'Priv-house-serv',
 9: 'Prof-specialty',
 10: 'Protective-serv',
 11: 'Sales',
 12: 'Tech-support',
 13: 'Transport-moving'}
In [55]:
plt.hist(train.Occupation, bins = len(new_occupation_levels))
plt.show()

plt.hist(test.Occupation, bins = len(new_occupation_levels))
plt.show()

Relationship

In [56]:
relationship_levels = train.Relationship.unique()
print relationship_levels
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
In [57]:
train.Relationship = number.fit_transform(train.Relationship)
test.Relationship = number.fit_transform(test.Relationship)
In [58]:
new_relationship_levels = train.Relationship.unique()
relationship_levels_decoder = dict(zip(new_relationship_levels, relationship_levels))
relationship_levels_decoder
Out[58]:
{0: 'Husband',
 1: 'Not-in-family',
 2: 'Other-relative',
 3: 'Own-child',
 4: 'Unmarried',
 5: 'Wife'}
In [59]:
plt.hist(train.Relationship, bins=len(relationship_levels))
plt.show()

plt.hist(test.Relationship, bins=len(relationship_levels))
plt.show()

Race

In [60]:
race_levels = train.Race.unique()
print race_levels
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
In [61]:
train.Race = number.fit_transform(train.Race)
test.Race = number.fit_transform(test.Race)
new_race_levels = train.Race.unique()
print new_race_levels
[4 2 1 0 3]
In [62]:
race_levels_decoder = dict(zip(new_race_levels,race_levels))
race_levels_decoder
Out[62]:
{0: 'Amer-Indian-Eskimo',
 1: 'Asian-Pac-Islander',
 2: 'Black',
 3: 'Other',
 4: 'White'}
In [63]:
plt.hist(train.Race, bins = len(race_levels))
plt.show()

plt.hist(test.Race, bins = len(race_levels))
plt.show()

Sex

In [64]:
sex_levels = train.Sex.unique()
train.Sex = number.fit_transform(train.Sex)
test.Sex = number.fit_transform(test.Sex)
new_sex_levels = train.Sex.unique()
sex_levels_decoder = dict(zip(new_sex_levels, sex_levels))
sex_levels_decoder
Out[64]:
{0: 'Female', 1: 'Male'}
In [65]:
fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(train.Sex, bins = 2)
plt.show()

fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(test.Sex, bins = 2)
plt.show()

Native Country

In [66]:
native_country_levels = train['Native.Country'].unique()
train['Native.Country'] = number.fit_transform(train['Native.Country'])
test['Native.Country'] = number.fit_transform(test['Native.Country'])
new_native_country_levels = train['Native.Country'].unique()
native_country_levels_decoder = dict(zip(new_native_country_levels, native_country_levels))
In [67]:
native_country_levels_decoder
Out[67]:
{0: 'Cambodia',
 1: 'Canada',
 2: 'China',
 3: 'Columbia',
 4: 'Cuba',
 5: 'Dominican-Republic',
 6: 'Ecuador',
 7: 'El-Salvador',
 8: 'England',
 9: 'France',
 10: 'Germany',
 11: 'Greece',
 12: 'Guatemala',
 13: 'Haiti',
 14: 'Holand-Netherlands',
 15: 'Honduras',
 16: 'Hong',
 17: 'Hungary',
 18: 'India',
 19: 'Iran',
 20: 'Ireland',
 21: 'Italy',
 22: 'Jamaica',
 23: 'Japan',
 24: 'Laos',
 25: 'Mexico',
 26: 'Nicaragua',
 27: 'Outlying-US(Guam-USVI-etc)',
 28: 'Peru',
 29: 'Philippines',
 30: 'Poland',
 31: 'Portugal',
 32: 'Puerto-Rico',
 33: 'Scotland',
 34: 'South',
 35: 'Taiwan',
 36: 'Thailand',
 37: 'Trinadad&Tobago',
 38: 'United-States',
 39: 'Vietnam',
 40: 'Yugoslavia'}
In [68]:
plt.hist(train['Native.Country'], bins = 41)
plt.show()

plt.hist(test['Native.Country'], bins = 41)
plt.show()

This feature needs its countries binned together; the question is how best to do that.

Count the number of data points per country

In [70]:
country_observations_count_train = [len(train[train['Native.Country'] == i]) for i in range(41)]
country_observations_count_test = [len(test[test['Native.Country'] == i]) for i in range(41)]

print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_train))
print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_test))
{'Canada': 121, 'Hong': 20, 'Dominican-Republic': 70, 'Italy': 73, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 14, 'Scotland': 12, 'Cambodia': 19, 'France': 29, 'Ireland': 24, 'Laos': 18, 'Thailand': 18, 'Ecuador': 28, 'Columbia': 59, 'Cuba': 95, 'Guatemala': 64, 'China': 75, 'Germany': 137, 'Haiti': 44, 'Poland': 60, 'United-States': 29753, 'Philippines': 198, 'Vietnam': 67, 'Japan': 62, 'Iran': 43, 'Honduras': 13, 'South': 80, 'Jamaica': 81, 'England': 90, 'Portugal': 37, 'Mexico': 643, 'El-Salvador': 106, 'India': 100, 'Puerto-Rico': 114, 'Yugoslavia': 16, 'Holand-Netherlands': 1, 'Trinadad&Tobago': 19, 'Greece': 29, 'Hungary': 13, 'Taiwan': 51, 'Nicaragua': 34}
{'Canada': 61, 'Hong': 6, 'Dominican-Republic': 33, 'Italy': 25, 'Peru': 97, 'Outlying-US(Guam-USVI-etc)': 15, 'Scotland': 35, 'Cambodia': 9, 'France': 9, 'Ireland': 32, 'Laos': 308, 'Thailand': 8, 'Ecuador': 17, 'Columbia': 26, 'Cuba': 43, 'Guatemala': 24, 'China': 47, 'Germany': 69, 'Haiti': 31, 'Poland': 30, 'United-States': 19, 'Philippines': 27, 'Vietnam': 7, 'Japan': 5, 'Iran': 13, 'Honduras': 10, 'South': 14, 'Jamaica': 30, 'England': 37, 'Portugal': 70, 'Mexico': 15, 'El-Salvador': 49, 'India': 16, 'Puerto-Rico': 9, 'Yugoslavia': 0, 'Holand-Netherlands': 7, 'Trinadad&Tobago': 14936, 'Greece': 20, 'Hungary': 51, 'Taiwan': 12, 'Nicaragua': 9}

The test-side counts above look scrambled; the likely cause is that test was label-encoded with its own separately fitted encoder: 'Holand-Netherlands' never appears in test, so every country after it gets a code one lower than in train ('United-States' is 38 in train but 37 in test, as the test.head() output further below shows). Setting that aside, let's not over-complicate this feature: club it into two bins, US and non-US (the cell below is left commented out).

In [71]:
# country_list = [0]*len(train['Native.Country'])
# US_index = list(train[train['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# train['Native.Country'] = country_list

# country_list = [0]*len(test['Native.Country'])
# US_index = list(test[test['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# test['Native.Country'] = country_list
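
If the US / non-US clubbing is applied, it is simplest to do it on the raw strings, in place of the LabelEncoder step above: after the separate encodings, 'United-States' is 38 in train but 37 in test, so comparing integer codes as the commented-out loops do would be error-prone. A sketch:

    # 1 for United-States, 0 for everything else -- done on the raw strings,
    # before (or instead of) label-encoding Native.Country
    for df in (train, test):
        df['Native.Country'] = (df['Native.Country'] == 'United-States').astype(int)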
In [72]:
test.head()
Out[72]:
ID Age Workclass Education Marital.Status Occupation Relationship Race Sex Hours.Per.Week Native.Country
0 32562 25 3 1 4 6 3 2 1 40 37
1 32563 38 3 11 2 4 0 4 1 50 37
2 32564 28 1 7 2 10 0 4 1 40 37
3 32565 44 3 15 2 6 0 2 1 40 37
4 32566 18 3 15 4 9 3 4 0 30 37

Multivariate Analysis

In [73]:
sex_income_crosstab = pd.crosstab(train.Sex, train['Income.Group'], margins = True)
print sex_income_crosstab
Income.Group      0     1    All
Sex                             
0              9592  1179  10771
1             15128  6662  21790
All           24720  7841  32561
In [74]:
sex_income_crosstab.iloc[:-1,:-1]
Out[74]:
Income.Group 0 1
Sex
0 9592 1179
1 15128 6662
In [75]:
fig = plt.figure(figsize=(6,6), dpi=1600)
# fig, ax = plt.subplots()
sex_income_crosstab.iloc[:-1,:-1].plot(kind = 'barh', stacked = True, color = ['red','blue'], alpha = 0.65)
Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d656d1e50>
<matplotlib.figure.Figure at 0x7f5d65b66710>
In [76]:
sex_income_crosstab
Out[76]:
Income.Group 0 1 All
Sex
0 9592 1179 10771
1 15128 6662 21790
All 24720 7841 32561
In [77]:
def percentConvert(x):
    # Divide each cell by the 'All' margin (last entry) of its row / column
    return x / float(x.iloc[-1])

sex_income_crosstab.apply(percentConvert, axis = 0)
Out[77]:
Income.Group 0 1 All
Sex
0 0.388026 0.150363 0.330795
1 0.611974 0.849637 0.669205
All 1.000000 1.000000 1.000000
In [78]:
sex_income_crosstab.apply(percentConvert, axis = 1)
Out[78]:
Income.Group 0 1 All
Sex
0 0.890539 0.109461 1.0
1 0.694263 0.305737 1.0
All 0.759190 0.240810 1.0
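
As an aside, newer pandas versions (0.18.1 and later) can produce these percentage tables directly through crosstab's normalize argument, with no helper function (a sketch):

    # Row-wise shares: income distribution within each sex
    pd.crosstab(train.Sex, train['Income.Group'], normalize='index')

    # Column-wise shares: male / female split within each income class
    pd.crosstab(train.Sex, train['Income.Group'], normalize='columns')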
In [79]:
plt.figure(figsize=(6,10), dpi=1600)

sex_income_crosstab.apply(percentConvert, axis = 1).iloc[:-1,:-1].plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Income Distribution by Sex')
plt.show()


sex_income_crosstab.apply(percentConvert, axis = 0).iloc[:-1,:-1].transpose().plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Percentage share of Male / Female in each Income Class')
plt.show()
<matplotlib.figure.Figure at 0x7f5d65686050>
In [80]:
test.plot('Age', 'Hours.Per.Week', kind = 'scatter')
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65828810>

The scatter shows no real relationship between Age and Hours.Per.Week, which matches intuition, so nothing is lost here. In other datasets, a plot like this can surface trends worth exploiting.

In [81]:
train[train.Age > 40]['Income.Group'].mean()
Out[81]:
0.3535669121475861

Let's see how the proportion of high earners changes based on age alone

In [82]:
# For each age threshold, the fraction of people older than it who earn more than 50K
ages = []
high_income_proportion = []
for age in range(17,91,3):
    ages.append(age)
    high_income_proportion.append(train[train.Age > age]['Income.Group'].mean())
In [83]:
plt.plot(ages, high_income_proportion)
plt.title('Proportion earning >$50K among people older than a given age')
Out[83]:
<matplotlib.text.Text at 0x7f5d648055d0>
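
A complementary view is the share of high earners within each age band, rather than above each cutoff (a sketch using pd.cut; the 5-year bands are arbitrary):

    # Proportion earning >50K within 5-year age bands
    age_bands = pd.cut(train.Age, bins=range(15, 95, 5))
    print(train.groupby(age_bands)['Income.Group'].mean())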
In [84]:
train.boxplot(column='Hours.Per.Week', by= 'Sex')
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65892810>
In [85]:
test.boxplot(column='Hours.Per.Week', by= 'Sex')
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d64a1dc10>
In [86]:
train.boxplot(column = 'Age', by = 'Income.Group')
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d6491fd90>

What percentage of males have an income <= 50K?

In [87]:
1 - train[train.Sex == 1]['Income.Group'].mean()
Out[87]:
0.6942634235888022

Predictive Modelling

Model I - Decision Tree Classifier

In [88]:
from sklearn.tree import DecisionTreeClassifier
In [89]:
# define the predictors

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

print independent_variable

# Instantiate the algorithm
model_01 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')

# Fit the algorithm
model_01.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_01_predictions = model_01.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_01_predictions)):
    if model_01_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_01.csv', index = False)
['Age', 'Workclass', 'Education', 'Marital.Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Hours.Per.Week', 'Native.Country']

Your score for this submission is : 0.826976229961.
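
The submission boilerplate is repeated for every model below; a small helper would shrink each remaining cell to a few lines (a sketch that reuses the same files and column names):

    def make_submission(predictions, filename):
        # Map 1/0 predictions back to the original income labels and write a CSV
        submission = pd.read_csv('sample_submission.csv')
        submission['ID'] = test['ID']
        submission['Income.Group'] = np.where(predictions == 1, '>50K', '<=50K')
        submission.to_csv(filename, index=False)

    # e.g. make_submission(model_01_predictions, 'submission_01.csv')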

Model II - Decision Tree Classifier with a reduced feature set

In [90]:
from sklearn.tree import DecisionTreeClassifier
In [91]:
# define the predictors

dependent_variable = 'Income.Group'
independent_variable = ['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']

print independent_variable

# Instantiate the algorithm
model_02 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')

# Fit the algorithm
model_02.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_02_predictions = model_02.predict(test[independent_variable])

submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])

for i in range(len(model_02_predictions)):
    if model_02_predictions[i] == 1: 
        income_group[i] = '>50K'
        
submission['Income.Group'] = income_group

submission.to_csv('submission_02.csv', index = False)
['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']

Your score for this submission is : 0.806522940851.
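
The five features above were picked by hand; the tree fitted in Model I can suggest a more data-driven subset through its feature_importances_ attribute (a sketch):

    # Rank Model I's input features by importance
    all_features = [x for x in train.columns if x not in ['ID', 'Income.Group']]
    importances = sorted(zip(model_01.feature_importances_, all_features), reverse=True)
    pprint(importances)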

Model III - Random Forest Classifier

In [92]:
from sklearn.ensemble import RandomForestClassifier
In [93]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_03 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 10, max_features= 'auto')

# Fit the algorithm
model_03.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_03_predictions = model_03.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_03_predictions)):
    if model_03_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_03.csv', index = False)

Your score for this submission is : 0.8359437381.

Model IV - K Nearest Neighbors Classifier

In [94]:
from sklearn.neighbors import KNeighborsClassifier
In [95]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_04 = KNeighborsClassifier(n_neighbors=10, leaf_size=1)

# Fit the algorithm
model_04.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_04_predictions = model_04.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_04_predictions)):
    if model_04_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_04.csv', index = False)

Your score for this submission is : 0.811313801364.
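
KNN is distance-based, so unscaled features with large ranges (Age 17-90, Hours.Per.Week 1-99, country codes 0-40) dominate the distance computation, and the label-encoded categoricals are treated as if they were ordinal. Standardizing the inputs first might help (a sketch using a scikit-learn pipeline):

    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Scale every feature to zero mean / unit variance before computing distances
    model_04_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
    model_04_scaled.fit(train[independent_variable], train[dependent_variable])
    model_04_scaled_predictions = model_04_scaled.predict(test[independent_variable])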

Model V - Bagging

In [96]:
from sklearn.ensemble import BaggingClassifier
In [97]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_05 = BaggingClassifier()

# Fit the algorithm
model_05.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_05_predictions = model_05.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_05_predictions)):
    if model_05_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_05.csv', index = False)

Model VI - AdaBoost

In [98]:
from sklearn.ensemble import AdaBoostClassifier
In [99]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_06 = AdaBoostClassifier(n_estimators=160, learning_rate= 0.5)

# Fit the algorithm
model_06.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_06_predictions = model_06.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_06_predictions)):
    if model_06_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_06.csv', index = False)

Your score for this submission is : 0.836496529697.

Model VII - Gradient Boosting

In [100]:
from sklearn.ensemble import GradientBoostingClassifier
In [101]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
# model_07 = GradientBoostingClassifier(learning_rate=0.5, n_estimators=1700, max_depth=3)

model_07 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=195, max_depth=3)

# Fit the algorithm
model_07.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_07_predictions = model_07.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_07_predictions)):
    if model_07_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_07.csv', index = False)

Your score for this submission is : 0.841471654075.
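
The learning rate and estimator count above look hand-tuned; a small cross-validated grid search is one way to choose them more systematically (a sketch; the grid values are illustrative only):

    from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer scikit-learn

    param_grid = {'learning_rate': [0.05, 0.1, 0.2], 'n_estimators': [100, 200, 400]}
    search = GridSearchCV(GradientBoostingClassifier(max_depth=3), param_grid, cv=5)
    search.fit(train[independent_variable], train[dependent_variable])
    print(search.best_params_)
    print(search.best_score_)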

Model VIII - Extra Trees Classifier

In [102]:
from sklearn.ensemble import ExtraTreesClassifier
In [103]:
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_08 = ExtraTreesClassifier()
    
# Fit the algorithm
model_08.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_08_predictions = model_08.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_08_predictions)):
    if model_08_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_08.csv', index = False)

Your score for this submission is : 0.81057.