#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# In[2]:

from pprint import pprint

# In[3]:

import seaborn
seaborn.set()

# In[4]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# #### Hypotheses
#
# 1. Age
# 2. Income
# 3. Profession
# 4. Marital Status
# 5. Urban / Rural
# 6. Expenses
# 7. Education
# 8. Total Work Experience

# In[5]:

test.head()

# In[6]:

train.head()

# # Missing Value Treatment

# In[7]:

train.apply(lambda x: sum(x.isnull()))

# In[8]:

test.apply(lambda x: sum(x.isnull()))

# In[9]:

# pandas' Series.mode() is used for imputation below; it skips NaNs in object columns

# #### Workclass

# In[10]:

train.Workclass.mode()[0]

# In[11]:

# fill missing Workclass values with the mode; .loc avoids chained assignment
train.loc[train.Workclass.isnull(), 'Workclass'] = 'Private'

# In[12]:

test.Workclass.mode()[0]

# In[13]:

test.loc[test.Workclass.isnull(), 'Workclass'] = 'Private'

# #### Occupation

# In[14]:

train.Occupation.mode()[0]

# In[15]:

test.Occupation.mode()[0]

# In[16]:

train.loc[train.Occupation.isnull(), 'Occupation'] = 'Prof-specialty'

# In[17]:

test.loc[test.Occupation.isnull(), 'Occupation'] = 'Prof-specialty'

# #### Native Country

# In[18]:

train['Native.Country'].mode()[0]

# In[19]:

test['Native.Country'].mode()[0]

# In[20]:

train.loc[train['Native.Country'].isnull(), 'Native.Country'] = 'United-States'
test.loc[test['Native.Country'].isnull(), 'Native.Country'] = 'United-States'

# # Univariate Analysis

# ### Preprocessing Categorical Variables

# ***Age***

# In[21]:

plt.hist(train.Age, bins=100)
plt.show()
plt.hist(test.Age, bins=100)
plt.show()

# In[22]:

train.Age.describe()

# In[23]:

test.Age.describe()

# ***25% of the population is between 17 and 28***
#
# ***25% of the population is between 28 and 37***
#
# ***75% of the population is under 48***
#
# -------------------------------------------

# ***Working Class***

# In[24]:

train.Workclass.describe()

# In[25]:

test.Workclass.describe()

# In[26]:

unique_working_classes = list(train.Workclass.unique())
unique_working_classes

# In[27]:

from sklearn.preprocessing import LabelEncoder

# In[28]:

# Note: fitting separate encoders on train and test only yields matching codes
# because both frames contain the same set of Workclass categories
number = LabelEncoder()
train.Workclass = number.fit_transform(train.Workclass.astype('str'))
test.Workclass = number.fit_transform(test.Workclass.astype('str'))

# In[29]:

fig = plt.figure(figsize=(4, 4), dpi=1600)
plt.hist(train.Workclass, bins=len(unique_working_classes))
plt.show()
fig = plt.figure(figsize=(4, 4), dpi=1600)
plt.hist(test.Workclass, bins=len(unique_working_classes))
plt.show()

# In[30]:

train.Workclass.unique()

# In[31]:

# Workclass decoder: encoded value -> original label
workclass_decoder = dict(zip(train.Workclass.unique(), unique_working_classes))
workclass_decoder

# In[32]:

# # Combine bins based on some common sense
# new_workclass_encoder = {0:0, 1:0, 2:1, 3:2, 4:2, 5:2, 6:0, 7:1}
# new_workclass_train = [new_workclass_encoder[i] for i in train.Workclass]
# train.Workclass = new_workclass_train
# new_workclass_test = [new_workclass_encoder[i] for i in test.Workclass]
# test.Workclass = new_workclass_test
# new_workclass_decoder = {0:'Government', 1:'Never Worked / Without Pay / Information NA', 2:'Private'}

# In[33]:

# new_workclass_decoder

# In[34]:

# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(train.Workclass, bins = 3)
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(test.Workclass, bins = 3)
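# The caveat noted in In[28] generalizes to every column encoded below:
# `fit_transform` sorts the classes it sees, so separately fitted encoders only
# agree when train and test share the same category set. A minimal sketch of a
# safer pattern, fitting once on the union (shown for Workclass as it was before
# encoding; left commented out, not part of the original pipeline):

# encoder = LabelEncoder()
# encoder.fit(pd.concat([train.Workclass, test.Workclass]).astype('str'))
# train['Workclass'] = encoder.transform(train.Workclass.astype('str'))
# test['Workclass'] = encoder.transform(test.Workclass.astype('str'))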
# *** Education ***

# In[35]:

education_levels_train = train.Education.unique()
print(education_levels_train)

# In[36]:

train.Education = number.fit_transform(train.Education)
new_education_levels = train.Education.unique()
# decoder: encoded value -> original label
education_levels_decoder = dict(zip(new_education_levels, education_levels_train))
pprint(education_levels_decoder)
plt.hist(train.Education, bins=len(new_education_levels))
plt.show()
test.Education = number.fit_transform(test.Education)
plt.hist(test.Education, bins=len(new_education_levels))
plt.show()

# ...it would make sense to club these levels into broader, more sensible bins

# In[37]:

pprint(education_levels_decoder)

# In[38]:

# new_education_levels_encoder = {0:1, 1:1, 2:2, 3:0, 4:0, 5:0, 6:0, 7:5, 8:5, 9:3, 10:7, 11:2, 12:4, 13:0, 14:6, 15:3}
# new_education_levels_train = [new_education_levels_encoder[i] for i in train.Education]
# train.Education = new_education_levels_train
# new_education_levels_test = [new_education_levels_encoder[i] for i in test.Education]
# test.Education = new_education_levels_test
# new_education_levels_decoder = {0:'< 9th', 1:'High School', 2:'High School Grad', 3:'Bachelors', 4:'Masters', 5:'Assoc-acdm / Assoc-voc', 6:'Prof School', 7:'Doctorate'}

# In[39]:

# plt.hist(train.Education, bins = len(new_education_levels_decoder))
# plt.show()
# plt.hist(test.Education, bins = len(new_education_levels_decoder))
# plt.show()

# #### Income Group

# In[40]:

# Rows for which income is above $50K
print(train[train['Income.Group'] != '<=50K'].index)

# In[41]:

# binary target: 1 for '>50K', 0 for '<=50K'
train['Income.Group'] = (train['Income.Group'] != '<=50K').astype(int)

# In[42]:

train.head()

# In[43]:

train.describe()

# ##### Marital Status

# In[44]:

train['Marital.Status'].describe()

# In[45]:

test['Marital.Status'].describe()

# In[46]:

# Encode the marital status variable
marital_status_levels = train['Marital.Status'].unique()
print(marital_status_levels)

# In[47]:

train['Marital.Status'] = number.fit_transform(train['Marital.Status'])
test['Marital.Status'] = number.fit_transform(test['Marital.Status'])

# In[48]:

new_marital_status_levels = train['Marital.Status'].unique()

# In[49]:

new_marital_status_levels

# In[50]:

fig = plt.figure(figsize=(4, 4), dpi=1600)
plt.hist(train['Marital.Status'], bins=len(new_marital_status_levels))
fig = plt.figure(figsize=(4, 4), dpi=1600)
plt.hist(test['Marital.Status'], bins=len(new_marital_status_levels))

# In[51]:

marital_status_levels_decoder = dict(zip(new_marital_status_levels, marital_status_levels))
print(marital_status_levels_decoder)

# #### Occupation

# In[52]:

occupation_levels = train.Occupation.unique()
print(occupation_levels)

# In[53]:

train.Occupation = number.fit_transform(train.Occupation)
test.Occupation = number.fit_transform(test.Occupation)
new_occupation_levels = train.Occupation.unique()
new_occupation_levels

# In[54]:

new_occupation_levels_decoder = dict(zip(new_occupation_levels, occupation_levels))
new_occupation_levels_decoder

# In[55]:

plt.hist(train.Occupation, bins=len(new_occupation_levels))
plt.show()
plt.hist(test.Occupation, bins=len(new_occupation_levels))
plt.show()
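# The encode-then-build-a-decoder pattern above repeats for every column. As a
# minimal alternative sketch, pandas' factorize returns the codes and the level
# decoder in one call; its codes follow order of first appearance, so they differ
# numerically from LabelEncoder's sorted codes (left commented, for illustration):

# codes, uniques = pd.factorize(train.Occupation)
# occupation_decoder = dict(enumerate(uniques))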
# #### Relationship

# In[56]:

relationship_levels = train.Relationship.unique()
print(relationship_levels)

# In[57]:

train.Relationship = number.fit_transform(train.Relationship)
test.Relationship = number.fit_transform(test.Relationship)

# In[58]:

new_relationship_levels = train.Relationship.unique()
relationship_levels_decoder = dict(zip(new_relationship_levels, relationship_levels))
relationship_levels_decoder

# In[59]:

plt.hist(train.Relationship, bins=len(relationship_levels))
plt.show()
plt.hist(test.Relationship, bins=len(relationship_levels))
plt.show()

# #### Race

# In[60]:

race_levels = train.Race.unique()
print(race_levels)

# In[61]:

train.Race = number.fit_transform(train.Race)
test.Race = number.fit_transform(test.Race)
new_race_levels = train.Race.unique()
print(new_race_levels)

# In[62]:

race_levels_decoder = dict(zip(new_race_levels, race_levels))
race_levels_decoder

# In[63]:

plt.hist(train.Race, bins=len(race_levels))
plt.show()
plt.hist(test.Race, bins=len(race_levels))
plt.show()

# #### Sex

# In[64]:

sex_levels = train.Sex.unique()
train.Sex = number.fit_transform(train.Sex)
test.Sex = number.fit_transform(test.Sex)
new_sex_levels = train.Sex.unique()
sex_levels_decoder = dict(zip(new_sex_levels, sex_levels))
sex_levels_decoder

# In[65]:

fig = plt.figure(figsize=(3, 4), dpi=1600)
plt.hist(train.Sex, bins=2)
plt.show()
fig = plt.figure(figsize=(3, 4), dpi=1600)
plt.hist(test.Sex, bins=2)
plt.show()

# #### Native Country

# In[66]:

native_country_levels = train['Native.Country'].unique()
train['Native.Country'] = number.fit_transform(train['Native.Country'])
test['Native.Country'] = number.fit_transform(test['Native.Country'])
new_native_country_levels = train['Native.Country'].unique()
native_country_levels_decoder = dict(zip(new_native_country_levels, native_country_levels))

# In[67]:

native_country_levels_decoder

# In[68]:

plt.hist(train['Native.Country'], bins=41)
plt.show()
plt.hist(test['Native.Country'], bins=41)
plt.show()

# This feature needs its countries binned together. The question is how best to do that.

# In[69]:

pprint(native_country_levels_decoder)

# ##### Count the number of data points per country

# In[70]:

country_observations_count_train = [len(train[train['Native.Country'] == i]) for i in range(41)]
country_observations_count_test = [len(test[test['Native.Country'] == i]) for i in range(41)]
print(dict(zip([native_country_levels_decoder[i] for i in range(41)], country_observations_count_train)))
print(dict(zip([native_country_levels_decoder[i] for i in range(41)], country_observations_count_test)))

# #### Let's not over-complicate: club this into two bins, one for US and one for non-US

# In[71]:

# country_list = [0]*len(train['Native.Country'])
# US_index = list(train[train['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# train['Native.Country'] = country_list
# country_list = [0]*len(test['Native.Country'])
# US_index = list(test[test['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# test['Native.Country'] = country_list

# In[72]:

test.head()
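# A vectorized equivalent of the commented-out US / non-US binning in In[71]
# (38 is the code LabelEncoder assigned to 'United-States'); left commented so
# the models below still see all 41 levels, as in the original run:

# train['Native.Country'] = np.where(train['Native.Country'] == 38, 1, 0)
# test['Native.Country'] = np.where(test['Native.Country'] == 38, 1, 0)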
# # Multivariate Analysis

# In[73]:

sex_income_crosstab = pd.crosstab(train.Sex, train['Income.Group'], margins=True)
print(sex_income_crosstab)

# In[74]:

sex_income_crosstab.iloc[:-1, :-1]

# In[75]:

fig = plt.figure(figsize=(6, 6), dpi=1600)
# fig, ax = plt.subplots()
sex_income_crosstab.iloc[:-1, :-1].plot(kind='barh', stacked=True, color=['red', 'blue'], alpha=0.65)

# In[76]:

sex_income_crosstab

# In[77]:

def percentConvert(x):
    # divide each entry by the 'All' margin at the end of its row / column
    return x / float(x.iloc[-1])

sex_income_crosstab.apply(percentConvert, axis=0)

# In[78]:

sex_income_crosstab.apply(percentConvert, axis=1)

# In[79]:

plt.figure(figsize=(6, 10), dpi=1600)
sex_income_crosstab.apply(percentConvert, axis=1).iloc[:-1, :-1].plot(kind='barh', stacked=True, color=['red', 'blue'], alpha=0.65)
plt.title('Income Distribution by Sex')
plt.show()
sex_income_crosstab.apply(percentConvert, axis=0).iloc[:-1, :-1].transpose().plot(kind='barh', stacked=True, color=['red', 'blue'], alpha=0.65)
plt.title('Percentage share of Male / Female in each Income Class')
plt.show()

# In[80]:

test.plot('Age', 'Hours.Per.Week', kind='scatter')

# The scatter shows no real relationship between Age and Hours.Per.Week. Intuitively we were not expecting any specific trend, so this is fine. In other cases such plots can reveal interesting trends worth exploiting.

# In[81]:

train[train.Age > 40]['Income.Group'].mean()

# Let's see how income classification changes solely based on age

# In[82]:

ages = []
high_income_proportion = []
for age in range(17, 91, 3):
    ages.append(age)
    high_income_proportion.append(train[train.Age > age]['Income.Group'].mean())

# In[83]:

plt.plot(ages, high_income_proportion)
plt.title('Proportion of people earning above $50K, given age')

# In[84]:

train.boxplot(column='Hours.Per.Week', by='Sex')

# In[85]:

test.boxplot(column='Hours.Per.Week', by='Sex')

# In[86]:

train.boxplot(column='Age', by='Income.Group')

# What is the percentage of males whose income is <= 50K?

# In[87]:

1 - train[train.Sex == 1]['Income.Group'].mean()

# # Predictive Modelling

# ### Model I - Decision Tree Classifier

# In[88]:

from sklearn.tree import DecisionTreeClassifier

# In[89]:

# Define the predictors
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
print(independent_variable)

# Instantiate the algorithm
model_01 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features='sqrt')

# Fit the algorithm
model_01.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_01_predictions = model_01.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_01_predictions)):
    if model_01_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_01.csv', index=False)

# Your score for this submission is: 0.826976229961.
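# The prediction-to-submission block above is repeated verbatim for every model
# below. A minimal helper that could replace those blocks (the function name is
# mine, not part of the original pipeline; left commented for illustration):

# def make_submission(predictions, filename):
#     sub = pd.read_csv('sample_submission.csv')
#     sub['ID'] = test['ID']
#     sub['Income.Group'] = np.where(predictions == 1, '>50K', '<=50K')
#     sub.to_csv(filename, index=False)

# make_submission(model_01_predictions, 'submission_01.csv')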
# ### Model II - Decision Tree Classifier with a reduced number of features

# In[90]:

from sklearn.tree import DecisionTreeClassifier

# In[91]:

# Define the predictors
dependent_variable = 'Income.Group'
independent_variable = ['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']
print(independent_variable)

# Instantiate the algorithm
model_02 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features='sqrt')

# Fit the algorithm
model_02.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_02_predictions = model_02.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_02_predictions)):
    if model_02_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_02.csv', index=False)

# Your score for this submission is: 0.806522940851.

# ### Model III - Random Forest Classifier

# In[92]:

from sklearn.ensemble import RandomForestClassifier

# In[93]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm ('sqrt' is what the legacy 'auto' setting meant for classifiers)
model_03 = RandomForestClassifier(max_depth=10, min_samples_leaf=10, max_features='sqrt')

# Fit the algorithm
model_03.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_03_predictions = model_03.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_03_predictions)):
    if model_03_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_03.csv', index=False)

# Your score for this submission is: 0.8359437381.

# ### Model IV - K Nearest Neighbors Classifier

# In[94]:

from sklearn.neighbors import KNeighborsClassifier

# In[95]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_04 = KNeighborsClassifier(n_neighbors=10, leaf_size=1)

# Fit the algorithm
model_04.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_04_predictions = model_04.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_04_predictions)):
    if model_04_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_04.csv', index=False)

# Your score for this submission is: 0.811313801364.
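# KNN is distance-based, so the unscaled features above (Age in years,
# Hours.Per.Week, small integer category codes) contribute very unevenly to the
# distances. A sketch of a scaled variant worth trying (model_04b is my name,
# not part of the original pipeline; left commented):

# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# model_04b = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
# model_04b.fit(train[independent_variable], train[dependent_variable])
# model_04b_predictions = model_04b.predict(test[independent_variable])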
# ### Model V - Bagging

# In[96]:

from sklearn.ensemble import BaggingClassifier

# In[97]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_05 = BaggingClassifier()

# Fit the algorithm
model_05.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_05_predictions = model_05.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_05_predictions)):
    if model_05_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_05.csv', index=False)

# ### Model VI - AdaBoost

# In[98]:

from sklearn.ensemble import AdaBoostClassifier

# In[99]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_06 = AdaBoostClassifier(n_estimators=160, learning_rate=0.5)

# Fit the algorithm
model_06.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_06_predictions = model_06.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_06_predictions)):
    if model_06_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_06.csv', index=False)

# Your score for this submission is: 0.836496529697.

# ### Model VII - Gradient Boosting

# In[100]:

from sklearn.ensemble import GradientBoostingClassifier

# In[101]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
# model_07 = GradientBoostingClassifier(learning_rate=0.5, n_estimators=1700, max_depth=3)
model_07 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=195, max_depth=3)

# Fit the algorithm
model_07.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_07_predictions = model_07.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_07_predictions)):
    if model_07_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_07.csv', index=False)

# Your score for this submission is: 0.841471654075.

# ### Model VIII - Extra Trees Classifier

# In[102]:

from sklearn.ensemble import ExtraTreesClassifier

# In[103]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Instantiate the algorithm
model_08 = ExtraTreesClassifier()

# Fit the algorithm
model_08.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_08_predictions = model_08.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K'] * len(test['ID'])
for i in range(len(model_08_predictions)):
    if model_08_predictions[i] == 1:
        income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_08.csv', index=False)

# Your score for this submission is: 0.81057.
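# The hand-picked learning_rate / n_estimators values in Model VII suggest a
# systematic search would be worthwhile instead of probing the leaderboard.
# A minimal cross-validation sketch (the grid values are illustrative, not
# tuned; left commented since the search is expensive):

# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate': [0.1, 0.2, 0.5],
#               'n_estimators': [100, 195, 300]}
# search = GridSearchCV(GradientBoostingClassifier(max_depth=3), param_grid,
#                       cv=5, scoring='accuracy')
# search.fit(train[independent_variable], train[dependent_variable])
# print(search.best_params_, search.best_score_)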