import pandas as pd
from IPython.display import display
# The raw file is read with header=None, so every column name is supplied here.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education-num', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# Only this subset of columns is kept for the rest of the example.
selected_columns = ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']

# Load the full table, then immediately narrow it to the selected columns.
data = pd.read_csv(
    'adult.data',
    header=None,
    index_col=False,
    names=column_names,
)[selected_columns]
display(data)
| | age | workclass | education | gender | hours-per-week | occupation | income |
---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
5 | 37 | Private | Masters | Female | 40 | Exec-managerial | <=50K |
6 | 49 | Private | 9th | Female | 16 | Other-service | <=50K |
7 | 52 | Self-emp-not-inc | HS-grad | Male | 45 | Exec-managerial | >50K |
8 | 31 | Private | Masters | Female | 50 | Prof-specialty | >50K |
9 | 42 | Private | Bachelors | Male | 40 | Exec-managerial | >50K |
10 | 37 | Private | Some-college | Male | 80 | Exec-managerial | >50K |
11 | 30 | State-gov | Bachelors | Male | 40 | Prof-specialty | >50K |
12 | 23 | Private | Bachelors | Female | 30 | Adm-clerical | <=50K |
13 | 32 | Private | Assoc-acdm | Male | 50 | Sales | <=50K |
14 | 40 | Private | Assoc-voc | Male | 40 | Craft-repair | >50K |
15 | 34 | Private | 7th-8th | Male | 45 | Transport-moving | <=50K |
16 | 25 | Self-emp-not-inc | HS-grad | Male | 35 | Farming-fishing | <=50K |
17 | 32 | Private | HS-grad | Male | 40 | Machine-op-inspct | <=50K |
18 | 38 | Private | 11th | Male | 50 | Sales | <=50K |
19 | 43 | Self-emp-not-inc | Masters | Female | 45 | Exec-managerial | >50K |
20 | 40 | Private | Doctorate | Male | 60 | Prof-specialty | >50K |
21 | 54 | Private | HS-grad | Female | 20 | Other-service | <=50K |
22 | 35 | Federal-gov | 9th | Male | 40 | Farming-fishing | <=50K |
23 | 43 | Private | 11th | Male | 40 | Transport-moving | <=50K |
24 | 59 | Private | HS-grad | Female | 40 | Tech-support | <=50K |
25 | 56 | Local-gov | Bachelors | Male | 40 | Tech-support | >50K |
26 | 19 | Private | HS-grad | Male | 40 | Craft-repair | <=50K |
27 | 54 | ? | Some-college | Male | 60 | ? | >50K |
28 | 39 | Private | HS-grad | Male | 80 | Exec-managerial | <=50K |
29 | 49 | Private | HS-grad | Male | 40 | Craft-repair | <=50K |
... | ... | ... | ... | ... | ... | ... | ... |
32531 | 30 | ? | Bachelors | Female | 99 | ? | <=50K |
32532 | 34 | Private | Doctorate | Male | 60 | Prof-specialty | >50K |
32533 | 54 | Private | Bachelors | Male | 50 | Exec-managerial | >50K |
32534 | 37 | Private | Some-college | Female | 39 | Adm-clerical | <=50K |
32535 | 22 | Private | 12th | Male | 35 | Protective-serv | <=50K |
32536 | 34 | Private | Bachelors | Female | 55 | Exec-managerial | >50K |
32537 | 30 | Private | HS-grad | Male | 46 | Craft-repair | <=50K |
32538 | 38 | Private | Bachelors | Female | 45 | Prof-specialty | >50K |
32539 | 71 | ? | Doctorate | Male | 10 | ? | >50K |
32540 | 45 | State-gov | HS-grad | Female | 40 | Adm-clerical | <=50K |
32541 | 41 | ? | HS-grad | Female | 32 | ? | <=50K |
32542 | 72 | ? | HS-grad | Male | 25 | ? | <=50K |
32543 | 45 | Local-gov | Assoc-acdm | Female | 48 | Prof-specialty | <=50K |
32544 | 31 | Private | Masters | Female | 30 | Other-service | <=50K |
32545 | 39 | Local-gov | Assoc-acdm | Female | 20 | Adm-clerical | >50K |
32546 | 37 | Private | Assoc-acdm | Female | 40 | Tech-support | <=50K |
32547 | 43 | Private | HS-grad | Male | 40 | Machine-op-inspct | <=50K |
32548 | 65 | Self-emp-not-inc | Prof-school | Male | 60 | Prof-specialty | <=50K |
32549 | 43 | State-gov | Some-college | Female | 40 | Adm-clerical | <=50K |
32550 | 43 | Self-emp-not-inc | Some-college | Male | 50 | Craft-repair | <=50K |
32551 | 32 | Private | 10th | Male | 40 | Handlers-cleaners | <=50K |
32552 | 43 | Private | Assoc-voc | Male | 45 | Sales | <=50K |
32553 | 32 | Private | Masters | Male | 11 | Tech-support | <=50K |
32554 | 53 | Private | Masters | Male | 40 | Exec-managerial | >50K |
32555 | 22 | Private | Some-college | Male | 40 | Protective-serv | <=50K |
32556 | 27 | Private | Assoc-acdm | Female | 38 | Tech-support | <=50K |
32557 | 40 | Private | HS-grad | Male | 40 | Machine-op-inspct | >50K |
32558 | 58 | Private | HS-grad | Female | 40 | Adm-clerical | <=50K |
32559 | 22 | Private | HS-grad | Male | 20 | Adm-clerical | <=50K |
32560 | 52 | Self-emp-inc | HS-grad | Female | 40 | Exec-managerial | >50K |
32561 rows × 7 columns
# Show the column names before encoding.
original_features = data.columns.tolist()
print('Original Features:\n', original_features, '\n')

# One-hot encode the frame with pandas' default column selection.
data_dummies = pd.get_dummies(data)

# Show the expanded column names after encoding.
encoded_features = data_dummies.columns.tolist()
print('Features after One-Hot Encoding:\n', encoded_features)
Original Features: ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] Features after One-Hot Encoding: ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']
# Model inputs: the label-based slice from 'age' through
# 'occupation_ Transport-moving'. A .loc label slice includes both endpoints.
# The income indicator columns are intended to stay out of this slice.
# (.ix was deprecated in pandas 0.20 and removed in pandas 1.0; .loc is the
# label-based equivalent.)
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
X = features.values
# Binary target: the indicator column for income above 50K. The leading space
# in the label comes from the raw data values and must be kept.
y = data_dummies['income_ >50K'].values
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation; random_state=0 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression().fit(X_train, y_train)

# For a classifier, score() reports mean accuracy on the given data.
test_accuracy = logreg.score(X_test, y_test)
print('Logistic Regression score on the test set: {:.2f}'.format(test_accuracy))
Logistic Regression score on the test set: 0.81