#!/usr/bin/env python
# coding: utf-8
# ## One-Hot Encoding
#
# - used on categorical variables
# - it replaces a categorical variable/feature with one or more new features that will take the values of 0 or 1
# - increases the data burden (each category becomes its own column, so the number of features grows)
# - increases the efficiency of the process
# In[1]:
import pandas as pd
from IPython.display import display
# Read 'adult.data' into a DataFrame.
# header=None: no line of the file is consumed as column labels, so the
# labels are supplied through `names` instead.
# index_col=False: do not use the first column as the row index.
data = pd.read_csv(
    'adult.data',
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    header=None,
    index_col=False,
)
# In[3]:
# Keep only the seven columns used below (all rows, listed columns in this
# order), rebinding `data` to the narrowed frame, then render it.
data = data.loc[
    :,
    [
        'age',
        'workclass',
        'education',
        'gender',
        'hours-per-week',
        'occupation',
        'income',
    ],
]
display(data)
# In[4]:
# Show the column labels before encoding.
print('Original Features:\n', data.columns.tolist(), '\n')

# One-hot encode the frame. With no `columns` argument, pandas documents that
# get_dummies converts object/string/category-dtype columns and passes the
# remaining columns through unchanged.
data_dummies = pd.get_dummies(data)

# Show the column labels after encoding.
print('Features after One-Hot Encoding:\n', data_dummies.columns.tolist())
# In[ ]:
# In[ ]: