import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Load the dataset only after all imports are in place (imports belong at the
# top of the file). The CSV is fetched from OpenML over the network.
df = pd.read_csv('https://www.openml.org/data/get_csv/1595261/adult-census.csv')
# The seven categorical features used as model inputs.
categorical_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
]
# Feature matrix: only the categorical columns listed above.
X = df[categorical_cols]
# Target: the 'class' column of the same frame.
y = df['class']
# OneHotEncoder creates 60 columns: the 7 categorical features are expanded
# into one indicator column per distinct category value.
ohe = OneHotEncoder()
# Last expression of the notebook cell: displays the (rows, columns) shape of
# the encoded matrix; the recorded output follows.
ohe.fit_transform(X).shape
(48842, 60)
# OrdinalEncoder creates 7 columns: each feature stays a single column, with
# its category values replaced by integer codes.
oe = OrdinalEncoder()
# Last expression of the notebook cell: displays the (rows, columns) shape of
# the encoded matrix; the recorded output follows.
oe.fit_transform(X).shape
(48842, 7)
# Random Forest is a tree-based model.
# random_state=1 fixes the seed so repeated runs give the same forest;
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
rf = RandomForestClassifier(random_state=1, n_jobs=-1)
# Pipeline containing OneHotEncoder: encoding step followed by the forest, so
# the encoder is fitted as part of each cross-validation fit.
ohe_pipe = make_pipeline(ohe, rf)
# %time is an IPython magic (not plain Python) that reports how long the
# statement took. The statement itself is the mean of the per-fold scores.
# NOTE(review): cross_val_score is documented to fit clones of the estimator,
# so reusing `ohe` and `rf` here should not leak fitted state — confirm for
# the installed scikit-learn version.
%time cross_val_score(ohe_pipe, X, y).mean()
CPU times: user 1.95 s, sys: 189 ms, total: 2.14 s Wall time: 23.2 s
0.8262561170407418
# Pipeline containing OrdinalEncoder: same forest, but fed 7 integer-coded
# columns instead of 60 one-hot columns.
oe_pipe = make_pipeline(oe, rf)
# %time is an IPython magic (not plain Python) that reports how long the
# statement took. In the recorded run this pipeline scores about the same as
# the one-hot pipeline (0.8257 vs 0.8263) with a much shorter wall time
# (3.83 s vs 23.2 s).
%time cross_val_score(oe_pipe, X, y).mean()
CPU times: user 1.67 s, sys: 133 ms, total: 1.81 s Wall time: 3.83 s
0.8256623624061437
© 2020 Data School. All rights reserved.