#!/usr/bin/env python
# coding: utf-8

# [![Open in Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/justmarkham/scikit-learn-tips/master?filepath=notebooks%2F43_ordinal_encoding_for_trees.ipynb)
#
# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/justmarkham/scikit-learn-tips/blob/master/notebooks/43_ordinal_encoding_for_trees.ipynb)
#
# # 🤖⚡ scikit-learn tip #43 ([video](https://www.youtube.com/watch?v=n_x40CdPZss&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=43))
#
# With a tree-based model, try OrdinalEncoder instead of OneHotEncoder even for nominal (unordered) features.
#
# Accuracy will often be similar, but OrdinalEncoder will be much faster!
#
# See example 👇

# In[1]:


import pandas as pd

# Downloads the dataset over the network on every run.
df = pd.read_csv('https://www.openml.org/data/get_csv/1595261/adult-census.csv')


# In[2]:


import time

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score


def _timed_cv_mean(pipe, X, y):
    """Return the mean cross-validation score of *pipe* on (X, y).

    Prints the elapsed wall-clock time of the ``cross_val_score`` call.
    This replaces the IPython ``%time`` magic so the script also runs
    under a plain ``python`` interpreter, where ``get_ipython`` is not
    defined.
    """
    start = time.perf_counter()
    mean_score = cross_val_score(pipe, X, y).mean()
    elapsed = time.perf_counter() - start
    print(f'Wall time: {elapsed:.2f} s')
    return mean_score


# In[3]:


categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']


# In[4]:


X = df[categorical_cols]
y = df['class']


# In[5]:


# OneHotEncoder creates 60 columns
ohe = OneHotEncoder()
# Printed explicitly: a bare expression shows nothing outside a notebook.
print(ohe.fit_transform(X).shape)


# In[6]:


# OrdinalEncoder creates 7 columns
oe = OrdinalEncoder()
print(oe.fit_transform(X).shape)


# In[7]:


# Random Forests is a tree-based model
rf = RandomForestClassifier(random_state=1, n_jobs=-1)


# In[8]:


# Pipeline containing OneHotEncoder
ohe_pipe = make_pipeline(ohe, rf)
print(_timed_cv_mean(ohe_pipe, X, y))


# In[9]:


# Pipeline containing OrdinalEncoder
oe_pipe = make_pipeline(oe, rf)
print(_timed_cv_mean(oe_pipe, X, y))


# ### Want more tips?
#
# [View all tips on GitHub](https://github.com/justmarkham/scikit-learn-tips) or [Sign up to receive 2 tips by email every week](https://scikit-learn.tips) 💌
#
# © 2020 [Data School](https://www.dataschool.io). All rights reserved.