#!/usr/bin/env python
# coding: utf-8

# [![Open in Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/justmarkham/scikit-learn-tips/master?filepath=notebooks%2F07_handle_unknown_categories.ipynb)
#
# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/justmarkham/scikit-learn-tips/blob/master/notebooks/07_handle_unknown_categories.ipynb)
#
# # 🤖⚡ scikit-learn tip #7 ([video](https://www.youtube.com/watch?v=bA6mYC1a_Eg&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=7))
#
# Q: For a one-hot encoded feature, what can you do if new data contains categories that weren't seen during training?
#
# A: Set handle_unknown='ignore' to encode new categories as all zeros.
#
# See example 👇
#
# P.S. If you know all possible categories that might ever appear, you can instead specify the categories manually. handle_unknown='ignore' is useful specifically when you don't know all possible categories.

# In[1]:


import pandas as pd

# Training data contains categories A, B, and C; the new data introduces D.
X = pd.DataFrame({'col': ['A', 'B', 'C', 'B']})
X_new = pd.DataFrame({'col': ['A', 'C', 'D']})


# In[2]:


from sklearn.preprocessing import OneHotEncoder

# Request a dense array so the encoded output is easy to read.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so prefer the new keyword and fall back only when
# the installed release does not recognise it.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only accepts the original keyword name.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')


# In[3]:


X


# In[4]:


# three columns represent categories A, B, and C
ohe.fit_transform(X[['col']])


# In[5]:


# category D was not learned by OneHotEncoder during the "fit" step
X_new


# In[6]:


# category D is encoded as all zeros
ohe.transform(X_new[['col']])


# ### Want more tips? [View all tips on GitHub](https://github.com/justmarkham/scikit-learn-tips) or [Sign up to receive 2 tips by email every week](https://scikit-learn.tips) 💌
#
# © 2020 [Data School](https://www.dataschool.io). All rights reserved.