#!/usr/bin/env python
# coding: utf-8

# [![Open in Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/justmarkham/scikit-learn-tips/master?filepath=notebooks%2F26_stratified_train_test_split.ipynb)
#
# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/justmarkham/scikit-learn-tips/blob/master/notebooks/26_stratified_train_test_split.ipynb)
#
# # 🤖⚡ scikit-learn tip #26 ([video](https://www.youtube.com/watch?v=Zcjl8xPLmPw&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=26))
#
# Are you using train_test_split with a classification problem?
#
# Be sure to set "stratify=y" so that class proportions are preserved when splitting.
#
# Especially important if you have class imbalance!
#
# See example 👇

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy imbalanced dataset: 8 rows, 6 labelled 'not fraud' and 2 labelled 'fraud'.
df = pd.DataFrame({
    'feature': list(range(8)),
    'target': ['not fraud'] * 6 + ['fraud'] * 2,
})

# Double brackets select a one-column DataFrame for the features;
# single brackets select the target column as a Series.
X = df[['feature']]
y = df['target']

# ## Not stratified
#
# `y_train` contains **NONE** of the minority class, whereas `y_test` contains **ALL** of the minority class. (This is bad!)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# A bare expression is only displayed inside a notebook; print explicitly so
# the result is also visible when this file is run as a script.
print(y_train)
print(y_test)

# ## Stratified
#
# Class proportions are the **SAME** in `y_train` and `y_test`. (This is good!)

# Same split size and seed as above; the only change is stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0, stratify=y
)

print(y_train)
print(y_test)

# ### Want more tips? [View all tips on GitHub](https://github.com/justmarkham/scikit-learn-tips) or [Sign up to receive 2 tips by email every week](https://scikit-learn.tips) 💌
#
# © 2020 [Data School](https://www.dataschool.io). All rights reserved.