#!/usr/bin/env python
# coding: utf-8
# # π― Uplift modeling `metrics`
#
#
#
#
#
#
#
# SCIKIT-UPLIFT REPO |
# SCIKIT-UPLIFT DOCS |
# USER GUIDE
#
#
# In[1]:
import sys
# install uplift library scikit-uplift and other libraries
get_ipython().system('{sys.executable} -m pip install scikit-uplift dill catboost')
# # π Load data
#
# We are going to use a `Lenta dataset` from the BigTarget Hackathon hosted in summer 2020 by Lenta and Microsoft.
#
# Lenta is a russian food retailer.
#
# ### Data description
#
# βοΈ Dataset can be loaded from `sklift.datasets` module using `fetch_lenta` function.
#
# Read more about dataset in the api docs.
#
# This is an uplift modeling dataset containing data about Lenta's customers grociery shopping, marketing campaigns communications as `treatment` and store visits as `target`.
#
# #### βοΈ Major columns:
#
# - `group` - treatment / control flag
# - `response_att` - binary target
# - `CardHolder` - customer id
# - `gender` - customer gender
# - `age` - customer age
# In[2]:
from sklift.datasets import fetch_lenta
# returns sklearn Bunch object
# with data, target, treatment keys
# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values
dataset = fetch_lenta()
# In[3]:
print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")
# # π EDA
# In[4]:
dataset.data.head().append(dataset.data.tail())
# ### π€ target share for `treatment / control`
# In[5]:
import pandas as pd
pd.crosstab(dataset.treatment, dataset.target, normalize='index')
# In[6]:
# make treatment binary
treat_dict = {
'test': 1,
'control': 0
}
dataset.treatment = dataset.treatment.map(treat_dict)
# In[7]:
# fill NaNs in the categorical feature `gender`
# for CatBoostClassifier
dataset.data['gender'] = dataset.data['gender'].fillna(value='ΠΠ΅ ΠΎΠΏΡΠ΅Π΄Π΅Π»Π΅Π½')
print(dataset.data['gender'].value_counts(dropna=False))
# ### βοΈ train test split
#
# - stratify by two columns: treatment and target.
#
# `Intuition:` In a binary classification problem definition we stratify train set by splitting target `0/1` column. In uplift modeling we have two columns instead of one.
# In[8]:
from sklearn.model_selection import train_test_split
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
dataset.data,
dataset.treatment,
dataset.target,
stratify=stratify_cols,
test_size=0.3,
random_state=42
)
print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
# # πΎ Class Transformation uplift model
#
# `Class transformation` method is described here
#
# Class transormation method `may` be used in case of treatment unbalanced data. In this case one will get not an uplift score but some *ranking* score still useful for ranking objects.
# In[9]:
from sklift.models import ClassTransformation
from catboost import CatBoostClassifier
estimator = CatBoostClassifier(verbose=100,
cat_features=['gender'],
random_state=42,
thread_count=1)
ct_model = ClassTransformation(estimator=estimator)
# In[10]:
ct_model.fit(
X=X_train,
y=y_train,
treatment=trmnt_train
)
# ### Save model
# In[11]:
import dill
with open("model.dill", 'wb') as f:
dill.dump(ct_model, f)
# ### Uplift prediction
# In[12]:
uplift_ct = ct_model.predict(X_val)
# # πππ Uplift metrics
# ## π `uplift@k`
#
# - uplift at first k%
# - usually falls between [0; 1] depending on k, model quality and data
#
#
# ### `uplift@k` = `target mean at k% in the treatment group` - `target mean at k% in the control group`
#
# ___
#
# How to count `uplift@k`:
#
# 1. sort by predicted uplift
# 2. select first k%
# 3. count target mean in the treatment group
# 4. count target mean in the control group
# 5. substract the mean in the control group from the mean in the treatment group
#
# ---
#
# Code parameter options:
#
# - `strategy='overall'` - sort by uplift treatment and control together
# - `strategy='by_group'` - sort by uplift treatment and control separately
# In[14]:
from sklift.metrics import uplift_at_k
# k = 10%
k = 0.1
# strategy='overall' sort by uplift treatment and control together
uplift_overall = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='overall', k=k)
# strategy='by_group' sort by uplift treatment and control separately
uplift_bygroup = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='by_group', k=k)
print(f"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)")
print(f"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)")
# ## π `uplift_by_percentile` table
#
# Count metrics for each percentile in data in descending order by uplift prediction (by rows):
#
# - `n_treatment` - treatment group size in the one percentile
# - `n_control` - control group size in the one percentile
# - `response_rate_treatment` - target mean in the treatment group in the one percentile
# - `response_rate_control` - target mean in the control group in the one percentile
# - `uplift = response_rate_treatment - response_rate_control` in the one percentile
#
# ___
#
# Code parameter options are:
#
# - `strategy='overall'` - sort by uplift treatment and control groups together
# - `strategy='by_group'` - sort by uplift treatment and control groups separately
# - `total=True` - show total metric on full data
# - `std=True` - show metrics std by row
# In[15]:
from sklift.metrics import uplift_by_percentile
uplift_by_percentile(y_val, uplift_ct, trmnt_val,
strategy='overall',
total=True, std=True, bins=10)
# ## π `weighted average uplift `
#
# - counts uplift on full data
# - uses results from `uplift_by_percentile` table
# - result depends on number of bins
#
# ### `weighted average uplift` = `sum of uplift by percentile weighted on the treatment group size`
#
# In[16]:
from sklift.metrics import weighted_average_uplift
uplift_full_data = weighted_average_uplift(y_val, uplift_ct, trmnt_val, bins=10)
print(f"average uplift on full data: {uplift_full_data:.4f}")
# ## π `uplift_by_percentile` plot
#
# - visualize results of `uplift_by_percentile` table
#
# Two ways to plot:
#
# - line plot `kind='line'`
# - bar plot `kind='bar'`
#
# In[17]:
from sklift.viz import plot_uplift_by_percentile
# line plot
plot_uplift_by_percentile(y_val, uplift_ct, trmnt_val, strategy='overall', kind='line');
# In[18]:
# bar plot
plot_uplift_by_percentile(y_val, uplift_ct, trmnt_val, strategy='overall', kind='bar');
# ## π `Qini curve`
#
# The curve plots the absolute incremental outcome of the treated group compared to group with no treatment.
#
#
# plot Qini curve:
# - `blue line` is a `real Qini curve` based on data.
# - `red line` is an `ideal Qini curve` based on data. Code: `perfect=True`
# - `grey line` is a `random Qini curve` based on data
#
#
# ## π `AUQC` (`area under Qini curve` or `Qini coefficient`)
#
# `Qini coefficient` = `light blue area between the real Qini curve and the random Qini curve normalized on area between the random and the ideal line`
#
#
#
#
# - metric is printed at the title of the Qini curve plot
# - can be called as a separate function
# In[19]:
from sklift.viz import plot_qini_curve
# with ideal Qini curve (red line)
# perfect=True
plot_qini_curve(y_val, uplift_ct, trmnt_val, perfect=True);
# In[20]:
# no ideal Qini curve
# only real Qini curve
# perfect=False
plot_qini_curve(y_val, uplift_ct, trmnt_val, perfect=False);
# In[21]:
from sklift.metrics import qini_auc_score
# AUQC = area under Qini curve = Qini coefficient
auqc = qini_auc_score(y_val, uplift_ct, trmnt_val)
print(f"Qini coefficient on full data: {auqc:.4f}")
# ## π `Uplift curve`
#
# The Uplift curve plots incremental uplift.
#
#
# - `blue line` is a `real Uplift curve` based on data.
# - `red line` is an `ideal Uplift curve` based on data. Code: `perfect=True`
# - `grey line` is a `random Uplift curve` based on data.
#
#
# ## π `AUUQ` (`area under uplift curve`)
#
# - `Area under uplift curve` = blue area between the real Uplift curve and the random Uplift curve
# - appears at the title of the Uplift curve plot
# - can be called as a separate function
#
# In[22]:
from sklift.viz import plot_uplift_curve
# with ideal curve
# perfect=True
plot_uplift_curve(y_val, uplift_ct, trmnt_val, perfect=True);
# In[23]:
# only real
# perfect=False
plot_uplift_curve(y_val, uplift_ct, trmnt_val, perfect=False);
# In[24]:
from sklift.metrics import uplift_auc_score
# AUUQ = area under uplift curve
auuc = uplift_auc_score(y_val, uplift_ct, trmnt_val)
print(f"Uplift auc score on full data: {auuc:.4f}")
# In[ ]: