import sys
# Install scikit-uplift together with dill and catboost, which later cells import.
# The leading `!` is IPython/Jupyter shell syntax, so this line only works inside
# a notebook. Invoking pip through `sys.executable` installs the packages for the
# same interpreter that is running this notebook.
!{sys.executable} -m pip install scikit-uplift dill catboost
We are going to use the Lenta dataset from the BigTarget Hackathon, hosted in summer 2020 by Lenta and Microsoft.
Lenta is a Russian food retailer.
✏️ The dataset can be loaded from the sklift.datasets module using the fetch_lenta function.
Read more about the dataset in the API docs.
This is an uplift modeling dataset containing data about Lenta customers' grocery shopping, with marketing campaign communications as the treatment and store visits as the target.
- group - treatment / control flag
- response_att - binary target
- CardHolder - customer id
- gender - customer gender
- age - customer age

from sklift.datasets import fetch_lenta
# pandas is imported here because the preview below needs pd.concat and the
# notebook's own pandas import only appears in a later cell.
import pandas as pd

# fetch_lenta returns a sklearn Bunch object with `data`, `target` and
# `treatment` keys: features (pd.DataFrame), target (pd.Series) and
# treatment (pd.Series) values.
dataset = fetch_lenta()

print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")

# Preview the first and the last five rows of the feature frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the two slices are stacked with pd.concat instead.
pd.concat([dataset.data.head(), dataset.data.tail()])
treatment / control

import pandas as pd
# Row-normalized contingency table of treatment group vs. target: each row
# shows how the target values are distributed inside one group. It is built
# before the recoding below, so the rows still carry the original labels.
pd.crosstab(index=dataset.treatment, columns=dataset.target, normalize='index')

# Recode the treatment labels as a binary flag: 'test' -> 1, 'control' -> 0.
treat_dict = dict(test=1, control=0)
dataset.treatment = dataset.treatment.map(treat_dict)
# Replace missing values of the categorical feature `gender` with an explicit
# placeholder category so the column can be passed to CatBoostClassifier.
# (The Russian placeholder below reads "Not determined".)
gender_filled = dataset.data['gender'].fillna('Не определен')
dataset.data['gender'] = gender_filled

# Print the category frequencies; dropna=False would expose any leftover NaNs.
gender_counts = dataset.data['gender'].value_counts(dropna=False)
print(gender_counts)
Intuition:
In a binary classification problem we stratify the train/validation split by the single 0/1 target column.
In uplift modeling we have two columns instead of one (treatment and target), so we stratify by both of them together.
from sklearn.model_selection import train_test_split

# Stratify on the (treatment, target) pair rather than on the target alone,
# so both splits keep the same mix of treatment groups and outcomes.
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis='columns')

# 70% train / 30% validation, with a fixed seed for reproducibility.
# Features, treatment flags and targets are split in lockstep.
(
    X_train, X_val,
    trmnt_train, trmnt_val,
    y_train, y_val,
) = train_test_split(
    dataset.data, dataset.treatment, dataset.target,
    test_size=0.3,
    random_state=42,
    stratify=stratify_cols,
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
from catboost import CatBoostClassifier
from sklift.models import ClassTransformation

# Settings for the base classifier: `gender` is declared as a categorical
# feature, the seed is fixed, and training is limited to a single thread.
catboost_params = {
    'verbose': 100,
    'cat_features': ['gender'],
    'random_state': 42,
    'thread_count': 1,
}
estimator = CatBoostClassifier(**catboost_params)

# Wrap the classifier in the class-transformation uplift approach and fit it
# on the training features, targets and treatment flags.
ct_model = ClassTransformation(estimator=estimator)
ct_model.fit(X_train, y_train, trmnt_train)
import dill

# Persist the fitted uplift model to "model.dill" in the working directory.
with open("model.dill", mode='wb') as model_file:
    dill.dump(ct_model, model_file)

# Predicted uplift for every validation row; all metrics below rank by it.
uplift_ct = ct_model.predict(X_val)
uplift@k

uplift@k = target mean at k% in the treatment group - target mean at k% in the control group

How to count uplift@k: sort the observations by predicted uplift in descending order, keep the top k%, then subtract the target mean of the control group from the target mean of the treatment group.

Code parameter options:
- strategy='overall' - sort by uplift treatment and control together
- strategy='by_group' - sort by uplift treatment and control separately

from sklift.metrics import uplift_at_k
# Share of the top-ranked observations to evaluate: k = 10%.
k = 0.1

# strategy='overall': treatment and control are ranked by uplift together.
uplift_overall = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='overall', k=k)
# strategy='by_group': treatment and control are ranked by uplift separately.
uplift_bygroup = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='by_group', k=k)

# The label is the metric name followed by k as a percentage, e.g. "uplift@10%".
# (The "@" part of the label had been mangled by e-mail obfuscation.)
print(f"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)")
print(f"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)")
uplift_by_percentile table

Count metrics for each percentile of the data, in descending order by uplift prediction (by rows):
- n_treatment - treatment group size in the percentile
- n_control - control group size in the percentile
- response_rate_treatment - target mean in the treatment group in the percentile
- response_rate_control - target mean in the control group in the percentile
- uplift = response_rate_treatment - response_rate_control in the percentile

Code parameter options are:
- strategy='overall' - sort by uplift treatment and control groups together
- strategy='by_group' - sort by uplift treatment and control groups separately
- total=True - show total metric on full data
- std=True - show metrics std by row

from sklift.metrics import uplift_by_percentile
from sklift.metrics import weighted_average_uplift

# Per-percentile table over 10 bins, ranking treatment and control together;
# total=True adds the full-data row and std=True adds the std columns.
uplift_by_percentile(
    y_val, uplift_ct, trmnt_val,
    strategy='overall', total=True, std=True, bins=10,
)

# Weighted average uplift over the same 10 bins.
uplift_full_data = weighted_average_uplift(
    y_val, uplift_ct, trmnt_val,
    bins=10,
)
print(f"average uplift on full data: {uplift_full_data:.4f}")
uplift_by_percentile plot

Plot of the uplift_by_percentile table.
Two ways to plot:
- kind='line'
- kind='bar'
from sklift.viz import plot_uplift_by_percentile

# Draw the per-percentile uplift twice with the same data and strategy:
# first as a line plot, then as a bar plot.
for plot_kind in ('line', 'bar'):
    plot_uplift_by_percentile(
        y_val, uplift_ct, trmnt_val,
        strategy='overall', kind=plot_kind,
    )
Qini curve

The curve plots the absolute incremental outcome of the treated group compared to the group with no treatment.

How to read the Qini curve plot:
- the blue line is the real Qini curve based on data.
- the red line is the ideal Qini curve based on data. Code: perfect=True
- the grey line is the random Qini curve based on data.

AUQC (area under Qini curve, or Qini coefficient)

Qini coefficient = the light blue area between the real Qini curve and the random Qini curve, normalized by the area between the random line and the ideal line.
from sklift.metrics import qini_auc_score
from sklift.viz import plot_qini_curve

# Two plots of the Qini curve: the first (perfect=True) also draws the ideal
# Qini curve (red line), the second (perfect=False) shows the real curve
# without the ideal one.
for show_perfect in (True, False):
    plot_qini_curve(y_val, uplift_ct, trmnt_val, perfect=show_perfect)

# AUQC = area under the Qini curve = Qini coefficient.
auqc = qini_auc_score(
    y_val,
    uplift_ct,
    trmnt_val,
)
print(f"Qini coefficient on full data: {auqc:.4f}")
Uplift curve

The Uplift curve plots incremental uplift.
- the blue line is the real Uplift curve based on data.
- the red line is the ideal Uplift curve based on data. Code: perfect=True
- the grey line is the random Uplift curve based on data.

AUUC (area under uplift curve)

Area under uplift curve = the blue area between the real Uplift curve and the random Uplift curve.

from sklift.viz import plot_uplift_curve
from sklift.metrics import uplift_auc_score

# Two plots of the Uplift curve: the first (perfect=True) also draws the ideal
# curve, the second (perfect=False) shows only the real curve.
for show_perfect in (True, False):
    plot_uplift_curve(y_val, uplift_ct, trmnt_val, perfect=show_perfect)

# AUUC = area under the uplift curve.
auuc = uplift_auc_score(
    y_val,
    uplift_ct,
    trmnt_val,
)
print(f"Uplift auc score on full data: {auuc:.4f}")