#!/usr/bin/env python
# coding: utf-8

# # 🎯 Uplift model selection
# 
# [MegaFon Uplift Competition](https://ods.ai/competitions/megafon-df21-comp)

# ## Problem: predict the uplift from a user's feature vector

# In[1]:


get_ipython().system('pip install scikit-uplift lightgbm -U')


# In[2]:


import pandas as pd
from sklearn.model_selection import (
    StratifiedShuffleSplit,
    GridSearchCV,
    train_test_split,
    cross_validate,
    cross_val_score
)
from lightgbm import LGBMClassifier

from sklift.models import SoloModel
from sklift.viz import plot_qini_curve
from sklift.datasets import fetch_megafon
from sklift.metrics import make_uplift_scorer


# In[3]:


dataset = fetch_megafon()
data, treatment, target = dataset.data, dataset.treatment, dataset.target

data = data.set_index('id')
data.head()


# In[4]:


treatment.head()


# In[5]:


target.head()


# ---
# 
# # 📝 Solution

# In[6]:


# make treatment binary
treatment = treatment.map({'control': 0, 'treatment': 1})


# ### 1) Conditional Independence Assumption
# 
# Intuition: check whether the treatment was assigned at random by trying to predict the treatment flag from the user's features.

# In[7]:


f1_micro = cross_val_score(
    estimator=LGBMClassifier(random_state=42, n_jobs=-1),
    X=data,
    y=treatment,
    scoring='f1_micro',
    cv=3
)

print(f'F1 micro {f1_micro.mean():.2f}')


# Based on the users' features, the classifier predicts the treatment flag no better than chance (~50%): it is wrong about as often as it is right.
# This suggests that the communication was assigned at random, which supports the Conditional Independence Assumption.

# ### 2) Fit a single model with the treatment flag as a feature (S-Learner approach) ([link to tutorial](https://habr.com/ru/company/ru_mts/blog/485980/))
# 
# Intuition: a single model is fitted on both groups at once, with the binary treatment flag added as an extra user feature.
# Each object from the test sample is then scored twice: once with the treatment flag set to 1 and once with it set to 0.
# The difference between the two predicted probabilities for each observation is the estimated uplift.

# In[8]:


# setting up data:
# - the test set is used for model evaluation after grid-search cross-validation tuning
# - the cv set is used for GridSearchCV
stratify_cols = pd.concat([treatment, target], axis=1)

X_cv, X_test, y_cv, y_test, trmnt_cv, trmnt_test = train_test_split(
    data,
    target,
    treatment,
    stratify=stratify_cols,
    test_size=0.2,
    random_state=42
)


# In[9]:


# setting up the base model ...
estimator = LGBMClassifier(
    random_state=42,
    n_jobs=-1,
)

# ... the metamodel ...
slearner = SoloModel(estimator=estimator)

# ... and an uplift metric scorer to pass to cross-validation
uplift_scorer = make_uplift_scorer("qini_auc_score", trmnt_cv)


# In[10]:


cv_gen = StratifiedShuffleSplit(
    n_splits=3,
    random_state=42
)

cross_validate(
    slearner,
    X=X_cv,
    y=y_cv,
    scoring=uplift_scorer,
    return_estimator=True,
    cv=cv_gen,
    n_jobs=-1,
    fit_params={'treatment': trmnt_cv}
)


# In[11]:


grid = {
    'estimator__learning_rate': [0.2],
    'estimator__max_depth': [6]
}

grid_search = GridSearchCV(
    slearner,
    param_grid=grid,
    scoring=uplift_scorer,
    n_jobs=-1,
    cv=cv_gen,
    return_train_score=True
)

grid_search = grid_search.fit(
    X=X_cv,
    y=y_cv,
    treatment=trmnt_cv
)


# In[12]:


print(f"best qini score on grid search: {grid_search.best_score_:.4f}")
print(f"best params: {grid_search.best_params_}")


# In[13]:


slearner.set_params(**grid_search.best_params_)

slearner.fit(
    X=X_cv,
    y=y_cv,
    treatment=trmnt_cv,
)

uplift = slearner.predict(X_test)


# In[14]:


best_disp = plot_qini_curve(y_test, uplift, trmnt_test, perfect=True, name='Best model');
best_disp.figure_.suptitle("Qini curve");
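# A minimal "by hand" sketch of the S-Learner intuition described above (an illustration,
# not part of the original pipeline): the treatment flag is appended as an extra feature,
# a fresh LGBMClassifier is fitted, the test set is scored twice (flag forced to 1 and to 0),
# and the difference of the predicted probabilities is the estimated uplift.
# The column name 'treatment_flag' and the variable names below are assumptions made for this sketch.

# In[15]:


# append the treatment flag positionally; .values avoids any index-alignment issues
X_cv_flagged = X_cv.assign(treatment_flag=trmnt_cv.values)

manual_estimator = LGBMClassifier(random_state=42, n_jobs=-1)
manual_estimator.fit(X_cv_flagged, y_cv)

# score each test object with the flag forced to 1 (treated) and to 0 (control)
proba_treat = manual_estimator.predict_proba(X_test.assign(treatment_flag=1))[:, 1]
proba_ctrl = manual_estimator.predict_proba(X_test.assign(treatment_flag=0))[:, 1]

# estimated uplift = P(target=1 | treated) - P(target=1 | control)
manual_uplift = proba_treat - proba_ctrl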
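# As a single-number summary of the hold-out Qini curve above, the area under it can also be
# computed directly (a sketch; qini_auc_score lives in sklift.metrics but is not imported in
# the original notebook).

# In[16]:


from sklift.metrics import qini_auc_score

print(f"Qini AUC on the test set: {qini_auc_score(y_test, uplift, trmnt_test):.4f}")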