import pandas as pd
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from pydrift import DriftChecker
from pydrift.exceptions import ColumnsNotMatchException
from pydrift.constants import PATH_DATA, RANDOM_STATE
# Name of the label column in the Titanic dataset.
TARGET = 'Survived'

# Show scikit-learn estimators as diagrams when they are displayed.
set_config(display='diagram')

# Load the Titanic CSV located under pydrift's PATH_DATA directory.
df_titanic = pd.read_csv(PATH_DATA / 'titanic.csv')

# Number of rows in the loaded dataset.
DATA_LENGTH = len(df_titanic)
Splitting the data into two random 50% samples gives us a problem with no drift.
We drop the Ticket and Cabin features because of their high cardinality; PassengerId and Name are dropped as well.
# Target vector, then the feature matrix without the target and the
# columns that are not used as features.
y = df_titanic[TARGET]
X = df_titanic.drop(
    columns=['Ticket', 'Cabin', 'PassengerId', 'Name', TARGET],
)

# Stratified 50/50 split with a fixed seed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=y,
)
ColumnsNotMatchException
# The left dataframe is missing 'Sex', so building the DriftChecker raises
# ColumnsNotMatchException; we print its message instead of failing.
try:
    DriftChecker(X_train.drop(columns='Sex'), X_test)
except ColumnsNotMatchException as exception:
    print(exception)
Different columns for left and right dataframes Columns in right dataframe but not in left one: Sex Columns in left dataframe but not in right one: None
# The right dataframe is missing 'SibSp', so the column check fails and the
# exception message is printed.
try:
    DriftChecker(
        X_train,
        X_test.drop(columns='SibSp'),
    )
except ColumnsNotMatchException as mismatch_error:
    print(mismatch_error)
Different columns for left and right dataframes Columns in right dataframe but not in left one: None Columns in left dataframe but not in right one: SibSp
# Each dataframe is missing a different column ('Fare' on the left,
# 'Embarked' on the right); the exception message is printed.
try:
    DriftChecker(
        X_train.drop(columns='Fare'),
        X_test.drop(columns='Embarked'),
    )
except ColumnsNotMatchException as mismatch_error:
    print(mismatch_error)
Different columns for left and right dataframes Columns in right dataframe but not in left one: Fare Columns in left dataframe but not in right one: Embarked
ml_model_can_discriminate Feature With Different Model
You can pass any model to act as the discriminative ML model, for example a pipeline with logistic regression.
# Categorical columns: fill gaps with the most frequent value, then encode
# the categories as ordinal integers.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(),
)

# Route columns by dtype: category/object columns go through the categorical
# pipeline, numeric columns are imputed with their median.
column_transformer = make_column_transformer(
    (
        categorical_pipeline,
        X_train.select_dtypes(include=['category', 'object']).columns,
    ),
    (
        SimpleImputer(strategy='median'),
        X_train.select_dtypes(include='number').columns,
    ),
)

# Full model: preprocessing followed by a seeded logistic regression.
pipeline_lr = make_pipeline(
    column_transformer,
    LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
)

# Show the pipeline (`display` is provided by the notebook environment).
display(pipeline_lr)
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('pipeline', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('ordinalencoder', OrdinalEncoder())]), Index(['Sex', 'Embarked'], dtype='object')), ('simpleimputer', SimpleImputer(strategy='median'), Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])), ('logisticregression', LogisticRegression(max_iter=1000, random_state=1994))])
ColumnTransformer(transformers=[('pipeline', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('ordinalencoder', OrdinalEncoder())]), Index(['Sex', 'Embarked'], dtype='object')), ('simpleimputer', SimpleImputer(strategy='median'), Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])
Index(['Sex', 'Embarked'], dtype='object')
SimpleImputer(strategy='most_frequent')
OrdinalEncoder()
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
SimpleImputer(strategy='median')
LogisticRegression(max_iter=1000, random_state=1994)
# Compare the two halves of the train/test split, using the logistic
# regression pipeline as the discriminative model.
drift_checker_ok = DriftChecker(X_train, X_test)

# The trailing semicolon keeps the notebook from echoing the return value.
drift_checker_ok.ml_model_can_discriminate(
    ml_discriminate_model=pipeline_lr,
);
HBox(children=(FloatProgress(value=0.0, max=446.0), HTML(value='')))
No drift found in discriminative model step AUC drift check model: 0.50 AUC threshold: .5 ± 0.10
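pydrift tells you that the problem is in the Sex feature (as is obvious in this example).
NOTE: the split below is actually built from Pclass and Fare, and the feature reported as most discriminative is Embarked, so the mention of Sex looks stale.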
# Build two samples with a deterministic rule instead of a random split:
# rows with Pclass above 1 and Fare above 10 versus all remaining rows.
mask = X['Pclass'].gt(1) & X['Fare'].gt(10)
X_mask, X_unmask = X[mask], X[~mask]

# Check the two rule-based samples against each other with the same
# logistic regression pipeline as the discriminative model.
drift_checker_ko = DriftChecker(X_mask, X_unmask)

# The trailing semicolon keeps the notebook from echoing the return value.
drift_checker_ko.ml_model_can_discriminate(
    ml_discriminate_model=pipeline_lr,
);
HBox(children=(FloatProgress(value=0.0, max=446.0), HTML(value='')))
Drift found in discriminative model step, take a look on the most discriminative features (plots when minimal is set to False) AUC drift check model: 0.84 AUC threshold: .5 ± 0.10
Embarked is the most discriminative feature!
# Plot the 'Embarked' histogram for both samples (going by the method name).
# fillna_value='UNK' is passed for missing values -- presumably the label they
# are shown under; confirm against the pydrift documentation.
drift_checker_ko.interpretable_drift.both_histogram_plot('Embarked', fillna_value='UNK')
Parch
# Plot the 'Parch' histogram for both samples (going by the method name).
drift_checker_ko.interpretable_drift.both_histogram_plot('Parch')
auc_threshold
# Re-run the discriminative check on the rule-based samples, this time with
# an explicit auc_threshold of .4 (the earlier runs report ".5 ± 0.10").
# The trailing semicolon keeps the notebook from echoing the return value.
drift_checker_ko.ml_model_can_discriminate(
    ml_discriminate_model=pipeline_lr,
    auc_threshold=.4,
);
HBox(children=(FloatProgress(value=0.0, max=446.0), HTML(value='')))