#!/usr/bin/env python
# coding: utf-8

# ![KTS logo](https://raw.githubusercontent.com/konodyuk/kts/master/docs/static/banner_alpha.png)
#
# # Modelling Guide

# In[1]:


import numpy as np
np.random.seed(0)  # seed NumPy's global RNG so reruns of the guide are repeatable

import kts
from kts import *


# Feature constructors and helpers defined earlier are automatically loaded:

# In[2]:


simple_feature


# Use `kts.ls` to list objects saved in your user cache and `kts.rm` to remove them:

# In[3]:


# Listing is printed before and after the removal so the effect of `kts.rm` is visible.
print(kts.ls())
kts.rm('external')
print(kts.ls())


# In[4]:


train = kts.load('train')
test = kts.load('test')


# ## Models
#
# `kts.models.{binary, multiclass, regression}` contain the most popular models for each task type.
# In particular, all of the corresponding sklearn models are present, as well as CatBoost, LGBM and XGB if already installed. We'll also add neural nets there soon.

# In[5]:


from kts.models import binary, multiclass, regression


# Init signatures are preserved:

# In[6]:


cb = binary.CatBoostClassifier(iterations=100, rsm=0.15, custom_metric='AUC')
cb


# In[7]:


lr = binary.LogisticRegression(C=0.5, solver='lbfgs', max_iter=1000)
lr


# In[8]:


from category_encoders import TargetEncoder, WOEEncoder

# NOTE(review): the second list holds the target-dependent encoders; presumably
# these are the features fitted after the train/validation split -- confirm
# against the kts `FeatureSet` documentation.
fs = FeatureSet(
    [
        simple_feature,
        interactions('Pclass', 'Age'),
        num_aggs('Fare'),
        tfidf('Name'),
        stl.one_hot_encode('Embarked'),
    ],
    [
        stl.category_encode(TargetEncoder(), 'Embarked', 'Survived'),
        stl.category_encode(WOEEncoder(), 'Embarked', 'Survived'),
    ],
    train_frame=train,
    targets='Survived',
)


# ## Validation
#
# To define a validation scheme, you'll use `kts.Validator(splitter, metric)`. The splitter is used to split the training set, and the metric is for evaluating trained models.
# In[9]:


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# `shuffle` and `random_state` are passed by keyword: scikit-learn deprecated
# positional use of these arguments in 0.23 and rejects it from 1.0 onwards.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val = Validator(skf, roc_auc_score)


# Running validation is as easy as `val.score(model, feature_set)`:

# In[10]:


val.score(cb, fs)


# In[11]:


val.score(lr, fs)


# ## Leaderboard
#
# Right after validation your experiments are placed in the leaderboard:

# In[12]:


lb


# ### Multiple Leaderboards
# You can also keep multiple leaderboards by passing the `leaderboard` parameter to `val.score()`. The default leaderboard is `main`.

# In[13]:


some_model = binary.KNeighborsClassifier()
val.score(some_model, fs, leaderboard='other')


# Use `kts.leaderboard_list` or `kts.lbs` to access leaderboards other than main:

# In[14]:


# Three separate identity checks; in a notebook only the last value is displayed.
lbs.main is lb
lbs.other is lbs['other']
leaderboard_list is lbs


# Note that the new experiment appeared only in the new leaderboard:

# In[15]:


lb
lbs.other


# ## Experiments
#
# Experiments are accessible by their identifiers:

# In[16]:


lb['KPBVAI'] is lb.KPBVAI


# In[17]:


lb.KPBVAI


# ### Inference
#
# Inference is as easy as `experiment.predict(frame)`. Features are computed automatically.

# In[18]:


lb.KPBVAI.predict(test.head(5))


# ### Feature Importances

# In[19]:


# NOTE(review): referenced here without being called, unlike the cells below;
# confirm whether `feature_importances()` was intended.
lb.KPBVAI.feature_importances


# In[20]:


lb.KPBVAI.feature_importances(sort_by='mean', n_best=7)


# Use `plot=False` to get feature importances by fold:

# In[21]:


lb.KPBVAI.feature_importances(plot=False)


# Specify an importance estimator to compute permutation importance:

# In[22]:


lb.KPBVAI.feature_importances(sort_by='mean', estimator=Permutation(train, n_iters=10))


# ## Custom Models
#
# Suppose you want to use some model which is not in `kts.models`, like [Regularized Greedy Forest](https://github.com/RGF-team/rgf/tree/master/python-package).

# In[23]:


# NOTE(review): `get_ipython` is not imported in this file, so this cell
# presumably only works inside an IPython/Jupyter session -- confirm.
get_ipython().system('pip3 install rgf_python > /dev/null')


# In[24]:


from rgf.sklearn import RGFClassifier


# To use it, you simply need to create a class derived from both your classifier and `kts.CustomModel`.
# It may optionally include a `preprocess` method or inherit it from some mixin, like `kts.NormalizeFillNAMixin`:
#
# ```python
# class KTSWrapper(kts.CustomModel, somelib.SomeClassifier):
#     ignored_params = [...]
#
#     def preprocess(self, X, y=None):
#         if y is None:
#             print('if y is None then .predict is called')
#         else:
#             print('otherwise .fit')
#         return X, y
# ```
#
# An alternative approach is using the `kts.custom_model(ModelClass, ignored_params, normalize_fillna=True/False)` function:
# ```python
# RGF = custom_model(RGFClassifier, ignored_params=['memory_policy', 'n_jobs', 'verbose'], normalize_fillna=True)
# ```
# However, subclassing gives more freedom in defining custom preprocessing.
#
# In this example the classifier can't deal with NaN values, and we use `kts.NormalizeFillNAMixin` to add a preprocessing method:

# In[25]:


class RGF(NormalizeFillNAMixin, CustomModel, RGFClassifier):
    # Wraps RGFClassifier as a KTS model; the mixin comes first in the bases
    # so that its preprocessing method takes precedence in the MRO.
    # NOTE(review): exact meaning of `ignored_params` is defined by kts --
    # confirm against the `CustomModel` documentation.
    ignored_params = ['memory_policy', 'n_jobs', 'verbose']


# In[26]:


RGF()


# In[27]:


val.score(RGF(), fs)


# ## Custom Validators
#
# *TODO*

# In[28]: