import numpy as np
np.random.seed(0)
import kts
from kts import *
Feature constructors and helpers defined earlier are automatically loaded:
simple_feature
@feature
def simple_feature(df):
    res = stl.empty_like(df)
    res['is_male'] = (df.Sex == 'male') + 0
    return res
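For reference, the + 0 in the feature body just casts the boolean mask to integers. A minimal plain-pandas sketch of the same computation (no kts machinery involved; demo is a made-up frame):

import pandas as pd
demo = pd.DataFrame({'Sex': ['male', 'female', 'male']})
print((demo.Sex == 'male') + 0)  # -> 1, 0, 1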
Use kts.ls to list objects saved in your user cache and kts.rm to remove them:
print(kts.ls())
kts.rm('external')
print(kts.ls())
['train', 'test', 'external']
['train', 'test']
train = kts.load('train')
test = kts.load('test')
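These cached entries would have been created earlier with the saving counterpart of kts.load; a sketch, assuming kts.save(obj, name) is that counterpart:

kts.save(train, 'train')  # assumption: this is how 'train' was cached in the first place
kts.save(test, 'test')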
kts.models.{binary, multiclass, regression} contain the most popular models for each task type. In particular, all of the corresponding sklearn models are present, as well as CatBoost, LGBM and XGB if they are already installed. We'll also add neural nets there soon.
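Since these are ordinary Python modules, standard introspection shows what each one exports (plain Python, nothing kts-specific):

from kts.models import binary
print([name for name in dir(binary) if not name.startswith('_')])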
from kts.models import binary, multiclass, regression
Init signatures are preserved:
cb = binary.CatBoostClassifier(iterations=100, rsm=.15, custom_metric='AUC')
cb
custom_metric = 'AUC'
loss_function = 'Logloss'
rsm = 0.15
iterations = 100
CatBoostClassifier(custom_metric='AUC', loss_function='Logloss', rsm=0.15, iterations=100)
lr = binary.LogisticRegression(C=.5, solver='lbfgs', max_iter=1000)
lr
C = 0.5
class_weight = None
dual = False
fit_intercept = True
intercept_scaling = 1
max_iter = 1000
multi_class = 'warn'
penalty = 'l2'
random_state = None
solver = 'lbfgs'
tol = 0.0001
warm_start = False
LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=1000, multi_class='warn', penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, warm_start=False)
from category_encoders import TargetEncoder, WOEEncoder
fs = FeatureSet(
    # Plain features, computed from the frame alone
    [simple_feature,
     interactions('Pclass', 'Age'),
     num_aggs('Fare'),
     tfidf('Name'),
     stl.one_hot_encode('Embarked')],
    # Target-dependent encoders: these use the 'Survived' target
    [stl.category_encode(TargetEncoder(), 'Embarked', 'Survived'),
     stl.category_encode(WOEEncoder(), 'Embarked', 'Survived')],
    train_frame=train,
    targets='Survived')
To define a validation scheme, use kts.Validator(splitter, metric). The splitter is used to split the training set, and the metric is used to evaluate trained models.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val = Validator(skf, roc_auc_score)
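Any sklearn-compatible splitter/metric pair plugs in the same way. For instance, a non-stratified variant (an illustrative alternative, not used below):

from sklearn.model_selection import KFold
val_kf = Validator(KFold(n_splits=5, shuffle=True, random_state=42), roc_auc_score)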
Running validation is as easy as val.score(model, feature_set):
val.score(cb, fs)
{'score': 0.8450484134619144, 'id': 'KPBVAI'}
val.score(lr, fs)
{'score': 0.8145216602070352, 'id': 'FYCMDA'}
Right after validation, your experiments are placed in the leaderboard:
lb
You can also keep multiple leaderboards by passing the leaderboard parameter to val.score(). The default leaderboard is main.
some_model = binary.KNeighborsClassifier()
val.score(some_model, fs, leaderboard='other')
{'score': 0.7948359406496035, 'id': 'GWZCPQ'}
Use kts.leaderboard_list or kts.lbs to access leaderboards other than main:
lbs.main is lb
lbs.other is lbs['other']
leaderboard_list is lbs
True
True
True
Note that the new experiment appeared only in the new leaderboard:
lb
lbs.other
Experiments are accessible by their identifiers:
lb['KPBVAI'] is lb.KPBVAI
True
lb.KPBVAI
loss_function = 'Logloss'
custom_metric = 'AUC'
rsm = 0.15
iterations = 100
Inference is as easy as experiment.predict(frame). Features are computed automatically.
lb.KPBVAI.predict(test.head(5))
array([0.17121523, 0.57615125, 0.0999157 , 0.23952768, 0.78401192])
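For a Kaggle-style submission you could binarize these probabilities yourself; a sketch, assuming the test frame carries the Titanic PassengerId column:

import pandas as pd
preds = lb.KPBVAI.predict(test)
submission = pd.DataFrame({
    'PassengerId': test.PassengerId,        # assumption: present in this frame
    'Survived': (preds > 0.5).astype(int),  # threshold predicted probabilities at 0.5
})
submission.to_csv('submission.csv', index=False)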
lb.KPBVAI.feature_importances
Experiment.feature_importances(self, plot, estimator, sort_by, n_best, verbose)
>>> from kts.feature_selection import Permutation
>>> lb.ABCDEF.feature_importances(plot=False)  # -> pd.DataFrame
>>> lb.ABCDEF.feature_importances(estimator=Permutation(train_frame, n_iters=3), sort_by='max')
lb.KPBVAI.feature_importances(sort_by='mean', n_best=7)
Use plot=False to get feature importances by fold:
lb.KPBVAI.feature_importances(plot=False)
fold | tfidf__Name_0 | Embarked_ce_OneHotEncoder_2 | Pclass_add_Age | Fare_sub_div_mean | Embarked_ce_Survived_WOEEncoder | tfidf__Name_3 | Embarked_ce_OneHotEncoder_1 | Embarked_ce_OneHotEncoder_3 | Embarked_ce_Survived_TargetEncoder | Embarked_ce_OneHotEncoder_0 | tfidf__Name_1 | tfidf__Name_2 | tfidf__Name_4 | Fare_div_std | Fare_div_mean | Pclass_sub_Age | Pclass_mul_Age | is_male
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2.94677 | 1.83414 | 1.96158 | 7.29455 | 1.27512 | 5.17552 | 2.0481 | 0 | 1.1201 | 2.87565 | 3.16809 | 5.48229 | 5.95125 | 3.48855 | 4.00294 | 6.07287 | 5.79585 | 39.5066
1 | 5.99983 | 1.11964 | 2.7143 | 12.0095 | 1.27202 | 3.33875 | 2.55421 | 0 | 1.37644 | 3.57352 | 3.79159 | 3.8706 | 4.67407 | 2.16574 | 1.38701 | 5.58991 | 10.2945 | 34.2684
2 | 2.48002 | 0.846825 | 5.26318 | 5.46335 | 2.98447 | 3.8642 | 1.56882 | 0 | 1.76954 | 2.11486 | 4.45559 | 5.13815 | 4.27618 | 5.2687 | 6.55907 | 6.86313 | 8.55423 | 32.5297
3 | 4.31537 | 1.30387 | 2.74786 | 3.78222 | 1.87028 | 4.34941 | 1.48347 | 0 | 1.03283 | 0.910795 | 6.2502 | 5.76845 | 5.14516 | 6.06301 | 6.2413 | 11.0083 | 7.67064 | 30.0568
4 | 1.74347 | 0.30458 | 0.848658 | 0.741891 | 0.262854 | 0.492911 | 1.88876 | 0 | 2.91524 | 0 | 4.45662 | 2.74165 | 4.42125 | 0.428414 | 13.1209 | 2.5172 | 11.8818 | 51.2338
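Since plot=False returns a plain pd.DataFrame with one row per fold, ordinary pandas aggregations apply; for example, a sketch of ranking features by mean importance:

imp = lb.KPBVAI.feature_importances(plot=False)
print(imp.mean().sort_values(ascending=False).head())  # top features averaged over folds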
Specify an importance estimator to compute permutation importance:
from kts.feature_selection import Permutation
lb.KPBVAI.feature_importances(sort_by='mean', estimator=Permutation(train, n_iters=10))
Suppose you want to use a model that is not in kts.models, like Regularized Greedy Forest.
!pip3 install rgf_python > /dev/null
from rgf.sklearn import RGFClassifier
To use it, simply create a class derived from both your classifier and kts.CustomModel. It may optionally define a preprocess method or inherit one from a mixin, such as kts.NormalizeFillNAMixin:
class KTSWrapper(kts.CustomModel, somelib.SomeClassifier):
    ignored_params = [...]
    def preprocess(self, X, y=None):
        if y is None:
            print('if y is None then .predict is called')
        else:
            print('otherwise .fit')
        return X, y
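As a more concrete sketch of the same idea, here is a hypothetical wrapper whose preprocess zero-fills NaNs before the underlying estimator sees the data (FillNAWrapper and the behavior ascribed to preprocess are illustrative assumptions, not part of kts):

import numpy as np

class FillNAWrapper(kts.CustomModel, RGFClassifier):  # hypothetical name
    ignored_params = ['memory_policy', 'n_jobs', 'verbose']
    def preprocess(self, X, y=None):
        # Replace NaNs with zeros; the target passes through unchanged
        return np.nan_to_num(X), y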
An alternative approach is to use the kts.custom_model(ModelClass, ignored_params, normalize_fillna=True/False) function:
RGF = custom_model(RGFClassifier, ignored_params=['memory_policy', 'n_jobs', 'verbose'], normalize_fillna=True)
However, subclassing gives more freedom in defining custom preprocessing. In this example the classifier can't deal with NaN values, so we use kts.NormalizeFillNAMixin to add a preprocessing method:
class RGF(NormalizeFillNAMixin, CustomModel, RGFClassifier):
ignored_params = ['memory_policy', 'n_jobs', 'verbose']
RGF()
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/rgf/utils.py:225: UserWarning: Cannot find FastRGF executable files. FastRGF estimators will be unavailable for usage.
algorithm = 'RGF'
calc_prob = 'sigmoid'
init_model = None
l2 = 0.1
learning_rate = 0.5
loss = 'Log'
max_leaf = 1000
min_samples_leaf = 10
n_iter = None
n_tree_search = 1
normalize = False
opt_interval = 100
reg_depth = 1.0
sl2 = None
test_interval = 100
RGF(algorithm='RGF', calc_prob='sigmoid', init_model=None, l2=0.1, learning_rate=0.5, loss='Log', max_leaf=1000, min_samples_leaf=10, n_iter=None, n_tree_search=1, normalize=False, opt_interval=100, reg_depth=1.0, sl2=None, test_interval=100)
val.score(RGF(), fs)
{'score': 0.7704334423329472, 'id': 'BFELLE'}