#!/usr/bin/env python
# coding: utf-8

# ![KTS logo](https://raw.githubusercontent.com/konodyuk/kts/master/docs/static/banner_alpha.png)
# # Modelling Guide

# In[1]:


import numpy as np
# Seed NumPy's global RNG up front so that anything drawing from it is repeatable between runs.
np.random.seed(0)

# The qualified name is used below for `kts.ls`, `kts.rm` and `kts.load`;
# the star import presumably supplies the unqualified names used later
# (`FeatureSet`, `Validator`, `lb`, `lbs`, `stl`, ...).
import kts
from kts import *


# Feature constructors and helpers defined earlier are automatically loaded:

# In[2]:


# Bare expression: a notebook displays the value of a cell's last expression;
# run as a plain script, this line only checks that the name is defined.
simple_feature


# Use `kts.ls` to list objects saved in your user cache and `kts.rm` to remove them:

# In[3]:


print(kts.ls())      # cache contents before the removal
kts.rm('external')   # remove the cached object named 'external'
print(kts.ls())      # list again to show the effect of the removal


# In[4]:


# Fetch both frames from the kts cache by name, in the same order as before.
train, test = (kts.load(name) for name in ('train', 'test'))


# ## Models
# 
# `kts.models.{binary, multiclass, regression}` contain the most popular models for each task type.  
# In particular, all of the corresponding sklearn models are present, as well as CatBoost, LGBM and XGB if already installed. We'll also add neural nets there soon.

# In[5]:


from kts.models import binary, multiclass, regression


# Init signatures are preserved:

# In[6]:


# CatBoost wrapper from kts; the keyword arguments are passed as to the
# underlying constructor, since init signatures are preserved.
cb = binary.CatBoostClassifier(
    iterations=100,
    rsm=0.15,
    custom_metric='AUC',
)
cb  # displayed when run as a notebook cell


# In[7]:


# Logistic regression wrapper from kts, configured with the usual constructor keywords.
lr = binary.LogisticRegression(
    C=0.5,
    solver='lbfgs',
    max_iter=1000,
)
lr  # displayed when run as a notebook cell


# In[8]:


from category_encoders import TargetEncoder, WOEEncoder

# Two feature lists are passed positionally.
# NOTE(review): presumably the first list is computed once on the whole frame
# and the second (target-based encoders) is fitted per validation fold —
# confirm against the kts documentation.
fs = FeatureSet(
    [
        simple_feature,
        interactions('Pclass', 'Age'),
        num_aggs('Fare'),
        tfidf('Name'),
        stl.one_hot_encode('Embarked'),
    ],
    [
        stl.category_encode(TargetEncoder(), 'Embarked', 'Survived'),
        stl.category_encode(WOEEncoder(), 'Embarked', 'Survived'),
    ],
    train_frame=train,
    targets='Survived',
)


# ## Validation
# 
# To define a validation scheme, you'll use `kts.Validator(splitter, metric)`. Splitter is used to split the training set, and metric is for evaluating trained models.

# In[9]:


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# `shuffle` and `random_state` are passed by keyword: current scikit-learn
# releases declare them keyword-only and reject the positional form
# `StratifiedKFold(5, True, 42)` with a TypeError.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The validator pairs the splitter (how the training set is split) with the
# metric used to evaluate the trained models.
val = Validator(skf, roc_auc_score)


# Running validation is as easy as `val.score(model, feature_set)`:

# In[10]:


# Validate the CatBoost model on the feature set defined above.
val.score(cb, fs)


# In[11]:


# Same validation scheme and feature set, applied to the logistic regression.
val.score(lr, fs)


# ## Leaderboard
# 
# Right after validation your experiments are placed in the leaderboard:

# In[12]:


# The default leaderboard, where experiments scored without a `leaderboard`
# argument are placed; displayed when run as a notebook cell.
lb


# ### Multiple Leaderboards

# You can also keep multiple leaderboards by passing the `leaderboard` parameter to `val.score()`. The default leaderboard is `main`.

# In[13]:


some_model = binary.KNeighborsClassifier()

# Record this run in the leaderboard named 'other' rather than the default one.
val.score(some_model, fs, leaderboard='other')


# Use `kts.leaderboard_list` or `kts.lbs` to access leaderboards other than main:

# In[14]:


# Three identity checks (attribute access, item access, long alias), each
# expected to evaluate to True.
# NOTE(review): a notebook cell displays only its last expression, so the
# first two results are presumably not shown — confirm.
lbs.main is lb
lbs.other is lbs['other']
leaderboard_list is lbs


# Note that the new experiment appeared only in the new leaderboard:

# In[15]:


# NOTE(review): `lb` is not the cell's last expression, so a notebook
# presumably displays only `lbs.other` — confirm both boards are shown as intended.
lb
lbs.other


# ## Experiments
# 
# Experiments are accessible by their identifiers:

# In[16]:


# Item access and attribute access are expected to return the same experiment object.
# NOTE(review): 'KPBVAI' is a hard-coded experiment identifier — presumably taken
# from a leaderboard produced by an earlier run; confirm it matches your own.
lb['KPBVAI'] is lb.KPBVAI


# In[17]:


# Displays the experiment itself when run as a notebook cell.
lb.KPBVAI


# ### Inference
# 
# Inference is as easy as `experiment.predict(frame)`. Features are computed automatically.

# In[18]:


# Predict on `test.head(5)` only; the experiment computes the features from the raw frame itself.
lb.KPBVAI.predict(test.head(5))


# ### Feature Importances

# In[19]:


# Accessed here without being called; the next cell calls it with options.
lb.KPBVAI.feature_importances


# In[20]:


# Presumably sorts features by their mean importance and keeps the 7 best — confirm against kts docs.
lb.KPBVAI.feature_importances(sort_by='mean', n_best=7)


# Use `plot=False` to get feature importances by fold:

# In[21]:


# plot=False yields the feature importances by fold.
lb.KPBVAI.feature_importances(plot=False)


# Specify an importance estimator to compute permutation importance:

# In[22]:


# Permutation importance: the estimator is built from the training frame with n_iters=10.
lb.KPBVAI.feature_importances(sort_by='mean', estimator=Permutation(train, n_iters=10))


# ## Custom Models
# 
# Suppose you want to use some model which is not in `kts.models`, like [Regularized Greedy Forest](https://github.com/RGF-team/rgf/tree/master/python-package).

# In[23]:


# Install rgf_python for the interpreter that is running this file.
# `python -m pip` under sys.executable targets exactly that interpreter, whereas
# a bare `pip3` may belong to a different Python installation. Using subprocess
# also removes the dependency on IPython's get_ipython(), which is undefined
# when this file is executed as a plain script.
import subprocess
import sys

# stdout is discarded as before; stderr stays visible. The install remains
# best-effort: a failure is reported but does not raise, and the import in the
# next cell will fail if the package is really missing.
if subprocess.call(
    [sys.executable, '-m', 'pip', 'install', 'rgf_python'],
    stdout=subprocess.DEVNULL,
) != 0:
    print('pip install rgf_python failed', file=sys.stderr)


# In[24]:


from rgf.sklearn import RGFClassifier


# To use it, you simply need to create a class derived from both your classifier and `kts.CustomModel`. It may optionally define a `preprocess` method or inherit one from a mixin such as `kts.NormalizeFillNAMixin`:
# 
# ```python
# class KTSWrapper(kts.CustomModel, somelib.SomeClassifier):
#     ignored_params = [...]
#     
#     def preprocess(self, X, y=None):
#         if y is None:
#             print('if y is None then .predict is called')
#         else:
#             print('otherwise .fit')
#         return X, y
# ```
# 
# An alternative approach is to use the `kts.custom_model(ModelClass, ignored_params, normalize_fillna=True/False)` function:
# ```python
# RGF = custom_model(RGFClassifier, ignored_params=['memory_policy', 'n_jobs', 'verbose'], normalize_fillna=True)
# ```
# However, subclassing gives more freedom in defining custom preprocessing.
# 
# In this example the classifier can't deal with NaN values, so we use `kts.NormalizeFillNAMixin` to add a preprocessing method:

# In[25]:


class RGF(NormalizeFillNAMixin, CustomModel, RGFClassifier):
    """RGFClassifier made usable as a kts model.

    The mixin is listed first among the bases so that the preprocessing method
    it provides takes precedence in method resolution; it is used here because
    the classifier can't deal with NaN values.
    """
    # NOTE(review): presumably constructor parameters that kts should disregard
    # for this model — confirm the exact meaning against the kts documentation.
    ignored_params = ['memory_policy', 'n_jobs', 'verbose']


# In[26]:


# Instantiate with default parameters; the instance is displayed when run as a notebook cell.
RGF()


# In[27]:


# The custom model is passed to the validator the same way as the built-in ones.
val.score(RGF(), fs)


# ## Custom Validators
# 
# *TODO*

# In[28]: