#!/usr/bin/env python
# coding: utf-8

# # A New Tree Booster: PART
#
# __12 Feb 2018, marugari__
#
# PART (Peeking Additive Regression Trees) aims to
# * optimize non-differentiable metrics
# * avoid over-fitting
#
# To train a PART booster, we need to split the training data into 3 parts
# (a plain-Python sketch of the peeking rule appears at the end of this notebook):
# 1. training set: to search for optimal splits
# 2. peeking set: to determine whether a new tree is committed
# 3. validation set: to get the validation score
#
# [Repository](https://github.com/marugari/LightGBM/tree/part)
#
# [Main contribution (part.hpp)](https://github.com/marugari/LightGBM/blob/part/src/boosting/part.hpp)
#
# PART is implemented as a LightGBM custom booster.
# The following is a fork of [the Kaggle Zillow Prize kernel](https://www.kaggle.com/guolinke/simple-lightgbm-starter-lb-0-06487/code).

# In[1]:

import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

# In[2]:

train = pd.read_csv('input/zillow/train_2016_v2.csv', engine='python')
prop = pd.read_csv('input/zillow/properties_2016.csv', engine='python')

# In[3]:

# Downcast float columns to save memory.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, how='left', on='parcelid')
col = [
    'parcelid', 'logerror', 'transactiondate',
    'propertyzoningdesc', 'propertycountylandusecode'
]
x_train = df_train.drop(col, axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns

# Convert the remaining object columns to boolean flags.
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train

# In[4]:

# Hold out the last rows as the validation set.
split = 80000
xt, xv = x_train[:split], x_train[split:]
xt = xt.values.astype(np.float32, copy=False)
xv = xv.values.astype(np.float32, copy=False)
yt, yv = y_train[:split], y_train[split:]
ds_train = lgb.Dataset(xt, label=yt, free_raw_data=False)
ds_valid = lgb.Dataset(xv, label=yv, free_raw_data=False)

# In[5]:

prm = {
    'learning_rate': 0.002,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'sub_feature': 0.5,
    'num_leaves': 60,
    'min_data': 500,
    'min_hessian': 1,
}
num_round = 500

# In[6]:

clf_gbdt = lgb.train(prm, ds_train, num_round)

# In[7]:

# Copy the parameters so the GBDT settings above are left untouched.
prm_part = dict(prm)
prm_part['boosting_type'] = 'part'
prm_part['learning_rate'] = 0.002
prm_part['drop_rate'] = 0.0
prm_part['skip_drop'] = 0.0

# Split the training rows 70/30 into the training part and the peeking part.
np.random.seed(20180212)
flg_part = np.random.choice([True, False], len(yt), replace=True, p=[0.7, 0.3])
flg_peek = np.logical_not(flg_part)
ds_part = lgb.Dataset(xt[flg_part], label=yt[flg_part], free_raw_data=False)
ds_peek = lgb.Dataset(xt[flg_peek], label=yt[flg_peek], free_raw_data=False)

# In[8]:

clf_part = lgb.train(prm_part, ds_part, num_round, valid_sets=ds_peek)

# In[9]:

from sklearn.metrics import mean_absolute_error

def get_score(x, y, clf, ii):
    return mean_absolute_error(y, clf.predict(x, num_iteration=ii))

# Scan the last 30% of the boosting rounds and record the validation MAE
# of both models every 5 iterations.
lab = []
val_gbdt = []
val_part = []
ii = int(0.7 * num_round)
while ii <= num_round:
    lab.append(ii)
    val_gbdt.append(get_score(xv, yv, clf_gbdt, ii))
    val_part.append(get_score(xv, yv, clf_part, ii))
    ii += 5

# In[10]:

print(f'GBDT: {np.array(val_gbdt).min()}')
print(f'PART: {np.array(val_part).min()}')
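# The next cell only plots the `val_gbdt` / `val_part` arrays computed above, so the
# GBDT and PART validation-MAE curves can be compared across iterations.

# In[11]:

import matplotlib.pyplot as plt

plt.plot(lab, val_gbdt, label='GBDT')
plt.plot(lab, val_part, label='PART')
plt.xlabel('iteration')
plt.ylabel('validation MAE')
plt.legend()
plt.show()

# Finally, a minimal sketch of the peeking rule referenced at the top of this notebook.
# This is *not* the actual booster: PART itself is implemented in C++ in part.hpp.
# The illustration below grows small sklearn decision trees on squared-error residuals
# of the training part, and commits a tree only if it improves MAE (the
# non-differentiable target metric) on the peeking part. `peeking_boost` is a
# hypothetical helper written only for this notebook.

# In[12]:

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def peeking_boost(x_fit, y_fit, x_peek, y_peek, num_round=100, lr=0.1):
    pred_fit = np.zeros(len(y_fit))
    pred_peek = np.zeros(len(y_peek))
    trees = []
    best_peek = mean_absolute_error(y_peek, pred_peek)
    for _ in range(num_round):
        # Fit a small tree to the current residuals on the training part.
        tree = DecisionTreeRegressor(max_depth=3, min_samples_leaf=50)
        tree.fit(x_fit, y_fit - pred_fit)
        # Peek: commit the tree only if it improves MAE on the peeking part.
        cand_peek = pred_peek + lr * tree.predict(x_peek)
        score = mean_absolute_error(y_peek, cand_peek)
        if score < best_peek:
            best_peek = score
            pred_peek = cand_peek
            pred_fit += lr * tree.predict(x_fit)
            trees.append(tree)
    return trees, best_peek

# sklearn trees do not accept NaN, so missing values are filled with 0 here.
x_fit = np.nan_to_num(xt[flg_part])
x_peek = np.nan_to_num(xt[flg_peek])
trees, peek_mae = peeking_boost(x_fit, yt[flg_part], x_peek, yt[flg_peek])
print(len(trees), peek_mae)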