# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from os import path
import taproc as tp
from taproc.pre_processing import *
from taproc.dataset import *
from taproc.learner import *
from taproc.feature import *
# Folder that holds the Home Credit application CSV files.
file_path = Path('/home_credit_default_risk/data')

# Read both raw application tables.
app_train = pd.read_csv(str(file_path.joinpath('application_train.csv')))
app_test = pd.read_csv(str(file_path.joinpath('application_test.csv')))

# Process the training table first: besides the processed frame it yields the
# target taken from the 'TARGET' column and an `na_dict`, which is handed to
# the test-set call below.
# NOTE(review): presumably `na_dict` carries the missing-value fill values so
# both tables are imputed identically — confirm against taproc.pre_processing.
app_train_proc, y, na_dict = tabular_proc(
    app_train,
    y_fld='TARGET',
    max_n_cat=15,
)
app_test_proc, _, _ = tabular_proc(
    app_test,
    max_n_cat=15,
    na_dict=na_dict,
)

# Restrict the training frame to the test frame's columns, in the same order.
app_train_proc = app_train_proc.loc[:, app_test_proc.columns]
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the target.
# No random_state is passed, so the split changes from run to run.
x_trn, x_val, y_trn, y_val = train_test_split(
    app_train_proc,
    y,
    stratify=y,
    test_size=0.2,
)

# Wrap the four pieces in the dataset object that the learner consumes.
dataset = LGBDataset(x_trn, y_trn, x_val, y_val)
# Hyper-parameters passed to the learner for a binary objective evaluated
# with binary log-loss.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` is non-zero; none is set here — confirm this is intended.
params = dict(
    task='train',
    objective='binary',
    n_estimators=500,
    learning_rate=0.01,
    num_leaves=30,
    colsample_bytree=0.8,
    subsample=0.9,
    max_depth=7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    min_child_weight=2,
    verbose=-1,
    metric='binary_logloss',
)

# Fit on the train/validation dataset built earlier, with an early-stopping
# window of 50 rounds.
learner = LGBLearner(dataset)
learner.fit(params, early_stopping_rounds=50)
# Build a dendrogram object from the training features and draw it.
# NOTE(review): presumably this clusters similar/redundant columns, which is
# what the hand-written column groups further down are based on — confirm
# against taproc.feature.
ddg = dendogram.from_df(x_trn)
ddg.plot()
# Notebook-style display of the first 10 rows of the dendrogram result.
ddg.result.head(10)
# Hand-picked groups of columns that are scored together as one unit when
# feature importance is computed. Each inner list is one group.
# The LANDAREA_*_na and BASEMENTAREA_*_na groups used to be listed twice;
# each group now appears exactly once.
col_group = [
    ['FLAG_OWN_CAR_N', 'OWN_CAR_AGE'],
    ['AMT_GOODS_PRICE', 'AMT_CREDIT', 'AMT_ANNUITY'],
    ['EMERGENCYSTATE_MODE_No', 'HOUSETYPE_MODE_block of flats'],
    ['NONLIVINGAREA_AVG', 'NONLIVINGAREA_MODE', 'NONLIVINGAREA_MEDI'],
    ['NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MEDI'],
    ['LIVINGAREA_AVG', 'LIVINGAREA_MEDI', 'LIVINGAREA_MODE', 'TOTALAREA_MODE'],
    ['APARTMENTS_AVG', 'APARTMENTS_MEDI', 'APARTMENTS_MODE'],
    ['LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MEDI'],
    ['ELEVATORS_AVG', 'ELEVATORS_MEDI', 'ELEVATORS_MODE'],
    ['FLOORSMAX_AVG', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE'],
    ['FLOORSMIN_AVG', 'FLOORSMIN_MEDI', 'FLOORSMIN_MODE'],
    ['COMMONAREA_AVG', 'COMMONAREA_MEDI', 'COMMONAREA_MODE'],
    ['YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_MODE'],
    ['YEARS_BUILD_AVG', 'YEARS_BUILD_MEDI', 'YEARS_BUILD_MODE'],
    ['BASEMENTAREA_AVG', 'BASEMENTAREA_MEDI', 'BASEMENTAREA_MODE'],
    ['ENTRANCES_AVG', 'ENTRANCES_MEDI', 'ENTRANCES_MODE'],
    ['LANDAREA_AVG', 'LANDAREA_MODE', 'LANDAREA_MEDI'],
    ['DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE'],
    ['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE'],
    ['LANDAREA_AVG_na', 'LANDAREA_MODE_na', 'LANDAREA_MEDI_na'],
    ['BASEMENTAREA_AVG_na', 'BASEMENTAREA_MODE_na', 'BASEMENTAREA_MEDI_na'],
    ['AMT_REQ_CREDIT_BUREAU_HOUR_na', 'AMT_REQ_CREDIT_BUREAU_DAY_na',
     'AMT_REQ_CREDIT_BUREAU_WEEK_na', 'AMT_REQ_CREDIT_BUREAU_MON_na',
     'AMT_REQ_CREDIT_BUREAU_QRT_na', 'AMT_REQ_CREDIT_BUREAU_YEAR_na'],
    ['YEARS_BEGINEXPLUATATION_AVG_na', 'YEARS_BEGINEXPLUATATION_MODE_na',
     'YEARS_BEGINEXPLUATATION_MEDI_na', 'TOTALAREA_MODE_na', 'EMERGENCYSTATE_MODE_nan'],
    ['LIVINGAREA_AVG_na', 'LIVINGAREA_MODE_na', 'LIVINGAREA_MEDI_na'],
    ['ENTRANCES_AVG_na', 'ENTRANCES_MODE_na', 'ENTRANCES_MEDI_na',
     'FLOORSMAX_AVG_na', 'FLOORSMAX_MODE_na', 'FLOORSMAX_MEDI_na', 'WALLSMATERIAL_MODE_nan'],
    ['APARTMENTS_AVG_na', 'APARTMENTS_MODE_na', 'APARTMENTS_MEDI_na', 'HOUSETYPE_MODE_nan'],
    ['ELEVATORS_AVG_na', 'ELEVATORS_MODE_na', 'ELEVATORS_MEDI_na'],
    ['NONLIVINGAREA_AVG_na', 'NONLIVINGAREA_MODE_na', 'NONLIVINGAREA_MEDI_na'],
    ['FLOORSMIN_AVG_na', 'FLOORSMIN_MODE_na', 'FLOORSMIN_MEDI_na'],
    ['YEARS_BUILD_AVG_na', 'YEARS_BUILD_MODE_na', 'YEARS_BUILD_MEDI_na'],
    ['NONLIVINGAPARTMENTS_AVG_na', 'NONLIVINGAPARTMENTS_MODE_na', 'NONLIVINGAPARTMENTS_MEDI_na'],
    ['LIVINGAPARTMENTS_AVG_na', 'LIVINGAPARTMENTS_MODE_na', 'LIVINGAPARTMENTS_MEDI_na'],
    ['COMMONAREA_AVG_na', 'COMMONAREA_MODE_na', 'COMMONAREA_MEDI_na'],
    ['REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'],
]

# Every training column that is not in a group is appended as its own entry
# (a bare column name). The grouped names are flattened into a set once so
# the membership test does not re-flatten the groups for every column.
_grouped_cols = set(flat_list(col_group))
col_group = col_group + [i for i in x_trn.columns if i not in _grouped_cols]
# Notebook-style display of the final list.
col_group
[['FLAG_OWN_CAR_N', 'OWN_CAR_AGE'], ['AMT_GOODS_PRICE', 'AMT_CREDIT', 'AMT_ANNUITY'], ['EMERGENCYSTATE_MODE_No', 'HOUSETYPE_MODE_block of flats'], ['NONLIVINGAREA_AVG', 'NONLIVINGAREA_MODE', 'NONLIVINGAREA_MEDI'], ['NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MEDI'], ['LIVINGAREA_AVG', 'LIVINGAREA_MEDI', 'LIVINGAREA_MODE', 'TOTALAREA_MODE'], ['APARTMENTS_AVG', 'APARTMENTS_MEDI', 'APARTMENTS_MODE'], ['LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MEDI'], ['ELEVATORS_AVG', 'ELEVATORS_MEDI', 'ELEVATORS_MODE'], ['FLOORSMAX_AVG', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE'], ['FLOORSMIN_AVG', 'FLOORSMIN_MEDI', 'FLOORSMIN_MODE'], ['COMMONAREA_AVG', 'COMMONAREA_MEDI', 'COMMONAREA_MODE'], ['YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_MODE'], ['YEARS_BUILD_AVG', 'YEARS_BUILD_MEDI', 'YEARS_BUILD_MODE'], ['BASEMENTAREA_AVG', 'BASEMENTAREA_MEDI', 'BASEMENTAREA_MODE'], ['ENTRANCES_AVG', 'ENTRANCES_MEDI', 'ENTRANCES_MODE'], ['LANDAREA_AVG', 'LANDAREA_MODE', 'LANDAREA_MEDI'], ['DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE'], ['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE'], ['LANDAREA_AVG_na', 'LANDAREA_MODE_na', 'LANDAREA_MEDI_na'], ['BASEMENTAREA_AVG_na', 'BASEMENTAREA_MODE_na', 'BASEMENTAREA_MEDI_na'], ['AMT_REQ_CREDIT_BUREAU_HOUR_na', 'AMT_REQ_CREDIT_BUREAU_DAY_na', 'AMT_REQ_CREDIT_BUREAU_WEEK_na', 'AMT_REQ_CREDIT_BUREAU_MON_na', 'AMT_REQ_CREDIT_BUREAU_QRT_na', 'AMT_REQ_CREDIT_BUREAU_YEAR_na'], ['LANDAREA_AVG_na', 'LANDAREA_MODE_na', 'LANDAREA_MEDI_na'], ['BASEMENTAREA_AVG_na', 'BASEMENTAREA_MODE_na', 'BASEMENTAREA_MEDI_na'], ['YEARS_BEGINEXPLUATATION_AVG_na', 'YEARS_BEGINEXPLUATATION_MODE_na', 'YEARS_BEGINEXPLUATATION_MEDI_na', 'TOTALAREA_MODE_na', 'EMERGENCYSTATE_MODE_nan'], ['LIVINGAREA_AVG_na', 'LIVINGAREA_MODE_na', 'LIVINGAREA_MEDI_na'], ['ENTRANCES_AVG_na', 'ENTRANCES_MODE_na', 'ENTRANCES_MEDI_na', 'FLOORSMAX_AVG_na', 'FLOORSMAX_MODE_na', 'FLOORSMAX_MEDI_na', 
'WALLSMATERIAL_MODE_nan'], ['APARTMENTS_AVG_na', 'APARTMENTS_MODE_na', 'APARTMENTS_MEDI_na', 'HOUSETYPE_MODE_nan'], ['ELEVATORS_AVG_na', 'ELEVATORS_MODE_na', 'ELEVATORS_MEDI_na'], ['NONLIVINGAREA_AVG_na', 'NONLIVINGAREA_MODE_na', 'NONLIVINGAREA_MEDI_na'], ['FLOORSMIN_AVG_na', 'FLOORSMIN_MODE_na', 'FLOORSMIN_MEDI_na'], ['YEARS_BUILD_AVG_na', 'YEARS_BUILD_MODE_na', 'YEARS_BUILD_MEDI_na'], ['NONLIVINGAPARTMENTS_AVG_na', 'NONLIVINGAPARTMENTS_MODE_na', 'NONLIVINGAPARTMENTS_MEDI_na'], ['LIVINGAPARTMENTS_AVG_na', 'LIVINGAPARTMENTS_MODE_na', 'LIVINGAPARTMENTS_MEDI_na'], ['COMMONAREA_AVG_na', 'COMMONAREA_MODE_na', 'COMMONAREA_MEDI_na'], ['REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'], 'SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_ANNUITY_na', 'AMT_GOODS_PRICE_na', 'OWN_CAR_AGE_na', 'CNT_FAM_MEMBERS_na', 'EXT_SOURCE_1_na', 'EXT_SOURCE_2_na', 'EXT_SOURCE_3_na', 
'OBS_30_CNT_SOCIAL_CIRCLE_na', 'DEF_30_CNT_SOCIAL_CIRCLE_na', 'OBS_60_CNT_SOCIAL_CIRCLE_na', 'DEF_60_CNT_SOCIAL_CIRCLE_na', 'DAYS_LAST_PHONE_CHANGE_na', 'NAME_CONTRACT_TYPE_Cash loans', 'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_CONTRACT_TYPE_nan', 'CODE_GENDER_F', 'CODE_GENDER_M', 'CODE_GENDER_nan', 'FLAG_OWN_CAR_Y', 'FLAG_OWN_CAR_nan', 'FLAG_OWN_REALTY_N', 'FLAG_OWN_REALTY_Y', 'FLAG_OWN_REALTY_nan', 'NAME_TYPE_SUITE_Children', 'NAME_TYPE_SUITE_Family', 'NAME_TYPE_SUITE_Group of people', 'NAME_TYPE_SUITE_Other_A', 'NAME_TYPE_SUITE_Other_B', 'NAME_TYPE_SUITE_Spouse, partner', 'NAME_TYPE_SUITE_Unaccompanied', 'NAME_TYPE_SUITE_nan', 'NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Commercial associate', 'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Student', 'NAME_INCOME_TYPE_Unemployed', 'NAME_INCOME_TYPE_Working', 'NAME_INCOME_TYPE_nan', 'NAME_EDUCATION_TYPE_Academic degree', 'NAME_EDUCATION_TYPE_Higher education', 'NAME_EDUCATION_TYPE_Incomplete higher', 'NAME_EDUCATION_TYPE_Lower secondary', 'NAME_EDUCATION_TYPE_Secondary / secondary special', 'NAME_EDUCATION_TYPE_nan', 'NAME_FAMILY_STATUS_Civil marriage', 'NAME_FAMILY_STATUS_Married', 'NAME_FAMILY_STATUS_Separated', 'NAME_FAMILY_STATUS_Single / not married', 'NAME_FAMILY_STATUS_Widow', 'NAME_FAMILY_STATUS_nan', 'NAME_HOUSING_TYPE_Co-op apartment', 'NAME_HOUSING_TYPE_House / apartment', 'NAME_HOUSING_TYPE_Municipal apartment', 'NAME_HOUSING_TYPE_Office apartment', 'NAME_HOUSING_TYPE_Rented apartment', 'NAME_HOUSING_TYPE_With parents', 'NAME_HOUSING_TYPE_nan', 'WEEKDAY_APPR_PROCESS_START_FRIDAY', 'WEEKDAY_APPR_PROCESS_START_MONDAY', 'WEEKDAY_APPR_PROCESS_START_SATURDAY', 'WEEKDAY_APPR_PROCESS_START_SUNDAY', 'WEEKDAY_APPR_PROCESS_START_THURSDAY', 'WEEKDAY_APPR_PROCESS_START_TUESDAY', 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY', 'WEEKDAY_APPR_PROCESS_START_nan', 'FONDKAPREMONT_MODE_not specified', 'FONDKAPREMONT_MODE_org spec account', 'FONDKAPREMONT_MODE_reg oper account', 
'FONDKAPREMONT_MODE_reg oper spec account', 'FONDKAPREMONT_MODE_nan', 'HOUSETYPE_MODE_specific housing', 'HOUSETYPE_MODE_terraced house', 'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick', 'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_Yes']
# Compute feature importance from the fitted learner, passing the column
# groups so that each entry of `col_group` is handled as one unit.
# NOTE(review): the importance method itself lives in taproc.feature — confirm
# what it measures before comparing the numbers with other tools.
impt = importance.from_LGBLearner(learner, col_group)
# Notebook-style display of the first five rows of the importance table.
impt.I[:5]
|    | Feature | Importance |
|---|---|---|
| 62 | EXT_SOURCE_3 | 0.072817 |
| 61 | EXT_SOURCE_2 | 0.055410 |
| 60 | EXT_SOURCE_1 | 0.016629 |
| 1 | AMT_GOODS_PRICE & AMT_CREDIT & AMT_ANNUITY | 0.010259 |
| 40 | DAYS_BIRTH | 0.004709 |
# Draw the importance chart.
impt.plot()
# Notebook-style display of the top 27 entries; each entry is a list of
# column names (a whole group, or a single column wrapped in a list).
# NOTE(review): presumably ordered by decreasing importance — confirm.
impt.top(27)
[['EXT_SOURCE_3'], ['EXT_SOURCE_2'], ['EXT_SOURCE_1'], ['AMT_GOODS_PRICE', 'AMT_CREDIT', 'AMT_ANNUITY'], ['DAYS_BIRTH'], ['DAYS_EMPLOYED'], ['CODE_GENDER_M'], ['NAME_EDUCATION_TYPE_Higher education'], ['CODE_GENDER_F'], ['AMT_REQ_CREDIT_BUREAU_HOUR_na', 'AMT_REQ_CREDIT_BUREAU_DAY_na', 'AMT_REQ_CREDIT_BUREAU_WEEK_na', 'AMT_REQ_CREDIT_BUREAU_MON_na', 'AMT_REQ_CREDIT_BUREAU_QRT_na', 'AMT_REQ_CREDIT_BUREAU_YEAR_na'], ['FLAG_OWN_CAR_N', 'OWN_CAR_AGE'], ['FLAG_DOCUMENT_3'], ['DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE'], ['NAME_INCOME_TYPE_Working'], ['DAYS_ID_PUBLISH'], ['NAME_FAMILY_STATUS_Married'], ['DAYS_LAST_PHONE_CHANGE'], ['EXT_SOURCE_1_na'], ['NAME_CONTRACT_TYPE_Cash loans'], ['REG_CITY_NOT_LIVE_CITY'], ['REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'], ['YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_MODE'], ['NAME_EDUCATION_TYPE_Secondary / secondary special'], ['OWN_CAR_AGE_na'], ['LIVINGAREA_AVG', 'LIVINGAREA_MEDI', 'LIVINGAREA_MODE', 'TOTALAREA_MODE'], ['FLOORSMAX_AVG', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE'], ['AMT_REQ_CREDIT_BUREAU_QRT']]
# Flatten the top-27 importance entries into one list of column names and
# restrict both processed frames to those columns.
keep_ft = flat_list(impt.top(27))
app_train_keep = app_train_proc[keep_ft]
x_tst_keep = app_test_proc[keep_ft]

# Split the kept columns into the two lists returned by get_cons_cats.
# NOTE(review): presumably continuous vs. categorical columns — confirm.
cons, cats = get_cons_cats(app_train_keep)

# One '~'-joined string per distinct combination of `cats` values that occurs
# in the test set.
tst_key = {
    '~'.join(map(str, row))
    for row in x_tst_keep[cats].drop_duplicates().values
}

# The same kind of key for every training row, then a flag telling whether
# that row's combination also occurs in the test set.
app_key = app_train_keep[cats].apply(
    lambda row: '~'.join(map(str, row.values)),
    axis=1,
)
mask = app_key.isin(tst_key)

# Rows whose combination never occurs in the test set go straight to training;
# the rest form the pool from which validation rows are drawn.
x_trn, x_val_set = app_train_keep[~mask], app_train_keep[mask]
y_trn, y_val_set = y[~mask], y[mask]
def random_choose(x, pct=2, ratio=0.2, **kargs):
    """Randomly sample rows of ``x``: either all of them or a fixed fraction.

    An integer is drawn uniformly from 0..9 (inclusive). If it is below
    ``pct`` a sample containing every row of ``x`` is returned; otherwise
    ``round(len(x) * (ratio - 0.06))`` rows are sampled.

    Args:
        x: Object exposing ``.shape`` and ``.sample(n=...)``, e.g. a pandas
            DataFrame (this is what ``groupby(...).apply`` passes in).
        pct: Number of the ten possible draws (0..9) that select every row.
        ratio: Base sampling fraction; 0.06 is subtracted from it before it is
            applied to the row count.
        **kargs: Forwarded unchanged to ``x.sample``.

    Returns:
        Whatever ``x.sample`` returns for the chosen number of rows.
    """
    # Local import: the file never imports `random` explicitly, so do not rely
    # on a wildcard import to provide it.
    import random

    n_rows = x.shape[0]
    if random.randint(0, 9) < pct:
        n = n_rows
    else:
        # Clamp at zero so a ratio below 0.06 gives an empty sample instead of
        # a negative size, which `sample` rejects with ValueError.
        n = max(0, int(np.round(n_rows * (ratio - 0.06))))
    return x.sample(n=n, **kargs)
# Draw the validation rows group by group: within each distinct combination of
# the `cats` columns, random_choose keeps either all rows or a fraction.
x_val = x_val_set.groupby(cats).apply(random_choose)

# The grouped result is indexed by (group keys..., original row label); the
# last element of each index tuple is the row label in x_val_set.
val_index = [i[-1] for i in x_val.index.values]
mask = x_val_set.index.isin(val_index)

# Select features and labels with the SAME mask so that row k of x_val matches
# element k of y_val. (Previously x_val kept the group order of the groupby
# result while y_val was in original row order.)
x_val = x_val_set[mask].reset_index(drop=True)
y_val = y_val_set[mask]

# Rows that were not drawn go back into the training set, together with their
# labels, so x_trn and y_trn stay the same length. (`val_set` was an undefined
# name here; `x_val_set` is the frame the mask belongs to.)
x_trn = pd.concat([x_trn, x_val_set[~mask]])
if isinstance(y_trn, pd.Series):
    y_trn = pd.concat([y_trn, y_val_set[~mask]])
else:
    y_trn = np.concatenate([y_trn, y_val_set[~mask]])
# Notebook-style display of the first five names in `cons`.
cons[:5]
['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'AMT_GOODS_PRICE', 'AMT_CREDIT']
# Notebook-style display of the first five names in `cats`.
cats[:5]
['CODE_GENDER_M', 'NAME_EDUCATION_TYPE_Higher education', 'CODE_GENDER_F', 'AMT_REQ_CREDIT_BUREAU_HOUR_na', 'AMT_REQ_CREDIT_BUREAU_DAY_na']
# Notebook-style display of the (rows, columns) size of the validation pool.
x_val_set.shape
(248120, 44)
# Notebook-style display of five test-set keys; `tst_key` is a set, so it is
# turned into a list before slicing and the order is arbitrary.
list(tst_key)[:5]
['1~0~0~True~True~True~True~True~True~0~0~0.0~0.0~0~1~True~0~0~2~2~1~False~0.1667~0.0', '0~0~1~False~False~False~False~False~False~1~1~0.0~0.0~1~0~True~1~0~2~2~0~True~0.1667~0.0', '1~1~0~False~False~False~False~False~False~0~1~0.0~0.0~0~1~False~1~0~2~2~0~False~0.125~0.0', '1~0~0~False~False~False~False~False~False~0~1~1.0~1.0~0~1~False~1~0~1~1~1~False~0.0417~0.0', '1~0~0~True~True~True~True~True~True~1~1~1.0~0.0~1~0~True~1~0~2~2~0~True~0.1667~0.0']
# Notebook-style display of five sampled row labels. `val_index` may be a set,
# which cannot be sliced, so convert it to a list first.
list(val_index)[:5]
[264913, 50074, 209922, 285119, 140840]