# Full width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%load_ext autoreload
%autoreload 2
import math
import os
import subprocess
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
#
from lib_modeling import *
from lib_feature_engineering import *
# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)
# load train/test data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)
data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)
# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
.merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
.drop(columns=["tvt_code"]))
pdf_train_filtered.head()
SK_ID_CURR | TARGET | |
---|---|---|
0 | 100002 | 1 |
1 | 100003 | 0 |
2 | 100004 | 0 |
3 | 100006 | 0 |
4 | 100007 | 0 |
# load previous application
data_path = "home-credit-default-risk/POS_CASH_balance.csv"
pdf_data = pd.read_csv(data_path)
print(pdf_data.shape)
pdf_data.head()
(10001358, 8)
SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
---|---|---|---|---|---|---|---|---|
0 | 1803195 | 182943 | -31 | 48.0 | 45.0 | Active | 0 | 0 |
1 | 1715348 | 367990 | -33 | 36.0 | 35.0 | Active | 0 | 0 |
2 | 1784872 | 397406 | -32 | 12.0 | 9.0 | Active | 0 | 0 |
3 | 1903291 | 269225 | -35 | 48.0 | 42.0 | Active | 0 | 0 |
4 | 2341044 | 334279 | -35 | 36.0 | 35.0 | Active | 0 | 0 |
# load meta data
meta_path = "../02_pandas/reports/report_POS_CASH_balance.csv"
pdf_meta = pd.read_csv(meta_path)
# filter here for look up application with years
pdf_data["MONTHS_BALANCE"] = pdf_data["MONTHS_BALANCE"] * -1
# pdf_data = pdf_data[(pdf_data["MONTHS_BALANCE"] >= 12) & (pdf_data["MONTHS_BALANCE"] < 12 * 2)]
# pdf_data = pdf_data[(pdf_data["MONTHS_BALANCE"] >= 12 * 2) & (pdf_data["MONTHS_BALANCE"] < 12 * 3)]
# pdf_data = pdf_data[pdf_data["MONTHS_BALANCE"] >= 12 * 3]
print(pdf_data.shape)
(10001358, 8)
pdf_data["is_DPD"] = (pdf_data["SK_DPD"] > 0).astype(int)
pdf_data["is_DPD_DEF"] = (pdf_data["SK_DPD_DEF"] > 0).astype(int)
# drop columns
pdf_data.drop(columns=["SK_DPD", "SK_DPD_DEF"], inplace=True)
# get list categorical attributes
ls_cate = pdf_meta.query("sub_type == 'object'")["name"].tolist()
ls_cate
['NAME_CONTRACT_STATUS']
# construct categorical mapping
dict_onehot = {}
for cate in ls_cate:
ls_val = pdf_data[cate].value_counts().index.tolist()
dict_onehot[cate] = ls_val
%%time
pdf_onehot = gen_one_hot_feat(pdf_data, dict_onehot, main_key="SK_ID_CURR")
print(pdf_onehot.shape)
(10001358, 10) CPU times: user 50.6 s, sys: 4.19 s, total: 54.8 s Wall time: 38.2 s
%%time
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"], main_key="SK_ID_CURR")
eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)
display(eval_agg01)
{'NAME_CONTRACT_STATUS_Active': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Amortized_debt': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Approved': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Canceled': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Completed': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Demand': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Returned_to_the_store': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_Signed': ['max', 'sum', 'mean'], 'NAME_CONTRACT_STATUS_XNA': ['max', 'sum', 'mean']}
After agg: (337252, 27)
name | auc | corr | coverage | |
---|---|---|---|---|
13 | NAME_CONTRACT_STATUS_Active_sum | 0.5467 | -0.0372 | 1.0 |
10 | NAME_CONTRACT_STATUS_Completed_sum | 0.5404 | -0.0207 | 1.0 |
9 | NAME_CONTRACT_STATUS_Completed_max | 0.5107 | -0.0187 | 1.0 |
14 | NAME_CONTRACT_STATUS_Active_mean | 0.5079 | -0.0077 | 1.0 |
25 | NAME_CONTRACT_STATUS_Signed_sum | 0.5044 | -0.0045 | 1.0 |
24 | NAME_CONTRACT_STATUS_Signed_max | 0.5042 | -0.0058 | 1.0 |
11 | NAME_CONTRACT_STATUS_Completed_mean | 0.5030 | 0.0009 | 1.0 |
26 | NAME_CONTRACT_STATUS_Signed_mean | 0.5023 | 0.0089 | 1.0 |
5 | NAME_CONTRACT_STATUS_Returned_to_the_store_mean | 0.5016 | 0.0121 | 1.0 |
4 | NAME_CONTRACT_STATUS_Returned_to_the_store_sum | 0.5016 | 0.0079 | 1.0 |
3 | NAME_CONTRACT_STATUS_Returned_to_the_store_max | 0.5016 | 0.0072 | 1.0 |
17 | NAME_CONTRACT_STATUS_Demand_mean | 0.5006 | 0.0069 | 1.0 |
15 | NAME_CONTRACT_STATUS_Demand_max | 0.5006 | 0.0100 | 1.0 |
16 | NAME_CONTRACT_STATUS_Demand_sum | 0.5006 | 0.0037 | 1.0 |
21 | NAME_CONTRACT_STATUS_Approved_max | 0.5006 | -0.0028 | 1.0 |
22 | NAME_CONTRACT_STATUS_Approved_sum | 0.5006 | -0.0022 | 1.0 |
23 | NAME_CONTRACT_STATUS_Approved_mean | 0.5006 | 0.0010 | 1.0 |
6 | NAME_CONTRACT_STATUS_Amortized_debt_max | 0.5001 | 0.0088 | 1.0 |
7 | NAME_CONTRACT_STATUS_Amortized_debt_sum | 0.5001 | 0.0049 | 1.0 |
8 | NAME_CONTRACT_STATUS_Amortized_debt_mean | 0.5001 | 0.0063 | 1.0 |
12 | NAME_CONTRACT_STATUS_Active_max | 0.5001 | -0.0019 | 1.0 |
1 | NAME_CONTRACT_STATUS_Canceled_sum | 0.5000 | 0.0007 | 1.0 |
0 | NAME_CONTRACT_STATUS_Canceled_max | 0.5000 | 0.0007 | 1.0 |
2 | NAME_CONTRACT_STATUS_Canceled_mean | 0.5000 | -0.0009 | 1.0 |
20 | NAME_CONTRACT_STATUS_XNA_mean | 0.5000 | -0.0007 | 1.0 |
19 | NAME_CONTRACT_STATUS_XNA_sum | 0.5000 | -0.0007 | 1.0 |
18 | NAME_CONTRACT_STATUS_XNA_max | 0.5000 | -0.0007 | 1.0 |
CPU times: user 22.3 s, sys: 1.04 s, total: 23.3 s Wall time: 7.63 s
eval_agg01.query("auc <= 0.501").shape
(16, 4)
sel_feat = eval_agg01.query("auc > 0.501")["name"].tolist()
pdf_agg01 = pdf_agg01[sel_feat]
print(pdf_agg01.shape)
(337252, 11)
series_type = pdf_data.dtypes
ls_num = series_type[series_type == "int64"]
ls_num = [cname for cname in ls_num.index if cname not in ["SK_ID_PREV", "SK_ID_CURR"]]
ls_num
['MONTHS_BALANCE', 'is_DPD', 'is_DPD_DEF']
%%time
pdf_agg02 = agg_common_data(pdf_data[["SK_ID_CURR"] + ls_num], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)
display(eval_agg02)
{'MONTHS_BALANCE': ['max', 'min', 'sum', 'mean', 'std'], 'is_DPD': ['max', 'min', 'sum', 'mean', 'std'], 'is_DPD_DEF': ['max', 'min', 'sum', 'mean', 'std']}
After agg: (337252, 15)
name | auc | corr | coverage | |
---|---|---|---|---|
0 | MONTHS_BALANCE_max | 0.5594 | -0.0564 | 1.0000 |
4 | MONTHS_BALANCE_std | 0.5590 | -0.0557 | 0.9988 |
2 | MONTHS_BALANCE_sum | 0.5560 | -0.0416 | 1.0000 |
3 | MONTHS_BALANCE_mean | 0.5442 | -0.0353 | 1.0000 |
14 | is_DPD_DEF_std | 0.5246 | 0.0472 | 0.9988 |
13 | is_DPD_DEF_mean | 0.5246 | 0.0458 | 1.0000 |
9 | is_DPD_std | 0.5243 | 0.0382 | 0.9988 |
8 | is_DPD_mean | 0.5243 | 0.0285 | 1.0000 |
12 | is_DPD_DEF_sum | 0.5232 | 0.0243 | 1.0000 |
10 | is_DPD_DEF_max | 0.5228 | 0.0363 | 1.0000 |
5 | is_DPD_max | 0.5222 | 0.0311 | 1.0000 |
7 | is_DPD_sum | 0.5220 | 0.0095 | 1.0000 |
1 | MONTHS_BALANCE_min | 0.5118 | 0.0038 | 1.0000 |
6 | is_DPD_min | 0.5001 | 0.0041 | 1.0000 |
11 | is_DPD_DEF_min | 0.5000 | 0.0048 | 1.0000 |
CPU times: user 11.2 s, sys: 568 ms, total: 11.8 s Wall time: 4.94 s
# get list continuous attributes
ls_con = pdf_meta.query("sub_type == 'float64'")["name"].tolist()
ls_con
['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']
pdf_con = pdf_data[["SK_ID_PREV", "SK_ID_CURR"] + ls_con].copy()
pdf_con.head()
SK_ID_PREV | SK_ID_CURR | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | |
---|---|---|---|---|
0 | 1803195 | 182943 | 48.0 | 45.0 |
1 | 1715348 | 367990 | 36.0 | 35.0 |
2 | 1784872 | 397406 | 12.0 | 9.0 |
3 | 1903291 | 269225 | 48.0 | 42.0 |
4 | 2341044 | 334279 | 36.0 | 35.0 |
%%time
pdf_agg03 = agg_common_data(pdf_con[["SK_ID_CURR"] + ls_con], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)
display(eval_agg03)
{'CNT_INSTALMENT': ['max', 'min', 'sum', 'mean', 'std'], 'CNT_INSTALMENT_FUTURE': ['max', 'min', 'sum', 'mean', 'std']}
After agg: (337252, 10)
name | auc | corr | coverage | |
---|---|---|---|---|
2 | CNT_INSTALMENT_sum | 0.5264 | -0.0160 | 1.0000 |
1 | CNT_INSTALMENT_min | 0.5228 | 0.0189 | 0.9999 |
8 | CNT_INSTALMENT_FUTURE_mean | 0.5208 | 0.0279 | 0.9999 |
7 | CNT_INSTALMENT_FUTURE_sum | 0.5200 | -0.0069 | 1.0000 |
3 | CNT_INSTALMENT_mean | 0.5158 | 0.0174 | 0.9999 |
4 | CNT_INSTALMENT_std | 0.5134 | 0.0045 | 0.9987 |
6 | CNT_INSTALMENT_FUTURE_min | 0.5098 | 0.0186 | 0.9999 |
9 | CNT_INSTALMENT_FUTURE_std | 0.5065 | 0.0158 | 0.9987 |
5 | CNT_INSTALMENT_FUTURE_max | 0.5064 | 0.0136 | 0.9999 |
0 | CNT_INSTALMENT_max | 0.5063 | 0.0135 | 0.9999 |
CPU times: user 9.32 s, sys: 358 ms, total: 9.68 s Wall time: 3.36 s
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)
print(pdf_feat.shape)
(337252, 36)
%%time
fname = "pos_cash"
# fname = "pos_cash_in1year"
# fname = "pos_cash_in2year"
# fname = "pos_cash_gt3year"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")
Store features completed! CPU times: user 5.28 s, sys: 92.4 ms, total: 5.37 s Wall time: 4.49 s