import os, subprocess, pickle
import pandas as pd
import numpy as np
from IPython.display import display
# from lib_feature_engineering import *
# check features folders
subprocess.check_output(["ls", "features"]).splitlines()
['baseline_extend.pkl.bz2', 'baseline.pkl.bz2', 'bureau_balance_1year.pkl.bz2', 'bureau_balance_2year.pkl.bz2', 'bureau_balance_gt3year.pkl.bz2', 'bureau_balance_lt1year.pkl.bz2', 'bureau_balance.pkl.bz2', 'bureau.pkl.bz2', 'credit_card_balance.pkl.bz2', 'installments_payments_gt3year.pkl.bz2', 'installments_payments_in1year.pkl.bz2', 'installments_payments_in2year.pkl.bz2', 'installments_payments.pkl.bz2', 'mean_encoding_feat_cat.pkl.bz2', 'pdf_features_label.csv.bz2', 'pos_cash_gt3year.pkl.bz2', 'pos_cash_in1year.pkl.bz2', 'pos_cash_in2year.pkl.bz2', 'pos_cash.pkl.bz2', 'prev_app.pkl.bz2']
# specified features set for joining
ls_feat_file = [
'baseline.pkl.bz2',
'baseline_extend.pkl.bz2',
'bureau_balance_1year.pkl.bz2',
'bureau_balance_2year.pkl.bz2',
'bureau_balance_gt3year.pkl.bz2',
'bureau_balance_lt1year.pkl.bz2',
'bureau_balance.pkl.bz2',
'bureau.pkl.bz2',
'credit_card_balance.pkl.bz2',
'installments_payments_gt3year.pkl.bz2',
'installments_payments_in1year.pkl.bz2',
'installments_payments_in2year.pkl.bz2',
'installments_payments.pkl.bz2',
'pos_cash_gt3year.pkl.bz2',
'pos_cash_in1year.pkl.bz2',
'pos_cash_in2year.pkl.bz2',
'pos_cash.pkl.bz2',
'prev_app.pkl.bz2'
]
%%time
# use first features for base joined
feat_path = os.path.join("features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")
# join next features set
for fname in ls_feat_file[1:]:
feat_path = os.path.join("features", fname)
pdf_feat = pd.read_pickle(feat_path, compression="bz2")
print(fname, pdf_feat.shape)
# add table prefix
tbl_prefix = fname.split(".")[0]
rename_col = {cname: "{}_{}".format(tbl_prefix, cname) for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
pdf_feat.rename(columns=rename_col, inplace=True)
# join
pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")
print("rows, columns", pdf_combined.shape)
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())
('baseline_extend.pkl.bz2', (356255, 77)) ('bureau_balance_1year.pkl.bz2', (123107, 35)) ('bureau_balance_2year.pkl.bz2', (110354, 35)) ('bureau_balance_gt3year.pkl.bz2', (99247, 35)) ('bureau_balance_lt1year.pkl.bz2', (132250, 35)) ('bureau_balance.pkl.bz2', (134542, 35)) ('bureau.pkl.bz2', (305811, 87)) ('credit_card_balance.pkl.bz2', (103558, 111)) ('installments_payments_gt3year.pkl.bz2', (209639, 35)) ('installments_payments_in1year.pkl.bz2', (238405, 35)) ('installments_payments_in2year.pkl.bz2', (171713, 35)) ('installments_payments.pkl.bz2', (339587, 35)) ('pos_cash_gt3year.pkl.bz2', (212897, 31)) ('pos_cash_in1year.pkl.bz2', (239502, 31)) ('pos_cash_in2year.pkl.bz2', (171658, 33)) ('pos_cash.pkl.bz2', (337252, 36)) ('prev_app.pkl.bz2', (338857, 244)) ('rows, columns', (356255, 1042))
SK_ID_CURR | NAME_INCOME_TYPE_Working | is_REGION_RATING_CLIENT_W_CITY | is_REGION_RATING_CLIENT | is_CODE_GENDER | NAME_EDUCATION_TYPE_Higher_education | NAME_EDUCATION_TYPE_Secondary___secondary_special | is_REG_CITY_NOT_WORK_CITY | is_FLAG_DOCUMENT_3 | HOUSETYPE_MODE_block_of_flats | ... | prev_app_DAYS_FIRST_DUE_TO_YEARS_min | prev_app_AMT_DOWN_PAYMENT_min | prev_app_NFLAG_INSURED_ON_APPROVAL_mean | prev_app_NFLAG_INSURED_ON_APPROVAL_min | prev_app_DAYS_LAST_DUE_TO_YEARS_min | prev_app_DAYS_TERMINATION_TO_YEARS_min | prev_app_RATE_DOWN_PAYMENT_min | prev_app_AMT_CREDIT_std | prev_app_RATE_INTEREST_PRIMARY_mean | prev_app_SK_ID_PREV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | 1 | 1 | ... | 1.547945 | 0.0 | 0.000000 | 0.0 | 0.068493 | 0.046575 | 0.000000 | NaN | 0.189122 | 1.0 |
1 | 100003 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 1.961644 | 0.0 | 0.666667 | 0.0 | 1.468493 | 1.443836 | 0.000000 | 497949.861808 | 0.189122 | 3.0 |
2 | 100004 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 2.147945 | 4860.0 | 0.000000 | 0.0 | 1.983562 | 1.956164 | 0.212008 | NaN | 0.189122 | 1.0 |
3 | 100006 | 1 | 2 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0.413699 | 0.0 | 0.000000 | 0.0 | 0.413699 | 0.391781 | 0.051605 | 333337.354853 | 0.189122 | 9.0 |
4 | 100007 | 1 | 2 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | ... | 0.942466 | 0.0 | 0.600000 | 0.0 | 0.969863 | 0.950685 | 0.051605 | 118032.409509 | 0.189122 | 6.0 |
5 rows × 1042 columns
CPU times: user 2min 2s, sys: 8.96 s, total: 2min 11s Wall time: 54.4 s
%%time
if False:
def filter_feat_low_auc(pdf_label, pdf_input, threshold=0.501):
pdf_eval = feature_evaluate(pdf_label, pdf_input)
ls_filtered_feat = pdf_eval.query("auc > {}".format(threshold))["name"].tolist()
return ls_filtered_feat
# load train data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)
# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
.merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
.drop(columns=["tvt_code"]))
ls_filtered_feat = filter_feat_low_auc(pdf_train_filtered, pdf_combined, threshold=0.501)
pdf_combined = pdf_combined[["SK_ID_CURR"] + ls_filtered_feat]
print("After filtered: {}".format(pdf_combined.shape))
pdf_tvt = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
print(pdf_tvt.shape)
display(pdf_tvt.head())
(356255, 3)
SK_ID_CURR | TARGET | tvt_code | |
---|---|---|---|
0 | 100002 | 1 | train |
1 | 100003 | 0 | train |
2 | 100004 | 0 | train |
3 | 100006 | 0 | train |
4 | 100007 | 0 | train |
pdf_tvt["tvt_code"].value_counts()
train 216948 kaggle_test 48744 test 46127 val 44436 Name: tvt_code, dtype: int64
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)
display(pdf_features_label.head().T)
(356255, 1044)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
SK_ID_CURR | 100002 | 100003 | 100004 | 100006 | 100007 |
TARGET | 1 | 0 | 0 | 0 | 0 |
tvt_code | train | train | train | train | train |
NAME_INCOME_TYPE_Working | 1 | 0 | 1 | 1 | 1 |
is_REGION_RATING_CLIENT_W_CITY | 2 | 1 | 2 | 2 | 2 |
is_REGION_RATING_CLIENT | 2 | 1 | 2 | 2 | 2 |
is_CODE_GENDER | 1 | 0 | 1 | 0 | 1 |
NAME_EDUCATION_TYPE_Higher_education | 0 | 1 | 0 | 0 | 0 |
NAME_EDUCATION_TYPE_Secondary___secondary_special | 1 | 0 | 1 | 1 | 1 |
is_REG_CITY_NOT_WORK_CITY | 0 | 0 | 0 | 0 | 1 |
is_FLAG_DOCUMENT_3 | 1 | 1 | 0 | 1 | 0 |
HOUSETYPE_MODE_block_of_flats | 1 | 1 | 0 | 0 | 0 |
NAME_INCOME_TYPE_Pensioner | 0 | 0 | 0 | 0 | 0 |
ORGANIZATION_TYPE_XNA | 0 | 0 | 0 | 0 | 0 |
is_FLAG_EMP_PHONE | 1 | 1 | 1 | 1 | 1 |
OCCUPATION_TYPE_Laborers | 1 | 0 | 1 | 1 | 0 |
WALLSMATERIAL_MODE_Panel | 0 | 0 | 0 | 0 | 0 |
is_LIVE_CITY_NOT_WORK_CITY | 0 | 0 | 0 | 0 | 1 |
NAME_FAMILY_STATUS_Married | 0 | 1 | 0 | 0 | 0 |
is_FLAG_WORK_PHONE | 0 | 0 | 1 | 0 | 0 |
is_FLAG_PHONE | 1 | 1 | 1 | 0 | 0 |
is_FLAG_OWN_CAR | 0 | 0 | 1 | 0 | 0 |
ORGANIZATION_TYPE_Self_employed | 0 | 0 | 0 | 0 | 0 |
ORGANIZATION_TYPE_Business_Entity_Type_3 | 1 | 0 | 0 | 1 | 0 |
NAME_FAMILY_STATUS_Single___not_married | 1 | 0 | 1 | 0 | 1 |
FONDKAPREMONT_MODE_reg_oper_account | 1 | 1 | 0 | 0 | 0 |
is_NAME_CONTRACT_TYPE | 1 | 1 | 0 | 1 | 1 |
NAME_HOUSING_TYPE_House___apartment | 1 | 1 | 1 | 1 | 1 |
is_FLAG_DOCUMENT_6 | 0 | 0 | 0 | 0 | 0 |
OCCUPATION_TYPE_Drivers | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... |
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std | NaN | 2.34238 | NaN | 2.15334 | 2.55569 |
prev_app_AMT_APPLICATION_max | 179055 | 900000 | 24282 | 688500 | 247500 |
prev_app_AMT_GOODS_PRICE_max | 179055 | 900000 | 24282 | 688500 | 247500 |
prev_app_AMT_ANNUITY_std | NaN | 46332.6 | NaN | 15995.2 | 8063.59 |
prev_app_RATE_DOWN_PAYMENT_sum | 0 | 0.151666 | 0.212008 | 0.68806 | 0.525453 |
prev_app_NFLAG_INSURED_ON_APPROVAL_sum | 0 | 2 | 0 | 0 | 3 |
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min | -0.342466 | 1.05753 | 1.90137 | -3.44932 | -0.947945 |
prev_app_RATE_INTEREST_PRIVILEGED_mean | 0.835095 | 0.835095 | 0.835095 | 0.835095 | 0.835095 |
prev_app_AMT_APPLICATION_sum | 179055 | 1.30631e+06 | 24282 | 2.44983e+06 | 903182 |
prev_app_AMT_GOODS_PRICE_sum | 179055 | 1.30631e+06 | 24282 | 2.44983e+06 | 903182 |
prev_app_AMT_CREDIT_max | 179055 | 1.03588e+06 | 20106 | 906615 | 284400 |
prev_app_NFLAG_INSURED_ON_APPROVAL_max | 0 | 1 | 0 | 0 | 1 |
prev_app_RATE_INTEREST_PRIMARY_sum | 0.189122 | 0.567367 | 0.189122 | 1.7021 | 1.13473 |
prev_app_RATE_INTEREST_PRIVILEGED_sum | 0.835095 | 2.50529 | 0.835095 | 7.51586 | 5.01057 |
prev_app_NFLAG_INSURED_ON_APPROVAL_std | NaN | 0.57735 | NaN | 0 | 0.547723 |
prev_app_CNT_PAYMENT_mean | 24 | 10 | 4 | 15.3333 | 20.6667 |
prev_app_CNT_PAYMENT_sum | 24 | 30 | 4 | 138 | 124 |
prev_app_AMT_APPLICATION_std | NaN | 424162 | NaN | 286175 | 100586 |
prev_app_AMT_GOODS_PRICE_std | NaN | 424162 | NaN | 286175 | 100586 |
prev_app_AMT_CREDIT_sum | 179055 | 1.45257e+06 | 20106 | 2.62526e+06 | 999832 |
prev_app_DAYS_FIRST_DUE_TO_YEARS_min | 1.54795 | 1.96164 | 2.14795 | 0.413699 | 0.942466 |
prev_app_AMT_DOWN_PAYMENT_min | 0 | 0 | 4860 | 0 | 0 |
prev_app_NFLAG_INSURED_ON_APPROVAL_mean | 0 | 0.666667 | 0 | 0 | 0.6 |
prev_app_NFLAG_INSURED_ON_APPROVAL_min | 0 | 0 | 0 | 0 | 0 |
prev_app_DAYS_LAST_DUE_TO_YEARS_min | 0.0684932 | 1.46849 | 1.98356 | 0.413699 | 0.969863 |
prev_app_DAYS_TERMINATION_TO_YEARS_min | 0.0465753 | 1.44384 | 1.95616 | 0.391781 | 0.950685 |
prev_app_RATE_DOWN_PAYMENT_min | 0 | 0 | 0.212008 | 0.0516051 | 0.0516051 |
prev_app_AMT_CREDIT_std | NaN | 497950 | NaN | 333337 | 118032 |
prev_app_RATE_INTEREST_PRIMARY_mean | 0.189122 | 0.189122 | 0.189122 | 0.189122 | 0.189122 |
prev_app_SK_ID_PREV | 1 | 3 | 1 | 9 | 6 |
1044 rows × 5 columns
%%time
# save combined features with label
# pdf_features_label.to_pickle(os.path.join("features", "pdf_features_label.pkl.bz2"), compression="bz2")
pdf_features_label.to_csv(os.path.join("features", "pdf_features_label.csv.bz2"), compression="bz2")
CPU times: user 4min 57s, sys: 6.65 s, total: 5min 4s Wall time: 5min 6s