# Full width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%load_ext autoreload
%autoreload 2
import math
import os
import subprocess
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
#
from lib_modeling import *
from lib_feature_engineering import *
from sklearn.preprocessing import Imputer
# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)
# load train/test data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)
data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)
# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
.merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
.drop(columns=["tvt_code"]))
pdf_train_filtered.head()
SK_ID_CURR | TARGET | |
---|---|---|
0 | 100002 | 1 |
1 | 100003 | 0 |
2 | 100004 | 0 |
3 | 100006 | 0 |
4 | 100007 | 0 |
# load previous application
data_path = "home-credit-default-risk/previous_application.csv"
pdf_prev_app = pd.read_csv(data_path)
print(pdf_prev_app.shape)
pdf_prev_app.head()
(1670214, 37)
SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.1828 | 0.8673 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
# load meta data
meta_path = "../02_pandas/reports/report_previous_application.csv"
pdf_meta = pd.read_csv(meta_path)
pdf_prev_app["NAME_CONTRACT_TYPE"].value_counts()
Cash loans 747553 Consumer loans 729151 Revolving loans 193164 XNA 346 Name: NAME_CONTRACT_TYPE, dtype: int64
# số lượng XNA khá nhỏ, có 346 records. Replace thành Cash loans
pdf_prev_app.loc[pdf_prev_app["NAME_CONTRACT_TYPE"] == "XNA", "NAME_CONTRACT_TYPE"] = "Cash loans"
pdf_prev_app["NAME_CONTRACT_TYPE"].value_counts()
Cash loans 747899 Consumer loans 729151 Revolving loans 193164 Name: NAME_CONTRACT_TYPE, dtype: int64
pdf_prev_app["AMT_ANNUITY"].isna().mean()
0.22286665062081865
pdf_prev_app[pdf_prev_app["AMT_ANNUITY"].isna()]["NAME_CONTRACT_STATUS"].value_counts()
Canceled 305805 Refused 40898 Unused offer 25524 Approved 8 Name: NAME_CONTRACT_STATUS, dtype: int64
# AMT_ANNUITY bị null khi tình trạng của HĐ là hủy.
# chỉ một số ít AMT_ANNUITY là null nhưng tình trạng của HĐ là approved.
# Fill giá trị này là 0
pdf_prev_app.loc[pdf_prev_app["AMT_ANNUITY"].isna(), "AMT_ANNUITY"] = 0
pdf_prev_app["AMT_ANNUITY"].isna().mean()
0.0
KH phải trả số tiền ban đầu là bao nhiêu.
pdf_prev_app["AMT_DOWN_PAYMENT"].isna().mean()
0.536364801157217
# Mặc định KH không trả giá trị ban đầu mà vay toàn bộ. Fill 0 cho các giá trị null này
pdf_prev_app.loc[pdf_prev_app["AMT_DOWN_PAYMENT"].isna(), "AMT_DOWN_PAYMENT"] = 0
pdf_prev_app["AMT_DOWN_PAYMENT"].isna().mean()
0.0
Số tiền của sản phẩm mà khách hàng muốn vay trả góp
pdf_prev_app["AMT_GOODS_PRICE"].isna().mean()
0.23081772754868538
# Có rất nhiều giá trị 0 tại cột AMT_CREDIT khi AMT_GOODS_PRICE là Null
series_check = pdf_prev_app[pdf_prev_app["AMT_GOODS_PRICE"].isna()]["AMT_CREDIT"]
series_check[series_check == 0.0].count() * 1.0 / series_check.shape[0]
0.8664747156401178
# giải ngân là 0, tình trạng của HĐ là approved
pdf_prev_app.loc[series_check[series_check == 0.0]]["NAME_CONTRACT_STATUS"].value_counts()
Approved 334039 Name: NAME_CONTRACT_STATUS, dtype: int64
# giải ngân là 0, mục đích vay là XAP
pdf_prev_app.loc[series_check[series_check == 0.0]]["NAME_CASH_LOAN_PURPOSE"].value_counts()
XAP 334039 Name: NAME_CASH_LOAN_PURPOSE, dtype: int64
# Fill 0 cho các giá trị AMT_GOODS_PRICE là 0
pdf_prev_app.loc[pdf_prev_app["AMT_GOODS_PRICE"].isna(), "AMT_GOODS_PRICE"] = 0
pdf_prev_app["AMT_GOODS_PRICE"].isna().mean()
0.0
Lãi suất của khoản vay
pdf_prev_app["RATE_DOWN_PAYMENT"].isna().mean()
0.536364801157217
pdf_prev_app["RATE_INTEREST_PRIMARY"].isna().mean()
0.9964369835242669
pdf_prev_app["RATE_INTEREST_PRIVILEGED"].isna().mean()
0.9964369835242669
# Fill median với giả định ngân hàng đa phần cho vay với tỉ lệ này nên hiếm khi điền thông tin cho trường này
imputer = Imputer(strategy = 'median')
pdf_prev_app['RATE_DOWN_PAYMENT'] = imputer.fit_transform(pdf_prev_app[['RATE_DOWN_PAYMENT']])
pdf_prev_app['RATE_INTEREST_PRIMARY'] = imputer.fit_transform(pdf_prev_app[['RATE_INTEREST_PRIMARY']])
pdf_prev_app['RATE_INTEREST_PRIVILEGED'] = imputer.fit_transform(pdf_prev_app[['RATE_INTEREST_PRIVILEGED']])
Kì hạn của khoản vay
pdf_prev_app["CNT_PAYMENT"].isna().mean()
0.22286365699245725
# Có rất nhiều giá trị 0 tại cột AMT_CREDIT khi CNT_PAYMENT là Null
series_check = pdf_prev_app[pdf_prev_app["CNT_PAYMENT"].isna()]["AMT_CREDIT"]
series_check[series_check == 0.0].count() * 1.0 / series_check.shape[0]
0.9003707385218817
pdf_prev_app.loc[series_check[series_check == 0.0].index]["NAME_CONTRACT_STATUS"].value_counts()
Canceled 305566 Refused 29087 Unused offer 492 Name: NAME_CONTRACT_STATUS, dtype: int64
pdf_prev_app.loc[series_check[series_check > 0.0].index]["NAME_CONTRACT_STATUS"].value_counts()
Unused offer 25032 Refused 11810 Canceled 239 Approved 4 Name: NAME_CONTRACT_STATUS, dtype: int64
ls_days = [cname for cname in pdf_prev_app.columns if "DAYS" in cname]
ls_days
['DAYS_DECISION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
# Những rows có CNT_PAYMENT là null thì các trường DAYS* đều là null.
# Có thể KH đã trả trước.
pdf_prev_app.loc[series_check.index][ls_days].describe()
DAYS_DECISION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | |
---|---|---|---|---|---|---|
count | 372230.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | -316.8921 | NaN | NaN | NaN | NaN | NaN |
std | 325.8686 | NaN | NaN | NaN | NaN | NaN |
min | -2580.0000 | NaN | NaN | NaN | NaN | NaN |
25% | -363.0000 | NaN | NaN | NaN | NaN | NaN |
50% | -250.0000 | NaN | NaN | NaN | NaN | NaN |
75% | -153.0000 | NaN | NaN | NaN | NaN | NaN |
max | -2.0000 | NaN | NaN | NaN | NaN | NaN |
# Fill 0 cho các trường hợp này
pdf_prev_app.loc[pdf_prev_app["CNT_PAYMENT"].isna(), "CNT_PAYMENT"] = 0
pdf_prev_app["CNT_PAYMENT"].isna().mean()
0.0
Ngày giải ngân đầu tiên
# Đây là trường hợp
# Đặt các món vay này có PRE_PAID_ALL = 1
# Fill giá trị PRE_PAID_ALL các món trên là 1
pdf_prev_app["DAYS_FIRST_DRAWING"].isna().mean()
0.40298129461254667
# DAYS_FIRST_DRAWING bị null mà vẫn được approved?
# có thể KH trả toàn bộ tiền, thay đổi ý kiến trước khi nhận được tiền vay
pdf_prev_app[pdf_prev_app["DAYS_FIRST_DRAWING"].isna()]["NAME_CONTRACT_STATUS"].value_counts()
Canceled 316319 Refused 290678 Approved 39632 Unused offer 26436 Name: NAME_CONTRACT_STATUS, dtype: int64
# tạo feature xác định có phải là prepaid user hay không
pdf_prev_app["PREPAID_USER"] = 0
pdf_prev_app.loc[pdf_prev_app["DAYS_FIRST_DRAWING"].isna(), "PREPAID_USER"] = 1
pdf_prev_app["PREPAID_USER"].value_counts()
0 997149 1 673065 Name: PREPAID_USER, dtype: int64
Các giá trị 365243 thể hiện giá trị vô hạn, tương đương null value. Do đó, fill toàn bộ giá trị này thành null và đánh dấu điểm dị biệt này
def handling_days(pdf_input, cname):
# Create an anomalous flag column
pdf_input["{}_ANOM".format(cname)] = pdf_input[cname] == 365243
# Replace the anomalous values with nan
pdf_input[cname] = pdf_input[cname].replace({365243: np.nan})
# Calculate years
pdf_input["{}_TO_YEARS".format(cname)] = pdf_input[cname] / -365
pdf_input.drop(columns=[cname], inplace=True)
return pdf_input
for cname in ls_days:
print("{}: {}".format(cname, (pdf_prev_app[cname] == 365243).mean()))
pdf_prev_app = handling_days(pdf_prev_app, cname)
DAYS_DECISION: 0.0 DAYS_FIRST_DRAWING: 0.559475612107 DAYS_FIRST_DUE: 0.0243352049498 DAYS_LAST_DUE_1ST_VERSION: 0.0561987865028 DAYS_LAST_DUE: 0.126463435224 DAYS_TERMINATION: 0.135259912802
NAME_CASH_LOAN_PURPOSE, NAME_CONTRACT_STATUS, NAME_PAYMENT_TYPE, CODE_REJECT_REASON, NAME_CLIENT_TYPE, NAME_GOODS_CATEGORY, NAME_PORTFOLIO, CHANNEL_TYPE, NAME_SELLER_INDUSTRY, NAME_YIELD_GROUP
def ordinal_encoding(pdf_input, cname):
# ranking by count frequency
series_ranking = pdf_prev_app[cname].value_counts()
# generate mapping to ordinal
cate_map = dict(zip(series_ranking.index, range(series_ranking.shape[0])))
# transform values
pdf_prev_app["{}_ordinal".format(cname)] = pdf_prev_app[cname].apply(lambda x: cate_map[x])
pdf_input["{}_ordinal".format(cname)] = pdf_input[cname].apply(lambda x: cate_map[x])
return pdf_input
ls_to_ordinal = [
"NAME_CASH_LOAN_PURPOSE",
"NAME_CONTRACT_STATUS",
"NAME_PAYMENT_TYPE",
"CODE_REJECT_REASON",
"NAME_CLIENT_TYPE",
"NAME_GOODS_CATEGORY",
"NAME_PORTFOLIO",
"CHANNEL_TYPE",
"NAME_SELLER_INDUSTRY",
"NAME_YIELD_GROUP"
]
for cname in ls_to_ordinal:
print("Encoding {}...".format(cname))
pdf_prev_app = ordinal_encoding(pdf_prev_app, cname)
# drop columns
pdf_prev_app.drop(columns=ls_to_ordinal, inplace=True)
Encoding NAME_CASH_LOAN_PURPOSE... Encoding NAME_CONTRACT_STATUS... Encoding NAME_PAYMENT_TYPE... Encoding CODE_REJECT_REASON... Encoding NAME_CLIENT_TYPE... Encoding NAME_GOODS_CATEGORY... Encoding NAME_PORTFOLIO... Encoding CHANNEL_TYPE... Encoding NAME_SELLER_INDUSTRY... Encoding NAME_YIELD_GROUP...
Ai đi cùng KH tới vay
pdf_prev_app["NAME_TYPE_SUITE"].isna().mean()
0.4911975351661524
# Fill na là Unaccompanied
pdf_prev_app.loc[pdf_prev_app["NAME_TYPE_SUITE"].isna(), "NAME_TYPE_SUITE"] = "Unaccompanied"
pdf_prev_app["NAME_TYPE_SUITE"].isna().mean()
0.0
# get list categorical attributes
# ls_cate = pdf_meta.query("sub_type == 'object'")["name"].tolist()
series_type = pdf_prev_app.dtypes
ls_cate = series_type[series_type == "object"].index.tolist()
ls_cate
['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_TYPE_SUITE', 'NAME_PRODUCT_TYPE', 'PRODUCT_COMBINATION']
# construct categorical mapping
dict_onehot = {}
for cate in ls_cate:
ls_val = pdf_prev_app[cate].value_counts().index.tolist()
dict_onehot[cate] = ls_val
%%time
pdf_onehot = gen_one_hot_feat(pdf_prev_app, dict_onehot, main_key="SK_ID_CURR")
print(pdf_onehot.shape)
(1670214, 40) CPU times: user 1min 19s, sys: 2.64 s, total: 1min 21s Wall time: 28.9 s
%%time
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"], main_key="SK_ID_CURR")
eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)
display(eval_agg01)
{'FLAG_LAST_APPL_PER_CONTRACT_N': ['max', 'sum', 'mean'], 'FLAG_LAST_APPL_PER_CONTRACT_Y': ['max', 'sum', 'mean'], 'NAME_CONTRACT_TYPE_Cash_loans': ['max', 'sum', 'mean'], 'NAME_CONTRACT_TYPE_Consumer_loans': ['max', 'sum', 'mean'], 'NAME_CONTRACT_TYPE_Revolving_loans': ['max', 'sum', 'mean'], 'NAME_PRODUCT_TYPE_XNA': ['max', 'sum', 'mean'], 'NAME_PRODUCT_TYPE_walk_in': ['max', 'sum', 'mean'], 'NAME_PRODUCT_TYPE_x_sell': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Children': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Family': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Group_of_people': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Other_A': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Other_B': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Spouse,_partner': ['max', 'sum', 'mean'], 'NAME_TYPE_SUITE_Unaccompanied': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Card_Street': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Card_X_Sell': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_Street__high': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_Street__low': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_Street__middle': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_X_Sell__high': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_X_Sell__low': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_Cash_X_Sell__middle': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_household_with_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_household_without_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_industry_with_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_industry_without_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_mobile_with_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_mobile_without_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_other_with_interest': ['max', 'sum', 'mean'], 'PRODUCT_COMBINATION_POS_others_without_interest': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_FRIDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_MONDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_SATURDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_SUNDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_THURSDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_TUESDAY': ['max', 'sum', 'mean'], 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY': ['max', 'sum', 'mean']}
After agg: (338857, 117)
name | auc | corr | coverage | |
---|---|---|---|---|
58 | NAME_PRODUCT_TYPE_walk_in_sum | 0.5502 | 0.0642 | 1.0 |
59 | NAME_PRODUCT_TYPE_walk_in_mean | 0.5499 | 0.0588 | 1.0 |
57 | NAME_PRODUCT_TYPE_walk_in_max | 0.5468 | 0.0595 | 1.0 |
115 | NAME_CONTRACT_TYPE_Revolving_loans_sum | 0.5378 | 0.0471 | 1.0 |
116 | NAME_CONTRACT_TYPE_Revolving_loans_mean | 0.5358 | 0.0394 | 1.0 |
100 | PRODUCT_COMBINATION_Card_Street_sum | 0.5343 | 0.0501 | 1.0 |
101 | PRODUCT_COMBINATION_Card_Street_mean | 0.5333 | 0.0416 | 1.0 |
98 | NAME_CONTRACT_TYPE_Consumer_loans_mean | 0.5328 | -0.0319 | 1.0 |
114 | NAME_CONTRACT_TYPE_Revolving_loans_max | 0.5327 | 0.0375 | 1.0 |
99 | PRODUCT_COMBINATION_Card_Street_max | 0.5317 | 0.0417 | 1.0 |
14 | PRODUCT_COMBINATION_Cash_X_Sell__low_mean | 0.5307 | -0.0400 | 1.0 |
12 | PRODUCT_COMBINATION_Cash_X_Sell__low_max | 0.5288 | -0.0379 | 1.0 |
13 | PRODUCT_COMBINATION_Cash_X_Sell__low_sum | 0.5287 | -0.0249 | 1.0 |
71 | PRODUCT_COMBINATION_POS_industry_with_interest_mean | 0.5282 | -0.0353 | 1.0 |
97 | NAME_CONTRACT_TYPE_Consumer_loans_sum | 0.5272 | -0.0197 | 1.0 |
8 | NAME_TYPE_SUITE_Unaccompanied_mean | 0.5270 | 0.0238 | 1.0 |
70 | PRODUCT_COMBINATION_POS_industry_with_interest_sum | 0.5269 | -0.0312 | 1.0 |
69 | PRODUCT_COMBINATION_POS_industry_with_interest_max | 0.5266 | -0.0352 | 1.0 |
56 | NAME_TYPE_SUITE_Family_mean | 0.5241 | -0.0216 | 1.0 |
43 | PRODUCT_COMBINATION_Cash_X_Sell__high_sum | 0.5229 | 0.0384 | 1.0 |
44 | PRODUCT_COMBINATION_Cash_X_Sell__high_mean | 0.5225 | 0.0320 | 1.0 |
42 | PRODUCT_COMBINATION_Cash_X_Sell__high_max | 0.5224 | 0.0376 | 1.0 |
29 | NAME_PRODUCT_TYPE_XNA_mean | 0.5219 | -0.0208 | 1.0 |
34 | PRODUCT_COMBINATION_Cash_Street__high_sum | 0.5215 | 0.0322 | 1.0 |
54 | NAME_TYPE_SUITE_Family_max | 0.5215 | -0.0242 | 1.0 |
35 | PRODUCT_COMBINATION_Cash_Street__high_mean | 0.5215 | 0.0317 | 1.0 |
55 | NAME_TYPE_SUITE_Family_sum | 0.5214 | -0.0164 | 1.0 |
33 | PRODUCT_COMBINATION_Cash_Street__high_max | 0.5213 | 0.0368 | 1.0 |
7 | NAME_TYPE_SUITE_Unaccompanied_sum | 0.5198 | 0.0273 | 1.0 |
92 | PRODUCT_COMBINATION_POS_household_without_interest_mean | 0.5197 | -0.0239 | 1.0 |
91 | PRODUCT_COMBINATION_POS_household_without_interest_sum | 0.5191 | -0.0232 | 1.0 |
90 | PRODUCT_COMBINATION_POS_household_without_interest_max | 0.5190 | -0.0265 | 1.0 |
25 | PRODUCT_COMBINATION_Cash_sum | 0.5183 | 0.0236 | 1.0 |
112 | NAME_CONTRACT_TYPE_Cash_loans_sum | 0.5172 | 0.0217 | 1.0 |
110 | PRODUCT_COMBINATION_POS_mobile_with_interest_mean | 0.5167 | 0.0212 | 1.0 |
26 | PRODUCT_COMBINATION_Cash_mean | 0.5166 | 0.0180 | 1.0 |
62 | NAME_PRODUCT_TYPE_x_sell_mean | 0.5160 | -0.0197 | 1.0 |
24 | PRODUCT_COMBINATION_Cash_max | 0.5152 | 0.0175 | 1.0 |
108 | PRODUCT_COMBINATION_POS_mobile_with_interest_max | 0.5145 | 0.0162 | 1.0 |
109 | PRODUCT_COMBINATION_POS_mobile_with_interest_sum | 0.5144 | 0.0132 | 1.0 |
113 | NAME_CONTRACT_TYPE_Cash_loans_mean | 0.5141 | 0.0140 | 1.0 |
4 | PRODUCT_COMBINATION_Cash_Street__middle_sum | 0.5122 | 0.0266 | 1.0 |
3 | PRODUCT_COMBINATION_Cash_Street__middle_max | 0.5120 | 0.0258 | 1.0 |
5 | PRODUCT_COMBINATION_Cash_Street__middle_mean | 0.5119 | 0.0163 | 1.0 |
77 | PRODUCT_COMBINATION_Cash_X_Sell__middle_mean | 0.5115 | -0.0164 | 1.0 |
20 | PRODUCT_COMBINATION_POS_household_with_interest_mean | 0.5111 | -0.0082 | 1.0 |
19 | PRODUCT_COMBINATION_POS_household_with_interest_sum | 0.5105 | -0.0081 | 1.0 |
75 | PRODUCT_COMBINATION_Cash_X_Sell__middle_max | 0.5100 | -0.0128 | 1.0 |
76 | PRODUCT_COMBINATION_Cash_X_Sell__middle_sum | 0.5100 | -0.0077 | 1.0 |
64 | PRODUCT_COMBINATION_Card_X_Sell_sum | 0.5100 | 0.0178 | 1.0 |
79 | FLAG_LAST_APPL_PER_CONTRACT_Y_sum | 0.5100 | 0.0193 | 1.0 |
53 | NAME_TYPE_SUITE_Children_mean | 0.5099 | -0.0175 | 1.0 |
51 | NAME_TYPE_SUITE_Children_max | 0.5097 | -0.0203 | 1.0 |
52 | NAME_TYPE_SUITE_Children_sum | 0.5097 | -0.0175 | 1.0 |
111 | NAME_CONTRACT_TYPE_Cash_loans_max | 0.5096 | 0.0107 | 1.0 |
63 | PRODUCT_COMBINATION_Card_X_Sell_max | 0.5095 | 0.0131 | 1.0 |
65 | PRODUCT_COMBINATION_Card_X_Sell_mean | 0.5090 | 0.0094 | 1.0 |
11 | WEEKDAY_APPR_PROCESS_START_SATURDAY_mean | 0.5090 | -0.0098 | 1.0 |
46 | WEEKDAY_APPR_PROCESS_START_MONDAY_sum | 0.5090 | 0.0138 | 1.0 |
18 | PRODUCT_COMBINATION_POS_household_with_interest_max | 0.5085 | -0.0094 | 1.0 |
60 | NAME_PRODUCT_TYPE_x_sell_max | 0.5084 | -0.0093 | 1.0 |
83 | PRODUCT_COMBINATION_POS_industry_without_interest_mean | 0.5080 | -0.0216 | 1.0 |
81 | PRODUCT_COMBINATION_POS_industry_without_interest_max | 0.5080 | -0.0246 | 1.0 |
82 | PRODUCT_COMBINATION_POS_industry_without_interest_sum | 0.5080 | -0.0227 | 1.0 |
61 | NAME_PRODUCT_TYPE_x_sell_sum | 0.5078 | -0.0034 | 1.0 |
95 | WEEKDAY_APPR_PROCESS_START_SUNDAY_mean | 0.5078 | -0.0078 | 1.0 |
96 | NAME_CONTRACT_TYPE_Consumer_loans_max | 0.5077 | -0.0161 | 1.0 |
9 | WEEKDAY_APPR_PROCESS_START_SATURDAY_max | 0.5068 | -0.0076 | 1.0 |
73 | WEEKDAY_APPR_PROCESS_START_WEDNESDAY_sum | 0.5068 | 0.0114 | 1.0 |
1 | WEEKDAY_APPR_PROCESS_START_FRIDAY_sum | 0.5066 | 0.0124 | 1.0 |
93 | WEEKDAY_APPR_PROCESS_START_SUNDAY_max | 0.5066 | -0.0078 | 1.0 |
45 | WEEKDAY_APPR_PROCESS_START_MONDAY_max | 0.5064 | 0.0072 | 1.0 |
94 | WEEKDAY_APPR_PROCESS_START_SUNDAY_sum | 0.5059 | -0.0024 | 1.0 |
37 | WEEKDAY_APPR_PROCESS_START_THURSDAY_sum | 0.5058 | 0.0144 | 1.0 |
47 | WEEKDAY_APPR_PROCESS_START_MONDAY_mean | 0.5058 | 0.0053 | 1.0 |
16 | PRODUCT_COMBINATION_Cash_Street__low_sum | 0.5050 | 0.0126 | 1.0 |
15 | PRODUCT_COMBINATION_Cash_Street__low_max | 0.5049 | 0.0116 | 1.0 |
17 | PRODUCT_COMBINATION_Cash_Street__low_mean | 0.5048 | 0.0023 | 1.0 |
88 | WEEKDAY_APPR_PROCESS_START_TUESDAY_sum | 0.5046 | 0.0130 | 1.0 |
74 | WEEKDAY_APPR_PROCESS_START_WEDNESDAY_mean | 0.5042 | 0.0055 | 1.0 |
72 | WEEKDAY_APPR_PROCESS_START_WEDNESDAY_max | 0.5041 | 0.0046 | 1.0 |
10 | WEEKDAY_APPR_PROCESS_START_SATURDAY_sum | 0.5039 | 0.0032 | 1.0 |
6 | NAME_TYPE_SUITE_Unaccompanied_max | 0.5038 | 0.0079 | 1.0 |
0 | WEEKDAY_APPR_PROCESS_START_FRIDAY_max | 0.5034 | 0.0037 | 1.0 |
2 | WEEKDAY_APPR_PROCESS_START_FRIDAY_mean | 0.5033 | 0.0053 | 1.0 |
86 | NAME_TYPE_SUITE_Spouse,_partner_mean | 0.5021 | -0.0041 | 1.0 |
36 | WEEKDAY_APPR_PROCESS_START_THURSDAY_max | 0.5020 | 0.0023 | 1.0 |
27 | NAME_PRODUCT_TYPE_XNA_max | 0.5020 | -0.0059 | 1.0 |
28 | NAME_PRODUCT_TYPE_XNA_sum | 0.5019 | 0.0071 | 1.0 |
84 | NAME_TYPE_SUITE_Spouse,_partner_max | 0.5018 | -0.0027 | 1.0 |
32 | PRODUCT_COMBINATION_POS_mobile_without_interest_mean | 0.5017 | -0.0022 | 1.0 |
30 | PRODUCT_COMBINATION_POS_mobile_without_interest_max | 0.5017 | -0.0039 | 1.0 |
31 | PRODUCT_COMBINATION_POS_mobile_without_interest_sum | 0.5017 | -0.0024 | 1.0 |
50 | PRODUCT_COMBINATION_POS_other_with_interest_mean | 0.5016 | -0.0049 | 1.0 |
48 | PRODUCT_COMBINATION_POS_other_with_interest_max | 0.5015 | -0.0036 | 1.0 |
49 | PRODUCT_COMBINATION_POS_other_with_interest_sum | 0.5015 | -0.0029 | 1.0 |
85 | NAME_TYPE_SUITE_Spouse,_partner_sum | 0.5015 | 0.0027 | 1.0 |
67 | FLAG_LAST_APPL_PER_CONTRACT_N_sum | 0.5015 | 0.0103 | 1.0 |
80 | FLAG_LAST_APPL_PER_CONTRACT_Y_mean | 0.5015 | -0.0068 | 1.0 |
68 | FLAG_LAST_APPL_PER_CONTRACT_N_mean | 0.5015 | 0.0068 | 1.0 |
66 | FLAG_LAST_APPL_PER_CONTRACT_N_max | 0.5015 | 0.0062 | 1.0 |
38 | WEEKDAY_APPR_PROCESS_START_THURSDAY_mean | 0.5014 | 0.0010 | 1.0 |
103 | NAME_TYPE_SUITE_Other_A_sum | 0.5010 | 0.0035 | 1.0 |
104 | NAME_TYPE_SUITE_Other_A_mean | 0.5010 | 0.0055 | 1.0 |
102 | NAME_TYPE_SUITE_Other_A_max | 0.5010 | 0.0038 | 1.0 |
87 | WEEKDAY_APPR_PROCESS_START_TUESDAY_max | 0.5010 | 0.0011 | 1.0 |
106 | NAME_TYPE_SUITE_Other_B_sum | 0.5007 | 0.0029 | 1.0 |
107 | NAME_TYPE_SUITE_Other_B_mean | 0.5007 | 0.0025 | 1.0 |
105 | NAME_TYPE_SUITE_Other_B_max | 0.5007 | 0.0018 | 1.0 |
40 | PRODUCT_COMBINATION_POS_others_without_interest_sum | 0.5004 | -0.0033 | 1.0 |
39 | PRODUCT_COMBINATION_POS_others_without_interest_max | 0.5004 | -0.0030 | 1.0 |
41 | PRODUCT_COMBINATION_POS_others_without_interest_mean | 0.5004 | -0.0019 | 1.0 |
23 | NAME_TYPE_SUITE_Group_of_people_mean | 0.5003 | 0.0034 | 1.0 |
22 | NAME_TYPE_SUITE_Group_of_people_sum | 0.5003 | 0.0023 | 1.0 |
21 | NAME_TYPE_SUITE_Group_of_people_max | 0.5003 | 0.0021 | 1.0 |
89 | WEEKDAY_APPR_PROCESS_START_TUESDAY_mean | 0.5000 | 0.0005 | 1.0 |
78 | FLAG_LAST_APPL_PER_CONTRACT_Y_max | 0.5000 | NaN | 1.0 |
CPU times: user 54.8 s, sys: 1.93 s, total: 56.7 s Wall time: 14.5 s
eval_agg01.query("auc <= 0.501").shape
(12, 4)
sel_feat = eval_agg01.query("auc > 0.501")["name"].tolist()
pdf_agg01 = pdf_agg01[sel_feat]
print(pdf_agg01.shape)
(338857, 105)
# get list numerical attributes
# ls_num = pdf_meta.query("sub_type == 'int64'")["name"].tolist()
series_type = pdf_prev_app.dtypes
ls_num = series_type[series_type == "int64"].index.tolist()
ls_num = [col for col in ls_num if col not in ["SK_ID_PREV", "SK_ID_CURR"]]
ls_num
['HOUR_APPR_PROCESS_START', 'NFLAG_LAST_APPL_IN_DAY', 'SELLERPLACE_AREA', 'PREPAID_USER', 'NAME_CASH_LOAN_PURPOSE_ordinal', 'NAME_CONTRACT_STATUS_ordinal', 'NAME_PAYMENT_TYPE_ordinal', 'CODE_REJECT_REASON_ordinal', 'NAME_CLIENT_TYPE_ordinal', 'NAME_GOODS_CATEGORY_ordinal', 'NAME_PORTFOLIO_ordinal', 'CHANNEL_TYPE_ordinal', 'NAME_SELLER_INDUSTRY_ordinal', 'NAME_YIELD_GROUP_ordinal']
pdf_num = pdf_prev_app[["SK_ID_PREV", "SK_ID_CURR"] + ls_num].copy()
pdf_num.head()
SK_ID_PREV | SK_ID_CURR | HOUR_APPR_PROCESS_START | NFLAG_LAST_APPL_IN_DAY | SELLERPLACE_AREA | PREPAID_USER | NAME_CASH_LOAN_PURPOSE_ordinal | NAME_CONTRACT_STATUS_ordinal | NAME_PAYMENT_TYPE_ordinal | CODE_REJECT_REASON_ordinal | NAME_CLIENT_TYPE_ordinal | NAME_GOODS_CATEGORY_ordinal | NAME_PORTFOLIO_ordinal | CHANNEL_TYPE_ordinal | NAME_SELLER_INDUSTRY_ordinal | NAME_YIELD_GROUP_ordinal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2030495 | 271877 | 15 | 1 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 1 |
1 | 2802425 | 108129 | 11 | 1 | -1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 4 | 0 | 4 |
2 | 2523466 | 122040 | 11 | 1 | -1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
3 | 2819243 | 176158 | 7 | 1 | -1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 1784265 | 202054 | 9 | 1 | -1 | 1 | 2 | 2 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 2 |
%%time
pdf_agg02 = agg_common_data(pdf_num[["SK_ID_CURR"] + ls_num], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)
display(eval_agg02)
{'CHANNEL_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'CODE_REJECT_REASON_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'HOUR_APPR_PROCESS_START': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_CASH_LOAN_PURPOSE_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_CLIENT_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_CONTRACT_STATUS_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_GOODS_CATEGORY_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_PAYMENT_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_PORTFOLIO_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_SELLER_INDUSTRY_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NAME_YIELD_GROUP_ordinal': ['max', 'min', 'sum', 'mean', 'std'], 'NFLAG_LAST_APPL_IN_DAY': ['max', 'min', 'sum', 'mean', 'std'], 'PREPAID_USER': ['max', 'min', 'sum', 'mean', 'std'], 'SELLERPLACE_AREA': ['max', 'min', 'sum', 'mean', 'std']}
After agg: (338857, 70)
name | auc | corr | coverage | |
---|---|---|---|---|
19 | NAME_CONTRACT_STATUS_ordinal_std | 0.5657 | 5.8804e-02 | 0.8192 |
59 | CODE_REJECT_REASON_ordinal_std | 0.5655 | 5.8077e-02 | 0.8192 |
18 | NAME_CONTRACT_STATUS_ordinal_mean | 0.5623 | 7.0461e-02 | 1.0000 |
3 | PREPAID_USER_mean | 0.5574 | 6.0266e-02 | 1.0000 |
58 | CODE_REJECT_REASON_ordinal_mean | 0.5561 | 6.0062e-02 | 1.0000 |
57 | CODE_REJECT_REASON_ordinal_sum | 0.5548 | 6.0961e-02 | 1.0000 |
68 | NAME_YIELD_GROUP_ordinal_mean | 0.5519 | -4.8816e-02 | 1.0000 |
17 | NAME_CONTRACT_STATUS_ordinal_sum | 0.5516 | 5.7904e-02 | 1.0000 |
55 | CODE_REJECT_REASON_ordinal_max | 0.5507 | 4.9633e-02 | 1.0000 |
13 | NAME_GOODS_CATEGORY_ordinal_mean | 0.5494 | -3.6786e-02 | 1.0000 |
65 | NAME_YIELD_GROUP_ordinal_max | 0.5452 | -4.1166e-02 | 1.0000 |
2 | PREPAID_USER_sum | 0.5450 | 4.9427e-02 | 1.0000 |
15 | NAME_CONTRACT_STATUS_ordinal_max | 0.5435 | 4.3386e-02 | 1.0000 |
28 | NAME_PORTFOLIO_ordinal_mean | 0.5429 | 4.2349e-02 | 1.0000 |
10 | NAME_GOODS_CATEGORY_ordinal_max | 0.5426 | -3.2552e-02 | 1.0000 |
12 | NAME_GOODS_CATEGORY_ordinal_sum | 0.5426 | -2.9256e-02 | 1.0000 |
4 | PREPAID_USER_std | 0.5405 | 4.4479e-02 | 0.8192 |
38 | SELLERPLACE_AREA_mean | 0.5382 | -2.6579e-03 | 1.0000 |
33 | HOUR_APPR_PROCESS_START_mean | 0.5377 | -3.6333e-02 | 1.0000 |
54 | NAME_CASH_LOAN_PURPOSE_ordinal_std | 0.5368 | 3.7460e-02 | 0.8192 |
39 | SELLERPLACE_AREA_std | 0.5366 | -4.3819e-03 | 0.8192 |
14 | NAME_GOODS_CATEGORY_ordinal_std | 0.5365 | -2.8191e-02 | 0.8192 |
35 | SELLERPLACE_AREA_max | 0.5351 | -3.1607e-03 | 1.0000 |
23 | NAME_PAYMENT_TYPE_ordinal_mean | 0.5351 | 3.2790e-02 | 1.0000 |
27 | NAME_PORTFOLIO_ordinal_sum | 0.5340 | 4.0230e-02 | 1.0000 |
22 | NAME_PAYMENT_TYPE_ordinal_sum | 0.5339 | 3.8618e-02 | 1.0000 |
31 | HOUR_APPR_PROCESS_START_min | 0.5333 | -3.1684e-02 | 1.0000 |
30 | HOUR_APPR_PROCESS_START_max | 0.5333 | -3.1928e-02 | 1.0000 |
37 | SELLERPLACE_AREA_sum | 0.5325 | -3.9560e-03 | 1.0000 |
25 | NAME_PORTFOLIO_ordinal_max | 0.5311 | 2.9373e-02 | 1.0000 |
29 | NAME_PORTFOLIO_ordinal_std | 0.5305 | 2.9315e-02 | 0.8192 |
0 | PREPAID_USER_max | 0.5289 | 3.2307e-02 | 1.0000 |
43 | NAME_SELLER_INDUSTRY_ordinal_mean | 0.5277 | -2.6485e-02 | 1.0000 |
50 | NAME_CASH_LOAN_PURPOSE_ordinal_max | 0.5260 | 3.5837e-02 | 1.0000 |
67 | NAME_YIELD_GROUP_ordinal_sum | 0.5255 | -1.3066e-02 | 1.0000 |
69 | NAME_YIELD_GROUP_ordinal_std | 0.5242 | -1.9679e-02 | 0.8192 |
40 | NAME_SELLER_INDUSTRY_ordinal_max | 0.5241 | -2.4982e-02 | 1.0000 |
52 | NAME_CASH_LOAN_PURPOSE_ordinal_sum | 0.5237 | 3.4514e-02 | 1.0000 |
53 | NAME_CASH_LOAN_PURPOSE_ordinal_mean | 0.5233 | 2.7435e-02 | 1.0000 |
42 | NAME_SELLER_INDUSTRY_ordinal_sum | 0.5229 | -1.4972e-02 | 1.0000 |
9 | NAME_CLIENT_TYPE_ordinal_std | 0.5216 | -1.7422e-02 | 0.8192 |
66 | NAME_YIELD_GROUP_ordinal_min | 0.5211 | -2.1298e-02 | 1.0000 |
44 | NAME_SELLER_INDUSTRY_ordinal_std | 0.5209 | -2.1940e-02 | 0.8192 |
24 | NAME_PAYMENT_TYPE_ordinal_std | 0.5198 | 2.1540e-02 | 0.8192 |
20 | NAME_PAYMENT_TYPE_ordinal_max | 0.5176 | 1.8720e-02 | 1.0000 |
11 | NAME_GOODS_CATEGORY_ordinal_min | 0.5163 | -1.9570e-02 | 1.0000 |
5 | NAME_CLIENT_TYPE_ordinal_max | 0.5133 | -1.1119e-02 | 1.0000 |
49 | CHANNEL_TYPE_ordinal_std | 0.5110 | 1.3215e-02 | 0.8192 |
62 | NFLAG_LAST_APPL_IN_DAY_sum | 0.5101 | 1.9556e-02 | 1.0000 |
34 | HOUR_APPR_PROCESS_START_std | 0.5100 | -8.9635e-03 | 0.8192 |
41 | NAME_SELLER_INDUSTRY_ordinal_min | 0.5098 | -1.2479e-02 | 1.0000 |
36 | SELLERPLACE_AREA_min | 0.5092 | -1.3533e-03 | 1.0000 |
45 | CHANNEL_TYPE_ordinal_max | 0.5083 | 1.3879e-02 | 1.0000 |
26 | NAME_PORTFOLIO_ordinal_min | 0.5078 | 1.4560e-02 | 1.0000 |
21 | NAME_PAYMENT_TYPE_ordinal_min | 0.5042 | 9.2246e-03 | 1.0000 |
8 | NAME_CLIENT_TYPE_ordinal_mean | 0.5042 | -7.3017e-03 | 1.0000 |
46 | CHANNEL_TYPE_ordinal_min | 0.5038 | -2.6060e-04 | 1.0000 |
6 | NAME_CLIENT_TYPE_ordinal_min | 0.5029 | -1.4025e-04 | 1.0000 |
48 | CHANNEL_TYPE_ordinal_mean | 0.5027 | 5.8389e-03 | 1.0000 |
47 | CHANNEL_TYPE_ordinal_sum | 0.5025 | 2.2143e-02 | 1.0000 |
7 | NAME_CLIENT_TYPE_ordinal_sum | 0.5016 | 3.7297e-03 | 1.0000 |
64 | NFLAG_LAST_APPL_IN_DAY_std | 0.5014 | 4.5610e-03 | 0.8192 |
61 | NFLAG_LAST_APPL_IN_DAY_min | 0.5011 | -5.0978e-03 | 1.0000 |
63 | NFLAG_LAST_APPL_IN_DAY_mean | 0.5011 | -3.5944e-03 | 1.0000 |
51 | NAME_CASH_LOAN_PURPOSE_ordinal_min | 0.5006 | 5.1359e-03 | 1.0000 |
32 | HOUR_APPR_PROCESS_START_sum | 0.5002 | 1.0930e-02 | 1.0000 |
56 | CODE_REJECT_REASON_ordinal_min | 0.5001 | 1.2725e-03 | 1.0000 |
16 | NAME_CONTRACT_STATUS_ordinal_min | 0.5000 | 5.2200e-04 | 1.0000 |
60 | NFLAG_LAST_APPL_IN_DAY_max | 0.5000 | -1.0546e-03 | 1.0000 |
1 | PREPAID_USER_min | 0.5000 | -8.9535e-05 | 1.0000 |
CPU times: user 27.3 s, sys: 1.13 s, total: 28.5 s Wall time: 8.55 s
eval_agg02.query("auc <= 0.501").shape
(6, 4)
sel_feat = eval_agg02.query("auc > 0.501")["name"].tolist()
pdf_agg02 = pdf_agg02[sel_feat]
print(pdf_agg02.shape)
(338857, 64)
# get list continuous attributes
# ls_con = pdf_meta.query("sub_type == 'float64'")["name"].tolist()
series_type = pdf_prev_app.dtypes
ls_con = series_type[series_type == "float64"].index.tolist()
ls_con
['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'CNT_PAYMENT', 'NFLAG_INSURED_ON_APPROVAL', 'DAYS_DECISION_TO_YEARS', 'DAYS_FIRST_DRAWING_TO_YEARS', 'DAYS_FIRST_DUE_TO_YEARS', 'DAYS_LAST_DUE_1ST_VERSION_TO_YEARS', 'DAYS_LAST_DUE_TO_YEARS', 'DAYS_TERMINATION_TO_YEARS']
pdf_con = pdf_prev_app[["SK_ID_PREV", "SK_ID_CURR"] + ls_con].copy()
pdf_con.head()
SK_ID_PREV | SK_ID_CURR | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | CNT_PAYMENT | NFLAG_INSURED_ON_APPROVAL | DAYS_DECISION_TO_YEARS | DAYS_FIRST_DRAWING_TO_YEARS | DAYS_FIRST_DUE_TO_YEARS | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS | DAYS_LAST_DUE_TO_YEARS | DAYS_TERMINATION_TO_YEARS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2030495 | 271877 | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | 0.0000 | 0.1828 | 0.8673 | 12.0 | 0.0 | 0.2000 | NaN | 0.1151 | -0.8219 | 0.1151 | 0.1014 |
1 | 2802425 | 108129 | 25188.615 | 607500.0 | 679671.0 | 0.0 | 607500.0 | 0.0516 | 0.1891 | 0.8351 | 36.0 | 1.0 | 0.4493 | NaN | 0.3671 | -2.5096 | NaN | NaN |
2 | 2523466 | 122040 | 15060.735 | 112500.0 | 136444.5 | 0.0 | 112500.0 | 0.0516 | 0.1891 | 0.8351 | 12.0 | 1.0 | 0.8247 | NaN | 0.7425 | -0.1616 | NaN | NaN |
3 | 2819243 | 176158 | 47041.335 | 450000.0 | 470790.0 | 0.0 | 450000.0 | 0.0516 | 0.1891 | 0.8351 | 12.0 | 1.0 | 1.4027 | NaN | 1.3205 | 0.4164 | 0.4986 | 0.4849 |
4 | 1784265 | 202054 | 31924.395 | 337500.0 | 404055.0 | 0.0 | 337500.0 | 0.0516 | 0.1891 | 0.8351 | 24.0 | NaN | 2.1397 | NaN | NaN | NaN | NaN | NaN |
%%time
pdf_agg03 = agg_common_data(pdf_con[["SK_ID_CURR"] + ls_con], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)
display(eval_agg03)
{'AMT_ANNUITY': ['max', 'min', 'sum', 'mean', 'std'], 'AMT_APPLICATION': ['max', 'min', 'sum', 'mean', 'std'], 'AMT_CREDIT': ['max', 'min', 'sum', 'mean', 'std'], 'AMT_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'], 'AMT_GOODS_PRICE': ['max', 'min', 'sum', 'mean', 'std'], 'CNT_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_DECISION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_FIRST_DRAWING_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_FIRST_DUE_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_LAST_DUE_1ST_VERSION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_LAST_DUE_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'DAYS_TERMINATION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'], 'NFLAG_INSURED_ON_APPROVAL': ['max', 'min', 'sum', 'mean', 'std'], 'RATE_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'], 'RATE_INTEREST_PRIMARY': ['max', 'min', 'sum', 'mean', 'std'], 'RATE_INTEREST_PRIVILEGED': ['max', 'min', 'sum', 'mean', 'std']}
After agg: (338857, 80)
name | auc | corr | coverage | |
---|---|---|---|---|
64 | DAYS_FIRST_DRAWING_TO_YEARS_std | 0.6163 | -0.0945 | 0.0005 |
63 | DAYS_FIRST_DRAWING_TO_YEARS_mean | 0.5944 | -0.0963 | 0.1830 |
60 | DAYS_FIRST_DRAWING_TO_YEARS_max | 0.5943 | -0.0961 | 0.1830 |
61 | DAYS_FIRST_DRAWING_TO_YEARS_min | 0.5942 | -0.0962 | 0.1830 |
44 | DAYS_DECISION_TO_YEARS_std | 0.5675 | -0.0605 | 0.8192 |
57 | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_sum | 0.5635 | -0.0516 | 1.0000 |
4 | AMT_DOWN_PAYMENT_std | 0.5608 | -0.0245 | 0.8192 |
19 | DAYS_FIRST_DUE_TO_YEARS_std | 0.5589 | -0.0497 | 0.7013 |
43 | DAYS_DECISION_TO_YEARS_mean | 0.5582 | -0.0472 | 1.0000 |
15 | DAYS_FIRST_DUE_TO_YEARS_max | 0.5576 | -0.0546 | 0.9937 |
40 | DAYS_DECISION_TO_YEARS_max | 0.5574 | -0.0545 | 1.0000 |
55 | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_max | 0.5561 | -0.0534 | 0.9863 |
32 | DAYS_LAST_DUE_TO_YEARS_sum | 0.5559 | -0.0441 | 1.0000 |
7 | DAYS_TERMINATION_TO_YEARS_sum | 0.5549 | -0.0434 | 1.0000 |
2 | AMT_DOWN_PAYMENT_sum | 0.5542 | -0.0275 | 1.0000 |
17 | DAYS_FIRST_DUE_TO_YEARS_sum | 0.5540 | -0.0426 | 1.0000 |
0 | AMT_DOWN_PAYMENT_max | 0.5537 | -0.0258 | 1.0000 |
3 | AMT_DOWN_PAYMENT_mean | 0.5534 | -0.0298 | 1.0000 |
30 | DAYS_LAST_DUE_TO_YEARS_max | 0.5527 | -0.0496 | 0.9244 |
79 | RATE_DOWN_PAYMENT_std | 0.5526 | -0.0432 | 0.8192 |
5 | DAYS_TERMINATION_TO_YEARS_max | 0.5521 | -0.0488 | 0.9172 |
58 | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_mean | 0.5509 | -0.0421 | 0.9863 |
18 | DAYS_FIRST_DUE_TO_YEARS_mean | 0.5494 | -0.0409 | 0.9937 |
73 | AMT_ANNUITY_mean | 0.5471 | -0.0421 | 1.0000 |
34 | DAYS_LAST_DUE_TO_YEARS_std | 0.5448 | -0.0385 | 0.5814 |
39 | CNT_PAYMENT_std | 0.5448 | 0.0422 | 0.8192 |
75 | RATE_DOWN_PAYMENT_max | 0.5442 | -0.0383 | 1.0000 |
9 | DAYS_TERMINATION_TO_YEARS_std | 0.5419 | -0.0359 | 0.5705 |
33 | DAYS_LAST_DUE_TO_YEARS_mean | 0.5419 | -0.0341 | 0.9244 |
8 | DAYS_TERMINATION_TO_YEARS_mean | 0.5414 | -0.0336 | 0.9172 |
78 | RATE_DOWN_PAYMENT_mean | 0.5358 | -0.0326 | 1.0000 |
71 | AMT_ANNUITY_min | 0.5297 | -0.0302 | 1.0000 |
42 | DAYS_DECISION_TO_YEARS_sum | 0.5296 | -0.0191 | 1.0000 |
28 | AMT_APPLICATION_mean | 0.5288 | -0.0231 | 1.0000 |
48 | AMT_GOODS_PRICE_mean | 0.5287 | -0.0230 | 1.0000 |
62 | DAYS_FIRST_DRAWING_TO_YEARS_sum | 0.5282 | 0.0001 | 1.0000 |
70 | AMT_ANNUITY_max | 0.5278 | -0.0291 | 1.0000 |
21 | AMT_CREDIT_min | 0.5275 | -0.0204 | 1.0000 |
41 | DAYS_DECISION_TO_YEARS_min | 0.5246 | -0.0172 | 1.0000 |
35 | CNT_PAYMENT_max | 0.5238 | 0.0290 | 1.0000 |
46 | AMT_GOODS_PRICE_min | 0.5221 | -0.0213 | 1.0000 |
26 | AMT_APPLICATION_min | 0.5221 | -0.0213 | 1.0000 |
23 | AMT_CREDIT_mean | 0.5212 | -0.0173 | 1.0000 |
72 | AMT_ANNUITY_sum | 0.5205 | -0.0075 | 1.0000 |
36 | CNT_PAYMENT_min | 0.5175 | -0.0119 | 1.0000 |
59 | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std | 0.5172 | -0.0111 | 0.6718 |
25 | AMT_APPLICATION_max | 0.5171 | -0.0138 | 1.0000 |
45 | AMT_GOODS_PRICE_max | 0.5171 | -0.0138 | 1.0000 |
74 | AMT_ANNUITY_std | 0.5162 | -0.0252 | 0.8192 |
77 | RATE_DOWN_PAYMENT_sum | 0.5157 | -0.0107 | 1.0000 |
67 | NFLAG_INSURED_ON_APPROVAL_sum | 0.5147 | -0.0173 | 1.0000 |
56 | DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min | 0.5140 | -0.0117 | 0.9863 |
53 | RATE_INTEREST_PRIVILEGED_mean | 0.5129 | 0.0011 | 1.0000 |
27 | AMT_APPLICATION_sum | 0.5119 | 0.0032 | 1.0000 |
47 | AMT_GOODS_PRICE_sum | 0.5119 | 0.0033 | 1.0000 |
20 | AMT_CREDIT_max | 0.5110 | -0.0094 | 1.0000 |
65 | NFLAG_INSURED_ON_APPROVAL_max | 0.5103 | -0.0114 | 0.9956 |
12 | RATE_INTEREST_PRIMARY_sum | 0.5103 | 0.0197 | 1.0000 |
52 | RATE_INTEREST_PRIVILEGED_sum | 0.5102 | 0.0198 | 1.0000 |
69 | NFLAG_INSURED_ON_APPROVAL_std | 0.5098 | 0.0026 | 0.7213 |
38 | CNT_PAYMENT_mean | 0.5091 | 0.0121 | 1.0000 |
37 | CNT_PAYMENT_sum | 0.5082 | 0.0267 | 1.0000 |
29 | AMT_APPLICATION_std | 0.5077 | -0.0149 | 0.8192 |
49 | AMT_GOODS_PRICE_std | 0.5077 | -0.0149 | 0.8192 |
22 | AMT_CREDIT_sum | 0.5071 | 0.0069 | 1.0000 |
16 | DAYS_FIRST_DUE_TO_YEARS_min | 0.5064 | -0.0050 | 0.9937 |
1 | AMT_DOWN_PAYMENT_min | 0.5058 | -0.0162 | 1.0000 |
68 | NFLAG_INSURED_ON_APPROVAL_mean | 0.5057 | -0.0021 | 0.9956 |
66 | NFLAG_INSURED_ON_APPROVAL_min | 0.5053 | 0.0112 | 0.9956 |
31 | DAYS_LAST_DUE_TO_YEARS_min | 0.5038 | -0.0014 | 0.9244 |
6 | DAYS_TERMINATION_TO_YEARS_min | 0.5032 | -0.0020 | 0.9172 |
76 | RATE_DOWN_PAYMENT_min | 0.5032 | -0.0103 | 1.0000 |
24 | AMT_CREDIT_std | 0.5017 | -0.0101 | 0.8192 |
13 | RATE_INTEREST_PRIMARY_mean | 0.5011 | 0.0008 | 1.0000 |
11 | RATE_INTEREST_PRIMARY_min | 0.5004 | 0.0016 | 1.0000 |
51 | RATE_INTEREST_PRIVILEGED_min | 0.5003 | 0.0022 | 1.0000 |
10 | RATE_INTEREST_PRIMARY_max | 0.5003 | -0.0005 | 1.0000 |
54 | RATE_INTEREST_PRIVILEGED_std | 0.5002 | -0.0017 | 0.8192 |
14 | RATE_INTEREST_PRIMARY_std | 0.5001 | -0.0010 | 0.8192 |
50 | RATE_INTEREST_PRIVILEGED_max | 0.5001 | 0.0004 | 1.0000 |
CPU times: user 33.3 s, sys: 1.39 s, total: 34.7 s Wall time: 10.7 s
eval_agg03.query("auc <= 0.501").shape
(6, 4)
sel_feat = eval_agg03.query("auc > 0.501")["name"].tolist()
pdf_agg03 = pdf_agg03[sel_feat]
print(pdf_agg03.shape)
(338857, 74)
# Có nhiều KH có 1-nhiều khoản vay trước đó
pdf_agg04 = pdf_prev_app.groupby("SK_ID_CURR").agg({"SK_ID_PREV": "count"})
display(pdf_agg04.describe())
pdf_agg04.hist()
plt.show()
SK_ID_PREV | |
---|---|
count | 338857.0000 |
mean | 4.9290 |
std | 4.2207 |
min | 1.0000 |
25% | 2.0000 |
50% | 4.0000 |
75% | 7.0000 |
max | 77.0000 |
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03).join(pdf_agg04)
print(pdf_feat.shape)
(338857, 244)
%%time
fname = "prev_app"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")
Store features completed! CPU times: user 34.3 s, sys: 749 ms, total: 35 s Wall time: 34.4 s