# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Directory that holds the input data, relative to the working directory.
dir_data = './data/'
# Build the path to the training CSV and load it into a DataFrame.
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
# Display (rows, columns) of the loaded table.
app_train.shape
(307511, 122)
# Label-encode categorical (object-dtype) columns that hold at most two distinct
# values, so they can be included when correlation coefficients are computed.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Inspect every column of the training table.
for col in app_train:
    if app_train[col].dtype == 'object':
        # Only columns with at most two distinct values are encoded
        # (Series.unique() reports NaN as a value of its own).
        if len(app_train[col].unique()) <= 2:
            # Replace the labels with integer codes in place.
            app_train[col] = le.fit_transform(app_train[col])

# Encoding overwrites existing columns, so the shape is expected to be unchanged.
print(app_train.shape)
app_train.head()
(307511, 122)
| | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | 0 | M | 0 | 1 | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | 0 | F | 0 | 0 | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | 1 | M | 1 | 1 | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | 0 | F | 0 | 1 | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | 0 | M | 0 | 1 | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
# Rows whose DAYS_EMPLOYED equals the anomalous value 365243 are flagged in a
# separate boolean column, then the anomalous value itself is turned into NaN.
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form warns on recent pandas and does
# not modify the DataFrame when Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})
# Take the absolute value of DAYS_BIRTH.
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
參考 Day 12 範例程式,離散化你覺得有興趣的欄位,並嘗試找出有趣的訊息
# Convert DAYS_BIRTH from days to years (using 365 days per year).
app_train['YEARS_BIRTH'] = app_train['DAYS_BIRTH']/365
# Preview the first rows of the new column.
app_train['YEARS_BIRTH'].head()
0 25.920548 1 45.931507 2 52.180822 3 52.068493 4 54.608219 Name: YEARS_BIRTH, dtype: float64
# Bucket age into fixed ten-year intervals with edges at 10, 20, ..., 100.
app_train['YEARS_GROUP'] = pd.cut(
    app_train['YEARS_BIRTH'],
    bins=list(range(10, 101, 10)),
)
# Preview the interval assigned to the first rows.
app_train['YEARS_GROUP'].head()
0 (20, 30] 1 (40, 50] 2 (50, 60] 3 (50, 60] 4 (50, 60] Name: YEARS_GROUP, dtype: category Categories (9, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]
# Count how many rows fall into each age interval.
app_train['YEARS_GROUP'].value_counts()
(30, 40] 82308 (40, 50] 76541 (50, 60] 68062 (20, 30] 45021 (60, 70] 35579 (90, 100] 0 (80, 90] 0 (70, 80] 0 (10, 20] 0 Name: YEARS_GROUP, dtype: int64
# Split age into 4 quantile-based bins with pd.qcut.
# NOTE: despite the "equal_width" column name, qcut yields equal-frequency bins
# (similar row counts per bin), not bins of equal width.
app_train['equal_width_qgroup'] = pd.qcut(app_train['YEARS_BIRTH'], q=4)
# Preview the bin assigned to the first rows.
app_train['equal_width_qgroup'].head()
0 (20.517, 34.008] 1 (43.151, 53.923] 2 (43.151, 53.923] 3 (43.151, 53.923] 4 (53.923, 69.121] Name: equal_width_qgroup, dtype: category Categories (4, interval[float64]): [(20.517, 34.008] < (34.008, 43.151] < (43.151, 53.923] < (53.923, 69.121]]
# Count how many rows fall into each bin produced by pd.qcut.
app_train['equal_width_qgroup'].value_counts()
(43.151, 53.923] 76887 (20.517, 34.008] 76884 (34.008, 43.151] 76877 (53.923, 69.121] 76863 Name: equal_width_qgroup, dtype: int64
# Bar plot with the age bin on the x axis and TARGET on the y axis.
import seaborn as sns
# Suppress warning messages (applies to everything run after this point).
import warnings
warnings.filterwarnings('ignore')

plt.figure(figsize=(8, 6))
px = app_train['equal_width_qgroup']
py = app_train['TARGET']
# Pass x/y by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=px, y=py)

# Plot labeling
plt.xticks(rotation=75)
plt.xlabel('Age Group (years)')
plt.ylabel('TARGET')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.title('Age Interval - TARGET');
# Discretize AMT_ANNUITY into 10 quantile-based bins, then plot TARGET per bin.
app_train['AMT_ANNUITY_DISCRET'] = pd.qcut(app_train['AMT_ANNUITY'], 10)

plt.figure(figsize=(8, 6))
px = app_train['AMT_ANNUITY_DISCRET']
py = app_train['TARGET']
# Pass x/y by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=px, y=py)

# Plot labeling
plt.xticks(rotation=75)
plt.xlabel('AMT_ANNUITY')
plt.ylabel('TARGET')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.title('AMT_ANNUITY - TARGET');