# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 設定 data_path
dir_data = './data/'
# 讀取 csv 資料檔, 並觀察前幾筆資料 ( .head() )
f_app = os.path.join(dir_data, 'application_train.csv')
print('Path of read in data: %s' % (f_app))
app_train = pd.read_csv(f_app)
app_train.head()
Path of read in data: ./data/application_train.csv
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
# 觀察聘雇天數 'DAYS_EMPLOYED' 內的數值
app_train['DAYS_EMPLOYED']
0 -637 1 -1188 2 -225 3 -3039 4 -3038 5 -1588 6 -3130 7 -449 8 365243 9 -2019 10 -679 11 365243 12 -2717 13 -3028 14 -203 15 -1157 16 -1317 17 -191 18 -7804 19 -2038 20 -4286 21 -1652 22 -4306 23 365243 24 -746 25 -3494 26 -2628 27 -1234 28 -1796 29 -1010 ... 307481 -3147 307482 -226 307483 365243 307484 -328 307485 -670 307486 -1185 307487 365243 307488 -1218 307489 -286 307490 -1928 307491 -1953 307492 -1618 307493 -2306 307494 -6573 307495 -7438 307496 -2178 307497 -1222 307498 -3689 307499 -8694 307500 -5326 307501 -1046 307502 -8736 307503 -399 307504 -7258 307505 365243 307506 -236 307507 365243 307508 -7921 307509 -4786 307510 -1262 Name: DAYS_EMPLOYED, Length: 307511, dtype: int64
# 由於其他天數都是負值, 且聘僱日數不太可能是 365243 (大約 1000年), 算是異常數字
# 因此我們推斷這份資料中, DAYS_EMPLOYED 的欄位如果是 365243, 應該是對應到空缺值, 繪圖時應該予以忽略
sub_df = app_train[app_train['DAYS_EMPLOYED'] != 365243]
# 如果直接畫散布圖 - 看不出任何趨勢或形態
plt.plot(sub_df['DAYS_EMPLOYED'] / (-365), sub_df['AMT_INCOME_TOTAL'], '.')
plt.xlabel('Days of employed (year)')
plt.ylabel('AMT_INCOME_TOTAL (raw)')
plt.show()
corr = np.corrcoef(sub_df['DAYS_EMPLOYED'] / (-365), sub_df['AMT_INCOME_TOTAL'])
print("Correlation: %.4f" % (corr[0][1]))
Correlation: 0.0130
# 通常可以對數值範圍較大的取 log: 發現雖然沒有相關,但是受雇越久的人,AMT_INCOME_TOTAL 的 variance 越小
plt.plot(sub_df['DAYS_EMPLOYED'] / (-365), np.log10(sub_df['AMT_INCOME_TOTAL'] ), '.')
plt.xlabel('Days of employed (year)')
plt.ylabel('AMT_INCOME_TOTAL (log-scale)')
plt.show()
corr = np.corrcoef(sub_df['DAYS_EMPLOYED'] / (-365), np.log10(sub_df['AMT_INCOME_TOTAL']))
print("Correlation: %.4f" % (corr[0][1]))
Correlation: 0.0380
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 設定 data_path
dir_data = './data/'
# 讀取資料檔
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
app_train.shape
(307511, 122)
# 將只有兩種值的類別型欄位, 做 Label Encoder, 計算相關係數時讓這些欄位可以被包含在內
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# 檢查每一個 column
for col in app_train:
if app_train[col].dtype == 'object':
# 如果只有兩種值的類別型欄位
if len(list(app_train[col].unique())) <= 2:
# 就做 Label Encoder, 以加入相關係數檢查
app_train[col] = le.fit_transform(app_train[col])
print(app_train.shape)
app_train.head()
(307511, 122)
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | 0 | M | 0 | 1 | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | 0 | F | 0 | 0 | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | 1 | M | 1 | 1 | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | 0 | F | 0 | 1 | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | 0 | M | 0 | 1 | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# 出生日數 (DAYS_BIRTH) 取絕對值
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
一樣,pandas 很貼心地讓我們可以非常容易計算相關係數
# 觀察相關係數
app_train.corr()['TARGET']
SK_ID_CURR -0.002108 TARGET 1.000000 NAME_CONTRACT_TYPE -0.030896 FLAG_OWN_CAR -0.021851 FLAG_OWN_REALTY -0.006148 CNT_CHILDREN 0.019187 AMT_INCOME_TOTAL -0.003982 AMT_CREDIT -0.030369 AMT_ANNUITY -0.012817 AMT_GOODS_PRICE -0.039645 REGION_POPULATION_RELATIVE -0.037227 DAYS_BIRTH -0.078239 DAYS_EMPLOYED 0.074958 DAYS_REGISTRATION 0.041975 DAYS_ID_PUBLISH 0.051457 OWN_CAR_AGE 0.037612 FLAG_MOBIL 0.000534 FLAG_EMP_PHONE 0.045982 FLAG_WORK_PHONE 0.028524 FLAG_CONT_MOBILE 0.000370 FLAG_PHONE -0.023806 FLAG_EMAIL -0.001758 CNT_FAM_MEMBERS 0.009308 REGION_RATING_CLIENT 0.058899 REGION_RATING_CLIENT_W_CITY 0.060893 HOUR_APPR_PROCESS_START -0.024166 REG_REGION_NOT_LIVE_REGION 0.005576 REG_REGION_NOT_WORK_REGION 0.006942 LIVE_REGION_NOT_WORK_REGION 0.002819 REG_CITY_NOT_LIVE_CITY 0.044395 ... OBS_60_CNT_SOCIAL_CIRCLE 0.009022 DEF_60_CNT_SOCIAL_CIRCLE 0.031276 DAYS_LAST_PHONE_CHANGE 0.055218 FLAG_DOCUMENT_2 0.005417 FLAG_DOCUMENT_3 0.044346 FLAG_DOCUMENT_4 -0.002672 FLAG_DOCUMENT_5 -0.000316 FLAG_DOCUMENT_6 -0.028602 FLAG_DOCUMENT_7 -0.001520 FLAG_DOCUMENT_8 -0.008040 FLAG_DOCUMENT_9 -0.004352 FLAG_DOCUMENT_10 -0.001414 FLAG_DOCUMENT_11 -0.004229 FLAG_DOCUMENT_12 -0.000756 FLAG_DOCUMENT_13 -0.011583 FLAG_DOCUMENT_14 -0.009464 FLAG_DOCUMENT_15 -0.006536 FLAG_DOCUMENT_16 -0.011615 FLAG_DOCUMENT_17 -0.003378 FLAG_DOCUMENT_18 -0.007952 FLAG_DOCUMENT_19 -0.001358 FLAG_DOCUMENT_20 0.000215 FLAG_DOCUMENT_21 0.003709 AMT_REQ_CREDIT_BUREAU_HOUR 0.000930 AMT_REQ_CREDIT_BUREAU_DAY 0.002704 AMT_REQ_CREDIT_BUREAU_WEEK 0.000788 AMT_REQ_CREDIT_BUREAU_MON -0.012462 AMT_REQ_CREDIT_BUREAU_QRT -0.002022 AMT_REQ_CREDIT_BUREAU_YEAR 0.019930 DAYS_EMPLOYED_ANOM -0.045987 Name: TARGET, Length: 110, dtype: float64
列出目標 (TARGET) 與所有欄位之間相關係數,數值最大以及最小各 15 個
通過相關係數的結果觀察有興趣的欄位與 TARGET 或其他欄位的相關係數,並嘗試找出有趣的訊息
hw_solution_1 = app_train.corr()['TARGET'].sort_values()
hw_solution_1.head(15)
EXT_SOURCE_3 -0.178919 EXT_SOURCE_2 -0.160472 EXT_SOURCE_1 -0.155317 DAYS_BIRTH -0.078239 DAYS_EMPLOYED_ANOM -0.045987 FLOORSMAX_AVG -0.044003 FLOORSMAX_MEDI -0.043768 FLOORSMAX_MODE -0.043226 AMT_GOODS_PRICE -0.039645 REGION_POPULATION_RELATIVE -0.037227 ELEVATORS_AVG -0.034199 ELEVATORS_MEDI -0.033863 FLOORSMIN_AVG -0.033614 FLOORSMIN_MEDI -0.033394 LIVINGAREA_AVG -0.032997 Name: TARGET, dtype: float64
hw_solution_1.tail(15)
DEF_60_CNT_SOCIAL_CIRCLE 0.031276 DEF_30_CNT_SOCIAL_CIRCLE 0.032248 LIVE_CITY_NOT_WORK_CITY 0.032518 OWN_CAR_AGE 0.037612 DAYS_REGISTRATION 0.041975 FLAG_DOCUMENT_3 0.044346 REG_CITY_NOT_LIVE_CITY 0.044395 FLAG_EMP_PHONE 0.045982 REG_CITY_NOT_WORK_CITY 0.050994 DAYS_ID_PUBLISH 0.051457 DAYS_LAST_PHONE_CHANGE 0.055218 REGION_RATING_CLIENT 0.058899 REGION_RATING_CLIENT_W_CITY 0.060893 DAYS_EMPLOYED 0.074958 TARGET 1.000000 Name: TARGET, dtype: float64
plt.plot(app_train['EXT_SOURCE_3'] , app_train['TARGET'], '.')
plt.xlabel('EXT_SOURCE_3')
plt.ylabel('TARGET')
plt.show()
app_train.boxplot('EXT_SOURCE_3',by='TARGET')
plt.title('EXT_SOURCE_3 corr')
plt.show()
app_train.boxplot('DAYS_EMPLOYED',by='TARGET')
<matplotlib.axes._subplots.AxesSubplot at 0x2bf9cc18eb8>
plot_data = app_train
plot_data['YEAR_EMPLOYED'] = plot_data['DAYS_EMPLOYED']/365
plot_data.boxplot(column='YEAR_EMPLOYED', by = 'TARGET')
<matplotlib.axes._subplots.AxesSubplot at 0x2bf3123a630>
plot_data.boxplot(column=['REGION_RATING_CLIENT_W_CITY'],by = 'TARGET')
<matplotlib.axes._subplots.AxesSubplot at 0x2bf31437cf8>