# Import 需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 設定 data_path
dir_data = './data/'
f_app = os.path.join(dir_data, 'application_train.csv')
print('Path of read in data: %s' % (f_app))
app_train = pd.read_csv(f_app)
app_train.head()
Path of read in data: ./data/application_train.csv
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
app_train['AMT_REQ_CREDIT_BUREAU_HOUR'].describe()
count 265992.000000 mean 0.006402 std 0.083849 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 4.000000 Name: AMT_REQ_CREDIT_BUREAU_HOUR, dtype: float64
# 如果欄位中有 NA, describe 會有問題
app_train['AMT_ANNUITY'].describe()
count 307499.000000 mean 27108.573909 std 14493.737315 min 1615.500000 25% 16524.000000 50% 24903.000000 75% 34596.000000 max 258025.500000 Name: AMT_ANNUITY, dtype: float64
# Ignore NA, 計算五值
five_num = [0, 25, 50, 75, 100]
quantile_5s = [np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = i) for i in five_num]
print(quantile_5s)
[1615.5, 16524.0, 24903.0, 34596.0, 258025.5]
#'~' 波浪符號為取反的意思
app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'].hist(bins = 100)
plt.show()
#等價上一行代碼
app_train[app_train['AMT_ANNUITY'].notnull()]['AMT_ANNUITY'].hist(bins = 100)
plt.show()
# 試著將 max 取代為 q99
app_train[app_train['AMT_ANNUITY'] == app_train['AMT_ANNUITY'].max()] = np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = 99)
np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = 99)
70006.5
five_num = [0, 25, 50, 75, 100]
quantile_5s = [np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = i) for i in five_num]
print(quantile_5s)
[1615.5, 16524.0, 24903.0, 34596.0, 230161.5]
可以看到原本的
max 258025.5
被取代後變為
max 230161.5
# 得到 median 的另外一種方法
np.median(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'])
24903.0
# 計算眾數 (mode)
from scipy.stats import mode
import time
start_time = time.time()
mode_get = mode(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'])
print(mode_get)
print("Elapsed time: %.3f secs" % (time.time() - start_time))
ModeResult(mode=array([9000.]), count=array([6385])) Elapsed time: 5.396 secs
# 計算眾數 (mode)
# 較快速的方式
from collections import defaultdict
start_time = time.time()
mode_dict = defaultdict(lambda:0)
for value in app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY']:
mode_dict[value] += 1
mode_get = sorted(mode_dict.items(), key=lambda kv: kv[1], reverse=True)
print(mode_get[0])
print("Elapsed time: %.3f secs" % (time.time() - start_time))
(9000.0, 6385) Elapsed time: 0.300 secs
# 以 AMT_CREDIT 為例
app_train['AMT_CREDIT'].hist(bins = 50)
plt.title("Original")
plt.show()
value = app_train['AMT_CREDIT'].values
app_train['AMT_CREDIT_Norm1'] = ( value - np.mean(value) ) / ( np.std(value) )
app_train['AMT_CREDIT_Norm1'].hist(bins = 50)
plt.title("Normalized with Z-transform")
plt.show()
app_train['AMT_CREDIT_Norm2'] = ( value - min(value) ) / ( max(value) - min(value) )
app_train['AMT_CREDIT_Norm2'].hist(bins = 50)
plt.title("Normalized to 0 ~ 1")
plt.show()
# Import 需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 設定 data_path
dir_data = './data/'
f_app = os.path.join(dir_data, 'application_train.csv')
print('Path of read in data: %s' % (f_app))
app_train = pd.read_csv(f_app)
app_train.head()
Path of read in data: ./data/application_train.csv
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
"""
YOUR CODE HERE
"""
q_all = list(range(101))
quantile_q_all = [np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = i) for i in q_all]
print(quantile_q_all)
# 1: 計算 AMT_ANNUITY 的 q0 - q100
q_all = pd.DataFrame({'q': list(range(101)),
'value': quantile_q_all})
print(q_all)
[1615.5, 6182.910000000001, 6750.0, 7875.0, 8703.0, 9000.0, 9000.0, 9553.5, 10125.0, 10503.0, 11074.5, 11430.0, 11970.0, 12375.0, 12838.5, 13302.0, 13500.0, 13500.0, 13896.0, 14350.5, 14701.5, 15124.5, 15583.5, 15970.5, 16209.0, 16524.0, 16852.5, 17109.0, 17487.0, 17806.5, 18189.0, 18643.5, 19102.5, 19417.5, 19836.0, 20151.0, 20421.0, 20853.0, 21186.0, 21609.0, 21865.5, 22018.5, 22342.5, 22527.0, 22972.5, 23346.0, 23719.5, 23931.0, 24259.5, 24583.589999999953, 24903.0, 25240.5, 25537.5, 25960.5, 26217.0, 26316.0, 26640.0, 26860.5, 27189.0, 27558.0, 28062.0, 28480.5, 28917.0, 29340.0, 29830.5, 30078.0, 30483.0, 30897.0, 31275.0, 31630.5, 32004.0, 32458.5, 32895.0, 33376.5, 33984.0, 34596.0, 35345.16000000005, 35806.5, 36328.5, 36747.0, 37516.5, 37948.5, 38556.0, 39456.0, 40135.5, 40806.0, 41845.5, 42790.5, 43735.5, 44991.0, 45954.0, 47254.5, 48465.0, 49878.0, 51745.5, 53325.0, 55624.5, 58482.0, 62964.0, 70006.5, 258025.5] q value 0 0 1615.50 1 1 6182.91 2 2 6750.00 3 3 7875.00 4 4 8703.00 5 5 9000.00 6 6 9000.00 7 7 9553.50 8 8 10125.00 9 9 10503.00 10 10 11074.50 11 11 11430.00 12 12 11970.00 13 13 12375.00 14 14 12838.50 15 15 13302.00 16 16 13500.00 17 17 13500.00 18 18 13896.00 19 19 14350.50 20 20 14701.50 21 21 15124.50 22 22 15583.50 23 23 15970.50 24 24 16209.00 25 25 16524.00 26 26 16852.50 27 27 17109.00 28 28 17487.00 29 29 17806.50 .. ... ... 71 71 32458.50 72 72 32895.00 73 73 33376.50 74 74 33984.00 75 75 34596.00 76 76 35345.16 77 77 35806.50 78 78 36328.50 79 79 36747.00 80 80 37516.50 81 81 37948.50 82 82 38556.00 83 83 39456.00 84 84 40135.50 85 85 40806.00 86 86 41845.50 87 87 42790.50 88 88 43735.50 89 89 44991.00 90 90 45954.00 91 91 47254.50 92 92 48465.00 93 93 49878.00 94 94 51745.50 95 95 53325.00 96 96 55624.50 97 97 58482.00 98 98 62964.00 99 99 70006.50 100 100 258025.50 [101 rows x 2 columns]
q_all
q | value | |
---|---|---|
0 | 0 | 1615.50 |
1 | 1 | 6182.91 |
2 | 2 | 6750.00 |
3 | 3 | 7875.00 |
4 | 4 | 8703.00 |
5 | 5 | 9000.00 |
6 | 6 | 9000.00 |
7 | 7 | 9553.50 |
8 | 8 | 10125.00 |
9 | 9 | 10503.00 |
10 | 10 | 11074.50 |
11 | 11 | 11430.00 |
12 | 12 | 11970.00 |
13 | 13 | 12375.00 |
14 | 14 | 12838.50 |
15 | 15 | 13302.00 |
16 | 16 | 13500.00 |
17 | 17 | 13500.00 |
18 | 18 | 13896.00 |
19 | 19 | 14350.50 |
20 | 20 | 14701.50 |
21 | 21 | 15124.50 |
22 | 22 | 15583.50 |
23 | 23 | 15970.50 |
24 | 24 | 16209.00 |
25 | 25 | 16524.00 |
26 | 26 | 16852.50 |
27 | 27 | 17109.00 |
28 | 28 | 17487.00 |
29 | 29 | 17806.50 |
... | ... | ... |
71 | 71 | 32458.50 |
72 | 72 | 32895.00 |
73 | 73 | 33376.50 |
74 | 74 | 33984.00 |
75 | 75 | 34596.00 |
76 | 76 | 35345.16 |
77 | 77 | 35806.50 |
78 | 78 | 36328.50 |
79 | 79 | 36747.00 |
80 | 80 | 37516.50 |
81 | 81 | 37948.50 |
82 | 82 | 38556.00 |
83 | 83 | 39456.00 |
84 | 84 | 40135.50 |
85 | 85 | 40806.00 |
86 | 86 | 41845.50 |
87 | 87 | 42790.50 |
88 | 88 | 43735.50 |
89 | 89 | 44991.00 |
90 | 90 | 45954.00 |
91 | 91 | 47254.50 |
92 | 92 | 48465.00 |
93 | 93 | 49878.00 |
94 | 94 | 51745.50 |
95 | 95 | 53325.00 |
96 | 96 | 55624.50 |
97 | 97 | 58482.00 |
98 | 98 | 62964.00 |
99 | 99 | 70006.50 |
100 | 100 | 258025.50 |
101 rows × 2 columns
也可以使用 DataFrame.quantile 的方法,得到的答案是一樣的。
q_all = app_train['AMT_ANNUITY'].quantile([i/100 for i in range(101)])
q_all = pd.DataFrame({'q': list(range(101)),
'value': quantile_q_all})
q_all
q | value | |
---|---|---|
0 | 0 | 1615.50 |
1 | 1 | 6182.91 |
2 | 2 | 6750.00 |
3 | 3 | 7875.00 |
4 | 4 | 8703.00 |
5 | 5 | 9000.00 |
6 | 6 | 9000.00 |
7 | 7 | 9553.50 |
8 | 8 | 10125.00 |
9 | 9 | 10503.00 |
10 | 10 | 11074.50 |
11 | 11 | 11430.00 |
12 | 12 | 11970.00 |
13 | 13 | 12375.00 |
14 | 14 | 12838.50 |
15 | 15 | 13302.00 |
16 | 16 | 13500.00 |
17 | 17 | 13500.00 |
18 | 18 | 13896.00 |
19 | 19 | 14350.50 |
20 | 20 | 14701.50 |
21 | 21 | 15124.50 |
22 | 22 | 15583.50 |
23 | 23 | 15970.50 |
24 | 24 | 16209.00 |
25 | 25 | 16524.00 |
26 | 26 | 16852.50 |
27 | 27 | 17109.00 |
28 | 28 | 17487.00 |
29 | 29 | 17806.50 |
... | ... | ... |
71 | 71 | 32458.50 |
72 | 72 | 32895.00 |
73 | 73 | 33376.50 |
74 | 74 | 33984.00 |
75 | 75 | 34596.00 |
76 | 76 | 35345.16 |
77 | 77 | 35806.50 |
78 | 78 | 36328.50 |
79 | 79 | 36747.00 |
80 | 80 | 37516.50 |
81 | 81 | 37948.50 |
82 | 82 | 38556.00 |
83 | 83 | 39456.00 |
84 | 84 | 40135.50 |
85 | 85 | 40806.00 |
86 | 86 | 41845.50 |
87 | 87 | 42790.50 |
88 | 88 | 43735.50 |
89 | 89 | 44991.00 |
90 | 90 | 45954.00 |
91 | 91 | 47254.50 |
92 | 92 | 48465.00 |
93 | 93 | 49878.00 |
94 | 94 | 51745.50 |
95 | 95 | 53325.00 |
96 | 96 | 55624.50 |
97 | 97 | 58482.00 |
98 | 98 | 62964.00 |
99 | 99 | 70006.50 |
100 | 100 | 258025.50 |
101 rows × 2 columns
# 2.1 將 NAs 以 q50 填補
print("Before replace NAs, numbers of row that AMT_ANNUITY is NAs: %i" % sum(app_train['AMT_ANNUITY'].isnull()))
"""
Your Code Here
"""
q_50 = app_train['AMT_ANNUITY'].quantile(50/100)
app_train.loc[app_train['AMT_ANNUITY'].isnull(),'AMT_ANNUITY'] = q_50
print("After replace NAs, numbers of row that AMT_ANNUITY is NAs: %i" % sum(app_train['AMT_ANNUITY'].isnull()))
Before replace NAs, numbers of row that AMT_ANNUITY is NAs: 12 After replace NAs, numbers of row that AMT_ANNUITY is NAs: 0
將NAN填補過後, isnull() 回報總共的 NAN 數量變 0
$ y = 2*(\frac{x - min(x)}{max(x) - min(x)} - 0.5) $
# 2.2 Normalize values to -1 to 1
print("== Original data range ==")
print(app_train['AMT_ANNUITY'].describe())
def normalize_value(x):
min_x = x.min()
max_x = x.max()
y = 2*((x-min_x)/(max_x-min_x) - 0.5)
return y
app_train['AMT_ANNUITY_NORMALIZED'] = normalize_value(app_train['AMT_ANNUITY'])
print("== Normalized data range ==")
app_train['AMT_ANNUITY_NORMALIZED'].describe()
== Original data range == count 307511.000000 mean 27108.487841 std 14493.461065 min 1615.500000 25% 16524.000000 50% 24903.000000 75% 34596.000000 max 258025.500000 Name: AMT_ANNUITY, dtype: float64 == Normalized data range ==
count 307511.000000 mean -0.801154 std 0.113049 min -1.000000 25% -0.883714 50% -0.818357 75% -0.742752 max 1.000000 Name: AMT_ANNUITY_NORMALIZED, dtype: float64
也可以用 sklearn 的 MinMaxScaler function 直接套用
from sklearn.preprocessing import MinMaxScaler
def normalize_value(x):
mms = MinMaxScaler((-1,1))
x = mms.fit_transform(x.to_frame())
return x
app_train['AMT_ANNUITY_NORMALIZED'] = normalize_value(app_train['AMT_ANNUITY'])
print("== Normalized data range ==")
app_train['AMT_ANNUITY_NORMALIZED'].describe()
== Normalized data range ==
count 307511.000000 mean -0.801154 std 0.113049 min -1.000000 25% -0.883714 50% -0.818357 75% -0.742752 max 1.000000 Name: AMT_ANNUITY_NORMALIZED, dtype: float64
# 3
print("Before replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i" % sum(app_train['AMT_GOODS_PRICE'].isnull()))
# 列出重複最多的數值
"""
Your Code Here
"""
from collections import defaultdict
mode_dict = defaultdict(lambda:0)
for value in app_train[~app_train['AMT_GOODS_PRICE'].isnull()]['AMT_GOODS_PRICE']:
mode_dict[value] += 1
mode_get = sorted(mode_dict.items(), key=lambda kv: kv[1], reverse=True)
value_most = mode_get[0]
print(value_most)
mode_goods_price = list(app_train['AMT_GOODS_PRICE'].value_counts().index)
app_train.loc[app_train['AMT_GOODS_PRICE'].isnull(), 'AMT_GOODS_PRICE'] = mode_goods_price[0]
print("After replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i" % sum(app_train['AMT_GOODS_PRICE'].isnull()))
Before replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: 278 (450000.0, 26022) After replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: 0