可參考 seaborn API 中的 heatmap 用法
cmap 顏色挑選,可參考 plt colormap
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # 另一個繪圖-樣式套件
plt.style.use('ggplot')
# 忽略警告訊息
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Directory that holds the input CSV files.
dir_data = './data/'

# Build the path to the training table and report it before loading.
f_app = os.path.join(dir_data, 'application_train.csv')
print('Path of read in data: %s' % f_app)

# Load the training table and preview its first rows.
app_train = pd.read_csv(f_app)
app_train.head()
Path of read in data: ./data/application_train.csv
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
# Pull out the EXT_SOURCE variables (plus TARGET and age) and inspect how
# they correlate with each other.

# DAYS_BIRTH is replaced in place by its absolute value.
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

ext_data = app_train[[
    'TARGET',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
]]

# Pairwise correlation matrix of the selected columns.
ext_data_corrs = ext_data.corr()
ext_data_corrs
TARGET | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | DAYS_BIRTH | |
---|---|---|---|---|---|
TARGET | 1.000000 | -0.155317 | -0.160472 | -0.178919 | -0.078239 |
EXT_SOURCE_1 | -0.155317 | 1.000000 | 0.213982 | 0.186846 | 0.600610 |
EXT_SOURCE_2 | -0.160472 | 0.213982 | 1.000000 | 0.109167 | 0.091996 |
EXT_SOURCE_3 | -0.178919 | 0.186846 | 0.109167 | 1.000000 | 0.205478 |
DAYS_BIRTH | -0.078239 | 0.600610 | 0.091996 | 0.205478 | 1.000000 |
# Draw the correlation coefficients as an annotated heatmap on an
# 8 x 6 inch figure; the colour scale is clipped to [-0.25, 0.6].
plt.figure(figsize=(8, 6))
sns.heatmap(
    ext_data_corrs,
    cmap=plt.cm.RdYlBu_r,
    vmin=-0.25,
    vmax=0.6,
    annot=True,
)
plt.title('Correlation Heatmap');
# Next, check whether each of the three EXT_SOURCE variables is distributed
# differently for the two TARGET values.
plt.figure(figsize=(24, 8))

# One KDE panel per EXT_SOURCE column, laid out side by side.
for i, source in enumerate(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']):
    plt.subplot(1, 3, i + 1)

    # Overlay one density curve per TARGET value; missing scores are dropped
    # before estimating the density.
    for target_value in (0, 1):
        sns.kdeplot(
            app_train.loc[app_train['TARGET'] == target_value, source].dropna(),
            label='target == %d' % target_value,
        )

    # Panel labels.
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')
    # Passing `label=` alone does not make recent seaborn releases draw a
    # legend, so request it explicitly; otherwise the curves are unlabelled.
    plt.legend()

plt.tight_layout(h_pad=2.5)
# Express age in years instead of days: add YEARS_BIRTH (DAYS_BIRTH / 365)
# to a fresh frame built from ext_data, then discard the day-based column.
plot_data = (
    ext_data
    .assign(YEARS_BIRTH=ext_data['DAYS_BIRTH'] / 365)
    .drop(['DAYS_BIRTH'], axis=1)
)
# Helper: compute the correlation coefficient between two columns and write
# it onto the current axes.
def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation of ``x`` and ``y``.

    The coefficient is taken from ``np.corrcoef(x, y)`` and rendered as
    ``r = <value>`` with two decimals, placed at (0.2, 0.8) in axes-relative
    coordinates. Extra keyword arguments are accepted and ignored.
    """
    coefficient = np.corrcoef(x, y)[0, 1]
    text = "r = {:.2f}".format(coefficient)
    axes = plt.gca()
    axes.annotate(text, xy=(.2, .8), xycoords=axes.transAxes, size=20)
N_sample = 100000
# Drop rows containing NaN and cap the data at N_sample rows: scatter plots
# with too many points take a very long time to draw. The sample size is
# clamped to the number of remaining rows so sample() cannot raise when fewer
# than N_sample rows survive dropna().
plot_data = plot_data.dropna()
plot_data = plot_data.sample(n=min(N_sample, len(plot_data)))

# Build the PairGrid over every column except TARGET, coloured by TARGET.
# `height` is the current name of the former `size` argument, which recent
# seaborn releases no longer accept.
grid = sns.PairGrid(
    data=plot_data,
    height=3,
    diag_sharey=False,
    hue='TARGET',
    vars=[x for x in list(plot_data.columns) if x != 'TARGET'],
)

# Upper triangle: scatter plots.
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal: kernel density estimate of each variable.
grid.map_diag(sns.kdeplot)

# Lower triangle: two-dimensional density plots.
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

plt.suptitle('Ext Source and Age Features Pairs Plot', size=32, y=1.05)
plt.show()
# Try a different sample size and observe what changes in the plots.
N_sample = 1000
# The sample size is clamped to the number of available rows so sample()
# cannot raise when fewer than N_sample rows remain after dropna().
plot_data = plot_data.dropna()
plot_data = plot_data.sample(n=min(N_sample, len(plot_data)))

# Build the PairGrid over every column except TARGET, coloured by TARGET.
# `height` is the current name of the former `size` argument, which recent
# seaborn releases no longer accept.
grid = sns.PairGrid(
    data=plot_data,
    height=3,
    diag_sharey=False,
    hue='TARGET',
    vars=[x for x in list(plot_data.columns) if x != 'TARGET'],
)

# Upper triangle: scatter plots.
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal: kernel density estimate of each variable.
grid.map_diag(sns.kdeplot)

# Lower triangle: two-dimensional density plots.
grid.map_lower(sns.kdeplot, cmap=plt.cm.YlOrBr_r)

plt.suptitle('Ext Source and Age Features Pairs Plot', size=32, y=1.05)
plt.show()
(In[2], OUT[2]) (Hint : numpy.random.random - 均勻分布, 隨機小數)
(In[3], OUT[3], In[4], OUT[4]) (Hint : numpy.random.randn - 常態分布)
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # 另一個繪圖-樣式套件
plt.style.use('ggplot')
# 忽略警告訊息
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# 10 x 10 matrix of uniform random values rescaled from [0, 1) to [-1, 1).
matrix = np.random.random(size=(10, 10)) * 2 - 1
matrix
array([[ 0.96558198, -0.74789228, 0.51347227, -0.52267548, 0.79261341, -0.03386911, -0.92172823, 0.56004055, 0.02321272, 0.85097114], [-0.42054703, 0.41968282, -0.01629041, 0.38473676, 0.3300876 , -0.67057969, 0.23583482, 0.12916991, -0.19375199, 0.33740402], [-0.76423696, -0.78150054, 0.73152889, -0.02785531, 0.01999526, -0.04457691, -0.56986854, -0.03713176, -0.05241264, 0.00116185], [-0.42771526, -0.94835822, -0.99988094, 0.23057932, 0.53748371, -0.11146398, -0.6488914 , -0.21080865, -0.40374161, -0.40224542], [-0.31180053, 0.73145318, 0.58520933, 0.37264574, -0.98587723, 0.19292301, -0.97860174, -0.49768922, 0.6020035 , -0.43381614], [ 0.9158303 , 0.27367692, 0.11060534, 0.40909063, 0.83522406, -0.61584538, 0.93583645, 0.29339454, -0.89801626, -0.36519539], [-0.28486087, 0.72722824, -0.05022807, -0.7040498 , -0.65840177, 0.13302314, -0.23136832, 0.99977706, -0.13638526, 0.75056588], [ 0.48270261, 0.42581859, 0.83653105, 0.92070024, -0.94029696, -0.25380717, 0.9014008 , 0.66125291, 0.02230881, -0.53712624], [ 0.1508261 , 0.162897 , 0.68009475, -0.61919859, 0.46588934, 0.40049566, -0.8648606 , -0.92885367, 0.34314778, -0.31351073], [ 0.08402986, 0.83584258, -0.78792361, 0.61471166, -0.43391429, -0.87342665, -0.43132813, -0.75656129, 0.47549059, 0.76287662]])
"""
Your Code Here
"""
# Render the random matrix as an annotated heatmap on a 10 x 10 inch figure,
# with the colour scale fixed to [-1, 1].
plt.figure(figsize=(10, 10))
sns.heatmap(
    matrix,
    cmap=plt.cm.coolwarm,
    vmin=-1,
    vmax=1,
    annot=True,
)
plt.title('Correlation Heatmap');
plt.show()
nrow = 1000
ncol = 3

# Uniform random values rescaled from [0, 1) to [-1, 1).
matrix = 2 * np.random.random(size=(nrow, ncol)) - 1

# Give every row a random label drawn from 0, 1, 2. The labels are used as
# the frame's index and reset_index() then exposes them as the 'index' column.
indice = np.random.choice([0, 1, 2], size=nrow)
plot_data = pd.DataFrame(matrix, indice).reset_index()

# Draw a seaborn PairGrid over the value columns, coloured by label.
# `height` is the current name of the former `size` argument, which recent
# seaborn releases no longer accept.
grid = sns.PairGrid(
    data=plot_data,
    height=3,
    diag_sharey=False,
    hue='index',
    vars=[x for x in list(plot_data.columns) if x != 'index'],
)

# Upper triangle: scatter; diagonal: 1-D KDE; lower triangle: 2-D KDE.
grid.map_upper(plt.scatter, alpha=0.2)
grid.map_diag(sns.kdeplot)
grid.map_lower(sns.kdeplot, cmap=plt.cm.coolwarm)
grid.add_legend()
plt.show()

plot_data.head()
index | 0 | 1 | 2 | |
---|---|---|---|---|
0 | 0 | 0.179298 | -0.449473 | 0.352405 |
1 | 0 | -0.704498 | 0.477248 | 0.208285 |
2 | 0 | -0.393633 | 0.284053 | -0.497673 |
3 | 2 | 0.453905 | -0.406853 | 0.136736 |
4 | 1 | 0.461955 | 0.691811 | 0.037417 |
nrow = 1000
ncol = 3

# Samples from np.random.randn (normal distribution), nrow rows by ncol columns.
matrix = np.random.randn(nrow, ncol)
matrix

# Give every row a random label drawn from 0, 1, 2. The labels are used as
# the frame's index and reset_index() then exposes them as the 'index' column.
indice = np.random.choice([0, 1, 2], size=nrow)
plot_data = pd.DataFrame(data=matrix, index=indice).reset_index()
plot_data.head()
index | 0 | 1 | 2 | |
---|---|---|---|---|
0 | 2 | -1.606178 | -0.405476 | -0.505091 |
1 | 2 | 0.032358 | 2.158690 | 0.345038 |
2 | 2 | 0.251638 | 0.766973 | -1.203160 |
3 | 2 | 0.546371 | 1.183574 | 0.161650 |
4 | 2 | -0.998505 | -0.969187 | -0.750524 |
nrow = 1000
ncol = 3

# Samples from np.random.randn (normal distribution), nrow rows by ncol columns.
matrix = np.random.randn(nrow, ncol)

# Give every row a random label drawn from 0, 1, 2. The labels are used as
# the frame's index and reset_index() then exposes them as the 'index' column.
indice = np.random.choice([0, 1, 2], size=nrow)
plot_data = pd.DataFrame(matrix, indice).reset_index()

# Draw a seaborn PairGrid over the value columns, coloured by label.
# `height` is the current name of the former `size` argument, which recent
# seaborn releases no longer accept.
grid = sns.PairGrid(
    data=plot_data,
    height=3,
    diag_sharey=False,
    hue='index',
    vars=[x for x in list(plot_data.columns) if x != 'index'],
)

# Upper triangle: scatter; diagonal: 1-D KDE; lower triangle: 2-D KDE.
grid.map_upper(plt.scatter, alpha=0.2)
grid.map_diag(sns.kdeplot)
grid.map_lower(sns.kdeplot, cmap=plt.cm.coolwarm)
grid.add_legend()
plt.show()