相関係数による特徴選択手法
Data : House Prices: Advanced Regression Techniques | Kaggle
Model : Linear Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's global theme with a reduced font scale for every figure below.
sns.set(font_scale=0.4)

# Read the training CSV into a DataFrame.
file = pd.read_csv('../../data/house_detect/train.csv')

# Preview the first ten rows.
file.head(10)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 | 6 | 50 | RL | 85.0 | 14115 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
6 | 7 | 20 | RL | 75.0 | 10084 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 307000 |
7 | 8 | 60 | RL | NaN | 10382 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | Shed | 350 | 11 | 2009 | WD | Normal | 200000 |
8 | 9 | 50 | RM | 51.0 | 6120 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2008 | WD | Abnorml | 129900 |
9 | 10 | 190 | RL | 50.0 | 7420 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 1 | 2008 | WD | Normal | 118000 |
10 rows × 81 columns
# Split the raw table into the feature matrix and the regression target.
data = file.drop('SalePrice', axis=1)
label = file['SalePrice']

# Impute missing numeric values with the per-column median.
# numeric_only=True is required because the frame also holds string columns:
# pandas >= 2.0 raises TypeError when asked for the median of such columns.
# Missing values in non-numeric columns are left untouched here.
data_fill_nan = data.fillna(data.median(numeric_only=True))

# One-hot encode the non-numeric columns.
data_with_dummy = pd.get_dummies(data_fill_nan)
def correlation_ranker(X, y, K=10):
    """Select the K features with the largest absolute Pearson correlation to y.

    Args:
        X: DataFrame of numeric feature columns.
        y: Target values, one per row of ``X``.
        K: Number of features to keep. Values larger than the number of
            columns keep every column; ``0`` keeps none.

    Returns:
        A tuple ``(cor_support, cor_feature, cor_values)``:

        * ``cor_support`` - list of bools, one per column of ``X``, True for
          the selected columns.
        * ``cor_feature`` - names of the selected columns, ordered by
          ascending absolute correlation (the strongest feature is last).
        * ``cor_values`` - NumPy array of the signed correlation coefficients,
          in the same order as ``cor_feature``.

    Raises:
        ValueError: If ``K`` is negative.
    """
    if K < 0:
        raise ValueError("K must be non-negative, got {}".format(K))

    feature_name = X.columns.tolist()
    n_features = len(feature_name)

    # A constant column has zero variance, so its coefficient is 0/0 = NaN.
    # Silence the resulting floating-point warnings; the NaN is mapped to 0 below.
    with np.errstate(invalid='ignore', divide='ignore'):
        cor_list = np.array(
            [np.corrcoef(X.iloc[:, pos], y)[0, 1] for pos in range(n_features)],
            dtype=float,
        )
    cor_list[np.isnan(cor_list)] = 0.0

    # Rank once by absolute correlation (ascending) and keep the last K positions.
    # Slicing from an explicit start index avoids the `[-0:]` pitfall, which
    # would return every feature when K == 0.
    keep = min(K, n_features)
    order = np.argsort(np.abs(cor_list))
    top_idx = order[n_features - keep:]

    # Feature names
    cor_feature = [feature_name[pos] for pos in top_idx]

    # Filter mask
    mask = np.zeros(n_features, dtype=bool)
    mask[top_idx] = True
    cor_support = mask.tolist()

    # Correlation values
    cor_values = cor_list[top_idx]

    return cor_support, cor_feature, cor_values
# K: number of features with the highest correlation coefficients to keep.
K = 10
cor_support, cor_feature, cor_values = correlation_ranker(data_with_dummy, label, K=K)

plt.figure(figsize=(6, 4), dpi=200)
# Pass x / y by keyword: seaborn >= 0.12 no longer accepts them positionally.
graph = sns.barplot(x=cor_feature, y=cor_values, palette="RdPu")
plt.xticks(rotation=30)

# Add a text label with the rounded coefficient above each bar.
# Iterate over the values actually returned rather than range(K), so that
# fewer than K returned features cannot raise IndexError.
for bar_pos, coef in enumerate(cor_values):
    plt.text(x=bar_pos - 0.2, y=coef + 0.01, s=np.round(coef, 2), size=6)
# Restrict the encoded table to the selected feature columns.
train_data = data_with_dummy[cor_feature]

# Preview the first five rows of the selected features.
train_data.head(5)
TotRmsAbvGrd | BsmtQual_Ex | FullBath | ExterQual_TA | 1stFlrSF | TotalBsmtSF | GarageArea | GarageCars | GrLivArea | OverallQual | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 0 | 2 | 0 | 856 | 856 | 548 | 2 | 1710 | 7 |
1 | 6 | 0 | 2 | 1 | 1262 | 1262 | 460 | 2 | 1262 | 6 |
2 | 6 | 0 | 2 | 0 | 920 | 920 | 608 | 2 | 1786 | 7 |
3 | 7 | 0 | 1 | 1 | 961 | 756 | 642 | 3 | 1717 | 7 |
4 | 9 | 0 | 2 | 0 | 1145 | 1145 | 836 | 3 | 2198 | 8 |
# 2x2 grid of regression plots: SalePrice (x, column "label") against one
# selected feature (y, column "values") per panel.
fig = plt.figure(figsize=(6, 6), dpi=200)
plt.subplots_adjust(wspace=0.2, hspace=0.3)

# (feature column, scatter colour) for each panel, in subplot order.
panels = [
    ("OverallQual", "#ff0472"),
    ("GrLivArea", "#29e5c6"),
    ("GarageCars", "#358fcc"),
    ("GarageArea", "#faff00"),
]

axes = []
for slot, (column, dot_color) in enumerate(panels, start=1):
    # The freshly added subplot becomes the current axes, which is where
    # sns.regplot draws when no `ax` is given.
    axis = fig.add_subplot(2, 2, slot)
    axis.set_title(column)
    frame = pd.DataFrame(
        np.vstack([data_with_dummy[column], label]).T,
        columns=['values', 'label'],
    )
    sns.regplot(
        x="label", y="values",
        data=frame,
        color=dot_color,
        scatter_kws={'s': 2},
        line_kws={'color': '#1f6659'},
    )
    axes.append(axis)

# Keep the individual axes handles available under their original names.
ax1, ax2, ax3, ax4 = axes
<matplotlib.axes._subplots.AxesSubplot at 0x7f4adf5976d8>
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
上位1〜10個の特徴量を用いて線形回帰した場合の誤差(negative MSE)
# Cross-validate a linear regression on the top-1, top-2, ... top-N selected
# features and collect the per-fold negative MSE of every model.
#
# correlation_ranker returns its features weakest-first, so reverse the list:
# the model with n features must use the n most strongly correlated ones.
ranked_features = cor_feature[::-1]

fold_scores = []
for n_features in range(1, len(ranked_features) + 1):
    input_data = data_with_dummy[ranked_features[:n_features]]
    linear_regression = LinearRegression()
    lin_reg_pipe = Pipeline([('lin_model', linear_regression)])
    scores = cross_val_score(lin_reg_pipe, input_data, label, cv=5, scoring='neg_mean_squared_error')
    fold_scores.append(scores)

# Build the table once: one row per model, one column per CV fold.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration anyway.
results = pd.DataFrame(fold_scores).reset_index(drop=True)
全特徴量を用いた場合
# Baseline: cross-validate the same linear model on every encoded feature.
input_data = data_with_dummy
linear_regression = LinearRegression()
lin_reg_pipe = Pipeline([('lin_model', linear_regression)])
scores = cross_val_score(lin_reg_pipe, input_data, label, cv=5, scoring='neg_mean_squared_error')

# Add the baseline as the last row of `results`.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# appends the row and renumbers the index in one step.
results = pd.concat([results, pd.DataFrame([scores])], ignore_index=True)
グラフ表示用にデータを整形
# One label per evaluated model: "1".."N" for the incremental models, then
# "All". Deriving the labels from `results` keeps the chart valid when K != 10.
# NOTE(review): assumes the last row of `results` is the all-features model.
label_comp = [str(n) for n in range(1, len(results))] + ['All']

# Transpose so each model becomes a column of per-fold scores, and flip the
# sign so the negative-MSE scores are plotted as positive MSE.
treatment_results = -results.T
treatment_results.columns = label_comp

fig = plt.figure(figsize=(4, 2), dpi=200)
sns.barplot(data=treatment_results, palette='RdPu')
<matplotlib.axes._subplots.AxesSubplot at 0x7f4ad470fc50>
# Display the full encoded feature table as the cell output.
data_with_dummy
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | SaleType_ConLw | SaleType_New | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 2 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 3 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 4 | 70 | 60.0 | 9550 | 7 | 5 | 1915 | 1970 | 0.0 | 216 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 5 | 60 | 84.0 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 | 6 | 50 | 85.0 | 14115 | 5 | 5 | 1993 | 1995 | 0.0 | 732 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
6 | 7 | 20 | 75.0 | 10084 | 8 | 5 | 2004 | 2005 | 186.0 | 1369 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
7 | 8 | 60 | 69.0 | 10382 | 7 | 6 | 1973 | 1973 | 240.0 | 859 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
8 | 9 | 50 | 51.0 | 6120 | 7 | 5 | 1931 | 1950 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
9 | 10 | 190 | 50.0 | 7420 | 5 | 6 | 1939 | 1950 | 0.0 | 851 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
10 | 11 | 20 | 70.0 | 11200 | 5 | 5 | 1965 | 1965 | 0.0 | 906 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
11 | 12 | 60 | 85.0 | 11924 | 9 | 5 | 2005 | 2006 | 286.0 | 998 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
12 | 13 | 20 | 69.0 | 12968 | 5 | 6 | 1962 | 1962 | 0.0 | 737 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
13 | 14 | 20 | 91.0 | 10652 | 7 | 5 | 2006 | 2007 | 306.0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
14 | 15 | 20 | 69.0 | 10920 | 6 | 5 | 1960 | 1960 | 212.0 | 733 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
15 | 16 | 45 | 51.0 | 6120 | 7 | 8 | 1929 | 2001 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
16 | 17 | 20 | 69.0 | 11241 | 6 | 7 | 1970 | 1970 | 180.0 | 578 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
17 | 18 | 90 | 72.0 | 10791 | 4 | 5 | 1967 | 1967 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
18 | 19 | 20 | 66.0 | 13695 | 5 | 5 | 2004 | 2004 | 0.0 | 646 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
19 | 20 | 20 | 70.0 | 7560 | 5 | 6 | 1958 | 1965 | 0.0 | 504 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
20 | 21 | 60 | 101.0 | 14215 | 8 | 5 | 2005 | 2006 | 380.0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
21 | 22 | 45 | 57.0 | 7449 | 7 | 7 | 1930 | 1950 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
22 | 23 | 20 | 75.0 | 9742 | 8 | 5 | 2002 | 2002 | 281.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
23 | 24 | 120 | 44.0 | 4224 | 5 | 7 | 1976 | 1976 | 0.0 | 840 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
24 | 25 | 20 | 69.0 | 8246 | 5 | 8 | 1968 | 2001 | 0.0 | 188 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
25 | 26 | 20 | 110.0 | 14230 | 8 | 5 | 2007 | 2007 | 640.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
26 | 27 | 20 | 60.0 | 7200 | 5 | 7 | 1951 | 2000 | 0.0 | 234 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
27 | 28 | 20 | 98.0 | 11478 | 8 | 5 | 2007 | 2008 | 200.0 | 1218 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
28 | 29 | 20 | 47.0 | 16321 | 5 | 6 | 1957 | 1997 | 0.0 | 1277 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
29 | 30 | 30 | 60.0 | 6324 | 4 | 6 | 1927 | 1950 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1430 | 1431 | 60 | 60.0 | 21930 | 5 | 5 | 2005 | 2005 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1431 | 1432 | 120 | 69.0 | 4928 | 6 | 6 | 1976 | 1976 | 0.0 | 958 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1432 | 1433 | 30 | 60.0 | 10800 | 4 | 6 | 1927 | 2007 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1433 | 1434 | 60 | 93.0 | 10261 | 6 | 5 | 2000 | 2000 | 318.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1434 | 1435 | 20 | 80.0 | 17400 | 5 | 5 | 1977 | 1977 | 0.0 | 936 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1435 | 1436 | 20 | 80.0 | 8400 | 6 | 9 | 1962 | 2005 | 237.0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1436 | 1437 | 20 | 60.0 | 9000 | 4 | 6 | 1971 | 1971 | 0.0 | 616 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1437 | 1438 | 20 | 96.0 | 12444 | 8 | 5 | 2008 | 2008 | 426.0 | 1336 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1438 | 1439 | 20 | 90.0 | 7407 | 6 | 7 | 1957 | 1996 | 0.0 | 600 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1439 | 1440 | 60 | 80.0 | 11584 | 7 | 6 | 1979 | 1979 | 96.0 | 315 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1440 | 1441 | 70 | 79.0 | 11526 | 6 | 7 | 1922 | 1994 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1441 | 1442 | 120 | 69.0 | 4426 | 6 | 5 | 2004 | 2004 | 147.0 | 697 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1442 | 1443 | 60 | 85.0 | 11003 | 10 | 5 | 2008 | 2008 | 160.0 | 765 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1443 | 1444 | 30 | 69.0 | 8854 | 6 | 6 | 1916 | 1950 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1444 | 1445 | 20 | 63.0 | 8500 | 7 | 5 | 2004 | 2004 | 106.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1445 | 1446 | 85 | 70.0 | 8400 | 6 | 5 | 1966 | 1966 | 0.0 | 187 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1446 | 1447 | 20 | 69.0 | 26142 | 5 | 7 | 1962 | 1962 | 189.0 | 593 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1447 | 1448 | 60 | 80.0 | 10000 | 8 | 5 | 1995 | 1996 | 438.0 | 1079 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1448 | 1449 | 50 | 70.0 | 11767 | 4 | 7 | 1910 | 2000 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1449 | 1450 | 180 | 21.0 | 1533 | 5 | 7 | 1970 | 1970 | 0.0 | 553 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
1450 | 1451 | 90 | 60.0 | 9000 | 5 | 5 | 1974 | 1974 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1451 | 1452 | 20 | 78.0 | 9262 | 8 | 5 | 2008 | 2009 | 194.0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1452 | 1453 | 180 | 35.0 | 3675 | 5 | 5 | 2005 | 2005 | 80.0 | 547 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1453 | 1454 | 20 | 90.0 | 17217 | 5 | 5 | 2006 | 2006 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
1454 | 1455 | 20 | 62.0 | 7500 | 7 | 5 | 2004 | 2005 | 0.0 | 410 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1455 | 1456 | 60 | 62.0 | 7917 | 6 | 5 | 1999 | 2000 | 0.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1456 | 1457 | 20 | 85.0 | 13175 | 6 | 6 | 1978 | 1988 | 119.0 | 790 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1457 | 1458 | 70 | 66.0 | 9042 | 7 | 9 | 1941 | 2006 | 0.0 | 275 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1458 | 1459 | 20 | 68.0 | 9717 | 5 | 6 | 1950 | 1996 | 0.0 | 49 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1459 | 1460 | 20 | 75.0 | 9937 | 5 | 6 | 1965 | 1965 | 0.0 | 830 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1460 rows × 289 columns