Notebook

In [1]:

%load_ext watermark
%watermark -v -p numpy,scipy,sklearn,pandas,matplotlib

CPython 3.7.3
IPython 7.5.0

numpy 1.16.3
scipy 1.2.1
sklearn 0.21.1
pandas 0.24.2
matplotlib 3.0.3

2장 – 머신러닝 프로젝트의 처음부터 끝까지

머신러닝 주택 회사에 오신 것을 환영합니다! 여러분이 해야 할 일은 캘리포니아 인구조사 데이터를 사용해 이 지역의 주택 가격 모델을 만드는 것입니다.

이 노트북은 2장에 있는 모든 샘플 코드와 연습문제 해답을 가지고 있습니다.

노트: 이 주피터 노트북의 결과가 책에 있는 것과 조금 다를 수 있습니다. 대부분은 훈련 알고리즘들이 가지고 있는 무작위성 때문입니다. 가능하면 노트북의 결과를 동일하게 유지하려고 하지만 모든 플랫폼에서 동일한 출력을 낸다고 보장하긴 어렵습니다. 어떤 데이터 구조(가령 딕셔너리)는 아이템의 순서가 일정하지 않습니다. 마지막으로 몇 가지 사소한 버그 수정(해당 부분에 설명을 추가했습니다) 때문에 결과가 조금 달라졌습니다. 하지만 책에서 제시한 설명은 유효합니다.

설정¶

파이썬 2와 3을 모두 지원합니다. 공통 모듈을 임포트하고 맷플롯립 그림이 노트북 안에 포함되도록 설정하고 생성한 그림을 저장하기 위한 함수를 준비합니다:

In [2]:

# 파이썬 2와 파이썬 3 지원
from __future__ import division, print_function, unicode_literals

# 공통
import numpy as np
import os

# 일관된 출력을 위해 유사난수 초기화
np.random.seed(42)

# 맷플롯립 설정
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# 한글출력
matplotlib.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False

# 그림을 저장할 폴더
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without the extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension; also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    # savefig() does not create missing directories, so make sure the target
    # folder exists first. An isdir check (rather than exist_ok=True) keeps
    # this working on Python 2 as well.
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

데이터 다운로드¶

In [3]:

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives the archive and its extracted contents;
        created when it does not exist yet.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [4]:

fetch_housing_data()

In [5]:

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [6]:

housing = load_housing_data()
housing.head()

Out[6]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

In [7]:

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [8]:

housing["ocean_proximity"].value_counts()

Out[8]:

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [9]:

housing.describe()

Out[9]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
count	20640.000000	20640.000000	20640.000000	20640.000000	20433.000000	20640.000000	20640.000000	20640.000000	20640.000000
mean	-119.569704	35.631861	28.639486	2635.763081	537.870553	1425.476744	499.539680	3.870671	206855.816909
std	2.003532	2.135952	12.585558	2181.615252	421.385070	1132.462122	382.329753	1.899822	115395.615874
min	-124.350000	32.540000	1.000000	2.000000	1.000000	3.000000	1.000000	0.499900	14999.000000
25%	-121.800000	33.930000	18.000000	1447.750000	296.000000	787.000000	280.000000	2.563400	119600.000000
50%	-118.490000	34.260000	29.000000	2127.000000	435.000000	1166.000000	409.000000	3.534800	179700.000000
75%	-118.010000	37.710000	37.000000	3148.000000	647.000000	1725.000000	605.000000	4.743250	264725.000000
max	-114.310000	41.950000	52.000000	39320.000000	6445.000000	35682.000000	6082.000000	15.000100	500001.000000

In [10]:

%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

In [11]:

# 일관된 출력을 위해 유사난수 초기화
np.random.seed(42)

In [12]:

import numpy as np

# 예시를 위해서 만든 것입니다. 사이킷런에는 train_test_split() 함수가 있습니다.
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a (train, test) pair of row subsets.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled rows form the test set and the
    remaining rows form the training set.
    """
    n_rows = len(data)
    n_test = int(n_rows * test_ratio)
    order = np.random.permutation(n_rows)
    test_part, train_part = order[:n_test], order[n_test:]
    return data.iloc[train_part], data.iloc[test_part]

In [13]:

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")

16512 train + 4128 test

In [14]:

from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into (train, test) based on a per-row identifier.

    Each value of ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns True go to the test set, all others to the
    training set.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

위의 test_set_check() 함수는 파이썬 2와 파이썬 3에서 모두 작동되고 다음의 hashlib를 사용한 구현보다 훨씬 빠릅니다.

In [15]:

import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

In [16]:

# 이 버전의 test_set_check() 함수가 파이썬 2도 지원합니다.
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

In [17]:

housing_with_id = housing.reset_index()   # `index` 열이 추가된 데이터프레임이 반환됩니다.
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [18]:

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [19]:

test_set.head()

Out[19]:

	index	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity	id
8	8	-122.26	37.84	42.0	2555.0	665.0	1206.0	595.0	2.0804	226700.0	NEAR BAY	-122222.16
10	10	-122.26	37.85	52.0	2202.0	434.0	910.0	402.0	3.2031	281500.0	NEAR BAY	-122222.15
11	11	-122.26	37.85	52.0	3503.0	752.0	1504.0	734.0	3.2705	241800.0	NEAR BAY	-122222.15
12	12	-122.26	37.85	52.0	2491.0	474.0	1098.0	468.0	3.0750	213500.0	NEAR BAY	-122222.15
13	13	-122.26	37.84	52.0	696.0	191.0	345.0	174.0	2.6736	191300.0	NEAR BAY	-122222.16

In [20]:

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [21]:

test_set.head()

Out[21]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
20046	-119.01	36.06	25.0	1505.0	NaN	1392.0	359.0	1.6812	47700.0	INLAND
3024	-119.46	35.14	30.0	2943.0	NaN	1565.0	584.0	2.5313	45800.0	INLAND
15663	-122.44	37.80	52.0	3830.0	NaN	1310.0	963.0	3.4801	500001.0	NEAR BAY
20484	-118.72	34.28	17.0	3051.0	NaN	1705.0	495.0	5.7376	218600.0	<1H OCEAN
9814	-121.93	36.62	34.0	2351.0	NaN	1063.0	428.0	3.7250	278000.0	NEAR OCEAN

In [22]:

housing["median_income"].hist()

Out[22]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f474cc131d0>

In [23]:

# Divide by 1.5 to limit the number of income categories.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap the category at 5: values below 5 are kept, everything else becomes 5.0.
# The result is assigned back to the column instead of calling
# where(..., inplace=True) on the column selection, because an in-place call on
# a selected column is not guaranteed to write through to the parent frame.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [24]:

housing["income_cat"].value_counts()

Out[24]:

3.0    7236
2.0    6581
4.0    3639
5.0    2362
1.0     822
Name: income_cat, dtype: int64

In [25]:

housing["income_cat"].hist()
save_fig('income_category_hist')

In [26]:

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while `housing` keeps its default
# 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [27]:

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

Out[27]:

3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64

In [28]:

housing["income_cat"].value_counts() / len(housing)

Out[28]:

3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

In [29]:

def income_cat_proportions(data):
    """Return the share of rows of ``data`` in each ``income_cat`` value."""
    category_counts = data["income_cat"].value_counts()
    return category_counts / len(data)

# Purely random split, used as the baseline against the stratified split.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income-category proportions in the full data set and in each test set.
proportions = {
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions).sort_index()

# Relative deviation (in percent) of each test set from the overall proportions.
overall = compare_props["Overall"]
compare_props["Rand. %error"] = 100 * compare_props["Random"] / overall - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / overall - 100

In [30]:

compare_props

Out[30]:

	Overall	Stratified	Random	Rand. %error	Strat. %error
1.0	0.039826	0.039729	0.040213	0.973236	-0.243309
2.0	0.318847	0.318798	0.324370	1.732260	-0.015195
3.0	0.350581	0.350533	0.358527	2.266446	-0.013820
4.0	0.176308	0.176357	0.167393	-5.056334	0.027480
5.0	0.114438	0.114583	0.109496	-4.318374	0.127011

In [31]:

# Remove the helper column from both stratified sets, modifying them in place.
for split_frame in (strat_train_set, strat_test_set):
    split_frame.drop(columns="income_cat", inplace=True)

데이터 이해를 위한 탐색과 시각화¶

In [32]:

housing = strat_train_set.copy()

In [33]:

ax = housing.plot(kind="scatter", x="longitude", y="latitude")
ax.set(xlabel='경도', ylabel='위도')
save_fig("bad_visualization_plot")

In [34]:

ax = housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
ax.set(xlabel='경도', ylabel='위도')
save_fig("better_visualization_plot")

sharex=False 매개변수는 x-축의 값과 범례를 표시하지 못하는 버그를 수정합니다. 이는 임시 방편입니다(https://github.com/pandas-dev/pandas/issues/10611 참조). 수정 사항을 알려준 Wilmer Arellano에게 감사합니다.

In [35]:

ax = housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="인구", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
ax.set(xlabel='경도', ylabel='위도')
plt.legend()
save_fig("housing_prices_scatterplot")

In [36]:

import matplotlib.image as mpimg

# Background map of California, stored next to the figures this notebook saves.
california_img = mpimg.imread(os.path.join(IMAGES_PATH, "california.png"))

# Scatter plot: marker size follows population, marker colour follows price.
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                  s=housing['population']/100, label="인구",
                  c="median_house_value", cmap=plt.get_cmap("jet"),
                  colorbar=False, alpha=0.4)
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)
plt.ylabel("위도", fontsize=14)
plt.xlabel("경도", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
# plt.colorbar() without arguments describes pyplot's "current image", which
# at this point is the imshow() background rather than the price-coloured
# scatter. Pass the scatter collection explicitly and pin the ticks to
# tick_values so that the 11 labels below match the 11 tick positions.
price_scatter = ax.collections[0]
cbar = plt.colorbar(price_scatter, ticks=tick_values)
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('중간 주택 가격', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()

In [37]:

corr_matrix = housing.corr()

In [38]:

corr_matrix["median_house_value"].sort_values(ascending=False)

Out[38]:

median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [39]:

from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

In [40]:

housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

In [41]:

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [42]:

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

Out[42]:

median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

In [43]:

housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()

In [44]:

housing.describe()

Out[44]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	rooms_per_household	bedrooms_per_room	population_per_household
count	16512.000000	16512.000000	16512.000000	16512.000000	16354.000000	16512.000000	16512.000000	16512.000000	16512.000000	16512.000000	16354.000000	16512.000000
mean	-119.575834	35.639577	28.653101	2622.728319	534.973890	1419.790819	497.060380	3.875589	206990.920724	5.440341	0.212878	3.096437
std	2.001860	2.138058	12.574726	2138.458419	412.699041	1115.686241	375.720845	1.904950	115703.014830	2.611712	0.057379	11.584826
min	-124.350000	32.540000	1.000000	6.000000	2.000000	3.000000	2.000000	0.499900	14999.000000	1.130435	0.100000	0.692308
25%	-121.800000	33.940000	18.000000	1443.000000	295.000000	784.000000	279.000000	2.566775	119800.000000	4.442040	0.175304	2.431287
50%	-118.510000	34.260000	29.000000	2119.500000	433.000000	1164.000000	408.000000	3.540900	179500.000000	5.232284	0.203031	2.817653
75%	-118.010000	37.720000	37.000000	3141.000000	644.000000	1719.250000	602.000000	4.744475	263900.000000	6.056361	0.239831	3.281420
max	-114.310000	41.950000	52.000000	39320.000000	6210.000000	35682.000000	5358.000000	15.000100	500001.000000	141.909091	1.000000	1243.333333

머신러닝 알고리즘을 위한 데이터 준비¶

In [45]:

housing = strat_train_set.drop("median_house_value", axis=1) # 훈련 세트를 위해 레이블 삭제
housing_labels = strat_train_set["median_house_value"].copy()

In [46]:

sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

Out[46]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	ocean_proximity
4629	-118.30	34.07	18.0	3759.0	NaN	3296.0	1462.0	2.2708	<1H OCEAN
6068	-117.86	34.01	16.0	4632.0	NaN	3038.0	727.0	5.1762	<1H OCEAN
17923	-121.97	37.35	30.0	1955.0	NaN	999.0	386.0	4.6328	<1H OCEAN
13656	-117.30	34.05	6.0	2155.0	NaN	1039.0	391.0	1.6675	INLAND
19252	-122.79	38.48	7.0	6837.0	NaN	3468.0	1405.0	3.1662	<1H OCEAN

In [47]:

sample_incomplete_rows.dropna(subset=["total_bedrooms"])    # 옵션 1

Out[47]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	ocean_proximity

In [48]:

sample_incomplete_rows.drop("total_bedrooms", axis=1)       # 옵션 2

Out[48]:

	longitude	latitude	housing_median_age	total_rooms	population	households	median_income	ocean_proximity
4629	-118.30	34.07	18.0	3759.0	3296.0	1462.0	2.2708	<1H OCEAN
6068	-117.86	34.01	16.0	4632.0	3038.0	727.0	5.1762	<1H OCEAN
17923	-121.97	37.35	30.0	1955.0	999.0	386.0	4.6328	<1H OCEAN
13656	-117.30	34.05	6.0	2155.0	1039.0	391.0	1.6675	INLAND
19252	-122.79	38.48	7.0	6837.0	3468.0	1405.0	3.1662	<1H OCEAN

In [49]:

median = housing["total_bedrooms"].median()
# Option 3: fill the missing total_bedrooms values with the training-set median.
# The filled frame is assigned back instead of calling fillna(inplace=True) on a
# column selected from this sliced frame, which is not guaranteed to write
# through to sample_incomplete_rows.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})
sample_incomplete_rows

Out[49]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	ocean_proximity
4629	-118.30	34.07	18.0	3759.0	433.0	3296.0	1462.0	2.2708	<1H OCEAN
6068	-117.86	34.01	16.0	4632.0	433.0	3038.0	727.0	5.1762	<1H OCEAN
17923	-121.97	37.35	30.0	1955.0	433.0	999.0	386.0	4.6328	<1H OCEAN
13656	-117.30	34.05	6.0	2155.0	433.0	1039.0	391.0	1.6675	INLAND
19252	-122.79	38.48	7.0	6837.0	433.0	3468.0	1405.0	3.1662	<1H OCEAN

sklearn.preprocessing.Imputer 클래스는 사이킷런 0.20 버전에서 사용 중지 경고가 발생하고 0.22 버전에서 삭제될 예정입니다. 대신 추가된 sklearn.impute.SimpleImputer 클래스를 사용합니다.

In [50]:

#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

중간값이 수치형 특성에서만 계산될 수 있기 때문에 텍스트 특성을 삭제합니다:

In [51]:

housing_num = housing.drop('ocean_proximity', axis=1)
# 다른 방법: housing_num = housing.select_dtypes(include=[np.number])

In [52]:

imputer.fit(housing_num)

Out[52]:

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [53]:

imputer.statistics_

Out[53]:

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

각 특성의 중간 값이 수동으로 계산한 것과 같은지 확인해 보세요:

In [54]:

housing_num.median().values

Out[54]:

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

훈련 세트 변환:

In [55]:

X = imputer.transform(housing_num)

In [56]:

housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index = list(housing.index.values))

In [57]:

housing_tr.loc[sample_incomplete_rows.index.values]

Out[57]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income
4629	-118.30	34.07	18.0	3759.0	433.0	3296.0	1462.0	2.2708
6068	-117.86	34.01	16.0	4632.0	433.0	3038.0	727.0	5.1762
17923	-121.97	37.35	30.0	1955.0	433.0	999.0	386.0	4.6328
13656	-117.30	34.05	6.0	2155.0	433.0	1039.0	391.0	1.6675
19252	-122.79	38.48	7.0	6837.0	433.0	3468.0	1405.0	3.1662

In [58]:

imputer.strategy

Out[58]:

'median'

In [59]:

housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.head()

Out[59]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income
0	-121.89	37.29	38.0	1568.0	351.0	710.0	339.0	2.7042
1	-121.93	37.05	14.0	679.0	108.0	306.0	113.0	6.4214
2	-117.20	32.77	31.0	1952.0	471.0	936.0	462.0	2.8621
3	-119.61	36.31	25.0	1847.0	371.0	1460.0	353.0	1.8839
4	-118.59	34.23	17.0	6592.0	1525.0	4459.0	1463.0	3.0347

이제 범주형 입력 특성인 ocean_proximity를 전처리합니다:

책에 실린 방법¶

In [60]:

housing_cat = housing['ocean_proximity']
housing_cat.head(10)

Out[60]:

17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

판다스의 factorize() 메소드는 문자열 범주형 특성을 머신러닝 알고리즘이 다루기 쉬운 숫자 범주형 특성으로 변환시켜 줍니다:

In [61]:

housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]

Out[61]:

array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0])

In [62]:

housing_categories

Out[62]:

Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')

OneHotEncoder를 사용하여 범주형 값을 원-핫 벡터로 변경합니다:

사이킷런 0.20 버전에서 OneHotEncoder의 동작 방식이 변경되었습니다. 종전에는 0~최댓값 사이의 정수를 카테고리로 인식했지만 앞으로는 정수나 문자열에 상관없이 고유한 값만을 카테고리로 인식합니다. 경고 메시지를 피하기 위해 categories 매개변수를 auto로 설정합니다.

In [63]:

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

Out[63]:

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

OneHotEncoder는 기본적으로 희소 행렬을 반환합니다. 필요하면 밀집 배열로 변환할 수 있습니다:

In [64]:

housing_cat_1hot.toarray()

Out[64]:

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [65]:

# [PR #9151](https://github.com/scikit-learn/scikit-learn/pull/9151)에서 가져온 CategoricalEncoder 클래스의 정의.
# 이 클래스는 사이킷런 0.20에 포함될 예정입니다.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.

    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):

        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.

    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).

    dtype : number type, default np.float64
        Desired dtype of output.

    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform (default is to raise). When this
        parameter is set to 'ignore' and an unknown category is encountered
        during transform, the resulting one-hot encoded columns for this
        feature will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).

    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.

    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder`` assumes that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        # Parameters are stored unchanged; they are validated in fit().
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value,
            if ``handle_unknown='ignore'`` is combined with
            ``encoding='ordinal'``, or if explicit ``categories`` were given,
            ``handle_unknown='error'`` and X contains a value outside them.
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending ``encoding`` value in the message.
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # The builtin ``object`` is what the ``np.object`` alias referred to.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per input column.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # Use the user-supplied categories (sorted) as the classes.
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using the encoding chosen at construction.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input: a sparse matrix for ``encoding='onehot'``, a
            dense array for ``'onehot-dense'`` and ``'ordinal'``.

        Raises
        ------
        ValueError
            If X contains a category not seen during fit and
            ``handle_unknown='error'``.
        """
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        # Column offset of each feature's one-hot group in the output matrix.
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

CategoricalEncoder는 하나 이상의 특성을 가진 2D 배열을 기대합니다. 따라서 housing_cat을 2D 배열로 바꾸어 주어야 합니다:

In [66]:

#from sklearn.preprocessing import CategoricalEncoder # Scikit-Learn 0.20에서 추가 예정

cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

Out[66]:

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

사이킷런 0.20 개발 브랜치에 있던 CategoricalEncoder는 새로운 OneHotEncoder와 OrdinalEncoder로 나뉘었습니다. OneHotEncoder로 문자열로 된 범주형 변수도 변환할 수 있습니다:

In [67]:

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(categories='auto')
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

Out[67]:

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

기본 인코딩은 원-핫 벡터이고 희소 행렬로 반환됩니다. toarray() 메소드를 사용하여 밀집 배열로 바꿀 수 있습니다:

In [68]:

housing_cat_1hot.toarray()

Out[68]:

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

또는 CategoricalEncoder의 encoding 매개변수를 "onehot-dense"로 지정하여 희소 행렬 대신 밀집 행렬을 얻을 수 있습니다. 0.20 버전의 OneHotEncoder는 sparse=False 옵션을 주어 밀집 행렬을 얻을 수 있습니다:

In [69]:

# cat_encoder = CategoricalEncoder(encoding="onehot-dense")
cat_encoder = OneHotEncoder(categories='auto', sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

Out[69]:

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [70]:

cat_encoder.categories_

Out[70]:

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

future_encoders.py를 사용한 새로운 방법¶

In [71]:

housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

Out[71]:

	ocean_proximity
17606	<1H OCEAN
18632	<1H OCEAN
14650	NEAR OCEAN
3230	INLAND
3555	<1H OCEAN
19480	INLAND
8879	<1H OCEAN
13685	INLAND
4937	<1H OCEAN
4861	<1H OCEAN

주의: 번역서는 판다스의 Series.factorize() 메서드를 사용하여 문자열 범주형 특성을 정수로 인코딩합니다. 사이킷런 0.20에 추가될 OrdinalEncoder 클래스(PR #10521)는 입력 특성(레이블 y가 아니라 X)을 위해 설계되었고 파이프라인(나중에 이 노트북에서 나옵니다)과 잘 작동되기 때문에 더 좋은 방법입니다. 지금은 future_encoders.py 파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 sklearn.preprocessing에서 바로 임포팅할 수 있습니다.

0.20 버전 릴리스에 맞추어 sklearn.preprocessing에서 임포트합니다.

In [72]:

# from future_encoders import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

In [73]:

# Encode each category string as a number; the number is the position of the
# category in ordinal_encoder.categories_.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

Out[73]:

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [74]:

ordinal_encoder.categories_

Out[74]:

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

주의: 번역서는 CategoricalEncoder를 사용하여 각 범주형 값을 원-핫 벡터로 변경합니다. 이 클래스는 OrdinalEncoder와 새로운 OneHotEncoder로 리팩토링되었습니다. 지금은 OneHotEncoder가 정수형 범주 입력만 다룰 수 있지만 사이킷런 0.20에서는 문자열 범주 입력도 다룰 수 있을 것입니다(PR #10521). 지금은 future_encoders.py 파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 sklearn.preprocessing에서 바로 임포팅할 수 있습니다.

0.20 버전 릴리스에 맞추어 sklearn.preprocessing에서 임포트합니다(사실 우리는 이미 위에서 0.20 버전의 OneHotEncoder를 사용했습니다).

In [75]:

# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Same one-hot encoding as before, but fitted directly on the one-column
# DataFrame; the result is a sparse matrix by default.
cat_encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

Out[75]:

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

기본적으로 OneHotEncoder 클래스는 희소 행렬을 반환하지만 필요하면 toarray() 메서드를 호출하여 밀집 배열로 바꿀 수 있습니다:

In [76]:

housing_cat_1hot.toarray()

Out[76]:

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

또는 OneHotEncoder 객체를 만들 때 sparse=False로 지정하면 됩니다:

In [77]:

# sparse=False: return a dense NumPy array directly, no toarray() call needed.
cat_encoder = OneHotEncoder(categories='auto', sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

Out[77]:

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [78]:

cat_encoder.categories_

Out[78]:

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

다시 책의 내용이 이어집니다¶

추가 특성을 위해 나만의 변환기를 만들겠습니다:

In [79]:

from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices read by CombinedAttributesAdder.transform
# (total_rooms, total_bedrooms, population, households in the housing frame).
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    The source columns are located through the module-level indices
    rooms_ix, bedrooms_ix, population_ix and household_ix.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms / rooms column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right.

        Appended columns, in order: rooms per household, population per
        household and, if enabled, bedrooms per room.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        appended = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            appended.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as np.c_[X, col1, col2, ...]:
        # the pieces are stacked column-wise.
        return np.c_[tuple([X] + appended)]

# fit() does nothing for this transformer, so transform() is called directly
# on the raw values; only the two per-household columns are added here.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [80]:

# Wrap the transformed array in a DataFrame again; the two trailing names
# label the columns appended by CombinedAttributesAdder.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs, 
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
housing_extra_attribs.head()

Out[80]:

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	ocean_proximity	rooms_per_household	population_per_household
0	-121.89	37.29	38	1568	351	710	339	2.7042	<1H OCEAN	4.62537	2.0944
1	-121.93	37.05	14	679	108	306	113	6.4214	<1H OCEAN	6.00885	2.70796
2	-117.2	32.77	31	1952	471	936	462	2.8621	NEAR OCEAN	4.22511	2.02597
3	-119.61	36.31	25	1847	371	1460	353	1.8839	INLAND	5.23229	4.13598
4	-118.59	34.23	17	6592	1525	4459	1463	3.0347	<1H OCEAN	4.50581	3.04785

수치 특성을 전처리하기 위한 파이프라인을 만듭니다(0.20 버전에 새로 추가된 SimpleImputer 클래스로 변경합니다):

In [81]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in this order: fill missing values with the
# column median, append the ratio features, then standardize the columns.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [82]:

housing_num_tr

Out[82]:

array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])

future_encoders.py를 사용한 방법 ==========================¶

사이킷런의 0.20 버전에 포함될 ColumnTransformer를 사용하면 책의 예제에서처럼 DataFrameSelector와 FeatureUnion을 사용하지 않고 간단히 전체 파이프라인을 만들 수 있습니다. 아직 사이킷런 0.20 버전이 릴리스되기 전이므로 여기서는 future_encoders.py에 ColumnTransformer를 넣어 놓고 사용합니다.

사이킷런 0.20 버전에 추가된 sklearn.compose.ColumnTransformer로 코드를 변경합니다.

In [83]:

# from future_encoders import ColumnTransformer
from sklearn.compose import ColumnTransformer

# Column names of the numeric frame and of the single categorical column.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Send the numeric columns through num_pipeline and one-hot encode the
# categorical column; the outputs are placed side by side in that order.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(categories='auto'), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

Out[83]:

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

====================================================¶

판다스 DataFrame 컬럼의 일부를 선택하는 변환기를 만듭니다:

In [84]:

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick columns out of a pandas DataFrame and hand them on as an array.

    Written because scikit-learn could not consume a DataFrame directly; it
    is used as the first pipeline step to select the numeric or the
    categorical columns.

    Parameters
    ----------
    attribute_names : list
        Column labels to select.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of X as the underlying array (.values)."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

하나의 큰 파이프라인에 이들을 모두 결합하여 수치형과 범주형 특성을 전처리합니다:

0.20 버전에 추가된 SimpleImputer를 사용합니다.

In [85]:

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Book-style numeric pipeline: the first step picks the numeric columns out
# of the DataFrame and returns their values, so later steps get a plain array.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# Categorical counterpart. NOTE: cat_pipeline is redefined in the next cell
# with OneHotEncoder(sparse=False) in place of CategoricalEncoder.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

future_encoders.py를 사용한 방법 ==========================¶

In [86]:

# Replaces the CategoricalEncoder-based cat_pipeline defined in the previous
# cell; sparse=False keeps the one-hot output dense.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(categories='auto', sparse=False)),
    ])

====================================================¶

사이킷런 0.20 버전에 추가된 ColumnTransformer로 만든 full_pipeline을 사용합니다:

In [87]:

# Book version based on FeatureUnion, kept for reference only:
# from sklearn.pipeline import FeatureUnion

# full_pipeline = FeatureUnion(transformer_list=[
#         ("num_pipeline", num_pipeline),
#         ("cat_pipeline", cat_pipeline),
#     ])
# ColumnTransformer version. The transformer name "cat_encoder" is the key
# used later to look the fitted encoder up in named_transformers_.
full_pipeline = ColumnTransformer([
        ("num_pipeline", num_pipeline, num_attribs),
        ("cat_encoder", OneHotEncoder(categories='auto'), cat_attribs),
    ])

In [88]:

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

Out[88]:

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [89]:

housing_prepared.shape

Out[89]:

(16512, 16)

모델 선택과 훈련¶

In [90]:

from sklearn.linear_model import LinearRegression

# Baseline model: ordinary linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

Out[90]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [91]:

# Apply the full preprocessing pipeline to a few training samples.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# transform(), not fit_transform(): the already fitted pipeline is reused.
some_data_prepared = full_pipeline.transform(some_data)

print("예측:", lin_reg.predict(some_data_prepared))

예측: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]

실제 값과 비교합니다:

In [92]:

print("레이블:", list(some_labels))

레이블: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

In [93]:

some_data_prepared

Out[93]:

array([[-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
        -0.63621141, -0.42069842, -0.61493744, -0.31205452, -0.08649871,
         0.15531753,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , -0.90896655, -1.0369278 ,
        -0.99833135, -1.02222705,  1.33645936,  0.21768338, -0.03353391,
        -0.83628902,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, -0.31365989, -0.15334458,
        -0.43363936, -0.0933178 , -0.5320456 , -0.46531516, -0.09240499,
         0.4222004 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.01706767,  0.31357576, -0.29052016, -0.36276217, -0.39675594,
         0.03604096, -0.38343559, -1.04556555, -0.07966124,  0.08973561,
        -0.19645314,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.49247384, -0.65929936, -0.92673619,  1.85619316,  2.41221109,
         2.72415407,  2.57097492, -0.44143679, -0.35783383, -0.00419445,
         0.2699277 ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [94]:

from sklearn.metrics import mean_squared_error

# RMSE on the training set: the predictions are made on the same data the
# model was fitted on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

Out[94]:

68628.19819848923

In [95]:

from sklearn.metrics import mean_absolute_error

# Mean absolute error of the same linear-regression training-set predictions.
lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

Out[95]:

49439.89599001897

In [96]:

from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so the fitted tree is reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

Out[96]:

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [97]:

# Evaluated on the training data itself, so this value says nothing about
# how the tree performs on unseen data. Rebinds housing_predictions.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

Out[97]:

0.0

모델 세부 튜닝¶

In [98]:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer yields negative MSE values, hence the sign flip before the
# square root that converts them to RMSE.
tree_rmse_scores = np.sqrt(-scores)

In [99]:

def display_scores(scores):
    """Print the raw scores, then their mean and their standard deviation.

    `scores` must provide .mean() and .std() methods (e.g. a NumPy array).
    """
    report = (
        ("점수:", lambda values: values),
        ("평균:", lambda values: values.mean()),
        ("표준편차:", lambda values: values.std()),
    )
    # Each statistic is computed right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

display_scores(tree_rmse_scores)

점수: [70194.33680785 66855.16363941 72432.58244769 70758.73896782
 71115.88230639 75585.14172901 70262.86139133 70273.6325285
 75366.87952553 71231.65726027]
평균: 71407.68766037929
표준편차: 2439.4345041191004

In [100]:

# Same 10-fold cross-validation for the linear model, for comparison with
# the decision tree scores.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

점수: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
평균: 69052.46136345083
표준편차: 2731.674001798349

사이킷런 0.22 버전에서 랜덤 포레스트의 n_estimators 기본값이 10에서 100으로 변경됩니다. 0.20 버전에서 n_estimators 값을 지정하지 않을 경우 이에 대한 경고 메시지가 나옵니다. 경고 메시지를 피하기 위해 명시적으로 n_estimators를 10으로 설정합니다.

In [101]:

from sklearn.ensemble import RandomForestRegressor

# n_estimators is passed explicitly so the default-value change warning is
# not triggered.
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

Out[101]:

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [102]:

# Training-set RMSE of the random forest (rebinds housing_predictions).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

Out[102]:

21933.31414779769

In [103]:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validated RMSE of the random forest, to compare against its
# training-set RMSE computed in the previous cell.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

점수: [51646.44545909 48940.60114882 53050.86323649 54408.98730149
 50922.14870785 56482.50703987 51864.52025526 49760.85037653
 55434.21627933 53326.10093303]
평균: 52583.72407377466
표준편차: 2298.353351147122

In [104]:

# Repeats the linear-regression cross-validation and summarizes it with
# pandas. NOTE: this rebinds `scores`, which held the decision-tree results.
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# describe() reports the sample standard deviation (ddof=1), so "std" here
# differs from the NumPy std (ddof=0) printed by display_scores.
pd.Series(np.sqrt(-scores)).describe()

Out[104]:

count       10.000000
mean     69052.461363
std       2879.437224
min      64969.630564
25%      67136.363758
50%      68156.372635
75%      70982.369487
max      74739.570526
dtype: float64

In [105]:

from sklearn.svm import SVR

# Support vector regressor with a linear kernel, evaluated on the training
# data it was fitted on (rebinds housing_predictions).
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

Out[105]:

111094.6308539982

In [106]:

from sklearn.model_selection import GridSearchCV

param_grid = [
    # Try 12 (=3×4) combinations of hyperparameters.
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # Then try 6 (=2×3) combinations with bootstrap set to False.
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

# Rebinds forest_reg to a fresh, unfitted estimator for the search.
forest_reg = RandomForestRegressor(random_state=42)
# Training across five folds gives a total of (12+6)*5=90 training rounds.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', 
                           return_train_score=True, n_jobs=-1)
grid_search.fit(housing_prepared, housing_labels)

Out[106]:

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

최상의 파라미터 조합:

In [107]:

grid_search.best_params_

Out[107]:

{'max_features': 8, 'n_estimators': 30}

In [108]:

grid_search.best_estimator_

Out[108]:

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=8, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

그리드서치에서 테스트한 하이퍼파라미터 조합의 점수를 확인합니다:

In [109]:

# One line per tested combination: mean test scores are negative MSE values,
# so negate them and take the square root to print RMSE.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63669.05791727153 {'max_features': 2, 'n_estimators': 3}
55627.16171305252 {'max_features': 2, 'n_estimators': 10}
53384.57867637289 {'max_features': 2, 'n_estimators': 30}
60965.99185930139 {'max_features': 4, 'n_estimators': 3}
52740.98248528835 {'max_features': 4, 'n_estimators': 10}
50377.344409590376 {'max_features': 4, 'n_estimators': 30}
58663.84733372485 {'max_features': 6, 'n_estimators': 3}
52006.15355973719 {'max_features': 6, 'n_estimators': 10}
50146.465964159885 {'max_features': 6, 'n_estimators': 30}
57869.25504027614 {'max_features': 8, 'n_estimators': 3}
51711.09443660957 {'max_features': 8, 'n_estimators': 10}
49682.25345942335 {'max_features': 8, 'n_estimators': 30}
62895.088889905004 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54658.14484390074 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59470.399594730654 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52725.01091081235 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
57490.612956065226 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51009.51445842374 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

In [110]:

pd.DataFrame(grid_search.cv_results_)

Out[110]:

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_max_features	param_n_estimators	param_bootstrap	params	split0_test_score	split1_test_score	...	mean_test_score	std_test_score	rank_test_score	split0_train_score	split1_train_score	split2_train_score	split3_train_score	split4_train_score	mean_train_score	std_train_score
0	0.050272	0.000983	0.002671	0.000031	2	3	NaN	{'max_features': 2, 'n_estimators': 3}	-3.837622e+09	-4.147108e+09	...	-4.053749e+09	1.519609e+08	18	-1.064113e+09	-1.105142e+09	-1.116550e+09	-1.112342e+09	-1.129650e+09	-1.105559e+09	2.220402e+07
1	0.155895	0.002054	0.007374	0.000096	2	10	NaN	{'max_features': 2, 'n_estimators': 10}	-3.047771e+09	-3.254861e+09	...	-3.094381e+09	1.327046e+08	11	-5.927175e+08	-5.870952e+08	-5.776964e+08	-5.716332e+08	-5.802501e+08	-5.818785e+08	7.345821e+06
2	0.466790	0.003222	0.020763	0.000376	2	30	NaN	{'max_features': 2, 'n_estimators': 30}	-2.689185e+09	-3.021086e+09	...	-2.849913e+09	1.626879e+08	9	-4.381089e+08	-4.391272e+08	-4.371702e+08	-4.376955e+08	-4.452654e+08	-4.394734e+08	2.966320e+06
3	0.080077	0.000765	0.002662	0.000017	4	3	NaN	{'max_features': 4, 'n_estimators': 3}	-3.730181e+09	-3.786886e+09	...	-3.716852e+09	1.631421e+08	16	-9.865163e+08	-1.012565e+09	-9.169425e+08	-1.037400e+09	-9.707739e+08	-9.848396e+08	4.084607e+07
4	0.260610	0.002850	0.007349	0.000147	4	10	NaN	{'max_features': 4, 'n_estimators': 10}	-2.666283e+09	-2.784511e+09	...	-2.781611e+09	1.268562e+08	8	-5.097115e+08	-5.162820e+08	-4.962893e+08	-5.436192e+08	-5.160297e+08	-5.163863e+08	1.542862e+07
5	0.771269	0.001967	0.020760	0.000226	4	30	NaN	{'max_features': 4, 'n_estimators': 30}	-2.387153e+09	-2.588448e+09	...	-2.537877e+09	1.214603e+08	3	-3.838835e+08	-3.880268e+08	-3.790867e+08	-4.040957e+08	-3.845520e+08	-3.879289e+08	8.571233e+06
6	0.109181	0.003021	0.002627	0.000022	6	3	NaN	{'max_features': 6, 'n_estimators': 3}	-3.119657e+09	-3.586319e+09	...	-3.441447e+09	1.893141e+08	14	-9.245343e+08	-8.886939e+08	-9.353135e+08	-9.009801e+08	-8.624664e+08	-9.023976e+08	2.591445e+07
7	0.359064	0.002993	0.007470	0.000265	6	10	NaN	{'max_features': 6, 'n_estimators': 10}	-2.549663e+09	-2.782039e+09	...	-2.704640e+09	1.471542e+08	6	-4.980344e+08	-5.045869e+08	-4.994664e+08	-4.990325e+08	-5.055542e+08	-5.013349e+08	3.100456e+06
8	1.085904	0.007037	0.020348	0.000221	6	30	NaN	{'max_features': 6, 'n_estimators': 30}	-2.370010e+09	-2.583638e+09	...	-2.514668e+09	1.285063e+08	2	-3.838538e+08	-3.804711e+08	-3.805218e+08	-3.856095e+08	-3.901917e+08	-3.841296e+08	3.617057e+06
9	0.140850	0.003441	0.002625	0.000012	8	3	NaN	{'max_features': 8, 'n_estimators': 3}	-3.353504e+09	-3.348552e+09	...	-3.348851e+09	1.241864e+08	13	-9.228123e+08	-8.553031e+08	-8.603321e+08	-8.881964e+08	-9.151287e+08	-8.883545e+08	2.750227e+07
10	0.464359	0.003905	0.007256	0.000095	8	10	NaN	{'max_features': 8, 'n_estimators': 10}	-2.571970e+09	-2.718994e+09	...	-2.674037e+09	1.392720e+08	5	-4.932416e+08	-4.815238e+08	-4.730979e+08	-5.155367e+08	-4.985555e+08	-4.923911e+08	1.459294e+07
11	1.401487	0.007872	0.020617	0.000496	8	30	NaN	{'max_features': 8, 'n_estimators': 30}	-2.357390e+09	-2.546640e+09	...	-2.468326e+09	1.091647e+08	1	-3.841658e+08	-3.744500e+08	-3.773239e+08	-3.882250e+08	-3.810005e+08	-3.810330e+08	4.871017e+06
12	0.074540	0.001222	0.003115	0.000243	2	3	False	{'bootstrap': False, 'max_features': 2, 'n_est...	-3.785816e+09	-4.166012e+09	...	-3.955792e+09	1.900966e+08	17	-0.000000e+00	-0.000000e+00	-0.000000e+00	-0.000000e+00	-0.000000e+00	0.000000e+00	0.000000e+00
13	0.246445	0.001551	0.008540	0.000139	2	10	False	{'bootstrap': False, 'max_features': 2, 'n_est...	-2.810721e+09	-3.107789e+09	...	-2.987513e+09	1.539231e+08	10	-6.056477e-02	-0.000000e+00	-0.000000e+00	-0.000000e+00	-2.967449e+00	-6.056027e-01	1.181156e+00
14	0.100478	0.002083	0.003043	0.000110	3	3	False	{'bootstrap': False, 'max_features': 3, 'n_est...	-3.618324e+09	-3.441527e+09	...	-3.536728e+09	7.795196e+07	15	-0.000000e+00	-0.000000e+00	-0.000000e+00	-0.000000e+00	-6.072840e+01	-1.214568e+01	2.429136e+01
15	0.329762	0.002839	0.008933	0.000553	3	10	False	{'bootstrap': False, 'max_features': 3, 'n_est...	-2.757999e+09	-2.851737e+09	...	-2.779927e+09	6.286611e+07	7	-2.089484e+01	-0.000000e+00	-0.000000e+00	-0.000000e+00	-5.465556e+00	-5.272080e+00	8.093117e+00
16	0.125876	0.002710	0.003012	0.000109	4	3	False	{'bootstrap': False, 'max_features': 4, 'n_est...	-3.134040e+09	-3.559375e+09	...	-3.305171e+09	1.879203e+08	12	-0.000000e+00	-0.000000e+00	-0.000000e+00	-0.000000e+00	-0.000000e+00	0.000000e+00	0.000000e+00
17	0.410446	0.007215	0.008174	0.000232	4	10	False	{'bootstrap': False, 'max_features': 4, 'n_est...	-2.525578e+09	-2.710011e+09	...	-2.601971e+09	1.088031e+08	4	-0.000000e+00	-1.514119e-02	-0.000000e+00	-0.000000e+00	-0.000000e+00	-3.028238e-03	6.056477e-03

18 rows × 23 columns

In [111]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# scipy's randint samples integers from [low, high): n_estimators is drawn
# from 1..199 and max_features from 1..7.
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
# n_iter=10 sampled combinations, each evaluated with 5-fold cross-validation;
# random_state fixes which combinations are drawn.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', 
                                random_state=42, n_jobs=-1)
rnd_search.fit(housing_prepared, housing_labels)

Out[111]:

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...
                                                   warm_start=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474dc82320>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474dc82dd8>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=0)

In [112]:

# Print the RMSE of every sampled combination (rebinds cvres to the
# randomized-search results).
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

49150.657232934034 {'max_features': 7, 'n_estimators': 180}
51389.85295710133 {'max_features': 5, 'n_estimators': 15}
50796.12045980556 {'max_features': 3, 'n_estimators': 72}
50835.09932039744 {'max_features': 5, 'n_estimators': 21}
49280.90117886215 {'max_features': 7, 'n_estimators': 122}
50774.86679035961 {'max_features': 3, 'n_estimators': 75}
50682.75001237282 {'max_features': 3, 'n_estimators': 88}
49608.94061293652 {'max_features': 5, 'n_estimators': 100}
50473.57642831875 {'max_features': 3, 'n_estimators': 150}
64429.763804893395 {'max_features': 5, 'n_estimators': 2}

In [113]:

# Importances come from the best random forest found by the grid search (not
# the randomized search); one value per column of housing_prepared.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

Out[113]:

array([7.33442355e-02, 6.29090705e-02, 4.11437985e-02, 1.46726854e-02,
       1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01,
       5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02,
       1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03])

사이킷런 0.20 버전의 ColumnTransformer를 사용했기 때문에 full_pipeline에서 cat_encoder를 가져옵니다. 즉 cat_pipeline을 사용하지 않았습니다:

In [114]:

# Labels for the three columns appended by CombinedAttributesAdder, in the
# order it appends them.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# Book version, based on the FeatureUnion pipeline:
# cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_encoder = full_pipeline.named_transformers_["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# Must follow the column order of housing_prepared: numeric columns, derived
# columns, then the one-hot category columns.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

Out[114]:

[(0.36615898061813423, 'median_income'),
 (0.16478099356159054, 'INLAND'),
 (0.10879295677551575, 'pop_per_hhold'),
 (0.07334423551601243, 'longitude'),
 (0.06290907048262032, 'latitude'),
 (0.056419179181954014, 'rooms_per_hhold'),
 (0.053351077347675815, 'bedrooms_per_room'),
 (0.04114379847872964, 'housing_median_age'),
 (0.014874280890402769, 'population'),
 (0.014672685420543239, 'total_rooms'),
 (0.014257599323407808, 'households'),
 (0.014106483453584104, 'total_bedrooms'),
 (0.010311488326303788, '<1H OCEAN'),
 (0.0028564746373201584, 'NEAR OCEAN'),
 (0.0019604155994780706, 'NEAR BAY'),
 (6.0280386727366e-05, 'ISLAND')]

In [115]:

final_model = grid_search.best_estimator_

# Split the held-out test set into features and labels.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline fitted on the training data is reused and is
# not refitted on the test set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [116]:

final_rmse

Out[116]:

47730.22690385927

테스트 RMSE에 대한 95% 신뢰 구간을 계산할 수 있습니다:

In [117]:

from scipy import stats

In [118]:

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# `mean` and `m` are reused by the manual computations in the following cells.
mean = squared_errors.mean()
m = len(squared_errors)

# t-interval around the mean squared error (m - 1 degrees of freedom); the
# square root converts both bounds to the RMSE scale.
np.sqrt(stats.t.interval(confidence, m - 1,
                         loc=np.mean(squared_errors),
                         scale=stats.sem(squared_errors)))

Out[118]:

array([45685.10470776, 49691.25001878])

다음과 같이 수동으로 계산할 수도 있습니다:

In [119]:

# Two-sided critical value of the t distribution for the chosen confidence.
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
# Margin = critical value × standard error of the mean squared error.
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

Out[119]:

(45685.10470776014, 49691.25001877871)

또는 t 점수 대신 z 점수를 사용할 수도 있습니다:

In [120]:

# Same computation with the normal-distribution critical value in place of
# the t-distribution one.
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)

Out[120]:

(45685.717918136594, 49690.68623889426)

추가 내용¶

전처리와 예측을 포함한 파이프라인¶

In [121]:

# Chain preprocessing and the regressor so raw DataFrame rows can be passed
# straight to fit() and predict().
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("linear", LinearRegression())
    ])

# NOTE: fitting this pipeline also refits the shared full_pipeline object.
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)

Out[121]:

array([210644.60459286, 317768.80697211, 210956.43331178,  59218.98886849,
       189747.55849879])

joblib을 사용한 모델 저장¶

In [122]:

my_model = full_pipeline_with_predictor

In [123]:

# sklearn.externals.joblib is deprecated since scikit-learn 0.21 and removed
# in 0.23, so import the standalone joblib package and only fall back to the
# bundled copy on installations that do not have it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline to disk and load it back.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
joblib.dump(my_model, "my_model.pkl") # DIFF
#...
my_model_loaded = joblib.load("my_model.pkl") # DIFF

/home/haesun/anaconda3/envs/handson-ml/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)

`RandomizedSearchCV`을 위한 Scipy 분포 함수¶

In [124]:

from scipy.stats import geom, expon
# Draw 10,000 samples from a geometric and from an exponential distribution
# (fixed seeds) and show a 50-bin histogram of each.
geom_distrib=geom(0.5).rvs(10000, random_state=42)
expon_distrib=expon(scale=1).rvs(10000, random_state=42)
plt.hist(geom_distrib, bins=50)
plt.show()
plt.hist(expon_distrib, bins=50)
plt.show()

연습문제 해답¶

1.¶

질문: 서포트 벡터 머신 회귀(sklearn.svm.SVR)를 kernel="linear"(하이퍼파라미터 C를 바꿔가며)나 kernel="rbf"(하이퍼파라미터 C와 gamma를 바꿔가며) 등의 다양한 하이퍼파라미터 설정으로 시도해보세요. 지금은 이 하이퍼파라미터가 무엇을 의미하는지 너무 신경 쓰지 마세요. 최상의 SVR 모델은 무엇인가요?

In [127]:

from sklearn.model_selection import GridSearchCV

# Two sub-grids, one per kernel: 8 values of C for the linear kernel, and
# 7 values of C x 6 values of gamma = 42 combinations for the RBF kernel,
# i.e. 50 candidates in total.
linear_kernel_grid = {
    'kernel': ['linear'],
    'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0],
}
rbf_kernel_grid = {
    'kernel': ['rbf'],
    'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
}
param_grid = [linear_kernel_grid, rbf_kernel_grid]

svm_reg = SVR()
# 5-fold cross-validation scored by negative MSE; verbose=2 logs every fit and
# n_jobs=1 runs the fits one at a time.
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=1,
)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=10.0, kernel=linear ...........................................

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

[CV] ............................ C=10.0, kernel=linear, total=   4.3s
[CV] C=10.0, kernel=linear ...........................................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s remaining:    0.0s

[CV] ............................ C=10.0, kernel=linear, total=   4.4s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   4.3s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   4.3s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   4.3s
[CV] C=30.0, kernel=linear ...........................................
[CV] ............................ C=30.0, kernel=linear, total=   4.3s
[CV] C=30.0, kernel=linear ...........................................
[CV] ............................ C=30.0, kernel=linear, total=   4.3s
[CV] C=30.0, kernel=linear ...........................................
[CV] ............................ C=30.0, kernel=linear, total=   4.4s
[CV] C=30.0, kernel=linear ...........................................
[CV] ............................ C=30.0, kernel=linear, total=   4.4s
[CV] C=30.0, kernel=linear ...........................................
[CV] ............................ C=30.0, kernel=linear, total=   4.3s
[CV] C=100.0, kernel=linear ..........................................
[CV] ........................... C=100.0, kernel=linear, total=   4.3s
[CV] C=100.0, kernel=linear ..........................................
[CV] ........................... C=100.0, kernel=linear, total=   4.3s
[CV] C=100.0, kernel=linear ..........................................
[CV] ........................... C=100.0, kernel=linear, total=   4.4s
[CV] C=100.0, kernel=linear ..........................................
[CV] ........................... C=100.0, kernel=linear, total=   4.3s
[CV] C=100.0, kernel=linear ..........................................
[CV] ........................... C=100.0, kernel=linear, total=   4.2s
[CV] C=300.0, kernel=linear ..........................................
[CV] ........................... C=300.0, kernel=linear, total=   4.3s
[CV] C=300.0, kernel=linear ..........................................
[CV] ........................... C=300.0, kernel=linear, total=   4.3s
[CV] C=300.0, kernel=linear ..........................................
[CV] ........................... C=300.0, kernel=linear, total=   4.4s
[CV] C=300.0, kernel=linear ..........................................
[CV] ........................... C=300.0, kernel=linear, total=   4.4s
[CV] C=300.0, kernel=linear ..........................................
[CV] ........................... C=300.0, kernel=linear, total=   4.3s
[CV] C=1000.0, kernel=linear .........................................
[CV] .......................... C=1000.0, kernel=linear, total=   4.5s
[CV] C=1000.0, kernel=linear .........................................
[CV] .......................... C=1000.0, kernel=linear, total=   4.5s
[CV] C=1000.0, kernel=linear .........................................
[CV] .......................... C=1000.0, kernel=linear, total=   4.5s
[CV] C=1000.0, kernel=linear .........................................
[CV] .......................... C=1000.0, kernel=linear, total=   4.5s
[CV] C=1000.0, kernel=linear .........................................
[CV] .......................... C=1000.0, kernel=linear, total=   4.4s
[CV] C=3000.0, kernel=linear .........................................
[CV] .......................... C=3000.0, kernel=linear, total=   4.8s
[CV] C=3000.0, kernel=linear .........................................
[CV] .......................... C=3000.0, kernel=linear, total=   4.8s
[CV] C=3000.0, kernel=linear .........................................
[CV] .......................... C=3000.0, kernel=linear, total=   4.9s
[CV] C=3000.0, kernel=linear .........................................
[CV] .......................... C=3000.0, kernel=linear, total=   4.9s
[CV] C=3000.0, kernel=linear .........................................
[CV] .......................... C=3000.0, kernel=linear, total=   4.7s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......................... C=10000.0, kernel=linear, total=   6.3s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......................... C=10000.0, kernel=linear, total=   6.4s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......................... C=10000.0, kernel=linear, total=   6.5s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......................... C=10000.0, kernel=linear, total=   6.0s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......................... C=10000.0, kernel=linear, total=   5.8s
[CV] C=30000.0, kernel=linear ........................................
[CV] ......................... C=30000.0, kernel=linear, total=   9.9s
[CV] C=30000.0, kernel=linear ........................................
[CV] ......................... C=30000.0, kernel=linear, total=  10.1s
[CV] C=30000.0, kernel=linear ........................................
[CV] ......................... C=30000.0, kernel=linear, total=  10.5s
[CV] C=30000.0, kernel=linear ........................................
[CV] ......................... C=30000.0, kernel=linear, total=  10.1s
[CV] C=30000.0, kernel=linear ........................................
[CV] ......................... C=30000.0, kernel=linear, total=   9.1s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.03, kernel=rbf, total=   8.9s
[CV] C=1.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=1.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=1.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=1.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=1.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=1.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=1.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=1.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=1.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=1.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=1.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=1.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=1.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=1.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=1.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=1.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=1.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=1.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=3.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.03, kernel=rbf, total=   8.9s
[CV] C=3.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.03, kernel=rbf ...................................
[CV] .................... C=3.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=3.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=3.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.1, kernel=rbf, total=   8.8s
[CV] C=3.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=3.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=3.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=3.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=3.0, gamma=0.3, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=3.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=3.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=3.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=3.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=3.0, gamma=1.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=3.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=3.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=3.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=3.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=3.0, gamma=3.0, kernel=rbf ....................................
[CV] ..................... C=3.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=10.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=10.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=10.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=10.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=10.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=10.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=10.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=10.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=10.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=10.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=10.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=10.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=10.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=10.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=10.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=10.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=0.3, kernel=rbf, total=   8.6s
[CV] C=10.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=10.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=10.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=10.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=10.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=10.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=10.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=10.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=10.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=10.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=10.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=30.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=30.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.01, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.01, kernel=rbf, total=   8.9s
[CV] C=30.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.03, kernel=rbf ..................................
[CV] ................... C=30.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=30.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=30.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=30.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=30.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=30.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=30.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=30.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=30.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=30.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=30.0, gamma=0.3, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=30.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=30.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=30.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=1.0, kernel=rbf, total=   8.4s
[CV] C=30.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=30.0, gamma=1.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=30.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=30.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=30.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=30.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=30.0, gamma=3.0, kernel=rbf ...................................
[CV] .................... C=30.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=100.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=100.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=100.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=100.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=100.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.01, kernel=rbf, total=   8.8s
[CV] C=100.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.03, kernel=rbf, total=   8.7s
[CV] C=100.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.03, kernel=rbf, total=   8.7s
[CV] C=100.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.03, kernel=rbf, total=   8.7s
[CV] C=100.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.03, kernel=rbf, total=   8.7s
[CV] C=100.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=100.0, gamma=0.03, kernel=rbf, total=   8.7s
[CV] C=100.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.1, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.1, kernel=rbf, total=   8.6s
[CV] C=100.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.1, kernel=rbf, total=   8.6s
[CV] C=100.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.1, kernel=rbf, total=   8.6s
[CV] C=100.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.1, kernel=rbf, total=   8.6s
[CV] C=100.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=0.3, kernel=rbf, total=   8.5s
[CV] C=100.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=100.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=100.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=100.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=100.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=100.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=100.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=100.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=100.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=100.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=100.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=300.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.01, kernel=rbf, total=   8.7s
[CV] C=300.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.01, kernel=rbf, total=   8.7s
[CV] C=300.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.01, kernel=rbf, total=   8.7s
[CV] C=300.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.01, kernel=rbf, total=   8.7s
[CV] C=300.0, gamma=0.01, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.01, kernel=rbf, total=   8.7s
[CV] C=300.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.03, kernel=rbf, total=   8.5s
[CV] C=300.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.03, kernel=rbf, total=   8.5s
[CV] C=300.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.03, kernel=rbf, total=   8.5s
[CV] C=300.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.03, kernel=rbf, total=   8.5s
[CV] C=300.0, gamma=0.03, kernel=rbf .................................
[CV] .................. C=300.0, gamma=0.03, kernel=rbf, total=   8.5s
[CV] C=300.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.1, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.1, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.1, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.1, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.1, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.3, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.3, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.3, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.3, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=0.3, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=0.3, kernel=rbf, total=   8.4s
[CV] C=300.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=1.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=1.0, kernel=rbf, total=   8.3s
[CV] C=300.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=300.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=300.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=3.0, kernel=rbf, total=   9.0s
[CV] C=300.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=300.0, gamma=3.0, kernel=rbf ..................................
[CV] ................... C=300.0, gamma=3.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total=   8.4s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total=   8.5s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total=   8.4s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total=   8.4s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.01, kernel=rbf, total=   8.6s
[CV] C=1000.0, gamma=0.03, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total=   8.6s
[CV] C=1000.0, gamma=0.03, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1000.0, gamma=0.03, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total=   8.9s
[CV] C=1000.0, gamma=0.03, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1000.0, gamma=0.03, kernel=rbf ................................
[CV] ................. C=1000.0, gamma=0.03, kernel=rbf, total=   8.8s
[CV] C=1000.0, gamma=0.1, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total=   8.7s
[CV] C=1000.0, gamma=0.1, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total=   8.6s
[CV] C=1000.0, gamma=0.1, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total=   8.9s
[CV] C=1000.0, gamma=0.1, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total=   8.9s
[CV] C=1000.0, gamma=0.1, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.1, kernel=rbf, total=   9.6s
[CV] C=1000.0, gamma=0.3, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total=   9.5s
[CV] C=1000.0, gamma=0.3, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=0.3, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=0.3, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=0.3, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.3, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=   9.1s
[CV] C=1000.0, gamma=3.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total=   9.9s
[CV] C=1000.0, gamma=3.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total=   9.9s
[CV] C=1000.0, gamma=3.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total=   9.9s
[CV] C=1000.0, gamma=3.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total=   9.9s
[CV] C=1000.0, gamma=3.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=3.0, kernel=rbf, total=   9.9s

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 34.2min finished

Out[127]:

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=1,
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=2)

최상 모델의 (5-폴드 교차 검증으로 평가한) 점수는 다음과 같습니다:

In [128]:

# The search was run with scoring='neg_mean_squared_error', so best_score_ is a
# negated MSE; flip the sign and take the square root to express it as an RMSE.
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
# Bare last expression so the notebook displays the value.
rmse

Out[128]:

70363.90313964167

이 결과는 RandomForestRegressor보다 훨씬 나쁩니다. 최상의 하이퍼파라미터를 확인해 보겠습니다:

In [129]:

# Display the hyperparameter combination selected as best by the grid search.
grid_search.best_params_

Out[129]:

{'C': 30000.0, 'kernel': 'linear'}

선형 커널이 RBF 커널보다 성능이 나은 것 같습니다. C는 테스트한 것 중에 최대값이 선택되었습니다. 따라서 (작은 값들은 지우고) 더 큰 값의 C로 그리드서치를 다시 실행해 보아야 합니다. 아마도 더 큰 값의 C에서 성능이 높아질 것입니다.

2.¶

질문: GridSearchCV를 RandomizedSearchCV로 바꿔보세요.

In [132]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# For expon(), reciprocal() and the other probability distributions, see
# https://docs.scipy.org/doc/scipy/reference/stats.html

# Note: gamma is ignored when the kernel parameter is "linear".
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}

# Randomized search over the distributions above: 50 sampled candidates,
# each scored with 5-fold cross-validation on negated MSE.
svm_reg = SVR()
rnd_search = RandomizedSearchCV(
    estimator=svm_reg,
    param_distributions=param_distribs,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   5.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.0s remaining:    0.0s

[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   5.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   5.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   5.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   5.0s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, total=  10.8s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total=  10.3s
[CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........
[CV]  C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total=  10.3s
[CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........
[CV]  C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total=  10.3s
[CV] C=24.17508294611391, gamma=3.503557475158312, kernel=rbf ........
[CV]  C=24.17508294611391, gamma=3.503557475158312, kernel=rbf, total=  10.3s
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ...
[CV]  C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total=   9.2s
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ...
[CV]  C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total=   9.2s
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ...
[CV]  C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total=   9.2s
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ...
[CV]  C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total=   9.2s
[CV] C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf ...
[CV]  C=113564.03940586245, gamma=0.0007790692366582295, kernel=rbf, total=   9.2s
[CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ......
[CV]  C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total=   9.3s
[CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ......
[CV]  C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total=   9.3s
[CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ......
[CV]  C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total=   9.2s
[CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ......
[CV]  C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total=   9.3s
[CV] C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf ......
[CV]  C=108.30488238805073, gamma=0.3627537294604771, kernel=rbf, total=   9.3s
[CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear .
[CV]  C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total=   4.9s
[CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear .
[CV]  C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total=   4.9s
[CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear .
[CV]  C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total=   4.9s
[CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear .
[CV]  C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total=   4.9s
[CV] C=21.344953672647435, gamma=0.023332523598323388, kernel=linear .
[CV]  C=21.344953672647435, gamma=0.023332523598323388, kernel=linear, total=   4.9s
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ......
[CV]  C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total=   9.1s
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ......
[CV]  C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total=   9.1s
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ......
[CV]  C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total=   9.1s
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ......
[CV]  C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total=   9.0s
[CV] C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf ......
[CV]  C=5603.270317432516, gamma=0.15023452872733867, kernel=rbf, total=   9.0s
[CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf .....
[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total=  20.7s
[CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf .....
[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total=  21.6s
[CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf .....
[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total=  24.8s
[CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf .....
[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total=  20.3s
[CV] C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf .....
[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, total=  22.4s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total=  10.3s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total=  10.8s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total=  11.4s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total=  10.4s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, total=   9.3s
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear ....
[CV]  C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total=  42.8s
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear ....
[CV]  C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total=  35.3s
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear ....
[CV]  C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total=  41.3s
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear ....
[CV]  C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total=  35.9s
[CV] C=171377.39570378003, gamma=0.628789100540856, kernel=linear ....
[CV]  C=171377.39570378003, gamma=0.628789100540856, kernel=linear, total=  28.6s
[CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ...
[CV]  C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total=   5.9s
[CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ...
[CV]  C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total=   6.1s
[CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ...
[CV]  C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total=   6.2s
[CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ...
[CV]  C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total=   5.9s
[CV] C=5385.293820172355, gamma=0.18696125197741642, kernel=linear ...
[CV]  C=5385.293820172355, gamma=0.18696125197741642, kernel=linear, total=   6.0s
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........
[CV]  C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total=   9.8s
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........
[CV]  C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total=   9.8s
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........
[CV]  C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total=   9.8s
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........
[CV]  C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total=   9.8s
[CV] C=22.59903216621323, gamma=2.850796878935603, kernel=rbf ........
[CV]  C=22.59903216621323, gamma=2.850796878935603, kernel=rbf, total=   9.8s
[CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear ....
[CV]  C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total=  12.1s
[CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear ....
[CV]  C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total=  11.9s
[CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear ....
[CV]  C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total=  12.5s
[CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear ....
[CV]  C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total=  12.1s
[CV] C=34246.75194632794, gamma=0.3632878599687583, kernel=linear ....
[CV]  C=34246.75194632794, gamma=0.3632878599687583, kernel=linear, total=  11.0s
[CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf .......
[CV]  C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total=   9.3s
[CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf .......
[CV]  C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total=   9.3s
[CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf .......
[CV]  C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total=   9.3s
[CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf .......
[CV]  C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total=   9.2s
[CV] C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf .......
[CV]  C=167.7278956080511, gamma=0.2757870542258224, kernel=rbf, total=   9.3s
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear ....
[CV]  C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total=   4.9s
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear ....
[CV]  C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total=   4.9s
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear ....
[CV]  C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total=   4.9s
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear ....
[CV]  C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total=   4.9s
[CV] C=61.54360542501371, gamma=0.6835472281341501, kernel=linear ....
[CV]  C=61.54360542501371, gamma=0.6835472281341501, kernel=linear, total=   4.8s
[CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf .......
[CV]  C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total=   9.2s
[CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf .......
[CV]  C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total=   9.2s
[CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf .......
[CV]  C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total=   9.2s
[CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf .......
[CV]  C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total=   9.2s
[CV] C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf .......
[CV]  C=98.73897389920914, gamma=0.4960365360493639, kernel=rbf, total=   9.2s
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ......
[CV]  C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total=   9.1s
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ......
[CV]  C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total=   9.1s
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ......
[CV]  C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total=   9.2s
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ......
[CV]  C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total=   9.1s
[CV] C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf ......
[CV]  C=8935.505635947808, gamma=0.37354658165762367, kernel=rbf, total=   9.1s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total=   4.9s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total=   4.9s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total=   4.9s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total=   4.9s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, total=   4.8s
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 2.2min
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.6min
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.5min
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.9min
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, total= 1.9min
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear ....
[CV]  C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total=   5.0s
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear ....
[CV]  C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total=   5.0s
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear ....
[CV]  C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total=   5.0s
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear ....
[CV]  C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total=   5.0s
[CV] C=761.4316758498783, gamma=2.6126336514161914, kernel=linear ....
[CV]  C=761.4316758498783, gamma=2.6126336514161914, kernel=linear, total=   5.0s
[CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ...
[CV]  C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total=  23.5s
[CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ...
[CV]  C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total=  23.0s
[CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ...
[CV]  C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total=  36.9s
[CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ...
[CV]  C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total=  24.3s
[CV] C=97392.81883041795, gamma=0.09265545895311562, kernel=linear ...
[CV]  C=97392.81883041795, gamma=0.09265545895311562, kernel=linear, total=  20.1s
[CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear ....
[CV]  C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total=   5.5s
[CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear ....
[CV]  C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total=   5.6s
[CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear ....
[CV]  C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total=   5.4s
[CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear ....
[CV]  C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total=   5.6s
[CV] C=2423.0759984939164, gamma=3.248614270240346, kernel=linear ....
[CV]  C=2423.0759984939164, gamma=3.248614270240346, kernel=linear, total=   5.3s
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear ....
[CV]  C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total=   5.1s
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear ....
[CV]  C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total=   5.0s
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear ....
[CV]  C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total=   5.1s
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear ....
[CV]  C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total=   5.1s
[CV] C=717.3632997255095, gamma=0.3165604432088257, kernel=linear ....
[CV]  C=717.3632997255095, gamma=0.3165604432088257, kernel=linear, total=   5.0s
[CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf .......
[CV]  C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total=  10.3s
[CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf .......
[CV]  C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total=  10.3s
[CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf .......
[CV]  C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total=  10.3s
[CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf .......
[CV]  C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total=  10.3s
[CV] C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf .......
[CV]  C=4446.667521184072, gamma=3.3597284456608496, kernel=rbf, total=  10.3s
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ...
[CV]  C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total=   5.5s
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ...
[CV]  C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total=   5.8s
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ...
[CV]  C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total=   5.8s
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ...
[CV]  C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total=   5.5s
[CV] C=2963.564121207815, gamma=0.15189814782062885, kernel=linear ...
[CV]  C=2963.564121207815, gamma=0.15189814782062885, kernel=linear, total=   5.5s
[CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ...
[CV]  C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total=   4.8s
[CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ...
[CV]  C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total=   4.9s
[CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ...
[CV]  C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total=   4.9s
[CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ...
[CV]  C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total=   4.9s
[CV] C=91.64267381686706, gamma=0.01575994483585621, kernel=linear ...
[CV]  C=91.64267381686706, gamma=0.01575994483585621, kernel=linear, total=   4.8s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total=   9.6s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total=   9.6s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total=   9.6s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total=   9.6s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, total=   9.5s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total=   9.5s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total=   9.4s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total=   9.5s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total=   9.4s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, total=   9.5s
[CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ...
[CV]  C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total=   7.9s
[CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ...
[CV]  C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total=   8.4s
[CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ...
[CV]  C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total=   8.5s
[CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ...
[CV]  C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total=   8.7s
[CV] C=16483.850529752886, gamma=1.4752145260435134, kernel=linear ...
[CV]  C=16483.850529752886, gamma=1.4752145260435134, kernel=linear, total=   7.5s
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf .......
[CV]  C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total=  41.1s
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf .......
[CV]  C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total=  39.4s
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf .......
[CV]  C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total=  50.7s
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf .......
[CV]  C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total=  52.8s
[CV] C=101445.66881340064, gamma=1.052904084582266, kernel=rbf .......
[CV]  C=101445.66881340064, gamma=1.052904084582266, kernel=rbf, total=  43.7s
[CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf .......
[CV]  C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total=  17.0s
[CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf .......
[CV]  C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total=  17.2s
[CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf .......
[CV]  C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total=  16.9s
[CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf .......
[CV]  C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total=  18.8s
[CV] C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf .......
[CV]  C=56681.80859029545, gamma=0.9763011917123741, kernel=rbf, total=  17.9s
[CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total=   9.3s
[CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total=   9.3s
[CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total=   9.3s
[CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total=   9.3s
[CV] C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=48.15822390928914, gamma=0.4633351167983427, kernel=rbf, total=   9.3s
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf .......
[CV]  C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total=   9.1s
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf .......
[CV]  C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total=   9.1s
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf .......
[CV]  C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total=   9.1s
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf .......
[CV]  C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total=   9.1s
[CV] C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf .......
[CV]  C=399.7268155705774, gamma=1.3078757839577408, kernel=rbf, total=   9.1s
[CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ...
[CV]  C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total=   4.9s
[CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ...
[CV]  C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total=   4.9s
[CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ...
[CV]  C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total=   5.0s
[CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ...
[CV]  C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total=   5.0s
[CV] C=251.14073886281363, gamma=0.8238105204914145, kernel=linear ...
[CV]  C=251.14073886281363, gamma=0.8238105204914145, kernel=linear, total=   4.9s
[CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear ....
[CV]  C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total=   4.9s
[CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear ....
[CV]  C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total=   4.9s
[CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear ....
[CV]  C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total=   5.2s
[CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear ....
[CV]  C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total=   5.9s
[CV] C=60.17373642891687, gamma=1.2491263443165994, kernel=linear ....
[CV]  C=60.17373642891687, gamma=1.2491263443165994, kernel=linear, total=   5.7s
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ......
[CV]  C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total=   9.8s
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ......
[CV]  C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total=   9.7s
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ......
[CV]  C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total=   9.6s
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ......
[CV]  C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total=   9.3s
[CV] C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf ......
[CV]  C=15415.161544891856, gamma=0.2691677514619319, kernel=rbf, total=   9.3s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total=   5.3s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total=   5.3s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total=   5.3s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total=   5.3s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, total=   5.2s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total=   4.9s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total=   4.9s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total=   4.9s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total=   4.9s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, total=   4.8s
[CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ......
[CV]  C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total=   9.2s
[CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ......
[CV]  C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total=   9.2s
[CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ......
[CV]  C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total=   9.2s
[CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ......
[CV]  C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total=   9.2s
[CV] C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf ......
[CV]  C=26.714480823948186, gamma=1.0117295509275495, kernel=rbf, total=   9.2s
[CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ...
[CV]  C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total=   5.9s
[CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ...
[CV]  C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total=   5.7s
[CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ...
[CV]  C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total=   5.8s
[CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ...
[CV]  C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total=   5.7s
[CV] C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear ...
[CV]  C=3582.0552780489566, gamma=1.1891370222133257, kernel=linear, total=   5.5s
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear ....
[CV]  C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total=   4.8s
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear ....
[CV]  C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total=   4.8s
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear ....
[CV]  C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total=   4.9s
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear ....
[CV]  C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total=   4.9s
[CV] C=198.7004781812736, gamma=0.5282819748826726, kernel=linear ....
[CV]  C=198.7004781812736, gamma=0.5282819748826726, kernel=linear, total=   4.8s
[CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total=   4.9s
[CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total=   4.8s
[CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total=   4.9s
[CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total=   4.9s
[CV] C=129.8000604143307, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=129.8000604143307, gamma=2.8621383676481322, kernel=linear, total=   4.8s
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ......
[CV]  C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total=   9.2s
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ......
[CV]  C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total=   9.2s
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ......
[CV]  C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total=   9.2s
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ......
[CV]  C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total=   9.2s
[CV] C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf ......
[CV]  C=288.4269299593897, gamma=0.17580835850006285, kernel=rbf, total=   9.2s
[CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear ....
[CV]  C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total=   6.2s
[CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear ....
[CV]  C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total=   6.2s
[CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear ....
[CV]  C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total=   6.4s
[CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear ....
[CV]  C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total=   6.3s
[CV] C=6287.039489427172, gamma=0.3504567255332862, kernel=linear ....
[CV]  C=6287.039489427172, gamma=0.3504567255332862, kernel=linear, total=   6.0s
[CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf .......
[CV]  C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total=  31.8s
[CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf .......
[CV]  C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total=  36.3s
[CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf .......
[CV]  C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total=  34.2s
[CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf .......
[CV]  C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total=  34.8s
[CV] C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf .......
[CV]  C=61217.04421344494, gamma=1.6279689407405564, kernel=rbf, total=  32.6s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total=   9.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total=   9.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total=   9.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total=   9.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, total=   9.4s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total=  12.1s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total=  11.8s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total=  10.9s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total=  12.4s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, total=  11.2s
[CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear ....
[CV]  C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total=  29.3s
[CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear ....
[CV]  C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total=  21.7s
[CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear ....
[CV]  C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total=  32.3s
[CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear ....
[CV]  C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total=  24.0s
[CV] C=84789.82947739525, gamma=0.3176359085304841, kernel=linear ....
[CV]  C=84789.82947739525, gamma=0.3176359085304841, kernel=linear, total=  18.3s

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 53.2min finished

Out[132]:

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=50, n_jobs=1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474838d278>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f474838d128>,
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=2)

최상 모델의 (5-폴드 교차 검증으로 평가한) 점수는 다음과 같습니다:

In [133]:

# The search was scored with 'neg_mean_squared_error', so best_score_ holds a
# negated MSE; flip the sign before taking the square root to obtain the RMSE.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

Out[133]:

54767.99053704409

이제 RandomForestRegressor의 성능에 훨씬 가까워졌습니다(하지만 아직 차이가 납니다). 최상의 하이퍼파라미터를 확인해 보겠습니다:

In [134]:

# Hyperparameter combination that achieved the best cross-validated score.
rnd_search.best_params_

Out[134]:

{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

이번에는 RBF 커널에 대해 최적의 하이퍼파라미터 조합을 찾았습니다. 보통 랜덤서치가 같은 시간 안에 그리드서치보다 더 좋은 하이퍼파라미터를 찾습니다.

여기서 사용된 scale=1.0인 지수 분포를 살펴보겠습니다. 일부 샘플은 1.0보다 아주 크거나 작습니다. 하지만 로그 분포를 보면 대부분의 값이 exp(-2)와 exp(+2), 즉 0.1과 7.4 사이에 집중되어 있음을 알 수 있습니다.

In [135]:

# Draw 10,000 samples from an exponential distribution (scale=1.0) and plot
# their histogram next to the histogram of their natural logarithm.
expon_distrib = expon(scale=1.)
samples = expon_distrib.rvs(10000, random_state=42)

fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))
ax_raw.set_title("Exponential distribution (scale=1.0)")
ax_raw.hist(samples, bins=50)
ax_log.set_title("Log of this distribution")
ax_log.hist(np.log(samples), bins=50)
plt.show()

C에 사용된 분포는 매우 다릅니다. 샘플의 스케일(로그 값)이 주어진 범위 안에서 균등 분포로 샘플링됩니다. 그래서 오른쪽 로그 분포가 거의 일정하게 나타납니다. 이런 분포는 원하는 스케일이 정확히 무엇인지 모를 때 사용하면 좋습니다:

In [136]:

# Draw 10,000 samples from the reciprocal (log-uniform) distribution bounded
# by 20 and 200000, then plot the raw histogram next to the histogram of the
# natural logarithm of the samples.
reciprocal_distrib = reciprocal(20, 200000)
samples = reciprocal_distrib.rvs(10000, random_state=42)

fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))
# The title states the actual bounds; this distribution is not built with
# scale=1.0 (that label belonged to the exponential plot).
ax_raw.set_title("Reciprocal distribution (a=20, b=200000)")
ax_raw.hist(samples, bins=50)
ax_log.set_title("Log of this distribution")
ax_log.hist(np.log(samples), bins=50)
plt.show()

reciprocal() 함수는 하이퍼파라미터의 스케일에 대해 전혀 감을 잡을 수 없을 때 사용합니다(오른쪽 그래프에서 볼 수 있듯이 주어진 범위 안에서 모든 스케일이 동일한 확률로 나타납니다). 반면 지수 분포는 하이퍼파라미터의 스케일을 (어느 정도) 알고 있을 때 사용하는 것이 좋습니다.

3.¶

질문: 가장 중요한 특성을 선택하는 변환기를 준비 파이프라인에 추가해보세요.

In [137]:

from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the indices of the ``k`` largest values of ``arr``, sorted ascending.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of indices to return; must satisfy ``0 <= k <= len(arr)``.

    Returns
    -------
    numpy.ndarray
        Integer indices in increasing order (not ordered by score).

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by ``np.argpartition``) if ``k``
        exceeds the number of elements.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))
    if k == 0:
        # Without this guard the slice below would be `[-0:]`, i.e. `[0:]`,
        # which selects every index instead of none.
        return np.array([], dtype=np.intp)
    # argpartition moves the k largest entries into the last k slots without
    # fully sorting; the final sort only orders the k selected indices.
    return np.sort(np.argpartition(np.asarray(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns of the ``k`` most important features.

    The importances are supplied by the caller; nothing is estimated from the
    data passed to ``fit``.

    Parameters
    ----------
    feature_importances : array-like
        Importance scores; expected to have one entry per column of ``X``,
        because the selected indices are used as column positions in
        ``transform``.
    k : int
        Number of columns to keep.

    Attributes
    ----------
    feature_indices_ : numpy.ndarray
        Ascending column indices of the ``k`` largest importances; set by ``fit``.
    """
    def __init__(self, feature_importances, k):
        # Arguments are stored unmodified under their own names, which lets
        # `k` be tuned from a parameter grid (e.g. 'feature_selection__k').
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        """Compute the indices of the top-``k`` importances; ``X`` and ``y`` are not used."""
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        """Return the columns of ``X`` chosen by ``fit``.

        ``X`` is indexed as ``X[:, indices]``, so it must support 2-D
        positional indexing (e.g. a NumPy array).
        """
        return X[:, self.feature_indices_]

노트: 이 특성 선택 클래스는 이미 어떤 식으로든 특성 중요도를 계산했다고 가정합니다(가령 RandomForestRegressor를 사용하여). TopFeatureSelector의 fit() 메서드에서 직접 계산할 수도 있지만 (캐싱을 사용하지 않을 경우) 이렇게 하면 그리드서치나 랜덤서치의 모든 하이퍼파라미터 조합에 대해 계산이 일어나기 때문에 매우 느려집니다.

선택할 특성의 개수를 지정합니다:

In [138]:

# Number of top-ranked features the selector should keep.
k = 5

최상의 k개 특성의 인덱스를 확인해 보겠습니다:

In [139]:

# Column positions (in ascending order) of the k largest feature importances.
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

Out[139]:

array([ 0,  1,  7,  9, 12])

In [140]:

# Translate the selected positions into feature names.
# NOTE(review): assumes `attributes` is ordered like `feature_importances`;
# both are defined earlier in the notebook — confirm.
np.array(attributes)[top_k_feature_indices]

Out[140]:

array(['longitude', 'latitude', 'median_income', 'pop_per_hhold',
       'INLAND'], dtype='<U18')

최상의 k개 특성이 맞는지 다시 확인합니다:

In [141]:

# Cross-check: rank (importance, name) pairs from most to least important and
# keep the first k. The names should match the selection above, although the
# order differs (by importance here, by column index there).
sorted(zip(feature_importances, attributes), reverse=True)[:k]

Out[141]:

[(0.36615898061813423, 'median_income'),
 (0.16478099356159054, 'INLAND'),
 (0.10879295677551575, 'pop_per_hhold'),
 (0.07334423551601243, 'longitude'),
 (0.06290907048262032, 'latitude')]

좋습니다. 이제 이전에 정의한 준비 파이프라인과 특성 선택기를 추가한 새로운 파이프라인을 만듭니다:

In [142]:

# Chain the existing preparation pipeline with the top-k selector so that the
# output contains only the k most important prepared columns.
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [143]:

# Fit both steps on `housing` and keep the reduced (k-column) feature matrix.
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

처음 3개 샘플의 특성을 확인해 보겠습니다:

In [144]:

# First three rows of the reduced feature matrix.
housing_prepared_top_k_features[0:3]

Out[144]:

array([[-1.15604281,  0.77194962, -0.61493744, -0.08649871,  0.        ],
       [-1.17602483,  0.6596948 ,  1.33645936, -0.03353391,  0.        ],
       [ 1.18684903, -1.34218285, -0.5320456 , -0.09240499,  0.        ]])

최상의 k개 특성이 맞는지 다시 확인합니다:

In [145]:

# Same rows taken directly from the full prepared matrix, restricted to the
# selected columns; this should equal the pipeline output shown above.
housing_prepared[0:3, top_k_feature_indices]

Out[145]:

array([[-1.15604281,  0.77194962, -0.61493744, -0.08649871,  0.        ],
       [-1.17602483,  0.6596948 ,  1.33645936, -0.03353391,  0.        ],
       [ 1.18684903, -1.34218285, -0.5320456 , -0.09240499,  0.        ]])

성공입니다! :)

4.¶

질문: 전체 데이터 준비 과정과 최종 예측을 하나의 파이프라인으로 만들어보세요.

In [146]:

# End-to-end pipeline: data preparation, top-k feature selection, then an SVR
# configured with the best hyperparameters found by the randomized search.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

In [147]:

# Fit every step of the pipeline, in order, on `housing` and its labels.
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

Out[147]:

Pipeline(memory=None,
         steps=[('preparation',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('selector',
                                                                   DataFrameSelector(attribute_names=['longitude',
                                                                                                      'latitude',
                                                                                                      'housing_median_age',
                                                                                                      'total_rooms',
                                                                                                      'total_bedrooms',
                                                                                                      'population',
                                                                                                      'households',
                                                                                                      'median_...
       1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01,
       5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02,
       1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03]),
                                    k=5)),
                ('svm_reg',
                 SVR(C=157055.10989448498, cache_size=200, coef0=0.0, degree=3,
                     epsilon=0.1, gamma=0.26497040005002437, kernel='rbf',
                     max_iter=-1, shrinking=True, tol=0.001, verbose=False))],
         verbose=False)

몇 개의 샘플에 전체 파이프라인을 적용해 보겠습니다:

In [148]:

# Smoke test: run the fitted end-to-end pipeline on the first four rows of
# `housing` and print the predictions next to the matching labels.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("예측:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("레이블:\t\t", list(some_labels))

예측:	 [203214.28978849 371846.88152572 173295.65441612  47328.3970888 ]
레이블:		 [286600.0, 340600.0, 196900.0, 46300.0]

전체 파이프라인이 잘 작동하는 것 같습니다. 물론 예측 성능이 아주 좋지는 않습니다. SVR보다 RandomForestRegressor가 더 나은 것 같습니다.

5.¶

질문: GridSearchCV를 사용해 준비 단계의 옵션을 자동으로 탐색해보세요.

사이킷런 0.20 버전에서 GridSearchCV의 n_jobs 매개변수를 -1로 했을 때 에러가 발생하는 경우가 있습니다(https://github.com/scikit-learn/scikit-learn/issues/12250). 에러가 해결되기 전까지 매개변수를 1로 설정합니다.

In [149]:

# Search over preparation options rather than model hyperparameters: the
# imputation strategy of the numerical pipeline and the number of features
# kept by the selector (every value from 1 up to the number of importances).
# NOTE(review): `feature_importances` was computed before this search; if it
# was derived from the full training set, each CV fold's selector has seen the
# validation rows — confirm whether that matters here.
param_grid = [{
    'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1)),
}]

# n_jobs stays at 1 because of scikit-learn issue #12250.
grid_search_prep = GridSearchCV(
    estimator=prepare_select_and_predict_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    verbose=2,
)
grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.3s remaining:    0.0s

[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=mean, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=median, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.3s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.2s
[CV] feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=1, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.2s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total=   6.4s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total=   6.4s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=mean, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total=   6.4s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=median, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.4s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.5s
[CV] feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=2, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.5s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total=   6.9s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total=   7.2s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total=   7.1s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total=   6.7s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean, total=   6.8s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total=   6.9s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total=   6.7s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total=   6.6s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total=   6.6s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=median, total=   6.6s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.7s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.7s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.6s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.7s
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=3, preparation__num_pipeline__imputer__strategy=most_frequent, total=   6.7s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total=   7.3s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total=   7.1s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total=   7.2s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total=   7.2s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=mean, total=   7.1s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total=   7.3s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total=   7.1s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total=   7.3s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total=   7.2s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=median, total=   7.1s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.3s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.1s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.3s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.2s
[CV] feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.1s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=mean, total=   7.4s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=median, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.5s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.6s
[CV] feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=5, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.4s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total=   7.9s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total=   7.6s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=mean, total=   7.9s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total=   7.9s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total=   7.6s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=median, total=   7.9s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.9s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.7s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.6s
[CV] feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=6, preparation__num_pipeline__imputer__strategy=most_frequent, total=   7.9s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total=   8.3s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total=   8.2s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total=   8.5s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total=   8.5s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=mean, total=   8.3s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total=   8.9s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total=   8.4s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total=   8.4s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total=   8.1s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=median, total=   8.2s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total=   8.8s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total=   8.3s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total=   8.6s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total=   8.2s
[CV] feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num_pipeline__imputer__strategy=most_frequent, total=   8.7s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total=   9.9s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total=   9.6s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total=  10.8s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total=  10.4s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=mean, total=  11.3s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total=  10.3s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total=   9.7s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total=  10.7s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total=  10.7s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=median, total=  10.5s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total=  10.1s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total=   9.9s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total=  10.6s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total=   9.7s
[CV] feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=8, preparation__num_pipeline__imputer__strategy=most_frequent, total=  10.4s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total=  14.3s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total=  14.5s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total=  13.6s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total=  13.7s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=mean, total=  12.6s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total=  14.2s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total=  14.4s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total=  12.2s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total=  13.8s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=median, total=  12.5s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total=  14.1s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total=  14.4s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total=  14.0s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total=  13.9s
[CV] feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=9, preparation__num_pipeline__imputer__strategy=most_frequent, total=  13.3s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total=  14.4s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total=  15.3s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total=  19.5s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total=  17.0s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=mean, total=  15.8s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total=  14.6s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total=  17.1s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total=  15.8s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total=  16.0s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=median, total=  14.9s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total=  14.7s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total=  15.4s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total=  14.7s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total=  16.0s
[CV] feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=10, preparation__num_pipeline__imputer__strategy=most_frequent, total=  16.4s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total=  19.0s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total=  17.2s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total=  17.5s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total=  18.6s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=mean, total=  17.9s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total=  16.1s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total=  15.8s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total=  16.2s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total=  19.0s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=median, total=  18.4s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total=  19.3s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total=  15.7s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total=  16.2s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total=  17.5s
[CV] feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=11, preparation__num_pipeline__imputer__strategy=most_frequent, total=  20.9s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total=  19.5s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total=  19.1s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total=  18.8s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total=  19.2s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=mean, total=  19.9s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total=  17.8s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total=  18.5s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total=  20.1s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total=  18.6s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=median, total=  18.6s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total=  17.3s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total=  18.4s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.8s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total=  17.7s
[CV] feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=12, preparation__num_pipeline__imputer__strategy=most_frequent, total=  18.0s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total=  23.0s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total=  21.0s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total=  22.5s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total=  20.6s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=mean, total=  17.4s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total=  18.8s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total=  22.2s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total=  23.2s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total=  22.6s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=median, total=  20.7s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total=  19.6s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.3s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.8s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.5s
[CV] feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=13, preparation__num_pipeline__imputer__strategy=most_frequent, total=  19.2s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total=  18.0s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total=  20.8s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total=  21.1s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total=  21.2s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=mean, total=  19.4s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total=  20.6s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total=  21.5s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total=  21.0s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total=  21.0s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=median, total=  19.1s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total=  20.8s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total=  18.5s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total=  19.4s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.6s
[CV] feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num_pipeline__imputer__strategy=most_frequent, total=  25.0s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total=  21.6s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total=  20.6s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total=  22.1s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total=  17.5s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=mean, total=  20.3s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total=  18.6s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total=  20.8s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total=  22.7s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total=  22.7s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=median, total=  21.0s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total=  22.3s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total=  22.3s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total=  22.9s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total=  18.8s
[CV] feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=15, preparation__num_pipeline__imputer__strategy=most_frequent, total=  22.3s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total=  20.9s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total=  21.3s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total=  20.1s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total=  21.4s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=mean, total=  18.4s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total=  19.2s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total=  21.2s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total=  20.2s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total=  17.5s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=median, total=  20.6s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total=  18.7s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.0s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total=  19.9s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total=  20.0s
[CV] feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent 
[CV]  feature_selection__k=16, preparation__num_pipeline__imputer__strategy=most_frequent, total=  21.3s

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 52.4min finished

Out[149]:

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preparation',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('selector',
                                                                                          DataFrameSelector(attribute_names=['longitude',
                                                                                                                             'latitude',
                                                                                                                             'housing_median_age',
                                                                                                                             'tota...
                                            kernel='rbf', max_iter=-1,
                                            shrinking=True, tol=0.001,
                                            verbose=False))],
                                verbose=False),
             iid='warn', n_jobs=1,
             param_grid=[{'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16],
                          'preparation__num_pipeline__imputer__strategy': ['mean',
                                                                           'median',
                                                                           'most_frequent']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=2)

In [150]:

grid_search_prep.best_params_

Out[150]:

{'feature_selection__k': 15,
 'preparation__num_pipeline__imputer__strategy': 'most_frequent'}

최상의 Imputer 전략(strategy)은 most_frequent이고 거의 모든 특성이 유용한 것으로 보입니다(16개 중 15개). 선택에서 제외된 마지막 특성(ISLAND)은 잡음만 더할 뿐인 것 같습니다.

축하합니다! 이제 머신러닝에 대해 꽤 많은 것을 알게 되었습니다. :)

설정¶

데이터 다운로드¶

데이터 이해를 위한 탐색과 시각화¶

머신러닝 알고리즘을 위한 데이터 준비¶

책에 실린 방법¶

future_encoders.py를 사용한 새로운 방법¶

다시 책의 내용이 이어집니다¶

future_encoders.py를 사용한 방법 ==========================¶

====================================================¶

future_encoders.py를 사용한 방법 ==========================¶

====================================================¶

모델 선택과 훈련¶

모델 세부 튜닝¶

추가 내용¶

전처리와 예측을 포함한 파이프라인¶

joblib을 사용한 모델 저장¶

RandomizedSearchCV를 위한 Scipy 분포 함수¶

연습문제 해답¶

1.¶

2.¶

3.¶

4.¶

5.¶

`RandomizedSearchCV`를 위한 Scipy 분포 함수¶