▣비즈니스의 정확한 목적?(이익을 어떻게 얻을지)
■ 문제 정의
▶평균 제곱근 오차(Root Mean Square Error, RMSE)
▶평균 절대 오차(Mean Absolute Error, MAE)
▶아나콘다(Anaconda) 설치
$ conda create -n mlbook python=3.5 anaconda
$ activate mlbook
$ conda install -n mlbook -c conda-forge tensorflow
$ jupyter notebook
▶깃허브
https://github.com/rickiepark/handson-ml
/hands_on_ml_link/datasets/housing/housing.csv
import os
import pandas as pd
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from *housing_path* and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
housing = load_housing_data()
housing.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): longitude 20640 non-null float64 latitude 20640 non-null float64 housing_median_age 20640 non-null float64 total_rooms 20640 non-null float64 total_bedrooms 20433 non-null float64 population 20640 non-null float64 households 20640 non-null float64 median_income 20640 non-null float64 median_house_value 20640 non-null float64 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
housing["ocean_proximity"].value_counts()
<1H OCEAN 9136 INLAND 6551 NEAR OCEAN 2658 NEAR BAY 2290 ISLAND 5 Name: ocean_proximity, dtype: int64
housing.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly partition *data* into (train, test) DataFrames.

    NOTE(review): relies on NumPy's global RNG, so repeated calls give
    different splits unless ``np.random.seed`` was set beforehand.
    """
    n_test = int(len(data) * test_ratio)
    order = np.random.permutation(len(data))
    test_idx, train_idx = order[:n_test], order[n_test:]
    return data.iloc[train_idx], data.iloc[test_idx]
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
16512 train + 4128 test
np.random.permutation(10)
array([9, 6, 4, 5, 8, 7, 1, 0, 2, 3])
housing.iloc[[32, 627, 2373]]
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
32 | -122.27 | 37.84 | 48.0 | 1922.0 | 409.0 | 1026.0 | 335.0 | 1.7969 | 110400.0 | NEAR BAY |
627 | -122.18 | 37.70 | 35.0 | 2562.0 | 554.0 | 1398.0 | 525.0 | 3.3906 | 178900.0 | NEAR BAY |
2373 | -119.57 | 36.70 | 30.0 | 2370.0 | 412.0 | 1248.0 | 410.0 | 3.1442 | 72300.0 | INLAND |
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
▷무작위 샘플링 방식은 특성 수에 비해 데이터셋이 충분히 크다면 일반적으로 괜찮지만, 그렇지 않다면 샘플링 편향이 생길 가능성이 큼
▶계층적 샘플링(stratified sampling) : 전체 모수는 계층(strata)이라는 동질의 그룹으로 나뉘고, 테스트 세트가 전체 모수를 대표하도록 각 계층에서 올바른 수의 샘플을 추출하는 것
# Build a discrete income category: divide to limit the number of
# categories, then round up (ceil) so the values become discrete bins.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap at category 5: where() keeps values < 5 and replaces the rest with 5.0.
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
s = pd.Series(range(5))
s.where(s > 1, 10)
0 10 1 10 2 2 3 3 4 4 dtype: int64
housing["income_cat"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x227af071e80>
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits = number of re-shuffling & splitting iterations.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Stratify on the income category so each split mirrors its distribution.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # print(train_index, test_index)
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# 전체 주택 데이터셋에서 소득 카테고리의 비율
housing["income_cat"].value_counts() / len(housing)
3.0 0.350581 2.0 0.318847 4.0 0.176308 5.0 0.114438 1.0 0.039826 Name: income_cat, dtype: float64
# 테스트 주택 데이터셋에서 소득 카테고리의 비율(무작위 샘플링)
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
test_set["income_cat"].value_counts() / len(test_set)
3.0 0.358527 2.0 0.324370 4.0 0.167393 5.0 0.109496 1.0 0.040213 Name: income_cat, dtype: float64
# 테스트 주택 데이터셋에서 소득 카테고리의 비율(계층 샘플링)
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
3.0 0.350533 2.0 0.318798 4.0 0.176357 5.0 0.114583 1.0 0.039729 Name: income_cat, dtype: float64
# income_cat was only needed for stratified sampling; remove it again.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
# Work on a copy so the training set itself is not damaged.
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude")
<matplotlib.axes._subplots.AxesSubplot at 0x227af6c67b8>
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x227af136240>
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
plt.legend()
<matplotlib.legend.Legend at 0x227af0ee8d0>
corr_matrix = housing.corr()
# 중간 주택 가격과 다른 특성 사이의 상관관계 크기
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.687160 total_rooms 0.135097 housing_median_age 0.114110 households 0.064506 total_bedrooms 0.047689 population -0.026920 longitude -0.047432 latitude -0.142724 Name: median_house_value, dtype: float64
# 숫자형 특성 사이에 산점도를 그려주는 판다스의 scatter_matrix 함수
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000227B1335E10>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B11EF278>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B1205908>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B1220F98>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000227B123C668>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B1255CF8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B12733C8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B128AA58>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000227B12A9128>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B12C07B8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B12DAE48>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B12F7518>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000227B130FBA8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B136E278>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B1386908>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000227B139FF98>]], dtype=object)
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x227b1694f98>
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.687160 rooms_per_household 0.146285 total_rooms 0.135097 housing_median_age 0.114110 households 0.064506 total_bedrooms 0.047689 population_per_household -0.021985 population -0.026920 longitude -0.047432 latitude -0.142724 bedrooms_per_room -0.259984 Name: median_house_value, dtype: float64
▣ 데이터 준비를 함수로 자동화해야 하는 이유
housing = strat_train_set.drop("median_house_value", axis=1, inplace=False)
housing_labels = strat_train_set["median_house_value"].copy()
housing.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16512 entries, 17606 to 15775 Data columns (total 9 columns): longitude 16512 non-null float64 latitude 16512 non-null float64 housing_median_age 16512 non-null float64 total_rooms 16512 non-null float64 total_bedrooms 16354 non-null float64 population 16512 non-null float64 households 16512 non-null float64 median_income 16512 non-null float64 ocean_proximity 16512 non-null object dtypes: float64(8), object(1) memory usage: 1.3+ MB
print(type(housing_labels))
print(len(housing_labels))
housing_labels.head()
<class 'pandas.core.series.Series'> 16512
17606 286600.0 18632 340600.0 14650 196900.0 3230 46300.0 3555 254500.0 Name: median_house_value, dtype: float64
▣ 누락된 특성값을 처리하는 방법 세 가지
# Option 1: drop districts whose total_bedrooms is missing.
housing.dropna(subset=["total_bedrooms"])
# Option 2: drop the whole attribute.
housing.drop("total_bedrooms", axis=1)
# Option 3: fill missing values with the median.
median = housing["total_bedrooms"].median()  # compute the median
print(median)
housing["total_bedrooms"].fillna(median, inplace=True)
housing.info()
433.0 <class 'pandas.core.frame.DataFrame'> Int64Index: 16512 entries, 17606 to 15775 Data columns (total 9 columns): longitude 16512 non-null float64 latitude 16512 non-null float64 housing_median_age 16512 non-null float64 total_rooms 16512 non-null float64 total_bedrooms 16512 non-null float64 population 16512 non-null float64 households 16512 non-null float64 median_income 16512 non-null float64 ocean_proximity 16512 non-null object dtypes: float64(8), object(1) memory usage: 1.3+ MB
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
# Exclude the text attribute ocean_proximity — a median only exists for numbers.
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='median', verbose=0)
imputer.statistics_
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. , 408. , 3.5409])
housing_num.median()
longitude -118.5100 latitude 34.2600 housing_median_age 29.0000 total_rooms 2119.5000 total_bedrooms 433.0000 population 1164.0000 households 408.0000 median_income 3.5409 dtype: float64
X = imputer.transform(housing_num)
print(type(X))
X
<class 'numpy.ndarray'>
array([[-121.89 , 37.29 , 38. , ..., 710. , 339. , 2.7042], [-121.93 , 37.05 , 14. , ..., 306. , 113. , 6.4214], [-117.2 , 32.77 , 31. , ..., 936. , 462. , 2.8621], ..., [-116.4 , 34.09 , 9. , ..., 2098. , 765. , 3.2723], [-118.01 , 33.82 , 31. , ..., 1356. , 356. , 4.0625], [-122.45 , 37.77 , 52. , ..., 1269. , 639. , 3.575 ]])
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index = list(housing.index.values))
type(housing_tr)
pandas.core.frame.DataFrame
housing_num.columns
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'], dtype='object')
list(housing.index.values)[:10]
[17606, 18632, 14650, 3230, 3555, 19480, 8879, 13685, 4937, 4861]
housing_cat = housing["ocean_proximity"]
housing_cat.head(10)
17606 <1H OCEAN 18632 <1H OCEAN 14650 NEAR OCEAN 3230 INLAND 3555 <1H OCEAN 19480 INLAND 8879 <1H OCEAN 13685 INLAND 4937 <1H OCEAN 4861 <1H OCEAN Name: ocean_proximity, dtype: object
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[0:10]
array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
housing_categories
Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')
→ 원-핫 인코딩(one-hot encoding)으로 해결
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
print(housing_cat_1hot)
housing_cat_1hot
(0, 0) 1.0 (1, 0) 1.0 (2, 1) 1.0 (3, 2) 1.0 (4, 0) 1.0 (5, 2) 1.0 (6, 0) 1.0 (7, 2) 1.0 (8, 0) 1.0 (9, 0) 1.0 (10, 2) 1.0 (11, 2) 1.0 (12, 0) 1.0 (13, 2) 1.0 (14, 2) 1.0 (15, 0) 1.0 (16, 3) 1.0 (17, 2) 1.0 (18, 2) 1.0 (19, 2) 1.0 (20, 0) 1.0 (21, 0) 1.0 (22, 0) 1.0 (23, 2) 1.0 (24, 2) 1.0 : : (16487, 2) 1.0 (16488, 2) 1.0 (16489, 1) 1.0 (16490, 3) 1.0 (16491, 0) 1.0 (16492, 3) 1.0 (16493, 2) 1.0 (16494, 2) 1.0 (16495, 0) 1.0 (16496, 2) 1.0 (16497, 3) 1.0 (16498, 2) 1.0 (16499, 0) 1.0 (16500, 0) 1.0 (16501, 0) 1.0 (16502, 1) 1.0 (16503, 0) 1.0 (16504, 2) 1.0 (16505, 2) 1.0 (16506, 0) 1.0 (16507, 2) 1.0 (16508, 2) 1.0 (16509, 2) 1.0 (16510, 0) 1.0 (16511, 3) 1.0
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
housing_cat_encoded.reshape(-1,1)
array([[0], [0], [1], ..., [2], [0], [3]], dtype=int64)
housing_cat_encoded.reshape(-1, 3)
array([[0, 0, 1], [2, 0, 2], [0, 2, 0], ..., [0, 2, 2], [0, 2, 2], [2, 0, 3]], dtype=int64)
encoder.categories_
[array([0, 1, 2, 3, 4], dtype=int64)]
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.], ..., [0., 0., 1., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
cat_encoder = OneHotEncoder(categories='auto', sparse=False)
housing_cat_1hot_ndarray = cat_encoder.fit_transform(housing_cat.values.reshape(-1, 1))
housing_cat_1hot_ndarray
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 0., 1.], ..., [0., 1., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.]])
housing_cat.values.reshape(-1, 1)
array([['<1H OCEAN'], ['<1H OCEAN'], ['NEAR OCEAN'], ..., ['INLAND'], ['<1H OCEAN'], ['NEAR BAY']], dtype=object)
cat_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
※덕 타이핑: 동적 타이핑의 한 종류로, 객체의 변수 및 메소드의 집합이 객체의 타입을 결정하는 것을 말함. 클래스 상속이나 인터페이스 구현으로 타입을 구분하는 대신, 덕 타이핑은 객체가 어떤 타입에 걸맞은 변수와 메소드를 지니면 객체를 해당 타입에 속하는 것으로 간주한다. “덕 타이핑”이라는 용어는 다음과 같이 표현될 수 있는 덕 테스트에서 유래했다.
class Duck:
    """An actual duck: it quacks and displays its feathers."""

    def quack(self):
        print("꽥꽥!")

    def feathers(self):
        print("오리에게 흰색, 회색 깃털이 있습니다.")
class Person:
    """A person imitating a duck — same interface, different type."""

    def quack(self):
        print("이 사람이 오리를 흉내내네요.")

    def feathers(self):
        print("사람은 바닥에서 깃털을 주어서 보여 줍니다.")
def in_the_forest(duck):
    """Duck-typing demo: accepts any object exposing quack() and feathers()."""
    duck.quack()
    duck.feathers()
def game():
    """Run a duck and then a person through the same duck-typed function."""
    for actor in (Duck(), Person()):
        in_the_forest(actor)

game()
꽥꽥! 오리에게 흰색, 회색 깃털이 있습니다. 이 사람이 오리를 흉내내네요. 사람은 바닥에서 깃털을 주어서 보여 줍니다.
from sklearn.base import BaseEstimator, TransformerMixin
# Column indices into the raw NumPy feature matrix.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append engineered ratio features to the housing feature matrix.

    Always adds rooms_per_household and population_per_household; also
    adds bedrooms_per_room when *add_bedrooms_per_room* is True (default).
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        extras = [
            X[:, rooms_ix] / X[:, household_ix],       # rooms_per_household
            X[:, population_ix] / X[:, household_ix],  # population_per_household
        ]
        if self.add_bedrooms_per_room:
            extras.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        return np.c_[tuple([X] + extras)]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
np.c_[a, b]
array([[1, 4], [2, 5], [3, 6]])
housing.values
array([[-121.89, 37.29, 38.0, ..., 339.0, 2.7042, '<1H OCEAN'], [-121.93, 37.05, 14.0, ..., 113.0, 6.4214, '<1H OCEAN'], [-117.2, 32.77, 31.0, ..., 462.0, 2.8621, 'NEAR OCEAN'], ..., [-116.4, 34.09, 9.0, ..., 765.0, 3.2723, 'INLAND'], [-118.01, 33.82, 31.0, ..., 356.0, 4.0625, '<1H OCEAN'], [-122.45, 37.77, 52.0, ..., 639.0, 3.575, 'NEAR BAY']], dtype=object)
housing_extra_attribs
array([[-121.89, 37.29, 38.0, ..., '<1H OCEAN', 4.625368731563422, 2.094395280235988], [-121.93, 37.05, 14.0, ..., '<1H OCEAN', 6.008849557522124, 2.7079646017699117], [-117.2, 32.77, 31.0, ..., 'NEAR OCEAN', 4.225108225108225, 2.0259740259740258], ..., [-116.4, 34.09, 9.0, ..., 'INLAND', 6.34640522875817, 2.742483660130719], [-118.01, 33.82, 31.0, ..., '<1H OCEAN', 5.50561797752809, 3.808988764044944], [-122.45, 37.77, 52.0, ..., 'NEAR BAY', 4.843505477308295, 1.9859154929577465]], dtype=object)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric pipeline: impute medians, add combined attributes, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
array([[-1.15604281, 0.77194962, 0.74333089, ..., -0.31205452, -0.08649871, 0.15531753], [-1.17602483, 0.6596948 , -1.1653172 , ..., 0.21768338, -0.03353391, -0.83628902], [ 1.18684903, -1.34218285, 0.18664186, ..., -0.46531516, -0.09240499, 0.4222004 ], ..., [ 1.58648943, -0.72478134, -1.56295222, ..., 0.3469342 , -0.03055414, -0.52177644], [ 0.78221312, -0.85106801, 0.18664186, ..., 0.02499488, 0.06150916, -0.30340741], [-1.43579109, 0.99645926, 1.85670895, ..., -0.22852947, -0.09586294, 0.10180567]])
# Scikit-Learn cannot consume a DataFrame directly here, so this
# transformer picks out a subset of columns and returns a NumPy array.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the *attribute_names* columns of a DataFrame as an ndarray."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing to fit — selection is purely positional.
        return self

    def transform(self, X):
        return X[self.attribute_names].values
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select numeric columns, impute, add features, scale.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# Categorical branch: select the text column and one-hot encode it.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(categories='auto')),
])
from sklearn.compose import ColumnTransformer
# ColumnTransformer applies each sub-pipeline to its own column subset
# and horizontally concatenates the results.
full_pipeline = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_attribs),
    ("cat_encoder", OneHotEncoder(categories='auto'), cat_attribs),
])
housing.values
array([[-121.89, 37.29, 38.0, ..., 339.0, 2.7042, '<1H OCEAN'], [-121.93, 37.05, 14.0, ..., 113.0, 6.4214, '<1H OCEAN'], [-117.2, 32.77, 31.0, ..., 462.0, 2.8621, 'NEAR OCEAN'], ..., [-116.4, 34.09, 9.0, ..., 765.0, 3.2723, 'INLAND'], [-118.01, 33.82, 31.0, ..., 356.0, 4.0625, '<1H OCEAN'], [-122.45, 37.77, 52.0, ..., 639.0, 3.575, 'NEAR BAY']], dtype=object)
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281, 0.77194962, 0.74333089, ..., 0. , 0. , 0. ], [-1.17602483, 0.6596948 , -1.1653172 , ..., 0. , 0. , 0. ], [ 1.18684903, -1.34218285, 0.18664186, ..., 0. , 0. , 1. ], ..., [ 1.58648943, -0.72478134, -1.56295222, ..., 0. , 0. , 0. ], [ 0.78221312, -0.85106801, 0.18664186, ..., 0. , 0. , 0. ], [-1.43579109, 0.99645926, 1.85670895, ..., 0. , 1. , 0. ]])
housing_prepared.shape
(16512, 16)