! pip install numpy
! pip install pandas
! pip install sklearn
import warnings
# Ignore every warning category from this point on; this also hides any
# deprecation notices emitted by the libraries imported below.
warnings.filterwarnings('ignore')
# Preprocessing helpers (presumably a project-local module).
# NOTE(review): "hourse" looks like a typo for "house", but the name has to match
# the module file on disk -- confirm before renaming.
import hourse_price_preprocessor as hpp
import os
import numpy as np
import pandas as pd
# Directory (relative to the working directory) holding the two CSV files.
DATA_DIR = "data/house_price/"
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"

# Build both file paths in one pass.
train_file, test_file = (
    os.path.join(DATA_DIR, filename)
    for filename in (TRAIN_FILENAME, TEST_FILENAME)
)

# The preprocessor module takes the two paths and returns four objects:
# training features, test features, training target and the test-row ids.
X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(
    train_file,
    test_file,
)

# Cell result: the shape of each returned object.
tuple(part.shape for part in (X_train, y_train, X_test, test_id_idx))
((1460, 67), (1460,), (1459, 67), (1459,))
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
# Rank features with a 200-tree random forest and keep those whose importance
# is at least the median importance.
select = SelectFromModel(
    threshold="median",
    estimator=RandomForestRegressor(n_estimators=200),
)
# fit() returns the fitted selector itself, so the training matrix can be
# reduced in the same statement; `select` stays fitted for later cells.
X_train_selected = select.fit(X_train, y_train).transform(X_train)
X_train_selected.shape
(1460, 34)
# Mean cross-validated R^2 of a 1000-tree forest on the selected features only.
cross_val_score(
    RandomForestRegressor(n_estimators=1000),
    X_train_selected,
    y_train,
    scoring="r2",
).mean()
0.8451998805867299
# Mean cross-validated R^2 of a 1000-tree forest on the full feature set,
# for comparison against the score on the selected features.
cross_val_score(
    RandomForestRegressor(n_estimators=1000),
    X_train,
    y_train,
    scoring="r2",
).mean()
0.8453123482733651
# Forest trained on every feature (not the reduced matrix), so its importances
# line up index-for-index with the selector's support mask.
# NOTE(review): despite the name, `lr` is a random forest, not a linear regression.
lr = RandomForestRegressor(
    n_estimators=1000,
)
# Reduce the test matrix with the already-fitted selector.
X_test_selected = select.transform(X_test)
# Kept as the final expression so the fitted estimator is the cell's result.
lr.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Boolean mask over the training columns: True where the selector kept the feature.
select.get_support()
array([False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, False, False, False, False, False, False, True, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, False, True, False, False, True, False])
# Per-feature importances of the forest that was fitted on the full training matrix.
lr.feature_importances_
array([5.86635253e-04, 6.35226018e-04, 3.13873874e-05, 1.76446103e-05, 2.63455817e-04, 1.66475815e-03, 2.48867436e-04, 7.71798903e-05, 2.38984037e-04, 1.69252390e-05, 3.07788155e-04, 3.05799961e-04, 1.17085864e-04, 1.16593143e-05, 5.62596401e-05, 7.95842376e-06, 3.22227354e-05, 1.97501373e-04, 7.64795432e-04, 3.15306021e-03, 1.57051068e-03, 6.69520234e-03, 1.84039792e-02, 5.29243793e-03, 1.01574917e-03, 2.09889878e-06, 8.71862017e-05, 1.00176116e-03, 6.39519127e-04, 1.01269643e-03, 1.03724146e-03, 1.04118417e-03, 3.44272071e-04, 4.41193308e-04, 8.44429188e-03, 1.73231906e-02, 6.26555657e-02, 3.46818684e-02, 8.78382531e-03, 3.61008523e-02, 1.55159707e-03, 8.04525688e-03, 6.97136401e-02, 4.21627003e-02, 2.50655836e-02, 1.89168035e-04, 2.34987429e-01, 1.85382910e-03, 4.74007615e-04, 7.56489778e-03, 2.63247269e-03, 4.53145595e-03, 1.99927331e-03, 7.69932058e-03, 8.73998630e-03, 8.06918925e-03, 3.19444427e-01, 1.93271512e-02, 5.68989321e-03, 7.71485291e-03, 1.10370882e-03, 5.76839664e-04, 1.79836600e-03, 4.91730645e-04, 1.62542868e-04, 2.34836091e-03, 7.52498271e-04])
# Column indices ordered from most to least important: argsort is ascending,
# so the reversed view gives the descending ranking.
np.argsort(lr.feature_importances_)[::-1]
array([56, 46, 42, 36, 43, 39, 37, 44, 57, 22, 35, 38, 54, 34, 55, 41, 59, 53, 49, 21, 58, 23, 51, 19, 50, 65, 52, 47, 62, 5, 20, 40, 60, 31, 30, 24, 29, 27, 18, 66, 28, 1, 0, 61, 63, 48, 33, 32, 10, 11, 4, 6, 8, 17, 45, 64, 12, 26, 7, 14, 16, 2, 3, 9, 13, 15, 25], dtype=int64)