! pip install numpy
! pip install pandas
! pip install sklearn
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import hourse_price_preprocessor as hpp
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest # specify number
from sklearn.feature_selection import SelectPercentile # specify remaining ratio
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Load data
# Where the house price train/test CSV files live, relative to the working directory.
DATA_DIR = "data/house_price/"
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
train_file = os.path.join(DATA_DIR, TRAIN_FILENAME)
test_file = os.path.join(DATA_DIR, TEST_FILENAME)

# The project preprocessor takes both file paths and returns the train/test
# feature sets, the training target, and the ids of the test rows.
X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(
    train_file,
    test_file,
)

# Sanity check: show the shape of every returned object on one line.
print(*(part.shape for part in (X_train, y_train, X_test, test_id_idx)))
(1460, 67) (1460,) (1459, 67) (1459,)
# Keep the top 55% of features, ranked by a univariate test against the target.
# SelectPercentile defaults to score_func=f_classif, an ANOVA F-test intended
# for class labels. The target here is modelled with LinearRegression and
# scored with R^2, so the regression F-test (f_regression) is the right scorer.
# NOTE: outputs recorded further down were produced with the previous default
# scorer; re-run the notebook to refresh them.
from sklearn.feature_selection import f_regression

select = SelectPercentile(score_func=f_regression, percentile=55)
# Fit the selector and reduce the training set to the selected columns.
X_train_selected = select.fit_transform(X_train, y_train)
print(X_train_selected.shape)
(1460, 37)
# Mean cross-validated R^2 of linear regression on the selected features.
# The selector is re-fitted inside every CV fold (via a pipeline) instead of
# reusing X_train_selected: selecting features on the full training set first
# lets each validation fold influence which columns are kept, which biases the
# score upward. clone() copies the selector's settings without its fitted state.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

selected_model = make_pipeline(clone(select), LinearRegression())
np.mean(cross_val_score(selected_model, X_train, y_train, scoring="r2"))
0.8000781632789679
# Baseline for comparison: mean cross-validated R^2 of the same linear model
# trained on every feature, with no selection step.
cross_val_score(LinearRegression(), X_train, y_train, scoring="r2").mean()
-2.009795549320303e+20
# Boolean mask over the input columns: True where the fitted selector kept the feature.
select.get_support(indices=False)
array([False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, True, True, True, True, True, False, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, False, True, True, True, True, False, True, False, False, True, True, True, False, True, True, True, True, True, True, True, False, False, False, False, True, False, False])