import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
from collections import OrderedDict, defaultdict
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
Load the train and test datasets, saved from Kaggle to GitHub.
#data
# Load the Kaggle "House Prices" train/test sets from copies hosted on GitHub.
# A second, untouched copy of the training set (train_copy) is kept so the
# test-set encoder can later rebuild category -> mean-SalePrice mappings from
# the raw (unencoded) labels.
train = pd.read_csv("https://raw.githubusercontent.com/mkivenson/Computational-Mathematics/master/Final%20Project/train.csv")
train_copy = pd.read_csv("https://raw.githubusercontent.com/mkivenson/Computational-Mathematics/master/Final%20Project/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/mkivenson/Computational-Mathematics/master/Final%20Project/test.csv")
The following is a summary of each of the features of the training dataset for house prices.
# Summary statistics for every numeric feature of the training set.
train.describe()
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 38 columns
# Show only the columns containing at least one NaN, to decide the fill rules.
train.loc[:, train.isna().any()]
LotFrontage | Alley | MasVnrType | MasVnrArea | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Electrical | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageQual | GarageCond | PoolQC | Fence | MiscFeature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65.0 | NaN | BrkFace | 196.0 | Gd | TA | No | GLQ | Unf | SBrkr | NaN | Attchd | 2003.0 | RFn | TA | TA | NaN | NaN | NaN |
1 | 80.0 | NaN | None | 0.0 | Gd | TA | Gd | ALQ | Unf | SBrkr | TA | Attchd | 1976.0 | RFn | TA | TA | NaN | NaN | NaN |
2 | 68.0 | NaN | BrkFace | 162.0 | Gd | TA | Mn | GLQ | Unf | SBrkr | TA | Attchd | 2001.0 | RFn | TA | TA | NaN | NaN | NaN |
3 | 60.0 | NaN | None | 0.0 | TA | Gd | No | ALQ | Unf | SBrkr | Gd | Detchd | 1998.0 | Unf | TA | TA | NaN | NaN | NaN |
4 | 84.0 | NaN | BrkFace | 350.0 | Gd | TA | Av | GLQ | Unf | SBrkr | TA | Attchd | 2000.0 | RFn | TA | TA | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1455 | 62.0 | NaN | None | 0.0 | Gd | TA | No | Unf | Unf | SBrkr | TA | Attchd | 1999.0 | RFn | TA | TA | NaN | NaN | NaN |
1456 | 85.0 | NaN | Stone | 119.0 | Gd | TA | No | ALQ | Rec | SBrkr | TA | Attchd | 1978.0 | Unf | TA | TA | NaN | MnPrv | NaN |
1457 | 66.0 | NaN | None | 0.0 | TA | Gd | No | GLQ | Unf | SBrkr | Gd | Attchd | 1941.0 | RFn | TA | TA | NaN | GdPrv | Shed |
1458 | 68.0 | NaN | None | 0.0 | TA | TA | Mn | GLQ | Rec | FuseA | NaN | Attchd | 1950.0 | Unf | TA | TA | NaN | NaN | NaN |
1459 | 75.0 | NaN | None | 0.0 | TA | TA | No | BLQ | LwQ | SBrkr | NaN | Attchd | 1965.0 | Fin | TA | TA | NaN | NaN | NaN |
1460 rows × 19 columns
To handle the NaN values, I filled them with either the mean value of a column, 'None', or 0. I determined how to fill NaN values using the data dictionary provided on Kaggle.
def removena(data):
    """Fill every known-NaN column of the Ames housing frame.

    Fill rules follow the Kaggle data dictionary:
    - 'None' for categorical columns where NaN means the feature is absent
      (no alley, no basement, no garage, no pool, no fence, ...).
    - 0 for the corresponding numeric area/count columns.
    - The column mean for LotFrontage.
    - 'SBrkr' (the most common value) for Electrical.

    Mutates `data` in place and returns it for chaining.
    """
    # LotFrontage: linear feet of street connected to property - fill with mean.
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mean())
    # Alley: type of alley access - NaN means no alley access.
    data['Alley'] = data['Alley'].fillna('None')
    # MasVnrType: masonry veneer type - NaN means no veneer.
    # Bug fix: the original filled this column from 'Alley' (copy-paste error).
    data['MasVnrType'] = data['MasVnrType'].fillna('None')
    # MasVnrArea: masonry veneer area in square feet - NaN means no veneer.
    # Bug fix: the original filled this column from 'Alley' (copy-paste error).
    data['MasVnrArea'] = data['MasVnrArea'].fillna(0)
    # Basement categoricals: NaN means no basement.
    data['BsmtQual'] = data['BsmtQual'].fillna('None')
    data['BsmtCond'] = data['BsmtCond'].fillna('None')
    data['BsmtExposure'] = data['BsmtExposure'].fillna('None')
    data['BsmtFinType1'] = data['BsmtFinType1'].fillna('None')
    data['BsmtFinType2'] = data['BsmtFinType2'].fillna('None')
    # Basement areas and bath counts: NaN means no basement, so 0.
    data['BsmtFinSF1'] = data['BsmtFinSF1'].fillna(0)
    data['BsmtFinSF2'] = data['BsmtFinSF2'].fillna(0)
    data['BsmtUnfSF'] = data['BsmtUnfSF'].fillna(0)
    data['TotalBsmtSF'] = data['TotalBsmtSF'].fillna(0)
    data['BsmtFullBath'] = data['BsmtFullBath'].fillna(0)
    data['BsmtHalfBath'] = data['BsmtHalfBath'].fillna(0)
    # Electrical: only one NaN in the training data; use the most common value.
    data['Electrical'] = data['Electrical'].fillna('SBrkr')
    # FireplaceQu: fireplace quality - NaN means no fireplace.
    data['FireplaceQu'] = data['FireplaceQu'].fillna('None')
    # Garage categoricals: NaN means no garage.
    data['GarageType'] = data['GarageType'].fillna('None')
    # NOTE(review): filling a year column with the string 'None' makes it
    # object-dtype; later code relies on that to treat it as categorical.
    data['GarageYrBlt'] = data['GarageYrBlt'].fillna('None')
    data['GarageFinish'] = data['GarageFinish'].fillna('None')
    data['GarageQual'] = data['GarageQual'].fillna('None')
    data['GarageCond'] = data['GarageCond'].fillna('None')
    # Garage size and capacity: NaN means no garage, so 0.
    data['GarageCars'] = data['GarageCars'].fillna(0)
    data['GarageArea'] = data['GarageArea'].fillna(0)
    # Pool, fence, miscellaneous features: NaN means the feature is absent.
    data['PoolQC'] = data['PoolQC'].fillna('None')
    data['Fence'] = data['Fence'].fillna('None')
    data['MiscFeature'] = data['MiscFeature'].fillna('None')
    return data
# Apply the NaN-filling rules to both the working frame and the pristine copy.
train = removena(train)
train_copy = removena(train_copy)
Some of the categorical columns must be encoded to be used for regression. The following columns are categorical.
# Categorical (object-dtype) columns that still need numeric encoding.
train.select_dtypes(include = ['object'])
MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageYrBlt | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | 2003 | RFn | TA | TA | Y | None | None | None | WD | Normal |
1 | RL | Pave | None | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | 1976 | RFn | TA | TA | Y | None | None | None | WD | Normal |
2 | RL | Pave | None | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | 2001 | RFn | TA | TA | Y | None | None | None | WD | Normal |
3 | RL | Pave | None | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | ... | 1998 | Unf | TA | TA | Y | None | None | None | WD | Abnorml |
4 | RL | Pave | None | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | ... | 2000 | RFn | TA | TA | Y | None | None | None | WD | Normal |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1455 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | ... | 1999 | RFn | TA | TA | Y | None | None | None | WD | Normal |
1456 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | ... | 1978 | Unf | TA | TA | Y | None | MnPrv | None | WD | Normal |
1457 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | ... | 1941 | RFn | TA | TA | Y | None | GdPrv | Shed | WD | Normal |
1458 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | ... | 1950 | Unf | TA | TA | Y | None | None | None | WD | Normal |
1459 | RL | Pave | None | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | ... | 1965 | Fin | TA | TA | Y | None | None | None | WD | Normal |
1460 rows × 45 columns
Instead of using one hot encoder or label encoder, I created a custom dictionary for encoding. The dictionary for mapping categorical values to integers was created by sorting values in descending mean values.
def encode_train(data=None):
    """Integer-encode every object (categorical) column of the training set.

    Instead of LabelEncoder (alphabetical codes) or one-hot encoding, each
    category is mapped to its rank when categories are sorted by ascending
    mean 'SalePrice', so larger codes correspond to pricier categories.
    (The unused LabelEncoder instance from the original version was removed.)

    Parameters
    ----------
    data : DataFrame, optional
        Frame to encode in place; defaults to the module-level ``train`` so
        existing ``encode_train()`` calls behave exactly as before.

    Returns the encoded DataFrame.
    """
    if data is None:
        data = train
    for col in data.select_dtypes(include=['object']):
        # rank -> category after sorting by mean sale price; the second
        # reset_index exposes the integer rank as the dict key.
        rank_to_cat = (data[[col, 'SalePrice']]
                       .groupby(col)
                       .mean()
                       .sort_values('SalePrice')
                       .reset_index()
                       .reset_index()[col]
                       .to_dict(into=OrderedDict))
        # Invert to category -> rank for mapping the column.
        cat_to_rank = {cat: rank for rank, cat in rank_to_cat.items()}
        data[col] = data[col].map(cat_to_rank)
    return data
def encode_test(data=None, reference=None):
    """Integer-encode the test set's categorical columns.

    Mappings are built from `reference` (the *unencoded* training copy), so
    test categories receive the same rank-by-mean-SalePrice codes as in
    training. Categories never seen in training map to NaN and must be
    filled afterwards. (The unused LabelEncoder instance was removed.)

    Parameters
    ----------
    data : DataFrame, optional
        Frame to encode in place; defaults to the module-level ``test``.
    reference : DataFrame, optional
        Frame providing category -> SalePrice statistics; defaults to the
        module-level ``train_copy``.

    Returns the encoded DataFrame.
    """
    if data is None:
        data = test
    if reference is None:
        reference = train_copy
    for col in data.select_dtypes(include=['object']):
        rank_to_cat = (reference[[col, 'SalePrice']]
                       .groupby(col)
                       .mean()
                       .sort_values('SalePrice')
                       .reset_index()
                       .reset_index()[col]
                       .to_dict(into=OrderedDict))
        cat_to_rank = {cat: rank for rank, cat in rank_to_cat.items()}
        data[col] = data[col].map(cat_to_rank)
    return data
# Replace the training frame with its target-encoded version.
train = encode_train()
The following charts show a scatter plot of each feature against Sale Price, as well as a histogram of each feature's value distribution. I attempted to apply log and exp transformations to some variables; however, this increased the RMSE.
# Scatter plot of every feature (except Id) against SalePrice on a 10x8 grid.
fig, ax = plt.subplots(10, 8, sharey=True, figsize=(16, 22))
plt.tight_layout()
for y, row in enumerate(ax):
    for x, col in enumerate(row):
        # Column-major grid fill: x is the subplot column (0-7), y the subplot
        # row (0-9), so x * 10 + y yields each index 0-79 exactly once.
        # Assumes the frame has exactly 80 non-Id columns -- TODO confirm.
        column = (train.columns[(train.columns != 'Id')])[x * 10 + y]
        col.scatter(train[column], train['SalePrice'])
        col.set_title(column)
# Histogram of each feature's value distribution (Id and SalePrice excluded).
# Bug fix: the original index x * 8 + y is not a bijection onto the column
# list (e.g. x=1,y=0 and x=0,y=8 both give 8), so some features were plotted
# twice and others never appeared. Enumerate the flattened axes against the
# column list instead, leaving any trailing subplots empty.
feature_columns = train.columns[(train.columns != 'Id') & (train.columns != 'SalePrice')]
fig, ax = plt.subplots(10, 8, sharey=True, figsize=(16, 22))
plt.tight_layout()
for idx, axis in enumerate(ax.flat):
    if idx < len(feature_columns):
        axis.hist(train[feature_columns[idx]], bins=20)
        axis.set_title(feature_columns[idx])
The following section creates the training set of significant features only, using a significance level of 0.20. This was determined using the p-value for each feature from statsmodels.
# Keep only the features found significant at the 0.20 level (judged from the
# statsmodels p-values); drop the target, Id, and the insignificant columns.
X_train = train.drop(['SalePrice','ExterCond','Heating', 'YearBuilt', 'Exterior1st', 'Exterior2nd', 'Foundation',
                      'BsmtFinType2', 'BsmtFinSF2', 'CentralAir', 'BsmtHalfBath', 'GarageFinish', 'PavedDrive',
                      'OpenPorchSF', 'EnclosedPorch', 'MiscVal', 'YrSold','LotShape', 'BsmtUnfSF',
                      'FireplaceQu', 'MiscFeature', 'TotalBsmtSF', 'Fence', 'GarageYrBlt','Id',
                      'BsmtFinType1', 'MSZoning', 'LandSlope', 'HalfBath', 'FullBath',
                      'GarageType', 'GarageArea', '3SsnPorch'], axis=1)
y_train = train['SalePrice']
# with statsmodels
X = sm.add_constant(X_train)  # adds the intercept column
# Bug fix: the original fit on X_train, so the constant created above was
# never used and the summary reported an uncentered (no-intercept) model.
model = sm.OLS(y_train, X).fit()
predictions = model.predict(X)
model.summary()
C:\Users\mkive\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
Dep. Variable: | SalePrice | R-squared (uncentered): | 0.977 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.976 |
Method: | Least Squares | F-statistic: | 1310. |
Date: | Sat, 14 Dec 2019 | Prob (F-statistic): | 0.00 |
Time: | 20:49:42 | Log-Likelihood: | -17135. |
No. Observations: | 1460 | AIC: | 3.436e+04 |
Df Residuals: | 1415 | BIC: | 3.460e+04 |
Df Model: | 45 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
MSSubClass | -247.2859 | 38.939 | -6.351 | 0.000 | -323.670 | -170.902 |
LotFrontage | -139.3097 | 45.949 | -3.032 | 0.002 | -229.445 | -49.174 |
LotArea | 0.1712 | 0.096 | 1.792 | 0.073 | -0.016 | 0.359 |
Street | 1.846e+04 | 1.33e+04 | 1.391 | 0.165 | -7580.362 | 4.45e+04 |
Alley | 1317.2931 | 721.749 | 1.825 | 0.068 | -98.520 | 2733.106 |
LandContour | 4159.7639 | 1909.687 | 2.178 | 0.030 | 413.642 | 7905.886 |
Utilities | 6.648e+04 | 3.07e+04 | 2.167 | 0.030 | 6298.196 | 1.27e+05 |
LotConfig | 1211.4275 | 711.713 | 1.702 | 0.089 | -184.699 | 2607.554 |
Neighborhood | 2011.6360 | 200.262 | 10.045 | 0.000 | 1618.793 | 2404.479 |
Condition1 | 2078.0749 | 853.216 | 2.436 | 0.015 | 404.370 | 3751.780 |
Condition2 | -8660.7583 | 3259.623 | -2.657 | 0.008 | -1.51e+04 | -2266.545 |
BldgType | -6451.6717 | 2082.141 | -3.099 | 0.002 | -1.05e+04 | -2367.256 |
HouseStyle | -898.1754 | 662.816 | -1.355 | 0.176 | -2198.383 | 402.032 |
OverallQual | 9791.2259 | 1157.685 | 8.458 | 0.000 | 7520.263 | 1.21e+04 |
OverallCond | 4628.5020 | 841.871 | 5.498 | 0.000 | 2977.052 | 6279.952 |
YearRemodAdd | -68.3127 | 20.192 | -3.383 | 0.001 | -107.922 | -28.703 |
RoofStyle | 2352.2783 | 726.124 | 3.239 | 0.001 | 927.883 | 3776.674 |
RoofMatl | 8458.8767 | 2039.926 | 4.147 | 0.000 | 4457.273 | 1.25e+04 |
MasVnrType | 1317.2931 | 721.749 | 1.825 | 0.068 | -98.520 | 2733.106 |
MasVnrArea | 1317.2931 | 721.749 | 1.825 | 0.068 | -98.520 | 2733.106 |
ExterQual | 1.029e+04 | 2466.343 | 4.172 | 0.000 | 5451.812 | 1.51e+04 |
BsmtQual | 6845.5537 | 1841.102 | 3.718 | 0.000 | 3233.971 | 1.05e+04 |
BsmtCond | -5069.1691 | 2365.953 | -2.143 | 0.032 | -9710.321 | -428.017 |
BsmtExposure | 5065.2581 | 946.282 | 5.353 | 0.000 | 3208.991 | 6921.525 |
BsmtFinSF1 | 12.3947 | 2.671 | 4.641 | 0.000 | 7.156 | 17.633 |
HeatingQC | 1344.9593 | 1049.922 | 1.281 | 0.200 | -714.611 | 3404.530 |
Electrical | -3772.2628 | 2306.655 | -1.635 | 0.102 | -8297.093 | 752.567 |
1stFlrSF | 24.8400 | 5.138 | 4.835 | 0.000 | 14.762 | 34.918 |
2ndFlrSF | 23.5145 | 4.777 | 4.922 | 0.000 | 14.144 | 32.885 |
LowQualFinSF | -22.6692 | 13.165 | -1.722 | 0.085 | -48.495 | 3.156 |
GrLivArea | 25.6853 | 4.938 | 5.202 | 0.000 | 15.999 | 35.371 |
BsmtFullBath | 4512.6457 | 2127.529 | 2.121 | 0.034 | 339.197 | 8686.095 |
BedroomAbvGr | -4169.2805 | 1526.698 | -2.731 | 0.006 | -7164.114 | -1174.447 |
KitchenAbvGr | -2.394e+04 | 5871.493 | -4.077 | 0.000 | -3.55e+04 | -1.24e+04 |
KitchenQual | 9010.1526 | 1953.454 | 4.612 | 0.000 | 5178.175 | 1.28e+04 |
TotRmsAbvGrd | 4141.6833 | 1105.322 | 3.747 | 0.000 | 1973.437 | 6309.930 |
Functional | 3904.7472 | 1017.355 | 3.838 | 0.000 | 1909.060 | 5900.434 |
Fireplaces | 3955.9804 | 1558.871 | 2.538 | 0.011 | 898.034 | 7013.927 |
GarageCars | 1.139e+04 | 1800.307 | 6.324 | 0.000 | 7853.957 | 1.49e+04 |
GarageQual | 4404.4419 | 3364.396 | 1.309 | 0.191 | -2195.298 | 1.1e+04 |
GarageCond | -4603.8004 | 1437.989 | -3.202 | 0.001 | -7424.619 | -1782.981 |
WoodDeckSF | 21.5810 | 7.140 | 3.022 | 0.003 | 7.574 | 35.588 |
ScreenPorch | 49.0476 | 15.107 | 3.247 | 0.001 | 19.413 | 78.683 |
PoolArea | -251.1118 | 44.214 | -5.679 | 0.000 | -337.844 | -164.379 |
PoolQC | 7.861e+04 | 1.27e+04 | 6.180 | 0.000 | 5.37e+04 | 1.04e+05 |
MoSold | -404.5408 | 303.742 | -1.332 | 0.183 | -1000.374 | 191.292 |
SaleType | 4005.7246 | 833.371 | 4.807 | 0.000 | 2370.949 | 5640.500 |
SaleCondition | 2571.5147 | 1044.663 | 2.462 | 0.014 | 522.260 | 4620.769 |
Omnibus: | 423.579 | Durbin-Watson: | 1.922 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 39183.404 |
Skew: | -0.250 | Prob(JB): | 0.00 |
Kurtosis: | 28.374 | Cond. No. | 1.21e+16 |
This step fits and predicts a linear regression model using significant features, then generates a submission csv. The linear regressor produced an RMSE of 0.22857.
# Fit scikit-learn's OLS (fits an intercept by default) on the
# significant-feature training set.
regressor = LinearRegression()
regressor.fit(X_train, y_train) #training the algorithm
#Coefficient:
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)
# Apply the same NaN handling and target encoding to the test set.
test = removena(test)
test = encode_test()
# Any NaNs remaining in the test set (columns with no fill rule, or
# categories unseen in training that mapped to NaN) are filled with the
# column mode.
for nacolumn in test.loc[:, test.isna().any()].columns:
    test[nacolumn].fillna(test[nacolumn].mode()[0], inplace=True)
# Predict on the test set with the same columns dropped as for X_train
# (minus SalePrice, which the test set does not contain).
y_pred = regressor.predict(test.drop(['ExterCond','Heating', 'YearBuilt', 'Exterior1st', 'Exterior2nd', 'Foundation',
                                      'BsmtFinType2', 'BsmtFinSF2', 'CentralAir', 'BsmtHalfBath', 'GarageFinish', 'PavedDrive',
                                      'OpenPorchSF', 'EnclosedPorch', 'MiscVal', 'YrSold','LotShape', 'BsmtUnfSF',
                                      'FireplaceQu', 'MiscFeature', 'TotalBsmtSF', 'Fence', 'GarageYrBlt','Id',
                                      'BsmtFinType1', 'MSZoning', 'LandSlope', 'HalfBath', 'FullBath',
                                      'GarageType', 'GarageArea', '3SsnPorch'], axis=1))
# Assemble and save the Kaggle submission file (Id, SalePrice).
submission = pd.concat([pd.Series(test.Id), pd.Series(y_pred)], axis=1)
submission.columns = ['Id', 'SalePrice']
pd.DataFrame(submission).to_csv("submission.csv", index=False)
55661.29093720041 [-2.48434887e+02 -1.40262431e+02 1.69755716e-01 1.80223168e+04 1.31026507e+03 4.15367807e+03 6.34090334e+04 1.21708288e+03 2.01731937e+03 2.08662493e+03 -8.83185494e+03 -6.54120891e+03 -8.56142656e+02 9.81161161e+03 4.72444317e+03 -9.47123501e+01 2.33056438e+03 8.39515391e+03 1.31026507e+03 1.31026507e+03 1.03738745e+04 7.02554891e+03 -5.23947050e+03 5.07594253e+03 1.23164127e+01 1.45373962e+03 -3.62836975e+03 2.48925385e+01 2.35932595e+01 -2.28480719e+01 2.56377263e+01 4.55322285e+03 -4.20376227e+03 -2.41280980e+04 9.20224710e+03 4.14332833e+03 3.84810804e+03 3.89143724e+03 1.14380297e+04 4.32674961e+03 -4.60533306e+03 2.16847910e+01 4.86214636e+01 -2.51148937e+02 7.87154951e+04 -4.04714548e+02 4.02205766e+03 2.59602296e+03]
The Random Forest Regressor produced an RMSE of 0.14782.
# Import random forest regressor
from sklearn.ensemble import RandomForestRegressor
# 1000 trees; fixed seed for reproducibility.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data (all features except the target and Id).
rf.fit(train.drop(['SalePrice', 'Id'], axis=1), y_train);
# Use the forest's predict method on the test data
predictions = rf.predict(test.drop(['Id'], axis=1))
# Assemble and save the Kaggle submission file (Id, SalePrice).
submission_rw = pd.concat([pd.Series(test.Id), pd.Series(predictions)], axis=1)
submission_rw.columns = ['Id', 'SalePrice']
submission_rw.head()
pd.DataFrame(submission_rw).to_csv("submission_rw.csv", index=False)
The Gradient Boosting Regressor produced an RMSE of 0.13507.
# Import GradientBoostingRegressor
# Gradient boosting with sklearn defaults; fixed seed for reproducibility.
gradient_boosting_regressor = GradientBoostingRegressor(random_state = 42)
# Train the model on training data (all features except the target and Id).
gradient_boosting_regressor.fit(train.drop(['SalePrice', 'Id'], axis=1), y_train);
# Use the model's predict method on the test data.
predictions = gradient_boosting_regressor.predict(test.drop(['Id'], axis=1))
# Assemble and save the Kaggle submission file (Id, SalePrice).
submission_gbf = pd.concat([pd.Series(test.Id), pd.Series(predictions)], axis=1)
submission_gbf.columns = ['Id', 'SalePrice']
submission_gbf.head()
pd.DataFrame(submission_gbf).to_csv("submission_gbf.csv", index=False)