The present project aims to use Machine Learning (more specifically, Linear Regression) models to predict house prices from other features of the houses.
The information about the columns from the dataset that we'll can be found here, while the dataset can be downloaded here.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from datetime import date
pd.set_option('display.max_columns', None)
# Read the dataset
data = pd.read_csv("AmesHousing.tsv", sep="\t")
# Display basic info
print("Shape of the dataset:", data.shape)
display(data.info())
display(data.head())
Shape of the dataset: (2930, 82) <class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 82 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 MS Zoning 2930 non-null object 4 Lot Frontage 2440 non-null float64 5 Lot Area 2930 non-null int64 6 Street 2930 non-null object 7 Alley 198 non-null object 8 Lot Shape 2930 non-null object 9 Land Contour 2930 non-null object 10 Utilities 2930 non-null object 11 Lot Config 2930 non-null object 12 Land Slope 2930 non-null object 13 Neighborhood 2930 non-null object 14 Condition 1 2930 non-null object 15 Condition 2 2930 non-null object 16 Bldg Type 2930 non-null object 17 House Style 2930 non-null object 18 Overall Qual 2930 non-null int64 19 Overall Cond 2930 non-null int64 20 Year Built 2930 non-null int64 21 Year Remod/Add 2930 non-null int64 22 Roof Style 2930 non-null object 23 Roof Matl 2930 non-null object 24 Exterior 1st 2930 non-null object 25 Exterior 2nd 2930 non-null object 26 Mas Vnr Type 2907 non-null object 27 Mas Vnr Area 2907 non-null float64 28 Exter Qual 2930 non-null object 29 Exter Cond 2930 non-null object 30 Foundation 2930 non-null object 31 Bsmt Qual 2850 non-null object 32 Bsmt Cond 2850 non-null object 33 Bsmt Exposure 2847 non-null object 34 BsmtFin Type 1 2850 non-null object 35 BsmtFin SF 1 2929 non-null float64 36 BsmtFin Type 2 2849 non-null object 37 BsmtFin SF 2 2929 non-null float64 38 Bsmt Unf SF 2929 non-null float64 39 Total Bsmt SF 2929 non-null float64 40 Heating 2930 non-null object 41 Heating QC 2930 non-null object 42 Central Air 2930 non-null object 43 Electrical 2929 non-null object 44 1st Flr SF 2930 non-null int64 45 2nd Flr SF 2930 non-null int64 46 Low Qual Fin SF 2930 non-null int64 47 Gr Liv Area 2930 non-null int64 48 Bsmt Full Bath 2928 non-null float64 49 Bsmt Half Bath 2928 non-null float64 50 Full Bath 2930 non-null int64 51 Half Bath 2930 non-null int64 52 Bedroom AbvGr 2930 non-null int64 53 Kitchen AbvGr 2930 non-null int64 54 Kitchen Qual 2930 non-null object 55 TotRms AbvGrd 2930 non-null int64 56 Functional 2930 non-null object 57 Fireplaces 2930 non-null int64 58 Fireplace Qu 1508 non-null object 59 Garage Type 2773 non-null object 60 Garage Yr Blt 2771 non-null float64 61 Garage Finish 2771 non-null object 62 Garage Cars 2929 non-null float64 63 Garage Area 2929 non-null float64 64 Garage Qual 2771 non-null object 65 Garage Cond 2771 non-null object 66 Paved Drive 2930 non-null object 67 Wood Deck SF 2930 non-null int64 68 Open Porch SF 2930 non-null int64 69 Enclosed Porch 2930 non-null int64 70 3Ssn Porch 2930 non-null int64 71 Screen Porch 2930 non-null int64 72 Pool Area 2930 non-null int64 73 Pool QC 13 non-null object 74 Fence 572 non-null object 75 Misc Feature 106 non-null object 76 Misc Val 2930 non-null int64 77 Mo Sold 2930 non-null int64 78 Yr Sold 2930 non-null int64 79 Sale Type 2930 non-null object 80 Sale Condition 2930 non-null object 81 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28), object(43) memory usage: 1.8+ MB
None
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
For the functions that we'll create, we'll start by creating them in the first few cells in the notebook. This way, we can add cells to the end of the notebook to do experiments and update the functions in these cells. We aim to:
def transform_features():
return data
def select_features():
return data[["MS SubClass", "Gr Liv Area", "SalePrice"]]
def train_and_test():
train = data[:1460].copy()
test = data[1460:].copy()
features = list(select_features().columns[select_features().columns != "SalePrice"])
lr = LinearRegression()
lr.fit(train[features], train["SalePrice"])
predictions = lr.predict(test[features])
rmse = mean_squared_error(test["SalePrice"], predictions) ** 1/2
return rmse
In general, the goal of this function is to:
def transform_features():
"""
Takes the initial dataframe, and returns a transformed version of the dataframe with the following changes:
1. Drop columns with more than 25% of null values
2. Update null values in remaining numerical columns with column mean
3. Scale numerical columns to a 0-1 scale
4. Create a column that tells how long did it pass since year of construction until year of remodelation
5. Categorical columns are converted to proper dtype
Args:
none
Returns:
df: Modified dataframe
"""
df = data.copy()
# 1. Drop columns with more than 25% of null values
# Create a list with variables having null values and the percentage of null values for each of them
na_props = df.isna().sum()[df.isna().sum() > 0] / len(df)
# Drop variables with null values above 25%
drop_list = list(na_props[na_props > .25].index)
df.drop(drop_list, axis=1, inplace=True)
# 2. Update null values in remaining numerical columns with column mean
# Update the list with variables having null values and the percentage of null values for each of them
na_props = df.isna().sum()[df.isna().sum() > 0] / len(df)
# Create a list with columns having null values, then select a list with columns that, having null values, are numerical
na_cols = list(na_props[na_props > 0].index)
avg_list = list(df[na_cols].select_dtypes(include=["int", "float"]).columns)
# Replace null values in numerical columns with their means
df[avg_list] = df[avg_list].fillna(df[avg_list].mean())
# 3. Scale numerical columns to a 0-1 scale
# Manually create a list with variables to be scaled to 0-1, then apply the scaling
num_cols = ['Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd',
'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
'3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val']
df[num_cols] = (df[num_cols] - df[num_cols].min()) / (df[num_cols].max() - df[num_cols].min())
# 4. Create a column that tells how long did it pass since year of construction until year of remodelation
# Create a new variable telling how old was the house when it was remodeled. In case it wasn't, it tells current house age
df["years_until_remod"] = df["Year Remod/Add"] - df["Year Built"]
df["years_until_remod"] = np.where(df["years_until_remod"] == 0, date.today().year - df["Year Built"], df["years_until_remod"])
# 5. Categorical columns are converted to proper dtype
# Manually create a list with variables to be converted to category dtype, then apply the conversion
categorical_cols = ["Order", "PID", "Overall Qual", "Overall Cond", "Year Remod/Add", "Year Built", "Garage Yr Blt",
"MS SubClass", "MS Zoning", "Street", "Lot Shape", "Land Contour", "Utilities", "Lot Config", "Mo Sold",
"Yr Sold", "Land Slope", "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style",
"Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond",
"Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
"Heating", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type",
"Garage Finish", "Garage Qual", "Garage Cond", "Paved Drive", "Sale Type", "Sale Condition"]
df[categorical_cols] = df[categorical_cols].astype("category")
# Move the SalePrice column to the end of the dataframe, for visualization purposes
cols = list(df.columns.values)
cols.pop(cols.index("SalePrice"))
df = df[cols+["SalePrice"]]
return df
To improve the function that we'll use to select interesting feature columns, first we'll take a look at the correlations between numerical columns
df = transform_features()
sns.set(rc={'figure.figsize':(12,9)})
sns.heatmap(df.select_dtypes(include=["int", "float"]).corr())
<AxesSubplot:>
It seems that the variables with the higher correlation values with SalePrice column are:
Therefore, let's see their individual r values:
# Correlations of interesting features from the heatmap with SalePrice
df[["Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Cars", "SalePrice"]].corr()["SalePrice"]
Total Bsmt SF 0.632105 1st Flr SF 0.621676 Gr Liv Area 0.706780 Garage Cars 0.647861 SalePrice 1.000000 Name: SalePrice, dtype: float64
We'll use these as numerical predictors in our model.
Now, let's see how the categorical variables distribute their values, and we'll keep only those having interesting information, and not concentrating more than 95% of their values in one specific condition:
# Loop to display normalized value counts for each of the categorical columns
for var in ["Order", "PID", "Overall Qual", "Overall Cond", "Year Remod/Add", "Year Built", "Garage Yr Blt",
"MS SubClass", "MS Zoning", "Street", "Lot Shape", "Land Contour", "Utilities", "Lot Config", "Mo Sold",
"Yr Sold", "Land Slope", "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style",
"Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond",
"Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
"Heating", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type",
"Garage Finish", "Garage Qual", "Garage Cond", "Paved Drive", "Sale Type", "Sale Condition"]:
display(df[var].value_counts(normalize=True))
1 0.000341 1947 0.000341 1949 0.000341 1950 0.000341 1951 0.000341 ... 979 0.000341 980 0.000341 981 0.000341 982 0.000341 2930 0.000341 Name: Order, Length: 2930, dtype: float64
526301100 0.000341 905202090 0.000341 905202210 0.000341 905202230 0.000341 905225020 0.000341 ... 533242090 0.000341 533244030 0.000341 533244040 0.000341 533250050 0.000341 1007100110 0.000341 Name: PID, Length: 2930, dtype: float64
5 0.281570 6 0.249829 7 0.205461 8 0.119454 4 0.077133 9 0.036519 3 0.013652 10 0.010580 2 0.004437 1 0.001365 Name: Overall Qual, dtype: float64
5 0.564505 6 0.181911 7 0.133106 8 0.049147 4 0.034471 3 0.017065 9 0.013993 2 0.003413 1 0.002389 Name: Overall Cond, dtype: float64
1950 0.123208 2006 0.068942 2007 0.055973 2005 0.048123 2004 0.037884 ... 1986 0.004437 1981 0.004437 2010 0.004437 1983 0.003754 1982 0.003072 Name: Year Remod/Add, Length: 61, dtype: float64
2005 0.048464 2006 0.047099 2007 0.037201 2004 0.033788 2003 0.030034 ... 1906 0.000341 1907 0.000341 1911 0.000341 1913 0.000341 1872 0.000341 Name: Year Built, Length: 118, dtype: float64
1978.1324431613136 0.054266 2005.0 0.048464 2006.0 0.039249 2007.0 0.039249 2004.0 0.033788 ... 1933.0 0.000341 1919.0 0.000341 1908.0 0.000341 1906.0 0.000341 2207.0 0.000341 Name: Garage Yr Blt, Length: 104, dtype: float64
20 0.368259 60 0.196246 50 0.097952 120 0.065529 30 0.047440 160 0.044027 70 0.043686 80 0.040273 90 0.037201 190 0.020819 85 0.016382 75 0.007850 45 0.006143 180 0.005802 40 0.002048 150 0.000341 Name: MS SubClass, dtype: float64
RL 0.775768 RM 0.157679 FV 0.047440 RH 0.009215 C (all) 0.008532 A (agr) 0.000683 I (all) 0.000683 Name: MS Zoning, dtype: float64
Pave 0.995904 Grvl 0.004096 Name: Street, dtype: float64
Reg 0.634471 IR1 0.334130 IR2 0.025939 IR3 0.005461 Name: Lot Shape, dtype: float64
Lvl 0.898635 HLS 0.040956 Bnk 0.039932 Low 0.020478 Name: Land Contour, dtype: float64
AllPub 0.998976 NoSewr 0.000683 NoSeWa 0.000341 Name: Utilities, dtype: float64
Inside 0.730375 Corner 0.174403 CulDSac 0.061433 FR2 0.029010 FR3 0.004778 Name: Lot Config, dtype: float64
6 0.172355 7 0.153242 5 0.134812 4 0.095222 8 0.079522 3 0.079181 10 0.059044 9 0.054949 11 0.048805 2 0.045392 1 0.041980 12 0.035495 Name: Mo Sold, dtype: float64
2007 0.236860 2009 0.221160 2006 0.213311 2008 0.212287 2010 0.116382 Name: Yr Sold, dtype: float64
Gtl 0.951877 Mod 0.042662 Sev 0.005461 Name: Land Slope, dtype: float64
NAmes 0.151195 CollgCr 0.091126 OldTown 0.081570 Edwards 0.066212 Somerst 0.062116 NridgHt 0.056655 Gilbert 0.056314 Sawyer 0.051536 NWAmes 0.044710 SawyerW 0.042662 Mitchel 0.038908 BrkSide 0.036860 Crawfor 0.035154 IDOTRR 0.031741 Timber 0.024573 NoRidge 0.024232 StoneBr 0.017406 SWISU 0.016382 ClearCr 0.015017 MeadowV 0.012628 BrDale 0.010239 Blmngtn 0.009556 Veenker 0.008191 NPkVill 0.007850 Blueste 0.003413 Greens 0.002730 GrnHill 0.000683 Landmrk 0.000341 Name: Neighborhood, dtype: float64
Norm 0.860751 Feedr 0.055973 Artery 0.031399 RRAn 0.017065 PosN 0.013311 RRAe 0.009556 PosA 0.006826 RRNn 0.003072 RRNe 0.002048 Name: Condition 1, dtype: float64
Norm 0.989761 Feedr 0.004437 Artery 0.001706 PosA 0.001365 PosN 0.001365 RRNn 0.000683 RRAe 0.000341 RRAn 0.000341 Name: Condition 2, dtype: float64
1Fam 0.827645 TwnhsE 0.079522 Duplex 0.037201 Twnhs 0.034471 2fmCon 0.021160 Name: Bldg Type, dtype: float64
1Story 0.505461 2Story 0.297952 1.5Fin 0.107167 SLvl 0.043686 SFoyer 0.028328 2.5Unf 0.008191 1.5Unf 0.006485 2.5Fin 0.002730 Name: House Style, dtype: float64
Gable 0.792150 Hip 0.188055 Gambrel 0.007509 Flat 0.006826 Mansard 0.003754 Shed 0.001706 Name: Roof Style, dtype: float64
CompShg 0.985324 Tar&Grv 0.007850 WdShake 0.003072 WdShngl 0.002389 ClyTile 0.000341 Membran 0.000341 Metal 0.000341 Roll 0.000341 Name: Roof Matl, dtype: float64
VinylSd 0.350171 MetalSd 0.153584 HdBoard 0.150853 Wd Sdng 0.143345 Plywood 0.075427 CemntBd 0.043003 BrkFace 0.030034 WdShing 0.019113 AsbShng 0.015017 Stucco 0.014676 BrkComm 0.002048 AsphShn 0.000683 CBlock 0.000683 Stone 0.000683 ImStucc 0.000341 PreCast 0.000341 Name: Exterior 1st, dtype: float64
VinylSd 0.346416 MetalSd 0.152560 HdBoard 0.138567 Wd Sdng 0.135495 Plywood 0.093515 CmentBd 0.043003 Wd Shng 0.027645 Stucco 0.016041 BrkFace 0.016041 AsbShng 0.012969 Brk Cmn 0.007509 ImStucc 0.005119 Stone 0.002048 AsphShn 0.001365 CBlock 0.001024 Other 0.000341 PreCast 0.000341 Name: Exterior 2nd, dtype: float64
None 0.602683 BrkFace 0.302718 Stone 0.085655 BrkCmn 0.008600 CBlock 0.000344 Name: Mas Vnr Type, dtype: float64
TA 0.613993 Gd 0.337543 Ex 0.036519 Fa 0.011945 Name: Exter Qual, dtype: float64
TA 0.869966 Gd 0.102048 Fa 0.022867 Ex 0.004096 Po 0.001024 Name: Exter Cond, dtype: float64
PConc 0.447099 CBlock 0.424573 BrkTil 0.106143 Slab 0.016724 Stone 0.003754 Wood 0.001706 Name: Foundation, dtype: float64
TA 0.450175 Gd 0.427719 Ex 0.090526 Fa 0.030877 Po 0.000702 Name: Bsmt Qual, dtype: float64
TA 0.917895 Gd 0.042807 Fa 0.036491 Po 0.001754 Ex 0.001053 Name: Bsmt Cond, dtype: float64
No 0.669477 Av 0.146821 Gd 0.099754 Mn 0.083948 Name: Bsmt Exposure, dtype: float64
GLQ 0.301404 Unf 0.298596 ALQ 0.150526 Rec 0.101053 BLQ 0.094386 LwQ 0.054035 Name: BsmtFin Type 1, dtype: float64
Unf 0.877150 Rec 0.037206 LwQ 0.031239 BLQ 0.023868 ALQ 0.018603 GLQ 0.011934 Name: BsmtFin Type 2, dtype: float64
GasA 0.984642 GasW 0.009215 Grav 0.003072 Wall 0.002048 OthW 0.000683 Floor 0.000341 Name: Heating, dtype: float64
Ex 0.510239 TA 0.294881 Gd 0.162457 Fa 0.031399 Po 0.001024 Name: Heating QC, dtype: float64
Y 0.933106 N 0.066894 Name: Central Air, dtype: float64
SBrkr 0.915671 FuseA 0.064186 FuseF 0.017071 FuseP 0.002731 Mix 0.000341 Name: Electrical, dtype: float64
TA 0.509898 Gd 0.395904 Ex 0.069966 Fa 0.023891 Po 0.000341 Name: Kitchen Qual, dtype: float64
Typ 0.931058 Min2 0.023891 Min1 0.022184 Mod 0.011945 Maj1 0.006485 Maj2 0.003072 Sal 0.000683 Sev 0.000683 Name: Functional, dtype: float64
Attchd 0.624234 Detchd 0.282005 BuiltIn 0.067075 Basment 0.012982 2Types 0.008294 CarPort 0.005409 Name: Garage Type, dtype: float64
Unf 0.444244 RFn 0.293035 Fin 0.262721 Name: Garage Finish, dtype: float64
TA 0.943703 Fa 0.044749 Gd 0.008661 Po 0.001804 Ex 0.001083 Name: Garage Qual, dtype: float64
TA 0.961747 Fa 0.026705 Gd 0.005413 Po 0.005052 Ex 0.001083 Name: Garage Cond, dtype: float64
Y 0.905119 N 0.073720 P 0.021160 Name: Paved Drive, dtype: float64
WD 0.865529 New 0.081570 COD 0.029693 ConLD 0.008874 CWD 0.004096 ConLI 0.003072 ConLw 0.002730 Oth 0.002389 Con 0.001706 VWD 0.000341 Name: Sale Type, dtype: float64
Normal 0.823549 Partial 0.083618 Abnorml 0.064846 Family 0.015700 Alloca 0.008191 AdjLand 0.004096 Name: Sale Condition, dtype: float64
From all the categorical variables, we'll only keep the following:
"Overall Qual", "Overall Cond", "MS SubClass", "MS Zoning", "Lot Shape", "Land Contour", "Lot Config", "Neighborhood", "Condition 1", "Bldg Type", "House Style", "Roof Style", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond", "Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type", "Garage Finish", "Garage Qual", "Paved Drive", "Sale Type", "Sale Condition".
We'll now proceed with updating the select_features() function, to keep only the numerical variables of our interest, and transforming the interesting categorical variables into dummy variables:
def select_features():
"""
Takes the transformed dataframe from transform_features(), and returns a version with the following changes:
1. Keep only numerical variables of interest
2. Convert interesting categorical variables to dummy (0-1) variables
3. Append SalePrice variable at the end of the dataframe
Args:
dataframe: Transformed dataframe as input
Returns:
df: Modified dataframe with the selected attributes
"""
dataframe = transform_features().copy()
df = pd.DataFrame()
numeric_cols = ["Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Cars"]
categorical_cols = ["Overall Qual", "Overall Cond", "MS SubClass", "MS Zoning", "Lot Shape", "Land Contour",
"Lot Config", "Neighborhood", "Condition 1", "Bldg Type", "House Style", "Roof Style",
"Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond", "Foundation",
"Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2", "Heating QC",
"Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type", "Garage Finish",
"Garage Qual", "Paved Drive", "Sale Type", "Sale Condition"]
# 1. Keep only numerical variables of interest
df[numeric_cols] = dataframe[numeric_cols]
# 2. Convert interesting categorical variables to dummy (0-1) variables
dummy_cols = pd.get_dummies(dataframe[categorical_cols])
df = pd.concat([df, dummy_cols], axis=1)
# 3. Append SalePrice variable at the end of the dataframe
df["SalePrice"] = dataframe["SalePrice"]
return df
Finally, we'll update the train_and_test() function to implement holdoud, simple cross or multiple cross validation depending on a parameter k.
def train_and_test(k):
"""
Takes the modified dataframe from select_features(), and applies a Machine Learning -Linear Regression- model to return
the Root Mean Squared Error (RMSE) of the model.
Args:
k: Number of k-folds for the cross-validation. If 0, applies holdout validation.
Returns:
rmse: RMSE value for k=0 or k=1
avg_rmse: Average RMSE value for k>1
"""
df = select_features()
features = list(df.columns[select_features().columns != "SalePrice"])
# Apply holdout validation if k=0
if k == 0:
train = df[:1460].copy()
test = df[1460:].copy()
lr = LinearRegression()
lr.fit(train[features], train["SalePrice"])
predictions = lr.predict(test[features])
rmse = mean_squared_error(test["SalePrice"], predictions) ** 1/2
return rmse
# Apply simple cross validation if k=1
elif k == 1:
shuf_df = df.loc[np.random.permutation(len(df))]
train = shuf_df[:1460].copy()
test = shuf_df[1460:].copy()
lr = LinearRegression()
lr.fit(train[features], train["SalePrice"])
predictions = lr.predict(test[features])
rmse = mean_squared_error(test["SalePrice"], predictions) ** 1/2
return rmse
# Apply multiple cross validation if k>1
elif k > 1:
shuf_df = df.loc[np.random.permutation(len(df))]
rmse_vals = []
kf = KFold(k, shuffle=True, random_state=1)
for train_index, test_index in kf.split(shuf_df):
train = shuf_df.iloc[train_index].copy()
test = shuf_df.iloc[test_index].copy()
lr = LinearRegression()
lr.fit(train[features], train["SalePrice"])
predictions = lr.predict(test[features])
rmse = mean_squared_error(test["SalePrice"], predictions) ** 1/2
rmse_vals.append(rmse)
avg_rmse = np.mean(rmse_vals)
print("RMSE values:", rmse_vals)
return avg_rmse
train_and_test(4)
RMSE values: [3.626814340838356e+28, 4.762010281276928e+30, 2.7789657609906986e+27, 5.390723126202538e+29]
1.335032425766639e+30