In this project we will fine-tune a linear regression algorithm to predict house sale prices from a large dataset on houses in Ames, Iowa, United States from 2006 to 2010. You can read more about why the data was collected here. You can also read about the different columns in the data here.
#Importing the necessary libraries and algorithms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
#reading in the data
data = pd.read_csv('AmesHousing.tsv', sep='\t')
#Keep an untouched backup for later. .copy() is required: a plain
#assignment would alias the same DataFrame, so in-place edits to `data`
#(e.g. the fillna below) could silently alter the backup too.
raw_data = data.copy()
In order to be able to tweak the model continuously we will define three functions: transform_features()
, select_features()
and train_test()
. Those will be the functions we will work with throughout this project, and we will update them each time. Hence they will be located in one block, so we can copy them and show the differences from their previous versions with in-line comments.
#This will be the default function block or pipeline.
#From now on I will remove the inline comments made so I can highlight the new functionalities every time.
#Here the basic functionalities of the model will be laid out.
def transform_features(df):
    """Placeholder transformation step: returns the frame untouched for now."""
    return df
def select_features(df):
    """Select the modelling feature(s) plus the target column 'SalePrice'."""
    wanted = ['Gr Liv Area', 'SalePrice']
    return df[wanted]
def train_test(df):
    """Fit a LinearRegression on the first 1460 rows and return the RMSE
    on the remaining rows.

    The feature set is whatever select_features() returns, minus the
    target column 'SalePrice'. The split is positional (no shuffling).
    """
    df = select_features(df)
    #Holdout split: first 1460 rows train, the rest test.
    train = df[:1460]
    test = df[1460:]
    #Only numeric columns can be fed to LinearRegression directly.
    #(The original also built an unused `numeric_test`; removed.)
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    features = numeric_train.columns.drop('SalePrice')
    #Training the model.
    model = LinearRegression()
    model.fit(train[features], train['SalePrice'])
    #Testing the model.
    prediction = model.predict(test[features])
    #Root mean squared error is the error metric.
    mse = mean_squared_error(test['SalePrice'], prediction)
    rmse = mse ** 0.5
    return rmse
#Run the pipeline end to end: transform, select, then train & evaluate.
transform_df = transform_features(data)
select_df = select_features(transform_df)
rmse = train_test(select_df)
print('rmse:', rmse)
rmse: 57088.25161263909
The transform_features()
function serves — as you may guess — to transform the values before we select them. These are the general considerations for transformation.
Right now we will explore the data in the following order:
#Fraction of missing values per column (0.0 - 1.0).
#isnull().mean() is equivalent to isnull().sum() / len(data).
per_missing = data.isnull().mean()
per_missing
Order 0.000000 PID 0.000000 MS SubClass 0.000000 MS Zoning 0.000000 Lot Frontage 0.167235 Lot Area 0.000000 Street 0.000000 Alley 0.932423 Lot Shape 0.000000 Land Contour 0.000000 Utilities 0.000000 Lot Config 0.000000 Land Slope 0.000000 Neighborhood 0.000000 Condition 1 0.000000 Condition 2 0.000000 Bldg Type 0.000000 House Style 0.000000 Overall Qual 0.000000 Overall Cond 0.000000 Year Built 0.000000 Year Remod/Add 0.000000 Roof Style 0.000000 Roof Matl 0.000000 Exterior 1st 0.000000 Exterior 2nd 0.000000 Mas Vnr Type 0.007850 Mas Vnr Area 0.007850 Exter Qual 0.000000 Exter Cond 0.000000 Foundation 0.000000 Bsmt Qual 0.027304 Bsmt Cond 0.027304 Bsmt Exposure 0.028328 BsmtFin Type 1 0.027304 BsmtFin SF 1 0.000341 BsmtFin Type 2 0.027645 BsmtFin SF 2 0.000341 Bsmt Unf SF 0.000341 Total Bsmt SF 0.000341 Heating 0.000000 Heating QC 0.000000 Central Air 0.000000 Electrical 0.000341 1st Flr SF 0.000000 2nd Flr SF 0.000000 Low Qual Fin SF 0.000000 Gr Liv Area 0.000000 Bsmt Full Bath 0.000683 Bsmt Half Bath 0.000683 Full Bath 0.000000 Half Bath 0.000000 Bedroom AbvGr 0.000000 Kitchen AbvGr 0.000000 Kitchen Qual 0.000000 TotRms AbvGrd 0.000000 Functional 0.000000 Fireplaces 0.000000 Fireplace Qu 0.485324 Garage Type 0.053584 Garage Yr Blt 0.054266 Garage Finish 0.054266 Garage Cars 0.000341 Garage Area 0.000341 Garage Qual 0.054266 Garage Cond 0.054266 Paved Drive 0.000000 Wood Deck SF 0.000000 Open Porch SF 0.000000 Enclosed Porch 0.000000 3Ssn Porch 0.000000 Screen Porch 0.000000 Pool Area 0.000000 Pool QC 0.995563 Fence 0.804778 Misc Feature 0.963823 Misc Val 0.000000 Mo Sold 0.000000 Yr Sold 0.000000 Sale Type 0.000000 Sale Condition 0.000000 SalePrice 0.000000 dtype: float64
#Keep only the columns where fewer than 5% of the values are missing.
per_missing = per_missing.index[per_missing < 0.05]
per_missing
Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Area', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Garage Cars', 'Garage Area', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'SalePrice'], dtype='object')
#Drop the first batch of columns: keep only the sufficiently complete ones.
data = data.loc[:, per_missing]
data
Order | PID | MS SubClass | MS Zoning | Lot Area | Street | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Garage Cars | Garage Area | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 31770 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | 2.0 | 528.0 | P | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 11622 | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 1.0 | 730.0 | Y | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 14267 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | 1.0 | 312.0 | Y | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 11160 | Pave | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | 2.0 | 522.0 | Y | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 13830 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | 2.0 | 482.0 | Y | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | WD | Normal | 189900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 2926 | 923275080 | 80 | RL | 7937 | Pave | IR1 | Lvl | AllPub | CulDSac | Gtl | Mitchel | Norm | Norm | 1Fam | SLvl | 6 | 6 | 1984 | 1984 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | TA | TA | Av | GLQ | 819.0 | Unf | 0.0 | 184.0 | 1003.0 | GasA | TA | Y | SBrkr | 1003 | 0 | 0 | 1003 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 2.0 | 588.0 | Y | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2006 | WD | Normal | 142500 |
2926 | 2927 | 923276100 | 20 | RL | 8885 | Pave | IR1 | Low | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1983 | 1983 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | BLQ | 301.0 | ALQ | 324.0 | 239.0 | 864.0 | GasA | TA | Y | SBrkr | 902 | 0 | 0 | 902 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 2.0 | 484.0 | Y | 164 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 2006 | WD | Normal | 131000 |
2927 | 2928 | 923400125 | 85 | RL | 10441 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | SFoyer | 5 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | Wd Shng | None | 0.0 | TA | TA | PConc | Gd | TA | Av | GLQ | 337.0 | Unf | 0.0 | 575.0 | 912.0 | GasA | TA | Y | SBrkr | 970 | 0 | 0 | 970 | 0.0 | 1.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 0.0 | 0.0 | Y | 80 | 32 | 0 | 0 | 0 | 0 | 700 | 7 | 2006 | WD | Normal | 132000 |
2928 | 2929 | 924100070 | 20 | RL | 10010 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1974 | 1975 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | ALQ | 1071.0 | LwQ | 123.0 | 195.0 | 1389.0 | GasA | Gd | Y | SBrkr | 1389 | 0 | 0 | 1389 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 6 | Typ | 1 | 2.0 | 418.0 | Y | 240 | 38 | 0 | 0 | 0 | 0 | 0 | 4 | 2006 | WD | Normal | 170000 |
2929 | 2930 | 924151050 | 60 | RL | 9627 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1993 | 1994 | Gable | CompShg | HdBoard | HdBoard | BrkFace | 94.0 | TA | TA | PConc | Gd | TA | Av | LwQ | 758.0 | Unf | 0.0 | 238.0 | 996.0 | GasA | Ex | Y | SBrkr | 996 | 1004 | 0 | 2000 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 9 | Typ | 1 | 3.0 | 650.0 | Y | 190 | 48 | 0 | 0 | 0 | 0 | 0 | 11 | 2006 | WD | Normal | 188000 |
2930 rows × 71 columns
For the text columns we will remove all with missing values.
For the numeric columns we can fill the missing values with the mode.
#Drop every text (object) column that still has missing values.
null_counts = data.select_dtypes(include=['object']).isnull().sum()
data = data.drop(null_counts[null_counts > 0].index, axis=1)
#Fill the remaining numeric gaps with each column's mode
#(mode() returns a frame; row 0 holds the most frequent value per column).
cols = data.select_dtypes(include=['integer', 'float']).columns
data[cols] = data[cols].fillna(data[cols].mode().iloc[0])
data.isnull().sum().value_counts() #They are all filled now
0 64 dtype: int64
New features to create:
#Age of the house at the time of sale; negative values indicate bad data.
years_sold = data['Yr Sold'] - data['Year Built']
years_sold[years_sold < 0] #Only one unexpected value.
2180 -1 dtype: int64
#Years between construction and remodel; negative values indicate bad data.
year_until_remod = data['Year Remod/Add'] - data['Year Built']
year_until_remod[year_until_remod<0]
850 -1 dtype: int64
#Attach the two engineered age features to the frame.
data['Years Sold'] = years_sold
data['Year Until Remod'] = year_until_remod
Columns that leak data about the sale:
Columns that are not useful for machine learning:
#Drop sale-leakage columns and row identifiers with no predictive value.
data = data.drop(columns=['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'PID', 'Order'])
def transform_features(data):
    """Clean the raw Ames frame and engineer the age-related features.

    Steps (each derived in the exploration above):
      1. Drop columns with >= 5% missing values.
      2. Drop text columns that still contain any missing values.
      3. Fill the remaining numeric gaps with each column's mode.
      4. Add 'Years Sold' and 'Year Until Remod'.
      5. Drop sale-leakage columns and row identifiers.
    """
    per_missing = data.isnull().sum() / len(data)
    per_missing = per_missing[per_missing < 0.05].index
    data = data[per_missing]
    text_data_cols = data.select_dtypes(include=['object']).isnull().sum()
    text_drop_cols = text_data_cols[text_data_cols > 0]
    data = data.drop(text_drop_cols.index, axis=1)
    numeric_data = data.select_dtypes(include=['integer', 'float'])
    cols = numeric_data.columns
    #Row 0 of mode() holds the most frequent value for every column.
    mode = data.filter(cols).mode()
    data[cols] = data[cols].fillna(mode.iloc[0])
    #(Removed the original `data.isnull().sum().value_counts()` line here:
    # inside a function its result was silently discarded — dead code.)
    years_sold = data['Yr Sold'] - data['Year Built']
    year_until_remod = data['Year Remod/Add'] - data['Year Built']
    data['Years Sold'] = years_sold
    data['Year Until Remod'] = year_until_remod
    data = data.drop(['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'PID', 'Order'], axis=1)
    return data
def select_features(df):
    """Return only the model feature(s) together with the target column."""
    keep = ['Gr Liv Area', 'SalePrice']
    return df[keep]
def train_test(df):
    """Fit LinearRegression on the first 1460 rows; return the holdout RMSE.

    Uses select_features() to pick the columns; the positional split
    (no shuffling) mirrors the original train/test scheme.
    """
    df = select_features(df)
    train = df[:1460]
    test = df[1460:]
    #Only numeric columns are usable as regression inputs.
    #(The original also built an unused `numeric_test`; removed.)
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    features = numeric_train.columns.drop('SalePrice')
    model = LinearRegression()
    model.fit(train[features], train['SalePrice'])
    prediction = model.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], prediction)
    rmse = mse ** 0.5
    return rmse
#transform_features() is skipped here: `data` was already transformed cell
#by cell above, and transforming it a second time raises an error.
select_df = select_features(data)
rmse = train_test(select_df)
print('rmse:', rmse)
rmse: 57088.25161263909
In this block we will select the features from the data we will use in the end.
#For the numeric columns we will generate a heatmap
#NOTE(review): seaborn is imported but not used in this cell —
#background_gradient only needs a matplotlib colormap name.
import seaborn as sns
#Pairwise correlations between all numeric columns.
cols = data.select_dtypes(include=['integer','float']).columns
corr = data[cols].corr()
corr.style.background_gradient(cmap='viridis')
MS SubClass | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | SalePrice | Years Sold | Year Until Remod | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MS SubClass | 1.000000 | -0.204613 | 0.039419 | -0.067349 | 0.036579 | 0.043397 | 0.003634 | -0.059767 | -0.070847 | -0.129987 | -0.218489 | -0.247828 | 0.304237 | 0.025765 | 0.068061 | 0.014187 | -0.003179 | 0.134631 | 0.175879 | -0.019208 | 0.257698 | 0.031898 | -0.049955 | -0.045851 | -0.103374 | -0.017310 | -0.014823 | -0.022866 | -0.037956 | -0.050614 | -0.003434 | -0.029254 | -0.085092 | -0.037302 | -0.008367 |
Lot Area | -0.204613 | 1.000000 | 0.097188 | -0.034759 | 0.023258 | 0.021682 | 0.125950 | 0.191692 | 0.083198 | 0.023883 | 0.253765 | 0.332235 | 0.032996 | 0.000812 | 0.285599 | 0.124208 | 0.026448 | 0.127433 | 0.035497 | 0.136569 | -0.020301 | 0.216597 | 0.256989 | 0.179494 | 0.212749 | 0.157212 | 0.103760 | 0.021868 | 0.016243 | 0.055044 | 0.093775 | 0.069188 | 0.266549 | -0.024227 | -0.010451 |
Overall Qual | 0.039419 | 0.097188 | 1.000000 | -0.094812 | 0.597027 | 0.569609 | 0.423089 | 0.284458 | -0.041122 | 0.270527 | 0.547766 | 0.477837 | 0.241402 | -0.048680 | 0.570556 | 0.168373 | -0.041433 | 0.522263 | 0.268853 | 0.063291 | -0.159744 | 0.380693 | 0.393007 | 0.599392 | 0.563562 | 0.255663 | 0.298412 | -0.140332 | 0.018240 | 0.041615 | 0.030399 | 0.005179 | 0.799262 | -0.597021 | -0.256948 |
Overall Cond | -0.067349 | -0.034759 | -0.094812 | 1.000000 | -0.368773 | 0.047680 | -0.132472 | -0.051341 | 0.040992 | -0.137303 | -0.174179 | -0.157052 | 0.006218 | 0.009175 | -0.115643 | -0.043458 | 0.084185 | -0.214316 | -0.088127 | -0.006137 | -0.086386 | -0.089816 | -0.031702 | -0.181508 | -0.153918 | 0.020344 | -0.068934 | 0.071459 | 0.043852 | 0.044055 | -0.016787 | 0.034056 | -0.101697 | 0.369570 | 0.505496 |
Year Built | 0.036579 | 0.023258 | 0.597027 | -0.368773 | 1.000000 | 0.612095 | 0.306797 | 0.280069 | -0.027328 | 0.129311 | 0.407764 | 0.310463 | 0.016828 | -0.144282 | 0.241726 | 0.212119 | -0.030515 | 0.469406 | 0.269268 | -0.055093 | -0.137852 | 0.111919 | 0.170672 | 0.537033 | 0.480726 | 0.228964 | 0.198365 | -0.374364 | 0.015803 | -0.041436 | 0.002213 | -0.011011 | 0.558426 | -0.999055 | -0.727220 |
Year Remod/Add | 0.043397 | 0.021682 | 0.569609 | 0.047680 | 0.612095 | 1.000000 | 0.191740 | 0.152240 | -0.061934 | 0.165397 | 0.298393 | 0.242108 | 0.158939 | -0.060365 | 0.316855 | 0.134590 | -0.046175 | 0.457266 | 0.211771 | -0.021536 | -0.142404 | 0.197528 | 0.133322 | 0.425434 | 0.375566 | 0.217857 | 0.241748 | -0.220383 | 0.037412 | -0.046888 | -0.011410 | -0.003132 | 0.532974 | -0.609747 | 0.097670 |
Mas Vnr Area | 0.003634 | 0.125950 | 0.423089 | -0.132472 | 0.306797 | 0.191740 | 1.000000 | 0.299951 | -0.014955 | 0.089662 | 0.393530 | 0.392280 | 0.120159 | -0.057173 | 0.400703 | 0.139684 | 0.016609 | 0.254471 | 0.188703 | 0.080590 | -0.050733 | 0.278418 | 0.270051 | 0.356646 | 0.370479 | 0.165875 | 0.140003 | -0.109915 | 0.014222 | 0.066817 | 0.004893 | 0.045276 | 0.502196 | -0.307065 | -0.219680 |
BsmtFin SF 1 | -0.059767 | 0.191692 | 0.284458 | -0.051341 | 0.280069 | 0.152240 | 0.299951 | 1.000000 | -0.054022 | -0.477244 | 0.536731 | 0.457586 | -0.163710 | -0.066129 | 0.209944 | 0.640212 | 0.077687 | 0.078086 | -0.008205 | -0.118575 | -0.086654 | 0.048118 | 0.296098 | 0.255750 | 0.309649 | 0.224199 | 0.125150 | -0.100318 | 0.050567 | 0.095952 | 0.084147 | 0.092900 | 0.433147 | -0.278673 | -0.220328 |
BsmtFin SF 2 | -0.070847 | 0.083198 | -0.041122 | 0.040992 | -0.027328 | -0.061934 | -0.014955 | -0.054022 | 1.000000 | -0.238420 | 0.090109 | 0.084710 | -0.098188 | -0.004989 | -0.017996 | 0.163634 | 0.099147 | -0.076262 | -0.033004 | -0.033033 | -0.037864 | -0.049288 | 0.067180 | -0.014395 | 0.003625 | 0.098591 | -0.005516 | 0.032415 | -0.023314 | 0.062978 | 0.044403 | -0.005195 | 0.006018 | 0.027594 | -0.019367 |
Bsmt Unf SF | -0.129987 | 0.023883 | 0.270527 | -0.137303 | 0.129311 | 0.165397 | 0.089662 | -0.477244 | -0.238420 | 1.000000 | 0.412248 | 0.296139 | 0.002320 | 0.047207 | 0.235870 | -0.398629 | -0.105784 | 0.274901 | -0.033530 | 0.188508 | 0.065651 | 0.251633 | 0.001790 | 0.179736 | 0.164837 | -0.039285 | 0.119143 | 0.006383 | -0.005399 | -0.047945 | -0.031963 | -0.010125 | 0.183308 | -0.130699 | -0.019176 |
Total Bsmt SF | -0.218489 | 0.253765 | 0.547766 | -0.174179 | 0.407764 | 0.298393 | 0.393530 | 0.536731 | 0.090109 | 0.412248 | 1.000000 | 0.800429 | -0.204487 | -0.023180 | 0.445108 | 0.326861 | 0.012850 | 0.325434 | -0.054557 | 0.052721 | -0.038612 | 0.281627 | 0.333467 | 0.437861 | 0.485608 | 0.230290 | 0.245941 | -0.084848 | 0.037918 | 0.075499 | 0.072109 | 0.083895 | 0.632529 | -0.407603 | -0.254174 |
1st Flr SF | -0.247828 | 0.332235 | 0.477837 | -0.157052 | 0.310463 | 0.242108 | 0.392280 | 0.457586 | 0.084710 | 0.296139 | 0.800429 | 1.000000 | -0.250057 | -0.012689 | 0.562166 | 0.257836 | 0.010469 | 0.371584 | -0.104203 | 0.106648 | 0.076025 | 0.390162 | 0.406345 | 0.439370 | 0.491208 | 0.227131 | 0.238041 | -0.065713 | 0.044061 | 0.098316 | 0.121821 | 0.093003 | 0.621676 | -0.310585 | -0.180573 |
2nd Flr SF | 0.304237 | 0.032996 | 0.241402 | 0.006218 | 0.016828 | 0.158939 | 0.120159 | -0.163710 | -0.098188 | 0.002320 | -0.204487 | -0.250057 | 1.000000 | 0.018024 | 0.655251 | -0.163068 | -0.059458 | 0.404143 | 0.611634 | 0.504651 | 0.069444 | 0.585214 | 0.165844 | 0.182199 | 0.127508 | 0.089097 | 0.184538 | 0.055429 | -0.032172 | 0.011741 | 0.044602 | -0.005078 | 0.269373 | -0.017607 | 0.116782 |
Low Qual Fin SF | 0.025765 | 0.000812 | -0.048680 | 0.009175 | -0.144282 | -0.060365 | -0.057173 | -0.066129 | -0.004989 | 0.047207 | -0.023180 | -0.012689 | 0.018024 | 1.000000 | 0.097050 | -0.047215 | -0.013385 | -0.002606 | -0.039303 | 0.070531 | 0.000517 | 0.102181 | -0.006946 | -0.067336 | -0.053390 | -0.015646 | -0.000761 | 0.087326 | -0.004505 | 0.006943 | 0.035200 | -0.005939 | -0.037660 | 0.143973 | 0.129186 |
Gr Liv Area | 0.068061 | 0.285599 | 0.570556 | -0.115643 | 0.241726 | 0.316855 | 0.400703 | 0.209944 | -0.017996 | 0.235870 | 0.445108 | 0.562166 | 0.655251 | 0.097050 | 1.000000 | 0.057367 | -0.043498 | 0.630321 | 0.433949 | 0.516808 | 0.117836 | 0.807772 | 0.454924 | 0.488854 | 0.483971 | 0.250153 | 0.340857 | 0.004030 | 0.006481 | 0.086804 | 0.135463 | 0.067252 | 0.706780 | -0.242510 | -0.029186 |
Bsmt Full Bath | 0.014187 | 0.124208 | 0.168373 | -0.043458 | 0.212119 | 0.134590 | 0.139684 | 0.640212 | 0.163634 | -0.398629 | 0.326861 | 0.257836 | -0.163068 | -0.047215 | 0.057367 | 1.000000 | -0.149027 | -0.023011 | -0.035385 | -0.161084 | -0.018349 | -0.043642 | 0.172775 | 0.162053 | 0.184903 | 0.187283 | 0.080765 | -0.069014 | 0.027086 | 0.052355 | 0.043730 | -0.004817 | 0.275823 | -0.209849 | -0.150131 |
Bsmt Half Bath | -0.003179 | 0.026448 | -0.041433 | 0.084185 | -0.030515 | -0.046175 | 0.016609 | 0.077687 | 0.099147 | -0.105784 | 0.012850 | 0.010469 | -0.059458 | -0.013385 | -0.043498 | -0.149027 | 1.000000 | -0.046371 | -0.057999 | 0.020333 | -0.064671 | -0.048069 | 0.038089 | -0.033463 | -0.021147 | 0.051546 | -0.035276 | -0.009272 | 0.026971 | 0.042372 | 0.066911 | 0.036996 | -0.035817 | 0.029619 | -0.001676 |
Full Bath | 0.134631 | 0.127433 | 0.522263 | -0.214316 | 0.469406 | 0.457266 | 0.254471 | 0.078086 | -0.076262 | 0.274901 | 0.325434 | 0.371584 | 0.404143 | -0.002606 | 0.630321 | -0.023011 | -0.046371 | 1.000000 | 0.159689 | 0.359489 | 0.171175 | 0.528599 | 0.229793 | 0.478206 | 0.406497 | 0.179574 | 0.258675 | -0.117795 | 0.015435 | -0.015130 | 0.028205 | -0.009771 | 0.545604 | -0.468900 | -0.193849 |
Half Bath | 0.175879 | 0.035497 | 0.268853 | -0.088127 | 0.269268 | 0.211771 | 0.188703 | -0.008205 | -0.033004 | -0.033530 | -0.054557 | -0.104203 | 0.611634 | -0.039303 | 0.433949 | -0.035385 | -0.057999 | 0.159689 | 1.000000 | 0.247053 | -0.042321 | 0.346493 | 0.201871 | 0.233324 | 0.178735 | 0.115212 | 0.180704 | -0.081312 | -0.023231 | 0.035990 | 0.001515 | 0.026648 | 0.285056 | -0.268792 | -0.155062 |
Bedroom AbvGr | -0.019208 | 0.136569 | 0.063291 | -0.006137 | -0.055093 | -0.021536 | 0.080590 | -0.118575 | -0.033033 | 0.188508 | 0.052721 | 0.106648 | 0.504651 | 0.070531 | 0.516808 | -0.161084 | 0.020333 | 0.359489 | 0.247053 | 1.000000 | 0.240737 | 0.672647 | 0.076971 | 0.091375 | 0.073417 | 0.029711 | 0.083650 | 0.052115 | -0.047151 | 0.009250 | 0.036707 | 0.000887 | 0.143913 | 0.054227 | 0.050642 |
Kitchen AbvGr | 0.257698 | -0.020301 | -0.159744 | -0.086386 | -0.137852 | -0.142404 | -0.050733 | -0.086654 | -0.037864 | 0.065651 | -0.038612 | 0.076025 | 0.069444 | 0.000517 | 0.117836 | -0.018349 | -0.064671 | 0.171175 | -0.042321 | 0.240737 | 1.000000 | 0.294445 | -0.108085 | -0.037113 | -0.057576 | -0.087410 | -0.068283 | 0.027911 | -0.021379 | -0.056337 | -0.013066 | 0.025145 | -0.119814 | 0.139182 | 0.049882 |
TotRms AbvGrd | 0.031898 | 0.216597 | 0.380693 | -0.089816 | 0.111919 | 0.197528 | 0.278418 | 0.048118 | -0.049288 | 0.251633 | 0.281627 | 0.390162 | 0.585214 | 0.102181 | 0.807772 | -0.043642 | -0.048069 | 0.528599 | 0.346493 | 0.672647 | 0.294445 | 1.000000 | 0.302865 | 0.355517 | 0.327217 | 0.154735 | 0.235684 | 0.017221 | -0.025097 | 0.033731 | 0.072103 | 0.061134 | 0.495474 | -0.113074 | 0.030604 |
Fireplaces | -0.049955 | 0.256989 | 0.393007 | -0.031702 | 0.170672 | 0.133322 | 0.270051 | 0.296098 | 0.067180 | 0.001790 | 0.333467 | 0.406345 | 0.165844 | -0.006946 | 0.454924 | 0.172775 | 0.038089 | 0.229793 | 0.201871 | 0.076971 | -0.108085 | 0.302865 | 1.000000 | 0.321057 | 0.294262 | 0.228064 | 0.159637 | -0.000250 | 0.018414 | 0.168004 | 0.098449 | 0.008192 | 0.474558 | -0.170744 | -0.099070 |
Garage Cars | -0.045851 | 0.179494 | 0.599392 | -0.181508 | 0.537033 | 0.425434 | 0.356646 | 0.255750 | -0.014395 | 0.179736 | 0.437861 | 0.439370 | 0.182199 | -0.067336 | 0.488854 | 0.162053 | -0.033463 | 0.478206 | 0.233324 | 0.091375 | -0.037113 | 0.355517 | 0.321057 | 1.000000 | 0.888699 | 0.241272 | 0.204088 | -0.132332 | 0.023334 | 0.042980 | 0.030386 | -0.016957 | 0.647812 | -0.537197 | -0.306588 |
Garage Area | -0.103374 | 0.212749 | 0.563562 | -0.153918 | 0.480726 | 0.375566 | 0.370479 | 0.309649 | 0.003625 | 0.164837 | 0.485608 | 0.491208 | 0.127508 | -0.053390 | 0.483971 | 0.184903 | -0.021147 | 0.406497 | 0.178735 | 0.073417 | -0.057576 | 0.327217 | 0.294262 | 0.888699 | 1.000000 | 0.237682 | 0.233229 | -0.108237 | 0.029511 | 0.062598 | 0.053055 | 0.008526 | 0.640138 | -0.480541 | -0.279010 |
Wood Deck SF | -0.017310 | 0.157212 | 0.255663 | 0.020344 | 0.228964 | 0.217857 | 0.165875 | 0.224199 | 0.098591 | -0.039285 | 0.230290 | 0.227131 | 0.089097 | -0.015646 | 0.250153 | 0.187283 | 0.051546 | 0.179574 | 0.115212 | 0.029711 | -0.087410 | 0.154735 | 0.228064 | 0.241272 | 0.237682 | 1.000000 | 0.039243 | -0.119136 | -0.003967 | -0.052191 | 0.094156 | 0.056820 | 0.327143 | -0.228578 | -0.099055 |
Open Porch SF | -0.014823 | 0.103760 | 0.298412 | -0.068934 | 0.198365 | 0.241748 | 0.140003 | 0.125150 | -0.005516 | 0.119143 | 0.245941 | 0.238041 | 0.184538 | -0.000761 | 0.340857 | 0.080765 | -0.035276 | 0.258675 | 0.180704 | 0.083650 | -0.068283 | 0.235684 | 0.159637 | 0.204088 | 0.233229 | 0.039243 | 1.000000 | -0.059875 | -0.009458 | 0.047548 | 0.064135 | 0.077254 | 0.312951 | -0.199692 | -0.039808 |
Enclosed Porch | -0.022866 | 0.021868 | -0.140332 | 0.071459 | -0.374364 | -0.220383 | -0.109915 | -0.100318 | 0.032415 | 0.006383 | -0.084848 | -0.065713 | 0.055429 | 0.087326 | 0.004030 | -0.069014 | -0.009272 | -0.117795 | -0.081312 | 0.052115 | 0.027911 | 0.017221 | -0.000250 | -0.132332 | -0.108237 | -0.119136 | -0.059875 | 1.000000 | -0.032674 | -0.063965 | 0.092596 | 0.008773 | -0.128787 | 0.373774 | 0.279852 |
3Ssn Porch | -0.037956 | 0.016243 | 0.018240 | 0.043852 | 0.015803 | 0.037412 | 0.014222 | 0.050567 | -0.023314 | -0.005399 | 0.037918 | 0.044061 | -0.032172 | -0.004505 | 0.006481 | 0.027086 | 0.026971 | 0.015435 | -0.023231 | -0.047151 | -0.021379 | -0.025097 | 0.018414 | 0.023334 | 0.029511 | -0.003967 | -0.009458 | -0.032674 | 1.000000 | -0.029430 | -0.006501 | -0.000753 | 0.032225 | -0.014794 | 0.012585 |
Screen Porch | -0.050614 | 0.055044 | 0.041615 | 0.044055 | -0.041436 | -0.046888 | 0.066817 | 0.095952 | 0.062978 | -0.047945 | 0.075499 | 0.098316 | 0.011741 | 0.006943 | 0.086804 | 0.052355 | 0.042372 | -0.015130 | 0.035990 | 0.009250 | -0.056337 | 0.033731 | 0.168004 | 0.042980 | 0.062598 | -0.052191 | 0.047548 | -0.063965 | -0.029430 | 1.000000 | 0.026383 | 0.007162 | 0.112151 | 0.041107 | 0.011449 |
Pool Area | -0.003434 | 0.093775 | 0.030399 | -0.016787 | 0.002213 | -0.011410 | 0.004893 | 0.084147 | 0.044403 | -0.031963 | 0.072109 | 0.121821 | 0.044602 | 0.035200 | 0.135463 | 0.043730 | 0.066911 | 0.028205 | 0.001515 | 0.036707 | -0.013066 | 0.072103 | 0.098449 | 0.030386 | 0.053055 | 0.094156 | 0.064135 | 0.092596 | -0.006501 | 0.026383 | 1.000000 | 0.011942 | 0.068403 | -0.004493 | -0.012689 |
Misc Val | -0.029254 | 0.069188 | 0.005179 | 0.034056 | -0.011011 | -0.003132 | 0.045276 | 0.092900 | -0.005195 | -0.010125 | 0.083895 | 0.093003 | -0.005078 | -0.005939 | 0.067252 | -0.004817 | 0.036996 | -0.009771 | 0.026648 | 0.000887 | 0.025145 | 0.061134 | 0.008192 | -0.016957 | 0.008526 | 0.056820 | 0.077254 | 0.008773 | -0.000753 | 0.007162 | 0.011942 | 1.000000 | -0.015691 | 0.011367 | 0.011139 |
SalePrice | -0.085092 | 0.266549 | 0.799262 | -0.101697 | 0.558426 | 0.532974 | 0.502196 | 0.433147 | 0.006018 | 0.183308 | 0.632529 | 0.621676 | 0.269373 | -0.037660 | 0.706780 | 0.275823 | -0.035817 | 0.545604 | 0.285056 | 0.143913 | -0.119814 | 0.495474 | 0.474558 | 0.647812 | 0.640138 | 0.327143 | 0.312951 | -0.128787 | 0.032225 | 0.112151 | 0.068403 | -0.015691 | 1.000000 | -0.558907 | -0.240168 |
Years Sold | -0.037302 | -0.024227 | -0.597021 | 0.369570 | -0.999055 | -0.609747 | -0.307065 | -0.278673 | 0.027594 | -0.130699 | -0.407603 | -0.310585 | -0.017607 | 0.143973 | -0.242510 | -0.209849 | 0.029619 | -0.468900 | -0.268792 | 0.054227 | 0.139182 | -0.113074 | -0.170744 | -0.537197 | -0.480541 | -0.228578 | -0.199692 | 0.373774 | -0.014794 | 0.041107 | -0.004493 | 0.011367 | -0.558907 | 1.000000 | 0.728069 |
Year Until Remod | -0.008367 | -0.010451 | -0.256948 | 0.505496 | -0.727220 | 0.097670 | -0.219680 | -0.220328 | -0.019367 | -0.019176 | -0.254174 | -0.180573 | 0.116782 | 0.129186 | -0.029186 | -0.150131 | -0.001676 | -0.193849 | -0.155062 | 0.050642 | 0.049882 | 0.030604 | -0.099070 | -0.306588 | -0.279010 | -0.099055 | -0.039808 | 0.279852 | 0.012585 | 0.011449 | -0.012689 | 0.011139 | -0.240168 | 0.728069 | 1.000000 |
#Selecting the columns whose correlation with SalePrice is at least 0.50
#in absolute value.
corr_price = corr['SalePrice']
#Use `|` for elementwise boolean OR; the original `+` only worked because
#numpy bool addition happens to behave like OR.
corr_price = (corr_price >= 0.50) | (corr_price <= -0.50)
corr = corr[corr_price]
#Restrict the columns to the same selected set, giving a square matrix.
corr = corr.loc[:, corr.index]
corr.style.background_gradient(cmap='viridis')
Overall Qual | Year Built | Year Remod/Add | Mas Vnr Area | Total Bsmt SF | 1st Flr SF | Gr Liv Area | Full Bath | Garage Cars | Garage Area | SalePrice | Years Sold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Overall Qual | 1.000000 | 0.597027 | 0.569609 | 0.423089 | 0.547766 | 0.477837 | 0.570556 | 0.522263 | 0.599392 | 0.563562 | 0.799262 | -0.597021 |
Year Built | 0.597027 | 1.000000 | 0.612095 | 0.306797 | 0.407764 | 0.310463 | 0.241726 | 0.469406 | 0.537033 | 0.480726 | 0.558426 | -0.999055 |
Year Remod/Add | 0.569609 | 0.612095 | 1.000000 | 0.191740 | 0.298393 | 0.242108 | 0.316855 | 0.457266 | 0.425434 | 0.375566 | 0.532974 | -0.609747 |
Mas Vnr Area | 0.423089 | 0.306797 | 0.191740 | 1.000000 | 0.393530 | 0.392280 | 0.400703 | 0.254471 | 0.356646 | 0.370479 | 0.502196 | -0.307065 |
Total Bsmt SF | 0.547766 | 0.407764 | 0.298393 | 0.393530 | 1.000000 | 0.800429 | 0.445108 | 0.325434 | 0.437861 | 0.485608 | 0.632529 | -0.407603 |
1st Flr SF | 0.477837 | 0.310463 | 0.242108 | 0.392280 | 0.800429 | 1.000000 | 0.562166 | 0.371584 | 0.439370 | 0.491208 | 0.621676 | -0.310585 |
Gr Liv Area | 0.570556 | 0.241726 | 0.316855 | 0.400703 | 0.445108 | 0.562166 | 1.000000 | 0.630321 | 0.488854 | 0.483971 | 0.706780 | -0.242510 |
Full Bath | 0.522263 | 0.469406 | 0.457266 | 0.254471 | 0.325434 | 0.371584 | 0.630321 | 1.000000 | 0.478206 | 0.406497 | 0.545604 | -0.468900 |
Garage Cars | 0.599392 | 0.537033 | 0.425434 | 0.356646 | 0.437861 | 0.439370 | 0.488854 | 0.478206 | 1.000000 | 0.888699 | 0.647812 | -0.537197 |
Garage Area | 0.563562 | 0.480726 | 0.375566 | 0.370479 | 0.485608 | 0.491208 | 0.483971 | 0.406497 | 0.888699 | 1.000000 | 0.640138 | -0.480541 |
SalePrice | 0.799262 | 0.558426 | 0.532974 | 0.502196 | 0.632529 | 0.621676 | 0.706780 | 0.545604 | 0.647812 | 0.640138 | 1.000000 | -0.558907 |
Years Sold | -0.597021 | -0.999055 | -0.609747 | -0.307065 | -0.407603 | -0.310585 | -0.242510 | -0.468900 | -0.537197 | -0.480541 | -0.558907 | 1.000000 |
The above numeric features are the chosen ones. However, a few columns still need to be removed because they are too strongly correlated with one another.
Those will be dropped for now.
corr = corr.drop(['Garage Area','Year Remod/Add','Total Bsmt SF'],axis=1)
#Corr has all the right numeric columns now. let's add those to a final value,
# and remove the target column SalePrice
#Final numeric candidates after removing the collinear columns above;
#SalePrice (the target) is deliberately not included here.
final_numeric_cols = ['Overall Qual', 'Year Built', 'Mas Vnr Area', '1st Flr SF',
'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Years Sold']
# Now we can experiment with the different numeric columns.
# Note that a multivariate test is impossible at this point before standardization.
#Univariate screening: RMSE of a one-feature model for each numeric candidate.
univariate_test = {}
for col in final_numeric_cols:
    #train_test() (defined earlier) calls select_features() internally, so we
    #redefine select_features on every iteration to make the pipeline pick the
    #current column. Hacky, but it keeps train_test() itself unchanged.
    def select_features(df):
        #The sole purpose of this function is to select features
        return df[[col,'SalePrice']]
    select_df = select_features(data)
    rmse = train_test(select_df)
    univariate_test[col] = rmse
univariate_test
{'Overall Qual': 49492.23698000454, 'Year Built': 69220.60962857504, 'Mas Vnr Area': 70899.7027183288, '1st Flr SF': 66145.85396434217, 'Gr Liv Area': 57088.25161263909, 'Full Bath': 66967.82905766825, 'Garage Cars': 63016.29294369743, 'Years Sold': 69229.55176985388}
Now it is time to do some research on the categorical values and see which ones are suited best to become dummy columns. To be selected, a column should have a manageable number of unique values and should not be dominated by a single value:
#All candidate categorical (text) columns before visual inspection.
text_cols = ['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities',
'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
'Exterior 1st', 'Exterior 2nd', 'Exter Qual', 'Exter Cond',
'Foundation', 'Heating', 'Heating QC', 'Central Air', 'Kitchen Qual',
'Functional', 'Paved Drive']
Right now I will make a lot of visualizations to estimate the above conditions.
#Let's judge the values one by one via normalized frequency bar charts.
data['MS Zoning'].value_counts(normalize=True).plot.bar()
#Not useful: roughly 80% of the rows share a single value -> drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29efede4408>
data['Street'].value_counts(normalize=True).plot.bar()
#Almost no variation -> drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29efef4c2c8>
data['Lot Shape'].value_counts(normalize=True).plot.bar()
#Informative (regular vs. irregular lot shape) -> keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29efef7c908>
data['Land Contour'].value_counts(normalize=True).plot.bar()
#Could be useful -> keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29efefbab88>
data['Utilities'].value_counts(normalize=True).plot.bar()
#Essentially constant -> easy drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff010ac8>
data['Land Slope'].value_counts(normalize=True).plot.bar()
#Dominated by one value -> drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff04b408>
data['Neighborhood'].value_counts(normalize=True).plot.bar()
#Many categories, but very informative -> keep for now, maybe remove later.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff085048>
data['Condition 1'].value_counts(normalize=True).plot.bar()
#Dominated by one value -> drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff331e48>
data['Condition 2'].value_counts(normalize=True).plot.bar()
#Dominated by one value -> drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff3a9a48>
data['Bldg Type'].value_counts(normalize=True).plot.bar()
#Looked informative at first -> keep.
#Later dropped after inspecting the model results.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff42f5c8>
data['House Style'].value_counts(normalize=True).plot.bar()
#Informative -> keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff4a2388>
data['Roof Style'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff50e088>
data['Roof Matl'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff58dc48>
data['Exterior 1st'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff60a888>
data['Exterior 2nd'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff6be288>
data['Exter Qual'].value_counts(normalize=True).plot.bar()
#Keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff75ae88>
data['Exter Cond'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff7d69c8>
data['Foundation'].value_counts(normalize=True).plot.bar()
#Keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff82fa48>
data['Heating'].value_counts(normalize=True).plot.bar()
#Drop.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff8a8c48>
data['Heating QC'].value_counts(normalize=True).plot.bar()
#Keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff91d808>
data['Central Air'].value_counts(normalize=True).plot.bar()
#Valuable attribute -> keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff96d808>
data['Kitchen Qual'].value_counts(normalize=True).plot.bar()
#Keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29eff9e1348>
data['Functional'].value_counts(normalize=True).plot.bar()
#Keep for now; the column carries a lot of information.
<matplotlib.axes._subplots.AxesSubplot at 0x29effa560c8>
data['Paved Drive'].value_counts(normalize=True).plot.bar()
#Keep.
<matplotlib.axes._subplots.AxesSubplot at 0x29effac1d08>
#After this manual selection I've kept:
#Categorical columns kept after the manual (visual) selection above.
text_cols = ['Lot Shape', 'Land Contour',
'Lot Config', 'Neighborhood', 'Bldg Type', 'House Style',
'Exter Qual','Foundation', 'Heating QC', 'Central Air', 'Kitchen Qual',
'Functional', 'Paved Drive']
Now we can standardize the numeric columns and transform the categorical columns to dummies columns.
#Assemble the final modeling frame from the three pieces built above:
#one-hot encoded categoricals, z-scored numerics, and the target column.
numeric_block = data[final_numeric_cols]
text_cols_dummies = pd.get_dummies(data[text_cols])
normalized_numeric = numeric_block.sub(numeric_block.mean()).div(numeric_block.std())
target = data['SalePrice']
data_final = pd.concat([text_cols_dummies, normalized_numeric, target], axis=1)
Lot Shape_IR1 | Lot Shape_IR2 | Lot Shape_IR3 | Lot Shape_Reg | Land Contour_Bnk | Land Contour_HLS | Land Contour_Low | Land Contour_Lvl | Lot Config_Corner | Lot Config_CulDSac | Lot Config_FR2 | Lot Config_FR3 | Lot Config_Inside | Neighborhood_Blmngtn | Neighborhood_Blueste | Neighborhood_BrDale | Neighborhood_BrkSide | Neighborhood_ClearCr | Neighborhood_CollgCr | Neighborhood_Crawfor | Neighborhood_Edwards | Neighborhood_Gilbert | Neighborhood_Greens | Neighborhood_GrnHill | Neighborhood_IDOTRR | Neighborhood_Landmrk | Neighborhood_MeadowV | Neighborhood_Mitchel | Neighborhood_NAmes | Neighborhood_NPkVill | Neighborhood_NWAmes | Neighborhood_NoRidge | Neighborhood_NridgHt | Neighborhood_OldTown | Neighborhood_SWISU | Neighborhood_Sawyer | Neighborhood_SawyerW | Neighborhood_Somerst | Neighborhood_StoneBr | Neighborhood_Timber | Neighborhood_Veenker | Bldg Type_1Fam | Bldg Type_2fmCon | Bldg Type_Duplex | Bldg Type_Twnhs | Bldg Type_TwnhsE | House Style_1.5Fin | House Style_1.5Unf | House Style_1Story | House Style_2.5Fin | House Style_2.5Unf | House Style_2Story | House Style_SFoyer | House Style_SLvl | Exter Qual_Ex | Exter Qual_Fa | Exter Qual_Gd | Exter Qual_TA | Foundation_BrkTil | Foundation_CBlock | Foundation_PConc | Foundation_Slab | Foundation_Stone | Foundation_Wood | Heating QC_Ex | Heating QC_Fa | Heating QC_Gd | Heating QC_Po | Heating QC_TA | Central Air_N | Central Air_Y | Kitchen Qual_Ex | Kitchen Qual_Fa | Kitchen Qual_Gd | Kitchen Qual_Po | Kitchen Qual_TA | Functional_Maj1 | Functional_Maj2 | Functional_Min1 | Functional_Min2 | Functional_Mod | Functional_Sal | Functional_Sev | Functional_Typ | Paved Drive_N | Paved Drive_P | Paved Drive_Y | Overall Qual | Year Built | Mas Vnr Area | 1st Flr SF | Gr Liv Area | Full Bath | Garage Cars | Years Sold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | -0.067242 | -0.375473 | 0.061036 | 1.266787 | 0.309212 | -1.024618 | 0.306537 | 0.447846 | 215000 |
1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.775946 | -0.342410 | -0.565943 | -0.672528 | -1.194223 | -1.024618 | -1.008476 | 0.414834 | 105000 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.067242 | -0.441599 | 0.038644 | 0.432371 | -0.337661 | -1.024618 | -1.008476 | 0.513872 | 172000 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0.641462 | -0.110970 | -0.565943 | 2.425273 | 1.207317 | 0.783894 | 0.306537 | 0.183744 | 244000 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.775946 | 0.847855 | -0.565943 | -0.590873 | 0.255801 | 0.783894 | 0.306537 | -0.773624 | 189900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.067242 | 0.418037 | -0.565943 | -0.399493 | -0.982555 | -1.024618 | 0.306537 | -0.476510 | 142500 |
2926 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.775946 | 0.384974 | -0.565943 | -0.657218 | -1.182354 | -1.024618 | 0.306537 | -0.443497 | 131000 |
2927 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.775946 | 0.682541 | -0.565943 | -0.483700 | -1.047836 | -1.024618 | -2.323489 | -0.740612 | 132000 |
2928 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | -0.775946 | 0.087408 | -0.565943 | 0.585475 | -0.218968 | -1.024618 | 0.306537 | -0.146383 | 170000 |
2929 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0.641462 | 0.715604 | -0.039729 | -0.417355 | 0.989715 | 0.783894 | 1.621550 | -0.773624 | 188000 |
2930 rows × 96 columns
Now we are able to update our data pipeline.
def transform_features(data):
    """Clean the raw Ames frame and engineer the age-related features.

    Steps: drop columns with >=5% missing values, drop text columns that
    still contain gaps, impute remaining numeric gaps with each column's
    mode, add 'Years Sold' / 'Year Until Remod', and drop leaky/id columns.

    Returns a new DataFrame; the caller's frame is not modified.
    """
    # Keep only columns with fewer than 5% missing values.
    per_missing = data.isnull().sum()/len(data)
    per_missing = per_missing[per_missing<0.05].index
    data = data[per_missing]
    # Drop any remaining text columns that still contain missing values.
    text_data_cols = data.select_dtypes(include= ['object']).isnull().sum()
    text_drop_cols = text_data_cols[text_data_cols>0]
    data = data.drop(text_drop_cols.index, axis=1)
    # Fill the remaining numeric gaps with each column's mode.
    numeric_data = data.select_dtypes(include= ['integer','float'])
    cols = numeric_data.columns
    mode = data.filter(cols).mode()
    data[cols] = data[cols].fillna(mode.iloc[0])
    # (Removed dead statement: `data.isnull().sum().value_counts()` was a
    # no-op expression left over from interactive inspection.)
    # Feature engineering: house age at sale time and years until remodel.
    years_sold = data['Yr Sold'] - data['Year Built']
    year_until_remod = data['Year Remod/Add'] - data['Year Built']
    data['Years Sold'] = years_sold
    data['Year Until Remod'] = year_until_remod
    # Drop columns that leak sale information or are plain row identifiers.
    data = data.drop(['Mo Sold','Yr Sold','Sale Type','Sale Condition','PID','Order'], axis=1)
    return data
def select_features(data):
    """Build the model matrix: dummy-coded categoricals, standardized
    numeric features, and the SalePrice target, concatenated column-wise."""
    numeric_cols = ['Overall Qual', 'Year Built', 'Mas Vnr Area', '1st Flr SF',
                    'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Years Sold']
    categorical_cols = ['Lot Shape', 'Land Contour',
                        'Lot Config', 'Neighborhood', 'Bldg Type', 'House Style',
                        'Exter Qual', 'Foundation', 'Heating QC', 'Central Air',
                        'Kitchen Qual', 'Functional', 'Paved Drive']
    numeric = data[numeric_cols]
    # z-score standardization: zero mean, unit (sample) standard deviation.
    standardized = (numeric - numeric.mean()) / numeric.std()
    dummies = pd.get_dummies(data[categorical_cols])
    return pd.concat([dummies, standardized, data['SalePrice']], axis=1)
def train_test(df):
    """Holdout validation: fit on the first 1460 rows, score on the rest.

    Only numeric columns are used as features; returns the holdout RMSE.
    """
    split = 1460
    train, test = df.iloc[:split], df.iloc[split:]
    feature_cols = (train.select_dtypes(include=['integer', 'float'])
                         .columns.drop('SalePrice'))
    lr = LinearRegression()
    lr.fit(train[feature_cols], train['SalePrice'])
    predictions = lr.predict(test[feature_cols])
    return mean_squared_error(test['SalePrice'], predictions) ** 0.5
#Run the current pipeline end to end: select features, then holdout-validate.
selection = select_features(data)
rmse = train_test(selection)
print('rmse:',rmse)
rmse: 407005942187829.56
Now that is a VERY poor performance. Let's be more strict on the categorical columns and see which ones we should drop.
After some playing around and experimenting a lot, these are the values that have performed the best. It's quite a big decrease!
By the way, I have even removed some of the numerical columns. They might come back from the dead afterwards, because this was only useful to tweak the model far enough at this point. It could be that when we are cross validating we should welcome some columns back.
def select_features(data):
    """Select the trimmed-down feature set that performed best so far and
    return one-hot categoricals + standardized numerics + SalePrice."""
    numeric_cols = ['Overall Qual', 'Mas Vnr Area',
                    'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Years Sold']
    categorical_cols = ['Lot Shape', 'Land Contour',
                        'Lot Config', 'Neighborhood', 'House Style',
                        'Exter Qual', 'Heating QC', 'Central Air',
                        'Kitchen Qual', 'Functional']
    numeric = data[numeric_cols]
    scaled = numeric.sub(numeric.mean()).div(numeric.std())
    pieces = [pd.get_dummies(data[categorical_cols]), scaled, data['SalePrice']]
    return pd.concat(pieces, axis=1)
def train_test(df):
    """Holdout validation on a fixed 1460-row split; returns the RMSE."""
    train = df.iloc[:1460]
    test = df.iloc[1460:]
    # Numeric columns only, minus the target.
    numeric = train.select_dtypes(include=['integer', 'float'])
    features = numeric.columns.drop('SalePrice')
    model = LinearRegression()
    model.fit(train[features], train['SalePrice'])
    mse = mean_squared_error(test['SalePrice'], model.predict(test[features]))
    return mse ** 0.5
#Re-run holdout validation with the stricter feature selection.
selection = select_features(data)
rmse = train_test(selection)
print('rmse:',rmse)
rmse: 35271.21165732622
Up to now we have only been applying holdout validation. Here we will change the last part of the pipeline so that we can apply several types of validation.
def transform_features(data):
    """Clean the raw Ames frame and engineer the age features.

    Drops columns with >=5% missing values, removes text columns that
    still have gaps, imputes numeric gaps with the column mode, adds
    'Years Sold' / 'Year Until Remod', and drops leaky/id columns.
    """
    missing_share = data.isnull().sum() / len(data)
    data = data[missing_share[missing_share < 0.05].index]
    object_nulls = data.select_dtypes(include=['object']).isnull().sum()
    data = data.drop(object_nulls[object_nulls > 0].index, axis=1)
    numeric_cols = data.select_dtypes(include=['integer', 'float']).columns
    fill_values = data.filter(numeric_cols).mode().iloc[0]
    data[numeric_cols] = data[numeric_cols].fillna(fill_values)
    data.isnull().sum().value_counts()
    # Age of the house at sale time, and years between build and remodel.
    data['Years Sold'] = data['Yr Sold'] - data['Year Built']
    data['Year Until Remod'] = data['Year Remod/Add'] - data['Year Built']
    return data.drop(['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition',
                      'PID', 'Order'], axis=1)
def select_features(data):
    """Assemble model inputs: one-hot categoricals, z-scored numerics,
    and the SalePrice target, in that column order."""
    num = ['Overall Qual', 'Year Built', 'Mas Vnr Area', '1st Flr SF',
           'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Years Sold']
    cat = ['Lot Shape', 'Land Contour',
           'Lot Config', 'Neighborhood', 'Bldg Type', 'House Style',
           'Exter Qual', 'Foundation', 'Heating QC', 'Central Air',
           'Kitchen Qual', 'Functional', 'Paved Drive']
    encoded = pd.get_dummies(data[cat])
    z_scored = data[num].sub(data[num].mean()).div(data[num].std())
    return pd.concat([encoded, z_scored, data['SalePrice']], axis=1)
def train_test(df, k=0):
    """Validate a linear model on df with a strategy chosen by k.

    k == 0 : simple holdout — train on the first 1460 rows, test on the
             rest (numeric columns only, as in the earlier versions).
    k == 1 : two-fold holdout — shuffle, split in half, train/test in both
             directions and average the two RMSE values.
    k >= 2 : k-fold cross validation with shuffling; returns the mean RMSE.

    Returns the (average) RMSE as a float.
    """
    model = LinearRegression()
    if k == 0:
        train = df[:1460]
        test = df[1460:]
        numeric_train = train.select_dtypes(include=['integer', 'float'])
        features = numeric_train.columns.drop('SalePrice')
        model.fit(train[features], train['SalePrice'])
        prediction = model.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], prediction)
        return mse ** 0.5
    if k == 1:
        # sample(frac=1) shuffles the whole frame.
        # This is two-fold validation: the model is trained and tested on
        # both halves, and the two RMSE values are averaged.
        shuffled_df = df.sample(frac=1)
        train = shuffled_df[:1460]
        test = shuffled_df[1460:]
        train_no_price = train.drop('SalePrice', axis=1)
        test_no_price = test.drop('SalePrice', axis=1)
        model.fit(train_no_price, train['SalePrice'])
        prediction_one = model.predict(test_no_price)
        rmse_one = mean_squared_error(test['SalePrice'], prediction_one) ** 0.5
        # Swap the halves and validate in the other direction.
        model.fit(test_no_price, test['SalePrice'])
        prediction_two = model.predict(train_no_price)
        rmse_two = mean_squared_error(train['SalePrice'], prediction_two) ** 0.5
        print(rmse_one, rmse_two)
        return (rmse_one + rmse_two) / 2
    kf = KFold(k, shuffle=True, random_state=5)
    rmse_values = []
    for train_index, test_index in kf.split(df):
        # BUG FIX: KFold yields *positional* indices, so select with .iloc,
        # not .loc. With the default RangeIndex both behave the same, but
        # .loc raises (or silently mis-selects) for any other index.
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        train_no_price = train.drop('SalePrice', axis=1)
        test_no_price = test.drop('SalePrice', axis=1)
        model.fit(train_no_price, train['SalePrice'])
        prediction = model.predict(test_no_price)
        rmse_values.append(mean_squared_error(test['SalePrice'], prediction) ** 0.5)
    print(rmse_values)
    return sum(rmse_values) / len(rmse_values)
#Full pipeline: clean the raw data, build features, 10-fold cross validate.
transformation = transform_features(raw_data)
selection = select_features(transformation)
rmse = train_test(selection, 10)
print('rmse:',rmse)
[22091309167529.234, 26325.783488498048, 23962.404978954568, 36133.86801129016, 209124838710047.94, 25351.71847370442, 40736.19352619256, 29633.301767522484, 27069.687985545977, 39692.875356692304] rmse: 23121614812648.3