In this project, we'll build a Linear Regression model and explore ways to improve it. We will work with housing data for the city of Ames, Iowa, United States from 2006 to 2010. Read more about why the data was collected here. Read about the different columns in the data here.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso # Linear Model trained with L1 prior as regularizer (aka the Lasso)
from sklearn.linear_model import Ridge # Linear least squares with l2 regularization.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
%matplotlib inline
# Load the Ames housing data (tab-separated file) and widen the display
# limit so a preview of all 82 columns is visible in the notebook.
ames_housing = pd.read_csv('AmesHousing.tsv', sep = '\t')
pd.options.display.max_columns = 82
ames_housing.head()
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
# Summary statistics for the numerical columns (count/mean/std/quartiles)
ames_housing.describe()
Order | PID | MS SubClass | Lot Frontage | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Yr Blt | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2930.00000 | 2.930000e+03 | 2930.000000 | 2440.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2907.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2928.000000 | 2928.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2771.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 |
mean | 1465.50000 | 7.144645e+08 | 57.387372 | 69.224590 | 10147.921843 | 6.094881 | 5.563140 | 1971.356314 | 1984.266553 | 101.896801 | 442.629566 | 49.722431 | 559.262547 | 1051.614544 | 1159.557679 | 335.455973 | 4.676792 | 1499.690444 | 0.431352 | 0.061134 | 1.566553 | 0.379522 | 2.854266 | 1.044369 | 6.443003 | 0.599317 | 1978.132443 | 1.766815 | 472.819734 | 93.751877 | 47.533447 | 23.011604 | 2.592491 | 16.002048 | 2.243345 | 50.635154 | 6.216041 | 2007.790444 | 180796.060068 |
std | 845.96247 | 1.887308e+08 | 42.638025 | 23.365335 | 7880.017759 | 1.411026 | 1.111537 | 30.245361 | 20.860286 | 179.112611 | 455.590839 | 169.168476 | 439.494153 | 440.615067 | 391.890885 | 428.395715 | 46.310510 | 505.508887 | 0.524820 | 0.245254 | 0.552941 | 0.502629 | 0.827731 | 0.214076 | 1.572964 | 0.647921 | 25.528411 | 0.760566 | 215.046549 | 126.361562 | 67.483400 | 64.139059 | 25.141331 | 56.087370 | 35.597181 | 566.344288 | 2.714492 | 1.316613 | 79886.692357 |
min | 1.00000 | 5.263011e+08 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1895.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 12789.000000 |
25% | 733.25000 | 5.284770e+08 | 20.000000 | 58.000000 | 7440.250000 | 5.000000 | 5.000000 | 1954.000000 | 1965.000000 | 0.000000 | 0.000000 | 0.000000 | 219.000000 | 793.000000 | 876.250000 | 0.000000 | 0.000000 | 1126.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1960.000000 | 1.000000 | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 129500.000000 |
50% | 1465.50000 | 5.354536e+08 | 50.000000 | 68.000000 | 9436.500000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 370.000000 | 0.000000 | 466.000000 | 990.000000 | 1084.000000 | 0.000000 | 0.000000 | 1442.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1979.000000 | 2.000000 | 480.000000 | 0.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 |
75% | 2197.75000 | 9.071811e+08 | 70.000000 | 80.000000 | 11555.250000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 164.000000 | 734.000000 | 0.000000 | 802.000000 | 1302.000000 | 1384.000000 | 703.750000 | 0.000000 | 1742.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 213500.000000 |
max | 2930.00000 | 1.007100e+09 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1526.000000 | 2336.000000 | 6110.000000 | 5095.000000 | 2065.000000 | 1064.000000 | 5642.000000 | 3.000000 | 2.000000 | 4.000000 | 2.000000 | 8.000000 | 3.000000 | 15.000000 | 4.000000 | 2207.000000 | 5.000000 | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 | 755000.000000 |
# Check out numerical columns: dtypes and non-null counts reveal which
# numeric columns contain missing values (e.g. Lot Frontage, Garage Yr Blt)
numerical_cols = ames_housing.select_dtypes(np.number)
numerical_cols.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 Lot Frontage 2440 non-null float64 4 Lot Area 2930 non-null int64 5 Overall Qual 2930 non-null int64 6 Overall Cond 2930 non-null int64 7 Year Built 2930 non-null int64 8 Year Remod/Add 2930 non-null int64 9 Mas Vnr Area 2907 non-null float64 10 BsmtFin SF 1 2929 non-null float64 11 BsmtFin SF 2 2929 non-null float64 12 Bsmt Unf SF 2929 non-null float64 13 Total Bsmt SF 2929 non-null float64 14 1st Flr SF 2930 non-null int64 15 2nd Flr SF 2930 non-null int64 16 Low Qual Fin SF 2930 non-null int64 17 Gr Liv Area 2930 non-null int64 18 Bsmt Full Bath 2928 non-null float64 19 Bsmt Half Bath 2928 non-null float64 20 Full Bath 2930 non-null int64 21 Half Bath 2930 non-null int64 22 Bedroom AbvGr 2930 non-null int64 23 Kitchen AbvGr 2930 non-null int64 24 TotRms AbvGrd 2930 non-null int64 25 Fireplaces 2930 non-null int64 26 Garage Yr Blt 2771 non-null float64 27 Garage Cars 2929 non-null float64 28 Garage Area 2929 non-null float64 29 Wood Deck SF 2930 non-null int64 30 Open Porch SF 2930 non-null int64 31 Enclosed Porch 2930 non-null int64 32 3Ssn Porch 2930 non-null int64 33 Screen Porch 2930 non-null int64 34 Pool Area 2930 non-null int64 35 Misc Val 2930 non-null int64 36 Mo Sold 2930 non-null int64 37 Yr Sold 2930 non-null int64 38 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28) memory usage: 892.9 KB
# Check out non-numerical (object) columns: non-null counts reveal which
# categorical columns are mostly missing (e.g. Alley, Pool QC, Misc Feature)
non_numerical_cols = ames_housing.select_dtypes(exclude = np.number)
non_numerical_cols.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MS Zoning 2930 non-null object 1 Street 2930 non-null object 2 Alley 198 non-null object 3 Lot Shape 2930 non-null object 4 Land Contour 2930 non-null object 5 Utilities 2930 non-null object 6 Lot Config 2930 non-null object 7 Land Slope 2930 non-null object 8 Neighborhood 2930 non-null object 9 Condition 1 2930 non-null object 10 Condition 2 2930 non-null object 11 Bldg Type 2930 non-null object 12 House Style 2930 non-null object 13 Roof Style 2930 non-null object 14 Roof Matl 2930 non-null object 15 Exterior 1st 2930 non-null object 16 Exterior 2nd 2930 non-null object 17 Mas Vnr Type 2907 non-null object 18 Exter Qual 2930 non-null object 19 Exter Cond 2930 non-null object 20 Foundation 2930 non-null object 21 Bsmt Qual 2850 non-null object 22 Bsmt Cond 2850 non-null object 23 Bsmt Exposure 2847 non-null object 24 BsmtFin Type 1 2850 non-null object 25 BsmtFin Type 2 2849 non-null object 26 Heating 2930 non-null object 27 Heating QC 2930 non-null object 28 Central Air 2930 non-null object 29 Electrical 2929 non-null object 30 Kitchen Qual 2930 non-null object 31 Functional 2930 non-null object 32 Fireplace Qu 1508 non-null object 33 Garage Type 2773 non-null object 34 Garage Finish 2771 non-null object 35 Garage Qual 2771 non-null object 36 Garage Cond 2771 non-null object 37 Paved Drive 2930 non-null object 38 Pool QC 13 non-null object 39 Fence 572 non-null object 40 Misc Feature 106 non-null object 41 Sale Type 2930 non-null object 42 Sale Condition 2930 non-null object dtypes: object(43) memory usage: 984.4+ KB
# Inspect the distribution of values in every non-numerical column,
# printing a separator line between columns for readability.
for column in non_numerical_cols.columns:
    counts = non_numerical_cols[column].value_counts()
    print(counts, '\n', '*' * 10)
RL 2273 RM 462 FV 139 RH 27 C (all) 25 I (all) 2 A (agr) 2 Name: MS Zoning, dtype: int64 ********** Pave 2918 Grvl 12 Name: Street, dtype: int64 ********** Grvl 120 Pave 78 Name: Alley, dtype: int64 ********** Reg 1859 IR1 979 IR2 76 IR3 16 Name: Lot Shape, dtype: int64 ********** Lvl 2633 HLS 120 Bnk 117 Low 60 Name: Land Contour, dtype: int64 ********** AllPub 2927 NoSewr 2 NoSeWa 1 Name: Utilities, dtype: int64 ********** Inside 2140 Corner 511 CulDSac 180 FR2 85 FR3 14 Name: Lot Config, dtype: int64 ********** Gtl 2789 Mod 125 Sev 16 Name: Land Slope, dtype: int64 ********** NAmes 443 CollgCr 267 OldTown 239 Edwards 194 Somerst 182 NridgHt 166 Gilbert 165 Sawyer 151 NWAmes 131 SawyerW 125 Mitchel 114 BrkSide 108 Crawfor 103 IDOTRR 93 Timber 72 NoRidge 71 StoneBr 51 SWISU 48 ClearCr 44 MeadowV 37 BrDale 30 Blmngtn 28 Veenker 24 NPkVill 23 Blueste 10 Greens 8 GrnHill 2 Landmrk 1 Name: Neighborhood, dtype: int64 ********** Norm 2522 Feedr 164 Artery 92 RRAn 50 PosN 39 RRAe 28 PosA 20 RRNn 9 RRNe 6 Name: Condition 1, dtype: int64 ********** Norm 2900 Feedr 13 Artery 5 PosN 4 PosA 4 RRNn 2 RRAe 1 RRAn 1 Name: Condition 2, dtype: int64 ********** 1Fam 2425 TwnhsE 233 Duplex 109 Twnhs 101 2fmCon 62 Name: Bldg Type, dtype: int64 ********** 1Story 1481 2Story 873 1.5Fin 314 SLvl 128 SFoyer 83 2.5Unf 24 1.5Unf 19 2.5Fin 8 Name: House Style, dtype: int64 ********** Gable 2321 Hip 551 Gambrel 22 Flat 20 Mansard 11 Shed 5 Name: Roof Style, dtype: int64 ********** CompShg 2887 Tar&Grv 23 WdShake 9 WdShngl 7 Metal 1 Membran 1 Roll 1 ClyTile 1 Name: Roof Matl, dtype: int64 ********** VinylSd 1026 MetalSd 450 HdBoard 442 Wd Sdng 420 Plywood 221 CemntBd 126 BrkFace 88 WdShing 56 AsbShng 44 Stucco 43 BrkComm 6 AsphShn 2 Stone 2 CBlock 2 ImStucc 1 PreCast 1 Name: Exterior 1st, dtype: int64 ********** VinylSd 1015 MetalSd 447 HdBoard 406 Wd Sdng 397 Plywood 274 CmentBd 126 Wd Shng 81 Stucco 47 BrkFace 47 AsbShng 38 Brk Cmn 22 ImStucc 15 Stone 6 AsphShn 4 CBlock 3 PreCast 1 Other 1 
Name: Exterior 2nd, dtype: int64 ********** None 1752 BrkFace 880 Stone 249 BrkCmn 25 CBlock 1 Name: Mas Vnr Type, dtype: int64 ********** TA 1799 Gd 989 Ex 107 Fa 35 Name: Exter Qual, dtype: int64 ********** TA 2549 Gd 299 Fa 67 Ex 12 Po 3 Name: Exter Cond, dtype: int64 ********** PConc 1310 CBlock 1244 BrkTil 311 Slab 49 Stone 11 Wood 5 Name: Foundation, dtype: int64 ********** TA 1283 Gd 1219 Ex 258 Fa 88 Po 2 Name: Bsmt Qual, dtype: int64 ********** TA 2616 Gd 122 Fa 104 Po 5 Ex 3 Name: Bsmt Cond, dtype: int64 ********** No 1906 Av 418 Gd 284 Mn 239 Name: Bsmt Exposure, dtype: int64 ********** GLQ 859 Unf 851 ALQ 429 Rec 288 BLQ 269 LwQ 154 Name: BsmtFin Type 1, dtype: int64 ********** Unf 2499 Rec 106 LwQ 89 BLQ 68 ALQ 53 GLQ 34 Name: BsmtFin Type 2, dtype: int64 ********** GasA 2885 GasW 27 Grav 9 Wall 6 OthW 2 Floor 1 Name: Heating, dtype: int64 ********** Ex 1495 TA 864 Gd 476 Fa 92 Po 3 Name: Heating QC, dtype: int64 ********** Y 2734 N 196 Name: Central Air, dtype: int64 ********** SBrkr 2682 FuseA 188 FuseF 50 FuseP 8 Mix 1 Name: Electrical, dtype: int64 ********** TA 1494 Gd 1160 Ex 205 Fa 70 Po 1 Name: Kitchen Qual, dtype: int64 ********** Typ 2728 Min2 70 Min1 65 Mod 35 Maj1 19 Maj2 9 Sal 2 Sev 2 Name: Functional, dtype: int64 ********** Gd 744 TA 600 Fa 75 Po 46 Ex 43 Name: Fireplace Qu, dtype: int64 ********** Attchd 1731 Detchd 782 BuiltIn 186 Basment 36 2Types 23 CarPort 15 Name: Garage Type, dtype: int64 ********** Unf 1231 RFn 812 Fin 728 Name: Garage Finish, dtype: int64 ********** TA 2615 Fa 124 Gd 24 Po 5 Ex 3 Name: Garage Qual, dtype: int64 ********** TA 2665 Fa 74 Gd 15 Po 14 Ex 3 Name: Garage Cond, dtype: int64 ********** Y 2652 N 216 P 62 Name: Paved Drive, dtype: int64 ********** Gd 4 Ex 4 TA 3 Fa 2 Name: Pool QC, dtype: int64 ********** MnPrv 330 GdPrv 118 GdWo 112 MnWw 12 Name: Fence, dtype: int64 ********** Shed 95 Gar2 5 Othr 4 TenC 1 Elev 1 Name: Misc Feature, dtype: int64 ********** WD 2536 New 239 COD 87 ConLD 26 CWD 12 ConLI 9 
ConLw 8 Oth 7 Con 5 VWD 1 Name: Sale Type, dtype: int64 ********** Normal 2413 Partial 245 Abnorml 190 Family 46 Alloca 24 AdjLand 12 Name: Sale Condition, dtype: int64 **********
# Visualize our target column SalePrice.
# NOTE: `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# `histplot` with stat='density' and kde=True is the closest replacement
# (density-normalized histogram plus a KDE curve, matching distplot's
# default output).
sns.histplot(ames_housing.SalePrice, stat = 'density', kde = True)
plt.xlim(12800, 755000) # xlim based on minimum & maximum SalePrice
(12800, 755000)
# Let's pick a random numerical column and plot a scatter plot with SalePrice
# (Gr Liv Area = above-grade living area, a plausibly price-related feature)
fig = plt.figure(figsize = (15,5))
plt.scatter(ames_housing['Gr Liv Area'], ames_housing.SalePrice)
<matplotlib.collections.PathCollection at 0x7f9186a94450>
After a quick look at the dataset, we found that:

- `Order` and `PID` are identifier columns that are not relevant to our target column `SalePrice`.
- Several columns (e.g. `Pool QC`, `Alley`, `Misc Feature`, `Fence`) have a large amount of missing data. Considering the large amount of missing data, we should just drop those columns; we will arbitrarily set the cut-off at 25% missing values.

Let's first drop the columns that are not relevant to our target column.
# Drop the two identifier columns that carry no predictive signal.
features = ames_housing.drop(columns=['Order', 'PID'])
# Function to clean & transform features
def transform_features(df):
    """Clean a raw features DataFrame for modelling.

    Steps:
      1. Drop columns with more than 25% missing values.
      2. Fill missing values in non-numerical columns with the string
         'missing' and convert those columns to 'category' dtype.
      3. Mean-impute numerical columns that have more than 2 missing values.
      4. Drop any rows that still contain missing values (numerical columns
         with 1-2 nulls are handled here by row removal rather than imputing).

    Returns a new DataFrame with numerical columns first, then categorical
    ones; *df* itself is not modified. Prints the row count before the final
    dropna (kept from the original as a sanity check).
    """
    data = df.copy()
    # Drop columns with more than 25% null values
    null_check = data.isnull().sum()
    drop_cols = null_check[null_check > len(data) * 0.25].index
    data.drop(drop_cols, axis=1, inplace=True)
    # Fill in missing values in non-numerical columns with 'missing'.
    # Take an explicit copy so the assignments below can't warn about
    # writing to a slice of `data`.
    non_numerical_cols = data.select_dtypes(exclude=np.number).copy()
    non_numerical_cols = non_numerical_cols.fillna('missing')
    # Convert all non-numerical columns to categorical
    for col in non_numerical_cols.columns:
        non_numerical_cols[col] = non_numerical_cols[col].astype('category')
    # Impute (with the column mean) numerical columns that have more than
    # 2 missing values
    numerical_cols = data.select_dtypes(np.number).copy()
    numerical_null_check = numerical_cols.isnull().sum()
    impute_cols = numerical_null_check[numerical_null_check > 2].index
    numerical_cols[impute_cols] = numerical_cols[impute_cols].fillna(
        numerical_cols[impute_cols].mean())
    # Concatenate numerical and non-numerical columns
    data = pd.concat([numerical_cols, non_numerical_cols], axis=1)
    print(len(data))  # debug: row count before dropping remaining nulls
    # Remove remaining rows with missing values
    non_null_data = data.dropna()
    return non_null_data
# Clean the features and reset the index so row labels are contiguous again
# after the rows removed by dropna inside transform_features
non_null_data = transform_features(features).reset_index()
non_null_data.info()
2930 <class 'pandas.core.frame.DataFrame'> RangeIndex: 2927 entries, 0 to 2926 Data columns (total 76 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 2927 non-null int64 1 MS SubClass 2927 non-null int64 2 Lot Frontage 2927 non-null float64 3 Lot Area 2927 non-null int64 4 Overall Qual 2927 non-null int64 5 Overall Cond 2927 non-null int64 6 Year Built 2927 non-null int64 7 Year Remod/Add 2927 non-null int64 8 Mas Vnr Area 2927 non-null float64 9 BsmtFin SF 1 2927 non-null float64 10 BsmtFin SF 2 2927 non-null float64 11 Bsmt Unf SF 2927 non-null float64 12 Total Bsmt SF 2927 non-null float64 13 1st Flr SF 2927 non-null int64 14 2nd Flr SF 2927 non-null int64 15 Low Qual Fin SF 2927 non-null int64 16 Gr Liv Area 2927 non-null int64 17 Bsmt Full Bath 2927 non-null float64 18 Bsmt Half Bath 2927 non-null float64 19 Full Bath 2927 non-null int64 20 Half Bath 2927 non-null int64 21 Bedroom AbvGr 2927 non-null int64 22 Kitchen AbvGr 2927 non-null int64 23 TotRms AbvGrd 2927 non-null int64 24 Fireplaces 2927 non-null int64 25 Garage Yr Blt 2927 non-null float64 26 Garage Cars 2927 non-null float64 27 Garage Area 2927 non-null float64 28 Wood Deck SF 2927 non-null int64 29 Open Porch SF 2927 non-null int64 30 Enclosed Porch 2927 non-null int64 31 3Ssn Porch 2927 non-null int64 32 Screen Porch 2927 non-null int64 33 Pool Area 2927 non-null int64 34 Misc Val 2927 non-null int64 35 Mo Sold 2927 non-null int64 36 Yr Sold 2927 non-null int64 37 SalePrice 2927 non-null int64 38 MS Zoning 2927 non-null category 39 Street 2927 non-null category 40 Lot Shape 2927 non-null category 41 Land Contour 2927 non-null category 42 Utilities 2927 non-null category 43 Lot Config 2927 non-null category 44 Land Slope 2927 non-null category 45 Neighborhood 2927 non-null category 46 Condition 1 2927 non-null category 47 Condition 2 2927 non-null category 48 Bldg Type 2927 non-null category 49 House Style 2927 non-null category 50 Roof Style 2927 non-null category 
51 Roof Matl 2927 non-null category 52 Exterior 1st 2927 non-null category 53 Exterior 2nd 2927 non-null category 54 Mas Vnr Type 2927 non-null category 55 Exter Qual 2927 non-null category 56 Exter Cond 2927 non-null category 57 Foundation 2927 non-null category 58 Bsmt Qual 2927 non-null category 59 Bsmt Cond 2927 non-null category 60 Bsmt Exposure 2927 non-null category 61 BsmtFin Type 1 2927 non-null category 62 BsmtFin Type 2 2927 non-null category 63 Heating 2927 non-null category 64 Heating QC 2927 non-null category 65 Central Air 2927 non-null category 66 Electrical 2927 non-null category 67 Kitchen Qual 2927 non-null category 68 Functional 2927 non-null category 69 Garage Type 2927 non-null category 70 Garage Finish 2927 non-null category 71 Garage Qual 2927 non-null category 72 Garage Cond 2927 non-null category 73 Paved Drive 2927 non-null category 74 Sale Type 2927 non-null category 75 Sale Condition 2927 non-null category dtypes: category(38), float64(11), int64(27) memory usage: 988.9 KB
# Drop the leftover 'index' column created by reset_index above
non_null_data.drop('index', axis = 1, inplace = True)
# Transform column names to snake_case. Use a raw string and regex=True:
# `str.replace` defaults to regex=False in pandas >= 2.0, so a bare '\s'
# would be treated as a literal and silently match nothing.
non_null_data.columns = non_null_data.columns.str.lower().str.replace(r'\s', '_', regex=True)
non_null_data.select_dtypes(np.number).describe()
ms_subclass | lot_frontage | lot_area | overall_qual | overall_cond | year_built | year_remod/add | mas_vnr_area | bsmtfin_sf_1 | bsmtfin_sf_2 | bsmt_unf_sf | total_bsmt_sf | 1st_flr_sf | 2nd_flr_sf | low_qual_fin_sf | gr_liv_area | bsmt_full_bath | bsmt_half_bath | full_bath | half_bath | bedroom_abvgr | kitchen_abvgr | totrms_abvgrd | fireplaces | garage_yr_blt | garage_cars | garage_area | wood_deck_sf | open_porch_sf | enclosed_porch | 3ssn_porch | screen_porch | pool_area | misc_val | mo_sold | yr_sold | saleprice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 | 2927.000000 |
mean | 57.408609 | 69.202613 | 10137.138367 | 6.096344 | 5.562009 | 1971.385719 | 1984.269218 | 102.001239 | 442.744790 | 49.756406 | 559.538435 | 1052.039631 | 1158.813119 | 335.497096 | 4.681585 | 1498.991800 | 0.431500 | 0.061155 | 1.566109 | 0.379570 | 2.853775 | 1.044414 | 6.442433 | 0.599248 | 1978.149958 | 1.766997 | 472.833960 | 93.788521 | 47.455073 | 22.962761 | 2.595149 | 16.018449 | 2.245644 | 50.687052 | 6.217629 | 2007.790571 | 180805.550735 |
std | 42.648035 | 21.299187 | 7854.136212 | 1.410927 | 1.111442 | 30.243145 | 20.858455 | 178.469567 | 455.668872 | 169.221285 | 439.498694 | 440.322069 | 388.942762 | 428.404651 | 46.334003 | 503.785033 | 0.524849 | 0.245293 | 0.552431 | 0.502658 | 0.827049 | 0.214181 | 1.570844 | 0.647546 | 24.828936 | 0.760682 | 215.072336 | 126.393850 | 67.239479 | 64.073909 | 25.154079 | 56.113774 | 35.615352 | 566.632226 | 2.714885 | 1.317195 | 79880.475356 |
min | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1895.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 12789.000000 |
25% | 20.000000 | 60.000000 | 7440.500000 | 5.000000 | 5.000000 | 1954.000000 | 1965.000000 | 0.000000 | 0.000000 | 0.000000 | 219.000000 | 793.000000 | 876.000000 | 0.000000 | 0.000000 | 1126.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1962.000000 | 1.000000 | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 129500.000000 |
50% | 50.000000 | 69.224590 | 9439.000000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 370.000000 | 0.000000 | 466.000000 | 990.000000 | 1084.000000 | 0.000000 | 0.000000 | 1442.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1978.132443 | 2.000000 | 480.000000 | 0.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 |
75% | 70.000000 | 78.000000 | 11554.500000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 163.000000 | 734.500000 | 0.000000 | 802.500000 | 1302.000000 | 1384.000000 | 703.500000 | 0.000000 | 1742.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2001.000000 | 2.000000 | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 213500.000000 |
max | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1526.000000 | 2336.000000 | 6110.000000 | 5095.000000 | 2065.000000 | 1064.000000 | 5642.000000 | 3.000000 | 2.000000 | 4.000000 | 2.000000 | 8.000000 | 3.000000 | 15.000000 | 4.000000 | 2207.000000 | 5.000000 | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 | 755000.000000 |
# Generate a heat map based on column correlations (rounded to 2 decimals
# for readable annotations).
# NOTE(review): corr() silently skips non-numeric columns in older pandas;
# pandas >= 2.0 requires numeric_only=True here — confirm the pandas version.
feature_corr = round(non_null_data.corr() ,2)
# Create a mask for self-referential (== 1) correlations so the diagonal
# is hidden; note this also hides any off-diagonal pair that rounds to 1
mask = feature_corr == 1
# Create labels for annotation; only show values of strong correlations
# (|r| >= 0.6), blanking out the weak ones
labels = feature_corr.copy()
labels[(abs(labels)<0.6)] = ''
# Plot heatmap with rotated x labels for readability
fig = plt.figure(figsize = (20,15))
heat_map = sns.heatmap(feature_corr, annot = labels, mask = mask, fmt = '', annot_kws={'fontsize':12}, cmap = 'summer_r')
heat_map.set_xticklabels(heat_map.get_xticklabels(), rotation = 45, horizontalalignment='right')
[Text(0.5, 0, 'ms_subclass'), Text(1.5, 0, 'lot_frontage'), Text(2.5, 0, 'lot_area'), Text(3.5, 0, 'overall_qual'), Text(4.5, 0, 'overall_cond'), Text(5.5, 0, 'year_built'), Text(6.5, 0, 'year_remod/add'), Text(7.5, 0, 'mas_vnr_area'), Text(8.5, 0, 'bsmtfin_sf_1'), Text(9.5, 0, 'bsmtfin_sf_2'), Text(10.5, 0, 'bsmt_unf_sf'), Text(11.5, 0, 'total_bsmt_sf'), Text(12.5, 0, '1st_flr_sf'), Text(13.5, 0, '2nd_flr_sf'), Text(14.5, 0, 'low_qual_fin_sf'), Text(15.5, 0, 'gr_liv_area'), Text(16.5, 0, 'bsmt_full_bath'), Text(17.5, 0, 'bsmt_half_bath'), Text(18.5, 0, 'full_bath'), Text(19.5, 0, 'half_bath'), Text(20.5, 0, 'bedroom_abvgr'), Text(21.5, 0, 'kitchen_abvgr'), Text(22.5, 0, 'totrms_abvgrd'), Text(23.5, 0, 'fireplaces'), Text(24.5, 0, 'garage_yr_blt'), Text(25.5, 0, 'garage_cars'), Text(26.5, 0, 'garage_area'), Text(27.5, 0, 'wood_deck_sf'), Text(28.5, 0, 'open_porch_sf'), Text(29.5, 0, 'enclosed_porch'), Text(30.5, 0, '3ssn_porch'), Text(31.5, 0, 'screen_porch'), Text(32.5, 0, 'pool_area'), Text(33.5, 0, 'misc_val'), Text(34.5, 0, 'mo_sold'), Text(35.5, 0, 'yr_sold'), Text(36.5, 0, 'saleprice')]
From the heatmap, we can tell that:

- `overall_qual`, `total_bsmt_sf`, `1st_flr_sf`, `gr_liv_area`, `garage_cars` and `garage_area` have strong correlations with our target column `saleprice`.
- `overall_qual` has a strong correlation with `year_built` and, surprisingly, with `garage_cars`.

Let's create some scatter plots of the more strongly correlated features to see if the relationships look linear.
# Scatter: saleprice against overall_qual to eyeball linearity
plt.scatter(non_null_data.overall_qual, non_null_data.saleprice)
<matplotlib.collections.PathCollection at 0x7f9186f96b90>
# Scatter: saleprice against gr_liv_area to eyeball linearity
plt.scatter(non_null_data.gr_liv_area, non_null_data.saleprice)
<matplotlib.collections.PathCollection at 0x7f9186d23290>
# Select features using PCA
def select_features_pca(df, target, feature_num):
    """Reduce the feature matrix of *df* to *feature_num* PCA components.

    Numerical columns (minus *target*) are concatenated with one-hot
    encoded (get_dummies) categorical columns, and the combined matrix is
    projected onto the top *feature_num* principal components.

    Returns a (n_rows, feature_num) numpy array.

    NOTE(review): the data is not scaled before PCA, so high-variance
    columns (e.g. lot_area) will dominate the components — confirm this
    is intended. The original's commented-out SelectKBest experiment
    (which referenced the global non_null_data) has been removed.
    """
    numerical_features = df.select_dtypes(np.number).drop(target, axis=1)
    categorical_features = df.select_dtypes('category')
    # One-hot encode categorical features
    categorical_features = pd.get_dummies(categorical_features)
    # Concatenate numerical & categorical features
    all_features = pd.concat([numerical_features, categorical_features], axis=1)
    # Fit PCA and return the projected features
    pca = PCA(n_components=feature_num, random_state=1)
    features = pca.fit_transform(all_features)
    return features
# Test select_features_pca function
selected_features = select_features_pca(non_null_data, 'saleprice', 10)
selected_features.shape
(2927, 10)
Because it's not recommended to perform PCA on categorical features, and because we have both numerical and categorical features, we will try to do feature selection separately next.
# Write our own function to pick features, handling numerical and
# categorical columns separately
def select_features_by_type(df, target, numerical_num):
    """Select features from *df* for predicting *target*.

    Picks the *numerical_num* best numerical columns by univariate
    F-regression score, one-hot encodes all categorical columns, and
    returns the two parts concatenated as a numpy array of shape
    (n_rows, numerical_num + n_encoded_categories).
    """
    # Separate numerical features and categorical features
    numerical_features = df.drop(target, axis=1).select_dtypes(np.number)
    categorical_features = df.select_dtypes('category')
    # Select k best numerical features. Pass k by keyword: positional k
    # was deprecated and then removed in scikit-learn.
    kbest = SelectKBest(score_func=f_regression, k=numerical_num)
    selected_num = kbest.fit_transform(numerical_features, df[target])
    # One-hot encode categorical columns (dense array for concatenation)
    enc = OneHotEncoder()
    trans_cat = enc.fit_transform(categorical_features).toarray()
    # Concatenate numerical & categorical features
    features = np.concatenate((selected_num, trans_cat), axis = 1)
    return features
# Test select_features_by_type
select_features = select_features_by_type(non_null_data, 'saleprice', 5)
select_features.shape
(2927, 264)
In this section, we will use f_regression to pick our numerical features to test on, using Linear Regression model without regularization.
# Function to train & test
def train_and_test(df, target, feature_num, training_size = None, feature_selection = None, test = False):
    """Train a LinearRegression model on selected features and report error.

    Parameters
    ----------
    df : DataFrame containing the target column and candidate features.
    target : name of the target column.
    feature_num : number of features/components to select.
    training_size : if given, fit on only the first *training_size* rows of
        the train/validation splits and return (train MSE, val MSE) — used
        for plotting a learning curve.
    feature_selection : 'pca' or 'type' — which selection helper to use.
    test : if True, evaluate on the held-out test split and return its RMSE;
        otherwise evaluate on the validation split and return
        (train RMSE, validation RMSE).
    """
    if feature_selection == 'pca':
        X = select_features_pca(df, target, feature_num)
    elif feature_selection == 'type':
        X = select_features_by_type(df, target, feature_num)
    else:
        # The original fell through with X undefined (NameError); fail
        # fast with a clear message instead.
        raise ValueError("feature_selection must be 'pca' or 'type'")
    y = df[target]
    # First split data into train (80%) / test (20%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    # Split train data into train / validation (25% of train = 20% overall)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
    # NOTE: `normalize=True` was removed from LinearRegression in
    # scikit-learn 1.2; for plain (unregularized) OLS it only rescaled the
    # coefficients internally, so predictions — and hence the reported
    # MSE/RMSE — are identical without it.
    lr = LinearRegression()
    # Train model with small sample size for plotting the learning curve
    if training_size:
        X_train_sm = X_train[:training_size]
        y_train_sm = y_train[:training_size]
        X_val_sm = X_val[:training_size]
        y_val_sm = y_val[:training_size]
        lr.fit(X_train_sm, y_train_sm)
        pred_train_sm = lr.predict(X_train_sm)
        mse_train_sm = mean_squared_error(y_train_sm, pred_train_sm)
        pred_val_sm = lr.predict(X_val_sm)
        mse_val_sm = mean_squared_error(y_val_sm, pred_val_sm)
        return mse_train_sm, mse_val_sm
    if test:
        # Final evaluation on the held-out test split
        print('Training model with {} features'.format(feature_num))
        lr.fit(X_train, y_train)
        pred_train = lr.predict(X_train)
        rmse_train = mean_squared_error(y_train, pred_train, squared = False)
        pred_test = lr.predict(X_test)
        rmse_test = mean_squared_error(y_test, pred_test, squared = False)
        print('Model has a training rmse of {}, and test rmse of {}.'.format(rmse_train, rmse_test))
        print('*'*20)
        return rmse_test
    # Default path: train on the full training split, evaluate on validation
    print('Training model with {} features'.format(feature_num))
    lr.fit(X_train, y_train)
    pred_train = lr.predict(X_train)
    rmse_train = mean_squared_error(y_train, pred_train, squared = False)
    pred_val = lr.predict(X_val)
    rmse_val = mean_squared_error(y_val, pred_val, squared = False)
    print('Model has a training rmse of {}, and cross_validation rmse of {}.'.format(rmse_train, rmse_val))
    print('*'*20)
    return rmse_train, rmse_val
# Sweep the number of PCA components and record train/validation RMSE
rmses_train = {}
rmses_val = {}
for n_components in range(30, 100):
    print(f'Ver.{n_components - 9}:')
    rmse_train, rmse_val = train_and_test(
        df=non_null_data,
        target='saleprice',
        feature_num=n_components,
        feature_selection='pca',
    )
    label = f'Model with {n_components} features'
    rmses_train[label] = rmse_train
    rmses_val[label] = rmse_val
Ver.21: Training model with 30 features Model has a training rmse of 29264.315862340267, and cross_validation rmse of 26513.91145885102. ******************** Ver.22: Training model with 31 features Model has a training rmse of 28935.467596038965, and cross_validation rmse of 26561.086283456156. ******************** Ver.23: Training model with 32 features Model has a training rmse of 28929.232740642794, and cross_validation rmse of 26549.236235926983. ******************** Ver.24: Training model with 33 features Model has a training rmse of 28914.317996221052, and cross_validation rmse of 26569.847704436517. ******************** Ver.25: Training model with 34 features Model has a training rmse of 28796.53074277462, and cross_validation rmse of 26487.75202310115. ******************** Ver.26: Training model with 35 features Model has a training rmse of 28873.310962599364, and cross_validation rmse of 26568.017002484856. ******************** Ver.27: Training model with 36 features Model has a training rmse of 28744.721460062654, and cross_validation rmse of 26535.545912208763. ******************** Ver.28: Training model with 37 features Model has a training rmse of 28721.68399408437, and cross_validation rmse of 26560.326506727906. ******************** Ver.29: Training model with 38 features Model has a training rmse of 28633.18292884306, and cross_validation rmse of 26518.688387803075. ******************** Ver.30: Training model with 39 features Model has a training rmse of 28644.57419369505, and cross_validation rmse of 26565.638641819474. ******************** Ver.31: Training model with 40 features Model has a training rmse of 28534.402459168974, and cross_validation rmse of 26491.542439198172. ******************** Ver.32: Training model with 41 features Model has a training rmse of 28570.492747204713, and cross_validation rmse of 26463.640355183998. 
******************** Ver.33: Training model with 42 features Model has a training rmse of 28578.261321702907, and cross_validation rmse of 26521.082546557693. ******************** Ver.34: Training model with 43 features Model has a training rmse of 28263.917473512596, and cross_validation rmse of 26112.865375806185. ******************** Ver.35: Training model with 44 features Model has a training rmse of 28291.744434074197, and cross_validation rmse of 26151.30957224682. ******************** Ver.36: Training model with 45 features Model has a training rmse of 28288.428890755848, and cross_validation rmse of 26098.0947398373. ******************** Ver.37: Training model with 46 features Model has a training rmse of 28217.955405727145, and cross_validation rmse of 26065.744055177216. ******************** Ver.38: Training model with 47 features Model has a training rmse of 28210.076713737042, and cross_validation rmse of 26133.648112001745. ******************** Ver.39: Training model with 48 features Model has a training rmse of 28133.018421489447, and cross_validation rmse of 25953.59890348835. ******************** Ver.40: Training model with 49 features Model has a training rmse of 28109.483685712818, and cross_validation rmse of 25962.424230511806. ******************** Ver.41: Training model with 50 features Model has a training rmse of 28112.78918154681, and cross_validation rmse of 25970.62217096045. ******************** Ver.42: Training model with 51 features Model has a training rmse of 28017.022337748997, and cross_validation rmse of 26091.742093124325. ******************** Ver.43: Training model with 52 features Model has a training rmse of 28021.484787282556, and cross_validation rmse of 26047.119752396993. ******************** Ver.44: Training model with 53 features Model has a training rmse of 27923.481282269528, and cross_validation rmse of 26006.44671979776. 
******************** Ver.45: Training model with 54 features Model has a training rmse of 27908.276528270617, and cross_validation rmse of 26114.002693048624. ******************** Ver.46: Training model with 55 features Model has a training rmse of 27753.69897906455, and cross_validation rmse of 25898.94765358111. ******************** Ver.47: Training model with 56 features Model has a training rmse of 27804.93454895275, and cross_validation rmse of 26004.00674977477. ******************** Ver.48: Training model with 57 features Model has a training rmse of 27724.935941030926, and cross_validation rmse of 26054.65594368001. ******************** Ver.49: Training model with 58 features Model has a training rmse of 27630.39877827149, and cross_validation rmse of 25937.91095561885. ******************** Ver.50: Training model with 59 features Model has a training rmse of 27634.520194213168, and cross_validation rmse of 25969.030763729894. ******************** Ver.51: Training model with 60 features Model has a training rmse of 27591.746980106596, and cross_validation rmse of 25728.44447556765. ******************** Ver.52: Training model with 61 features Model has a training rmse of 27576.096656519312, and cross_validation rmse of 25834.05384867486. ******************** Ver.53: Training model with 62 features Model has a training rmse of 27544.47110574157, and cross_validation rmse of 25996.085120732463. ******************** Ver.54: Training model with 63 features Model has a training rmse of 27456.733449500043, and cross_validation rmse of 26010.535328456328. ******************** Ver.55: Training model with 64 features Model has a training rmse of 27509.648963463795, and cross_validation rmse of 25947.585293927303. ******************** Ver.56: Training model with 65 features Model has a training rmse of 27237.739108334525, and cross_validation rmse of 26007.050769184287. 
******************** Ver.57: Training model with 66 features Model has a training rmse of 27196.191943907346, and cross_validation rmse of 25771.081739609046. ******************** Ver.58: Training model with 67 features Model has a training rmse of 27198.673881587318, and cross_validation rmse of 26014.07599303737. ******************** Ver.59: Training model with 68 features Model has a training rmse of 27138.164894291782, and cross_validation rmse of 25918.044458420984. ******************** Ver.60: Training model with 69 features Model has a training rmse of 27094.90624318574, and cross_validation rmse of 25813.709654453243. ******************** Ver.61: Training model with 70 features Model has a training rmse of 27173.236206629113, and cross_validation rmse of 25761.44712170298. ******************** Ver.62: Training model with 71 features Model has a training rmse of 26895.720925422705, and cross_validation rmse of 25409.429228407243. ******************** Ver.63: Training model with 72 features Model has a training rmse of 26818.99957815695, and cross_validation rmse of 25708.846679573187. ******************** Ver.64: Training model with 73 features Model has a training rmse of 26634.327028266005, and cross_validation rmse of 25578.324250987414. ******************** Ver.65: Training model with 74 features Model has a training rmse of 26582.60976338655, and cross_validation rmse of 25498.46609462126. ******************** Ver.66: Training model with 75 features Model has a training rmse of 26625.698754867255, and cross_validation rmse of 25532.297889577167. ******************** Ver.67: Training model with 76 features Model has a training rmse of 26572.690933381255, and cross_validation rmse of 25650.292759271084. ******************** Ver.68: Training model with 77 features Model has a training rmse of 26417.244711184612, and cross_validation rmse of 25525.119088333657. 
******************** Ver.69: Training model with 78 features Model has a training rmse of 26476.911900046016, and cross_validation rmse of 25705.27823529284. ******************** Ver.70: Training model with 79 features Model has a training rmse of 26182.568036989225, and cross_validation rmse of 25576.364529214356. ******************** Ver.71: Training model with 80 features Model has a training rmse of 26202.5427936338, and cross_validation rmse of 25720.128604785954. ******************** Ver.72: Training model with 81 features Model has a training rmse of 26159.565020455204, and cross_validation rmse of 25635.093437666375. ******************** Ver.73: Training model with 82 features Model has a training rmse of 26041.239050720153, and cross_validation rmse of 25654.380283571307. ******************** Ver.74: Training model with 83 features Model has a training rmse of 25990.327991350467, and cross_validation rmse of 25660.365624021695. ******************** Ver.75: Training model with 84 features Model has a training rmse of 26096.173645295174, and cross_validation rmse of 25636.150298011176. ******************** Ver.76: Training model with 85 features Model has a training rmse of 25845.105905197168, and cross_validation rmse of 25700.431915116555. ******************** Ver.77: Training model with 86 features Model has a training rmse of 25954.34558266699, and cross_validation rmse of 25635.03558747344. ******************** Ver.78: Training model with 87 features Model has a training rmse of 25989.71614451249, and cross_validation rmse of 25729.050717311085. ******************** Ver.79: Training model with 88 features Model has a training rmse of 25882.16822416779, and cross_validation rmse of 25840.956567768142. ******************** Ver.80: Training model with 89 features Model has a training rmse of 25773.246299581206, and cross_validation rmse of 25848.25114896974. 
******************** Ver.81: Training model with 90 features Model has a training rmse of 25892.066800586294, and cross_validation rmse of 25711.233651169776. ******************** Ver.82: Training model with 91 features Model has a training rmse of 25712.662900050946, and cross_validation rmse of 25533.388390586308. ******************** Ver.83: Training model with 92 features Model has a training rmse of 25678.80904653712, and cross_validation rmse of 25566.91202369129. ******************** Ver.84: Training model with 93 features Model has a training rmse of 25717.896499629867, and cross_validation rmse of 25329.45027791872. ******************** Ver.85: Training model with 94 features Model has a training rmse of 25743.74351199665, and cross_validation rmse of 25523.688950829554. ******************** Ver.86: Training model with 95 features Model has a training rmse of 25651.798353165515, and cross_validation rmse of 25591.056408445278. ******************** Ver.87: Training model with 96 features Model has a training rmse of 25606.675194273288, and cross_validation rmse of 25535.415537811285. ******************** Ver.88: Training model with 97 features Model has a training rmse of 25463.259371899603, and cross_validation rmse of 25292.363422490176. ******************** Ver.89: Training model with 98 features Model has a training rmse of 25510.98170088129, and cross_validation rmse of 25563.59988123535. ******************** Ver.90: Training model with 99 features Model has a training rmse of 25512.722573977208, and cross_validation rmse of 25270.737784809236. ********************
# Find the model with the lowest cross-validation rmse
def best_performance(rmses_val):
    """Print and return the best (lowest-RMSE) entry of `rmses_val`.

    Parameters
    ----------
    rmses_val : dict
        Maps a model description string to its validation RMSE.

    Returns
    -------
    tuple
        (model_name, rmse) for the entry with the smallest RMSE.

    Raises
    ------
    ValueError
        If `rmses_val` is empty.
    """
    if not rmses_val:
        raise ValueError('rmses_val is empty; nothing to compare')
    min_val_key = min(rmses_val, key=rmses_val.get)
    best_rmse = rmses_val[min_val_key]
    # Same message as before; also return the winner so callers can use it
    print('{} has the lowest cross-validation rmse:'.format(min_val_key), str(best_rmse))
    return min_val_key, best_rmse
best_performance(rmses_val)
Model with 99 features has the lowest cross-validation rmse: 25270.737784809236
# Plot the learning curve of the best PCA model (99 components):
# train on growing sample sizes and record train/validation MSE.
error_pairs = [
    train_and_test(df=non_null_data,
                   target='saleprice',
                   feature_num=99,
                   training_size=sample_size,
                   feature_selection='pca')
    for sample_size in range(1, 400)
]
mses_train_sm = [train_err for train_err, _ in error_pairs]
mses_val_sm = [val_err for _, val_err in error_pairs]
plt.plot(mses_train_sm, color='red', label='training error')
plt.plot(mses_val_sm, color='orange', label='cross-validation error')
plt.legend()
plt.xlim(1, 400)
plt.ylim(0, 0.1e10)
plt.title('Learning curve for unregularized linear regression')
Text(0.5, 1.0, 'Learning curve for unregularized linear regression')
# Sweep the number of numerical features with type-sensitive selection
rmses_train_bytype = {}
rmses_val_bytype = {}
# There are 37 numerical columns in total
for num_feats in range(1, 37):
    print(f'Ver.{num_feats}:')
    rmse_train, rmse_val = train_and_test(df=non_null_data,
                                          target='saleprice',
                                          feature_num=num_feats,
                                          feature_selection='type')
    label = f'Model with {num_feats} numerical features'
    rmses_train_bytype[label] = rmse_train
    rmses_val_bytype[label] = rmse_val
Ver.1: Training model with 1 features Model has a training rmse of 29896.49873344851, and cross_validation rmse of 8.254544980908652e+17. ******************** Ver.2: Training model with 2 features Model has a training rmse of 26133.032853354965, and cross_validation rmse of 5.654435874996211e+17. ******************** Ver.3: Training model with 3 features Model has a training rmse of 24759.880622937348, and cross_validation rmse of 5.0546624166998106e+17. ******************** Ver.4: Training model with 4 features Model has a training rmse of 24723.321509523106, and cross_validation rmse of 3.703736226167728e+17. ******************** Ver.5: Training model with 5 features Model has a training rmse of 24783.510151732455, and cross_validation rmse of 1.1651036644123674e+18. ******************** Ver.6: Training model with 6 features Model has a training rmse of 24704.067951980465, and cross_validation rmse of 7.287134515827548e+17. ******************** Ver.7: Training model with 7 features Model has a training rmse of 24758.054522096205, and cross_validation rmse of 1.9826053798589002e+18. ******************** Ver.8: Training model with 8 features Model has a training rmse of 24664.899361809792, and cross_validation rmse of 3.713157777150656e+17. ******************** Ver.9: Training model with 9 features Model has a training rmse of 24998.31461900748, and cross_validation rmse of 3.2617314992621555e+17. ******************** Ver.10: Training model with 10 features Model has a training rmse of 24529.069710188633, and cross_validation rmse of 9.94917575845344e+16. ******************** Ver.11: Training model with 11 features Model has a training rmse of 24328.622326346507, and cross_validation rmse of 5.49547827260809e+17. ******************** Ver.12: Training model with 12 features Model has a training rmse of 24346.209839506595, and cross_validation rmse of 1.1801990409242788e+18. 
******************** Ver.13: Training model with 13 features Model has a training rmse of 24777.520113247676, and cross_validation rmse of 5.4616430612497786e+17. ******************** Ver.14: Training model with 14 features Model has a training rmse of 23883.58263918169, and cross_validation rmse of 1.5446022194771222e+17. ******************** Ver.15: Training model with 15 features Model has a training rmse of 23925.94947329908, and cross_validation rmse of 3.2611894268277024e+17. ******************** Ver.16: Training model with 16 features Model has a training rmse of 23883.76943018631, and cross_validation rmse of 1.5594020842181862e+18. ******************** Ver.17: Training model with 17 features Model has a training rmse of 23832.10664603602, and cross_validation rmse of 1.030833655743287e+18. ******************** Ver.18: Training model with 18 features Model has a training rmse of 23966.62555106253, and cross_validation rmse of 1.250460513455006e+18. ******************** Ver.19: Training model with 19 features Model has a training rmse of 23727.016186105797, and cross_validation rmse of 4.2064771365949344e+17. ******************** Ver.20: Training model with 20 features Model has a training rmse of 23706.79091353854, and cross_validation rmse of 1.3093941621967133e+17. ******************** Ver.21: Training model with 21 features Model has a training rmse of 23576.756509041054, and cross_validation rmse of 1.6284652273295504e+16. ******************** Ver.22: Training model with 22 features Model has a training rmse of 23500.398556200806, and cross_validation rmse of 7.051277312671669e+16. ******************** Ver.23: Training model with 23 features Model has a training rmse of 23480.299357563516, and cross_validation rmse of 6.976275849947753e+17. ******************** Ver.24: Training model with 24 features Model has a training rmse of 23534.167956374844, and cross_validation rmse of 2.1092542258463104e+17. 
******************** Ver.25: Training model with 25 features Model has a training rmse of 23456.143756859314, and cross_validation rmse of 3.198120782390376e+17. ******************** Ver.26: Training model with 26 features Model has a training rmse of 23287.882848666588, and cross_validation rmse of 6.93754261662707e+17. ******************** Ver.27: Training model with 27 features Model has a training rmse of 23009.562783523048, and cross_validation rmse of 9.925628892669309e+16. ******************** Ver.28: Training model with 28 features Model has a training rmse of 22992.006837246474, and cross_validation rmse of 4.0006664363610406e+17. ******************** Ver.29: Training model with 29 features Model has a training rmse of 22959.057115791013, and cross_validation rmse of 3690339131045095.0. ******************** Ver.30: Training model with 30 features Model has a training rmse of 23001.53985405216, and cross_validation rmse of 3.1069652922541184e+16. ******************** Ver.31: Training model with 31 features Model has a training rmse of 22935.609006191666, and cross_validation rmse of 1.3638957466243752e+17. ******************** Ver.32: Training model with 32 features Model has a training rmse of 23010.203855750406, and cross_validation rmse of 3.9014241269147546e+17. ******************** Ver.33: Training model with 33 features Model has a training rmse of 22922.624592235967, and cross_validation rmse of 9.898882287285144e+16. ******************** Ver.34: Training model with 34 features Model has a training rmse of 22904.704779757874, and cross_validation rmse of 2.254029043834957e+17. ******************** Ver.35: Training model with 35 features Model has a training rmse of 22251.016702056866, and cross_validation rmse of 7.977437466412845e+16. ******************** Ver.36: Training model with 36 features Model has a training rmse of 22248.218056996855, and cross_validation rmse of 1.4850447171842723e+17. ********************
best_performance(rmses_val_bytype)
Model with 29 numerical features has the lowest cross-validation rmse: 3690339131045095.0
# Get the learning curve of the model with lowest cross-validation rmse
mses_train_sm_bytype = []
mses_val_sm_bytype = []
for m in range(1, 400):
    mse_train_sm_bytype, mse_val_sm_bytype = train_and_test(df = non_null_data,
                                                            target = 'saleprice',
                                                            feature_num = 29, # Indicates number of numerical features
                                                            training_size=m,
                                                            feature_selection='type')
    mses_train_sm_bytype.append(mse_train_sm_bytype)
    mses_val_sm_bytype.append(mse_val_sm_bytype)
# Fix: previously plotted mses_train_sm / mses_val_sm (the PCA model's
# curves), so this chart silently duplicated the earlier plot. Plot the
# by-type lists computed above instead.
plt.plot(mses_train_sm_bytype, color = 'red', label = 'training error')
plt.plot(mses_val_sm_bytype, color = 'orange', label = 'cross-validation error')
plt.legend()
plt.xlim(1,400)
plt.ylim(0, 0.1e10)
plt.title('Learning curve for unregularized linear regression feature by type')
Text(0.5, 1.0, 'Learning curve for unregularized linear regression feature by type')
The learning curve above — a low training error together with a high cross-validation error — indicates overfitting, i.e. high variance. Next, let's try regularizing the model with a lambda (regularization strength) value.
# Evaluate the best PCA setup (99 components) on the held-out test set
train_and_test(non_null_data, 'saleprice', feature_num = 99, feature_selection = 'pca', test = True)
Training model with 99 features Model has a training rmse of 25512.722573977208, and test rmse of 34156.78191518114. ********************
34156.78191518114
# Evaluate the best by-type setup (29 numerical features) on the held-out test set
train_and_test(non_null_data, 'saleprice', feature_num = 29, feature_selection = 'type', test = True)
Training model with 29 features Model has a training rmse of 22959.057115791013, and test rmse of 2397499759613219.5. ********************
2397499759613219.5
The combination of a low training error and a very high test error above is typical of overfitting.
In this section, we will leave the feature columns be, and use Linear Regression model with l2(Ridge) regularization.
We will compare the regularized Linear Regression with our previous unregularized Linear Regression, both with and without feature selection.
# Function to train & test with Ridge
def regularized_train_and_test(df, target, alpha,
                               training_size = None,
                               feature_selection = None,
                               feature_number = None,
                               test = False):
    """Train a Ridge (L2-regularized) regression and report its error.

    Data is split 60/20/20 into train/validation/test with fixed seeds,
    matching train_and_test so results are comparable.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing features and the target column.
    target : str
        Name of the target column.
    alpha : float
        Ridge regularization strength.
    training_size : int, optional
        When given, fit on only the first `training_size` rows and return
        (mse_train, mse_val) for plotting a learning curve.
    feature_selection : {'type', 'pca', None}
        Feature-selection helper to use; None keeps all features
        (one-hot encoding categoricals).
    feature_number : int, optional
        Passed through to the selection helper when one is used.
    test : bool
        When True, evaluate on the held-out test set and return its RMSE.

    Returns
    -------
    (mse_train, mse_val) when `training_size` is set;
    rmse_test when `test` is True;
    (rmse_train, rmse_val) otherwise.
    """
    if feature_selection == 'type':
        X = select_features_by_type(df, target, feature_number)
    elif feature_selection == 'pca':
        X = select_features_pca(df, target, feature_number)
    else:
        # Get dummies for categorized data
        categorized_features = df.select_dtypes('category')
        categorized_features = pd.get_dummies(categorized_features)
        # Get numerical features (target excluded)
        numerical_features = df.select_dtypes(np.number).drop(target, axis = 1)
        X = pd.concat([numerical_features, categorized_features], axis = 1)
    y = df[target]
    # First split data into train, test (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    # Split train data into train, val (0.25 of 80% -> 60/20 overall)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
    model = Ridge(alpha = alpha)
    # Train model with small sample size for plotting the learning curve
    if training_size:
        X_train_sm = X_train[:training_size]
        y_train_sm = y_train[:training_size]
        X_val_sm = X_val[:training_size]
        y_val_sm = y_val[:training_size]
        model.fit(X_train_sm, y_train_sm)
        pred_train_sm = model.predict(X_train_sm)
        # Learning curves use MSE (not RMSE), matching the plot y-scale
        mse_train_sm = mean_squared_error(y_train_sm, pred_train_sm)
        pred_val_sm = model.predict(X_val_sm)
        mse_val_sm = mean_squared_error(y_val_sm, pred_val_sm)
        return mse_train_sm, mse_val_sm
    if test:
        # Final evaluation on the held-out test set
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        rmse_train = mean_squared_error(y_train, pred_train, squared = False)
        pred_test = model.predict(X_test)
        rmse_test = mean_squared_error(y_test, pred_test, squared = False)
        print('Model has a training rmse of {}, and test rmse of {}.'.format(rmse_train, rmse_test))
        return rmse_test
    # Train model and report cross-validation RMSE
    print('Training model with regularization alpha: {}'.format(alpha))
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    rmse_train = mean_squared_error(y_train, pred_train, squared = False)
    pred_val = model.predict(X_val)
    rmse_val = mean_squared_error(y_val, pred_val, squared = False)
    print('Model has a training rmse of {}, and cross_validation rmse of {}.'.format(rmse_train, rmse_val))
    return rmse_train, rmse_val
# Sweep the Ridge regularization strength alpha (no feature selection)
r_rmses_train = {}
r_rmses_val = {}
for alpha in range(1, 50):
    rmse_train, rmse_val = regularized_train_and_test(non_null_data, 'saleprice', alpha)
    key = f'Model with alpha: {alpha}'
    r_rmses_train[key] = rmse_train
    r_rmses_val[key] = rmse_val
Training model with regularization alpha: 1 Model has a training rmse of 22417.20236351627, and cross_validation rmse of 25215.78283012421. Training model with regularization alpha: 2 Model has a training rmse of 22576.68476954469, and cross_validation rmse of 24864.76631414978. Training model with regularization alpha: 3 Model has a training rmse of 22709.58775954422, and cross_validation rmse of 24662.027498705756. Training model with regularization alpha: 4 Model has a training rmse of 22823.65477819027, and cross_validation rmse of 24525.9801438628. Training model with regularization alpha: 5 Model has a training rmse of 22923.858334912566, and cross_validation rmse of 24427.74800099716. Training model with regularization alpha: 6 Model has a training rmse of 23013.457202712198, and cross_validation rmse of 24353.697642298208. Training model with regularization alpha: 7 Model has a training rmse of 23094.68134489779, and cross_validation rmse of 24296.299336978263. Training model with regularization alpha: 8 Model has a training rmse of 23169.118360276214, and cross_validation rmse of 24250.959226411094. Training model with regularization alpha: 9 Model has a training rmse of 23237.937385801153, and cross_validation rmse of 24214.674860101055. Training model with regularization alpha: 10 Model has a training rmse of 23302.024274615967, and cross_validation rmse of 24185.382309950073. Training model with regularization alpha: 11 Model has a training rmse of 23362.06657357418, and cross_validation rmse of 24161.606945150397. Training model with regularization alpha: 12 Model has a training rmse of 23418.608933937045, and cross_validation rmse of 24142.26251811979. Training model with regularization alpha: 13 Model has a training rmse of 23472.09043117432, and cross_validation rmse of 24126.528811403216. Training model with regularization alpha: 14 Model has a training rmse of 23522.87042572415, and cross_validation rmse of 24113.773647963764. 
Training model with regularization alpha: 15 Model has a training rmse of 23571.246937231113, and cross_validation rmse of 24103.50128362192. Training model with regularization alpha: 16 Model has a training rmse of 23617.469990906255, and cross_validation rmse of 24095.31717288603. Training model with regularization alpha: 17 Model has a training rmse of 23661.751503244384, and cross_validation rmse of 24088.90326729176. Training model with regularization alpha: 18 Model has a training rmse of 23704.2727329814, and cross_validation rmse of 24084.00029891489. Training model with regularization alpha: 19 Model has a training rmse of 23745.18998504733, and cross_validation rmse of 24080.3948199162. Training model with regularization alpha: 20 Model has a training rmse of 23784.639038632373, and cross_validation rmse of 24077.909555391587. Training model with regularization alpha: 21 Model has a training rmse of 23822.738628435214, and cross_validation rmse of 24076.396111406815. Training model with regularization alpha: 22 Model has a training rmse of 23859.59321304461, and cross_validation rmse of 24075.729387315154. Training model with regularization alpha: 23 Model has a training rmse of 23895.29519948119, and cross_validation rmse of 24075.80324115286. Training model with regularization alpha: 24 Model has a training rmse of 23929.92674782726, and cross_validation rmse of 24076.527089651478. Training model with regularization alpha: 25 Model has a training rmse of 23963.56124803814, and cross_validation rmse of 24077.823214415428. Training model with regularization alpha: 26 Model has a training rmse of 23996.264538225456, and cross_validation rmse of 24079.62460796194. Training model with regularization alpha: 27 Model has a training rmse of 24028.095917141945, and cross_validation rmse of 24081.873236931453. Training model with regularization alpha: 28 Model has a training rmse of 24059.108991421428, and cross_validation rmse of 24084.518630844614. 
Training model with regularization alpha: 29 Model has a training rmse of 24089.352389068998, and cross_validation rmse of 24087.51672720918. Training model with regularization alpha: 30 Model has a training rmse of 24118.870363883354, and cross_validation rmse of 24090.82892017868. Training model with regularization alpha: 31 Model has a training rmse of 24147.703310319957, and cross_validation rmse of 24094.4212720936. Training model with regularization alpha: 32 Model has a training rmse of 24175.88820433478, and cross_validation rmse of 24098.263856296904. Training model with regularization alpha: 33 Model has a training rmse of 24203.458982680124, and cross_validation rmse of 24102.330206459257. Training model with regularization alpha: 34 Model has a training rmse of 24230.446870730542, and cross_validation rmse of 24106.596852861407. Training model with regularization alpha: 35 Model has a training rmse of 24256.880667035108, and cross_validation rmse of 24111.042930085936. Training model with regularization alpha: 36 Model has a training rmse of 24282.786991304398, and cross_validation rmse of 24115.649843671254. Training model with regularization alpha: 37 Model has a training rmse of 24308.19050135228, and cross_validation rmse of 24120.400985701475. Training model with regularization alpha: 38 Model has a training rmse of 24333.114083561828, and cross_validation rmse of 24125.281491208167. Training model with regularization alpha: 39 Model has a training rmse of 24357.579020675326, and cross_validation rmse of 24130.278028764045. Training model with regularization alpha: 40 Model has a training rmse of 24381.605140085, and cross_validation rmse of 24135.37861984461. Training model with regularization alpha: 41 Model has a training rmse of 24405.210945291972, and cross_validation rmse of 24140.572482497515. Training model with regularization alpha: 42 Model has a training rmse of 24428.413732783203, and cross_validation rmse of 24145.849895624044. 
Training model with regularization alpha: 43 Model has a training rmse of 24451.229696231974, and cross_validation rmse of 24151.20208080889. Training model with regularization alpha: 44 Model has a training rmse of 24473.67401964192, and cross_validation rmse of 24156.621099139837. Training model with regularization alpha: 45 Model has a training rmse of 24495.76096081784, and cross_validation rmse of 24162.099760872567. Training model with regularization alpha: 46 Model has a training rmse of 24517.503926347592, and cross_validation rmse of 24167.63154614188. Training model with regularization alpha: 47 Model has a training rmse of 24538.915539113466, and cross_validation rmse of 24173.210535196366. Training model with regularization alpha: 48 Model has a training rmse of 24560.0076992114, and cross_validation rmse of 24178.831346871157. Training model with regularization alpha: 49 Model has a training rmse of 24580.791639037063, and cross_validation rmse of 24184.489084204204.
# Find the model with the lowest cross-validation rmse
# (scans the alpha -> cv-RMSE dict built by the alpha sweep above and reports the best).
best_performance(r_rmses_val)
Model with alpha: 22 has the lowest cross-validation rmse: 24075.729387315154
# Learning curve for regularized linear regression without feature selection:
# retrain the alpha=22 model (best alpha found above) on growing training-set
# sizes and track training vs. cross-validation error.
r_mses_train_sm = []
r_mses_val_sm = []
for m in range(1, 300):
    r_mse_train_sm, r_mse_val_sm = regularized_train_and_test(
        non_null_data, 'saleprice', 22, training_size=m)
    r_mses_train_sm.append(r_mse_train_sm)
    r_mses_val_sm.append(r_mse_val_sm)
plt.plot(r_mses_train_sm, color='red', label='regularized_training error')
plt.plot(r_mses_val_sm, color='orange', label='regularized_cross-validation error')
plt.xlim(1, 100)
plt.ylim(0, 0.2e10)
# Fixed typo in title: 'seleciton' -> 'selection'.
plt.title('Learning curve for regularized linear regression without feature selection')
plt.legend()
<matplotlib.legend.Legend at 0x7f91427dad50>
# Sweep the regularization strength (alpha 1..49) using by-type feature
# selection (top 29 features) and record train / cross-validation RMSE per model.
r_rmses_train_type = {}
r_rmses_val_type = {}
for reg_strength in range(1, 50):
    train_err, val_err = regularized_train_and_test(
        non_null_data, 'saleprice', reg_strength,
        feature_selection='type', feature_number=29)
    label = 'Model with alpha: {}'.format(reg_strength)
    r_rmses_train_type[label] = train_err
    r_rmses_val_type[label] = val_err
Training model with regularization alpha: 1 Model has a training rmse of 23091.620392188084, and cross_validation rmse of 25428.428036528872. Training model with regularization alpha: 2 Model has a training rmse of 23229.17589591547, and cross_validation rmse of 25085.72167287561. Training model with regularization alpha: 3 Model has a training rmse of 23350.047245439677, and cross_validation rmse of 24882.350668011004. Training model with regularization alpha: 4 Model has a training rmse of 23456.86311129338, and cross_validation rmse of 24744.24193359724. Training model with regularization alpha: 5 Model has a training rmse of 23552.526805044326, and cross_validation rmse of 24643.965967830878. Training model with regularization alpha: 6 Model has a training rmse of 23639.279737064015, and cross_validation rmse of 24568.22317198507. Training model with regularization alpha: 7 Model has a training rmse of 23718.78478067093, and cross_validation rmse of 24509.534499089837. Training model with regularization alpha: 8 Model has a training rmse of 23792.285704596656, and cross_validation rmse of 24463.276730206253. Training model with regularization alpha: 9 Model has a training rmse of 23860.73026686621, and cross_validation rmse of 24426.397959673573. Training model with regularization alpha: 10 Model has a training rmse of 23924.855201339193, and cross_validation rmse of 24396.78388225504. Training model with regularization alpha: 11 Model has a training rmse of 23985.24401335224, and cross_validation rmse of 24372.91451500174. Training model with regularization alpha: 12 Model has a training rmse of 24042.36669002751, and cross_validation rmse of 24353.664641811178. Training model with regularization alpha: 13 Model has a training rmse of 24096.607476599893, and cross_validation rmse of 24338.181222070263. Training model with regularization alpha: 14 Model has a training rmse of 24148.284690400542, and cross_validation rmse of 24325.804664612093. 
Training model with regularization alpha: 15 Model has a training rmse of 24197.665129573525, and cross_validation rmse of 24316.016396597053. Training model with regularization alpha: 16 Model has a training rmse of 24244.97474209576, and cross_validation rmse of 24308.4028658586. Training model with regularization alpha: 17 Model has a training rmse of 24290.406658332944, and cross_validation rmse of 24302.630179665077. Training model with regularization alpha: 18 Model has a training rmse of 24334.12733136862, and cross_validation rmse of 24298.425836284026. Training model with regularization alpha: 19 Model has a training rmse of 24376.281296402987, and cross_validation rmse of 24295.565309555415. Training model with regularization alpha: 20 Model has a training rmse of 24416.994906686985, and cross_validation rmse of 24293.862029228618. Training model with regularization alpha: 21 Model has a training rmse of 24456.37930006325, and cross_validation rmse of 24293.1597846822. Training model with regularization alpha: 22 Model has a training rmse of 24494.532779498957, and cross_validation rmse of 24293.326888559353. Training model with regularization alpha: 23 Model has a training rmse of 24531.542741882033, and cross_validation rmse of 24294.25163858454. Training model with regularization alpha: 24 Model has a training rmse of 24567.487254704687, and cross_validation rmse of 24295.838750477076. Training model with regularization alpha: 25 Model has a training rmse of 24602.43635546629, and cross_validation rmse of 24298.006526544657. Training model with regularization alpha: 26 Model has a training rmse of 24636.453130649814, and cross_validation rmse of 24300.684588055843. Training model with regularization alpha: 27 Model has a training rmse of 24669.59461792704, and cross_validation rmse of 24303.812044215112. Training model with regularization alpha: 28 Model has a training rmse of 24701.912565445098, and cross_validation rmse of 24307.336002517917. 
Training model with regularization alpha: 29 Model has a training rmse of 24733.454074686615, and cross_validation rmse of 24311.21034840283. Training model with regularization alpha: 30 Model has a training rmse of 24764.262147814243, and cross_validation rmse of 24315.394739079595. Training model with regularization alpha: 31 Model has a training rmse of 24794.376156137063, and cross_validation rmse of 24319.8537689868. Training model with regularization alpha: 32 Model has a training rmse of 24823.832243036002, and cross_validation rmse of 24324.556273753067. Training model with regularization alpha: 33 Model has a training rmse of 24852.663672114104, and cross_validation rmse of 24329.474746664575. Training model with regularization alpha: 34 Model has a training rmse of 24880.901129320882, and cross_validation rmse of 24334.584847078124. Training model with regularization alpha: 35 Model has a training rmse of 24908.572986203955, and cross_validation rmse of 24339.864984409614. Training model with regularization alpha: 36 Model has a training rmse of 24935.70553017099, and cross_validation rmse of 24345.295964574845. Training model with regularization alpha: 37 Model has a training rmse of 24962.3231666276, and cross_validation rmse of 24350.860688298955. Training model with regularization alpha: 38 Model has a training rmse of 24988.448597035524, and cross_validation rmse of 24356.543892709313. Training model with regularization alpha: 39 Model has a training rmse of 25014.102976270144, and cross_validation rmse of 24362.331929210217. Training model with regularization alpha: 40 Model has a training rmse of 25039.306052113254, and cross_validation rmse of 24368.212571897995. Training model with regularization alpha: 41 Model has a training rmse of 25064.07628927207, and cross_validation rmse of 24374.1748517872. Training model with regularization alpha: 42 Model has a training rmse of 25088.43097994867, and cross_validation rmse of 24380.20891293513. 
Training model with regularization alpha: 43 Model has a training rmse of 25112.38634268044, and cross_validation rmse of 24386.305887209655. Training model with regularization alpha: 44 Model has a training rmse of 25135.957610919722, and cross_validation rmse of 24392.45778498513. Training model with regularization alpha: 45 Model has a training rmse of 25159.159112610134, and cross_validation rmse of 24398.657399491385. Training model with regularization alpha: 46 Model has a training rmse of 25182.004341839693, and cross_validation rmse of 24404.898222900487. Training model with regularization alpha: 47 Model has a training rmse of 25204.50602350283, and cross_validation rmse of 24411.174372535857. Training model with regularization alpha: 48 Model has a training rmse of 25226.676171776813, and cross_validation rmse of 24417.480525835028. Training model with regularization alpha: 49 Model has a training rmse of 25248.526143111867, and cross_validation rmse of 24423.81186290303.
# Report the alpha with the lowest cross-validation RMSE among the by-type models.
best_performance(r_rmses_val_type)
Model with alpha: 21 has the lowest cross-validation rmse: 24293.1597846822
# Learning curve for the best by-type model (alpha=21): retrain on growing
# training-set sizes and track training vs. cross-validation error.
r_mses_train_sm_type = []
r_mses_val_sm_type = []
for m in range(1, 300):
    # Consistency fix: every other feature_selection='type' call passes
    # feature_number=29 (see the alpha sweep and the final test evaluation);
    # 'numerical_number' was a stray keyword here.
    r_mse_train_sm, r_mse_val_sm = regularized_train_and_test(
        non_null_data, 'saleprice', alpha=21, training_size=m,
        feature_selection='type', feature_number=29)
    r_mses_train_sm_type.append(r_mse_train_sm)
    r_mses_val_sm_type.append(r_mse_val_sm)
plt.plot(r_mses_train_sm_type, color='red', label='regularized_training error')
plt.plot(r_mses_val_sm_type, color='orange', label='regularized_cross-validation error')
plt.xlim(1, 100)
plt.ylim(0, 0.2e10)
plt.title('Learning curve for regularized linear regression with feature selection')
plt.legend()
<matplotlib.legend.Legend at 0x7f91427c8b50>
# Sweep the regularization strength (alpha 1..49) using PCA dimensionality
# reduction (99 components) and record train / cross-validation RMSE per model.
r_rmses_train_pca = {}
r_rmses_val_pca = {}
for reg_strength in range(1, 50):
    train_err, val_err = regularized_train_and_test(
        non_null_data, 'saleprice', reg_strength,
        feature_selection='pca', feature_number=99)
    label = 'Model with alpha: {}'.format(reg_strength)
    r_rmses_train_pca[label] = train_err
    r_rmses_val_pca[label] = val_err
Training model with regularization alpha: 1 Model has a training rmse of 25512.950560249235, and cross_validation rmse of 25255.149669138045. Training model with regularization alpha: 2 Model has a training rmse of 25513.61578093464, and cross_validation rmse of 25240.26567702547. Training model with regularization alpha: 3 Model has a training rmse of 25514.691593687814, and cross_validation rmse of 25226.051146606333. Training model with regularization alpha: 4 Model has a training rmse of 25516.153180353576, and cross_validation rmse of 25212.47361987071. Training model with regularization alpha: 5 Model has a training rmse of 25517.97739879202, and cross_validation rmse of 25199.50266821296. Training model with regularization alpha: 6 Model has a training rmse of 25520.14264881916, and cross_validation rmse of 25187.109734533376. Training model with regularization alpha: 7 Model has a training rmse of 25522.628750717304, and cross_validation rmse of 25175.26799006724. Training model with regularization alpha: 8 Model has a training rmse of 25525.416834959837, and cross_validation rmse of 25163.95220434511. Training model with regularization alpha: 9 Model has a training rmse of 25528.48924196036, and cross_validation rmse of 25153.13862688357. Training model with regularization alpha: 10 Model has a training rmse of 25531.829430798578, and cross_validation rmse of 25142.804879376174. Training model with regularization alpha: 11 Model has a training rmse of 25535.42189599891, and cross_validation rmse of 25132.929857300023. Training model with regularization alpha: 12 Model has a training rmse of 25539.25209154528, and cross_validation rmse of 25123.493639980974. Training model with regularization alpha: 13 Model has a training rmse of 25543.30636140911, and cross_validation rmse of 25114.47740827145. Training model with regularization alpha: 14 Model has a training rmse of 25547.57187594887, and cross_validation rmse of 25105.863369089977. 
Training model with regularization alpha: 15 Model has a training rmse of 25552.036573611294, and cross_validation rmse of 25097.63468615691. Training model with regularization alpha: 16 Model has a training rmse of 25556.68910742657, and cross_validation rmse of 25089.775416333723. Training model with regularization alpha: 17 Model has a training rmse of 25561.518795844855, and cross_validation rmse of 25082.27045103724. Training model with regularization alpha: 18 Model has a training rmse of 25566.515577509752, and cross_validation rmse of 25075.105462258205. Training model with regularization alpha: 19 Model has a training rmse of 25571.66996960689, and cross_validation rmse of 25068.266852762. Training model with regularization alpha: 20 Model has a training rmse of 25576.973029463294, and cross_validation rmse of 25061.7417100942. Training model with regularization alpha: 21 Model has a training rmse of 25582.41631910661, and cross_validation rmse of 25055.517764052005. Training model with regularization alpha: 22 Model has a training rmse of 25587.991872522478, and cross_validation rmse of 25049.5833473174. Training model with regularization alpha: 23 Model has a training rmse of 25593.69216537456, and cross_validation rmse of 25043.927358977784. Training model with regularization alpha: 24 Model has a training rmse of 25599.510086975057, and cross_validation rmse of 25038.539230687507. Training model with regularization alpha: 25 Model has a training rmse of 25605.438914313912, and cross_validation rmse of 25033.408895247238. Training model with regularization alpha: 26 Model has a training rmse of 25611.472287973727, and cross_validation rmse of 25028.526757399977. Training model with regularization alpha: 27 Model has a training rmse of 25617.604189773487, and cross_validation rmse of 25023.883666661317. Training model with regularization alpha: 28 Model has a training rmse of 25623.828921999124, and cross_validation rmse of 25019.470892018908. 
Training model with regularization alpha: 29 Model has a training rmse of 25630.141088092107, and cross_validation rmse of 25015.280098351122. Training model with regularization alpha: 30 Model has a training rmse of 25636.535574678936, and cross_validation rmse of 25011.303324429005. Training model with regularization alpha: 31 Model has a training rmse of 25643.007534835193, and cross_validation rmse of 25007.532962377278. Training model with regularization alpha: 32 Model has a training rmse of 25649.552372487102, and cross_validation rmse of 25003.961738482063. Training model with regularization alpha: 33 Model has a training rmse of 25656.16572786248, and cross_validation rmse of 25000.58269524213. Training model with regularization alpha: 34 Model has a training rmse of 25662.843463910263, and cross_validation rmse of 24997.389174569842. Training model with regularization alpha: 35 Model has a training rmse of 25669.581653615245, and cross_validation rmse of 24994.374802056223. Training model with regularization alpha: 36 Model has a training rmse of 25676.37656814049, and cross_validation rmse of 24991.5334722213. Training model with regularization alpha: 37 Model has a training rmse of 25683.224665736005, and cross_validation rmse of 24988.859334678273. Training model with regularization alpha: 38 Model has a training rmse of 25690.122581357162, and cross_validation rmse of 24986.346781145214. Training model with regularization alpha: 39 Model has a training rmse of 25697.067116941194, and cross_validation rmse of 24983.990433244075. Training model with regularization alpha: 40 Model has a training rmse of 25704.055232294144, and cross_validation rmse of 24981.785131031444. Training model with regularization alpha: 41 Model has a training rmse of 25711.084036544806, and cross_validation rmse of 24979.725922209782. Training model with regularization alpha: 42 Model has a training rmse of 25718.150780125394, and cross_validation rmse of 24977.80805197248. 
Training model with regularization alpha: 43 Model has a training rmse of 25725.25284724214, and cross_validation rmse of 24976.026953439057. Training model with regularization alpha: 44 Model has a training rmse of 25732.387748801702, and cross_validation rmse of 24974.378238640897. Training model with regularization alpha: 45 Model has a training rmse of 25739.553115762104, and cross_validation rmse of 24972.857690020457. Training model with regularization alpha: 46 Model has a training rmse of 25746.74669287927, and cross_validation rmse of 24971.461252410118. Training model with regularization alpha: 47 Model has a training rmse of 25753.96633282247, and cross_validation rmse of 24970.18502545898. Training model with regularization alpha: 48 Model has a training rmse of 25761.209990633877, and cross_validation rmse of 24969.02525647891. Training model with regularization alpha: 49 Model has a training rmse of 25768.475718509635, and cross_validation rmse of 24967.978333682328.
# Report the alpha with the lowest cross-validation RMSE among the PCA models.
best_performance(r_rmses_val_pca)
Model with alpha: 49 has the lowest cross-validation rmse: 24967.978333682328
# Learning curve for the best PCA model (alpha=49): retrain on growing
# training-set sizes and track training vs. cross-validation error.
r_mses_train_sm_pca = []
r_mses_val_sm_pca = []
for sample_count in range(1, 300):
    r_mse_train_sm, r_mse_val_sm = regularized_train_and_test(
        non_null_data, 'saleprice', alpha=49, training_size=sample_count,
        feature_selection='pca', feature_number=99)
    r_mses_train_sm_pca.append(r_mse_train_sm)
    r_mses_val_sm_pca.append(r_mse_val_sm)
plt.plot(r_mses_train_sm_pca, color='red', label='regularized_training error')
plt.plot(r_mses_val_sm_pca, color='orange', label='regularized_cross-validation error')
plt.xlim(1, 100)
plt.ylim(0, 0.2e10)
plt.title('Learning curve for regularized linear regression with pca')
plt.legend()
<matplotlib.legend.Legend at 0x7f9158a0afd0>
# Best without feature selection: evaluate the alpha=22 model on the held-out test set.
regularized_train_and_test(non_null_data, 'saleprice', alpha=22, test=True)
Model has a training rmse of 23859.59321304461, and test rmse of 33184.771578804764.
33184.771578804764
# Best with by-type feature selection: evaluate the alpha=21 model on the held-out test set.
regularized_train_and_test(non_null_data, 'saleprice', alpha=21,
                           feature_selection='type', feature_number=29,
                           test=True)
Model has a training rmse of 24456.37930006325, and test rmse of 32341.53439894292.
32341.53439894292
# Best with PCA: evaluate the alpha=49 model on the held-out test set.
regularized_train_and_test(non_null_data, 'saleprice', alpha=49,
                           feature_selection='pca', feature_number=99,
                           test=True)
Model has a training rmse of 25768.475718509635, and test rmse of 34665.044236412556.
34665.044236412556
*Results:*
| | no feature selection | pca | feature selection by type |
|---|---|---|---|
unregularized | none | cv: 25271 test: 34157 | cv: 3.7e15 test: 2.4e15 |
regularized | cv: 24076 test: 33185 | cv: 24968 test: 34665 | cv: 24293 test: 32342 |
*From the results above, we can conclude:*
*Next steps:*