# Notebook setup: auto-reload edited modules on each cell execution and
# render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
# Pin GPU selection: enumerate devices by PCI bus ID and expose only GPU 0
# to TensorFlow/CUDA. Must be set before any CUDA library is initialized.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";
import urllib.request
import pandas as pd
import numpy as np
# Show every column when displaying DataFrames (this dataset has ~80 columns).
pd.set_option('display.max_columns', None)
import ktrain
from ktrain import tabular
In this notebook, we will predict the prices of houses from various house attributes. The dataset is the Kaggle "House Prices: Advanced Regression Techniques" dataset, which can be downloaded from the Kaggle competition page.
# Load the Kaggle housing training data; the first CSV column ('Id') becomes
# the DataFrame index rather than a feature column.
train_df = pd.read_csv('data/housing_price/train.csv', index_col=0)
train_df.head()
MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
# Drop columns that are mostly NaN in this dataset (Alley, PoolQC, MiscFeature,
# Fence, FireplaceQu — see the preview above) plus the near-constant 'Utilities'.
# FIX: pass 'axis' by keyword — the positional form drop(labels, 1) was
# deprecated in pandas 1.1 and removed in pandas 2.0 (raises TypeError there).
train_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'], axis=1, inplace=True)
train_df.head()
MSSubClass | MSZoning | LotFrontage | LotArea | Street | LotShape | LandContour | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1 | 60 | RL | 65.0 | 8450 | Pave | Reg | Lvl | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | WD | Normal | 208500 |
2 | 20 | RL | 80.0 | 9600 | Pave | Reg | Lvl | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | WD | Normal | 181500 |
3 | 60 | RL | 68.0 | 11250 | Pave | IR1 | Lvl | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | WD | Normal | 223500 |
4 | 70 | RL | 60.0 | 9550 | Pave | IR1 | Lvl | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | WD | Abnorml | 140000 |
5 | 60 | RL | 84.0 | 14260 | Pave | IR1 | Lvl | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | WD | Normal | 250000 |
# Preprocess the DataFrame into ktrain train/validation datasets plus a
# reusable preprocessor. is_regression=True treats 'SalePrice' as a continuous
# target; random_state=42 makes the train/validation split reproducible.
trn, val, preproc = tabular.tabular_from_df(train_df, is_regression=True,
label_columns='SalePrice', random_state=42)
processing train: 1309 rows x 74 columns The following integer column(s) are being treated as categorical variables: ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', '3SsnPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'] To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV and re-run tabular_from* function. processing test: 151 rows x 74 columns
/home/amaiya/projects/ghub/ktrain/ktrain/utils.py:556: UserWarning: Task is being treated as REGRESSION because either class_names argument was not supplied or is_regression=True. If this is incorrect, change accordingly. 'If this is incorrect, change accordingly.')
Learner
# FIX: removed a stray '¶' glyph (an anchor-link character carried over from
# the "Learner" markdown header during notebook export) that made this line
# invalid Python.
# Build an MLP regression model sized for the preprocessed training data, and
# wrap model + data in a ktrain Learner for training.
model = tabular.tabular_regression_model('mlp', trn)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
done.
# Simulate training over a sweep of increasing learning rates and plot loss
# vs. LR; max_epochs caps the sweep. Pick an LR where the loss is falling.
learner.lr_find(show_plot=True, max_epochs=16)
simulating training for different learning rates... this may take a few moments... Train for 10 steps Epoch 1/16 10/10 [==============================] - 5s 526ms/step - loss: 39023478485.7307 - mae: 181231.9375 Epoch 2/16 10/10 [==============================] - 1s 97ms/step - loss: 39033418674.8315 - mae: 181204.8594 Epoch 3/16 10/10 [==============================] - 1s 99ms/step - loss: 38418801680.4742 - mae: 179555.7969 Epoch 4/16 10/10 [==============================] - 1s 100ms/step - loss: 38186333255.9661 - mae: 179885.9062 Epoch 5/16 10/10 [==============================] - 1s 96ms/step - loss: 39033367996.8027 - mae: 181204.7500 Epoch 6/16 10/10 [==============================] - 1s 98ms/step - loss: 39178636118.9229 - mae: 181407.5469 Epoch 7/16 10/10 [==============================] - 1s 97ms/step - loss: 39022837652.4843 - mae: 181230.2969 Epoch 8/16 10/10 [==============================] - 1s 96ms/step - loss: 38555598667.6511 - mae: 180373.7500 Epoch 9/16 10/10 [==============================] - 1s 96ms/step - loss: 38548896255.5665 - mae: 180083.1719 Epoch 10/16 10/10 [==============================] - 1s 97ms/step - loss: 35094031275.8950 - mae: 170316.9062 Epoch 11/16 10/10 [==============================] - 1s 101ms/step - loss: 10132749122.6554 - mae: 77930.0859 Epoch 12/16 10/10 [==============================] - 1s 96ms/step - loss: 7250272029.6969 - mae: 73190.8281 Epoch 13/16 10/10 [==============================] - 1s 98ms/step - loss: 7010621260.5182 - mae: 70700.7656 Epoch 14/16 10/10 [==============================] - 1s 96ms/step - loss: 25998046822.4000 - mae: 135040.7500 Epoch 15/16 7/10 [====================>.........] - ETA: 0s - loss: 874942046208.0000 - mae: 330034.8438 done. Visually inspect loss plot and select learning rate associated with falling loss
# Train with a triangular learning-rate policy peaking at 1e-1 (chosen from
# the lr_find plot). With no epoch count given, ktrain auto-enables early
# stopping (patience=5) and reduce-LR-on-plateau (patience=2), per the log below.
learner.autofit(1e-1)
early_stopping automatically enabled at patience=5 reduce_on_plateau automatically enabled at patience=2 begin training using triangular learning rate policy with max lr of 0.1... Train for 11 steps, validate for 5 steps Epoch 1/1024 11/11 [==============================] - 6s 541ms/step - loss: 33410952382.0443 - mae: 156290.1250 - val_loss: 15485848576.0000 - val_mae: 114282.9531 Epoch 2/1024 11/11 [==============================] - 1s 132ms/step - loss: 24494850035.0924 - mae: 145351.7188 - val_loss: 25421583155.2000 - val_mae: 153021.7188 Epoch 3/1024 11/11 [==============================] - 1s 136ms/step - loss: 18668215047.6272 - mae: 125661.6016 - val_loss: 12586017996.8000 - val_mae: 91371.8438 Epoch 4/1024 11/11 [==============================] - 1s 133ms/step - loss: 12545563277.9832 - mae: 84823.7500 - val_loss: 8312404070.4000 - val_mae: 86080.5156 Epoch 5/1024 11/11 [==============================] - 2s 140ms/step - loss: 5367907446.2460 - mae: 61690.7188 - val_loss: 3880212326.4000 - val_mae: 54640.9609 Epoch 6/1024 11/11 [==============================] - 2s 137ms/step - loss: 2749079195.2330 - mae: 35548.7422 - val_loss: 994916563.2000 - val_mae: 22115.1758 Epoch 7/1024 11/11 [==============================] - 2s 137ms/step - loss: 1747315035.4652 - mae: 28340.7891 - val_loss: 942788454.4000 - val_mae: 20258.2383 Epoch 8/1024 11/11 [==============================] - 1s 135ms/step - loss: 1317610653.1398 - mae: 24707.8145 - val_loss: 795084512.0000 - val_mae: 18270.5781 Epoch 9/1024 11/11 [==============================] - 2s 137ms/step - loss: 1345168384.1711 - mae: 23964.7910 - val_loss: 751457507.2000 - val_mae: 17889.5137 Epoch 10/1024 11/11 [==============================] - 2s 138ms/step - loss: 1204894672.8556 - mae: 22083.0527 - val_loss: 729885836.8000 - val_mae: 17330.7344 Epoch 11/1024 11/11 [==============================] - 1s 130ms/step - loss: 1293521195.8075 - mae: 24240.8613 - val_loss: 845886457.6000 - val_mae: 19439.5332 Epoch 
12/1024 8/11 [====================>.........] - ETA: 0s - loss: 1224515284.0000 - mae: 22132.0742 Epoch 00012: Reducing Max LR on Plateau: new max lr will be 0.05 (if not early_stopping). 11/11 [==============================] - 1s 134ms/step - loss: 1196409911.1933 - mae: 22769.3926 - val_loss: 997325734.4000 - val_mae: 21638.4668 Epoch 13/1024 11/11 [==============================] - 1s 136ms/step - loss: 1081636823.2361 - mae: 22792.1152 - val_loss: 709030249.6000 - val_mae: 18628.7402 Epoch 14/1024 11/11 [==============================] - 1s 136ms/step - loss: 984812624.6539 - mae: 20191.5820 - val_loss: 662907520.0000 - val_mae: 16497.9941 Epoch 15/1024 11/11 [==============================] - 1s 135ms/step - loss: 984294369.7418 - mae: 19897.7480 - val_loss: 666114873.6000 - val_mae: 16434.1055 Epoch 16/1024 9/11 [=======================>......] - ETA: 0s - loss: 895732711.1111 - mae: 19749.8887 Epoch 00016: Reducing Max LR on Plateau: new max lr will be 0.025 (if not early_stopping). 11/11 [==============================] - 1s 135ms/step - loss: 957869730.4446 - mae: 19990.8848 - val_loss: 708456806.4000 - val_mae: 18026.2402 Epoch 17/1024 11/11 [==============================] - 1s 133ms/step - loss: 860801337.9251 - mae: 19515.2520 - val_loss: 695209459.2000 - val_mae: 16676.5195 Epoch 18/1024 11/11 [==============================] - 1s 136ms/step - loss: 824453914.3285 - mae: 19024.5078 - val_loss: 661850604.8000 - val_mae: 16811.2227 Epoch 19/1024 11/11 [==============================] - 2s 138ms/step - loss: 801468495.2299 - mae: 18976.1191 - val_loss: 660384323.2000 - val_mae: 16322.7549 Epoch 20/1024 11/11 [==============================] - 1s 131ms/step - loss: 753795500.6753 - mae: 18709.6406 - val_loss: 663768908.8000 - val_mae: 16474.5703 Epoch 21/1024 10/11 [==========================>...] - ETA: 0s - loss: 783667638.4000 - mae: 18711.5293 Epoch 00021: Reducing Max LR on Plateau: new max lr will be 0.0125 (if not early_stopping). 
11/11 [==============================] - 1s 133ms/step - loss: 786054059.7830 - mae: 18731.3438 - val_loss: 664781280.0000 - val_mae: 16376.9863 Epoch 22/1024 11/11 [==============================] - 1s 133ms/step - loss: 825439074.7869 - mae: 19253.4219 - val_loss: 668607993.6000 - val_mae: 16350.0859 Epoch 23/1024 8/11 [====================>.........] - ETA: 0s - loss: 824802156.0000 - mae: 18940.5098 Epoch 00023: Reducing Max LR on Plateau: new max lr will be 0.00625 (if not early_stopping). 11/11 [==============================] - 1s 134ms/step - loss: 790809524.3453 - mae: 18444.7891 - val_loss: 669713436.8000 - val_mae: 16343.8193 Epoch 24/1024 9/11 [=======================>......] - ETA: 0s - loss: 714549009.7778 - mae: 18092.3867Restoring model weights from the end of the best epoch. 11/11 [==============================] - 1s 136ms/step - loss: 736000784.4584 - mae: 18215.8066 - val_loss: 666175027.2000 - val_mae: 16389.6992 Epoch 00024: early stopping Weights from best epoch have been loaded into model.
<tensorflow.python.keras.callbacks.History at 0x7f7c482316d8>
# Report regression metrics (MAE, in dollars) on the held-out validation set.
learner.evaluate(test_data=val)
[('mae', 16322.754966887418)]