Predict diamond prices using the Kaggle data set here (https://www.kaggle.com/shivam2503/diamonds)
The first part of my code is borrowed heavily from Vineel Sindiri's Towards Data Science post on Medium (https://towardsdatascience.com/diamond-price-prediction-based-on-their-cut-colour-clarity-price-with-pytorch-1e0353d2503b), but it's a neural network.
The second part is the traditional sklearn mulitple regression.
!pip install jovian
import torch
import numpy as np
import jovian
import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split
from google.colab import drive
drive.mount('/data/')
data_dir = '/data/My Drive/EMSE 6575/LinearRegressionHomework'
Drive already mounted at /data/; to attempt to forcibly remount, call drive.mount("/data/", force_remount=True). LinearRegressionKoban
data = pd.read_csv(data_dir + '/diamonds.csv')
data.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)
print("data shape: " + str(data.shape))
data.head()
data shape: (53940, 10)
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Drop rows with bogus diamond dimensions
data = data.loc[(data[['x','y','z']]!=0).all(axis=1)]
data.shape
(53920, 10)
# Look at the distributions for the continuous variables
data.hist(bins=50,figsize=(20,15))
plt.show()
# Examine pairwise plots to look for linear relationships visually
sns.pairplot(data , diag_kind = 'kde')
<seaborn.axisgrid.PairGrid at 0x7f930b751e48>
Some of the relationship definitely look to be nonlinear. For example carats relationship to width. . Since carats are a measure of weight, it makes sense the diameter (or any of the dimensions) would have a non-linear relationship with weight. For example if we looked at the volume of a sphere (4/3 pi*r^3) the volume of a sphere grows in a nonlinear fashion with increases to the radius size.
# Examine heatmap of the correlation matrix
plt.figure(figsize = (10,5))
sns.heatmap(data.corr(),annot = True , cmap = 'coolwarm' );
Price appears to be most related to carat size and strongly related to individual measures of size (x, y, z). In contrast, depth percentage (depth
) and table are not closely related.
In short - big diamonds cost more money regardless of their shape.
# Next we look at the categorical variables and how they relate to price
input_cat_columns = data.select_dtypes(include = ['object']).columns.tolist()
for col in input_cat_columns:
sns.catplot(x=col, y="price",
kind="box", dodge=False, height = 5, aspect = 3,data=data);
Nothing stands out in a major way to me, but it looks liked: 1) I and J color grades are on average more pricy, 2) surprisingly the lowest clarity grade (I1) is more expensive, on average, than some of the other higher quality clarities but still has very few high priced diamonds (as shown by the small number of boxplot outlier points).
# One-hod encode the categorical variables so we can actually use them.
data_one_hot_encoding = pd.get_dummies(data)
data_one_hot_encoding.head()
carat | depth | table | price | x | y | z | cut_Fair | cut_Good | cut_Ideal | cut_Premium | cut_Very Good | color_D | color_E | color_F | color_G | color_H | color_I | color_J | clarity_I1 | clarity_IF | clarity_SI1 | clarity_SI2 | clarity_VS1 | clarity_VS2 | clarity_VVS1 | clarity_VVS2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 0.21 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 0.23 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0.29 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0.31 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
# first we'll use a normal train/test split of 80/20
val_percent = 0.20 #Consider 20% of data as validation data
batch_size = 16
torch.manual_seed(99) # Set Seed for Reproducibility
# selected input cols to predict the diamond price
input_cols = data_one_hot_encoding.columns.values.tolist() # Converting the dataframe columns to list
input_cols.remove('price')
input_cols.remove('depth')
# price is the target variable
output_col = ['price']
# convert the dataframes to arrages
def dataframe_to_arrays(dataframe):
# Make a copy of the original dataframe
dataframe1 = dataframe.copy(deep=True)
# Extract input & outupts as numpy arrays
inputs_array = dataframe1[input_cols].to_numpy() #converting dataframes to numpy arrays
targets_array = dataframe1[output_col].to_numpy()
return inputs_array, targets_array
# split up the data into the regressors (inputs) and response variable (targets)
inputs_array, targets_array = dataframe_to_arrays(data_one_hot_encoding)
# Convert to PyTorch dataset
dataset = TensorDataset(torch.tensor(inputs_array, dtype=torch.float32), torch.tensor(targets_array, dtype=torch.float32))
# Split the data into a test and training set
val_size = int(inputs_array.shape[0] * val_percent)
train_size = inputs_array.shape[0] - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size*2)
# Look at a batch of data to verify everything is working
for xb, yb in train_loader:
print(xb.shape)
print(yb.shape)
break
torch.Size([16, 25]) torch.Size([16, 1])
# Formulate a neural network
class DiamondPriceModel(nn.Module):
def __init__(self):
super().__init__()
self.linear = nn.Linear(len(input_cols), len(output_col))
def forward(self, xb):
out = self.linear(xb)
return out
def training_step(self, batch):
inputs, targets = batch
out = self(inputs) # Generate predictions
loss = F.mse_loss(out, targets) # Calculate loss
return loss
def validation_step(self, batch):
inputs, targets = batch
out = self(inputs) # Generate predictions
loss = F.mse_loss(out, targets)
return {'val_loss': loss.detach()}
def validation_epoch_end(self, outputs):
batch_losses = [x['val_loss'] for x in outputs]
epoch_loss = torch.stack(batch_losses).mean() # Combine losses
return {'val_loss': epoch_loss.item()}
def epoch_end(self, epoch, result):
print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))
model = DiamondPriceModel()
list(model.parameters())
[Parameter containing: tensor([[-0.0236, 0.0285, 0.0867, -0.0966, -0.1664, -0.1684, 0.1414, -0.1524, -0.1915, -0.1808, 0.1978, -0.1712, -0.0934, 0.1063, 0.1454, 0.0839, 0.0492, 0.1027, -0.1967, 0.1332, 0.0705, 0.1790, 0.1119, 0.1888, -0.1859]], requires_grad=True), Parameter containing: tensor([-0.0753], requires_grad=True)]
def evaluate(model, val_loader):
outputs = [model.validation_step(batch) for batch in val_loader]
return model.validation_epoch_end(outputs)
def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
history = []
optimizer = opt_func(model.parameters(), lr)
for epoch in range(epochs):
# Training Phase
for batch in train_loader:
loss = model.training_step(batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Validation phase
result = evaluate(model, val_loader)
model.epoch_end(epoch, result)
history.append(result)
return history
result = evaluate(model, val_loader)
result
{'val_loss': 31092834.0}
epochs = 100
lr = 1e-4
history = fit(epochs, lr, model, train_loader, val_loader)
Epoch [1], val_loss: 3191967.5000 Epoch [2], val_loss: 3270314.5000 Epoch [3], val_loss: 3117373.5000 Epoch [4], val_loss: 3284848.7500 Epoch [5], val_loss: 3058512.2500 Epoch [6], val_loss: 2998649.0000 Epoch [7], val_loss: 2931978.0000 Epoch [8], val_loss: 2962788.0000 Epoch [9], val_loss: 2903011.2500 Epoch [10], val_loss: 2881529.0000 Epoch [11], val_loss: 2908995.2500 Epoch [12], val_loss: 2819926.5000 Epoch [13], val_loss: 2785488.2500 Epoch [14], val_loss: 2747942.0000 Epoch [15], val_loss: 2741837.0000 Epoch [16], val_loss: 2691671.0000 Epoch [17], val_loss: 2715485.2500 Epoch [18], val_loss: 2876935.2500 Epoch [19], val_loss: 2670701.5000 Epoch [20], val_loss: 2655907.2500 Epoch [21], val_loss: 2599642.5000 Epoch [22], val_loss: 2563503.0000 Epoch [23], val_loss: 2542094.2500 Epoch [24], val_loss: 2555021.7500 Epoch [25], val_loss: 2686869.7500 Epoch [26], val_loss: 2486597.2500 Epoch [27], val_loss: 2514733.7500 Epoch [28], val_loss: 2999701.0000 Epoch [29], val_loss: 2657407.5000 Epoch [30], val_loss: 2503158.7500 Epoch [31], val_loss: 2537009.2500 Epoch [32], val_loss: 2468742.0000 Epoch [33], val_loss: 2409890.7500 Epoch [34], val_loss: 2454716.7500 Epoch [35], val_loss: 2333963.2500 Epoch [36], val_loss: 2328470.5000 Epoch [37], val_loss: 2498205.7500 Epoch [38], val_loss: 2300742.0000 Epoch [39], val_loss: 2341092.0000 Epoch [40], val_loss: 2297582.2500 Epoch [41], val_loss: 2261484.7500 Epoch [42], val_loss: 2233631.2500 Epoch [43], val_loss: 2288317.0000 Epoch [44], val_loss: 2472119.7500 Epoch [45], val_loss: 2188259.7500 Epoch [46], val_loss: 2281713.2500 Epoch [47], val_loss: 2163731.2500 Epoch [48], val_loss: 2172975.0000 Epoch [49], val_loss: 2199711.7500 Epoch [50], val_loss: 2132118.0000 Epoch [51], val_loss: 2258257.5000 Epoch [52], val_loss: 2119360.5000 Epoch [53], val_loss: 2104539.5000 Epoch [54], val_loss: 2078226.6250 Epoch [55], val_loss: 2067217.5000 Epoch [56], val_loss: 2096229.0000 Epoch [57], val_loss: 2108937.0000 Epoch [58], val_loss: 2160276.5000 Epoch [59], val_loss: 2314162.7500 Epoch [60], val_loss: 2016976.7500 Epoch [61], val_loss: 2002174.1250 Epoch [62], val_loss: 1992228.6250 Epoch [63], val_loss: 2078359.8750 Epoch [64], val_loss: 1975626.5000 Epoch [65], val_loss: 1980692.8750 Epoch [66], val_loss: 2171798.2500 Epoch [67], val_loss: 2160782.5000 Epoch [68], val_loss: 1935280.3750 Epoch [69], val_loss: 2038315.6250 Epoch [70], val_loss: 1986663.7500 Epoch [71], val_loss: 2029741.6250 Epoch [72], val_loss: 2033855.2500 Epoch [73], val_loss: 1893505.8750 Epoch [74], val_loss: 1882096.2500 Epoch [75], val_loss: 1871919.1250 Epoch [76], val_loss: 2049239.7500 Epoch [77], val_loss: 1897434.6250 Epoch [78], val_loss: 1971170.7500 Epoch [79], val_loss: 1839293.8750 Epoch [80], val_loss: 1842433.1250 Epoch [81], val_loss: 1828160.3750 Epoch [82], val_loss: 1849418.1250 Epoch [83], val_loss: 1829339.7500 Epoch [84], val_loss: 1818947.0000 Epoch [85], val_loss: 1794473.0000 Epoch [86], val_loss: 1812740.1250 Epoch [87], val_loss: 1792867.1250 Epoch [88], val_loss: 1822400.7500 Epoch [89], val_loss: 1869268.5000 Epoch [90], val_loss: 1760650.5000 Epoch [91], val_loss: 1864027.0000 Epoch [92], val_loss: 1760863.0000 Epoch [93], val_loss: 1740222.6250 Epoch [94], val_loss: 1779263.7500 Epoch [95], val_loss: 1731497.0000 Epoch [96], val_loss: 1776213.1250 Epoch [97], val_loss: 1714654.2500 Epoch [98], val_loss: 1815826.6250 Epoch [99], val_loss: 1724539.5000 Epoch [100], val_loss: 1701177.3750
#losses = [r['val_loss'] for r in [result] + history]
losses = [r['val_loss'] for r in history]
plt.plot(losses, '-x')
plt.xlabel('epoch')
plt.ylabel('val_loss')
plt.title('val_loss vs. epochs');
epochs = 100
lr = 1e-4
history += fit(epochs, lr, model, train_loader, val_loader)
Epoch [1], val_loss: 1729838.7500 Epoch [2], val_loss: 1707794.6250 Epoch [3], val_loss: 1717449.2500 Epoch [4], val_loss: 1680712.1250 Epoch [5], val_loss: 1677706.2500 Epoch [6], val_loss: 1664883.2500 Epoch [7], val_loss: 1838860.3750 Epoch [8], val_loss: 1668610.8750 Epoch [9], val_loss: 1656368.7500 Epoch [10], val_loss: 1657868.7500 Epoch [11], val_loss: 1662590.1250 Epoch [12], val_loss: 1640304.7500 Epoch [13], val_loss: 1691214.2500 Epoch [14], val_loss: 1648269.6250 Epoch [15], val_loss: 1621398.2500 Epoch [16], val_loss: 1629121.0000 Epoch [17], val_loss: 1616644.7500 Epoch [18], val_loss: 1686487.1250 Epoch [19], val_loss: 1748692.1250 Epoch [20], val_loss: 1596417.8750 Epoch [21], val_loss: 1617926.5000 Epoch [22], val_loss: 1758274.6250 Epoch [23], val_loss: 1600703.0000 Epoch [24], val_loss: 1738608.6250 Epoch [25], val_loss: 1625493.5000 Epoch [26], val_loss: 1571246.7500 Epoch [27], val_loss: 1576111.6250 Epoch [28], val_loss: 1677877.1250 Epoch [29], val_loss: 1629164.8750 Epoch [30], val_loss: 1565951.0000 Epoch [31], val_loss: 1578886.1250 Epoch [32], val_loss: 1582962.8750 Epoch [33], val_loss: 1563267.2500 Epoch [34], val_loss: 1540794.8750 Epoch [35], val_loss: 1582453.1250 Epoch [36], val_loss: 1534271.0000 Epoch [37], val_loss: 1663728.6250 Epoch [38], val_loss: 1594564.7500 Epoch [39], val_loss: 1526511.0000 Epoch [40], val_loss: 1527443.0000 Epoch [41], val_loss: 1691024.7500 Epoch [42], val_loss: 1560186.5000 Epoch [43], val_loss: 1568422.7500 Epoch [44], val_loss: 1538545.7500 Epoch [45], val_loss: 1575142.8750 Epoch [46], val_loss: 1601624.7500 Epoch [47], val_loss: 1595337.2500 Epoch [48], val_loss: 1526671.3750 Epoch [49], val_loss: 1524164.1250 Epoch [50], val_loss: 1489738.5000 Epoch [51], val_loss: 1524942.3750 Epoch [52], val_loss: 1492134.5000 Epoch [53], val_loss: 1494172.1250 Epoch [54], val_loss: 1501972.7500 Epoch [55], val_loss: 1565057.1250 Epoch [56], val_loss: 1509151.3750 Epoch [57], val_loss: 1506826.2500 Epoch [58], val_loss: 1474124.1250 Epoch [59], val_loss: 1532212.0000 Epoch [60], val_loss: 1527819.6250 Epoch [61], val_loss: 1461034.5000 Epoch [62], val_loss: 1478140.2500 Epoch [63], val_loss: 1502673.6250 Epoch [64], val_loss: 1474192.2500 Epoch [65], val_loss: 1457436.8750 Epoch [66], val_loss: 1450983.1250 Epoch [67], val_loss: 1455459.5000 Epoch [68], val_loss: 1471777.2500 Epoch [69], val_loss: 1449789.8750 Epoch [70], val_loss: 1636580.6250 Epoch [71], val_loss: 1450079.7500 Epoch [72], val_loss: 1451274.8750 Epoch [73], val_loss: 1492702.1250 Epoch [74], val_loss: 1440794.3750 Epoch [75], val_loss: 1505263.7500 Epoch [76], val_loss: 1509527.5000 Epoch [77], val_loss: 1428067.7500 Epoch [78], val_loss: 1445754.5000 Epoch [79], val_loss: 1604042.8750 Epoch [80], val_loss: 1424190.5000 Epoch [81], val_loss: 1418047.3750 Epoch [82], val_loss: 1416269.5000 Epoch [83], val_loss: 1415936.3750 Epoch [84], val_loss: 1445656.7500 Epoch [85], val_loss: 1427232.8750 Epoch [86], val_loss: 1425765.8750 Epoch [87], val_loss: 1484739.3750 Epoch [88], val_loss: 1432813.0000 Epoch [89], val_loss: 1405482.8750 Epoch [90], val_loss: 1403007.3750 Epoch [91], val_loss: 1422885.0000 Epoch [92], val_loss: 1399564.3750 Epoch [93], val_loss: 1454019.6250 Epoch [94], val_loss: 1399253.0000 Epoch [95], val_loss: 1396396.0000 Epoch [96], val_loss: 1397645.2500 Epoch [97], val_loss: 1518198.1250 Epoch [98], val_loss: 1412370.0000 Epoch [99], val_loss: 1391590.1250 Epoch [100], val_loss: 1399635.3750
#losses = [r['val_loss'] for r in [result] + history]
losses = [r['val_loss'] for r in history]
plt.plot(losses, '-x')
plt.xlabel('epoch')
plt.ylabel('val_loss')
plt.title('val_loss vs. epochs');
def predict_single(x, model):
xb = x.unsqueeze(0)
return model(x).item()
x, target = val_ds[10]
pred = predict_single(x, model)
print("Input: ", x)
print("Target: ", target.item())
print("Prediction:", pred)
Input: tensor([ 0.5300, 67.0000, 5.2200, 5.1100, 3.2200, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000]) Target: 1778.0 Prediction: 1330.05419921875
x, target = val_ds[0]
pred = predict_single(x, model)
print("Input: ", x)
print("Target: ", target.item())
print("Prediction:", pred)
Input: tensor([ 1.0100, 54.0000, 6.3900, 6.3500, 4.0200, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000]) Target: 3818.0 Prediction: 4162.35693359375
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
# divide into regressors and response
X = data_one_hot_encoding.drop(columns = 'price')
y = data_one_hot_encoding['price']
# Do 80/20 split for the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01)
print(f'Training Shape: {X_train.shape}')
print(f'Testing Shape: {X_test.shape}')
# fit the model
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
# Get test predictions
y_pred = model.predict(X_test)
y_pred
# evaluate the model
sns.set_style('darkgrid')
%matplotlib inline
print(f'RMSE: {mean_squared_error(y_test, y_pred, squared = False)}')
errors = y_test - y_pred
sns.histplot(errors)
Training Shape: (53380, 27) Testing Shape: (540, 27) RMSE: 1018.0659202383456
<matplotlib.axes._subplots.AxesSubplot at 0x7f92eecf12e8>
# divide into regressors and response
X = data_one_hot_encoding.drop(columns = 'price')
y = data_one_hot_encoding['price']
# Do 80/20 split for the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5)
print(f'Training Shape: {X_train.shape}')
print(f'Testing Shape: {X_test.shape}')
# fit the model
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
# Get test predictions
y_pred = model.predict(X_test)
y_pred
# evaluate the model
sns.set_style('darkgrid')
%matplotlib inline
print(f'RMSE: {mean_squared_error(y_test, y_pred, squared = False)}')
errors = y_test - y_pred
sns.histplot(errors)
Training Shape: (26960, 27) Testing Shape: (26960, 27) RMSE: 1102.8603999759207
<matplotlib.axes._subplots.AxesSubplot at 0x7f92ee4dad30>
df3 = data_one_hot_encoding
df3['carat_squared'] = 100*df3['carat']**2
df3.head()
carat | depth | table | price | x | y | z | cut_Fair | cut_Good | cut_Ideal | cut_Premium | cut_Very Good | color_D | color_E | color_F | color_G | color_H | color_I | color_J | clarity_I1 | clarity_IF | clarity_SI1 | clarity_SI2 | clarity_VS1 | clarity_VS2 | clarity_VVS1 | clarity_VVS2 | carat_squared | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5.29 |
1 | 0.21 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4.41 |
2 | 0.23 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5.29 |
3 | 0.29 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8.41 |
4 | 0.31 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9.61 |
# divide into regressors and response
X = data_one_hot_encoding.drop(columns = 'price')
y = data_one_hot_encoding['price']
# Do 80/20 split for the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
print(f'Training Shape: {X_train.shape}')
print(f'Testing Shape: {X_test.shape}')
# fit the model
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
# Get test predictions
y_pred = model.predict(X_test)
y_pred
# evaluate the model
sns.set_style('darkgrid')
%matplotlib inline
print(f'RMSE: {mean_squared_error(y_test, y_pred, squared = False)}')
errors = y_test - y_pred
sns.histplot(errors)
Training Shape: (45832, 27) Testing Shape: (8088, 27) RMSE: 1084.577583401863
<matplotlib.axes._subplots.AxesSubplot at 0x7f92edfbec18>