In this guided project, we will predict a car's market price using its attributes. The data set we will be working with contains information on various cars. For each car we have technical attributes of the vehicle such as the engine size, curb weight, horsepower, city and highway fuel efficiency, and more. You can read more about the data set here and can download it directly from here.
Let's start by reading in the dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
cars = pd.read_csv('imports-85.data', names = ['symboling', 'normalized_losses','make', 'fuel_type', 'aspiration',
'num_doors', 'body_style', 'drive_wheels', 'engine_location',
'wheel_base', 'length', 'width', 'height', 'curb_weight',
'engine_type', 'num_cylinders', 'engine_size', 'fuel_system',
'bore', 'stroke', 'compression_ratio', 'horsepower',
'peak_rpm', 'city_mpg', 'highway_mpg', 'price'])
cars.head()
cars.shape
cars.info()
Since we would like to predict car prices, the target column is `price`. Currently, the `price` column is not in numeric format, so we need to clean it up.
We can't have missing values in the columns we want to use for predictive modeling. Based on the data set preview from the last step, we can tell that the `normalized_losses` column contains missing values represented with "?". Let's replace these values and look for missing values in the other numeric columns. Let's also rescale the values in the numeric columns so they all range from 0 to 1.
cars = cars.replace('?', np.NaN)
Because `?` is a string value, columns containing it were cast to the pandas `object` data type (instead of a numeric type like `int` or `float`). Let's determine which columns should be converted back to numeric values and convert them.
numeric_cols = ['normalized_losses', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
'highway_mpg', 'price']
cars[numeric_cols] = cars[numeric_cols].astype('float')
Let's see how many rows in the `normalized_losses` column have missing values.
cars['normalized_losses'].isnull().sum()
There are a few ways we could handle columns with missing values (a quick pandas sketch of each option follows):
- Replace the missing values with the average value of that column.
- Drop the rows with missing values entirely.
- Drop the column entirely.
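Here is a minimal, illustrative sketch of those three options on the `normalized_losses` column (the variable names are hypothetical; we only apply the choices made in the steps below):
#Option 1: impute missing values with the column mean
cars_imputed = cars.fillna({'normalized_losses': cars['normalized_losses'].mean()})
#Option 2: drop rows that have a missing value in this column
cars_drop_rows = cars.dropna(subset=['normalized_losses'])
#Option 3: drop the column entirely
cars_drop_col = cars.drop(columns=['normalized_losses'])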
When it comes to the `normalized_losses` column, dropping the rows with missing values would result in too much data loss - about 20% of the rows.
Let's see how many rows with missing data there are in the rest of the numeric columns.
cars[numeric_cols].isnull().sum()
The `price` column is our target column. Let's drop the four rows with missing values in that column.
cars = cars.dropna(subset=['price'])
cars[numeric_cols].isnull().sum()
Let's fill in the missing values of the remaining columns with the mean value for each column.
for col in numeric_cols:
cars[col] = cars[col].fillna(cars[col].mean())
cars.info()
Now, let's normalize the numeric columns to range from 0 to 1, excluding the 'price' column.
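For reference, the min-max scaling applied in the next cell maps each value into the [0, 1] interval:

$$x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$$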
for col in numeric_cols[:-1]:
cars[col] = (cars[col]-cars[col].min())/(cars[col].max()-cars[col].min())
cars[numeric_cols].describe().T
Let's start with some univariate k-nearest neighbors models. Starting with simple models before moving to more complex models helps us structure our code workflow and understand the features better.
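Before writing the training function, here is a minimal, self-contained sketch (toy made-up numbers, not our data) of what `KNeighborsRegressor` does: the prediction for a new observation is the mean target value of its k nearest training observations.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
#Toy data: 4 observations of a single feature and their target values
X_toy = np.array([[1], [2], [3], [10]])
y_toy = np.array([100, 120, 140, 400])
toy_knn = KNeighborsRegressor(n_neighbors=2)
toy_knn.fit(X_toy, y_toy)
#The 2 nearest neighbors of 2.5 are 2 and 3, so the prediction is (120 + 140) / 2 = 130
print(toy_knn.predict([[2.5]]))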
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
def knn_train_test(train_col, target_col, df):
knn = KNeighborsRegressor()
np.random.seed(1)
shuffled_index = np.random.permutation(df.index)
rand_df = df.reindex(shuffled_index)
#Split the dataset in two (75:25):
train_set = rand_df[:int(len(rand_df)*0.75)]
test_set = rand_df[int(len(rand_df)*0.75):]
knn.fit(train_set[[train_col]], train_set[target_col])
predictions = knn.predict(test_set[[train_col]])
#Calculate the RMSE
rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
return rmse
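Note: in recent scikit-learn releases the `squared` argument of `mean_squared_error` has been deprecated and later removed. If the call above raises an error on your version, the RMSE line inside the function can equivalently be written as:
#Equivalent RMSE computation without the squared argument
rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))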
rmses = {}
for col in numeric_cols[:-1]:
rmses[col] = knn_train_test(col, 'price', cars)
rmses_series = pd.Series(rmses)
rmses_series = rmses_series.sort_values()
rmses_series
Now let's modify the `knn_train_test()` function to accept a parameter for the k value.
For each numeric column, we will create, train, and test a univariate model using the following k values (1, 3, 5, 7, and 9). We will then visualize the results using a scatter plot.
def knn_train_test_k(train_col, target_col, df, k_values=[5]):
np.random.seed(1)
shuffled_index = np.random.permutation(df.index)
rand_df = df.reindex(shuffled_index)
#Split the dataset in two:
train_set = rand_df[:int(len(rand_df)*0.75)]
test_set = rand_df[int(len(rand_df)*0.75):]
k_rmses = {}
for k in k_values:
knn = KNeighborsRegressor(n_neighbors = k)
knn.fit(train_set[[train_col]], train_set[target_col])
predictions = knn.predict(test_set[[train_col]])
#Calculate the RMSE
rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
k_rmses[k] = rmse
return k_rmses
k_rmse_result = {}
for col in numeric_cols[:-1]:
k_rmse_result[col] = knn_train_test_k(col, 'price', cars, k_values= range(1,10,2))
k_rmse_result
for k, v in k_rmse_result.items():
x = list(v.keys())
y = list(v.values())
plt.scatter(x, y, label=k)
plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.xticks(x)
plt.title('RMSE per k-value', y=1.06)
plt.legend(bbox_to_anchor=(1.05, 1))
It seems that a k-value of 5 is optimal for the univariate models.
Now, let's calculate the average RMSE across the different k values for each feature.
Afterwards, we will modify the `knn_train_test()` function to work with multiple columns.
Then we will use the modified function to calculate the RMSE when using the top 2, 3, 4, and 5 best features.
#Calculate the average RMSE across different `k` values for each feature.
avg_rmse = {}
for k, v in k_rmse_result.items():
key = k
value = np.mean(list(v.values()))
avg_rmse[key] = value
avg_rmse_series = pd.Series(avg_rmse)
avg_rmse_series = avg_rmse_series.sort_values()
avg_rmse_series
def knn_train_test_list(train_col, target_col, df):
knn = KNeighborsRegressor()
np.random.seed(1)
shuffled_index = np.random.permutation(df.index)
rand_df = df.reindex(shuffled_index)
#Split the dataset in two:
train_set = rand_df[:int(len(rand_df)*0.75)]
test_set = rand_df[int(len(rand_df)*0.75):]
knn.fit(train_set[train_col], train_set[target_col])
predictions = knn.predict(test_set[train_col])
#Calculate the RMSE
rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
return rmse
#Now, let's use the 2, 3, 4, and 5 best features from the previous step with default k value of 5.
best_features = {}
for x in range(2,6):
rmse = knn_train_test_list(avg_rmse_series.index[:x], 'price', cars)
best_features['RMSE for Features: {}'.format(list(avg_rmse_series.index[:x]))] = rmse
best_features
It looks like the multivariate model performs best with four features.
Let's now optimize the models that performed best in the previous step.
For the top three models from the previous step, we will vary the `n_neighbors` hyperparameter from 1 to 25 and plot the resulting RMSE values.
def knn_train_test_k(train_col_list, target_col, df):
np.random.seed(1)
shuffled_index = np.random.permutation(df.index)
rand_df = df.reindex(shuffled_index)
#Split the dataset in two:
train_set = rand_df[:int(len(rand_df)*0.75)]
test_set = rand_df[int(len(rand_df)*0.75):]
k_values = range(1,26)
k_rmses = {}
for k in k_values:
knn = KNeighborsRegressor(n_neighbors = k)
knn.fit(train_set[train_col_list], train_set[target_col])
predictions = knn.predict(test_set[train_col_list])
#Calculate the RMSE
rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
k_rmses[k] = rmse
return k_rmses
top3 = [3,4,5]
top3_rmse = {}
for x in top3:
rmse = knn_train_test_k(avg_rmse_series.index[:x], 'price', cars)
top3_rmse['Best {} features'.format(x)] = rmse
top3_rmse
for k, v in top3_rmse.items():
x = list(v.keys())
y = list(v.values())
plt.plot(x, y, label=k)
plt.legend(loc='lower right')
plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.title('RMSE for k-values 1 to 25')
It seems that a k-value of 1 is best for the model when using the best 3, 4, and 5 features.
Now, let's modify the `knn_train_test()` function to use k-fold cross validation. We will use the default `n_neighbors` of 5 and will loop over even fold counts from 2 to 10.
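As a quick illustration of how k-fold cross validation works (a toy example over 6 hypothetical row indices, not our data): `KFold` partitions the rows into k folds, and `cross_val_score` trains on k-1 folds and evaluates on the held-out fold, once per fold.
from sklearn.model_selection import KFold
import numpy as np
#Toy illustration: split 6 row indices into 3 folds
demo_kf = KFold(n_splits=3, shuffle=True, random_state=1)
for train_idx, test_idx in demo_kf.split(np.arange(6)):
    print('train:', train_idx, 'test:', test_idx)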
from sklearn.model_selection import KFold, cross_val_score
def knn_train_test_kfold(train_col, target_col, df, folds=range(2,12,2)):
avg_rmses = {}
for f in folds:
kf = KFold(f, shuffle=True, random_state=1)
knn = KNeighborsRegressor()
mse = cross_val_score(knn, df[[train_col]], df[target_col], scoring='neg_mean_squared_error', cv=kf)
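        #cross_val_score returns negative MSE values (scikit-learn treats higher scores as better),
        #so we take the absolute value before computing the square root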
rmse = np.sqrt(np.absolute(mse))
avg_rmse = np.mean(rmse)
avg_rmses[f] = avg_rmse
return avg_rmses
diff_folds = {}
for col in numeric_cols[:-1]:
diff_folds[col] = knn_train_test_kfold(col, 'price', cars, folds=range(2,12,2))
diff_folds_df = pd.DataFrame(diff_folds)
diff_folds_df
We see that for 6 out of the 14 columns (~43%), 10 folds give the best result. Because of that, we will use 10 folds for the rest of the tests.
Now let's modify the `knn_train_test_kfold()` function so that the number of nearest neighbors varies over 1, 3, 5, 7, and 9.
def knn_train_test_kfold(train_col, target_col, df, folds=10):
avg_rmses = {}
kf = KFold(folds, shuffle=True, random_state=1)
k_values = range(1,10,2)
for k in k_values:
knn = KNeighborsRegressor(n_neighbors=k)
mse = cross_val_score(knn, df[[train_col]], df[target_col], scoring='neg_mean_squared_error', cv=kf)
rmse = np.sqrt(np.absolute(mse))
avg_rmse = np.mean(rmse)
avg_rmses[k] = avg_rmse
return avg_rmses
cols_k = {}
for col in numeric_cols[:-1]:
cols_k[col] = knn_train_test_kfold(col, 'price', cars, folds=10)
cols_k_df=pd.DataFrame(cols_k)
cols_k_df
Now, let's take the above results for k-values 1, 3, 5, 7, and 9 and average them for each feature. We will then use the top features to calculate the RMSE for more than one column over 10 folds.
#Calculate the average RMSE across different `k` values for each feature.
avg_rmse_k = {}
for k, v in cols_k.items():
key = k
value = np.mean(list(v.values()))
avg_rmse_k[key] = value
avg_rmse_k_series = pd.Series(avg_rmse_k)
avg_rmse_k_series = avg_rmse_k_series.sort_values()
avg_rmse_k_series
def knn_train_test_kfold(train_col, target_col, df, folds=10):
kf = KFold(folds, shuffle=True, random_state=1)
knn = KNeighborsRegressor()
mse = cross_val_score(knn, df[train_col], df[target_col], scoring='neg_mean_squared_error', cv=kf)
rmse = np.sqrt(np.absolute(mse))
avg_rmse = np.mean(rmse)
return avg_rmse
features = {}
for x in range(2,7):
result = knn_train_test_kfold(list(avg_rmse_k_series.index[:x]), 'price', cars, folds=10)
features["{} best features".format(x)] = result
features
It looks like the 10-fold model with a k value of 5 performs best with 2 features.
Let's use the top feature combinations from above (the 2, 3, 4, and 5 best features) and calculate the RMSE with the number of nearest neighbors varying from 1 to 25, using 10 folds.
def knn_train_test_kfold(train_col, target_col, df, folds=10):
kf = KFold(folds, shuffle=True, random_state=1)
avg_rmses = {}
for k in range(1,26):
knn = KNeighborsRegressor(n_neighbors=k)
mse = cross_val_score(knn, df[train_col], df[target_col], scoring='neg_mean_squared_error', cv=kf)
rmse = np.sqrt(np.absolute(mse))
avg_rmse = np.mean(rmse)
avg_rmses[k] = avg_rmse
return avg_rmses
features_25 = {}
features = [2,3,4,5]
for x in features:
    result = knn_train_test_kfold(list(avg_rmse_k_series.index[:x]), 'price', cars, folds=10)
features_25["{} best features".format(x)] = result
features_25
for k, v in features_25.items():
x = list(v.keys())
y = list(v.values())
plt.plot(x, y, label=k)
plt.legend(loc='lower right', frameon=False)
plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.title('RMSE for k-values 1 to 25\n10 Folds')
It looks like the 2-, 3-, and 4-best-feature models produce the most accurate results with a k value of 2, whereas the 5-feature model works best with a k value of 1.