In this project I will use the K-Nearest Neighbors algorithm to predict a car's market price from its attributes. The dataset contains information on various cars: for each one we have technical specifications such as the engine's displacement, the car's weight, its fuel consumption in miles per gallon, how fast it accelerates, and more. Documentation for the data can be found here and the data itself can be downloaded from here.
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
cars = pd.read_csv('imports-85.data', names=['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'])
cars.head()
Now let's select the columns with numeric, continuous values:
num_cars = cars[['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']]
Let's transform the '?' characters into NaN:
num_cars = num_cars.replace('?', np.nan)
num_cars.head()
Some columns are recognized as strings, so let's convert the entire dataframe to float:
num_cars = num_cars.astype('float')
num_cars.info()
I'm going to drop those 4 rows with NaN values in the price column
num_cars.dropna(subset=['price'], inplace=True)
num_cars.isnull().sum()
After dropping the rows with NaN values in the price column, we still find NaN values in the normalized-losses, bore, stroke, horsepower and peak-rpm columns.
The normalized-losses value is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/specialty, etc.) and represents the average loss per car per year, so it's not OK to drop the column entirely.
There are 37 rows with NaN values in normalized-losses out of the 201 remaining rows, which means the NaN values in this column represent almost 20% of the dataset. Dropping them is not a good idea because we would lose a considerable amount of data. According to the documentation this column ranges from 65 to 256, so it's reasonable to fill the missing values with the mean value of the column; the same will be applied to the bore, stroke, horsepower and peak-rpm columns.
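As a quick check of that percentage, we can express the missing counts as a share of the remaining rows:
# Share of missing values per column, as a fraction of the remaining rows.
print((num_cars.isnull().sum() / len(num_cars)).round(2))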
num_cars = num_cars.fillna(num_cars.mean())
num_cars.info()
Now let's see how the dataframe looks
num_cars.head()
After cleaning and transforming the data, let's min-max normalize every column except price:
price_col = num_cars['price']
num_cars = (num_cars - num_cars.min()) / (num_cars.max() - num_cars.min())
num_cars['price'] = price_col
num_cars.head()
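As an optional sanity check, after min-max scaling every feature column should span exactly [0, 1]:
# Verify the scaling: the overall min and max across all feature columns
# should come out as 0.0 and 1.0.
features = num_cars.drop('price', axis=1)
print(features.min().min(), features.max().max())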
Let's start with a simple univariate model
def knn_train_test(train_col, target_col, df):
    knn = KNeighborsRegressor()
    # Shuffle the rows and split the dataset into training and test sets (50/50).
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    last_train_row = int(len(rand_df) / 2)
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    # Fit knn on the training set and predict on the test set.
    knn.fit(train_df[[train_col]], train_df[target_col])
    prediction = knn.predict(test_df[[train_col]])
    # Calculate MSE and RMSE.
    mse = mean_squared_error(test_df[target_col], prediction)
    rmse = np.sqrt(mse)
    return rmse
Now let's apply the function to each feature:
train_cols = num_cars.columns.drop('price')
rmse_results = {}
for col in train_cols:
    rmse = knn_train_test(col, 'price', num_cars)
    rmse_results[col] = rmse
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()
Now let's modify the function to evaluate several k values (1, 3, 5, 7 and 9) for each feature:
def knn_train_test(train_col, target_col, df):
    # Shuffle the rows and split the dataset into training and test sets (50/50).
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    last_train_row = int(len(rand_df) / 2)
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    # Fit knn and predict for each value of k.
    k_values = [1, 3, 5, 7, 9]
    k_rmse = {}
    for val in k_values:
        knn = KNeighborsRegressor(n_neighbors=val)
        knn.fit(train_df[[train_col]], train_df[target_col])
        prediction = knn.predict(test_df[[train_col]])
        # Calculate RMSE.
        mse = mean_squared_error(test_df[target_col], prediction)
        rmse = np.sqrt(mse)
        k_rmse[val] = rmse
    return k_rmse
rmse_results = {}
train_cols = num_cars.columns.drop('price')
for col in train_cols:
    rmse_val = knn_train_test(col, 'price', num_cars)
    rmse_results[col] = rmse_val
rmse_results
Let's plot the RMSE against the k value for each feature:
for k, v in rmse_results.items():
    x = list(v.keys())
    y = list(v.values())
    plt.plot(x, y)
plt.xlabel('k value')
plt.ylabel('RMSE')
Now let's average the RMSE across the k values for each feature to rank the features:
feature_avg_rmse = {}
for k, v in rmse_results.items():
    avg_rmse = np.mean(list(v.values()))
    feature_avg_rmse[k] = avg_rmse
series_avg_rmse = pd.Series(feature_avg_rmse)
sorted_series_avg_rmse = series_avg_rmse.sort_values()
print(sorted_series_avg_rmse)
sorted_features = sorted_series_avg_rmse.index
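Optionally, a horizontal bar plot makes the ranking easier to scan:
# Plot the average RMSE per feature; sorting in descending order puts
# the best (lowest RMSE) features at the top of the chart.
sorted_series_avg_rmse.sort_values(ascending=False).plot.barh()
plt.xlabel('Average RMSE')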
Next, let's move to multivariate models, training on the best features from the univariate tests with the default k of 5:
def knn_train_test(train_cols, target_col, df):
    np.random.seed(1)
    # Randomize the order of the rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Divide the number of rows in half and round.
    last_train_row = int(len(rand_df) / 2)
    # Select the first half as the training set
    # and the second half as the test set.
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    k_values = [5]
    k_rmses = {}
    for k in k_values:
        # Fit the model using the k nearest neighbors.
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[train_cols], train_df[target_col])
        # Make predictions using the model.
        predicted_labels = knn.predict(test_df[train_cols])
        # Calculate and store the RMSE.
        mse = mean_squared_error(test_df[target_col], predicted_labels)
        rmse = np.sqrt(mse)
        k_rmses[k] = rmse
    return k_rmses
Let's evaluate models built from the 2 to 6 best features:
k_rmse_results = {}
for nr_best_feats in range(2, 7):
    k_rmse_results['{} best features'.format(nr_best_feats)] = knn_train_test(
        sorted_features[:nr_best_feats],
        'price',
        num_cars
    )
k_rmse_results
Now let's tune the k hyperparameter, testing every value from 1 to 24:
def knn_train_test(train_cols, target_col, df):
    np.random.seed(1)
    # Randomize the order of the rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Divide the number of rows in half and round.
    last_train_row = int(len(rand_df) / 2)
    # Select the first half as the training set
    # and the second half as the test set.
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    k_values = list(range(1, 25))
    k_rmses = {}
    for k in k_values:
        # Fit the model using the k nearest neighbors.
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[train_cols], train_df[target_col])
        # Make predictions using the model.
        predicted_labels = knn.predict(test_df[train_cols])
        # Calculate and store the RMSE.
        mse = mean_squared_error(test_df[target_col], predicted_labels)
        rmse = np.sqrt(mse)
        k_rmses[k] = rmse
    return k_rmses
Let's run it for the 2 to 5 best feature sets:
k_rmse_results = {}
for nr_best_feats in range(2, 6):
    k_rmse_results['{} best features'.format(nr_best_feats)] = knn_train_test(
        sorted_features[:nr_best_feats],
        'price',
        num_cars
    )
k_rmse_results
Finally, let's plot the RMSE curves for each feature set:
for k, v in k_rmse_results.items():
    x = list(v.keys())
    y = list(v.values())
    plt.plot(x, y, label="{}".format(k))
plt.xlabel('k value')
plt.ylabel('RMSE')
plt.legend()
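To read the plot at a glance, a short summary loop can print the k value that minimizes the RMSE for each feature set:
# For each feature set, report the k with the lowest RMSE.
for feats, rmses in k_rmse_results.items():
    best_k = min(rmses, key=rmses.get)
    print('{}: best k = {}, RMSE = {:.2f}'.format(feats, best_k, rmses[best_k]))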