In this project, we'll practice the machine learning workflow to predict a car's market price using its attributes.
This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, and (c) its normalized losses in use as compared to other cars.
The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with their price. Then, if a car is more (or less) risky, this symbol is adjusted by moving it up (or down) the scale. Actuaries call this process "symboling". A value of +3 indicates that the auto is risky, while -3 indicates that it is probably pretty safe. We'll take a quick look at these ratings once the data is loaded below.
The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/specialty, etc.) and represents the average loss per car per year.
You can read more about the data set in its documentation: https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
import pandas as pd
import numpy as np
pd.options.display.max_columns = 99
columns_name = ['symboling', 'normalized_losses', 'make', 'fuel_type',
                'aspiration', 'num_doors', 'body_style', 'drive_wheels',
                'engine_location', 'wheel_base', 'length', 'width', 'height',
                'curb_weight', 'engine_type', 'num_cylinders', 'engine_size',
                'fuel_system', 'bore', 'stroke', 'compression_ratio', 'horsepower',
                'peak_rpm', 'city_mpg', 'highway_mpg', 'price']
cars = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data',
                   names=columns_name)
cars.head()
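To tie the description above back to the data, we can take a quick, optional look at the insurance risk ratings; this exploratory step is not part of the original workflow:
# Count how many cars fall into each risk rating, from -3 (safe) to +3 (risky)
cars['symboling'].value_counts().sort_index()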
# Select only the columns with continuous values from
#https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
continuous_values_cols=['normalized_losses','wheel_base','length','width',
'height','curb_weight','engine_size','bore','stroke','compression_ratio',
'horsepower','peak_rpm','city_mpg','highway_mpg','price']
numeric_cars=cars[continuous_values_cols]
# Select feature columns
features = numeric_cars.columns.tolist()
features.remove('price')
numeric_cars.head()
# Replace all of the ? values with np.nan in the dataframe
numeric_cars=numeric_cars.replace('?',np.nan).astype('float')
# Check missing values
numeric_cars.isnull().sum()
# Because `price` is the column we want to predict, let's remove any rows with missing `price` values.
numeric_cars = numeric_cars.dropna(subset=['price'])
numeric_cars.isnull().sum()
# Replace missing values in other columns using column means.
numeric_cars=numeric_cars.fillna(numeric_cars.mean())
# Confirm that there's no more missing values!
numeric_cars.isnull().sum()
def knn_train_test(df, features, target, k=[5]):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    # Normalize all columns to the 0-1 range, then restore the original target column
    target_col = df[target]
    df = (df - df.min()) / (df.max() - df.min())
    df[target] = target_col
    # Split the full dataset into random train and test subsets
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target],
                                                        test_size=0.25, random_state=1)
    k_rmses = dict()
    for n_neighbors in k:
        # Instantiate the model
        model = KNeighborsRegressor(n_neighbors=n_neighbors, algorithm='brute')
        # Fit the model to the training data
        model.fit(X_train, y_train)
        # Make predictions using the model
        predictions = model.predict(X_test)
        # Calculate RMSE and store it keyed by the number of neighbors
        mse = mean_squared_error(y_test, predictions)
        k_rmses[n_neighbors] = np.sqrt(mse)
    return k_rmses
k_rmse_univariate = dict()
# For each feature, train a model, return RMSE value and add to the dictionary `k_rmse_univariate`.
for f in features:
k_rmse_univariate[f] = knn_train_test(numeric_cars, [f], 'price', k=[k for k in range(1,26)])
k_min_rmse = dict()
# For each feature, find the minimum RMSE and the k value that achieves it in the dictionary `k_rmse_univariate`.
for k,v in k_rmse_univariate.items():
k_min = min(v, key=v.get)
k_min_rmse['{}, k={}'.format(k,k_min)] = v[k_min]
#print(k,'k={}, RMSE={}'.format(k_min, v[k_min]))
# Create a Series object from the dictionary so we can easily view the results
pd.Series(k_min_rmse).sort_values()
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12, 5))
# Visualize the results using a line plot
for k,v in k_rmse_univariate.items():
x = list(v.keys())
y = list(v.values())
plt.plot(x,y)
plt.xlabel('k value')
plt.ylabel('RMSE')
#plt.legend(k_rmse_univariate.keys())
# Compute average RMSE across different `k` values for each feature.
feature_avg_rmse = dict()
for k,v in k_rmse_univariate.items():
feature_avg_rmse[k] = np.mean(list(v.values()))
sorted_avg_rmse = pd.Series(feature_avg_rmse).sort_values()
print(sorted_avg_rmse)
sorted_features = list(sorted_avg_rmse.index)
k_rmse_multivariate = dict()
for nr_best_feats in range(2, len(features)):
    # Use a separate variable for the subset so we don't overwrite `features`, which is reused later.
    best_feats = sorted_features[:nr_best_feats]
    k_rmse_multivariate['{} features {}'.format(nr_best_feats, best_feats)] = \
        knn_train_test(numeric_cars, best_feats, 'price', [k for k in range(1, 26)])
k_min_rmse = dict()
# For each feature group, find the minimum RMSE and the k value that achieves it in the dictionary `k_rmse_multivariate`.
for k,v in k_rmse_multivariate.items():
k_min = min(v, key=v.get)
k_min_rmse['{}, k={}'.format(k,k_min)] = v[k_min]
# Create a Series object from the dictionary so we can easily view the results
pd.Series(k_min_rmse).sort_values()
plt.figure(figsize=(12, 5))
# Visualize the results using a line plot
for k,v in k_rmse_multivariate.items():
x = list(v.keys())
y = list(v.values())
plt.plot(x,y)
plt.xlabel('k value')
plt.ylabel('RMSE')
#plt.legend(k_rmse_multivariate.keys())
To build a better k-nearest neighbors model, we can change the features it uses or tweak the number of neighbors (a hyperparameter). To accurately understand a model's performance, we can perform k-fold cross validation and select the proper number of folds.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=1)
knn_model = KNeighborsRegressor(n_neighbors=5)
mses = cross_val_score(knn_model, numeric_cars[features], numeric_cars['price'],
                       scoring='neg_mean_squared_error', cv=kf)
rmses = np.sqrt(np.absolute(mses))
avg_rmse = np.mean(rmses)
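As an aside, scikit-learn's GridSearchCV can automate the same search over n_neighbors that the manual loops in this project perform. A minimal sketch, not part of the original workflow, assuming numeric_cars and features are defined as above:
from sklearn.model_selection import GridSearchCV
# Try n_neighbors from 1 to 25 with 5-fold cross validation, scored by negative MSE
grid = GridSearchCV(KNeighborsRegressor(algorithm='brute'),
                    param_grid={'n_neighbors': list(range(1, 26))},
                    scoring='neg_mean_squared_error',
                    cv=KFold(n_splits=5, shuffle=True, random_state=1))
grid.fit(numeric_cars[features], numeric_cars['price'])
# best_score_ is the (negative) mean MSE of the best parameter setting
print(grid.best_params_, np.sqrt(-grid.best_score_))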
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]
for fold in num_folds:
kf = KFold(fold, shuffle=True, random_state=1)
knn_model = KNeighborsRegressor(n_neighbors=5)
mses = cross_val_score(knn_model, numeric_cars[features], numeric_cars['price'], scoring='neg_mean_squared_error', cv=kf)
rmses = np.sqrt(np.absolute(mses))
avg_rmse = np.mean(rmses)
std_rmse = np.std(rmses)
print('{} folds: avg RMSE: {}, std RMSE: {}'.format(fold, avg_rmse, std_rmse))
As you increase the number of folds, the number of observations in each fold decreases and the variance of the fold-by-fold errors increases.
The standard deviation of the RMSE values can serve as a proxy for a model's variance, while the average RMSE is a proxy for a model's bias. Bias and variance are the two observable sources of error in a model that we can indirectly control.
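To make this concrete, we can re-run the fold-count experiment while collecting the results and plot the average RMSE (bias proxy) against the RMSE standard deviation (variance proxy). This is a small illustrative sketch, not part of the original workflow:
avg_rmses = []
std_rmses = []
for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    mses = cross_val_score(KNeighborsRegressor(n_neighbors=5), numeric_cars[features],
                           numeric_cars['price'], scoring='neg_mean_squared_error', cv=kf)
    rmses = np.sqrt(np.absolute(mses))
    avg_rmses.append(np.mean(rmses))
    std_rmses.append(np.std(rmses))
plt.figure(figsize=(12, 5))
plt.plot(num_folds, avg_rmses, label='avg RMSE (bias proxy)')
plt.plot(num_folds, std_rmses, label='std RMSE (variance proxy)')
plt.xlabel('number of folds')
plt.ylabel('RMSE')
plt.legend()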
def knn_train_validate(df, features, target, k_neighbors=[5], n_folds=[10]):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.model_selection import KFold, cross_val_score
    # Normalize all columns to the 0-1 range, then restore the original target column
    target_col = df[target]
    df = (df - df.min()) / (df.max() - df.min())
    df[target] = target_col
    k_folds_rmses = dict()
    for fold in n_folds:
        kf = KFold(fold, shuffle=True, random_state=1)
        # Use a fresh dictionary per fold count so the fold results don't overwrite each other
        k_rmses = dict()
        for k in k_neighbors:
            model = KNeighborsRegressor(n_neighbors=k, algorithm='brute')
            mses = cross_val_score(model, df[features], df[target],
                                   scoring='neg_mean_squared_error', cv=kf)
            rmses = np.sqrt(np.absolute(mses))
            # Average RMSE across the folds for this value of k
            avg_rmse = np.mean(rmses)
            k_rmses[k] = avg_rmse
        k_folds_rmses[fold] = k_rmses
    return k_folds_rmses
k_folds_univariate = dict()
for f in features:
k_folds_univariate[f] = knn_train_validate(numeric_cars, [f], 'price',\
k_neighbors= [k for k in range(1,6)], n_folds= [5, 7, 9, 10])
for k,v in k_folds_univariate.items():
print(k,v)