In [1]:

# -*- coding: utf-8 -*-
"""
Created on Tue May 11 11:01:11 2021

@author: ananya Joshi
"""

'''
In this project we will predict the car prices using it's attributes by K Nearest Neighbors 
algorithm based on Regression'

For each car we have info. about technical aspects such as motor's displacement, weight of car, 
miles per gallon and so on.
'''

'''
#--To read more about dataset uncomment this section and execute code to go to dataset page--
#-- dataset documentation --

import webbrowser
webbrowser.open('https://archive.ics.uci.edu/ml/datasets/automobile')
'''


from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np



pd.set_option('display.max_columns',26)
# using column names from the dataset documentation mentioned above
cols = ['symbolizing','normalized-losses','make','fuel-type','aspiration','num-of-doors',
        'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
        'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore',
        'stroke', 'compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg',
        'price']
cars = pd.read_csv('E:\DataQuest\Projects\K Nearest Neighbors\imports-85.data',names=cols)

print('---------------------------------')
# determining the column types in dataset
print(cars.info())
print('---------------------------------')
print(cars.isnull().sum())

# let's see few starting rows of dataframe
print('---------------------------------')
print(cars.head(2))

'''
After seeing the data we can clearly see our target column i.e. price which we will predict.
Since we are using KNN based on regression we need to avoid the features that will affect
our models performance and in case of KNN based on regression

features that:
    i. have non-numeric values
    ii. have numeric values but are not ordinal
    iii. geographical values
such features are of no use in this algorithm therefore we can say that it's safe to remove
these columns.

numeric columns in our data

-> symbolizing, wheel-base, length, width, height, curb-weight, engine-size,compression-ratio,
city-mpg, highway-mpg

all other columns are not numeric even out target column i.e. price is not numeric
Also if we are making a predicting model then our data must not have any missing values.

This means that our data here now need some cleaning

'''

'''
our normalized-losses column contains missing values as '?' So, let us replace all ? with
np.nan in whole dataset. So, whereever there is '?' it will be rekpllaced by numpy.nan
'''
print('---------------------------------')
cars = cars.replace('?',np.nan)
# checking the null values in dataset again
print(cars.isnull().sum())
# selecting numeric columns only
numeric_cols = cols = ['symbolizing','normalized-losses','wheel-base','length','width','height'
,'curb-weight','engine-size','bore','stroke', 'compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg',
'price']
cars_num_df = cars[numeric_cols]

print('---------------------------------')
# Now we have all numeric data in our data thus we can convert it all to numeric by astype
cars_num_df = cars_num_df.astype('float')
print(cars_num_df.dtypes)

'''
Now, we have all numeric data in our dataframe and our data is cleaned in a way to predict
pricing of cars
'''
print('---------------------------------')
print(cars_num_df.shape) #So, we have ony 205 rows
print('Null values in normalized-losses : ',cars_num_df['normalized-losses'].isnull().sum())
'''
means that out of 205 rows in normalized-losses feature there are 41 missing values so, now
we have 3 options to opt from:
    i. replace missing values with average value of column or,
    ii. drop the rows entirely or,
    iii. drop the column entirely
    
Since, here there are about 25% approx. of rows are missing values so it's better to 
drop the entire column as dropping rows will affect other columns and result in overall
less data for our model.

More the 'correct' data the better it is for model
'''
plt.figure(figsize = (20,20), dpi=400)
sns.heatmap(cars_num_df, cbar=False)
plt.title('Before Removing Missing values')
plt.show()



# dropping normalized-losses column
cars_num_df.drop(['normalized-losses'],inplace=True,axis=1)
cars_num_df.columns
print('---------------------------------')
# let us check out other columns also for missing values and take appropriate action
print(cars_num_df.isnull().sum())

'''
So, we have null values in following columns:
    bore
    stroke
    price

Now, worst case to assume is that all these missing values rows are different.
Since, in total we have less rows it's better to replace them with average value of column
'''

for col in cars_num_df:
    cars_num_df[col].fillna(value=cars_num_df[col].mean(),inplace=True)

print('---------------------------------')
print(cars_num_df.isnull().sum())
print('---------------------------------')

'''
Since, we selected above columns to be in our model so, now let us move forward
'''
print('---------------------------------')
# Now, let us normalize our data
normalized_cars_df = (cars_num_df - cars_num_df.mean())/cars_num_df.std()

print(normalized_cars_df.isnull().sum())
plt.figure(figsize = (20,20), dpi=400)
sns.heatmap(normalized_cars_df.isnull(), cbar=False)
plt.title('After Removing Missing Values')
plt.show()
print('---------------------------------')


# UNIVARIATE MODEL

# using simple train/test validation
# print(normalized_cars_df.shape)

univariate_rmses = []
def knn_train_test(df, target_col, feature, k):
    train_df = df.iloc[:154]
    test_df = df.iloc[154:]
    model = KNeighborsRegressor(algorithm = 'brute', n_neighbors = k)
    model.fit(X = train_df[[feature]], y = train_df[target_col])
    predictions = model.predict(X = test_df[[feature]])
    rmse = np.sqrt(mean_squared_error(y_true = test_df['price'], y_pred = predictions))
    univariate_rmses.append(rmse)
    
'''
Now, using each column one by one we will compute the rmse value for our
univariate model. We will not use 'price' column as using it in case of prediction
model leads towards data-leakage in machine learning as it is our target column.
'''
univariate_columns = [x for x in normalized_cars_df.columns if x != 'price']

# calling function to train model

'''
# Uncomment this section to get rmse value for each feature
for col in univariate_columns:
    knn_train_test(normalized_cars_df, 'price', col)
'''

# for now we will use below mentioned columns
# cols -> curb-weight, highway-mpg, city-mpg, width, engine-size
univariate_features = 'curb-weight'
hyper_params = [1, 3, 5, 7, 9]


for k in hyper_params:
    knn_train_test(normalized_cars_df, 'price', col, k)

print('RMSE values are : ',univariate_rmses)
print('---------------------------------')

print('Univariate Hyper-parameter values : ')
print(hyper_params,'\n')
print('Univariate-model RMSE values for above hyper-parameter values')
print(univariate_rmses)
import matplotlib.pyplot as plt
plt.scatter(x = hyper_params, y = univariate_rmses)
plt.xlabel('Hyper-Parameter')
plt.ylabel('Root Mean Squared Error')
plt.title('Univariate RMSES')
plt.show()

'''
So, based on the scatter-plot we have the lowest rmse value at k = 5
This lowest rmse computed value is for a univariate model with feature -> 'curb-weight'
'''

# MULTIVARIATE MODEL
print('---------------------------------')
'''
Building a good model:
    i. changing features
    ii. hyperparameter optimization
'''
np.random.seed(1)
shuffled_index = np.random.permutation(len(normalized_cars_df))
normalized_cars_df = normalized_cars_df.iloc[shuffled_index]
def knn_train_test_mult(df, target_col, features_list):
    mult_rmses = []
    train_df = df.iloc[:154]
    test_df = df.iloc[154:]
    #for feature in features_list:
    model = KNeighborsRegressor(algorithm = 'brute', n_neighbors = 5)
    model.fit(X = train_df[features_list], y = train_df[target_col])
    predictions = model.predict(X = test_df[features_list])
    rmse = np.sqrt(mean_squared_error(y_true = test_df['price'], y_pred = predictions))
    mult_rmses.append(rmse)
    return mult_rmses
# calling function for multivariate


best_5_features_list = ['curb-weight','engine-size','highway-mpg','width','city-mpg']
two_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:2])
three_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:3])
four_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:4])
five_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:5])

models_avg_rmse_dict = {'2-features model\'s RMSE':two_features[0], '3-features model\'s RMSE':three_features[0],'4-features model\'s RMSE':four_features[0],'5-features model\'s RMSE':five_features[0]}


# displaying all rmse values computed from above multivariate model
print('Multivariate model RMSE values for model of 2,3,4,5 features respectively : \n')
print (' two_features_rmse   -> {}\n three_features_rmse -> {}\n four features_rmse  -> {}\n five_features_rmse  -> {}'.format(two_features,three_features,four_features,five_features))
print('\nOR\n')
print(models_avg_rmse_dict)
plt.scatter(x = [5,5,5,5], y = [two_features,three_features,four_features,five_features])
plt.title('Multivariate-Model\'s RMSE Compared')
plt.xlabel('Hyper-Parameter')
plt.ylabel('Root Mean Squared Value')
plt.ylim(0.35,0.55)
plt.show()
'''
So, from above scatterplot we can clearly see that Minimum RMSE value of 0.4435 approx. when 
we used only all of the 5 features
This proves that using all these 5 features the model is performing good
'''

# PERFORMING HYPERPARAMETER TUNING
'''
Top 3 top models selected (based on RMSE values):
    i. four_features model (RMSE = 0.4435)
    ii. three_features model (RMSE = 0.4631)
    iii. five_features model (RMSE = 0.4462)
'''
hyper_param_opt = [x for x in range(1,26,1)]
rmses_3_features = []
rmses_4_features = []
rmses_5_features = []
# just rewriting knn_train_test_mult
def knn_train_test_mult(df, target_col, features_list,hyper_param_opt,n_features):
    train_df = df.iloc[:154]
    test_df = df.iloc[154:]
    #for feature in features_list:
    for hp in hyper_param_opt:
        model = KNeighborsRegressor(algorithm = 'brute', n_neighbors = hp)
        model.fit(X = train_df[features_list], y = train_df[target_col])
        predictions = model.predict(X = test_df[features_list])
        rmse = np.sqrt(mean_squared_error(y_true = test_df['price'], y_pred = predictions))
        if n_features == 3:
            rmses_3_features.append(rmse)
        elif n_features == 4:
            rmses_4_features.append(rmse)
        elif n_features == 5:
            rmses_5_features.append(rmse)
    return predictions


predictions_3_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:3], hyper_param_opt, 3)
predictions_4_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:4], hyper_param_opt, 4)
predictions_5_features = knn_train_test_mult(normalized_cars_df, 'price', best_5_features_list[:5], hyper_param_opt, 5)

fig = plt.figure(figsize=(20,20),dpi=300)

ax1 = fig.add_subplot(3,1,1)
ax2 = fig.add_subplot(3,1,2)
ax3 = fig.add_subplot(3,1,3)

ax1.scatter(x=hyper_param_opt,y=rmses_3_features,c='black')
ax1.set_title('3-features model\'s RMSES')
ax1.set_ylabel('RMSE')
ax1.set_xlabel('Hyperparameter')
ax1.set_facecolor('orange')
ax2.scatter(x=hyper_param_opt,y=rmses_4_features,c='black')
ax2.set_ylabel('RMSE')
ax2.set_xlabel('Hyperparameter')
ax2.set_facecolor('orange')
ax2.set_title('4-features model\'s RMSES')
ax3.scatter(x=hyper_param_opt,y=rmses_5_features,c='black')
ax3.set_ylabel('RMSE')
ax3.set_facecolor('orange')
ax3.set_xlabel('Hyperparameter')
ax3.set_title('5-features model\'s RMSES')
fig.suptitle("Top 3-Models RMSES's with Varying Hyperparameters")
plt.show()

'''
In 3-feature model:
    Minimum RMSE is at k-value of 3 so k=3 is our best build model
In 4-feature model:
    Minimum RMSE is at k-value of 3 so k=3 is our best build model
In 5-feature model:
    Minimum RMSE is at k-value of 2 so k=2 is our best build model
'''

# Performing K-Fold Cross-Validation
'''
To accurately understand the model's performance:
    i. Perform K-Fold Cross-Validation
    (Since we have less data we will stick with default k value i.e. 5)
And,
Using k-hyperparameter as respective value mentioned above
'''

print('---------------------------------\n')

fold = 5
# for single model only
# instantiating KFold class
num_folds = [3, 5, 10, 15]
# df = cars_num_df
def kfold_cv(df, target_col, features_list, n_features):
    print('\n\n{}-FEATURE MODEL:\n'.format(n_features))
    for n_fold in num_folds:
        kf = KFold(n_splits = n_fold, shuffle = True, random_state = 1)
        # instantiating KNN class
        knn = KNeighborsRegressor(algorithm = 'brute', n_neighbors=5)
        # computing score metric
        mse = cross_val_score(X = df[features_list], estimator = knn, y = df[target_col], cv = kf, scoring = 'neg_mean_squared_error')
        rmses = np.sqrt(np.abs(mse))
        avg_rmse = np.mean(rmses)
        std_rmse = np.std(rmses)
        print('{}-feature model\'s avg-RMSE is {} and std-RMSE is {}'.format(n_features,avg_rmse,std_rmse))
    
    

# calling 3 feature model
kfold_cv(cars_num_df,'price',best_5_features_list[:3],3)
kfold_cv(cars_num_df,'price',best_5_features_list[:4],4)
kfold_cv(cars_num_df,'price',best_5_features_list[:5],5)

---------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symbolizing        205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non-null    int64  
 17  fuel-system        205 non-null    object 
 18  bore               205 non-null    object 
 19  stroke             205 non-null    object 
 20  compression-ratio  205 non-null    float64
 21  horsepower         205 non-null    object 
 22  peak-rpm           205 non-null    object 
 23  city-mpg           205 non-null    int64  
 24  highway-mpg        205 non-null    int64  
 25  price              205 non-null    object 
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
None
---------------------------------
symbolizing          0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
---------------------------------
   symbolizing normalized-losses         make fuel-type aspiration  \
0            3                 ?  alfa-romero       gas        std   
1            3                 ?  alfa-romero       gas        std   

  num-of-doors   body-style drive-wheels engine-location  wheel-base  length  \
0          two  convertible          rwd           front        88.6   168.8   
1          two  convertible          rwd           front        88.6   168.8   

   width  height  curb-weight engine-type num-of-cylinders  engine-size  \
0   64.1    48.8         2548        dohc             four          130   
1   64.1    48.8         2548        dohc             four          130   

  fuel-system  bore stroke  compression-ratio horsepower peak-rpm  city-mpg  \
0        mpfi  3.47   2.68                9.0        111     5000        21   
1        mpfi  3.47   2.68                9.0        111     5000        21   

   highway-mpg  price  
0           27  13495  
1           27  16500  
---------------------------------
symbolizing           0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64
---------------------------------
symbolizing          float64
normalized-losses    float64
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight          float64
engine-size          float64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg             float64
highway-mpg          float64
price                float64
dtype: object
---------------------------------
(205, 16)
Null values in normalized-losses :  41

---------------------------------
symbolizing          0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64
---------------------------------
symbolizing          0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
---------------------------------
---------------------------------
symbolizing          0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

---------------------------------
RMSE values are :  [0.01875799196764876, 0.014325325407032047, 0.013878766751085932, 0.018737680071968646, 0.029784623980567592]
---------------------------------
Univariate Hyper-parameter values : 
[1, 3, 5, 7, 9] 

Univariate-model RMSE values for above hyper-parameter values
[0.01875799196764876, 0.014325325407032047, 0.013878766751085932, 0.018737680071968646, 0.029784623980567592]

---------------------------------
Multivariate model RMSE values for model of 2,3,4,5 features respectively : 

 two_features_rmse   -> [0.4875401483812488]
 three_features_rmse -> [0.46314388295862713]
 four features_rmse  -> [0.44354607210636016]
 five_features_rmse  -> [0.44628732596763776]

OR

{"2-features model's RMSE": 0.4875401483812488, "3-features model's RMSE": 0.46314388295862713, "4-features model's RMSE": 0.44354607210636016, "5-features model's RMSE": 0.44628732596763776}

---------------------------------



3-FEATURE MODEL:

3-feature model's avg-RMSE is 4006.376644592172 and std-RMSE is 106.95186119478558
3-feature model's avg-RMSE is 3855.0471506843605 and std-RMSE is 384.03422789028144
3-feature model's avg-RMSE is 3808.071293994925 and std-RMSE is 959.8487353657314
3-feature model's avg-RMSE is 3661.7831507405294 and std-RMSE is 1211.4150299273301


4-FEATURE MODEL:

4-feature model's avg-RMSE is 4006.376644592172 and std-RMSE is 106.95186119478558
4-feature model's avg-RMSE is 3858.387185493432 and std-RMSE is 388.7071249809679
4-feature model's avg-RMSE is 3808.1874610597324 and std-RMSE is 961.1628413525466
4-feature model's avg-RMSE is 3668.008804131959 and std-RMSE is 1204.2916356656985


5-FEATURE MODEL:

5-feature model's avg-RMSE is 4005.9048684961417 and std-RMSE is 107.93836354924069
5-feature model's avg-RMSE is 3854.588236660626 and std-RMSE is 390.59003507593724
5-feature model's avg-RMSE is 3807.60007492125 and std-RMSE is 959.0297930476255
5-feature model's avg-RMSE is 3666.7181132000655 and std-RMSE is 1204.6639674492594