#!/usr/bin/env python
# coding: utf-8

# # Predicting Car Prices
#
# In this project we will use a dataset to build a model that predicts a car's market price based on its attributes.
#
# The data set we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle, such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more. You can read more about the data set [here](https://archive.ics.uci.edu/ml/datasets/automobile) and download it directly from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/autos).

# ## Import Data
#
# Whilst importing the data I noticed that there isn't a header row, so I will use the column names from the documentation.

# In[1]:


import pandas as pd

pd.set_option('display.max_columns', None)

# As there isn't a header row, use the column names from the documentation
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
        'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width',
        'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
        'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
        'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv("imports-85.data", header=None, names=cols)
cars.head()


# From the table above I have identified which columns are numerical; these will be used as the feature columns and will also need to be cleaned.

# ## Feature Columns:

# In[2]:


feature_cols = ["normalized-losses", "num-of-doors", "wheel-base", "length", "width", "height",
                "curb-weight", "num-of-cylinders", "engine-size", "bore", "stroke",
                "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg"]


# ## Target Column:
#
# price

# ## Data Clean

# In[3]:


cars.isnull().sum()


# The output above shows no null values, but we know from the earlier table that there are "?" placeholder values.

# In[4]:


# Replace "?" with NaN in a copy, purely to count how many values each column is missing
import numpy as np

df_data = cars.replace("?", np.nan)
df_data.isnull().sum()


# Now we can see which columns need further work.

# Change "num-of-doors" values to numbers.

# In[5]:


# Change "num-of-doors" values to numbers
cars['num-of-doors'] = cars['num-of-doors'].replace(to_replace={'four': 4, 'two': 2})
cars["num-of-doors"].value_counts()


# Change "num-of-cylinders" values to numbers.

# In[6]:


# Change "num-of-cylinders" values to numbers
cars['num-of-cylinders'] = cars['num-of-cylinders'].replace(
    to_replace={'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12})
cars['num-of-cylinders'].value_counts()


# Replace the remaining "?" values with the mean value of their column.

# In[7]:


# Replace "?" values with the mean value of that column
nvcol = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price', 'num-of-doors']
for a in nvcol:
    df_temp = cars[cars[a] != '?']
    normalised_mean = np.mean(df_temp[a].astype(float))
    cars[a] = cars[a].replace('?', normalised_mean).astype(float)


# ## Normalize The Data:
#
# I will now normalize the feature columns, as they currently have differing scales. Each column is rescaled to the 0-1 range with min-max normalization: (value - min) / (max - min).

# In[8]:


# Min-max normalize the feature columns
result = cars.copy()
for feature_name in feature_cols:
    max_value = cars[feature_name].max()
    min_value = cars[feature_name].min()
    result[feature_name] = (cars[feature_name] - min_value) / (max_value - min_value)
result.head()


# ## K Nearest Neighbors Univariate Model:
#
# I will train and test a model for each feature to identify the feature with the lowest RMSE.
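# The models will be scored with root mean squared error (RMSE). scikit-learn's
# `mean_squared_error` returns the MSE, so the square root is taken afterwards.
# A minimal sketch with made-up values, purely to illustrate the metric:

# In[ ]:


from sklearn.metrics import mean_squared_error

y_true = [10000, 15000]   # hypothetical prices, for illustration only
y_pred = [11000, 14000]   # hypothetical predictions
rmse_example = mean_squared_error(y_true, y_pred) ** 0.5
rmse_example   # 1000.0 - the typical prediction error, in the target's own units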
# In[9]:


from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


def knn_train_test(data, cols):
    # 75/25 train/test split using a fixed random permutation of the rows
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows * 0.75)]]
    test_data = data.iloc[indexs[round(rows * 0.75):]]

    # Train and test a univariate model for each feature column
    rmses = {}
    for a in cols:
        knn = KNeighborsRegressor()
        knn.fit(train_data[[a]], train_data["price"])
        predictions = knn.predict(test_data[[a]])
        mse = mean_squared_error(test_data["price"], predictions)
        rmses[a] = mse ** 0.5
    return rmses


cols_rmse = knn_train_test(result, feature_cols)
cols_rmse


# I will use a scatter plot to present the results.

# In[10]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.scatter(cols_rmse.keys(), cols_rmse.values())
plt.xticks(rotation=90)
plt.show()


# For the default k = 5, curb-weight has the lowest RMSE.

# I will now amend the function to vary the number of neighbors (k) and present the results with individual scatter plots.

# In[11]:


def knn_train_test(data, cols):
    # 50/50 train/test split for this experiment
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows / 2)]]
    test_data = data.iloc[indexs[round(rows / 2):]]

    k = [1, 3, 5, 7, 9]
    df_rmses = pd.DataFrame()
    for a in cols:
        for val in k:
            knn = KNeighborsRegressor(n_neighbors=val)
            knn.fit(train_data[[a]], train_data["price"])
            predictions = knn.predict(test_data[[a]])
            mse = mean_squared_error(test_data["price"], predictions)
            df_rmses.loc[a, str(val)] = mse ** 0.5
        plt.scatter(k, df_rmses.loc[a, :])
        plt.xlabel("k (number of neighbors)")
        plt.title(a + " RMSE values by k")
        plt.show()
    return df_rmses


data_rmses = knn_train_test(result, feature_cols)


# I will now amend the function to work with multiple columns.

# In[12]:


# Use all the feature columns together in one multivariate model
def knn_train_test3(data, cols):
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows * 0.75)]]
    test_data = data.iloc[indexs[round(rows * 0.75):]]

    knn = KNeighborsRegressor(algorithm='brute')
    knn.fit(train_data[cols], train_data["price"])
    predictions = knn.predict(test_data[cols])
    mse = mean_squared_error(test_data["price"], predictions)
    return mse ** 0.5


cols_rmse = knn_train_test3(result, feature_cols)
cols_rmse


# Using all the columns at the default k = 5 produces an RMSE at the lower end of the results, which would indicate we are using too many features.

# I will now identify the 2, 3, 4 and 5 best features and obtain the RMSE for each combination.

# In[13]:


# The results from the previous exercise need to be ordered first
data_rmses


# To identify the best features I will calculate the mean RMSE of each row (i.e. the average across the k values tested) and sort the index by it.
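# (The same per-feature averages could also be computed directly with pandas'
# `mean(axis=1)`; a minimal check, illustrative only - the `apply` version in the
# next cell is what is actually used.)

# In[ ]:


data_rmses.mean(axis=1).sort_values().head()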
# In[14]:


# Calculate the average (mean) RMSE for each feature across the k values tested
data_rmses["average"] = data_rmses.apply(lambda x: x.mean(), axis=1)

# Sort the dataframe by the average column
data_rmses.sort_values(by='average')


# In[15]:


rmses_dataframe = pd.DataFrame()

rows = len(result)
np.random.seed(1)
indexs = np.random.permutation(rows)
train_data = result.iloc[indexs[: round(rows * 0.75)]]
test_data = result.iloc[indexs[round(rows * 0.75):]]

# 2 best features
kn_two_bf = KNeighborsRegressor()
kn_two_bf.fit(train_data[["engine-size", "highway-mpg"]], train_data["price"])
predictions = kn_two_bf.predict(test_data[["engine-size", "highway-mpg"]])
mse = mean_squared_error(test_data["price"], predictions)
print(mse ** 0.5)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg"] = mse ** 0.5


# In[16]:


# 3 best features
kn_three_bf = KNeighborsRegressor()
kn_three_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight"]], train_data["price"])
predictions = kn_three_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight"] = mse ** 0.5


# In[17]:


# 4 best features
kn_four_bf = KNeighborsRegressor()
kn_four_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]], train_data["price"])
predictions = kn_four_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight, horsepower"] = mse ** 0.5


# In[18]:


# 5 best features
kn_five_bf = KNeighborsRegressor()
kn_five_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]], train_data["price"])
predictions = kn_five_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight, horsepower, width"] = mse ** 0.5


# In[19]:


rmses_dataframe


# From the results above we can see that the 5-feature model has the lowest RMSE.

# I will now optimize the 3-, 4- and 5-best-feature models with the hyperparameter k (n_neighbors) ranging from 1 to 24.
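# As an aside, the same sweep over k could also be expressed with scikit-learn's
# `GridSearchCV`. A minimal sketch for the 3-feature model, illustrative only: it uses
# 5-fold cross validation rather than the single train/test split used in the cells below.

# In[ ]:


from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNeighborsRegressor(algorithm="brute"),
                    param_grid={"n_neighbors": list(range(1, 25))},
                    scoring="neg_mean_squared_error",
                    cv=5)
grid.fit(result[["engine-size", "highway-mpg", "curb-weight"]], result["price"])
# Best k and the corresponding cross-validated RMSE
print(grid.best_params_, (-grid.best_score_) ** 0.5)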
# In[20]:


# 3 best features, varying the hyperparameter k (n_neighbors)
hp_df = pd.DataFrame()
hp = list(range(1, 25))

for val in hp:
    kn_three_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_three_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight"]], train_data["price"])
    predictions = kn_three_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight"] = mse ** 0.5


# In[21]:


# 4 best features, varying the hyperparameter k (n_neighbors)
for val in hp:
    kn_four_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_four_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]], train_data["price"])
    predictions = kn_four_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight, horsepower"] = mse ** 0.5


# In[22]:


# 5 best features, varying the hyperparameter k (n_neighbors)
for val in hp:
    kn_five_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_five_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]], train_data["price"])
    predictions = kn_five_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight, horsepower, width"] = mse ** 0.5


# In[23]:


for a in hp_df.columns:
    plt.scatter(hp, hp_df[a])
    plt.xlabel("k (number of neighbors)")
    plt.title(a)
    plt.show()


# In summary:
#
# For the 3 best features ("engine-size, highway-mpg, curb-weight") a k value of 3 had the lowest RMSE.
#
# For the 4 best features ("engine-size, highway-mpg, curb-weight, horsepower") a k value of 1 had the lowest RMSE.
#
# For the 5 best features ("engine-size, highway-mpg, curb-weight, horsepower, width") a k value of 2 had the lowest RMSE.
#
# The best RMSE overall came from the 4 best features. In each case the RMSE increased as k increased, which would indicate a weak relationship between price and the features.

# I will now perform cross validation using the 4-best-features model, as it had the lowest RMSE.

# In[24]:


# Cross validation using the 4 best features model, as it had the lowest RMSE
from sklearn.model_selection import cross_val_score, KFold

num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]
cross_dataframe = pd.DataFrame()

for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    model = KNeighborsRegressor()
    mses = cross_val_score(model,
                           result[["engine-size", "highway-mpg", "curb-weight", "horsepower"]],
                           result["price"],
                           scoring="neg_mean_squared_error",
                           cv=kf)
    rmses = np.sqrt(np.absolute(mses))
    avg_rmse = np.mean(rmses)
    std_rmse = np.std(rmses)
    print(str(fold), "folds: ", "avg RMSE: ", str(avg_rmse), "std RMSE: ", str(std_rmse))
    cross_dataframe.loc[fold, "avg RMSE"] = avg_rmse
    cross_dataframe.loc[fold, "std RMSE"] = std_rmse


# In[25]:


cross_dataframe


# I will now present the results visually for clarity.

# In[26]:


cross_dataframe.plot(kind="bar")
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.xlabel("Number of Folds")
plt.ylabel("RMSE")
plt.xticks(rotation=0)
plt.show()


# ## Conclusion:
#
# From the chart above we can see that 5 folds gives the lowest average RMSE with the least variance for "engine-size", "highway-mpg", "curb-weight" and "horsepower".
# However, the size of the RMSE would indicate that the relationship to price is not strong, just less inaccurate than with the other features.

# In[ ]:
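# To put that conclusion in context, one way to judge the size of the RMSE is to compare it
# with the spread of the target itself (price was not normalized, so both are in the same
# units). A minimal sketch, illustrative only:

# In[ ]:


print("mean price:", result["price"].mean())
print("price standard deviation:", result["price"].std())
print("avg RMSE at 5 folds:", cross_dataframe.loc[5, "avg RMSE"])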