#!/usr/bin/env python
# coding: utf-8

# # Predicting Car Prices
#
# In this project we will use a dataset to build a model that predicts a car's market price based on its attributes.
#
# The data set we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle, such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more. You can read more about the data set [here](https://archive.ics.uci.edu/ml/datasets/automobile) and download it directly from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/autos).

# ## Import Data
#
# Whilst importing the data I noticed that there isn't a header row, so I will use the column names from the documentation.

# In[1]:


import pandas as pd

pd.set_option('display.max_columns', None)

# As there isn't a header row, use the column names from the documentation
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
        'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width',
        'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
        'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
        'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv("imports-85.data", header=None, names=cols)
cars.head()


# From the table above I have identified which columns are numerical; these will be used as the feature columns and will also need to be cleaned.

# ## Feature Columns:

# In[2]:


feature_cols = ["normalized-losses", "num-of-doors", "wheel-base", "length", "width", "height",
                "curb-weight", "num-of-cylinders", "engine-size", "bore", "stroke",
                "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg"]


# ## Target Column:
#
# price

# ## Data Clean

# In[3]:


cars.isnull().sum()


# The output above shows no null values, but we know from the earlier table that there are "?" placeholder values.

# In[4]:


# Replace "?" with NaN in a copy, purely to count how many values each column is missing
import numpy as np

df_data = cars.replace("?", np.nan)
df_data.isnull().sum()


# Now we can see which columns need further work.

# Change "num-of-doors" values to numbers.

# In[5]:


# Change "num-of-doors" values to numbers
cars['num-of-doors'] = cars['num-of-doors'].replace(to_replace={'four': 4, 'two': 2})
cars["num-of-doors"].value_counts()


# Change "num-of-cylinders" values to numbers.

# In[6]:


# Change "num-of-cylinders" values to numbers
cars['num-of-cylinders'] = cars['num-of-cylinders'].replace(
    to_replace={'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12})
cars['num-of-cylinders'].value_counts()


# Replace the remaining "?" values with the mean value of their column.

# In[7]:


# Replace "?" values with the mean value of that column
nvcol = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price', 'num-of-doors']
for a in nvcol:
    df_temp = cars[cars[a] != '?']
    normalised_mean = np.mean(df_temp[a].astype(float))
    cars[a] = cars[a].replace('?', normalised_mean).astype(float)


# ## Normalize The Data:
#
# I will now normalize the feature columns, as they currently have differing scales. Each column is rescaled to the 0-1 range with min-max normalization: (value - min) / (max - min).

# In[8]:


# Min-max normalize the feature columns
result = cars.copy()
for feature_name in feature_cols:
    max_value = cars[feature_name].max()
    min_value = cars[feature_name].min()
    result[feature_name] = (cars[feature_name] - min_value) / (max_value - min_value)
result.head()


# ## K Nearest Neighbors Univariate Model:
#
# I will train and test a model for each feature to identify the feature with the lowest RMSE.
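# The models will be scored with root mean squared error (RMSE). scikit-learn's
# `mean_squared_error` returns the MSE, so the square root is taken afterwards.
# A minimal sketch with made-up values, purely to illustrate the metric:

# In[ ]:


from sklearn.metrics import mean_squared_error

y_true = [10000, 15000]   # hypothetical prices, for illustration only
y_pred = [11000, 14000]   # hypothetical predictions
rmse_example = mean_squared_error(y_true, y_pred) ** 0.5
rmse_example   # 1000.0 - the typical prediction error, in the target's own units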
# In[9]:


from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


def knn_train_test(data, cols):
    # 75/25 train/test split using a fixed random permutation of the rows
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows * 0.75)]]
    test_data = data.iloc[indexs[round(rows * 0.75):]]

    # Train and test a univariate model for each feature column
    rmses = {}
    for a in cols:
        knn = KNeighborsRegressor()
        knn.fit(train_data[[a]], train_data["price"])
        predictions = knn.predict(test_data[[a]])
        mse = mean_squared_error(test_data["price"], predictions)
        rmses[a] = mse ** 0.5
    return rmses


cols_rmse = knn_train_test(result, feature_cols)
cols_rmse


# I will use a scatter plot to present the results.

# In[10]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.scatter(cols_rmse.keys(), cols_rmse.values())
plt.xticks(rotation=90)
plt.show()


# For the default k = 5, curb-weight has the lowest RMSE.

# I will now amend the function to vary the number of neighbors (k) and present the results with individual scatter plots.

# In[11]:


def knn_train_test(data, cols):
    # 50/50 train/test split for this experiment
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows / 2)]]
    test_data = data.iloc[indexs[round(rows / 2):]]

    k = [1, 3, 5, 7, 9]
    df_rmses = pd.DataFrame()
    for a in cols:
        for val in k:
            knn = KNeighborsRegressor(n_neighbors=val)
            knn.fit(train_data[[a]], train_data["price"])
            predictions = knn.predict(test_data[[a]])
            mse = mean_squared_error(test_data["price"], predictions)
            df_rmses.loc[a, str(val)] = mse ** 0.5
        plt.scatter(k, df_rmses.loc[a, :])
        plt.xlabel("k (number of neighbors)")
        plt.title(a + " RMSE values by k")
        plt.show()
    return df_rmses


data_rmses = knn_train_test(result, feature_cols)


# I will now amend the function to work with multiple columns.

# In[12]:


# Use all the feature columns together in one multivariate model
def knn_train_test3(data, cols):
    rows = len(data)
    np.random.seed(1)
    indexs = np.random.permutation(rows)
    train_data = data.iloc[indexs[: round(rows * 0.75)]]
    test_data = data.iloc[indexs[round(rows * 0.75):]]

    knn = KNeighborsRegressor(algorithm='brute')
    knn.fit(train_data[cols], train_data["price"])
    predictions = knn.predict(test_data[cols])
    mse = mean_squared_error(test_data["price"], predictions)
    return mse ** 0.5


cols_rmse = knn_train_test3(result, feature_cols)
cols_rmse


# Using all the columns at the default k = 5 produces an RMSE at the lower end of the results, which would indicate we are using too many features.

# I will now identify the 2, 3, 4 and 5 best features and obtain the RMSE for each combination.

# In[13]:


# The results from the previous exercise need to be ordered first
data_rmses


# To identify the best features I will calculate the mean RMSE of each row (i.e. the average across the k values tested) and sort the index by it.
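# (The same per-feature averages could also be computed directly with pandas'
# `mean(axis=1)`; a minimal check, illustrative only - the `apply` version in the
# next cell is what is actually used.)

# In[ ]:


data_rmses.mean(axis=1).sort_values().head()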
# In[14]:


# Calculate the average (mean) RMSE for each feature across the k values tested
data_rmses["average"] = data_rmses.apply(lambda x: x.mean(), axis=1)

# Sort the dataframe by the average column
data_rmses.sort_values(by='average')


# In[15]:


rmses_dataframe = pd.DataFrame()

rows = len(result)
np.random.seed(1)
indexs = np.random.permutation(rows)
train_data = result.iloc[indexs[: round(rows * 0.75)]]
test_data = result.iloc[indexs[round(rows * 0.75):]]

# 2 best features
kn_two_bf = KNeighborsRegressor()
kn_two_bf.fit(train_data[["engine-size", "highway-mpg"]], train_data["price"])
predictions = kn_two_bf.predict(test_data[["engine-size", "highway-mpg"]])
mse = mean_squared_error(test_data["price"], predictions)
print(mse ** 0.5)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg"] = mse ** 0.5


# In[16]:


# 3 best features
kn_three_bf = KNeighborsRegressor()
kn_three_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight"]], train_data["price"])
predictions = kn_three_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight"] = mse ** 0.5


# In[17]:


# 4 best features
kn_four_bf = KNeighborsRegressor()
kn_four_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]], train_data["price"])
predictions = kn_four_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight, horsepower"] = mse ** 0.5


# In[18]:


# 5 best features
kn_five_bf = KNeighborsRegressor()
kn_five_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]], train_data["price"])
predictions = kn_five_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]])
mse = mean_squared_error(test_data["price"], predictions)
rmses_dataframe.loc["rmse", "engine-size, highway-mpg, curb-weight, horsepower, width"] = mse ** 0.5


# In[19]:


rmses_dataframe


# From the results above we can see that the 5-feature model has the lowest RMSE.

# I will now optimize the 3-, 4- and 5-best-feature models with the hyperparameter k (n_neighbors) ranging from 1 to 24.
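# As an aside, the same sweep over k could also be expressed with scikit-learn's
# `GridSearchCV`. A minimal sketch for the 3-feature model, illustrative only: it uses
# 5-fold cross validation rather than the single train/test split used in the cells below.

# In[ ]:


from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNeighborsRegressor(algorithm="brute"),
                    param_grid={"n_neighbors": list(range(1, 25))},
                    scoring="neg_mean_squared_error",
                    cv=5)
grid.fit(result[["engine-size", "highway-mpg", "curb-weight"]], result["price"])
# Best k and the corresponding cross-validated RMSE
print(grid.best_params_, (-grid.best_score_) ** 0.5)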
# In[20]:


# 3 best features, varying the hyperparameter k (n_neighbors)
hp_df = pd.DataFrame()
hp = list(range(1, 25))

for val in hp:
    kn_three_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_three_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight"]], train_data["price"])
    predictions = kn_three_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight"] = mse ** 0.5


# In[21]:


# 4 best features, varying the hyperparameter k (n_neighbors)
for val in hp:
    kn_four_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_four_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]], train_data["price"])
    predictions = kn_four_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight, horsepower"] = mse ** 0.5


# In[22]:


# 5 best features, varying the hyperparameter k (n_neighbors)
for val in hp:
    kn_five_bf = KNeighborsRegressor(n_neighbors=val, algorithm="brute")
    kn_five_bf.fit(train_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]], train_data["price"])
    predictions = kn_five_bf.predict(test_data[["engine-size", "highway-mpg", "curb-weight", "horsepower", "width"]])
    mse = mean_squared_error(test_data["price"], predictions)
    hp_df.loc[val, "engine-size, highway-mpg, curb-weight, horsepower, width"] = mse ** 0.5


# In[23]:


for a in hp_df.columns:
    plt.scatter(hp, hp_df[a])
    plt.xlabel("k (number of neighbors)")
    plt.title(a)
    plt.show()


# In summary:
#
# For the 3 best features ("engine-size, highway-mpg, curb-weight") a k value of 3 had the lowest RMSE.
#
# For the 4 best features ("engine-size, highway-mpg, curb-weight, horsepower") a k value of 1 had the lowest RMSE.
#
# For the 5 best features ("engine-size, highway-mpg, curb-weight, horsepower, width") a k value of 2 had the lowest RMSE.
#
# The best RMSE overall came from the 4 best features. In each case the RMSE increased as k increased, which would indicate a weak relationship between price and the features.

# I will now perform cross validation using the 4-best-features model, as it had the lowest RMSE.

# In[24]:


# Cross validation using the 4 best features model, as it had the lowest RMSE
from sklearn.model_selection import cross_val_score, KFold

num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]
cross_dataframe = pd.DataFrame()

for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    model = KNeighborsRegressor()
    mses = cross_val_score(model,
                           result[["engine-size", "highway-mpg", "curb-weight", "horsepower"]],
                           result["price"],
                           scoring="neg_mean_squared_error",
                           cv=kf)
    rmses = np.sqrt(np.absolute(mses))
    avg_rmse = np.mean(rmses)
    std_rmse = np.std(rmses)
    print(str(fold), "folds: ", "avg RMSE: ", str(avg_rmse), "std RMSE: ", str(std_rmse))
    cross_dataframe.loc[fold, "avg RMSE"] = avg_rmse
    cross_dataframe.loc[fold, "std RMSE"] = std_rmse


# In[25]:


cross_dataframe


# I will now present the results visually for clarity.

# In[26]:


cross_dataframe.plot(kind="bar")
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.xlabel("Number of Folds")
plt.ylabel("RMSE")
plt.xticks(rotation=0)
plt.show()


# ## Conclusion:
#
# From the chart above we can see that 5 folds gives the lowest average RMSE with the least variance for "engine-size", "highway-mpg", "curb-weight" and "horsepower".
# However, the size of the RMSE would indicate that the relationship to price is not strong, just less inaccurate than with the other features.

# In[ ]:
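# To put that conclusion in context, one way to judge the size of the RMSE is to compare it
# with the spread of the target itself (price was not normalized, so both are in the same
# units). A minimal sketch, illustrative only:

# In[ ]:


print("mean price:", result["price"].mean())
print("price standard deviation:", result["price"].std())
print("avg RMSE at 5 folds:", cross_dataframe.loc[5, "avg RMSE"])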