#!/usr/bin/env python
# coding: utf-8

# # Guided Project: Predicting Car Prices
# 
# In this guided project, we will predict a car's market price using its attributes. The data set we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle, such as the engine's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more. You can read more about the data set [here](https://archive.ics.uci.edu/ml/datasets/automobile) and can download it directly from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data).
# 
# Let's start by reading in the dataset.

# In[364]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('fivethirtyeight')


# In[365]:


cars = pd.read_csv('imports-85.data',
                   names=['symboling', 'normalized_losses', 'make', 'fuel_type',
                          'aspiration', 'num_doors', 'body_style', 'drive_wheels',
                          'engine_location', 'wheel_base', 'length', 'width',
                          'height', 'curb_weight', 'engine_type', 'num_cylinders',
                          'engine_size', 'fuel_system', 'bore', 'stroke',
                          'compression_ratio', 'horsepower', 'peak_rpm',
                          'city_mpg', 'highway_mpg', 'price'])


# In[366]:


cars.head()


# In[367]:


cars.shape


# In[368]:


cars.info()


# Since we would like to predict car prices, the target column is `price`. Currently the `price` column is not in a numeric format, so we need to clean it up.
# 
# ## Data cleaning
# 
# Columns used for predictive modeling normally can't contain missing values. Based on the data set preview from the last step, we can tell that the `normalized_losses` column contains missing values represented by "?". Let's replace these values and look for missing values in the other numeric columns. Let's also rescale the values in the numeric columns so they all range from 0 to 1.

# In[369]:


cars = cars.replace('?', np.nan)


# Because `?` is a string value, columns containing it were cast to the pandas `object` data type (instead of a numeric type like `int` or `float`). Let's determine which columns should be converted back to numeric values and convert them.

# In[370]:


numeric_cols = ['normalized_losses', 'wheel_base', 'length', 'width', 'height',
                'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio',
                'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price']
cars[numeric_cols] = cars[numeric_cols].astype('float')


# Let's see how many rows in the `normalized_losses` column have missing values.

# In[371]:


cars['normalized_losses'].isnull().sum()


# There are a few ways we could handle columns with missing values:
# 
# - Replace the missing values with the average value of that column.
# - Drop the rows entirely (especially if other columns in those rows also have missing values).
# - Drop the column entirely.
# 
# For the `normalized_losses` column, dropping the rows with missing values would result in too much data loss (about 20% of the rows).
# 
# Let's see how many rows with missing data there are in the rest of the numeric columns.

# In[372]:


cars[numeric_cols].isnull().sum()


# The `price` column is our target column. Let's drop the four rows with missing values in that column.

# In[373]:


cars = cars.dropna(subset=['price'])
cars[numeric_cols].isnull().sum()


# Let's fill in the missing values of the remaining columns with the mean value of each column.
# In[374]:


for col in numeric_cols:
    cars[col] = cars[col].fillna(cars[col].mean())


# In[412]:


cars.info()


# Now, let's normalize the numeric columns so they range from 0 to 1, excluding the `price` column.

# In[376]:


for col in numeric_cols[:-1]:
    cars[col] = (cars[col] - cars[col].min()) / (cars[col].max() - cars[col].min())


# In[377]:


cars[numeric_cols].describe().T


# ## Univariate Model
# 
# Let's start with some univariate k-nearest neighbors models. Starting with simple models before moving to more complex ones helps us structure our code workflow and understand the features better.

# In[378]:


from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


# In[379]:


def knn_train_test(train_col, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Split the dataset in two (75:25):
    train_set = rand_df[:int(len(rand_df) * 0.75)]
    test_set = rand_df[int(len(rand_df) * 0.75):]
    knn.fit(train_set[[train_col]], train_set[target_col])
    predictions = knn.predict(test_set[[train_col]])
    # Calculate the RMSE (squared=False returns the root mean squared error)
    rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
    return rmse


# In[380]:


rmses = {}
for col in numeric_cols[:-1]:
    rmses[col] = knn_train_test(col, 'price', cars)

rmses_series = pd.Series(rmses)
rmses_series = rmses_series.sort_values()
rmses_series


# Now let's modify the `knn_train_test()` function to accept a parameter for the k value.
# For each numeric column, we will create, train, and test a univariate model using the following k values: 1, 3, 5, 7, and 9. We will then visualize the results using a scatter plot.

# In[381]:


def knn_train_test_k(train_col, target_col, df, k_values=[5]):
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Split the dataset in two:
    train_set = rand_df[:int(len(rand_df) * 0.75)]
    test_set = rand_df[int(len(rand_df) * 0.75):]
    k_rmses = {}
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_set[[train_col]], train_set[target_col])
        predictions = knn.predict(test_set[[train_col]])
        # Calculate the RMSE
        rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
        k_rmses[k] = rmse
    return k_rmses


# In[382]:


k_rmse_result = {}
for col in numeric_cols[:-1]:
    k_rmse_result[col] = knn_train_test_k(col, 'price', cars, k_values=range(1, 10, 2))

k_rmse_result


# In[383]:


for k, v in k_rmse_result.items():
    x = list(v.keys())
    y = list(v.values())
    plt.scatter(x, y, label=k)

plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.xticks(x)
plt.title('RMSE per k-value', y=1.06)
plt.legend(bbox_to_anchor=(1.05, 1))


# It seems that a **k-value of 5** is optimal for the **univariate model**.
# 
# ## Multivariate Model
# 
# Now, let's calculate the average RMSE across the different `k` values for each feature.
# 
# Afterwards, we will modify the `knn_train_test()` function to work with multiple columns.
# 
# Then we will use the modified function to calculate the RMSE when using the top 2, 3, 4, and 5 best features.

# In[384]:


# Calculate the average RMSE across different `k` values for each feature.
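# (Aside) The same averages could likely be computed in one step with pandas:
#     pd.DataFrame(k_rmse_result).mean().sort_values()
# The explicit loop below builds the same Series step by step.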
avg_rmse = {}
for k, v in k_rmse_result.items():
    key = k
    value = np.mean(list(v.values()))
    avg_rmse[key] = value

avg_rmse_series = pd.Series(avg_rmse)
avg_rmse_series = avg_rmse_series.sort_values()
avg_rmse_series


# In[385]:


def knn_train_test_list(train_col, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Split the dataset in two:
    train_set = rand_df[:int(len(rand_df) * 0.75)]
    test_set = rand_df[int(len(rand_df) * 0.75):]
    knn.fit(train_set[train_col], train_set[target_col])
    predictions = knn.predict(test_set[train_col])
    # Calculate the RMSE
    rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
    return rmse


# In[386]:


# Now, let's use the 2, 3, 4, and 5 best features from the previous step with the default k value of 5.
best_features = {}
for x in range(2, 6):
    rmse = knn_train_test_list(avg_rmse_series.index[:x], 'price', cars)
    best_features['RMSE for features: {}'.format(list(avg_rmse_series.index[:x]))] = rmse


# In[387]:


best_features


# It looks like the **multivariate model** performs best with **four features**.
# 
# ## Hyperparameter Tuning
# 
# Let's now optimize the model that performed the best in the previous step.
# 
# For the top three models from the previous step, we will vary the hyperparameter value from 1 to 25 and plot the resulting values.

# In[388]:


def knn_train_test_k(train_col_list, target_col, df):
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)
    # Split the dataset in two:
    train_set = rand_df[:int(len(rand_df) * 0.75)]
    test_set = rand_df[int(len(rand_df) * 0.75):]
    k_values = range(1, 26)
    k_rmses = {}
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_set[train_col_list], train_set[target_col])
        predictions = knn.predict(test_set[train_col_list])
        # Calculate the RMSE
        rmse = mean_squared_error(test_set[target_col], predictions, squared=False)
        k_rmses[k] = rmse
    return k_rmses


# In[389]:


top3 = [3, 4, 5]
top3_rmse = {}
for x in top3:
    rmse = knn_train_test_k(avg_rmse_series.index[:x], 'price', cars)
    top3_rmse['Best {} features'.format(x)] = rmse


# In[390]:


top3_rmse


# In[391]:


for k, v in top3_rmse.items():
    x = list(v.keys())
    y = list(v.values())
    plt.plot(x, y, label=k)

plt.legend(loc='lower right')
plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.title('RMSE for k-values 1 to 25')


# It seems that a **k-value of 1** works best for the model when using the **best 3, 4, and 5 features**.
# 
# ## K-fold cross validation
# 
# Now, let's modify the `knn_train_test()` function to use k-fold cross validation. We will use the default `n_neighbors` of 5 and will loop through even fold counts from 2 to 10.

# In[392]:


from sklearn.model_selection import KFold, cross_val_score

def knn_train_test_kfold(train_col, target_col, df, folds=range(2, 12, 2)):
    avg_rmses = {}
    for f in folds:
        kf = KFold(f, shuffle=True, random_state=1)
        knn = KNeighborsRegressor()
        mse = cross_val_score(knn, df[[train_col]], df[target_col],
                              scoring='neg_mean_squared_error', cv=kf)
        rmse = np.sqrt(np.absolute(mse))
        avg_rmse = np.mean(rmse)
        avg_rmses[f] = avg_rmse
    return avg_rmses


# In[393]:


diff_folds = {}
for col in numeric_cols[:-1]:
    diff_folds[col] = knn_train_test_kfold(col, 'price', cars, folds=range(2, 12, 2))

diff_folds_df = pd.DataFrame(diff_folds)
diff_folds_df


# We see that for 6 of the 14 feature columns (about 43%), 10 folds give the best result. Because of that, we will use 10 folds for the rest of the tests.
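# We can double-check this by tallying, for each feature column, which fold setting produced the lowest average RMSE (a small sketch based on the `diff_folds_df` frame built above).

# In[ ]:


# For each feature column, find the fold count with the lowest average RMSE,
# then count how often each fold setting wins.
diff_folds_df.idxmin().value_counts()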
# ## Varying nearest neighbours
# 
# Now let's modify `knn_train_test_kfold()` so that the number of nearest neighbors varies over 1, 3, 5, 7, and 9.

# In[413]:


def knn_train_test_kfold(train_col, target_col, df, folds=10):
    avg_rmses = {}
    kf = KFold(folds, shuffle=True, random_state=1)
    k_values = range(1, 10, 2)
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k)
        mse = cross_val_score(knn, df[[train_col]], df[target_col],
                              scoring='neg_mean_squared_error', cv=kf)
        rmse = np.sqrt(np.absolute(mse))
        avg_rmse = np.mean(rmse)
        avg_rmses[k] = avg_rmse
    return avg_rmses


# In[414]:


cols_k = {}
for col in numeric_cols[:-1]:
    cols_k[col] = knn_train_test_kfold(col, 'price', cars, folds=10)

cols_k_df = pd.DataFrame(cols_k)
cols_k_df


# Now, let's take the results above for the k values 1, 3, 5, 7, and 9 and average them for each feature. We will then use the top features to calculate the RMSE for more than one column over 10 folds.

# In[416]:


# Calculate the average RMSE across the different `k` values for each feature.
avg_rmse_k = {}
for k, v in cols_k.items():
    key = k
    value = np.mean(list(v.values()))
    avg_rmse_k[key] = value

avg_rmse_k_series = pd.Series(avg_rmse_k)
avg_rmse_k_series = avg_rmse_k_series.sort_values()
avg_rmse_k_series


# In[418]:


def knn_train_test_kfold(train_col, target_col, df, folds=10):
    kf = KFold(folds, shuffle=True, random_state=1)
    knn = KNeighborsRegressor()
    mse = cross_val_score(knn, df[train_col], df[target_col],
                          scoring='neg_mean_squared_error', cv=kf)
    rmse = np.sqrt(np.absolute(mse))
    avg_rmse = np.mean(rmse)
    return avg_rmse


# In[419]:


features = {}
for x in range(2, 7):
    result = knn_train_test_kfold(list(avg_rmse_k_series.index[:x]), 'price', cars, folds=10)
    features["{} best features".format(x)] = result

features


# It looks like the **10-fold model** with the default **k value of 5** performs best with **2 features**.
# 
# Let's take the best feature combinations from above (the best 2, 3, 4, and 5 features) and calculate the RMSE with the number of nearest neighbors varying from 1 to 25, using 10 folds.

# In[420]:


def knn_train_test_kfold(train_col, target_col, df, folds=10):
    kf = KFold(folds, shuffle=True, random_state=1)
    avg_rmses = {}
    for k in range(1, 26):
        knn = KNeighborsRegressor(n_neighbors=k)
        mse = cross_val_score(knn, df[train_col], df[target_col],
                              scoring='neg_mean_squared_error', cv=kf)
        rmse = np.sqrt(np.absolute(mse))
        avg_rmse = np.mean(rmse)
        avg_rmses[k] = avg_rmse
    return avg_rmses


# In[421]:


features_25 = {}
feature_counts = [2, 3, 4, 5]
for x in feature_counts:
    result = knn_train_test_kfold(list(avg_rmse_k_series.index[:x]), 'price', cars, folds=10)
    features_25["{} best features".format(x)] = result

features_25


# In[422]:


for k, v in features_25.items():
    x = list(v.keys())
    y = list(v.values())
    plt.plot(x, y, label=k)

plt.legend(loc='lower right', frameon=False)
plt.xlabel('K-value')
plt.ylabel('RMSE')
plt.title('RMSE for k-values 1 to 25\n10 Folds')


# It looks like the **best 2, 3, and 4 features** produce the most accurate results with a **k value of 2**, whereas the **best 5 features** work best with a **k value of 1**.

# In[ ]:
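# As a compact alternative to the manual loops, the same search over `n_neighbors` can be expressed with scikit-learn's `GridSearchCV`. The sketch below assumes the two best features from `avg_rmse_k_series` and the same 10 shuffled folds used above; it is an illustrative alternative, not part of the guided workflow.

# In[ ]:


from sklearn.model_selection import GridSearchCV

# Grid search over n_neighbors for the two best features, scored by RMSE
# over the same 10-fold shuffled cross-validation used above.
best_two = list(avg_rmse_k_series.index[:2])
grid = GridSearchCV(KNeighborsRegressor(),
                    param_grid={'n_neighbors': list(range(1, 26))},
                    scoring='neg_root_mean_squared_error',
                    cv=KFold(10, shuffle=True, random_state=1))
grid.fit(cars[best_two], cars['price'])

# best_score_ is the mean negative RMSE of the best setting; negate it to get the RMSE.
print(grid.best_params_, -grid.best_score_)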