#!/usr/bin/env python
# coding: utf-8

# # Guided Project: Predicting House Sale Prices
# 
# In this guided project, we will work with housing data for the city of Ames, Iowa, United States from 2006 to 2010. We will train a model that predicts sale prices from the features in this dataset.
# 
# More on why the data was collected can be read [here](https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627). Information about the different columns in the data can be found [here](https://s3.amazonaws.com/dq-content/307/data_description.txt).
# 
# Let's start by setting up a pipeline of functions that will let us quickly iterate on different models.

# In[26]:

import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import seaborn as sns

get_ipython().run_line_magic('matplotlib', 'inline')


# In[27]:

ames = pd.read_csv('AmesHousing.tsv', delimiter='\t')
ames.shape


# In[28]:

def transform_features(df):
    return df


def select_features(df):
    return df[['Gr Liv Area', 'SalePrice']]


def train_and_test(data):
    train = data[:1460]
    test = data[1460:]

    numeric_train = train.select_dtypes(include=['float', 'integer'])
    numeric_test = test.select_dtypes(include=['float', 'integer'])

    numeric_train = numeric_train.drop('SalePrice', axis=1)
    features = numeric_train.columns

    lr = LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], predictions)
    rmse = np.sqrt(mse)

    return rmse


# In[29]:

data = transform_features(ames)
selected = select_features(data)
rmse = train_and_test(selected)
rmse


# ## Feature Engineering
# 
# Let's now start removing features with many missing values, diving deeper into potential categorical features, and transforming text and numerical columns. We will update `transform_features()` so that any column with more than 5% missing values is dropped. We also need to remove any columns that leak information about the sale (e.g. the year the sale happened). In general, the goal of this function is to:
# 
# - remove features that we don't want to use in the model, based on the number of missing values or data leakage
# - transform features into the proper format (numerical to categorical, scaling numerical values, filling in missing values, etc.)
# - create new features by combining other features
# 
# Next, we need to get more familiar with the remaining columns by reading the [data documentation](https://s3.amazonaws.com/dq-content/307/data_description.txt) for each column, determining what transformations are necessary (if any), and more.

# In[30]:

# Which columns contain more than 5% missing values? Let's drop them
# (5% of the 2930 rows is about 146).
sum_missing = ames.isnull().sum()
over_5 = sum_missing[sum_missing > 146].index
ames = ames.drop(over_5, axis=1)

# For the numerical columns, let's fill in the missing values using
# the most popular value for that column.
numeric_missing = ames.select_dtypes(include=['float', 'integer']).isnull().sum()
to_fix = numeric_missing[numeric_missing > 0].sort_values()
to_fix


# In[31]:

for col in to_fix.index:
    value = ames[col].mode()[0]
    ames[col] = ames[col].fillna(value)


# In[32]:

# Let's remove the text columns that have any missing values.
object_missing = ames.select_dtypes(include='object').isnull().sum()
over_1 = object_missing[object_missing > 0].index
ames = ames.drop(over_1, axis=1)


# In[33]:

ames.isnull().sum().sort_values(ascending=False)


# What new features can we create that better capture the information in some of the existing features?
# 
# - create a column named `years_until_remod` which will store the difference between the `Year Remod/Add` and `Year Built` columns.
# - create a column named `years_when_sold` which will store the difference between the `Yr Sold` and `Year Built` columns.
# - create a column named `years_since_remod` which will store the difference between the `Yr Sold` and `Year Remod/Add` columns.

# In[34]:

ames['years_until_remod'] = ames['Year Remod/Add'] - ames['Year Built']
ames['years_when_sold'] = ames['Yr Sold'] - ames['Year Built']
ames['years_since_remod'] = ames['Yr Sold'] - ames['Year Remod/Add']


# Finally, let's drop the columns which aren't useful for machine learning. Such columns are:
# 
# - `Order` - which repeats the index
# - `PID` - which contains the property identification numbers
# 
# We also need to drop columns which leak information about the final sale:
# 
# - `Mo Sold`
# - `Sale Type`
# - `Sale Condition`
# - `Yr Sold`

# In[35]:

ames = ames.drop(['Order', 'PID', 'Mo Sold', 'Sale Type', 'Sale Condition', 'Yr Sold'], axis=1)


# Let's update the `transform_features()` function.

# In[36]:

def transform_features(df):
    sum_missing = df.isnull().sum()
    over_5 = sum_missing[sum_missing > len(df)/20].index
    df = df.drop(over_5, axis=1)

    numeric_missing = df.select_dtypes(include=['float', 'integer']).isnull().sum()
    to_fix = numeric_missing[numeric_missing > 0].sort_values()
    for col in to_fix.index:
        value = df[col].mode()[0]
        df[col] = df[col].fillna(value)

    object_missing = df.select_dtypes(include='object').isnull().sum()
    over_1 = object_missing[object_missing > 0].index
    df = df.drop(over_1, axis=1)

    df['years_until_remod'] = df['Year Remod/Add'] - df['Year Built']
    df['years_when_sold'] = df['Yr Sold'] - df['Year Built']
    df['years_since_remod'] = df['Yr Sold'] - df['Year Remod/Add']

    df = df.drop(['Order', 'PID', 'Mo Sold', 'Sale Type', 'Sale Condition', 'Yr Sold'], axis=1)
    return df


transformed_df = transform_features(pd.read_csv('AmesHousing.tsv', delimiter='\t'))
filtered_df = select_features(transformed_df)
rmse = train_and_test(filtered_df)
rmse


# ## Feature Selection
# 
# Now that we have cleaned and transformed a lot of the features in the data set, it's time to move on to feature selection for the numerical features.
# 
# We will first generate a correlation heatmap matrix of the numerical features in the training data set.
# 
# - Which features correlate strongly with our target column, `SalePrice`?
# - Calculate the correlation coefficients for the columns that seem to correlate well with `SalePrice`. Because we have a pipeline in place, it's easy to try different features and see which ones result in a better cross-validation score.
# In[37]:

numeric = transformed_df.select_dtypes(include=['float', 'integer'])
numeric.head()


# In[38]:

corr = numeric.corr()['SalePrice'].abs().sort_values(ascending=False)
corr


# In[39]:

# Let's drop the columns with a correlation below 0.4:
to_drop = corr[corr < 0.4].index
transformed_df = transformed_df.drop(to_drop, axis=1)


# In[40]:

transformed_df.head()


# In[41]:

# Let's make a heatmap to see if there are columns that are strongly correlated with each other.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(transformed_df.corr(), ax=ax)


# It looks like the `Garage Area` and `Garage Cars` columns are strongly correlated. Let's have a look at their [description in the documentation](https://s3.amazonaws.com/dq-content/307/data_description.txt):
# 
# - `Garage Area` - Size of garage in square feet
# - `Garage Cars` - Size of garage in car capacity
# 
# Another strongly correlated pair is:
# 
# - `TotRms AbvGrd` - Total rooms above grade (does not include bathrooms)
# - `Gr Liv Area` - Above grade (ground) living area square feet
# 
# Let's drop the `Garage Cars` and `TotRms AbvGrd` columns.

# In[42]:

transformed_df = transformed_df.drop(['TotRms AbvGrd', 'Garage Cars'], axis=1)


# Which columns in the data frame should be converted to the categorical data type? All of the columns that can be categorized as nominal variables are candidates for being converted to categorical. Here are some other things we should think about:
# 
# - If a categorical column has hundreds of unique values (or categories), should we keep it?
#     - When we dummy code such a column, hundreds of columns will need to be added back to the data frame.
# - Which categorical columns have only a few unique values but more than 95% of the values belonging to a single category? This would be similar to a low-variance numerical feature (no variability in the data for the model to capture). A quick check for this is sketched further below, right before the updated `select_features()`.

# In[43]:

nominal = ['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood',
           'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
           'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air',
           'Garage Type', 'Misc Feature']


# - Which columns are currently numerical but need to be encoded as categorical instead?

# In[44]:

cols_to_cat = []
for col in nominal:
    if col in transformed_df.columns:
        cols_to_cat.append(col)

# Let's see how many unique values each column has.
unique_count = transformed_df[cols_to_cat].apply(lambda col: len(col.unique()))

# Let's drop the columns with more than 10 unique values.
to_drop = unique_count[unique_count > 10].index
transformed_df = transformed_df.drop(to_drop, axis=1)


# In[45]:

# Let's convert the text columns to the categorical data type and dummy code them.
text_cols = transformed_df.select_dtypes(include=['object'])
for col in text_cols.columns:
    transformed_df[col] = transformed_df[col].astype('category')

transformed_df = pd.concat([
    transformed_df,
    pd.get_dummies(transformed_df.select_dtypes(include=['category']))
], axis=1).drop(text_cols.columns, axis=1)


# Let's update the `select_features()` function.
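# Before doing so, here is an optional, minimal sketch of the "more than 95% in one category" check raised above. Because `transformed_df` has already been dummy coded at this point, the sketch re-runs `transform_features()` on a fresh copy of the data so that the original text columns are still available; it only prints information and is not part of the pipeline.

# In[ ]:

# Optional sketch: flag candidate categorical columns dominated by a single value.
# We reload and re-transform the data because transformed_df no longer has text columns.
check_df = transform_features(pd.read_csv('AmesHousing.tsv', delimiter='\t'))
for col in check_df.select_dtypes(include=['object']).columns:
    # value_counts sorts descending, so the first entry is the most common category's share.
    top_share = check_df[col].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(col, round(top_share, 3))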
# In[46]:

def select_features(df):
    numeric = df.select_dtypes(include=['float', 'integer'])
    corr = numeric.corr()['SalePrice'].abs().sort_values(ascending=False)
    to_drop = corr[corr < 0.4].index
    df = df.drop(to_drop, axis=1)
    df = df.drop(['TotRms AbvGrd', 'Garage Cars'], axis=1)

    nominal = ['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood',
               'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
               'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air',
               'Garage Type', 'Misc Feature']
    cols_to_cat = []
    for col in nominal:
        if col in df.columns:
            cols_to_cat.append(col)

    unique_count = df[cols_to_cat].apply(lambda col: len(col.unique()))
    drop = unique_count[unique_count > 10].index
    df = df.drop(drop, axis=1)

    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols.columns:
        df[col] = df[col].astype('category')
    df = pd.concat([
        df,
        pd.get_dummies(df.select_dtypes(include=['category']))
    ], axis=1).drop(text_cols.columns, axis=1)

    return df


# In[47]:

transformed_df = transform_features(pd.read_csv('AmesHousing.tsv', delimiter='\t'))
filtered_df = select_features(transformed_df)
rmse = train_and_test(filtered_df)
rmse


# ## Train And Test
# 
# Now for the final part of the pipeline: training and testing. When iterating on different features, using simple validation is a good idea. Let's add a parameter named `k` that controls the type of cross validation that occurs.
# 
# - The optional `k` parameter should accept integer values, with a default value of `0`.
# 
# - When `k` equals `0`, perform holdout validation (what we already implemented):
#     - Select the first 1460 rows and assign to train.
#     - Select the remaining rows and assign to test.
#     - Train on train and test on test.
#     - Compute the RMSE and return it.
# 
# - When `k` equals `1`, perform simple cross validation:
#     - Shuffle the ordering of the rows in the data frame.
#     - Select the first 1460 rows and assign to `fold_one`.
#     - Select the remaining rows and assign to `fold_two`.
#     - Train on `fold_one` and test on `fold_two`.
#     - Train on `fold_two` and test on `fold_one`.
#     - Compute the average RMSE and return it.
# 
# - When `k` is greater than `1`, perform k-fold cross validation using `k` folds:
#     - Split the data into `k` folds, training and testing on each split.
#     - Calculate the average RMSE across the folds and return it.
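# As a small aside before the implementation (not part of the pipeline), here is a minimal sketch of how scikit-learn's `KFold` produces the positional train/test index arrays that the k-fold branch below iterates over with `.iloc`. The toy array is purely illustrative.

# In[ ]:

# A minimal sketch of KFold: with 6 rows and 3 folds (no shuffling),
# each iteration holds out 2 consecutive rows as the test set.
toy = np.arange(6)
for train_index, test_index in KFold(n_splits=3).split(toy):
    print('train:', train_index, 'test:', test_index)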
# In[48]:

def train_and_test(data, k=0):
    numeric_df = data.select_dtypes(include=['integer', 'float'])
    features = numeric_df.drop('SalePrice', axis=1).columns
    lr = LinearRegression()

    if k == 0:
        train = data[:1460]
        test = data[1460:]

        lr.fit(train[features], train['SalePrice'])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], predictions)
        rmse = np.sqrt(mse)
        return rmse

    elif k == 1:
        data = data.sample(frac=1).reset_index(drop=True)
        fold_one = data[:1460]
        fold_two = data[1460:]

        lr.fit(fold_one[features], fold_one['SalePrice'])
        predict_one = lr.predict(fold_two[features])
        mse_one = mean_squared_error(fold_two['SalePrice'], predict_one)
        rmse_one = np.sqrt(mse_one)

        lr.fit(fold_two[features], fold_two['SalePrice'])
        predict_two = lr.predict(fold_one[features])
        mse_two = mean_squared_error(fold_one['SalePrice'], predict_two)
        rmse_two = np.sqrt(mse_two)

        average_rmse = np.mean([rmse_one, rmse_two])
        return average_rmse

    else:
        kf = KFold(n_splits=k)
        rmses = []
        for train_index, test_index in kf.split(data):
            train = data.iloc[train_index]
            test = data.iloc[test_index]

            lr.fit(train[features], train['SalePrice'])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test['SalePrice'], predictions)
            rmses.append(np.sqrt(mse))

        average_rmse = np.mean(rmses)
        return average_rmse


# ## Final test
# 
# Finally, let's put the final versions of the three functions together and test the model.

# In[49]:

def transform_features(df):
    sum_missing = df.isnull().sum()
    over_5 = sum_missing[sum_missing > len(df)/20].index
    df = df.drop(over_5, axis=1)

    numeric_missing = df.select_dtypes(include=['float', 'integer']).isnull().sum()
    to_fix = numeric_missing[numeric_missing > 0].sort_values()
    for col in to_fix.index:
        value = df[col].mode()[0]
        df[col] = df[col].fillna(value)

    object_missing = df.select_dtypes(include='object').isnull().sum()
    over_1 = object_missing[object_missing > 0].index
    df = df.drop(over_1, axis=1)

    df['years_until_remod'] = df['Year Remod/Add'] - df['Year Built']
    df['years_when_sold'] = df['Yr Sold'] - df['Year Built']
    df['years_since_remod'] = df['Yr Sold'] - df['Year Remod/Add']
    # Drop any rows where the sale year precedes the build or remodel year (negative differences).
    df = df.drop(df[(df['years_when_sold'] < 0) | (df['years_since_remod'] < 0)].index)

    df = df.drop(['Order', 'PID', 'Mo Sold', 'Sale Type', 'Sale Condition', 'Yr Sold',
                  'Year Built', 'Year Remod/Add'], axis=1)
    return df


def select_features(df, corr_coef=0.4, unique_threshold=10):
    numeric = df.select_dtypes(include=['float', 'integer'])
    corr = numeric.corr()['SalePrice'].abs().sort_values(ascending=False)
    to_drop = corr[corr < corr_coef].index
    df = df.drop(to_drop, axis=1)
    df = df.drop(['TotRms AbvGrd', 'Garage Cars'], axis=1)

    nominal = ['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood',
               'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
               'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air',
               'Garage Type', 'Misc Feature']
    cols_to_cat = []
    for col in nominal:
        if col in df.columns:
            cols_to_cat.append(col)

    unique_count = df[cols_to_cat].apply(lambda col: len(col.unique()))
    drop = unique_count[unique_count > unique_threshold].index
    df = df.drop(drop, axis=1)

    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols.columns:
        df[col] = df[col].astype('category')
    df = pd.concat([
        df,
        pd.get_dummies(df.select_dtypes(include=['category']))
    ], axis=1).drop(text_cols.columns, axis=1)

    return df


def train_and_test(data, k=0):
    numeric_df = data.select_dtypes(include=['integer', 'float'])
    features = numeric_df.drop('SalePrice', axis=1).columns
    lr = LinearRegression()

    if k == 0:
        train = data[:1460]
        test = data[1460:]

        lr.fit(train[features], train['SalePrice'])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], predictions)
        rmse = np.sqrt(mse)
        return rmse

    elif k == 1:
        data = data.sample(frac=1).reset_index(drop=True)
        fold_one = data[:1460]
        fold_two = data[1460:]

        lr.fit(fold_one[features], fold_one['SalePrice'])
        predict_one = lr.predict(fold_two[features])
        mse_one = mean_squared_error(fold_two['SalePrice'], predict_one)
        rmse_one = np.sqrt(mse_one)

        lr.fit(fold_two[features], fold_two['SalePrice'])
        predict_two = lr.predict(fold_one[features])
        mse_two = mean_squared_error(fold_one['SalePrice'], predict_two)
        rmse_two = np.sqrt(mse_two)

        average_rmse = np.mean([rmse_one, rmse_two])
        print(rmse_one, rmse_two)
        return average_rmse

    else:
        kf = KFold(n_splits=k, shuffle=True)
        rmses = []
        for train_index, test_index in kf.split(data):
            train = data.iloc[train_index]
            test = data.iloc[test_index]

            lr.fit(train[features], train['SalePrice'])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test['SalePrice'], predictions)
            rmses.append(np.sqrt(mse))

        average_rmse = np.mean(rmses)
        print(rmses)
        return average_rmse


# In[50]:

df = transform_features(pd.read_csv('AmesHousing.tsv', delimiter='\t'))
selected = select_features(df)


# In[51]:

# Test with k=0 (holdout validation)
train_and_test(selected, k=0)


# In[52]:

# Test with k=1 (simple two-fold cross validation)
train_and_test(selected, k=1)


# In[53]:

# Test with k=4
train_and_test(selected, k=4)


# In[54]:

# Test with k=5
train_and_test(selected, k=5)
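# Since the final `select_features()` now exposes `corr_coef` and `unique_threshold` parameters, the pipeline makes it easy to compare different feature sets. The cell below is a minimal sketch (the threshold values are just examples) that loops over a few correlation cutoffs and reports the cross-validated RMSE; because the k-fold splits are shuffled, the numbers will vary between runs.

# In[ ]:

# Minimal sketch: compare cross-validated RMSE for a few correlation thresholds.
# The RMSE values change between runs because train_and_test() shuffles the folds.
df = transform_features(pd.read_csv('AmesHousing.tsv', delimiter='\t'))
for threshold in [0.3, 0.35, 0.4]:
    selected = select_features(df.copy(), corr_coef=threshold)
    rmse = train_and_test(selected, k=4)
    print('corr_coef =', threshold, '-> average RMSE:', round(rmse, 2))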