import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
# Ames, Iowa housing data -- tab-separated values file.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
def transform_features(df):
    """Pipeline placeholder: return the data frame unchanged.

    This baseline version does no cleaning; feature transformation is
    filled in by a later iteration of the same function.
    """
    return df
def select_features(df):
    """Return just the baseline feature and the target column.

    Gr Liv Area is the single predictor for the first-pass model;
    SalePrice is the regression target.
    """
    return df.loc[:, ["Gr Liv Area", "SalePrice"]]
def train_and_test(df):
    """Fit a linear regression and return the test-set RMSE.

    Splits df into train (first 1460 rows) and test (the rest), uses
    every numeric column except SalePrice as a feature, and reports the
    root-mean-squared error of the predictions on the test rows.
    """
    train = df[:1460]
    test = df[1460:]
    # The model can only consume numeric columns, so restrict both
    # halves to integer/float dtypes.
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])
    # Every numeric column except the target is a feature.
    features = numeric_train.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()
    # Fit and score on the numeric frames (the original computed
    # numeric_test but never used it, indexing the raw frames instead).
    lr.fit(numeric_train[features], numeric_train["SalePrice"])
    predictions = lr.predict(numeric_test[features])
    mse = mean_squared_error(numeric_test["SalePrice"], predictions)
    rmse = np.sqrt(mse)
    return rmse
# Run the pipeline end to end: transform, select features, evaluate.
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)
rmse
57088.25161263909
Handle missing values:
1: All columns: Drop any with more than 5% missing values for now.
## Series object: column name -> number of missing values
num_missing = df.isnull().sum()
# Filter Series to columns where more than 5% of the rows are missing
drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()
# Drop those columns from the data frame. Note the use of the .index accessor
# (the Series index holds the column names to drop).
df = df.drop(drop_missing_cols.index, axis=1)
2: Text columns: Drop any with 1 or more missing values for now.
## Series object: column name -> number of missing values, text columns only
text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
## Filter Series to text columns containing *any* missing values
drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]
df = df.drop(drop_missing_cols_2.index, axis=1)
3: Numerical columns: For columns with missing values, fill in with the most common value in that column
## Compute column-wise missing value counts for the numeric columns.
## NOTE(review): 'int' maps to the platform integer and can miss int64
## columns on some builds; columns containing NaN are float dtype anyway,
## so the fillable set is still found here -- verify if run elsewhere.
num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()
# Keep columns with some missing values, but fewer than 5% of the rows.
fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
fixable_numeric_cols
BsmtFin SF 1 1 BsmtFin SF 2 1 Bsmt Unf SF 1 Total Bsmt SF 1 Garage Cars 1 Garage Area 1 Bsmt Full Bath 2 Bsmt Half Bath 2 Mas Vnr Area 23 dtype: int64
## Compute the most common value for each column in `fixable_numeric_cols`.
## mode() returns a data frame; the first record maps column name -> mode.
replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
replacement_values_dict
{'BsmtFin SF 1': 0.0, 'BsmtFin SF 2': 0.0, 'Bsmt Unf SF': 0.0, 'Total Bsmt SF': 0.0, 'Garage Cars': 2.0, 'Garage Area': 0.0, 'Bsmt Full Bath': 0.0, 'Bsmt Half Bath': 0.0, 'Mas Vnr Area': 0.0}
## Use `pd.DataFrame.fillna()` to replace missing values with each
## column's most common value.
df = df.fillna(replacement_values_dict)
## Verify that every column has 0 missing values
## (a single key of 0 means all columns report zero nulls).
df.isnull().sum().value_counts()
0 64 dtype: int64
What new features can we create that better capture the information in some of the features?
# How old was the house when it sold? Negative values flag bad data.
years_sold = df['Yr Sold'] - df['Year Built']
years_sold[years_sold < 0]
2180 -1 dtype: int64
# Years between the last remodel and the sale; negatives flag bad data.
years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
years_since_remod[years_since_remod < 0]
1702 -1 2180 -2 2181 -1 dtype: int64
## Create new columns
df['Years Before Sale'] = years_sold
df['Years Since Remod'] = years_since_remod
## Drop rows with negative values for both of these new features
## NOTE(review): these row labels were hard-coded from the two
## inspections above -- they are only valid for this exact dataset.
df = df.drop([1702, 2180, 2181], axis=0)
## No longer need original year columns
df = df.drop(["Year Built", "Year Remod/Add"], axis = 1)
Drop columns that either aren't useful for ML or leak information about the final sale:
## Drop columns that aren't useful for ML (row identifiers)
df = df.drop(["PID", "Order"], axis=1)
## Drop columns that leak info about the final sale
## (they wouldn't be known before the sale happens)
df = df.drop(["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"], axis=1)
Let's update transform_features()
def transform_features(df):
    """Clean the Ames data: handle missing values, engineer age
    features, and drop identifier/leakage columns.

    Returns the cleaned data frame; steps mirror the exploratory
    analysis above.
    """
    # 1. Drop any column with more than 5% missing values.
    num_missing = df.isnull().sum()
    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()
    df = df.drop(drop_missing_cols.index, axis=1)

    # 2. Drop text columns with any missing values at all.
    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]
    df = df.drop(drop_missing_cols_2.index, axis=1)

    # 3. Fill remaining numeric gaps with each column's most common
    #    value. 'number' matches every numeric dtype ('int' can miss
    #    int64 on platforms where it maps to int32).
    num_missing = df.select_dtypes(include=['number']).isnull().sum()
    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
    if len(fixable_numeric_cols) > 0:
        replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
        df = df.fillna(replacement_values_dict)

    # 4. Engineer age-at-sale features from the raw year columns.
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']
    # Remove data-entry-error rows where either age is negative --
    # computed from the data instead of hard-coding row labels (the
    # labels 1702/2180/2181 only hold for this exact dataset/index).
    df = df[(df['Years Before Sale'] >= 0) & (df['Years Since Remod'] >= 0)]

    # 5. Drop row identifiers, the raw year columns, and columns that
    #    leak info about the final sale. 'Yr Sold' leaks the sale date
    #    and is dropped here as well, matching the exploratory step.
    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type",
                  "Yr Sold", "Year Built", "Year Remod/Add"], axis=1)
    return df
def select_features(df):
    """Return only the predictor column and the SalePrice target."""
    return df.loc[:, ["Gr Liv Area", "SalePrice"]]
def train_and_test(df):
    """Fit a linear regression and return the test-set RMSE.

    Splits df into train (first 1460 rows) and test (the rest), uses
    every numeric column except SalePrice as a feature, and reports the
    root-mean-squared error of the predictions on the test rows.
    """
    train = df[:1460]
    test = df[1460:]
    # The model can only consume numeric columns, so restrict both
    # halves to integer/float dtypes.
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])
    # Every numeric column except the target is a feature.
    features = numeric_train.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()
    # Fit and score on the numeric frames (the original computed
    # numeric_test but never used it, indexing the raw frames instead).
    lr.fit(numeric_train[features], numeric_train["SalePrice"])
    predictions = lr.predict(numeric_test[features])
    mse = mean_squared_error(numeric_test["SalePrice"], predictions)
    rmse = np.sqrt(mse)
    return rmse
# Re-run the full pipeline on a fresh copy of the raw data so the
# earlier in-place exploration doesn't affect the result.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)
rmse
55275.367312413066
## Select every numeric column for the correlation analysis.
## 'number' matches all numeric dtypes; spelling it ['int', 'float']
## misses int64 columns on platforms where 'int' maps to int32, which
## silently excluded 'SalePrice' and caused the KeyError below.
numerical_df = transform_df.select_dtypes(include=['number'])
numerical_df.head(5)
Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Bsmt Full Bath | Bsmt Half Bath | Garage Cars | Garage Area | |
---|---|---|---|---|---|---|---|---|---|
0 | 112.0 | 639.0 | 0.0 | 441.0 | 1080.0 | 1.0 | 0.0 | 2.0 | 528.0 |
1 | 0.0 | 468.0 | 144.0 | 270.0 | 882.0 | 0.0 | 0.0 | 1.0 | 730.0 |
2 | 108.0 | 923.0 | 0.0 | 406.0 | 1329.0 | 0.0 | 0.0 | 1.0 | 312.0 |
3 | 0.0 | 1065.0 | 0.0 | 1045.0 | 2110.0 | 1.0 | 0.0 | 2.0 | 522.0 |
4 | 0.0 | 791.0 | 0.0 | 137.0 | 928.0 | 0.0 | 0.0 | 2.0 | 482.0 |
## Magnitude of each numeric column's correlation with SalePrice,
## sorted ascending.
## NOTE(review): this raises KeyError if 'SalePrice' was excluded by
## the dtype selection above (as the traceback shows happened) --
## verify the select_dtypes filter actually keeps the int64 target.
abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
abs_corr_coeffs
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3077 try: -> 3078 return self._engine.get_loc(key) 3079 except KeyError: pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'SalePrice' During handling of the above exception, another exception occurred: KeyError Traceback (most recent call last) <ipython-input-40-fef770f21a34> in <module>() ----> 1 abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values() 2 abs_corr_coeffs ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key) 2686 return self._getitem_multilevel(key) 2687 else: -> 2688 return self._getitem_column(key) 2689 2690 def _getitem_column(self, key): ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key) 2693 # get column 2694 if self.columns.is_unique: -> 2695 return self._get_item_cache(key) 2696 2697 # duplicate columns & possible reduce dimensionality ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item) 2487 res = cache.get(item) 2488 if res is None: -> 2489 values = self._data.get(item) 2490 res = self._box_item_values(item, values) 2491 cache[item] = res ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath) 4113 4114 if not isna(item): -> 4115 loc = self.items.get_loc(item) 4116 else: 4117 indexer = np.arange(len(self.items))[isna(self.items)] ~\Anaconda3\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3078 return self._engine.get_loc(key) 3079 
except KeyError: -> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key)) 3081 3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance) pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'SalePrice'
# Inspect which columns actually survived the dtype selection.
print(numerical_df.columns)
Index(['Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Cars', 'Garage Area'], dtype='object')