import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
# Load the Ames housing data set (tab-separated despite the .tsv name
# being read via read_csv) and peek at the first rows.
data = pd.read_csv('AmesHousing.tsv', delimiter = '\t')
data.head(3)
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | ... | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 rows × 82 columns
def transform_features(DF):
    """Feature-engineering step for the housing data.

    Currently a no-op placeholder that returns the frame unchanged;
    intended to be extended with cleaning / feature-creation logic.

    Parameters
    ----------
    DF : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    # BUG fixed: the original returned the undefined name `features_list`,
    # which raised NameError the moment this function was called.
    return DF
def select_fatures(DF):
    """Return the list of feature column names to train on.

    NOTE(review): the name keeps the original typo ('fatures') so any
    existing callers keep working; consider renaming to select_features.

    Parameters
    ----------
    DF : pandas.DataFrame (currently unused; kept for the intended
        future NaN/target filtering sketched below).

    Returns
    -------
    list of str
    """
    # Candidate extensions kept from the exploration notes:
    # na_count = DF[numeric_cols].isna().sum()
    # bad_cols = na_count[na_count > 0]
    # feature_list = numeric_cols.drop(target)
    #
    # BUG fixed: the original returned the undefined name `feature_list`
    # (every assignment to it was commented out), raising NameError.
    # Start simple with the single strongest predictor of SalePrice.
    feature_list = ['Gr Liv Area']
    return feature_list
def train_and_test(DF, transform=False, feature_list=None):
    """Fit a linear regression on the first 1460 rows and report RMSE.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain every column in `feature_list` plus 'SalePrice'.
    transform : callable or bool, optional
        If a callable is given it is applied to DF before splitting.
        The original boolean flag is still accepted: any falsy value
        (including the default False) is a no-op.
    feature_list : sequence of str, optional
        Columns to train on; defaults to ['Gr Liv Area']. The target
        column is always excluded from the features (see below).

    Returns
    -------
    (float, float)
        (train_rmse, test_rmse). Also printed, as before.
    """
    # Avoid the mutable-default-argument pitfall of the original
    # (`feature_list=['Gr Liv Area']` is shared across calls).
    if feature_list is None:
        feature_list = ['Gr Liv Area']
    target = 'SalePrice'
    # BUG fixed: the original did `transform(DF)` on the *bool* flag,
    # which raised TypeError whenever transform=True was passed.
    if callable(transform):
        DF = transform(DF)
    # BUG fixed: guard against data leakage. If the target sneaks into
    # feature_list (e.g. straight from a correlation index) the model
    # "predicts" SalePrice from SalePrice, producing the absurd ~1e-11
    # RMSE seen later in this notebook.
    features = [col for col in feature_list if col != target]
    # Holdout split: first 1460 rows train, the rest test.
    train = DF[:1460]
    test = DF[1460:]
    lr = LinearRegression()
    lr.fit(train[features], train[target])
    train_prediction = lr.predict(train[features])
    test_prediction = lr.predict(test[features])
    # mean_squared_error is documented as (y_true, y_pred); the metric
    # is symmetric so results are identical, but keep the proper order.
    train_rmse = math.sqrt(mse(train[target], train_prediction))
    test_rmse = math.sqrt(mse(test[target], test_prediction))
    print('train_rmse {}, \n test_rmse {}'.format(train_rmse,test_rmse))
    return train_rmse, test_rmse
train_and_test(data)
train_rmse 56034.362001412796, test_rmse 57088.25161263909
# Derived feature: years between construction and the last remodel.
# (Negative values indicate bad data and are dropped below.)
data['years_until_remod'] = (data['Year Remod/Add'] - data['Year Built'])
# Select numeric columns *after* adding the derived column so it is
# included. (The original computed this list twice -- once before the
# new column existed; that first pass was dead code and is removed.)
numeric_cols = data.select_dtypes(['int64','float64']).columns
# Drop identifiers ('Order', 'PID'), columns made redundant by the
# derived feature, and columns with heavy missingness.
num_data = data[numeric_cols].drop(columns = ['Order',
                                              'PID',
                                              'Year Built',
                                              'Year Remod/Add',
                                              'Lot Frontage',
                                              'Garage Yr Blt',
                                              'MS SubClass'])
# Remove rows where the remodel predates construction (bad data),
# then any remaining rows with missing values.
num_data = num_data[num_data['years_until_remod'] >= 0]
num_data = num_data.dropna(axis=0)
cleaned_data = num_data
print(cleaned_data.shape, ' ', data.shape)
(2903, 33) (2930, 83)
cleaned_data is now the cleaned version of the original data set. It contains only the useful numeric columns, and has lost only 27 rows of information.
import seaborn as sns
# Correlation heatmap across all remaining numeric columns -- used to
# eyeball which features move together with SalePrice.
sns.heatmap(cleaned_data.corr())
plt.show()
There appears to be a high correlation between price and: Overall Qual, Mas Vnr Area, Total Bsmt SF, 1st Flr SF, Gr Liv Area, Full Bath, Garage Cars, and Garage Area.
# Correlation of every numeric column with the target.
clean_corr = cleaned_data.corr()['SalePrice']
# Keep the moderately-to-strongly correlated columns, strongest first.
hi_corr = clean_corr[clean_corr > .4].sort_values(ascending=False)
print(hi_corr)
# BUG fixed: exclude the target itself (correlation exactly 1.0) from
# the feature index. Feeding 'SalePrice' back in as a feature is the
# data leakage that produced the ~1e-11 RMSE observed below.
hi_corr_index = hi_corr.index.drop('SalePrice')
SalePrice 1.000000 Overall Qual 0.799085 Gr Liv Area 0.708272 Garage Cars 0.646959 Garage Area 0.639152 Total Bsmt SF 0.634484 1st Flr SF 0.625433 Full Bath 0.545984 Mas Vnr Area 0.508636 TotRms AbvGrd 0.496305 Fireplaces 0.475058 BsmtFin SF 1 0.433849 Name: SalePrice, dtype: float64
train_and_test(cleaned_data, feature_list = hi_corr_index)
train_rmse 1.1867434335950793e-11, test_rmse 1.1984677298704358e-11
The above results say that my model predicts to within about 1e-11 dollars — far too good to be true. The cause is data leakage: hi_corr_index still contains 'SalePrice' itself (its self-correlation is 1.0), so the model is being trained with the target as one of its features.