Machine Learning - Housing Prices | Regression


Done By: Brian Tham

Check out my other projects on:

briantham.io

About this Dataset


This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import scipy.stats as st
# Suppress Future Warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv("kc_house_data.csv")
In [3]:
print(df.shape)
print(df.nunique())
(21613, 21)
id               21436
date               372
price             4028
bedrooms            13
bathrooms           30
sqft_living       1038
sqft_lot          9782
floors               6
waterfront           2
view                 5
condition            5
grade               12
sqft_above         946
sqft_basement      306
yr_built           116
yr_renovated        70
zipcode             70
lat               5034
long               752
sqft_living15      777
sqft_lot15        8689
dtype: int64
In [4]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB
None
In [5]:
df.head()
Out[5]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

In [48]:
df.corr()
Out[48]:
price bathrooms sqft_living sqft_lot sqft_above sqft_basement yr_built yr_renovated zipcode lat ... Bedrooms_3 Bedrooms_4 Bedrooms_5 Bedrooms_6 Bedrooms_7 Bedrooms_8 Bedrooms_9 Bedrooms_10 Bedrooms_11 Bedrooms_33
price 1.000000 0.518912 0.702035 0.089661 0.605567 0.323816 0.054012 0.126434 -0.053203 0.307003 ... -0.183648 0.177489 0.189925 0.087776 0.046995 0.037755 0.016064 0.008962 -0.000372 0.001851
bathrooms 0.518912 1.000000 0.701784 0.089988 0.599360 0.334909 0.377433 0.069322 -0.144088 0.049235 ... -0.166878 0.232540 0.265175 0.146150 0.091757 0.050548 0.050171 0.019878 0.008475 -0.000529
sqft_living 0.702035 0.701784 1.000000 0.172826 0.876597 0.435043 0.318049 0.055363 -0.199430 0.052529 ... -0.272404 0.353317 0.298006 0.148055 0.086562 0.045947 0.030756 0.020870 0.006815 -0.003406
sqft_lot 0.089661 0.089988 0.172826 1.000000 0.183512 0.015286 0.053080 0.007644 -0.129574 -0.085683 ... -0.015255 0.026631 0.011350 0.009485 0.005809 -0.003634 -0.003873 -0.001777 -0.001666 -0.001496
sqft_above 0.605567 0.599360 0.876597 0.183512 1.000000 -0.051943 0.423898 0.023285 -0.261190 -0.000816 ... -0.224832 0.338346 0.211493 0.096252 0.069528 0.025845 0.023074 0.009509 0.005024 -0.006148
sqft_basement 0.323816 0.334909 0.435043 0.015286 -0.051943 1.000000 -0.133124 0.071323 0.074845 0.110538 ... -0.144622 0.100140 0.222709 0.127151 0.049542 0.046992 0.020652 0.025518 0.004742 0.004434
yr_built 0.054012 0.377433 0.318049 0.053080 0.423898 -0.133124 1.000000 -0.224874 -0.346869 -0.148122 ... 0.025095 0.130070 0.030812 -0.030627 -0.011663 -0.012276 -0.022318 -0.004548 -0.012275 -0.005559
yr_renovated 0.126434 0.069322 0.055363 0.007644 0.023285 0.071323 -0.224874 1.000000 0.064357 0.029398 ... -0.019092 0.004612 0.011731 0.023658 0.013192 -0.005155 0.010396 -0.002476 0.032424 -0.001429
zipcode -0.053203 -0.144088 -0.199430 -0.129574 -0.261190 0.074845 -0.346869 0.064357 1.000000 0.267048 ... -0.024937 -0.104634 -0.040980 0.003215 0.006178 0.001791 0.010193 -0.008722 0.003567 0.003186
lat 0.307003 0.049235 0.052529 -0.085683 -0.000816 0.110538 -0.148122 0.029398 0.267048 1.000000 ... -0.056487 -0.019408 0.028817 0.016596 0.014833 0.012386 0.011150 0.003967 -0.000199 0.006271
long 0.021626 0.149157 0.240223 0.229521 0.343803 -0.144765 0.409356 -0.068372 -0.564072 -0.135512 ... 0.003579 0.120164 0.025417 -0.021510 -0.007991 -0.009559 -0.011530 0.000940 -0.007202 -0.005656
sqft_living15 0.585379 0.497484 0.756420 0.144608 0.731870 0.200355 0.326229 -0.002673 -0.279033 0.048858 ... -0.201265 0.303741 0.188249 0.048366 0.021353 0.009981 0.002548 0.003555 -0.005623 -0.006516
sqft_lot15 0.082447 0.086952 0.183286 0.718557 0.194050 0.017276 0.070958 0.007854 -0.147221 -0.086419 ... 0.001367 0.024002 0.003326 0.008222 0.004677 -0.004863 -0.005115 -0.001762 -0.001945 -0.002010
Floors_1.5 0.016023 -0.095399 -0.058316 0.013978 -0.053293 -0.021304 -0.380517 0.045535 0.143418 0.068018 ... -0.026902 0.031429 0.027085 0.008719 0.014171 -0.000990 -0.005188 -0.003668 -0.002118 -0.002118
Floors_2.0 0.232662 0.369959 0.420018 0.019461 0.578689 -0.211141 0.526940 0.003141 -0.184719 -0.052540 ... -0.120120 0.216322 0.059850 0.013063 0.010257 0.004053 0.009791 0.006923 0.008665 -0.005340
Floors_2.5 0.122770 0.079446 0.100509 0.004686 0.104773 0.012540 -0.028593 0.030001 0.023581 0.016589 ... -0.018567 0.007779 0.022752 0.057797 0.022052 -0.002125 0.063155 -0.001021 -0.000589 -0.000589
Floors_3.0 0.019750 0.063825 -0.054244 -0.043911 -0.014236 -0.085931 0.196846 -0.024820 0.102116 0.120427 ... 0.070733 -0.080283 -0.034490 -0.006787 -0.000517 -0.004191 -0.002847 -0.002013 -0.001162 -0.001162
Floors_3.5 0.020611 0.008056 0.009179 -0.005607 0.012249 -0.003870 0.012853 -0.004043 0.010317 0.011089 ... 0.001757 -0.013153 -0.005443 -0.002172 -0.000808 0.097634 -0.000321 -0.000227 -0.000131 -0.000131
watFront_1 0.266369 0.065428 0.103818 0.021604 0.072075 0.080588 -0.026161 0.092885 0.030285 -0.014274 ... -0.010837 -0.013664 0.014143 0.009349 -0.003658 -0.002139 -0.001453 -0.001027 -0.000593 -0.000593
View_1 0.092607 0.050049 0.066511 -0.008287 0.021839 0.097164 -0.034053 0.033563 0.043251 0.018019 ... -0.023353 0.012343 0.023569 -0.003976 0.021698 -0.003064 -0.002081 -0.001472 -0.000850 -0.000850
View_2 0.148418 0.099570 0.135285 0.037278 0.077861 0.135064 -0.044616 0.032590 0.052009 0.005065 ... -0.026897 0.012208 0.041669 0.019876 0.012348 -0.005298 -0.003599 0.016491 -0.001469 -0.001469
View_3 0.182880 0.119851 0.158885 0.073871 0.091663 0.158213 -0.018873 0.050668 0.040773 -0.013892 ... -0.029884 0.032458 0.027027 0.023466 -0.006524 0.008619 -0.002591 -0.001832 -0.001057 -0.001057
View_4 0.307932 0.110849 0.169460 0.019172 0.107625 0.150292 -0.020229 0.080818 0.040748 0.013969 ... -0.030822 0.003644 0.035703 0.034369 0.013180 -0.003003 -0.002040 -0.001442 -0.000833 -0.000833
Cond_2 -0.051917 -0.067614 -0.065324 0.037617 -0.058925 -0.025309 -0.067277 -0.008571 0.023615 -0.022650 ... -0.009602 -0.020979 -0.023345 0.003902 -0.003759 -0.002197 -0.001493 -0.001055 -0.000609 -0.000609
Cond_3 0.007131 0.123475 0.102413 -0.011452 0.194555 -0.151498 0.391719 0.069268 0.017798 0.042297 ... -0.013565 0.023362 -0.003094 -0.016160 0.000765 -0.001738 0.012250 -0.007798 0.005000 -0.009253
Cond_4 -0.030715 -0.114994 -0.083794 0.013157 -0.142486 0.092712 -0.257414 -0.054833 -0.060803 -0.057481 ... 0.027374 -0.028501 -0.001074 0.014645 -0.002471 -0.001783 -0.009948 0.010813 -0.004061 -0.004061
Cond_5 0.057585 -0.003155 -0.018136 -0.014497 -0.088453 0.127865 -0.244353 -0.030077 0.058646 0.025827 ... -0.015589 0.014152 0.016400 0.002455 0.004140 0.006846 -0.004870 -0.003444 -0.001988 0.023273
Grade_1 -0.007376 -0.018536 -0.013257 0.000947 -0.012309 -0.004481 -0.001854 -0.001429 -0.006858 -0.001436 ... -0.006210 -0.004649 -0.001924 -0.000768 -0.000285 -0.000167 -0.000113 -0.000080 -0.000046 -0.000046
Grade_3 -0.010733 -0.026908 -0.019028 0.003370 -0.016957 -0.007761 -0.010298 -0.002476 -0.010631 -0.019177 ... -0.010756 -0.008053 -0.003333 -0.001330 -0.000494 -0.000289 -0.000196 -0.000139 -0.000080 -0.000080
Grade_4 -0.032520 -0.049693 -0.056650 0.006190 -0.050233 -0.023573 -0.039768 -0.001472 -0.005180 -0.017343 ... -0.030923 -0.025054 -0.010368 -0.004138 -0.001538 -0.000899 -0.000611 -0.000432 -0.000249 -0.000249
Grade_5 -0.084513 -0.127585 -0.127055 0.022899 -0.108432 -0.060782 -0.128795 -0.009295 0.009456 -0.046521 ... -0.042388 -0.052912 -0.021703 -0.000180 -0.004466 -0.002611 -0.001773 -0.001254 -0.000724 -0.000724
Grade_6 -0.209329 -0.322861 -0.312096 -0.019164 -0.280453 -0.122919 -0.313452 0.008192 0.148866 -0.062725 ... -0.023006 -0.141357 -0.066479 -0.026486 -0.009762 -0.007916 -0.005377 -0.003802 -0.002195 -0.002195
Grade_7 -0.315803 -0.234908 -0.358513 -0.068004 -0.386797 -0.020268 -0.211814 -0.020668 0.070263 -0.040414 ... 0.157398 -0.137591 -0.058891 -0.012655 -0.010735 0.002290 0.008492 0.006004 0.008067 0.008067
Grade_8 0.004705 0.120300 0.071326 -0.024086 0.059621 0.036462 0.199846 0.004496 -0.052505 0.026337 ... 0.007825 0.057869 0.002166 0.012593 0.003272 0.001470 -0.004231 -0.007361 -0.004250 -0.004250
Grade_9 0.235897 0.203087 0.318419 0.049548 0.343198 0.018640 0.219956 0.019602 -0.087643 0.042223 ... -0.101614 0.157856 0.064624 0.000115 -0.002024 -0.003315 -0.006182 0.007672 -0.002524 -0.002524
Grade_10 0.340799 0.259672 0.369058 0.074335 0.376160 0.062053 0.148728 0.002240 -0.073181 0.052306 ... -0.091455 0.113100 0.070517 0.014387 0.014890 0.002691 -0.003921 -0.002773 -0.001601 -0.001601
Grade_11 0.357412 0.255943 0.345771 0.077035 0.341966 0.077705 0.098705 -0.003183 -0.058145 0.039394 ... -0.086537 0.082598 0.070141 0.024601 0.027062 -0.003364 0.018347 -0.001616 -0.000933 -0.000933
Grade_12 0.290834 0.166158 0.238806 0.061535 0.223412 0.077555 0.047053 -0.002884 -0.039395 0.017403 ... -0.046044 0.031374 0.047553 0.012036 0.014437 0.027719 -0.001078 -0.000762 -0.000440 -0.000440
Grade_13 0.211803 0.100505 0.144329 0.007758 0.126575 0.062683 0.004686 0.022997 0.003766 0.013143 ... -0.018605 -0.004616 0.036295 0.048017 -0.001030 -0.000602 -0.000409 -0.000289 -0.000167 -0.000167
Bedrooms_1 -0.058411 -0.119700 -0.125461 0.002743 -0.112286 -0.050263 -0.093017 0.020456 0.045904 0.004951 ... -0.088000 -0.065890 -0.027266 -0.010883 -0.004046 -0.002365 -0.001606 -0.001136 -0.000656 -0.000656
Bedrooms_2 -0.144571 -0.313781 -0.350008 -0.027807 -0.314731 -0.137460 -0.203107 -0.002876 0.200333 0.078832 ... -0.349277 -0.261520 -0.108222 -0.043196 -0.016058 -0.009387 -0.006376 -0.004508 -0.002603 -0.002603
Bedrooms_3 -0.183648 -0.166878 -0.272404 -0.015255 -0.224832 -0.144622 0.025095 -0.019092 -0.024937 -0.056487 ... 1.000000 -0.623946 -0.258200 -0.103058 -0.038311 -0.022395 -0.015212 -0.010756 -0.006210 -0.006210
Bedrooms_4 0.177489 0.232540 0.353317 0.026631 0.338346 0.100140 0.130070 0.004612 -0.104634 -0.019408 ... -0.623946 1.000000 -0.193327 -0.077165 -0.028685 -0.016768 -0.011390 -0.008053 -0.004649 -0.004649
Bedrooms_5 0.189925 0.265175 0.298006 0.011350 0.211493 0.222709 0.030812 0.011731 -0.040980 0.028817 ... -0.258200 -0.193327 1.000000 -0.031932 -0.011870 -0.006939 -0.004713 -0.003333 -0.001924 -0.001924
Bedrooms_6 0.087776 0.146150 0.148055 0.009485 0.096252 0.127151 -0.030627 0.023658 0.003215 0.016596 ... -0.103058 -0.077165 -0.031932 1.000000 -0.004738 -0.002770 -0.001881 -0.001330 -0.000768 -0.000768
Bedrooms_7 0.046995 0.091757 0.086562 0.005809 0.069528 0.049542 -0.011663 0.013192 0.006178 0.014833 ... -0.038311 -0.028685 -0.011870 -0.004738 1.000000 -0.001030 -0.000699 -0.000494 -0.000285 -0.000285
Bedrooms_8 0.037755 0.050548 0.045947 -0.003634 0.025845 0.046992 -0.012276 -0.005155 0.001791 0.012386 ... -0.022395 -0.016768 -0.006939 -0.002770 -0.001030 1.000000 -0.000409 -0.000289 -0.000167 -0.000167
Bedrooms_9 0.016064 0.050171 0.030756 -0.003873 0.023074 0.020652 -0.022318 0.010396 0.010193 0.011150 ... -0.015212 -0.011390 -0.004713 -0.001881 -0.000699 -0.000409 1.000000 -0.000196 -0.000113 -0.000113
Bedrooms_10 0.008962 0.019878 0.020870 -0.001777 0.009509 0.025518 -0.004548 -0.002476 -0.008722 0.003967 ... -0.010756 -0.008053 -0.003333 -0.001330 -0.000494 -0.000289 -0.000196 1.000000 -0.000080 -0.000080
Bedrooms_11 -0.000372 0.008475 0.006815 -0.001666 0.005024 0.004742 -0.012275 0.032424 0.003567 -0.000199 ... -0.006210 -0.004649 -0.001924 -0.000768 -0.000285 -0.000167 -0.000113 -0.000080 1.000000 -0.000046
Bedrooms_33 0.001851 -0.000529 -0.003406 -0.001496 -0.006148 0.004434 -0.005559 -0.001429 0.003186 0.006271 ... -0.006210 -0.004649 -0.001924 -0.000768 -0.000285 -0.000167 -0.000113 -0.000080 -0.000046 1.000000

51 rows × 51 columns

In [6]:
plt.figure(figsize = (12, 6))

plt.subplot(121)
plt.title('Price Distribuition')
sns.distplot(df['price'])

plt.subplot(122)
g1 = plt.scatter(range(df.shape[0]), np.sort(df.price.values))
g1= plt.title("Price Curve Distribuition", fontsize=15)
g1 = plt.xlabel("")
g1 = plt.ylabel("Amount(US)", fontsize=12)

plt.subplots_adjust(wspace = 0.3, hspace = 0.5,
                    top = 0.9)
plt.show()
In [7]:
print("Price Min")
print(df['price'].min())
print("Price Mean")
print(df['price'].mean())
print("Price Median")
print(df['price'].median())
print("Price Max")
print(df['price'].max())
print("Price Std")
print(df['price'].std())
Price Min
75000.0
Price Mean
540088.1417665294
Price Median
450000.0
Price Max
7700000.0
Price Std
367127.1964826997
In [8]:
plt.figure(figsize = (8, 5))
sns.jointplot(df.sqft_living, df.price, 
              alpha = 0.5)
plt.xlabel('Sqft Living')
plt.ylabel('Sale Price')
plt.show()
<Figure size 576x360 with 0 Axes>
In [9]:
condition = df['condition'].value_counts()

print("Condition counting: ")
print(condition)

fig, ax = plt.subplots(ncols=2, figsize=(14,5))
sns.countplot(x='condition', data=df, ax=ax[0])
sns.boxplot(x='condition', y= 'price',
            data=df, ax=ax[1])
plt.show()
Condition counting: 
3    14031
4     5679
5     1701
2      172
1       30
Name: condition, dtype: int64
In [10]:
plt.figure(figsize = (12,8))
g = sns.FacetGrid(data=df, hue='condition',height= 5, aspect=2)
g.map(plt.scatter, "sqft_living", "price")
plt.show()
<Figure size 864x576 with 0 Axes>

Exploring bathrooms columns by price and conditions

In [11]:
df["bathrooms"] = df['bathrooms'].round(0).astype(int)

print("Freuency bathroom description:")
print(df["bathrooms"].value_counts())

plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bathrooms", data=df,
                    ax=ax1)
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("Bathrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_xlabel("count")

ax2 = plt.subplot(222)
ax2 = sns.boxplot(x="bathrooms", y='price',
                  data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Bathrooms distribution price", fontsize=15)
ax2.set_xlabel("Bathrooms number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.stripplot(x="bathrooms", y="price",
                    data=df, alpha=0.5,
                    jitter=True, hue="condition")
ax0.set_title("Better view distribuition through price", fontsize=15)
ax0.set_xlabel("Bathroom number")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)

plt.show()
Freuency bathroom description:
2    13851
1     3933
3     2527
4     1201
5       57
6       24
0       14
8        4
7        2
Name: bathrooms, dtype: int64
In [12]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)
for val in range(1,6,1):
    indeX = df.condition == val
    ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.5)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t Conditions')

ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'condition', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot Condition & Price', fontsize = 12)

ax3 = plt.subplot2grid((2,2), (1,1))
cubicQual = df.groupby(['condition'])['price'].mean().round(0)
testTrain = df.loc[:, ['condition', 'price']].copy()
testTrain['sqCond'] = np.power(testTrain['condition'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['condition', 'sqCond']], testTrain['price'])
y_pred = mdl.predict(testTrain[['condition', 'sqCond']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['condition'], testTrain['price'],  color='black')
ax3.plot(testTrain['condition'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('Condition Rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('Condition Effect to Sale Price', fontsize = 14)
plt.show()
Mean squared error: 134582326689.42
In [13]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)

for val in range(0,5,1):
    indeX = df.view == val
    ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.4)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t View')

ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'view', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot View & Price', fontsize = 12)

ax3 = plt.subplot2grid((2,2), (1,1))
cubicV = df.groupby(['view'])['price'].mean().round(0)
testTrain = df.loc[:, ['view', 'price']].copy()
testTrain['sqview'] = np.power(testTrain['view'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['view', 'sqview']], testTrain['price'])
y_pred = mdl.predict(testTrain[['view', 'sqview']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['view'], testTrain['price'],  color='black')
ax3.plot(testTrain['view'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('View rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('"VIEW" Effect To SalePrice', fontsize = 14)
plt.show()
Mean squared error: 112971203793.79
In [15]:
bedrooms = df.bedrooms.value_counts()


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)


ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bedrooms", data=df,
                    ax=ax1)
ax1.set_title("bedrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_ylabel("count")

ax2 = plt.subplot(222)
ax2 = sns.regplot(x="bedrooms", y='price', 
                  data=df, ax=ax2, x_jitter=True)
ax2.set_title("Bedrooms distribution price", fontsize=15)
ax2.set_xlabel("Bedrooms number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.boxenplot(x="bedrooms", y="price",
                    data=df)
ax0.set_title("Better understaning price", fontsize=15)
ax0.set_xlabel("Bedrooms")
ax0.set_ylabel("log Price(US)")

plt.show()
In [16]:
print("Floors counting description")
print(df['floors'].value_counts())


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="floors", y='price', 
                    data=df, ax=ax1, )
ax1.set_title("Floors counting", fontsize=15)
ax1.set_xlabel("Floors number")
ax1.set_ylabel("Count")

ax2 = plt.subplot(222)
ax2 = sns.countplot(x="floors",
                  data=df, ax=ax2)
ax2.set_title("Floor distribution by price", fontsize=15)
ax2.set_xlabel("Floor number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.regplot(x="floors", y="price", #I need to change floors by sqft_living and hue bye floors
                    data=df, x_jitter=True)
ax0.set_title("Better understaning price by floor", fontsize=15)
ax0.set_xlabel("Floor")
ax0.set_ylabel("log Price(US)")

plt.show()
Floors counting description
1.0    10680
2.0     8241
1.5     1910
3.0      613
2.5      161
3.5        8
Name: floors, dtype: int64
In [17]:
plt.figure(figsize = (12,8))
g=sns.lmplot(x="sqft_living", y="price", aspect=1.8,
                    data=df, hue="floors", fit_reg=False)
g.set_titles("Floors by sqft_living and price", fontsize=15)
g.set_xlabels("Sqft Living")
g.set_ylabels("Price(US)")
plt.show()
<Figure size 864x576 with 0 Axes>
In [18]:
print("Grade counting description")
print(df['grade'].value_counts())


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="grade", y='price', 
                    data=df, ax=ax1, )
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("grade counting", fontsize=15)
ax1.set_xlabel("Grade number")
ax1.set_ylabel("Count")

ax2 = plt.subplot(222)
ax2 = sns.countplot(x="grade",
                  data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Grade distribution price", fontsize=15)
ax2.set_xlabel("Grade number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.regplot(x="grade", y="price",
                    data=df, x_jitter=True)
ax0.set_title("Better understaning price by grade", fontsize=15)
ax0.set_xlabel("Grade")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)

plt.show()
Grade counting description
7     8981
8     6068
9     2615
6     2038
10    1134
11     399
5      242
12      90
4       29
13      13
3        3
1        1
Name: grade, dtype: int64
In [19]:
#Clearly view of bathrooms and bedrooms correlation

bath = ['bathrooms', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath[0]], df[bath[1]]).style.background_gradient(cmap = cm)
Out[19]:
bedrooms 0 1 2 3 4 5 6 7 8 9 10 11 33
bathrooms
0 7 4 2 0 1 0 0 0 0 0 0 0 0
1 2 167 1587 1800 327 43 6 1 0 0 0 0 0
2 4 28 1129 7186 4709 695 90 7 1 0 1 0 1
3 0 0 41 656 1219 506 88 7 6 2 1 1 0
4 0 0 1 182 601 321 72 17 4 3 0 0 0
5 0 0 0 0 19 22 12 2 1 0 1 0 0
6 0 0 0 0 6 13 2 2 1 0 0 0 0
7 0 0 0 0 0 1 0 1 0 0 0 0 0
8 0 0 0 0 0 0 2 1 0 1 0 0 0
In [20]:
bath_cond = ['bathrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath_cond[0]], df[bath_cond[1]]).style.background_gradient(cmap = cm)
Out[20]:
condition 1 2 3 4 5
bathrooms
0 1 1 9 3 0
1 20 91 2211 1300 311
2 9 75 9000 3702 1065
3 0 5 1713 553 256
4 0 0 1026 109 66
5 0 0 46 8 3
6 0 0 21 3 0
7 0 0 2 0 0
8 0 0 3 1 0
In [21]:
bed_cond = ['bedrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bed_cond[0]], df[bed_cond[1]]).style.background_gradient(cmap = cm)
Out[21]:
condition 1 2 3 4 5
bedrooms
0 1 1 10 1 0
1 4 11 124 48 12
2 12 51 1779 718 200
3 8 69 6308 2711 728
4 4 36 4580 1682 580
5 0 1 1031 418 151
6 1 3 158 87 23
7 0 0 25 9 4
8 0 0 8 3 2
9 0 0 6 0 0
10 0 0 1 2 0
11 0 0 1 0 0
33 0 0 0 0 1
In [22]:
cond_water = ['condition', 'waterfront']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[cond_water[0]], df[cond_water[1]]).style.background_gradient(cmap = cm)
Out[22]:
waterfront 0 1
condition
1 29 1
2 171 1
3 13940 91
4 5629 50
5 1681 20
In [23]:
grade_cond = ['grade', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_cond[0]], df[grade_cond[1]]).style.background_gradient(cmap = cm)
Out[23]:
condition 1 2 3 4 5
grade
1 1 0 0 0 0
3 0 1 1 0 1
4 1 5 13 10 0
5 9 15 100 84 34
6 11 59 1035 685 248
7 6 75 5234 2833 833
8 2 13 4269 1394 390
9 0 2 2041 446 126
10 0 2 921 156 55
11 0 0 332 56 11
12 0 0 74 13 3
13 0 0 11 2 0
In [24]:
grade_bed = ['grade', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bed[0]], df[grade_bed[1]]).style.background_gradient(cmap = cm)
Out[24]:
bedrooms 0 1 2 3 4 5 6 7 8 9 10 11 33
grade
1 1 0 0 0 0 0 0 0 0 0 0 0 0
3 0 3 0 0 0 0 0 0 0 0 0 0 0
4 2 12 14 1 0 0 0 0 0 0 0 0 0
5 0 37 114 62 21 5 3 0 0 0 0 0 0
6 0 78 824 854 233 41 7 1 0 0 0 0 0
7 6 52 1205 4917 2177 501 98 11 6 4 2 1 1
8 3 14 499 2796 2194 455 90 12 4 1 0 0 0
9 0 2 78 832 1351 313 33 4 1 0 1 0 0
10 0 1 21 296 615 173 22 5 1 0 0 0 0
11 0 0 3 56 239 83 13 4 0 1 0 0 0
12 1 0 2 9 49 24 3 1 1 0 0 0 0
13 0 0 0 1 3 6 3 0 0 0 0 0 0
In [25]:
grade_bath = ['grade', 'bathrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bath[0]], df[grade_bath[1]]).style.background_gradient(cmap = cm)
Out[25]:
bathrooms 0 1 2 3 4 5 6 7 8
grade
1 1 0 0 0 0 0 0 0 0
3 2 1 0 0 0 0 0 0 0
4 0 28 1 0 0 0 0 0 0
5 1 204 36 1 0 0 0 0 0
6 2 1441 574 18 3 0 0 0 0
7 4 2104 6216 575 75 6 0 0 1
8 3 145 4788 882 244 4 2 0 0
9 0 9 1671 629 303 3 0 0 0
10 0 0 480 309 333 11 1 0 0
11 0 1 77 95 195 23 7 1 0
12 1 0 7 17 45 8 10 1 1
13 0 0 1 1 3 2 4 0 2

Correlation matrix

In [26]:
corr = df[['bathrooms', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'price']]

plt.figure(figsize=(10,8))
plt.title('Correlation of variables')
sns.heatmap(corr.astype(float).corr(),vmax=1.0,  annot=True)
plt.show()

Year built distribuition

In [27]:
sns.distplot(df['yr_built'])
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1dbc5efdb48>
In [28]:
g = sns.catplot(x="yr_built", y = "price", data=df[df['price'] < 1000000], 
                   height= 7, aspect = 2, kind="box" )
g.set_xticklabels(rotation=90)
plt.show()
In [29]:
df = df.merge(pd.get_dummies(df.floors, drop_first=True, prefix='Floors'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.waterfront, drop_first=True, prefix='watFront'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.view, drop_first=True, prefix='View'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.condition, drop_first=True, prefix='Cond'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.grade, prefix='Grade'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.bedrooms, drop_first=True, prefix='Bedrooms'), left_index=True, right_index=True)
In [30]:
del df['floors'],
del df['waterfront']
del df['view']
del df['condition']
del df['grade']
del df['bedrooms']
In [31]:
plt.figure(figsize=(15,12))
plt.title('Correlation of variables', fontsize=20)
sns.heatmap(df.corr().astype(float).corr(),vmax=1.0)
plt.show()