In [1]:

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
%matplotlib inline

In [3]:

from sklearn.datasets import load_boston

In [4]:

boston = load_boston()

In [5]:

print boston.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:

:Number of Instances: 506

:Number of Attributes: 13 numeric/categorical predictive

:Median Value (attribute 14) is usually the target

:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's

:Missing Attribute Values: None

:Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.

**References**

- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)

In [6]:

plt.hist(boston.target,bins=50)

plt.xlabel('Prices in $1000s')
plt.ylabel('Number of houses')

Out[6]:

<matplotlib.text.Text at 0x1944c860>

In [7]:

plt.scatter(boston.data[:,5],boston.target)

plt.ylabel('Price in $1000s')
plt.xlabel('Number of rooms')

Out[7]:

<matplotlib.text.Text at 0x195e1438>

In [8]:

boston_df = DataFrame(boston.data)

boston_df.columns = boston.feature_names

boston_df.head()

Out[8]:

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98
1	0.02731	0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14
2	0.02729	0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03
3	0.03237	0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94
4	0.06905	0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33

In [9]:

boston_df['Price'] = boston.target

In [10]:

boston_df.head()

Out[10]:

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	Price
0	0.00632	18	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
1	0.02731	0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
2	0.02729	0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
3	0.03237	0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
4	0.06905	0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

In [11]:

sns.lmplot('RM','Price',data=boston_df)

Out[11]:

<seaborn.axisgrid.FacetGrid at 0x19732940>

In [16]:

X = boston_df.RM
X.shape

Out[16]:

(506L,)

In [19]:

X = np.vstack(boston_df.RM)

In [15]:

X.shape

Out[15]:

(506L, 1L)

In [21]:

Y = boston_df.Price

In [22]:

Out[22]:

array([[ 6.575],
       [ 6.421],
       [ 7.185],
       [ 6.998],
       [ 7.147],
       [ 6.43 ],
       [ 6.012],
       [ 6.172],
       [ 5.631],
       [ 6.004],
       [ 6.377],
       [ 6.009],
       [ 5.889],
       [ 5.949],
       [ 6.096],
       [ 5.834],
       [ 5.935],
       [ 5.99 ],
       [ 5.456],
       [ 5.727],
       [ 5.57 ],
       [ 5.965],
       [ 6.142],
       [ 5.813],
       [ 5.924],
       [ 5.599],
       [ 5.813],
       [ 6.047],
       [ 6.495],
       [ 6.674],
       [ 5.713],
       [ 6.072],
       [ 5.95 ],
       [ 5.701],
       [ 6.096],
       [ 5.933],
       [ 5.841],
       [ 5.85 ],
       [ 5.966],
       [ 6.595],
       [ 7.024],
       [ 6.77 ],
       [ 6.169],
       [ 6.211],
       [ 6.069],
       [ 5.682],
       [ 5.786],
       [ 6.03 ],
       [ 5.399],
       [ 5.602],
       [ 5.963],
       [ 6.115],
       [ 6.511],
       [ 5.998],
       [ 5.888],
       [ 7.249],
       [ 6.383],
       [ 6.816],
       [ 6.145],
       [ 5.927],
       [ 5.741],
       [ 5.966],
       [ 6.456],
       [ 6.762],
       [ 7.104],
       [ 6.29 ],
       [ 5.787],
       [ 5.878],
       [ 5.594],
       [ 5.885],
       [ 6.417],
       [ 5.961],
       [ 6.065],
       [ 6.245],
       [ 6.273],
       [ 6.286],
       [ 6.279],
       [ 6.14 ],
       [ 6.232],
       [ 5.874],
       [ 6.727],
       [ 6.619],
       [ 6.302],
       [ 6.167],
       [ 6.389],
       [ 6.63 ],
       [ 6.015],
       [ 6.121],
       [ 7.007],
       [ 7.079],
       [ 6.417],
       [ 6.405],
       [ 6.442],
       [ 6.211],
       [ 6.249],
       [ 6.625],
       [ 6.163],
       [ 8.069],
       [ 7.82 ],
       [ 7.416],
       [ 6.727],
       [ 6.781],
       [ 6.405],
       [ 6.137],
       [ 6.167],
       [ 5.851],
       [ 5.836],
       [ 6.127],
       [ 6.474],
       [ 6.229],
       [ 6.195],
       [ 6.715],
       [ 5.913],
       [ 6.092],
       [ 6.254],
       [ 5.928],
       [ 6.176],
       [ 6.021],
       [ 5.872],
       [ 5.731],
       [ 5.87 ],
       [ 6.004],
       [ 5.961],
       [ 5.856],
       [ 5.879],
       [ 5.986],
       [ 5.613],
       [ 5.693],
       [ 6.431],
       [ 5.637],
       [ 6.458],
       [ 6.326],
       [ 6.372],
       [ 5.822],
       [ 5.757],
       [ 6.335],
       [ 5.942],
       [ 6.454],
       [ 5.857],
       [ 6.151],
       [ 6.174],
       [ 5.019],
       [ 5.403],
       [ 5.468],
       [ 4.903],
       [ 6.13 ],
       [ 5.628],
       [ 4.926],
       [ 5.186],
       [ 5.597],
       [ 6.122],
       [ 5.404],
       [ 5.012],
       [ 5.709],
       [ 6.129],
       [ 6.152],
       [ 5.272],
       [ 6.943],
       [ 6.066],
       [ 6.51 ],
       [ 6.25 ],
       [ 7.489],
       [ 7.802],
       [ 8.375],
       [ 5.854],
       [ 6.101],
       [ 7.929],
       [ 5.877],
       [ 6.319],
       [ 6.402],
       [ 5.875],
       [ 5.88 ],
       [ 5.572],
       [ 6.416],
       [ 5.859],
       [ 6.546],
       [ 6.02 ],
       [ 6.315],
       [ 6.86 ],
       [ 6.98 ],
       [ 7.765],
       [ 6.144],
       [ 7.155],
       [ 6.563],
       [ 5.604],
       [ 6.153],
       [ 7.831],
       [ 6.782],
       [ 6.556],
       [ 7.185],
       [ 6.951],
       [ 6.739],
       [ 7.178],
       [ 6.8  ],
       [ 6.604],
       [ 7.875],
       [ 7.287],
       [ 7.107],
       [ 7.274],
       [ 6.975],
       [ 7.135],
       [ 6.162],
       [ 7.61 ],
       [ 7.853],
       [ 8.034],
       [ 5.891],
       [ 6.326],
       [ 5.783],
       [ 6.064],
       [ 5.344],
       [ 5.96 ],
       [ 5.404],
       [ 5.807],
       [ 6.375],
       [ 5.412],
       [ 6.182],
       [ 5.888],
       [ 6.642],
       [ 5.951],
       [ 6.373],
       [ 6.951],
       [ 6.164],
       [ 6.879],
       [ 6.618],
       [ 8.266],
       [ 8.725],
       [ 8.04 ],
       [ 7.163],
       [ 7.686],
       [ 6.552],
       [ 5.981],
       [ 7.412],
       [ 8.337],
       [ 8.247],
       [ 6.726],
       [ 6.086],
       [ 6.631],
       [ 7.358],
       [ 6.481],
       [ 6.606],
       [ 6.897],
       [ 6.095],
       [ 6.358],
       [ 6.393],
       [ 5.593],
       [ 5.605],
       [ 6.108],
       [ 6.226],
       [ 6.433],
       [ 6.718],
       [ 6.487],
       [ 6.438],
       [ 6.957],
       [ 8.259],
       [ 6.108],
       [ 5.876],
       [ 7.454],
       [ 8.704],
       [ 7.333],
       [ 6.842],
       [ 7.203],
       [ 7.52 ],
       [ 8.398],
       [ 7.327],
       [ 7.206],
       [ 5.56 ],
       [ 7.014],
       [ 8.297],
       [ 7.47 ],
       [ 5.92 ],
       [ 5.856],
       [ 6.24 ],
       [ 6.538],
       [ 7.691],
       [ 6.758],
       [ 6.854],
       [ 7.267],
       [ 6.826],
       [ 6.482],
       [ 6.812],
       [ 7.82 ],
       [ 6.968],
       [ 7.645],
       [ 7.923],
       [ 7.088],
       [ 6.453],
       [ 6.23 ],
       [ 6.209],
       [ 6.315],
       [ 6.565],
       [ 6.861],
       [ 7.148],
       [ 6.63 ],
       [ 6.127],
       [ 6.009],
       [ 6.678],
       [ 6.549],
       [ 5.79 ],
       [ 6.345],
       [ 7.041],
       [ 6.871],
       [ 6.59 ],
       [ 6.495],
       [ 6.982],
       [ 7.236],
       [ 6.616],
       [ 7.42 ],
       [ 6.849],
       [ 6.635],
       [ 5.972],
       [ 4.973],
       [ 6.122],
       [ 6.023],
       [ 6.266],
       [ 6.567],
       [ 5.705],
       [ 5.914],
       [ 5.782],
       [ 6.382],
       [ 6.113],
       [ 6.426],
       [ 6.376],
       [ 6.041],
       [ 5.708],
       [ 6.415],
       [ 6.431],
       [ 6.312],
       [ 6.083],
       [ 5.868],
       [ 6.333],
       [ 6.144],
       [ 5.706],
       [ 6.031],
       [ 6.316],
       [ 6.31 ],
       [ 6.037],
       [ 5.869],
       [ 5.895],
       [ 6.059],
       [ 5.985],
       [ 5.968],
       [ 7.241],
       [ 6.54 ],
       [ 6.696],
       [ 6.874],
       [ 6.014],
       [ 5.898],
       [ 6.516],
       [ 6.635],
       [ 6.939],
       [ 6.49 ],
       [ 6.579],
       [ 5.884],
       [ 6.728],
       [ 5.663],
       [ 5.936],
       [ 6.212],
       [ 6.395],
       [ 6.127],
       [ 6.112],
       [ 6.398],
       [ 6.251],
       [ 5.362],
       [ 5.803],
       [ 8.78 ],
       [ 3.561],
       [ 4.963],
       [ 3.863],
       [ 4.97 ],
       [ 6.683],
       [ 7.016],
       [ 6.216],
       [ 5.875],
       [ 4.906],
       [ 4.138],
       [ 7.313],
       [ 6.649],
       [ 6.794],
       [ 6.38 ],
       [ 6.223],
       [ 6.968],
       [ 6.545],
       [ 5.536],
       [ 5.52 ],
       [ 4.368],
       [ 5.277],
       [ 4.652],
       [ 5.   ],
       [ 4.88 ],
       [ 5.39 ],
       [ 5.713],
       [ 6.051],
       [ 5.036],
       [ 6.193],
       [ 5.887],
       [ 6.471],
       [ 6.405],
       [ 5.747],
       [ 5.453],
       [ 5.852],
       [ 5.987],
       [ 6.343],
       [ 6.404],
       [ 5.349],
       [ 5.531],
       [ 5.683],
       [ 4.138],
       [ 5.608],
       [ 5.617],
       [ 6.852],
       [ 5.757],
       [ 6.657],
       [ 4.628],
       [ 5.155],
       [ 4.519],
       [ 6.434],
       [ 6.782],
       [ 5.304],
       [ 5.957],
       [ 6.824],
       [ 6.411],
       [ 6.006],
       [ 5.648],
       [ 6.103],
       [ 5.565],
       [ 5.896],
       [ 5.837],
       [ 6.202],
       [ 6.193],
       [ 6.38 ],
       [ 6.348],
       [ 6.833],
       [ 6.425],
       [ 6.436],
       [ 6.208],
       [ 6.629],
       [ 6.461],
       [ 6.152],
       [ 5.935],
       [ 5.627],
       [ 5.818],
       [ 6.406],
       [ 6.219],
       [ 6.485],
       [ 5.854],
       [ 6.459],
       [ 6.341],
       [ 6.251],
       [ 6.185],
       [ 6.417],
       [ 6.749],
       [ 6.655],
       [ 6.297],
       [ 7.393],
       [ 6.728],
       [ 6.525],
       [ 5.976],
       [ 5.936],
       [ 6.301],
       [ 6.081],
       [ 6.701],
       [ 6.376],
       [ 6.317],
       [ 6.513],
       [ 6.209],
       [ 5.759],
       [ 5.952],
       [ 6.003],
       [ 5.926],
       [ 5.713],
       [ 6.167],
       [ 6.229],
       [ 6.437],
       [ 6.98 ],
       [ 5.427],
       [ 6.162],
       [ 6.484],
       [ 5.304],
       [ 6.185],
       [ 6.229],
       [ 6.242],
       [ 6.75 ],
       [ 7.061],
       [ 5.762],
       [ 5.871],
       [ 6.312],
       [ 6.114],
       [ 5.905],
       [ 5.454],
       [ 5.414],
       [ 5.093],
       [ 5.983],
       [ 5.983],
       [ 5.707],
       [ 5.926],
       [ 5.67 ],
       [ 5.39 ],
       [ 5.794],
       [ 6.019],
       [ 5.569],
       [ 6.027],
       [ 6.593],
       [ 6.12 ],
       [ 6.976],
       [ 6.794],
       [ 6.03 ]])

In [23]:

# [X 1]
X = np.array( [ [value,1] for value in X]   )

In [24]:

Out[24]:

array([[array([ 6.575]), 1],
       [array([ 6.421]), 1],
       [array([ 7.185]), 1],
       ..., 
       [array([ 6.976]), 1],
       [array([ 6.794]), 1],
       [array([ 6.03]), 1]], dtype=object)

In [26]:

m , b = np.linalg.lstsq(X,Y)[0]

In [27]:

plt.plot(boston_df.RM,boston_df.Price,'o')

x = boston_df.RM

plt.plot(x, m*x + b,'r',label='Best Fit Line')

Out[27]:

[<matplotlib.lines.Line2D at 0x1ac70588>]

In [29]:

result = np.linalg.lstsq(X,Y)

error_total = result[1]

rmse = np.sqrt(error_total/len(X))

print ' The root mean square error was %.2f' %rmse

 The root mean square error was 6.60

In [30]:

import sklearn
from sklearn.linear_model import LinearRegression

In [31]:

lreg = LinearRegression()

In [32]:

X_multi = boston_df.drop('Price',1)

Y_target = boston_df.Price

In [33]:

lreg.fit(X_multi,Y_target)

Out[33]:

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [36]:

print ' The estimated intercept coefficient is  %.2f ' % lreg.intercept_

print ' The number of coefficients used was %d ' %len(lreg.coef_)

 The estimated intercept coefficient is  36.49 
 The number of coefficients used was 13

In [37]:

coeff_df = DataFrame(boston_df.columns)
coeff_df.columns = ['Features']

coeff_df['Coefficient Estimate'] = Series(lreg.coef_)

coeff_df

Out[37]:

	Features	Coefficient Estimate
0	CRIM	-0.107171
1	ZN	0.046395
2	INDUS	0.020860
3	CHAS	2.688561
4	NOX	-17.795759
5	RM	3.804752
6	AGE	0.000751
7	DIS	-1.475759
8	RAD	0.305655
9	TAX	-0.012329
10	PTRATIO	-0.953464
11	B	0.009393
12	LSTAT	-0.525467
13	Price	NaN

In [38]:

X_train,X_test,Y_train,Y_test = sklearn.cross_validation.train_test_split(X,boston_df.Price)

In [39]:

print X_train.shape, X_test.shape , Y_train.shape , Y_test.shape

(379L, 2L) (127L, 2L) (379L,) (127L,)

In [40]:

lreg = LinearRegression()

lreg.fit(X_train,Y_train)

Out[40]:

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [41]:

pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)

In [42]:

print " Fit a model X_train, and calculate the MSE with Y_train: %.2f " % np.mean((Y_train-pred_train)**2)

print " Fit a model X_train, and calculate MSE with X_test and Y_test: %.2f " % np.mean((Y_test - pred_test)**2)

 Fit a model X_train, and calculate the MSE with Y_train: 47.30 
 Fit a model X_train, and calculate MSE with X_test and Y_test: 32.75

In [44]:

train = plt.scatter(pred_train,(pred_train - Y_train),c='b',alpha=0.5)

test = plt.scatter(pred_test,(pred_test-Y_test),c='r',alpha=0.5)

plt.hlines(y=0,xmin=-10,xmax=40)

plt.legend((train,test),('Training','Test'),loc='lower left')

plt.title('Residual Plots')

Out[44]:

<matplotlib.text.Text at 0x1b39e0f0>

In [ ]: