import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from sklearn.datasets import load_boston
boston = load_boston()
print boston.DESCR
Boston House Prices dataset Notes ------ Data Set Characteristics: :Number of Instances: 506 :Number of Attributes: 13 numeric/categorical predictive :Median Value (attribute 14) is usually the target :Attribute Information (in order): - CRIM per capita crime rate by town - ZN proportion of residential land zoned for lots over 25,000 sq.ft. - INDUS proportion of non-retail business acres per town - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) - NOX nitric oxides concentration (parts per 10 million) - RM average number of rooms per dwelling - AGE proportion of owner-occupied units built prior to 1940 - DIS weighted distances to five Boston employment centres - RAD index of accessibility to radial highways - TAX full-value property-tax rate per $10,000 - PTRATIO pupil-teacher ratio by town - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town - LSTAT % lower status of the population - MEDV Median value of owner-occupied homes in $1000's :Missing Attribute Values: None :Creator: Harrison, D. and Rubinfeld, D.L. This is a copy of UCI ML housing dataset. http://archive.ics.uci.edu/ml/datasets/Housing This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter. The Boston house-price data has been used in many machine learning papers that address regression problems. **References** - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261. - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann. - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
plt.hist(boston.target,bins=50)
plt.xlabel('Prices in $1000s')
plt.ylabel('Number of houses')
<matplotlib.text.Text at 0x1944c860>
plt.scatter(boston.data[:,5],boston.target)
plt.ylabel('Price in $1000s')
plt.xlabel('Number of rooms')
<matplotlib.text.Text at 0x195e1438>
boston_df = DataFrame(boston.data)
boston_df.columns = boston.feature_names
boston_df.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 |
boston_df['Price'] = boston.target
boston_df.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
sns.lmplot('RM','Price',data=boston_df)
<seaborn.axisgrid.FacetGrid at 0x19732940>
X = boston_df.RM
X.shape
(506L,)
X = np.vstack(boston_df.RM)
X.shape
(506L, 1L)
Y = boston_df.Price
X
array([[ 6.575], [ 6.421], [ 7.185], [ 6.998], [ 7.147], [ 6.43 ], [ 6.012], [ 6.172], [ 5.631], [ 6.004], [ 6.377], [ 6.009], [ 5.889], [ 5.949], [ 6.096], [ 5.834], [ 5.935], [ 5.99 ], [ 5.456], [ 5.727], [ 5.57 ], [ 5.965], [ 6.142], [ 5.813], [ 5.924], [ 5.599], [ 5.813], [ 6.047], [ 6.495], [ 6.674], [ 5.713], [ 6.072], [ 5.95 ], [ 5.701], [ 6.096], [ 5.933], [ 5.841], [ 5.85 ], [ 5.966], [ 6.595], [ 7.024], [ 6.77 ], [ 6.169], [ 6.211], [ 6.069], [ 5.682], [ 5.786], [ 6.03 ], [ 5.399], [ 5.602], [ 5.963], [ 6.115], [ 6.511], [ 5.998], [ 5.888], [ 7.249], [ 6.383], [ 6.816], [ 6.145], [ 5.927], [ 5.741], [ 5.966], [ 6.456], [ 6.762], [ 7.104], [ 6.29 ], [ 5.787], [ 5.878], [ 5.594], [ 5.885], [ 6.417], [ 5.961], [ 6.065], [ 6.245], [ 6.273], [ 6.286], [ 6.279], [ 6.14 ], [ 6.232], [ 5.874], [ 6.727], [ 6.619], [ 6.302], [ 6.167], [ 6.389], [ 6.63 ], [ 6.015], [ 6.121], [ 7.007], [ 7.079], [ 6.417], [ 6.405], [ 6.442], [ 6.211], [ 6.249], [ 6.625], [ 6.163], [ 8.069], [ 7.82 ], [ 7.416], [ 6.727], [ 6.781], [ 6.405], [ 6.137], [ 6.167], [ 5.851], [ 5.836], [ 6.127], [ 6.474], [ 6.229], [ 6.195], [ 6.715], [ 5.913], [ 6.092], [ 6.254], [ 5.928], [ 6.176], [ 6.021], [ 5.872], [ 5.731], [ 5.87 ], [ 6.004], [ 5.961], [ 5.856], [ 5.879], [ 5.986], [ 5.613], [ 5.693], [ 6.431], [ 5.637], [ 6.458], [ 6.326], [ 6.372], [ 5.822], [ 5.757], [ 6.335], [ 5.942], [ 6.454], [ 5.857], [ 6.151], [ 6.174], [ 5.019], [ 5.403], [ 5.468], [ 4.903], [ 6.13 ], [ 5.628], [ 4.926], [ 5.186], [ 5.597], [ 6.122], [ 5.404], [ 5.012], [ 5.709], [ 6.129], [ 6.152], [ 5.272], [ 6.943], [ 6.066], [ 6.51 ], [ 6.25 ], [ 7.489], [ 7.802], [ 8.375], [ 5.854], [ 6.101], [ 7.929], [ 5.877], [ 6.319], [ 6.402], [ 5.875], [ 5.88 ], [ 5.572], [ 6.416], [ 5.859], [ 6.546], [ 6.02 ], [ 6.315], [ 6.86 ], [ 6.98 ], [ 7.765], [ 6.144], [ 7.155], [ 6.563], [ 5.604], [ 6.153], [ 7.831], [ 6.782], [ 6.556], [ 7.185], [ 6.951], [ 6.739], [ 7.178], [ 6.8 ], [ 6.604], [ 7.875], [ 7.287], [ 7.107], [ 7.274], [ 6.975], [ 7.135], [ 6.162], [ 7.61 ], [ 7.853], [ 8.034], [ 5.891], [ 6.326], [ 5.783], [ 6.064], [ 5.344], [ 5.96 ], [ 5.404], [ 5.807], [ 6.375], [ 5.412], [ 6.182], [ 5.888], [ 6.642], [ 5.951], [ 6.373], [ 6.951], [ 6.164], [ 6.879], [ 6.618], [ 8.266], [ 8.725], [ 8.04 ], [ 7.163], [ 7.686], [ 6.552], [ 5.981], [ 7.412], [ 8.337], [ 8.247], [ 6.726], [ 6.086], [ 6.631], [ 7.358], [ 6.481], [ 6.606], [ 6.897], [ 6.095], [ 6.358], [ 6.393], [ 5.593], [ 5.605], [ 6.108], [ 6.226], [ 6.433], [ 6.718], [ 6.487], [ 6.438], [ 6.957], [ 8.259], [ 6.108], [ 5.876], [ 7.454], [ 8.704], [ 7.333], [ 6.842], [ 7.203], [ 7.52 ], [ 8.398], [ 7.327], [ 7.206], [ 5.56 ], [ 7.014], [ 8.297], [ 7.47 ], [ 5.92 ], [ 5.856], [ 6.24 ], [ 6.538], [ 7.691], [ 6.758], [ 6.854], [ 7.267], [ 6.826], [ 6.482], [ 6.812], [ 7.82 ], [ 6.968], [ 7.645], [ 7.923], [ 7.088], [ 6.453], [ 6.23 ], [ 6.209], [ 6.315], [ 6.565], [ 6.861], [ 7.148], [ 6.63 ], [ 6.127], [ 6.009], [ 6.678], [ 6.549], [ 5.79 ], [ 6.345], [ 7.041], [ 6.871], [ 6.59 ], [ 6.495], [ 6.982], [ 7.236], [ 6.616], [ 7.42 ], [ 6.849], [ 6.635], [ 5.972], [ 4.973], [ 6.122], [ 6.023], [ 6.266], [ 6.567], [ 5.705], [ 5.914], [ 5.782], [ 6.382], [ 6.113], [ 6.426], [ 6.376], [ 6.041], [ 5.708], [ 6.415], [ 6.431], [ 6.312], [ 6.083], [ 5.868], [ 6.333], [ 6.144], [ 5.706], [ 6.031], [ 6.316], [ 6.31 ], [ 6.037], [ 5.869], [ 5.895], [ 6.059], [ 5.985], [ 5.968], [ 7.241], [ 6.54 ], [ 6.696], [ 6.874], [ 6.014], [ 5.898], [ 6.516], [ 6.635], [ 6.939], [ 6.49 ], [ 6.579], [ 5.884], [ 6.728], [ 5.663], [ 5.936], [ 6.212], [ 6.395], [ 6.127], [ 6.112], [ 6.398], [ 6.251], [ 5.362], [ 5.803], [ 8.78 ], [ 3.561], [ 4.963], [ 3.863], [ 4.97 ], [ 6.683], [ 7.016], [ 6.216], [ 5.875], [ 4.906], [ 4.138], [ 7.313], [ 6.649], [ 6.794], [ 6.38 ], [ 6.223], [ 6.968], [ 6.545], [ 5.536], [ 5.52 ], [ 4.368], [ 5.277], [ 4.652], [ 5. ], [ 4.88 ], [ 5.39 ], [ 5.713], [ 6.051], [ 5.036], [ 6.193], [ 5.887], [ 6.471], [ 6.405], [ 5.747], [ 5.453], [ 5.852], [ 5.987], [ 6.343], [ 6.404], [ 5.349], [ 5.531], [ 5.683], [ 4.138], [ 5.608], [ 5.617], [ 6.852], [ 5.757], [ 6.657], [ 4.628], [ 5.155], [ 4.519], [ 6.434], [ 6.782], [ 5.304], [ 5.957], [ 6.824], [ 6.411], [ 6.006], [ 5.648], [ 6.103], [ 5.565], [ 5.896], [ 5.837], [ 6.202], [ 6.193], [ 6.38 ], [ 6.348], [ 6.833], [ 6.425], [ 6.436], [ 6.208], [ 6.629], [ 6.461], [ 6.152], [ 5.935], [ 5.627], [ 5.818], [ 6.406], [ 6.219], [ 6.485], [ 5.854], [ 6.459], [ 6.341], [ 6.251], [ 6.185], [ 6.417], [ 6.749], [ 6.655], [ 6.297], [ 7.393], [ 6.728], [ 6.525], [ 5.976], [ 5.936], [ 6.301], [ 6.081], [ 6.701], [ 6.376], [ 6.317], [ 6.513], [ 6.209], [ 5.759], [ 5.952], [ 6.003], [ 5.926], [ 5.713], [ 6.167], [ 6.229], [ 6.437], [ 6.98 ], [ 5.427], [ 6.162], [ 6.484], [ 5.304], [ 6.185], [ 6.229], [ 6.242], [ 6.75 ], [ 7.061], [ 5.762], [ 5.871], [ 6.312], [ 6.114], [ 5.905], [ 5.454], [ 5.414], [ 5.093], [ 5.983], [ 5.983], [ 5.707], [ 5.926], [ 5.67 ], [ 5.39 ], [ 5.794], [ 6.019], [ 5.569], [ 6.027], [ 6.593], [ 6.12 ], [ 6.976], [ 6.794], [ 6.03 ]])
# [X 1]
X = np.array( [ [value,1] for value in X] )
X
array([[array([ 6.575]), 1], [array([ 6.421]), 1], [array([ 7.185]), 1], ..., [array([ 6.976]), 1], [array([ 6.794]), 1], [array([ 6.03]), 1]], dtype=object)
m , b = np.linalg.lstsq(X,Y)[0]
plt.plot(boston_df.RM,boston_df.Price,'o')
x = boston_df.RM
plt.plot(x, m*x + b,'r',label='Best Fit Line')
[<matplotlib.lines.Line2D at 0x1ac70588>]
result = np.linalg.lstsq(X,Y)
error_total = result[1]
rmse = np.sqrt(error_total/len(X))
print ' The root mean square error was %.2f' %rmse
The root mean square error was 6.60
import sklearn
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()
X_multi = boston_df.drop('Price',1)
Y_target = boston_df.Price
lreg.fit(X_multi,Y_target)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
print ' The estimated intercept coefficient is %.2f ' % lreg.intercept_
print ' The number of coefficients used was %d ' %len(lreg.coef_)
The estimated intercept coefficient is 36.49 The number of coefficients used was 13
coeff_df = DataFrame(boston_df.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = Series(lreg.coef_)
coeff_df
Features | Coefficient Estimate | |
---|---|---|
0 | CRIM | -0.107171 |
1 | ZN | 0.046395 |
2 | INDUS | 0.020860 |
3 | CHAS | 2.688561 |
4 | NOX | -17.795759 |
5 | RM | 3.804752 |
6 | AGE | 0.000751 |
7 | DIS | -1.475759 |
8 | RAD | 0.305655 |
9 | TAX | -0.012329 |
10 | PTRATIO | -0.953464 |
11 | B | 0.009393 |
12 | LSTAT | -0.525467 |
13 | Price | NaN |
X_train,X_test,Y_train,Y_test = sklearn.cross_validation.train_test_split(X,boston_df.Price)
print X_train.shape, X_test.shape , Y_train.shape , Y_test.shape
(379L, 2L) (127L, 2L) (379L,) (127L,)
lreg = LinearRegression()
lreg.fit(X_train,Y_train)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)
print " Fit a model X_train, and calculate the MSE with Y_train: %.2f " % np.mean((Y_train-pred_train)**2)
print " Fit a model X_train, and calculate MSE with X_test and Y_test: %.2f " % np.mean((Y_test - pred_test)**2)
Fit a model X_train, and calculate the MSE with Y_train: 47.30 Fit a model X_train, and calculate MSE with X_test and Y_test: 32.75
train = plt.scatter(pred_train,(pred_train - Y_train),c='b',alpha=0.5)
test = plt.scatter(pred_test,(pred_test-Y_test),c='r',alpha=0.5)
plt.hlines(y=0,xmin=-10,xmax=40)
plt.legend((train,test),('Training','Test'),loc='lower left')
plt.title('Residual Plots')
<matplotlib.text.Text at 0x1b39e0f0>