#!/usr/bin/env python
# coding: utf-8
"""Linear-regression walkthrough on scikit-learn's Boston housing data.

Exported from a Jupyter notebook; the ``# In[n]:`` markers are the original
cell boundaries.  The script:

1. loads the dataset, then plots a histogram of the target and a scatter of
   feature column 5 against the target;
2. fits a one-variable least-squares line (``Price`` against ``RM``) with
   ``numpy.linalg.lstsq`` and prints its root mean square error;
3. fits a ``LinearRegression`` on every feature column and tabulates the
   coefficients;
4. splits the one-variable design matrix into train/test sets, refits,
   prints both mean squared errors and plots the residuals.

Bare expressions such as ``boston_df.head()`` are notebook cell outputs; they
display nothing when the file is run as a plain script.  Every ``print`` call
takes a single pre-formatted string so the output is the same on Python 2
and Python 3.
"""

# In[1]:

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# In[2]:

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython only exists inside IPython/Jupyter; skip the magic when
    # the file is executed by a plain interpreter.
    pass

# In[3]:

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this script needs an older scikit-learn release.
from sklearn.datasets import load_boston

# In[4]:

boston = load_boston()

# In[5]:

print(boston.DESCR)

# In[6]:

# Distribution of the target values.
plt.hist(boston.target, bins=50)
plt.xlabel('Prices in $1000s')
plt.ylabel('Number of houses')

# In[7]:

# Feature column 5 against the target.
plt.scatter(boston.data[:, 5], boston.target)
plt.ylabel('Price in $1000s')
plt.xlabel('Number of rooms')

# In[8]:

boston_df = DataFrame(boston.data)
boston_df.columns = boston.feature_names
boston_df.head()

# In[9]:

boston_df['Price'] = boston.target

# In[10]:

boston_df.head()

# In[11]:

# Keyword arguments: recent seaborn releases reject positional x/y here.
sns.lmplot(x='RM', y='Price', data=boston_df)

# In[16]:

X = boston_df.RM
X.shape

# In[19]:

# Stack the RM values into a single column, shape (n_samples, 1).
X = np.vstack(boston_df.RM)

# In[15]:

X.shape

# In[21]:

Y = boston_df.Price

# In[22]:

X

# In[23]:

# Design matrix [X 1]: the RM column followed by a column of ones, so the
# least-squares solution is (slope, intercept).  Built with hstack so the
# result is a plain float (n_samples, 2) array; pairing each 1-element row
# with a scalar in a list comprehension gives a ragged/object array.
X = np.hstack((X, np.ones((len(X), 1))))

# In[24]:

X

# In[26]:

# Solve once; the same result supplies the line here and the error below.
result = np.linalg.lstsq(X, Y)
m, b = result[0]

# In[27]:

plt.plot(boston_df.RM, boston_df.Price, 'o')
x = boston_df.RM
plt.plot(x, m * x + b, 'r', label='Best Fit Line')

# In[29]:

# result[1] holds the summed squared residuals as a 1-element array (Y is
# 1-D); take the scalar so the %-format does not rely on array-to-float
# conversion.
error_total = result[1][0]
rmse = np.sqrt(error_total / len(X))
print(' The root mean square error was %.2f' % rmse)

# In[30]:

import sklearn
from sklearn.linear_model import LinearRegression

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split

# In[31]:

lreg = LinearRegression()

# In[32]:

# Every column except the target.
X_multi = boston_df.drop('Price', axis=1)
Y_target = boston_df.Price

# In[33]:

lreg.fit(X_multi, Y_target)

# In[36]:

print(' The estimated intercept coefficient is %.2f ' % lreg.intercept_)
print(' The number of coefficients used was %d ' % len(lreg.coef_))

# In[37]:

# Label the coefficients with the columns the model was fitted on;
# boston_df.columns also contains 'Price', which has no coefficient.
coeff_df = DataFrame(X_multi.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = Series(lreg.coef_)
coeff_df

# In[38]:

# X is still the two-column [RM, 1] design matrix built above.
X_train, X_test, Y_train, Y_test = train_test_split(X, boston_df.Price)

# In[39]:

print('%s %s %s %s' % (X_train.shape, X_test.shape,
                       Y_train.shape, Y_test.shape))

# In[40]:

lreg = LinearRegression()
lreg.fit(X_train, Y_train)

# In[41]:

pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)

# In[42]:

print(" Fit a model X_train, and calculate the MSE with Y_train: %.2f "
      % np.mean((Y_train - pred_train) ** 2))
print(" Fit a model X_train, and calculate MSE with X_test and Y_test: %.2f "
      % np.mean((Y_test - pred_test) ** 2))

# In[44]:

# Residuals (prediction minus actual) against the predicted value, for the
# training and test sets, with a reference line at zero.
train = plt.scatter(pred_train, (pred_train - Y_train), c='b', alpha=0.5)
test = plt.scatter(pred_test, (pred_test - Y_test), c='r', alpha=0.5)
plt.hlines(y=0, xmin=-10, xmax=40)
plt.legend((train, test), ('Training', 'Test'), loc='lower left')
plt.title('Residual Plots')

# In[ ]: