#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
from pandas import Series,DataFrame


# In[2]:


import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')
# Exported form of the `%matplotlib inline` magic; get_ipython() is only
# available when this runs inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


from sklearn.datasets import load_boston


# In[4]:


# Load the dataset object; the cells below read its .data, .target,
# .feature_names and .DESCR attributes.
boston = load_boston()


# In[5]:


# Show the dataset's bundled description text.
# Written as a call so the line parses on both Python 2 and Python 3
# (a single parenthesised argument prints identically on Python 2).
print(boston.DESCR)


# In[6]:


# Histogram of the target values using 50 bins; per the axis labels below,
# the x axis is price in $1000s and the y axis is the count of houses.
plt.hist(boston.target,bins=50)

plt.xlabel('Prices in $1000s')
plt.ylabel('Number of houses')


# In[7]:


# Locate the rooms-per-dwelling column ('RM') by name rather than relying on
# a hard-coded positional index into the raw feature matrix.
rm_col = list(boston.feature_names).index('RM')

# Scatter of number of rooms (x) against the target price (y).
plt.scatter(boston.data[:, rm_col], boston.target)

plt.ylabel('Price in $1000s')
plt.xlabel('Number of rooms')


# In[8]:


# Wrap the raw feature matrix in a DataFrame, labelling the columns with the
# dataset's feature names directly in the constructor.
boston_df = DataFrame(boston.data, columns=boston.feature_names)

# Preview the features only (the target has not been attached yet).
boston_df.head()


# In[9]:


# Attach the target values as an extra 'Price' column.
boston_df['Price'] = boston.target


# In[10]:


# Preview again, now including 'Price'.
boston_df.head()


# In[11]:


# Scatter of RM against Price with seaborn's fitted regression line.
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
sns.lmplot(x='RM', y='Price', data=boston_df)


# In[16]:


# Start from the RM column as a 1-D pandas Series and check its shape.
X = boston_df['RM']
X.shape


# In[19]:


# Turn the column into a 2-D array with a single feature column
# (one row per observation).
X = boston_df['RM'].values.reshape(-1, 1)


# In[15]:


X.shape


# In[21]:


# Target vector for the univariate fit.
Y = boston_df['Price']


# In[22]:


X


# In[23]:


# Design matrix [X 1]: the RM values in the first column and a column of ones
# in the second, so the least-squares solution yields (slope, intercept).
#
# Built straight from boston_df instead of iterating over the previous X:
#  * each row of the previous X is itself a 1-element array, so pairing it
#    with the scalar 1 produced a ragged nested list (object-dtype array on
#    older NumPy, an error on recent releases);
#  * the cell is now idempotent - re-running it gives the same matrix.
X = np.column_stack((boston_df.RM.values, np.ones(len(boston_df))))


# In[24]:


X


# In[26]:


# Least-squares solution of X . [m, b] = Y; element 0 of the lstsq result is
# the coefficient vector, unpacked here as slope m and intercept b.
m , b = np.linalg.lstsq(X,Y)[0]


# In[27]:


# Raw observations.
plt.plot(boston_df.RM,boston_df.Price,'o')

x = boston_df.RM

# Fitted line over the same x values.
plt.plot(x, m*x + b,'r',label='Best Fit Line')

# Axis labels so the figure stands alone, and a legend so the line's
# label is actually displayed (it was set but never rendered).
plt.xlabel('Number of rooms')
plt.ylabel('Price in $1000s')
plt.legend()


# In[29]:


# Full lstsq result tuple; element 1 holds the sum of squared residuals.
result = np.linalg.lstsq(X,Y)

# For a 1-D target the residuals come back as a length-1 array; take the
# scalar explicitly instead of relying on implicit array-to-float conversion
# in the '%' formatting below.
error_total = result[1][0]

# Root mean square error: sqrt(sum of squared residuals / number of rows).
rmse = np.sqrt(error_total/len(X))

# Call form parses on both Python 2 and Python 3.
print(' The root mean square error was %.2f' % rmse)


# In[30]:


import sklearn
from sklearn.linear_model import LinearRegression


# In[31]:


# Linear regression estimator for the multivariate fit.
lreg = LinearRegression()


# In[32]:


# Features: every column except the target. The axis is given by keyword
# because recent pandas releases no longer accept it positionally.
X_multi = boston_df.drop('Price', axis=1)

# Target column.
Y_target = boston_df.Price


# In[33]:


# Fit on all features at once.
lreg.fit(X_multi,Y_target)


# In[36]:


# Report the fitted intercept and how many coefficients the model holds.
# Call form with a single pre-formatted string parses and prints identically
# on Python 2 and Python 3.
print(' The estimated intercept coefficient is  %.2f ' % lreg.intercept_)

print(' The number of coefficients used was %d ' % len(lreg.coef_))


# In[37]:


# Table pairing each feature name with its fitted coefficient.
# Names are taken from X_multi (the frame the model was fitted on) rather
# than boston_df, which also contains the target 'Price' and would add a
# spurious row with a NaN coefficient.
coeff_df = DataFrame(X_multi.columns)
coeff_df.columns = ['Features']

# lreg.coef_ is aligned by position with the feature rows above.
coeff_df['Coefficient Estimate'] = Series(lreg.coef_)

coeff_df


# In[38]:


# train_test_split lives in sklearn.model_selection on current scikit-learn;
# the old sklearn.cross_validation module is only a fallback for very old
# installs. Importing the function explicitly also avoids depending on a bare
# `import sklearn` happening to expose the submodule as an attribute.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn without model_selection
    from sklearn.cross_validation import train_test_split

# Split the [RM, 1] design matrix and the prices into train and test sets.
# A fixed random_state makes the split (and every number derived from it
# below) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, boston_df.Price, random_state=0)


# In[39]:


# Shapes of the four pieces, space-separated. Formatting into one string
# keeps the output identical on Python 2 and Python 3.
print('%s %s %s %s' % (X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))


# In[40]:


# Fresh estimator bound to the same name `lreg`; from here on `lreg` is the
# model fitted on the training split, not the earlier multivariate model.
lreg = LinearRegression()

# Fit on the training portion only.
lreg.fit(X_train,Y_train)


# In[41]:


# Predictions for both splits, used for the error figures and residual plot.
pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)


# In[42]:


# Mean squared error on the training split and on the held-out test split.
# Call form with a single pre-formatted string parses and prints identically
# on Python 2 and Python 3.
print(" Fit a model X_train, and calculate the MSE with Y_train: %.2f " % np.mean((Y_train-pred_train)**2))

print(" Fit a model X_train, and calculate MSE with X_test and Y_test: %.2f " % np.mean((Y_test - pred_test)**2))


# In[44]:


# Residuals (prediction minus actual) against the predicted value:
# training points in blue, test points in red.
train = plt.scatter(pred_train,(pred_train - Y_train),c='b',alpha=0.5)

test = plt.scatter(pred_test,(pred_test-Y_test),c='r',alpha=0.5)

# Zero-residual reference line.
plt.hlines(y=0,xmin=-10,xmax=40)

plt.legend((train,test),('Training','Test'),loc='lower left')

# Axis labels so the figure can be read on its own.
plt.xlabel('Predicted price')
plt.ylabel('Residual (predicted - actual)')

plt.title('Residual Plots')


# In[ ]: