#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Run the IPython line magic `%matplotlib inline`. `get_ipython` is not
# imported anywhere in this file, so this line presumably only works inside an
# IPython/Jupyter session -- TODO confirm before running as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import modin.pandas as pd
import matplotlib.pyplot as plt
import sklearn


# In[2]:


# Read the housing CSV into a DataFrame; the path is relative, so it is
# resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer="data/boston_housing.csv")

# Show the first five rows as a quick check of the load.
data.head(n=5)


# In[3]:


# Split the frame: the "PRICE" column is the regression target and every
# remaining column is an explanatory feature.
labels = data["PRICE"]
features = data.drop(columns="PRICE")

# Display the container type of the feature matrix.
type(features)


# In[4]:


from sklearn.linear_model import LinearRegression

# Fit a linear regression of `labels` on all columns of `features`.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X=features, y=labels)

# Keep the fitted estimator as the cell's displayed value.
lm


# In[5]:


# Scatter the "RM" column against the target on an explicitly created Axes.
rooms_fig, rooms_ax = plt.subplots()
rooms_ax.scatter(data["RM"], labels)
rooms_ax.set_xlabel("Average number of rooms per dwelling")
rooms_ax.set_ylabel("Housing Price")
rooms_ax.set_title("Relationship between Rooms and Price")
plt.show()


# In[6]:


# Predict on the same `features` the model was fitted on, so these are
# in-sample predictions rather than held-out ones.
predicted_prices = lm.predict(X=features)


# In[7]:


# Compare observed targets (x-axis) with the model's predictions (y-axis)
# on an explicitly created Axes.
fit_fig, fit_ax = plt.subplots()
fit_ax.scatter(labels, predicted_prices)
fit_ax.set_xlabel("Prices")
fit_ax.set_ylabel("Predicted Prices")
fit_ax.set_title("Prices versus Predicted Prices")
plt.show()


# In[8]:


# Mean squared error between `labels` and `predicted_prices`.
# The residual Series is squared with the vectorized `**` operator rather than
# `.apply(lambda x: x ** 2)`, which would invoke a Python callback once per
# element; the resulting value is the same.
residuals = labels - predicted_prices
training_error = (residuals ** 2).mean()

# Display the scalar error as the cell's output.
training_error


# In[9]:


# Citation: http://bigdata-madesimple.com/how-to-run-linear-regression-in-python-scikit-learn/