In [1]:
%matplotlib inline

import numpy as np
import modin.pandas as pd
import matplotlib.pyplot as plt
import sklearn
Process STDOUT and STDERR is being redirected to /tmp/raylogs/.
Waiting for redis server at 127.0.0.1:35043 to respond...
Waiting for redis server at 127.0.0.1:49923 to respond...
Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}.

======================================================================
View the web UI at http://localhost:8889/notebooks/ray_ui93764.ipynb?token=23507892afd3d95e7604e7cd889b30382368ed888e79fc8c
======================================================================

In [2]:
data = pd.read_csv("data/boston_housing.csv")

data.head()
Out[2]:
Unnamed: 0 CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
In [3]:
features = data.drop("PRICE", axis=1)
labels = data["PRICE"]

type(features)
Out[3]:
modin.pandas.dataframe.DataFrame
In [4]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(features, labels)
Out[4]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [5]:
plt.scatter(data["RM"], labels)
plt.xlabel("Average number of rooms per dwelling")
plt.ylabel("Housing Price")
plt.title("Relationship between Rooms and Price")
plt.show()
In [6]:
predicted_prices = lm.predict(features)
In [7]:
plt.scatter(labels, predicted_prices)
plt.xlabel("Prices")
plt.ylabel("Predicted Prices")
plt.title("Prices versus Predicted Prices")
plt.show()
In [8]:
training_error = \
    (labels - predicted_prices).apply(lambda x: x ** 2).mean()

training_error
Out[8]:
21.831934375295628
In [9]:
# Citation: http://bigdata-madesimple.com/how-to-run-linear-regression-in-python-scikit-learn/