In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

Linear regression with one variable

In [2]:
# Load the single-variable dataset; the file has no header row, so the two
# columns are named explicitly.
data1 = pd.read_csv(
    'ex1data1.txt',
    header=None,
    names=['x', 'y'],
)
In [3]:
# Preview the first five rows to sanity-check the load.
data1.head(n=5)
Out[3]:
x y
0 6.1101 17.5920
1 5.5277 9.1302
2 8.5186 13.6620
3 7.0032 11.8540
4 5.8598 6.8233
In [4]:
# Draw the raw samples as red crosses on an 8x6-inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(data1['x'], data1['y'], 'rx')
plt.show()
In [5]:
from sklearn import linear_model
In [6]:
# Fit a least-squares line to the single-feature data. `data1[['x']]` selects
# a one-column frame, so `.values` yields the 2-D feature matrix; the target
# `data1['y'].values` is 1-D.
clf = linear_model.LinearRegression()
clf.fit(data1[['x']].values, data1['y'].values)

# Function-call form of print: with a single argument it emits the same text
# under Python 2 and Python 3, whereas the statement form is a SyntaxError on 3.
print(clf.coef_)
print(clf.intercept_)
[ 1.19303364]
-3.89578087831
In [7]:
# Overlay the fitted regression line on the raw samples.
plt.figure(figsize=(8, 6))

# Raw samples as red crosses.
plt.plot(data1['x'], data1['y'], 'rx')
# Predict from the plain ndarray, the same input type the model was fitted on;
# passing a DataFrame here triggers a feature-name mismatch warning on recent
# scikit-learn releases.
plt.plot(data1['x'], clf.predict(data1[['x']].values))

plt.show()

Linear regression with multiple variables

In [8]:
# Load the multi-variable dataset; the file has no header row, so columns are
# named explicitly and every column is parsed as float.
data2 = pd.read_csv(
    'ex1data2.txt',
    header=None,
    names=['size', 'number of bedrooms', 'price'],
    dtype=float,
)
In [9]:
# Preview the first five rows to sanity-check the load.
data2.head(n=5)
Out[9]:
size number of bedrooms price
0 2104 3 399900
1 1600 3 329900
2 2400 3 369000
3 1416 2 232000
4 3000 4 539900
In [10]:
from sklearn import preprocessing
In [11]:
# Standardize the two input features: the scaler is fitted on the raw values
# and then applied to those same values to produce `features`.
scaler = preprocessing.StandardScaler().fit(data2[['size', 'number of bedrooms']].values)
features = scaler.transform(data2[['size', 'number of bedrooms']].values)
# `std_` was deprecated and later removed from StandardScaler; `scale_` is the
# supported attribute holding the per-feature scaling factors. The formatted
# single-argument print keeps the original "mean scale" one-line output and
# runs under both Python 2 and Python 3.
print('{} {}'.format(scaler.mean_, scaler.scale_))
[ 2000.68085106     3.17021277] [  7.86202619e+02   7.52842809e-01]
In [12]:
# Fit a linear model on the standardized features to predict price.
clf2 = linear_model.LinearRegression()
clf2.fit(features, data2['price'].values)

# Function-call form of print: with a single argument it emits the same text
# under Python 2 and Python 3, whereas the statement form is a SyntaxError on 3.
# The coefficients apply to the scaled features, not the raw columns.
print(clf2.coef_)
print(clf2.intercept_)
[ 109447.79646964   -6578.35485416]
340412.659574
In [ ]: