In [1]:

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

Linear regression with one variable

In [2]:

# Load the single-variable dataset. `header=None` stops pandas from treating
# the first row as column labels, so the two columns are named explicitly.
data1 = pd.read_csv(
    'ex1data1.txt',
    header=None,
    names=['x', 'y'],
)

In [3]:

# Preview the first five rows to sanity-check the parse.
data1.head(n=5)

Out[3]:

	x	y
0	6.1101	17.5920
1	5.5277	9.1302
2	8.5186	13.6620
3	7.0032	11.8540
4	5.8598	6.8233

In [4]:

# Plot the raw (x, y) observations as red crosses on an 8x6-inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(data1['x'], data1['y'], 'rx')
plt.show()

In [5]:

from sklearn import linear_model

In [6]:

# Fit a least-squares line of y on the single feature x.
# `data1[['x']]` (double brackets) keeps the inputs as a 2-D, one-column
# table, which is the layout `fit` expects for its feature matrix.
clf = linear_model.LinearRegression()
clf.fit(data1[['x']].values, data1['y'].values)

# Report the learned slope(s) and intercept. The call form of print with a
# single argument behaves the same on Python 2 and Python 3, whereas the
# bare `print x` statement is a syntax error on Python 3.
print(clf.coef_)
print(clf.intercept_)

[ 1.19303364]
-3.89578087831

In [7]:

# Overlay the fitted regression line on the scatter of the training data.
plt.figure(figsize=(8, 6))

plt.plot(data1['x'], data1['y'], 'rx', label='training data')
# Predict from a plain array, matching the `.values` input that `clf` was
# fitted on; mixing a DataFrame here with an array at fit time triggers a
# feature-name mismatch warning in newer scikit-learn releases.
plt.plot(data1['x'], clf.predict(data1[['x']].values), label='linear fit')

# Label the axes and series so the figure can be read on its own.
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

Linear regression with multiple variables

In [8]:

# Load the multi-variable dataset. `header=None` stops pandas from treating
# the first row as column labels, so names are supplied here; every column
# is parsed as float.
data2 = pd.read_csv(
    'ex1data2.txt',
    header=None,
    names=['size', 'number of bedrooms', 'price'],
    dtype=float,
)

In [9]:

# Preview the first five rows to sanity-check the parse.
data2.head(n=5)

Out[9]:

	size	number of bedrooms	price
0	2104	3	399900
1	1600	3	329900
2	2400	3	369000
3	1416	2	232000
4	3000	4	539900

In [10]:

from sklearn import preprocessing

In [11]:

# Standardize the two input columns before fitting the multi-variable model.
# Extract the raw feature matrix once so fit and transform are guaranteed to
# see exactly the same data.
raw_features = data2[['size', 'number of bedrooms']].values
scaler = preprocessing.StandardScaler().fit(raw_features)
features = scaler.transform(raw_features)

# Show the per-column mean and scale the scaler learned. `scale_` replaces
# the `std_` attribute, which was deprecated and later removed from
# scikit-learn. Formatting into one string keeps the single-line output
# identical under both Python 2 and Python 3.
print('{} {}'.format(scaler.mean_, scaler.scale_))

[ 2000.68085106     3.17021277] [  7.86202619e+02   7.52842809e-01]

In [12]:

# Fit a least-squares model of price on the standardized features.
clf2 = linear_model.LinearRegression()
clf2.fit(features, data2['price'].values)

# Report the learned coefficients and intercept. Because the inputs were
# standardized, each coefficient is expressed per one unit of the scaled
# feature rather than per raw unit. The call form of print with a single
# argument behaves the same on Python 2 and Python 3.
print(clf2.coef_)
print(clf2.intercept_)

[ 109447.79646964   -6578.35485416]
340412.659574

In [ ]: