import numpy as np
import scipy as sp

import pylab as pl

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

try:
    # sklearn >= 0.18 moved train_test_split into model_selection
    # (cross_validation was removed entirely in 0.20).
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
## loading data - removing missing values (but the time line won't be aligned anymore)
# (hour, hits/hour) table; genfromtxt produces NaN for missing fields.
data = np.genfromtxt('data/web_traffic.tsv', delimiter='\t')
# Drop every row that contains a NaN so the polynomial fits below don't
# choke -- note the time axis is no longer contiguous afterwards.
data = data[~np.any(np.isnan(data), axis=1)]
print(data.shape)
pl.plot(data[:, 0], data[:, 1], 'bo')
pl.xlabel('time')
pl.ylabel('traffic/hour')
# One tick per week: 7 days * 24 hourly samples.
pl.xticks(7 * 24 * np.arange(5), ['weeks%i' % w for w in range(5)])
pl.grid()
pl.autoscale(tight=True)
(735, 2)
## train and test split
# Shuffle before splitting so the random split is not biased by the
# chronological ordering of the samples.
X, y = data[:, 0], data[:, 1]
X, y = shuffle(X, y)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.26, random_state=0)
print(train_X.shape, test_X.shape)
print(train_y.shape, test_y.shape)
(543,) (192,) (543,) (192,)
## another setting of train/test split - useful for `time-series` like data
# No shuffling here: train on the first 500 hours, test on what follows,
# mimicking how a forecasting model would actually be used.
X, y = data[:, 0], data[:, 1]
train_X, test_X = X[:500], X[500:]
train_y, test_y = y[:500], y[500:]
print(train_X.shape, test_X.shape)
print(train_y.shape, test_y.shape)
(500,) (235,) (500,) (235,)
## different polynomials
class PolynomialRegressor(BaseEstimator, RegressorMixin):
    """Least-squares polynomial regressor over a single 1-D feature.

    Thin wrapper around ``np.polyfit`` / ``np.poly1d`` exposing the
    sklearn estimator interface (``fit`` / ``predict`` / ``score``).
    """

    def __init__(self, degree=1):
        # Degree of the polynomial to fit (1 = straight line).
        self.degree = degree

    def fit(self, X, y):
        """Fit a degree-``self.degree`` polynomial to (X, y).

        ``X`` must be a 1-D array of sample positions, ``y`` the targets.
        Returns ``self`` so calls can be chained.
        """
        # polyfit expects a flat vector of x positions; raise (rather
        # than assert, which is stripped under -O) on bad input.
        if len(X.shape) != 1:
            raise ValueError('X must be one-dimensional, got shape %r' % (X.shape,))
        self.coef_ = np.polyfit(X, y, deg=self.degree)
        self.model_ = np.poly1d(self.coef_)
        return self

    def predict(self, X):
        """Evaluate the fitted polynomial at the positions in ``X``."""
        return self.model_(X)

    def score(self, X, y):
        # NOTE: returns the mean squared ERROR (lower is better), the
        # opposite of the usual sklearn convention (higher is better).
        return mean_squared_error(self.predict(X), y)
def benchmark_poly(degree=1, train_size=1.0):
    """Fit on a fraction of the training set; return (n, train MSE, test MSE)."""
    # Turn the requested fraction into an absolute sample count.
    n_samples = int(train_X.shape[0] * train_size)
    sub_X, sub_y = train_X[:n_samples], train_y[:n_samples]
    model = PolynomialRegressor(degree=degree).fit(sub_X, sub_y)
    return (n_samples, model.score(sub_X, sub_y), model.score(test_X, test_y))
## overfitting? I don't see it!
# Train/test error as a function of polynomial degree, full training set.
poly_degrees = [1, 2, 3, 5, 10, 20, 50, 100]
results = [benchmark_poly(degree=d, train_size=1.) for d in poly_degrees]
_, train_scores, test_scores = zip(*results)
positions = range(len(poly_degrees))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, poly_degrees)
pl.legend(loc='best')
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning)
<matplotlib.legend.Legend at 0x115eeef10>
# Overlay a degree-4 fit (thick red line) on the raw data.
pl.plot(data[:, 0], data[:, 1], 'bo')
grid = range(800)
curve = PolynomialRegressor(4).fit(train_X, train_y)
pl.plot(grid, curve.predict(grid), 'r-', linewidth=5)
[<matplotlib.lines.Line2D at 0x116196490>]
## learning curve for linear model
# Train/test error of the degree-1 fit as the training set grows.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=1, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x116056090>
## learning curve for 2nd model
# Same learning curve, degree-2 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=2, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x11646ff50>
## learning curve for 10th model
# Same learning curve, degree-10 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=10, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x115f98c10>
## learning curve for 15th model
# Same learning curve, degree-15 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=15, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x116314f10>