import numpy as np
import scipy as sp

import pylab as pl

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

try:
    # sklearn >= 0.18 moved train_test_split into model_selection
    # (cross_validation was removed entirely in 0.20).
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
## loading data - removing missing values (but the time line won't be aligned anymore)
# (hour, hits/hour) table; genfromtxt produces NaN for missing fields.
data = np.genfromtxt('data/web_traffic.tsv', delimiter='\t')
# Drop every row that contains a NaN so the polynomial fits below don't
# choke -- note the time axis is no longer contiguous afterwards.
data = data[~np.any(np.isnan(data), axis=1)]
print(data.shape)
pl.plot(data[:, 0], data[:, 1], 'bo')
pl.xlabel('time')
pl.ylabel('traffic/hour')
# One tick per week: 7 days * 24 hourly samples.
pl.xticks(7 * 24 * np.arange(5), ['weeks%i' % w for w in range(5)])
pl.grid()
pl.autoscale(tight=True)
(735, 2)
## train and test split
# Shuffle before splitting so the random split is not biased by the
# chronological ordering of the samples.
X, y = data[:, 0], data[:, 1]
X, y = shuffle(X, y)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.26, random_state=0)
print(train_X.shape, test_X.shape)
print(train_y.shape, test_y.shape)
(543,) (192,) (543,) (192,)
## another setting of train/test split - useful for `time-series` like data
# No shuffling here: train on the first 500 hours, test on what follows,
# mimicking how a forecasting model would actually be used.
X, y = data[:, 0], data[:, 1]
train_X, test_X = X[:500], X[500:]
train_y, test_y = y[:500], y[500:]
print(train_X.shape, test_X.shape)
print(train_y.shape, test_y.shape)
(500,) (235,) (500,) (235,)
## different polynomials
class PolynomialRegressor(BaseEstimator, RegressorMixin):
    """Least-squares polynomial regressor over a single 1-D feature.

    Thin wrapper around ``np.polyfit`` / ``np.poly1d`` exposing the
    sklearn estimator interface (``fit`` / ``predict`` / ``score``).
    """

    def __init__(self, degree=1):
        # Degree of the polynomial to fit (1 = straight line).
        self.degree = degree

    def fit(self, X, y):
        """Fit a degree-``self.degree`` polynomial to (X, y).

        ``X`` must be a 1-D array of sample positions, ``y`` the targets.
        Returns ``self`` so calls can be chained.
        """
        # polyfit expects a flat vector of x positions; raise (rather
        # than assert, which is stripped under -O) on bad input.
        if len(X.shape) != 1:
            raise ValueError('X must be one-dimensional, got shape %r' % (X.shape,))
        self.coef_ = np.polyfit(X, y, deg=self.degree)
        self.model_ = np.poly1d(self.coef_)
        return self

    def predict(self, X):
        """Evaluate the fitted polynomial at the positions in ``X``."""
        return self.model_(X)

    def score(self, X, y):
        # NOTE: returns the mean squared ERROR (lower is better), the
        # opposite of the usual sklearn convention (higher is better).
        return mean_squared_error(self.predict(X), y)
def benchmark_poly(degree=1, train_size=1.0):
    """Fit on a fraction of the training set; return (n, train MSE, test MSE)."""
    # Turn the requested fraction into an absolute sample count.
    n_samples = int(train_X.shape[0] * train_size)
    sub_X, sub_y = train_X[:n_samples], train_y[:n_samples]
    model = PolynomialRegressor(degree=degree).fit(sub_X, sub_y)
    return (n_samples, model.score(sub_X, sub_y), model.score(test_X, test_y))
## overfitting? I don't see it!
# Train/test error as a function of polynomial degree, full training set.
poly_degrees = [1, 2, 3, 5, 10, 20, 50, 100]
results = [benchmark_poly(degree=d, train_size=1.) for d in poly_degrees]
_, train_scores, test_scores = zip(*results)
positions = range(len(poly_degrees))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, poly_degrees)
pl.legend(loc='best')
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/polynomial.py:585: RankWarning: Polyfit may be poorly conditioned warnings.warn(msg, RankWarning)
<matplotlib.legend.Legend at 0x115eeef10>
# Overlay a degree-4 fit (thick red line) on the raw data.
pl.plot(data[:, 0], data[:, 1], 'bo')
grid = range(800)
curve = PolynomialRegressor(4).fit(train_X, train_y)
pl.plot(grid, curve.predict(grid), 'r-', linewidth=5)
[<matplotlib.lines.Line2D at 0x116196490>]
## learning curve for linear model
# Train/test error of the degree-1 fit as the training set grows.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=1, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x116056090>
## learning curve for 2nd model
# Same learning curve, degree-2 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=2, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x11646ff50>
## learning curve for 10th model
# Same learning curve, degree-10 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=10, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x115f98c10>
## learning curve for 15th model
# Same learning curve, degree-15 polynomial.
train_sizes = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
curves = [benchmark_poly(degree=15, train_size=frac) for frac in train_sizes]
n_trains, train_scores, test_scores = zip(*curves)
positions = range(len(train_sizes))
pl.plot(positions, train_scores, 'r-+', label='train')
pl.plot(positions, test_scores, 'g-+', label='test')
pl.xticks(positions, train_sizes)
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x116314f10>