In [35]:
from __future__ import print_function
import numpy as np
from sklearn import datasets, metrics, cross_validation
from minirank import ordinal_logistic_fit, ordinal_logistic_predict
In [36]:
DOC = """
================================================================================
Compare the prediction accuracy of different models on the boston dataset
================================================================================
"""
print(DOC)

# Hoisted out of the CV loop: this module was previously imported twice
# per iteration (before the logistic and the ridge fits).
from sklearn import linear_model

boston = datasets.load_boston()
# Round targets so they form discrete ordinal classes.
X, y = boston.data, np.round(boston.target)
# Center features; shift targets so the smallest class label is 0.
X -= X.mean()
y -= y.min()

# Sort samples by target value so the train/test index arrays below
# stay grouped by class after np.sort.
idx = np.argsort(y)
X = X[idx]
y = y[idx]
cv = cross_validation.ShuffleSplit(y.size, n_iter=50, test_size=.1, random_state=0)
score_logistic = []
score_ordinal_logistic = []
score_ridge = []
for i, (train, test) in enumerate(cv):
    if not np.all(np.unique(y[train]) == np.unique(y)):
        # we need the train set to have all different classes
        continue
    assert np.all(np.unique(y[train]) == np.unique(y))
    train = np.sort(train)
    test = np.sort(test)

    # Ordinal logistic regression (minirank).
    w, theta = ordinal_logistic_fit(X[train], y[train])
    pred = ordinal_logistic_predict(w, theta, X[test], y)
    s = metrics.mean_absolute_error(y[test], pred)
    print('ERROR (ORDINAL)  fold %s: %s' % (i+1, s))
    score_ordinal_logistic.append(s)

    # Multiclass logistic regression baseline.
    clf = linear_model.LogisticRegression(C=1.)
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    s = metrics.mean_absolute_error(y[test], pred)
    print('ERROR (LOGISTIC) fold %s: %s' % (i+1, s))
    score_logistic.append(s)

    # Ridge regression baseline; predictions are rounded onto the
    # integer class grid before scoring.
    clf = linear_model.Ridge(alpha=1.)
    clf.fit(X[train], y[train])
    pred = np.round(clf.predict(X[test]))
    s = metrics.mean_absolute_error(y[test], pred)
    # BUG FIX: this line previously printed 'ERROR (LOGISTIC)' — a
    # copy-paste of the cell above — making the per-fold log ambiguous.
    print('ERROR (RIDGE)    fold %s: %s' % (i+1, s))
    score_ridge.append(s)


print()
print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    %s' % np.mean(score_ordinal_logistic))
print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s' % np.mean(score_logistic))
print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    %s' % np.mean(score_ridge))
================================================================================
Compare the prediction accuracy of different models on the boston dataset
================================================================================

ERROR (ORDINAL)  fold 2: 2.78431372549
ERROR (LOGISTIC) fold 2: 4.23529411765
ERROR (LOGISTIC) fold 2: 3.25490196078
ERROR (ORDINAL)  fold 4: 3.88235294118
ERROR (LOGISTIC) fold 4: 4.60784313725
ERROR (LOGISTIC) fold 4: 3.58823529412
ERROR (ORDINAL)  fold 5: 3.27450980392
ERROR (LOGISTIC) fold 5: 4.23529411765
ERROR (LOGISTIC) fold 5: 3.21568627451
ERROR (ORDINAL)  fold 6: 3.50980392157
ERROR (LOGISTIC) fold 6: 4.03921568627
ERROR (LOGISTIC) fold 6: 3.47058823529
ERROR (ORDINAL)  fold 7: 2.66666666667
ERROR (LOGISTIC) fold 7: 2.82352941176
ERROR (LOGISTIC) fold 7: 2.96078431373
ERROR (ORDINAL)  fold 12: 4.03921568627
ERROR (LOGISTIC) fold 12: 4.70588235294
ERROR (LOGISTIC) fold 12: 4.50980392157
ERROR (ORDINAL)  fold 13: 4.23529411765
ERROR (LOGISTIC) fold 13: 4.09803921569
ERROR (LOGISTIC) fold 13: 4.43137254902
ERROR (ORDINAL)  fold 15: 3.1568627451
ERROR (LOGISTIC) fold 15: 4.74509803922
ERROR (LOGISTIC) fold 15: 4.09803921569
ERROR (ORDINAL)  fold 23: 3.39215686275
ERROR (LOGISTIC) fold 23: 3.92156862745
ERROR (LOGISTIC) fold 23: 3.72549019608
ERROR (ORDINAL)  fold 26: 3.54901960784
ERROR (LOGISTIC) fold 26: 5.25490196078
ERROR (LOGISTIC) fold 26: 4.03921568627
ERROR (ORDINAL)  fold 29: 2.96078431373
ERROR (LOGISTIC) fold 29: 3.09803921569
ERROR (LOGISTIC) fold 29: 2.17647058824
ERROR (ORDINAL)  fold 30: 2.80392156863
ERROR (LOGISTIC) fold 30: 3.0
ERROR (LOGISTIC) fold 30: 3.25490196078
ERROR (ORDINAL)  fold 33: 3.27450980392
ERROR (LOGISTIC) fold 33: 3.90196078431
ERROR (LOGISTIC) fold 33: 3.62745098039
ERROR (ORDINAL)  fold 34: 2.52941176471
ERROR (LOGISTIC) fold 34: 3.43137254902
ERROR (LOGISTIC) fold 34: 3.33333333333
ERROR (ORDINAL)  fold 38: 2.78431372549
ERROR (LOGISTIC) fold 38: 3.66666666667
ERROR (LOGISTIC) fold 38: 2.64705882353
ERROR (ORDINAL)  fold 39: 3.25490196078
ERROR (LOGISTIC) fold 39: 3.35294117647
ERROR (LOGISTIC) fold 39: 3.72549019608
ERROR (ORDINAL)  fold 40: 3.56862745098
ERROR (LOGISTIC) fold 40: 3.72549019608
ERROR (LOGISTIC) fold 40: 3.72549019608
ERROR (ORDINAL)  fold 41: 3.98039215686
ERROR (LOGISTIC) fold 41: 3.54901960784
ERROR (LOGISTIC) fold 41: 4.27450980392
ERROR (ORDINAL)  fold 42: 4.03921568627
ERROR (LOGISTIC) fold 42: 3.21568627451
ERROR (LOGISTIC) fold 42: 4.13725490196
ERROR (ORDINAL)  fold 45: 3.35294117647
ERROR (LOGISTIC) fold 45: 3.1568627451
ERROR (LOGISTIC) fold 45: 3.41176470588
ERROR (ORDINAL)  fold 46: 2.92156862745
ERROR (LOGISTIC) fold 46: 4.50980392157
ERROR (LOGISTIC) fold 46: 3.47058823529
ERROR (ORDINAL)  fold 48: 2.60784313725
ERROR (LOGISTIC) fold 48: 3.19607843137
ERROR (LOGISTIC) fold 48: 3.29411764706

MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    3.29857397504
MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): 3.83957219251
MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    3.5623885918
In [47]:
# Explicit pyplot import instead of `from pylab import *`, which pollutes
# the notebook namespace (shadows e.g. builtins and numpy names).
import matplotlib.pyplot as plt

scores = (score_ordinal_logistic, score_logistic, score_ridge)

pos = np.arange(3) + 1.1    # the bar centers on the y axis
# Materialize as lists: on Python 3, map() returns a one-shot iterator,
# which matplotlib cannot reuse for both bar lengths and error bars.
val = [np.mean(s) for s in scores]
xerr = [np.std(s) for s in scores]

plt.barh(pos, val, xerr=xerr, align='center', alpha=.5, ecolor='black')
plt.yticks(pos, ('Ordinal Logistic', 'Multiclass Logistic', 'Linear Regression'),
           rotation=45)
plt.grid(True)
plt.xlabel('Mean Absolute Error (lower is better)', fontsize='x-large')
plt.tight_layout()
plt.savefig('bars_ordinal.png')
plt.show()