# Binary Classification
from sklearn_vw import VWClassifier
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
# get some data: the synthetic Hastie et al. example 10.2 binary
# classification set (10000 samples, fixed seed for repeatability)
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
# cast the features to 32-bit floats
# NOTE(review): presumably the VW wrapper prefers single precision -- confirm
X = X.astype(np.float32)
# split train and test set: hold out 20% of the samples, seeded split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)
# build vowpal wabbit model with default settings and fit it on the training split
model = VWClassifier()
model.fit(X_train, y_train)
# evaluate on both splits; print() with a single argument behaves the same on
# Python 2 and Python 3, whereas the print statement is a SyntaxError on Python 3
print('training score: {}'.format(model.score(X_train, y_train)))
print('testing score: {}'.format(model.score(X_test, y_test)))
# Output:
#   training score: 0.992
#   testing score: 0.49
# Parameter Grid Search
# http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py
from operator import itemgetter
from time import time
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats.distributions import uniform
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a ranked summary of the best entries in ``grid_scores``.

    Parameters
    ----------
    grid_scores : iterable of score records
        Each record must be indexable (element 1 is used as the sort key)
        and expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, default 3
        Maximum number of top-ranked records to print.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Order by element 1 of each record, highest first, and keep the best n_top.
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for rank, score in enumerate(top_scores, 1):
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        # blank line separating consecutive entries
        print("")
# Randomized search: draw n_iter candidate settings from the distributions
# below rather than enumerating a full grid. NumPy's global RNG is seeded first.
np.random.seed(0)
n_iter = 20
params = {
    # scipy's uniform(loc, scale) samples from [loc, loc + scale]
    "l2": uniform(0.0001, 0.01),
    # a plain list is a set of discrete choices
    "l": [0.01, 0.1, 1.0],
    # uniform() with no arguments samples from [0, 1]
    "power_t": uniform(),
}
# run the search over the full data set and time it
search = RandomizedSearchCV(
    VWClassifier(),
    param_distributions=params,
    n_iter=n_iter,
)
start = time()
search.fit(X, y)
print(
    "Parameter search took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(search.grid_scores_))
)
# print the best candidates found by the search
report(search.grid_scores_)
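# Output:
# NOTE(review): this output appears to come from an earlier GridSearchCV run;
# its message text, candidate count and parameter values do not match the
# RandomizedSearchCV code above -- re-run to refresh.
#   GridSearchCV took 31.25 seconds for 9 candidate parameter settings.
#   Model with rank: 1
#   Mean validation score: 0.503 (std: 0.000)
#   Parameters: {'l2': 0.1, 'l': 0.01}
#
#   Model with rank: 2
#   Mean validation score: 0.502 (std: 0.003)
#   Parameters: {'l2': 0.001, 'l': 0.1}
#
#   Model with rank: 3
#   Mean validation score: 0.501 (std: 0.003)
#   Parameters: {'l2': None, 'l': 0.1}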
# evaluate: refit on the training split with logistic loss and fixed l / l2 values
model = VWClassifier(loss_function='logistic', l=0.01, l2=0.1)
model.fit(X_train, y_train)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3
print('training score: {}'.format(model.score(X_train, y_train)))
print('testing score: {}'.format(model.score(X_test, y_test)))
# cleanup: drop the reference to the fitted model
del model
# Output:
#   training score: 0.594375
#   testing score: 0.509
# Linear Regression
from sklearn_vw import VWRegressor
from sklearn import datasets
# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
# fit a Vowpal Wabbit regressor on the whole data set
# NOTE(review): l=100 is presumably the learning rate -- confirm against sklearn_vw
model = VWRegressor(l=100)
model.fit(X, y)
# report the intercept, the predictions for the first ten samples and the
# score on the training data; print() with a single argument behaves the same
# on Python 2 and Python 3, whereas the print statement fails on Python 3
print('intercept: {}'.format(model.get_intercept()))
print('predictions: {}'.format(model.predict(X[:10])))
print('training R2 score: {}'.format(model.score(X, y)))
# Output:
#   intercept: 145.317504883
#   predictions: [ 191.21879578 168.42489624 186.91046143 193.12036133 172.59655762
#                  138.17372131 172.12760925 113.33664703 153.11889648 199.30044556]
#   training R2 score: 0.692918377646