In [1]:

%matplotlib inline

# `plt` is the conventional alias for the pyplot interface; importing the bare
# `matplotlib` package under that name would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
import numpy as np

from autotagger.stackoverflow.preprocess import load_pickle_sklearn_format
from sklearn import cross_validation,linear_model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
import pickle

In [2]:

# Load the feature matrix X and the label matrix Y used by every cell below.
# NOTE(review): "1GB_100_features" is presumably the name of a pickled dataset
# resolved by the project helper — confirm against autotagger.stackoverflow.preprocess.
X, Y = load_pickle_sklearn_format("1GB_100_features")

In [4]:

# Hold out 80% of the rows for testing (test_size=0.80), so the model is fitted
# on only the remaining 20%; random_state pins the shuffle so the split is
# reproducible.
# NOTE(review): an 80% test split is unusually large — confirm it is intentional
# (e.g. to keep training time and memory manageable).
# NOTE(review): `sklearn.cross_validation` no longer exists in recent scikit-learn
# releases; there train_test_split is provided by `sklearn.model_selection`.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X, Y, test_size=0.80, random_state=42
)
# Echo the shapes of the four splits as a sanity check.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

Out[4]:

((166811, 100), (667245, 100), (166811, 100), (667245, 100))

In [6]:

# Base estimator that OneVsRestClassifier will copy and fit once per label column.
# NOTE(review): LinearRegression is a regressor, not a classifier, so each label
# is modelled by least squares on its 0/1 indicator. A real classifier such as
# linear_model.LogisticRegression may be a better base estimator — confirm.
clf = linear_model.LinearRegression()

# One-vs-rest wrapper: turns the single-output base estimator into a multilabel model.
meta_clf = OneVsRestClassifier(estimator=clf)

In [7]:

# Fit the one-vs-rest model on the training split. fit() returns the estimator
# itself, which is why its repr is echoed as the cell output.
meta_clf.fit(X_train, Y_train)

Out[7]:

OneVsRestClassifier(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
          n_jobs=1)

In [8]:

# Predict the label matrix for the held-out test split; Y_pred is scored
# against Y_test in the cells below.
Y_pred = meta_clf.predict(X_test)

In [9]:

# Macro average: F1 is computed independently for every label and the per-label
# scores are then averaged with equal weight. Labels that are never predicted
# contribute 0.0 (see the UndefinedMetricWarning below), which drags the mean down.
f1_score(Y_test, Y_pred, average='macro')

/home/felipe/auto-tagger/venv3/lib/python3.4/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Out[9]:

0.018744593095539479

In [11]:

# Recompute the macro average over only the labels whose individual F1 score is
# non-zero. This is stricter than "labels with at least one predicted instance":
# a label that was predicted but never predicted correctly also scores 0.0 and is
# dropped here. Because the labels are selected by their score, the result is an
# optimistic figure rather than a like-for-like comparison with the full macro F1.

label_scores = f1_score(Y_test, Y_pred, average=None)  # one F1 value per label
(valid_label_indices,) = np.nonzero(label_scores)  # indices of labels with F1 > 0
f1_score(Y_test, Y_pred, average='macro', labels=valid_label_indices)

/home/felipe/auto-tagger/venv3/lib/python3.4/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Out[11]:

0.23430741369424352

In [12]:

# Micro average: true positives, false positives and false negatives are summed
# over every (sample, label) pair and a single F1 is computed from those totals,
# so frequent labels dominate. (A per-instance average is average='samples'.)
f1_score(Y_test, Y_pred, average='micro')

Out[12]:

0.13040891760528536

In [ ]: