from __future__ import division
import re
from pandas import read_csv
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform.br import BinaryRelevance
# The full dataset was split into 100 pieces; "aa" is only one of them.
# header=None together with explicit names: the column labels are supplied
# here instead of being read from the file.
df = read_csv(
    "../data/pieces/aa",
    header=None,
    names=['id', 'title', 'body', 'tags'])
df.head()
| | id | title | body | tags |
---|---|---|---|---|
0 | 1 | How to check if an uploaded file is an image w... | <p>I'd like to check if an uploaded file is an... | php image-processing file-upload upload mime-t... |
1 | "2" | How can I prevent firefox from closing when I ... | <p>In my favorite editor (vim), I regularly us... | firefox |
2 | "3" | R Error Invalid type (list) for variable | <p>I am import matlab file and construct a dat... | r matlab machine-learning |
3 | "4" | How do I replace special characters in a URL? | <p>This is probably very simple, but I simply ... | c# url encoding |
4 | "5" | How to modify whois contact details? | <pre><code>function modify(.......) { $mcont... | php api file-get-contents |
# Normalise the raw ids: trim surrounding whitespace, then any double quotes
# left around the value (e.g. '"2"' -> '2').
# The lambda argument is named `raw` so that it does not shadow the builtin `str`.
df["id"] = df["id"].apply(lambda raw: raw.strip().strip('"'))
df.head()
| | id | title | body | tags |
---|---|---|---|---|
0 | 1 | How to check if an uploaded file is an image w... | <p>I'd like to check if an uploaded file is an... | php image-processing file-upload upload mime-t... |
1 | 2 | How can I prevent firefox from closing when I ... | <p>In my favorite editor (vim), I regularly us... | firefox |
2 | 3 | R Error Invalid type (list) for variable | <p>I am import matlab file and construct a dat... | r matlab machine-learning |
3 | 4 | How do I replace special characters in a URL? | <p>This is probably very simple, but I simply ... | c# url encoding |
4 | 5 | How to modify whois contact details? | <pre><code>function modify(.......) { $mcont... | php api file-get-contents |
# Matches anything of the form <...>, e.g. <p> or </code>.
pat = re.compile(r'<[^>]+>')


def preprocessor(s):
    """Replace every <...> tag in `s` with a single space and lower-case the result."""
    return pat.sub(' ', s).lower()


def tokenizer(s):
    """Split `s` on runs of whitespace, so tokens such as 'c#' stay intact."""
    return s.split()
# The title is included twice so that its words weigh double relative to the
# body, on the assumption that the title is the more important field.
text_data = (
    df["title"] + ' ' + df["title"]
    + ' ' + df["body"]
)

# TF-IDF features over the combined text, capped at 1000 terms; markup is
# stripped by the custom preprocessor before tokenisation.
vectorizerX = TfidfVectorizer(max_features=1000, preprocessor=preprocessor)
X = vectorizerX.fit_transform(text_data.values)
# Label features: plain counts (no TF-IDF weighting) over the
# whitespace-separated tags, capped at 1000 distinct tags.
tag_data = df["tags"]
# Raw string so that \b is a regex word boundary rather than a backspace
# character; the pattern is meant to allow 1-letter tokens.
# NOTE: scikit-learn does not use token_pattern when a custom tokenizer is
# supplied, so the whitespace `tokenizer` passed below is what actually
# splits the tags.
token_pattern = r'(?u)\b\w+\b'
vectorizerY = CountVectorizer(tokenizer=tokenizer, token_pattern=token_pattern, max_features=1000)
Y = vectorizerY.fit_transform(tag_data.values)
# Number of documents (one combined title/body string per row).
N = len(text_data)
N
60308
# 80% of the rows are held out for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X, Y, random_state=42, test_size=0.80)
# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))
((12061, 1000), (48247, 1000), (12061, 1000), (48247, 1000))
# Binary relevance wrapper around GaussianNB for the multi-label tag matrix.
clf = BinaryRelevance(GaussianNB())
clf.fit(X_train, Y_train)
# TODO: evaluate over several shuffled splits (cross_validation.ShuffleSplit)
# instead of a single train/test split.
predictions = clf.predict(X_test)
# Score against Y_test (capital Y, as returned by train_test_split); the
# lower-case `y_test` used previously is never assigned in the code shown here.
# `average` is set explicitly to the former default ('weighted') because
# relying on the implicit default is deprecated for multilabel targets.
score = f1_score(Y_test, predictions, average='weighted')
/home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1". sample_weight=sample_weight) /home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
# Display the F1 score stored in `score`.
score
0.076051253563532928