In this document, we will use an SVM to classify documents from the 20 newsgroups dataset. Through evaluation, it becomes clear that the SVM is more accurate than the Naive Bayes classifier on this dataset.
The first few steps are the same as in the Naive Bayes classifier: [Naive Bayes classification on 20newsgroups dataset]
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
print(twenty_train.target_names)
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train_tfidf, twenty_train.target)
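As an aside, and only as a hedged sketch rather than part of the original walkthrough, the count, TF-IDF and SVM steps above can also be chained into a single estimator using scikit-learn's Pipeline (CountVectorizer, TfidfTransformer and svm are already imported above):
from sklearn.pipeline import Pipeline
# chain counting, TF-IDF weighting and the linear SVM into one estimator
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', svm.SVC(kernel='linear'))])
# fitting the pipeline runs every step on the raw training documents
text_clf.fit(twenty_train.data, twenty_train.target)
Predicting with text_clf.predict(docs_new) would then reuse the vocabulary and TF-IDF weights learned at fit time.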
# Next, we will write two sentences to test the model.
docs_new = ['Abuse of antibiotics is very common', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# the following code will show the category predicted by the model
predicted = clf.predict(X_new_tfidf)
print(predicted)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
[2 1]
'Abuse of antibiotics is very common' => sci.med
'OpenGL on the GPU is fast' => comp.graphics
from sklearn import metrics
import numpy as np
# get the test data from the test set
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
# vectorize the test data
X_test_counts = count_vect.transform(docs_test)
# extract TF-IDF features from the test data
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
# use the model to predict the categories
predicted = clf.predict(X_test_tfidf)
# get the precision, recall, f1-score and support of the model
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))
# get the accuracy of the model
print("accuracy\t" + str(np.mean(predicted == twenty_test.target)))
                        precision    recall  f1-score   support

           alt.atheism       0.96      0.83      0.89       319
         comp.graphics       0.90      0.96      0.93       389
               sci.med       0.94      0.91      0.93       396
soc.religion.christian       0.89      0.96      0.93       398

           avg / total       0.92      0.92      0.92      1502

accuracy	0.920772303595
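As an optional extra (not part of the original evaluation), the confusion matrix from the same metrics module gives a per-class view of where the remaining errors occur:
print(metrics.confusion_matrix(twenty_test.target, predicted))
Each row corresponds to a true category and each column to a predicted one, so the off-diagonal counts show which categories are confused with each other.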
We can see that the accuracy is 0.92, noticeably higher than the 0.83 achieved by the Naive Bayes classifier.
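For reference, the 0.83 figure comes from the Naive Bayes walkthrough linked above; a minimal sketch of that baseline on the same features (assuming MultinomialNB, as in that post) would be:
from sklearn.naive_bayes import MultinomialNB
# train a multinomial Naive Bayes model on the same TF-IDF training matrix
nb_clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
nb_predicted = nb_clf.predict(X_test_tfidf)
print("accuracy\t" + str(np.mean(nb_predicted == twenty_test.target)))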