polarity dataset v1.1 (training set) (2.2Mb) (includes README.1.1): approximately 700 positive and 700 negative processed reviews. Released November 2002. This alternative version was created by Nathan Treloar, who removed a few non-English/incomplete reviews and changing some of the labels (judging some polarities to be different from the original author's rating). The complete list of changes made to v1.1 can be found in diff.txt.
polarity dataset v0.9 (testing set) (2.8Mb) (includes a README):. 700 positive and 700 negative processed reviews. Introduced in Pang/Lee/Vaithyanathan EMNLP 2002. Released July 2002.
# define a function to get .txt files in a folder
from os import listdir
def list_textfiles(directory):
"Return a list of filenames ending in '.txt' in DIRECTORY."
textfiles = []
for filename in listdir(directory):
if filename.endswith(".txt"):
textfiles.append(directory + "/" + filename)
return textfiles
# define a function to read the text in a .txt file
import codecs
def read_txt(filename):
try:
# using codecs to avoid error like "'utf8' codec can't decode byte..."
f = codecs.open(filename,'r',encoding='utf-8', errors='ignore')
text = f.read()
finally:
if f:
f.close()
return text
# import training data
filenames_pos = list_textfiles("movieReview_data/tokens/pos")
filenames_neg = list_textfiles("movieReview_data/tokens/neg")
# create two lists to store reivew text and polarity
data_train = []
data_labels_train = []
for f in filenames_pos:
data_train.append(read_txt(f))
data_labels_train.append('pos')
for f in filenames_neg:
data_train.append(read_txt(f))
data_labels_train.append('neg')
Next, we initialize a sckit-learn vector with the CountVectorizer class. Because the data could be in any format, we’ll set lowercase to False and exclude common words such as “the” or “and”. This vectorizer will transform our data into vectors of features. In this case, we use a CountVector, which means that our features are counts of the words that occur in our dataset. Once the CountVectorizer class is initialized, we fit it onto the data above and convert it to an array for easy usage.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5,max_df=0.8, sublinear_tf=True,use_idf=True)
features_train = vectorizer.fit_transform(data_train)
from sklearn import svm
clf = svm.SVC()
# train svm model
clf.fit(features_train, data_labels_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
from sklearn import metrics
import numpy as np;
# import test data
filenames_pos_test = list_textfiles("mix20_rand700_tokens/tokens/pos")
filenames_neg_test = list_textfiles("mix20_rand700_tokens/tokens/neg")
# create two lists to store reivew text and polarity
data_test = []
data_labels_test = []
for f in filenames_pos:
data_test.append(read_txt(f))
data_labels_test.append('pos')
for f in filenames_neg:
data_test.append(read_txt(f))
data_labels_test.append('neg')
# vectorize
features_test = vectorizer.fit_transform(data_test)
#features_nd_test = features_test.toarray() # for easy usage
predicted = clf.predict(features_test)
# print the accuracy score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print("Accuracy score of SVM model:\n"+ str(accuracy_score(data_labels_test,predicted)))
# print evaluation report showing precision, recall, f1, support
print(classification_report(data_labels_test, predicted))
Accuracy score of SVM model: 0.500721500722 precision recall f1-score support neg 0.00 0.00 0.00 692 pos 0.50 1.00 0.67 694 avg / total 0.25 0.50 0.33 1386
/Users/zjm/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(features_train, data_labels_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn.metrics import classification_report
mnb_predict = mnb.predict(features_test)
print("Accuracy score of Naive Bayes model:\n"+ str(accuracy_score(data_labels_test,mnb_predict)))
print(classification_report(data_labels_test, mnb_predict))
Accuracy score of Naive Bayes model: 0.968253968254 precision recall f1-score support neg 0.96 0.97 0.97 692 pos 0.97 0.96 0.97 694 avg / total 0.97 0.97 0.97 1386
joblib.dump(value, filename, compress=0, protocol=None, cache_size=None)
from sklearn.externals import joblib
joblib.dump(mnb,'sentNB.model')
['sentNB.model']
Why Naive Bayes performed much better than SVM in this prediction?