En esta práctica realizaremos una clasificación de opiniones (análisis de sentimiento, sentiment analysis) sobre un dataset de 25.000 comentarios etiquetados de películas, extraído de la base de datos cinematográfica IMDB. Para llevarla a cabo utilizaremos un clasificador bayesiano, y mediremos su rendimiento con otro dataset de test independiente, formado por otros 25.000 comentarios.
def createVocab(dataSet):
    """Build a word -> index vocabulary from a collection of documents.

    Args:
        dataSet: Iterable of documents, each document being an iterable of
            hashable tokens (e.g. a list of words).

    Returns:
        A dict mapping every distinct token to a unique integer. Indices are
        assigned consecutively from 0 in order of first appearance.
    """
    vocab = {}
    for document in dataSet:
        for word in document:
            # len(vocab) is always the next unused index; setdefault leaves
            # tokens that were already seen untouched.
            vocab.setdefault(word, len(vocab))
    return vocab
def setOfWords2Vec(vocab, inputSet):
    """Count the occurrences of in-vocabulary tokens in a document.

    Despite the "set of words" name, the result holds per-token counts,
    not 0/1 presence flags.

    Args:
        vocab: Container supporting ``in`` with the known tokens (e.g. the
            dict returned by ``createVocab``).
        inputSet: Iterable of tokens making up one document.

    Returns:
        A dict mapping each token of ``inputSet`` that is in ``vocab`` to the
        number of times it occurs. Tokens not in ``vocab`` are left out and
        reported on stdout, one message per occurrence.
    """
    words = {}
    for word in inputSet:
        if word in vocab:
            words[word] = words.get(word, 0) + 1
        else:
            # Wrap in a 1-tuple so a tuple token (e.g. an n-gram) is
            # formatted as a single value instead of raising TypeError.
            print("the word: %s is not in my Vocabulary!" % (word,))
    return words
def load_data(path_to_dir):
    """
    Loads the train and test set into four different lists.

    Reads ``train-pos.txt``, ``train-neg.txt``, ``test-pos.txt`` and
    ``test-neg.txt`` from ``path_to_dir``, in that order, printing a progress
    message before each file. Every line of a file becomes one document: the
    line is split on whitespace, tokens shorter than 3 characters are
    dropped, and the remaining tokens are lowercased.

    Args:
        path_to_dir: Directory containing the four files. A trailing path
            separator is optional; an empty string means the current
            directory.

    Returns:
        A tuple ``(train_pos, train_neg, test_pos, test_neg)`` where each
        element is a list of documents and each document is a list of
        tokens.

    Raises:
        OSError: If any of the files cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    import os  # local import so the block does not depend on file-level imports

    sources = (
        ("Reading positive train samples...", "train-pos.txt"),
        ("Reading negative train samples...", "train-neg.txt"),
        ("Reading positive test samples...", "test-pos.txt"),
        ("Reading negative test samples...", "test-neg.txt"),
    )
    loaded = []
    for message, filename in sources:
        print(message)
        # os.path.join works with or without a trailing separator on
        # path_to_dir, unlike plain string concatenation.
        loaded.append(_read_tokenized_lines(os.path.join(path_to_dir, filename)))
    train_pos, train_neg, test_pos, test_neg = loaded
    return train_pos, train_neg, test_pos, test_neg


def _read_tokenized_lines(path):
    """Return one list of lowercased tokens (length >= 3) per line of ``path``."""
    # Explicit encoding: without it the result depends on the platform's
    # default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [
            [w.lower() for w in line.strip().split() if len(w) >= 3]
            for line in f
        ]