Convert the bag-of-words (BOW) model's output into class probabilities
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
def review_to_words(raw_review):
    """Clean one raw review: strip HTML, keep letters, lowercase, drop stopwords.

    Parameters
    ----------
    raw_review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        Space-joined lowercase words with English stopwords removed.
    """
    # Reviews are scraped web pages: strip the HTML tags first.
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    # Keep only ASCII letters; every other character becomes whitespace.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # Set gives O(1) membership tests.
    # NOTE(review): rebuilding the stopword set on every call is wasteful;
    # kept local to preserve the original interface.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
Using TensorFlow backend.
# numpy
import numpy as np
# classifier
from sklearn.linear_model import LogisticRegression
# random
from random import shuffle
# preprocess packages
import pandas as pd
'''
Training Data
'''
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (the replacement is on_bad_lines='skip'); kept here for compatibility
# with the pandas version this script was written against.
train = pd.read_csv("../Sentiment/data/labeledTrainData.tsv", header=0,
                    delimiter='\t', quoting=3, error_bad_lines=False)
num_reviews = train["review"].size
print("Cleaning and parsing the training set movie reviews...")
# One cleaned string per review; comprehension instead of an append loop.
clean_train_reviews = [review_to_words(train["review"][i]) for i in range(num_reviews)]
'''
Test Data
'''
test = pd.read_csv("../Sentiment/data/testData.tsv", header=0,
                   delimiter="\t", quoting=3)
num_reviews = len(test["review"])
print("Cleaning and parsing the test set movie reviews...")
# One cleaned string per review; comprehension instead of an append loop.
clean_test_reviews = [review_to_words(test["review"][i]) for i in range(num_reviews)]
Cleaning and parsing the training set movie reviews... Cleaning and parsing the test set movie reviews...
Build the BagOfWords class
import heapq
def select_feature(filePath, k):
    """Read a per-label feature file and keep the top-k scored features per label.

    Each input line has the form ``label feat1:score1 feat2:score2 ...``.

    Parameters
    ----------
    filePath : str
        Path to the feature file (e.g. chi-square scores per label).
    k : int
        Maximum number of features to keep per label.

    Returns
    -------
    dict
        Maps label (str) to a list of feature names. When a line holds more
        than k features, the k highest-scored ones are kept, listed in
        ascending score order; otherwise all features are kept in file order.
    """
    lab_fea = {}
    # Context manager guarantees the file is closed even if parsing fails
    # (the original leaked the handle on an exception).
    with open(filePath, 'r') as read:
        for line in read:
            line_arr = line.strip().split()
            if not line_arr:
                continue  # robustness: ignore blank lines instead of crashing
            label, kvs = line_arr[0], line_arr[1:]
            if len(kvs) <= k:
                # Fewer candidates than k: keep them all, file order preserved.
                lab_fea[label] = [kv.split(':')[0] for kv in kvs]
            else:
                # Min-heap of (score, feature) pairs keeps the k largest scores;
                # the smallest kept score sits at heap[0].
                heap = []
                for kv in kvs:
                    key, val = kv.split(':')
                    score = float(val)
                    if len(heap) < k:
                        heapq.heappush(heap, (score, key))
                    elif score > heap[0][0]:
                        # heapreplace = pop smallest + push, in one operation.
                        heapq.heapreplace(heap, (score, key))
                # Pop in ascending score order, matching the original output.
                lab_fea[label] = [heapq.heappop(heap)[1] for _ in range(len(heap))]
    return lab_fea
from utils.feature_select import select_feature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import bsr_matrix
import numpy as np
class BagOfWords(object):
    """Bag-of-words sentiment model: (Tfidf|Count)Vectorizer + logistic regression.

    Parameters
    ----------
    vocab : bool
        If True, restrict the vocabulary to the top chi-square features for
        the positive class loaded from ``../Sentiment/data/feature_chi.txt``.
    tfidf : bool
        If True use TF-IDF weighting, otherwise raw term counts.
    max_feature : int
        Vocabulary size cap (also the number of chi-square features kept).
    """

    def __init__(self, vocab=False, tfidf=False, max_feature=1000):
        lab_fea = None
        if vocab:
            print("select features...")
            # Top-k chi-square features for the positive class ("1").
            lab_fea = select_feature('../Sentiment/data/feature_chi.txt', max_feature)["1"]
        # Shared vectorizer settings (deduplicated from the two branches).
        # NOTE(review): sklearn ignores max_features when an explicit
        # vocabulary is supplied, so the two options do not conflict.
        common = dict(analyzer="word",
                      tokenizer=None,
                      preprocessor=None,
                      stop_words=None,
                      vocabulary=lab_fea,
                      max_features=max_feature)
        self.vectorizer = TfidfVectorizer(**common) if tfidf else CountVectorizer(**common)
        self.lr = None  # set by train_lr

    @staticmethod
    def _make_lr(C):
        """Single source of truth for the LR hyper-parameters
        (train_lr and validate_lr previously duplicated this list)."""
        return LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=C,
                                  fit_intercept=True, intercept_scaling=1.0,
                                  class_weight=None, random_state=None)

    def train_lr(self, train_data, lab_data, C=1.0):
        """Fit the vectorizer on train_data and train the logistic regression.

        train_data : iterable of cleaned review strings
        lab_data   : iterable of 0/1 labels, aligned with train_data
        C          : inverse regularization strength
        """
        train_data_features = bsr_matrix(self.vectorizer.fit_transform(train_data))
        print(train_data_features.shape)
        print("Training the logistic regression...")
        self.lr = self._make_lr(C).fit(train_data_features, lab_data)

    def test_lr(self, test_data):
        """Return P(class=1) for each document in test_data (requires train_lr first)."""
        test_data_features = bsr_matrix(self.vectorizer.transform(test_data))
        # predict_proba returns shape (n, 2); column 1 is the positive class.
        return self.lr.predict_proba(test_data_features)[:, 1]

    def validate_lr(self, train_data, lab_data, C=1.0):
        """Return the mean 10-fold cross-validated ROC-AUC on the training data.

        NOTE: re-fits the vectorizer on train_data as a side effect.
        """
        train_data_features = bsr_matrix(self.vectorizer.fit_transform(train_data))
        lab_data = np.array(lab_data)
        print("start k-fold validate...")
        lr = self._make_lr(C)
        return np.mean(cross_val_score(lr, train_data_features, lab_data,
                                       cv=10, scoring='roc_auc'))
# Train a TF-IDF bag-of-words + logistic regression model restricted to the
# top 19000 chi-square selected features, then score the test set.
bow = BagOfWords(vocab = True, tfidf = True, max_feature = 19000)
bow.train_lr(clean_train_reviews, list(train["sentiment"]), C = 1)
# Positive-class probability for each cleaned test review.
result = bow.test_lr(clean_test_reviews)
print(result)
select features... (25000, 19000) Training the logistic regression... [ 0.95791519 0.06738943 0.64317872 ..., 0.33803325 0.95515132 0.6234163 ]
print("output...")
output_dbow_prob = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
output_dbow_prob.to_csv('../Sentiment/result/bow_lr_prob.csv', index=False, quoting=3)
output...