进行较为复杂的ensemble方法
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
def review_to_words(raw_review):
    """Clean one raw review: strip HTML, keep letters only, lowercase,
    and drop English stopwords.

    Returns the cleaned review as a single space-joined string.
    """
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # set gives O(1) stopword membership tests in the comprehension below
    stops = set(stopwords.words("english"))
    # 'w not in stops' is the idiomatic form (was 'not w in stops')
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
def tag_reviews(reviews, prefix):
    """Wrap each review in a TaggedDocument tagged '<prefix>_<index>'."""
    return [
        TaggedDocument(words=text.split(), tags=['%s_%s' % (prefix, idx)])
        for idx, text in enumerate(reviews)
    ]
Using TensorFlow backend.
# gensim modules
from gensim.models import Doc2Vec
# numpy
import numpy as np
# classifier
from sklearn.linear_model import LogisticRegression
# random
from random import shuffle
# preprocess packages
import pandas as pd
# import sys
# sys.path.insert(0, '..')
# from utils.TextPreprocess import review_to_words, tag_reviews
'''
Training Data
'''
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines='skip') -- confirm the installed version.
train = pd.read_csv("../Sentiment/data/labeledTrainData.tsv", header=0,
                    delimiter='\t', quoting=3, error_bad_lines=False)
num_reviews = train["review"].size
print("Cleaning and parsing the training set movie reviews...")
# clean every review up front so all models share identical preprocessing
clean_train_reviews = [review_to_words(train["review"][i])
                       for i in range(num_reviews)]
'''
Test Data
'''
test = pd.read_csv("../Sentiment/data/testData.tsv", header=0, delimiter="\t", quoting=3)
num_reviews = len(test["review"])
print("Cleaning and parsing the test set movie reviews...")
# same cleaning pipeline as the training set
clean_test_reviews = [review_to_words(test["review"][i])
                      for i in range(num_reviews)]
# # Unlabeled Train Data
# unlabeled_reviews = pd.read_csv("../Sentiment/data/unlabeledTrainData.tsv", header = 0, delimiter = "\t", quoting = 3)
# num_reviews = len(unlabeled_reviews["review"])
# clean_unlabeled_reviews = []
# print("Cleaning and parsing the test set movie reviews...")
# for i in range( 0, num_reviews):
# if( (i+1)%5000 == 0 ):
# print("Review %d of %d\n" % (i+1, num_reviews))
# clean_review = review_to_words(unlabeled_reviews["review"][i])
# clean_unlabeled_reviews.append(clean_review)
Cleaning and parsing the training set movie reviews... Cleaning and parsing the test set movie reviews... Cleaning and parsing the test set movie reviews... Review 5000 of 50000 Review 10000 of 50000 Review 15000 of 50000 Review 20000 of 50000 Review 25000 of 50000 Review 30000 of 50000 Review 35000 of 50000 Review 40000 of 50000 Review 45000 of 50000 Review 50000 of 50000
把训练好的doc2vec模型导入,得到train和test的sentence vector
# placeholders for the doc2vec sentence vectors of the train / test reviews
train_data_features_d2v = []
test_data_features_d2v = []
# pre-trained Doc2Vec model.
# NOTE(review): model_dbow is never used below -- the feature matrices are
# loaded from text files instead; consider removing this load.
model_dbow = Doc2Vec.load('../Sentiment/src/deep/model/doc2vec_lr100')
呃,发现还需要train_tagged这样有tag信息的对象才能读取。我还是直接把处理好的vector保存好得了。在Part 2.9进行保存
# load the pre-computed doc2vec feature matrices (saved earlier, in Part 2.9)
train_data_features_d2v = np.loadtxt('../Sentiment/data/train_feature_d2v.txt')
test_data_features_d2v = np.loadtxt('../Sentiment/data/test_feature_d2v.txt')
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import bsr_matrix
from sklearn.svm import SVC
num_reviews = len(test["review"])
# accumulator for the averaged test-set probabilities;
# [0.0] * n is the idiomatic (and faster) constant-valued list
result = [0.0] * num_reviews
len(result)
25000
import random
def sample(train_bow, train_d2v, label):
    """Randomly split the training data into two equal halves.

    Returns (half1_bow, half1_d2v, half2_bow, half2_d2v, half1_labels,
    half2_labels); element i of each bow/d2v list stays aligned with
    element i of the matching label list.
    """
    total = len(label)
    # set gives O(1) membership tests inside the loop below
    chosen = set(random.sample(range(total), int(total / 2)))
    first = ([], [], [])
    second = ([], [], [])
    for idx, (bow, d2v, lab) in enumerate(zip(train_bow, train_d2v, label)):
        bucket = first if idx in chosen else second
        bucket[0].append(bow)
        bucket[1].append(d2v)
        bucket[2].append(lab)
    return first[0], first[1], second[0], second[1], first[2], second[2]
# sanity-check: the three inputs to sample() must have the same length
print(len(clean_train_reviews))
print(train_data_features_d2v.shape)
print(len(train["sentiment"].values))
25000 (25000, 100) 25000
# split the cleaned reviews, their doc2vec vectors, and the labels into two
# aligned halves (level-1 training half, level-2 stacking half)
l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train["sentiment"].values)
print(len(l1_train_bow))
print(len(l1_train_d2v))
print(len(l2_train_bow))
print(len(l2_train_d2v))
print(len(l1_label))
print(len(l2_label))
12500 12500 12500 12500 12500 12500
我想搞清楚这个sample()函数究竟在干什么。
明白了,random.sample(range(25000), 12500)
,其实就是从25000个数字里,随机抽出12500个。这里把结果转成set并不多余:后面的循环要做大量 `i in index_set` 的成员判断,set 的查找是 O(1),而 list 是 O(n)。
num = len(train["sentiment"].values) # num = 25000
index_set = set(random.sample(range(num), int(num / 2)))
明白了,整个sample函数,其实就是把12500个训练集平均分成了两部分,训练集相关数据有clean_train_reviews(实际的sentence),train_data_features_d2v(经过doc2vec处理的sentence vector)。对应的标签也分成了两部分。
import heapq
def select_feature(filePath, k):
    """Read a per-label feature-score file and keep the top-k features per label.

    Each line looks like: "<label> <feat>:<score> <feat>:<score> ...".
    Returns {label: [feature names]}.  When a line holds more than k
    features, a size-k min-heap keeps the k highest-scoring ones; they are
    returned in ascending score order (the heap's pop order).
    """
    lab_fea = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    # (the original leaked the handle on exceptions)
    with open(filePath, 'r') as read:
        for line in read:
            line_arr = line.strip().split()
            if len(line_arr) - 1 <= k:
                # few enough features: keep them all, order preserved
                lab_fea[line_arr[0]] = [kv.split(':')[0] for kv in line_arr[1:]]
            else:
                heap = []
                for kv in line_arr[1:]:
                    key, val = kv.split(':')
                    if len(heap) < k:
                        heapq.heappush(heap, (float(val), key))
                    elif float(val) > heap[0][0]:
                        # heapreplace = pop smallest + push, in one call
                        heapq.heapreplace(heap, (float(val), key))
                # range(len(heap)) is evaluated once, so all k items are popped
                lab_fea[line_arr[0]] = [heapq.heappop(heap)[1] for _ in range(len(heap))]
    return lab_fea
lab_fea = select_feature('feature_chi.txt', 1000)['1']
我们一般处理方式有2种: 1)对数据先fit,再transform,好处是我可以拿到数据变换(比如scaling/幅度变换/标准化)的参数,这样你可以在测试集上也一样做相同的数据变换处理。即先对训练集做fit,然后再对训练集和测试集做transform 2)fit_transform,一次性完成数据的变换(比如scaling/幅度变换/标准化),比较快。但是如果在训练集和测试集上分别用fit_transform,可能执行的是两套变换标准(因为训练集和测试集幅度不一样)
这个解释的也很清楚,transform主要就是为了做中心化之类的预处理操作,让数据更好用一些。
print("training bow ...")
# TF-IDF features restricted to the pre-selected vocabulary.
# NOTE(review): max_features is ignored when an explicit vocabulary is
# supplied -- confirm against the sklearn docs for the installed version.
vectorizer_bow = TfidfVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 vocabulary = lab_fea,
                                 max_features = 19000)
l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)
l1_train_features_bow = bsr_matrix(l1_train_features_bow)
# L1-regularised logistic regression fit on training half 1.
# NOTE(review): penalty='l1' needs the liblinear or saga solver in
# sklearn >= 0.22 -- verify the default solver of the installed version.
l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)
# out-of-fold predictions: positive-class probability for each review in half 2
l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)
l2_test_features_bow = bsr_matrix(l2_test_features_bow)
l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]
training bow ...
上面是先训练了一个TfidfVectorizer,对l1_train_bow(即12500个sentence)进行计算得到了l1_train_features_bow(代表每个sentence的特征向量,每个sentence 1000维)。然后用LR对(l1_train_features_bow, l1_label)进行了训练。然后把训练好的模型,对l2_train_bow(l2_test_features_bow)进行了预测。
l1_train_features_bow.shape
(12500, 1000)
print("train doc2vec ...")
# RBF-kernel SVM on the doc2vec vectors of training half 1;
# probability=True is required for predict_proba below
l1_train_features_d2v = bsr_matrix(l1_train_d2v)
l2_test_features_d2v = bsr_matrix(l2_train_d2v)
l1_svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)
l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)
# positive-class probabilities for training half 2
l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]
train doc2vec ...
l2_result_d2v.shape
(12500,)
上面也是,只拿了12500个doc2vec向量,l1_train_d2v,来做训练,分类器是svm,然后对l2_train_d2v进行了预测。
print("train ensemble ...")
# level-2 training set: one [P_bow, P_d2v] row per held-out review
# (zip comprehension replaces the manual index-and-append loop)
train_data_features_ens = [[p_bow, p_d2v]
                           for p_bow, p_d2v in zip(l2_result_bow, l2_result_d2v)]
# L2-regularised logistic regression learns how to weight the two models
lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
lr_ens = lr_ens.fit(train_data_features_ens, l2_label)
train ensemble ...
train_data_features_ens[:2]
[[0.23886548792325121, 0.15337969425958606], [0.81253080751969953, 0.88316226120104124]]
这里的一个vector包含两个数字[l2_result_bow[i], l2_result_d2v[i]]
,所以这里我们得到的train_data_features_ens大概是这样的一个形式[[l2_result_bow[0], l2_result_d2v[0]], [l2_result_bow[1], l2_result_d2v[1]]]
,写成数字就是上面那样的输出。
print("final predict ...")
# retrain both level-1 models on the FULL training set, then score the test set
train_bow = vectorizer_bow.fit_transform(clean_train_reviews)
train_bow = bsr_matrix(train_bow)
test_bow = vectorizer_bow.transform(clean_test_reviews)
test_bow = bsr_matrix(test_bow)
train_d2v = bsr_matrix(train_data_features_d2v)
test_d2v = bsr_matrix(test_data_features_d2v)
lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
lr_bow = lr_bow.fit(train_bow, list(train["sentiment"]))
svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)
svm_d2v = svm_d2v.fit(train_d2v, train["sentiment"].values)
# positive-class probabilities for the test set from each level-1 model
result_bow = lr_bow.predict_proba(test_bow)[:,1]
result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]
final predict ...
# BUG FIX: the original used Python-2-only xrange here, which raises
# NameError on Python 3 (the rest of this file targets Python 3: print()).
# Build the [P_bow, P_d2v] rows with a zip comprehension instead.
test_data_features_ens = [[p_bow, p_d2v]
                          for p_bow, p_d2v in zip(result_bow, result_d2v)]
# final test-set probabilities from the level-2 combiner
result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]
test_data_features_ens[:2]
[[0.99794119998088815, 0.99910189748012901], [0.018351063999300796, 0.0014584329684225311]]
上面所有步骤都结束后,就算是一次epoch结束了。之后应该把每次的结果都累加到result里,再除以epoch次数,得到平均预测概率。感觉从第31个cell开始就有点看不懂了。下面把所有的内容完整地重写一遍,为了加快速度,把SVC换成LR:
# reset the accumulator before the ensemble epochs below;
# [0.0] * n is the idiomatic constant-valued list
result = [0.0] * num_reviews
len(result)
25000
max_iter = 5
# Bagged stacking, repeated max_iter times: each epoch re-splits the training
# data, trains the two level-1 models and the level-2 combiner, scores the
# test set, and accumulates the probabilities into `result` for averaging.
for epoch in range(max_iter):
    print("epoch: " + str(epoch))
    # fresh random 50/50 split of the training data for this epoch
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train["sentiment"].values)
    print("training bow ...")
    # level-1 model A: TF-IDF (pre-selected vocabulary) + L1 logistic regression
    vectorizer_bow = TfidfVectorizer(analyzer = "word",
                                     tokenizer = None,
                                     preprocessor = None,
                                     stop_words = None,
                                     vocabulary = lab_fea,
                                     max_features = 19000)
    l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)
    l1_train_features_bow = bsr_matrix(l1_train_features_bow)
    l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
    l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)
    # out-of-fold positive-class probabilities for training half 2
    l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)
    l2_test_features_bow = bsr_matrix(l2_test_features_bow)
    l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]
    print("train doc2vec ...")
    # level-1 model B on the doc2vec vectors; despite the *_svm_* names this
    # uses LogisticRegression (swapped in for the earlier SVC, for speed)
    l1_train_features_d2v = bsr_matrix(l1_train_d2v)
    l2_test_features_d2v = bsr_matrix(l2_train_d2v)
    l1_svm_d2v = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
    l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)
    l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]
    print("train ensemble ...")
    # level-2 combiner trained on the held-out half's [P_bow, P_d2v] pairs
    train_data_features_ens = []
    for i in range(len(l2_result_bow)):
        vector = []
        vector.append(l2_result_bow[i])
        vector.append(l2_result_d2v[i])
        train_data_features_ens.append(vector)
    lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
    lr_ens = lr_ens.fit(train_data_features_ens, l2_label)
    print("final predict ...")
    # retrain both level-1 models on the full training set, score the test set
    train_bow = vectorizer_bow.fit_transform(clean_train_reviews)
    train_bow = bsr_matrix(train_bow)
    test_bow = vectorizer_bow.transform(clean_test_reviews)
    test_bow = bsr_matrix(test_bow)
    train_d2v = bsr_matrix(train_data_features_d2v)
    test_d2v = bsr_matrix(test_data_features_d2v)
    lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
    lr_bow = lr_bow.fit(train_bow, list(train["sentiment"]))
    svm_d2v = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)
    svm_d2v = svm_d2v.fit(train_d2v, train["sentiment"].values)
    result_bow = lr_bow.predict_proba(test_bow)[:,1]
    result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]
    # stack the two probability streams and apply the level-2 combiner
    test_data_features_ens = []
    for i in range(len(result_bow)):
        vector = []
        vector.append(result_bow[i])
        vector.append(result_d2v[i])
        test_data_features_ens.append(vector)
    result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]
    # accumulate this epoch's probabilities for the final average
    for i in range(num_reviews):
        result[i] += result_test_ens[i]
epoch: 0 training bow ... train doc2vec ... train ensemble ... final predict ... epoch: 1 training bow ... train doc2vec ... train ensemble ... final predict ... epoch: 2 training bow ... train doc2vec ... train ensemble ... final predict ... epoch: 3 training bow ... train doc2vec ... train ensemble ... final predict ... epoch: 4 training bow ... train doc2vec ... train ensemble ... final predict ...
对5次的结果取平均
# average the accumulated probabilities over the max_iter epochs
result = np.array([total / max_iter for total in result])
result
array([ 0.97450031, 0.02570378, 0.57902932, ..., 0.05740643, 0.97011312, 0.65017741])
result > 0.5
array([ True, False, True, ..., False, True, True], dtype=bool)
result_bool = result >= 0.5
result_bool * 1
array([1, 0, 1, ..., 0, 1, 1])
# submission frame: review id + binary sentiment (bool * 1 -> 0/1 ints)
combine = pd.DataFrame(data={'id': test['id'],
                             'sentiment': result_bool * 1})
combine.head()
id | sentiment | |
---|---|---|
0 | "12311_10" | 1 |
1 | "8348_2" | 0 |
2 | "5828_4" | 1 |
3 | "7186_2" | 0 |
4 | "12128_7" | 1 |
print("output...")
# quoting=3 (csv.QUOTE_NONE) leaves the already-quoted ids untouched
combine.to_csv('../Sentiment/result/ensemble.csv', index=False, quoting=3)
output...
最后的结果是0.88968,我不知道作者是怎么得到0.96的,反正这样的结果也只是和combine一样罢了。
# load the author's original ensemble output (probabilities) for comparison
test_combine = pd.read_csv('../Sentiment/result/ensemble_final.csv', header=0)
test_combine.head()
id | sentiment | |
---|---|---|
0 | 12311_10 | 0.914962 |
1 | 8348_2 | 0.063295 |
2 | 5828_4 | 0.940739 |
3 | 7186_2 | 0.134307 |
4 | 12128_7 | 0.924105 |
# binarise the author's probabilities at 0.5, then cast bool -> int (0/1)
test_combine['sentiment'] = test_combine['sentiment'] >= 0.5
test_combine['sentiment'] = test_combine['sentiment'].astype('int')
test_combine.head()
id | sentiment | |
---|---|---|
0 | 12311_10 | 1 |
1 | 8348_2 | 0 |
2 | 5828_4 | 1 |
3 | 7186_2 | 0 |
4 | 12128_7 | 1 |
print("output...")
# quoting=3 (csv.QUOTE_NONE) writes the ids exactly as read
test_combine.to_csv('../Sentiment/result/test_combine.csv', index=False, quoting=3)
output...
呃……上面是作者原文件里的ensemble_final,我提交后也就0.89的程度……