import numpy as np
import pandas as pd
import MeCab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from gensim.models import word2vec
from sklearn.mixture import GaussianMixture
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from collections import defaultdict
# tqdm.auto picks the notebook-friendly progress bar when one is available.
from tqdm.auto import tqdm
import time
tqdm.pandas()
news_df = pd.read_csv('../data/news.csv.gz')
news_df.head()
|   | label | text |
|---|-------|------|
| 0 | movie-enter | 【DVDエンター!】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌... |
| 1 | movie-enter | 藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2... |
| 2 | movie-enter | 『戦火の馬』ロイヤル・プレミアにウィリアム王子&キャサリン妃が出席3月2日より全国ロードショ... |
| 3 | movie-enter | 香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で... |
| 4 | movie-enter | ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町... |
news_df.label.value_counts()
sports-watch      900
dokujo-tsushin    870
movie-enter       870
smax              870
it-life-hack      870
kaden-channel     864
peachy            842
topic-news        770
livedoor-homme    511
Name: label, dtype: int64
# Build the tagger once; constructing a new MeCab.Tagger per row is wasteful.
tagger = MeCab.Tagger('-Owakati')

def get_wakati_text(text):
    """Tokenize text into space-separated tokens (wakati-gaki)."""
    return tagger.parse(text).strip()
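A quick sanity check of the tokenizer (the output below assumes a standard ipadic dictionary; the exact segmentation can differ depending on the installed dictionary):

print(get_wakati_text('すもももももももものうち'))
# => すもも も もも も もも の うち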
news_df['wakati_text'] = news_df.text.progress_apply(get_wakati_text)
100%|██████████| 7367/7367 [00:07<00:00, 964.30it/s]
news_df.head()
|   | label | text | wakati_text |
|---|-------|------|-------------|
| 0 | movie-enter | 【DVDエンター!】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌... | 【 DVD エンター ! 】 誘拐 犯 に 育て られ た 女 が 目 に し た 真実 は... |
| 1 | movie-enter | 藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2... | 藤原 竜也 、 中学生 とともに ロケット 打ち上げ に 成功 「 アンテナ を 張り なが... |
| 2 | movie-enter | 『戦火の馬』ロイヤル・プレミアにウィリアム王子&キャサリン妃が出席3月2日より全国ロードショ... | 『 戦火 の 馬 』 ロイヤル ・ プレミア に ウィリアム 王子 & キャサリン 妃 が ... |
| 3 | movie-enter | 香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で... | 香里奈 、 女子高 生 100 人 の ガチンコ 質問 に 回答 「 ラーメン も 食べる ... |
| 4 | movie-enter | ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町... | ユージ の 前 に 立ちはだかっ た JOY 「 僕 は AKB の 高橋 みなみ を 守る... |
y = news_df.label.values
accs_dict = {}
elapsed_times_dict = {}
def train_and_get_oof_accuracies(X, y, params):
    """Run stratified 5-fold CV with LightGBM and return the per-fold
    out-of-fold accuracies together with the total elapsed time."""
    start_time = time.time()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    accuracies = []
    for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
        print(f'Start: fold {i+1}')
        X_train, y_train = X[train_index, :], y[train_index]
        X_valid, y_valid = X[valid_index, :], y[valid_index]
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=100
        )
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        print(f'Accuracy is {accuracy}\n')
        accuracies.append(accuracy)
    elapsed_time = time.time() - start_time
    print(f'Elapsed time is {elapsed_time}.')
    return accuracies, elapsed_time
params = {
'objective': 'multiclass',
'num_class': news_df.label.nunique(),
'n_estimators': 10000,
'random_seed': 0
}
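Note that n_estimators is deliberately set very high: with early_stopping_rounds=100 inside each fold, LightGBM stops adding trees once the validation multi-logloss stops improving, so the effective number of trees is chosen per fold.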
# The default token_pattern drops single-character tokens, which matter in
# Japanese, so match any word of length >= 1.
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
X = vectorizer.fit_transform(news_df.wakati_text.values)
# LightGBM (and TruncatedSVD below) also accept scipy sparse matrices;
# densifying is done here for simplicity, at a considerable memory cost.
X = X.toarray()
X.shape
(7367, 71646)
accs_dict['bow'], elapsed_times_dict['bow'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.159019
[200] valid_0's multi_logloss: 0.173628
Early stopping, best iteration is: [114] valid_0's multi_logloss: 0.156125
Accuracy is 0.9525423728813559

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.148067
[200] valid_0's multi_logloss: 0.157974
Early stopping, best iteration is: [127] valid_0's multi_logloss: 0.144632
Accuracy is 0.9572591587516961

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.125745
[200] valid_0's multi_logloss: 0.122956
Early stopping, best iteration is: [136] valid_0's multi_logloss: 0.118081
Accuracy is 0.9640190088255262

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.129558
[200] valid_0's multi_logloss: 0.143817
Early stopping, best iteration is: [109] valid_0's multi_logloss: 0.127086
Accuracy is 0.9640190088255262

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.134151
[200] valid_0's multi_logloss: 0.142602
Early stopping, best iteration is: [122] valid_0's multi_logloss: 0.130477
Accuracy is 0.9599184782608695

Elapsed time is 119.94863891601562.
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)
X_reduced.shape
(7367, 100)
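Before training on the reduced features, it can be worth checking how much of the variance the 100 components retain (a quick check; the value was not recorded in this run):

# Fraction of the total variance captured by the 100 SVD components.
print(tsvd.explained_variance_ratio_.sum())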
accs_dict['bow_tsvd'], elapsed_times_dict['bow_tsvd'] = train_and_get_oof_accuracies(X_reduced, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.427786
[200] valid_0's multi_logloss: 0.431585
Early stopping, best iteration is: [149] valid_0's multi_logloss: 0.408679
Accuracy is 0.8738983050847458

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.424189
[200] valid_0's multi_logloss: 0.408522
Early stopping, best iteration is: [166] valid_0's multi_logloss: 0.399322
Accuracy is 0.8772048846675712

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.411262
[200] valid_0's multi_logloss: 0.398127
Early stopping, best iteration is: [147] valid_0's multi_logloss: 0.387545
Accuracy is 0.8750848608282417

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.404523
[200] valid_0's multi_logloss: 0.408925
Early stopping, best iteration is: [132] valid_0's multi_logloss: 0.390372
Accuracy is 0.8879837067209776

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.427535
[200] valid_0's multi_logloss: 0.419641
Early stopping, best iteration is: [146] valid_0's multi_logloss: 0.405714
Accuracy is 0.8783967391304348

Elapsed time is 38.25342154502869.
# use_idf=True is the TfidfVectorizer default; it is kept explicit here.
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=r'(?u)\b\w+\b')
X = vectorizer.fit_transform(news_df.wakati_text.values)
X = X.toarray()
X.shape
(7367, 71646)
accs_dict['tfidf'], elapsed_times_dict['tfidf'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.168869
[200] valid_0's multi_logloss: 0.189717
Early stopping, best iteration is: [101] valid_0's multi_logloss: 0.168724
Accuracy is 0.9491525423728814

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.164664
[200] valid_0's multi_logloss: 0.180684
Early stopping, best iteration is: [105] valid_0's multi_logloss: 0.163739
Accuracy is 0.9497964721845319

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.145545
[200] valid_0's multi_logloss: 0.145236
Early stopping, best iteration is: [117] valid_0's multi_logloss: 0.141311
Accuracy is 0.9599456890699253

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.140006
[200] valid_0's multi_logloss: 0.162954
Early stopping, best iteration is: [108] valid_0's multi_logloss: 0.139152
Accuracy is 0.957909029192125

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.149432
[200] valid_0's multi_logloss: 0.17345
Early stopping, best iteration is: [103] valid_0's multi_logloss: 0.148391
Accuracy is 0.953125

Elapsed time is 285.5730438232422.
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)
X_reduced.shape
(7367, 100)
accs_dict['tfidf_tsvd'], elapsed_times_dict['tfidf_tsvd'] = train_and_get_oof_accuracies(X_reduced, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.270505
Early stopping, best iteration is: [93] valid_0's multi_logloss: 0.268477
Accuracy is 0.9159322033898305

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.305707
Early stopping, best iteration is: [90] valid_0's multi_logloss: 0.300025
Accuracy is 0.9063772048846676

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.32101
Early stopping, best iteration is: [97] valid_0's multi_logloss: 0.319156
Accuracy is 0.8934147997284454

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.296192
Early stopping, best iteration is: [83] valid_0's multi_logloss: 0.291762
Accuracy is 0.9124236252545825

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.262702
Early stopping, best iteration is: [85] valid_0's multi_logloss: 0.26112
Accuracy is 0.9211956521739131

Elapsed time is 26.106940031051636.
corpus = [doc.split() for doc in news_df.wakati_text.values]
# gensim 3.x API: the `size` argument was renamed `vector_size` in gensim 4.
model_w2v = word2vec.Word2Vec(corpus, size=300, min_count=20, window=10)
model_w2v.save('../model/news_w2v.model')
def get_doc_mean_vector(doc, model):
    """Average the vectors of all in-vocabulary tokens (mean pooling)."""
    # Word2Vec models hold their vectors in .wv; KeyedVectors are the vectors themselves.
    wv = model.wv if isinstance(model, word2vec.Word2Vec) else model
    doc_vector = np.zeros(model.vector_size)
    word_cnt = 0
    for word in doc.split():
        try:
            doc_vector += wv[word]
            word_cnt += 1
        except KeyError:
            pass
    if word_cnt > 0:  # guard against documents with no in-vocabulary tokens
        doc_vector /= word_cnt
    return doc_vector
X = np.zeros((len(news_df), model_w2v.wv.vector_size))
for i, doc in tqdm(enumerate(news_df.wakati_text.values), total=len(news_df)):
X[i, :] = get_doc_mean_vector(doc, model_w2v)
X.shape
(7367, 300)
accs_dict['w2v_mean'], elapsed_times_dict['w2v_mean'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.349747
Early stopping, best iteration is: [96] valid_0's multi_logloss: 0.348576
Accuracy is 0.8888135593220339

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.324592
Early stopping, best iteration is: [93] valid_0's multi_logloss: 0.323728
Accuracy is 0.8955223880597015

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.370943
Early stopping, best iteration is: [88] valid_0's multi_logloss: 0.367494
Accuracy is 0.8839103869653768

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.329032
Early stopping, best iteration is: [86] valid_0's multi_logloss: 0.326502
Accuracy is 0.9049558723693143

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.342435
Early stopping, best iteration is: [90] valid_0's multi_logloss: 0.340837
Accuracy is 0.8899456521739131

Elapsed time is 72.10090398788452.
def get_doc_swem_max_vector(doc, model):
    """Take the element-wise max over all token vectors (SWEM-max).
    Out-of-vocabulary tokens contribute zero vectors."""
    wv = model.wv if isinstance(model, word2vec.Word2Vec) else model
    words = doc.split()
    vector_size = model.vector_size
    if not words:  # guard against empty documents
        return np.zeros(vector_size)
    doc_vector = np.zeros((len(words), vector_size))
    for i, word in enumerate(words):
        try:
            doc_vector[i, :] = wv[word]
        except KeyError:
            pass  # the row stays zero
    return np.max(doc_vector, axis=0)
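To make the two pooling schemes concrete, here is a minimal toy comparison (hypothetical 4-dimensional vectors for a three-token document; not part of the experiment above):

# Three word vectors stacked as rows: shape (n_words, vector_size).
toy = np.array([[ 0.1, -0.5,  0.3,  0.0],
                [ 0.4,  0.2, -0.1,  0.0],
                [-0.2,  0.1,  0.6,  0.0]])
print(toy.mean(axis=0))  # mean pooling (get_doc_mean_vector) -> approx [0.1, -0.067, 0.267, 0.0]
print(toy.max(axis=0))   # max pooling (SWEM-max)             ->        [0.4,  0.2,   0.6,   0.0]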
X = np.zeros((len(news_df), model_w2v.wv.vector_size))
for i, doc in tqdm(enumerate(news_df.wakati_text.values), total=len(news_df)):
X[i, :] = get_doc_swem_max_vector(doc, model_w2v)
X.shape
(7367, 300)
accs_dict['swem_max'], elapsed_times_dict['swem_max'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.252278
[200] valid_0's multi_logloss: 0.282172
Early stopping, best iteration is: [110] valid_0's multi_logloss: 0.250547
Accuracy is 0.9179661016949152

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.303795
Early stopping, best iteration is: [99] valid_0's multi_logloss: 0.303722
Accuracy is 0.9077340569877883

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.290364
[200] valid_0's multi_logloss: 0.318144
Early stopping, best iteration is: [118] valid_0's multi_logloss: 0.288602
Accuracy is 0.9137813985064495

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.252443
[200] valid_0's multi_logloss: 0.286379
Early stopping, best iteration is: [106] valid_0's multi_logloss: 0.251445
Accuracy is 0.9300746775288526

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.243245
[200] valid_0's multi_logloss: 0.271738
Early stopping, best iteration is: [117] valid_0's multi_logloss: 0.239859
Accuracy is 0.9157608695652174

Elapsed time is 17.454517602920532.
# Adapted, with minor changes, from https://github.com/nyk510/scdv-python/blob/master/src/create.py
def create_document_vector(documents, w2t, n_embedding):
    """
    Build document vectors from trained word-topic vectors and tokenized documents.
    Args:
        documents(list[list[str]]): tokenized documents
        w2t(dict): mapping from word to its embedded word-topic vector
        n_embedding(int): dimensionality of the embedding
    Returns:
        embedded document vectors
    """
doc_vectors = []
for doc in documents:
vector_i = np.zeros(shape=(n_embedding,))
for w in doc:
try:
v = w2t[w]
vector_i += v
except KeyError:
continue
doc_vectors.append(vector_i)
return np.array(doc_vectors)
def create_idf_dataframe(documents):
    """
    Compute the document frequency and IDF of every word in the corpus.
    Args:
        documents(list[list[str]]): tokenized documents
    Returns(pd.DataFrame): columns 'word', 'count' (document frequency) and 'idf'
    """
d = defaultdict(int)
for doc in documents:
vocab_i = set(doc)
for w in list(vocab_i):
d[w] += 1
df_idf = pd.DataFrame()
df_idf['count'] = d.values()
df_idf['word'] = d.keys()
df_idf['idf'] = np.log(len(documents) / df_idf['count'])
return df_idf
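A quick worked example of the IDF computation on a hypothetical three-document corpus: 'もも' appears in 2 of 3 documents, so its idf is log(3/2) ≈ 0.405, while 'すもも' and 'うち' each appear in 1, giving log(3) ≈ 1.099.

toy_docs = [['すもも', 'もも'], ['もも'], ['うち']]
print(create_idf_dataframe(toy_docs))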
def compress_document_vector(doc_vector, p=.04):
    """L2-normalize each document vector, then zero out components whose
    magnitude falls below p times the mean absolute extreme value."""
    v = np.copy(doc_vector)
    vec_norm = np.linalg.norm(v, axis=1)
    # avoid division by zero
    vec_norm = np.where(vec_norm > 0, vec_norm, 1.)
    v /= vec_norm[:, None]
    a_min = v.min(axis=1).mean()
    a_max = v.max(axis=1).mean()
    threshold = (abs(a_min) + abs(a_max)) / 2. * p
    v[abs(v) < threshold] = .0
    return v
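The effect of the sparsification is easy to see on a toy input (hypothetical 2x4 document vectors; after row normalization, components below the threshold are zeroed while the dominant ones survive):

toy_vectors = np.array([[10.0, 0.1, -8.0, 0.05],
                        [ 0.2, 6.0, -0.1, 5.0 ]])
print(compress_document_vector(toy_vectors))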
def get_scdv(parsed_docs, word_vec=model_w2v, n_components=60, compress=True):
    # Accept whitespace-tokenized strings as well as token lists; the helper
    # functions above expect list[list[str]].
    parsed_docs = [doc.split() if isinstance(doc, str) else doc for doc in parsed_docs]
    n_wv_embed = word_vec.vector_size
    # Word2Vec models hold their vectors in .wv; KeyedVectors are the vectors themselves.
    wv = word_vec.wv if isinstance(word_vec, word2vec.Word2Vec) else word_vec
    # Build the vocabulary sets of the embedding model and of the corpus.
    vocab_model = set(wv.vocab)
    vocab_docs = set(w for doc in parsed_docs for w in doc)
    out_of_vocabs = len(vocab_docs) - len(vocab_docs & vocab_model)
    print(f'out of vocabs: {out_of_vocabs}')
    # Only words that appear both in the corpus and in the model feed the GMM.
    use_words = list(vocab_docs & vocab_model)
    # Word vectors of the words in use; shape = (n_vocabs, n_wv_embed)
    use_word_vectors = np.array([wv[w] for w in use_words])
    # The reference implementation (https://github.com/dheeraj7596/SCDV/blob/master/20news/SCDV.py#L32)
    # uses covariance_type='tied', presumably because the embedding dimension is
    # too high to estimate a full covariance matrix per component; sharing the
    # covariance also appears intended to balance the cluster assignments.
    clf = GaussianMixture(n_components=n_components, covariance_type='tied', verbose=2)
    clf.fit(use_word_vectors)
    # Cluster-assignment probabilities per word; shape = (n_vocabs, n_components)
    word_probs = clf.predict_proba(use_word_vectors)
    # Multiply each word vector by its cluster-assignment probabilities;
    # the result has shape (n_vocabs, n_components, n_wv_embed).
    word_cluster_vector = use_word_vectors[:, None, :] * word_probs[:, :, None]
    # Compute the IDF over the whole corpus first, then left-join against the
    # words in use to pick up the IDF of each used word.
    df_use = pd.DataFrame()
    df_use['word'] = use_words
    df_idf = create_idf_dataframe(parsed_docs)
    df_use = pd.merge(df_use, df_idf, on='word', how='left')
    idf = df_use['idf'].values
    # The paper describes concatenating the per-cluster vectors, but reshaping
    # to 2-d and multiplying each row by its IDF is equivalent.
    topic_vector = word_cluster_vector.reshape(-1, n_components * n_wv_embed) * idf[:, None]
    # Fill NaNs with 0 so they do not propagate.
    topic_vector[np.isnan(topic_vector)] = 0
    word_to_topic = dict(zip(use_words, topic_vector))
    n_embedding = topic_vector.shape[1]
    cdv_vector = create_document_vector(parsed_docs, word_to_topic, n_embedding)
    if compress:
        return compress_document_vector(cdv_vector)
    return cdv_vector
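The resulting SCDV dimensionality is n_components * n_wv_embed; with the settings used here that is 60 * 300 = 18000, which matches the shapes reported below.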
X = get_scdv(news_df.wakati_text.values)
out of vocabs: 2567
Initialization 0
Iteration 10  time lapse 0.97443s  ll change 0.02409
Initialization converged: True  time lapse 1.62653s  ll 555.91177
X.shape
(7367, 18000)
accs_dict['scdv_w2v'], elapsed_times_dict['scdv_w2v'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.408476
[200] valid_0's multi_logloss: 0.455125
Early stopping, best iteration is: [111] valid_0's multi_logloss: 0.406118
Accuracy is 0.8684745762711864

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.399024
[200] valid_0's multi_logloss: 0.433227
Early stopping, best iteration is: [129] valid_0's multi_logloss: 0.393422
Accuracy is 0.8799185888738128

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.415187
[200] valid_0's multi_logloss: 0.455897
Early stopping, best iteration is: [114] valid_0's multi_logloss: 0.414618
Accuracy is 0.86693822131704

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.354444
[200] valid_0's multi_logloss: 0.385248
Early stopping, best iteration is: [126] valid_0's multi_logloss: 0.351332
Accuracy is 0.8981670061099797

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.381748
[200] valid_0's multi_logloss: 0.421634
Early stopping, best iteration is: [113] valid_0's multi_logloss: 0.378952
Accuracy is 0.8790760869565217

Elapsed time is 2971.7359380722046.
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(news_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
X = np.array([model.infer_vector(doc.split()) for doc in news_df.wakati_text.values])
X.shape
(7367, 300)
accs_dict['d2v_default'], elapsed_times_dict['d2v_default'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.410166
[200] valid_0's multi_logloss: 0.438682
Early stopping, best iteration is: [123] valid_0's multi_logloss: 0.404398
Accuracy is 0.8515254237288136

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.375066
[200] valid_0's multi_logloss: 0.397589
Early stopping, best iteration is: [119] valid_0's multi_logloss: 0.367302
Accuracy is 0.8792401628222524

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.45298
[200] valid_0's multi_logloss: 0.496435
Early stopping, best iteration is: [121] valid_0's multi_logloss: 0.449182
Accuracy is 0.8533604887983707

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.394936
[200] valid_0's multi_logloss: 0.421278
Early stopping, best iteration is: [123] valid_0's multi_logloss: 0.390176
Accuracy is 0.8764426340801086

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.388316
[200] valid_0's multi_logloss: 0.410194
Early stopping, best iteration is: [120] valid_0's multi_logloss: 0.380114
Accuracy is 0.8716032608695652

Elapsed time is 90.71377110481262.
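gensim 3.x trains Doc2Vec for only 5 epochs by default, which is likely too few for this corpus, so the next run keeps everything else fixed and raises the epoch count to 30.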
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(news_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=30)
X = np.array([model.infer_vector(doc.split()) for doc in news_df.wakati_text.values])
X.shape
(7367, 300)
accs_dict['d2v_epochs30'], elapsed_times_dict['d2v_epochs30'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.478027
[200] valid_0's multi_logloss: 0.394044
[300] valid_0's multi_logloss: 0.414963
Early stopping, best iteration is: [212] valid_0's multi_logloss: 0.391872
Accuracy is 0.8745762711864407

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.437851
[200] valid_0's multi_logloss: 0.340627
[300] valid_0's multi_logloss: 0.345039
Early stopping, best iteration is: [236] valid_0's multi_logloss: 0.337321
Accuracy is 0.8914518317503393

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.452152
[200] valid_0's multi_logloss: 0.358036
[300] valid_0's multi_logloss: 0.369286
Early stopping, best iteration is: [219] valid_0's multi_logloss: 0.356541
Accuracy is 0.8805159538357095

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.437499
[200] valid_0's multi_logloss: 0.352738
[300] valid_0's multi_logloss: 0.365206
Early stopping, best iteration is: [220] valid_0's multi_logloss: 0.349682
Accuracy is 0.891378139850645

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.462631
[200] valid_0's multi_logloss: 0.382527
Early stopping, best iteration is: [196] valid_0's multi_logloss: 0.382516
Accuracy is 0.876358695652174

Elapsed time is 126.87824606895447.
import gensim.models.keyedvectors as keyedvectors
model_fasttext = keyedvectors.KeyedVectors.load_word2vec_format('../model/fasttext.vec')
X = np.zeros((len(news_df), model_fasttext.vector_size))
for i, doc in tqdm(enumerate(news_df.wakati_text.values), total=len(news_df)):
    X[i, :] = get_doc_mean_vector(doc, model_fasttext)
X.shape
(7367, 300)
accs_dict['fasttext_mean'], elapsed_times_dict['fasttext_mean'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.411041
[200] valid_0's multi_logloss: 0.481018
Early stopping, best iteration is: [102] valid_0's multi_logloss: 0.410916
Accuracy is 0.8671186440677966

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.36119
[200] valid_0's multi_logloss: 0.402293
Early stopping, best iteration is: [108] valid_0's multi_logloss: 0.359622
Accuracy is 0.8853459972862958

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.43532
Early stopping, best iteration is: [98] valid_0's multi_logloss: 0.434781
Accuracy is 0.858112695179905

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.365556
Early stopping, best iteration is: [94] valid_0's multi_logloss: 0.364605
Accuracy is 0.879837067209776

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.356507
[200] valid_0's multi_logloss: 0.393635
Early stopping, best iteration is: [109] valid_0's multi_logloss: 0.353671
Accuracy is 0.889266304347826

Elapsed time is 79.51075530052185.
X = np.zeros((len(news_df), model_fasttext.vector_size))
for i, doc in tqdm(enumerate(news_df.wakati_text.values), total=len(news_df)):
    X[i, :] = get_doc_swem_max_vector(doc, model_fasttext)
X.shape
(7367, 300)
accs_dict['swem_max_fasttext'], elapsed_times_dict['swem_max_fasttext'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.27356
[200] valid_0's multi_logloss: 0.273228
Early stopping, best iteration is: [145] valid_0's multi_logloss: 0.260954
Accuracy is 0.9138983050847458

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.286736
[200] valid_0's multi_logloss: 0.286457
Early stopping, best iteration is: [126] valid_0's multi_logloss: 0.275669
Accuracy is 0.9158751696065129

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.287594
[200] valid_0's multi_logloss: 0.275379
Early stopping, best iteration is: [154] valid_0's multi_logloss: 0.268537
Accuracy is 0.911744738628649

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.293082
[200] valid_0's multi_logloss: 0.296688
Early stopping, best iteration is: [135] valid_0's multi_logloss: 0.284888
Accuracy is 0.9131025118805159

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.265246
[200] valid_0's multi_logloss: 0.265006
Early stopping, best iteration is: [144] valid_0's multi_logloss: 0.25424
Accuracy is 0.9116847826086957

Elapsed time is 84.86075592041016.
X = get_scdv(news_df.wakati_text.values, word_vec=model_fasttext)
out of vocabs: 463
Initialization 0
Iteration 10  time lapse 3.31651s  ll change 0.00775
Iteration 20  time lapse 3.05870s  ll change 0.00595
Initialization converged: True  time lapse 7.90065s  ll 23.37736
X.shape
(7367, 18000)
accs_dict['scdv_fasttext'], elapsed_times_dict['scdv_fasttext'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.402941
[200] valid_0's multi_logloss: 0.448402
Early stopping, best iteration is: [107] valid_0's multi_logloss: 0.398791
Accuracy is 0.8772881355932204

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.405631
[200] valid_0's multi_logloss: 0.447015
Early stopping, best iteration is: [115] valid_0's multi_logloss: 0.403606
Accuracy is 0.878561736770692

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.427582
[200] valid_0's multi_logloss: 0.473779
Early stopping, best iteration is: [118] valid_0's multi_logloss: 0.422557
Accuracy is 0.8737270875763747

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.380217
[200] valid_0's multi_logloss: 0.422549
Early stopping, best iteration is: [115] valid_0's multi_logloss: 0.37349
Accuracy is 0.8906992532247114

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.367811
[200] valid_0's multi_logloss: 0.403988
Early stopping, best iteration is: [105] valid_0's multi_logloss: 0.366884
Accuracy is 0.8824728260869565

Elapsed time is 3519.537034034729.
X = get_scdv(news_df.wakati_text.values, word_vec=model_fasttext, compress=False)
out of vocabs: 463
Initialization 0
Iteration 10  time lapse 3.28870s  ll change 0.01126
Iteration 20  time lapse 3.05809s  ll change 0.00252
Initialization converged: True  time lapse 6.65311s  ll 23.50536
X.shape
(7367, 18000)
accs_dict['scdv_fasttext_raw'], elapsed_times_dict['scdv_fasttext_raw'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.302716
[200] valid_0's multi_logloss: 0.33929
Early stopping, best iteration is: [104] valid_0's multi_logloss: 0.301412
Accuracy is 0.9064406779661017

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.257149
[200] valid_0's multi_logloss: 0.283197
Early stopping, best iteration is: [112] valid_0's multi_logloss: 0.254539
Accuracy is 0.9219810040705563

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.279389
[200] valid_0's multi_logloss: 0.296254
Early stopping, best iteration is: [132] valid_0's multi_logloss: 0.277186
Accuracy is 0.9205702647657841

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.273663
[200] valid_0's multi_logloss: 0.298073
Early stopping, best iteration is: [121] valid_0's multi_logloss: 0.271395
Accuracy is 0.9205702647657841

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.255283
[200] valid_0's multi_logloss: 0.269838
Early stopping, best iteration is: [121] valid_0's multi_logloss: 0.249513
Accuracy is 0.9245923913043478

Elapsed time is 4616.114170074463.
X = get_scdv(news_df.wakati_text.values, word_vec=model_w2v, compress=False)
out of vocabs: 2567
Initialization 0
Iteration 10  time lapse 1.10014s  ll change 0.00740
Initialization converged: True  time lapse 1.42006s  ll 556.31595
X.shape
(7367, 18000)
accs_dict['scdv_w2v_raw'], elapsed_times_dict['scdv_w2v_raw'] = train_and_get_oof_accuracies(X, y, params)
Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.331121
[200] valid_0's multi_logloss: 0.364291
Early stopping, best iteration is: [108] valid_0's multi_logloss: 0.328557
Accuracy is 0.8969491525423728

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.316796
[200] valid_0's multi_logloss: 0.343936
Early stopping, best iteration is: [125] valid_0's multi_logloss: 0.311626
Accuracy is 0.8989145183175034

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.329839
[200] valid_0's multi_logloss: 0.360964
Early stopping, best iteration is: [109] valid_0's multi_logloss: 0.326948
Accuracy is 0.8906992532247114

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.283325
[200] valid_0's multi_logloss: 0.311122
Early stopping, best iteration is: [116] valid_0's multi_logloss: 0.281045
Accuracy is 0.9178547182620502

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100] valid_0's multi_logloss: 0.302535
[200] valid_0's multi_logloss: 0.333334
Early stopping, best iteration is: [116] valid_0's multi_logloss: 0.298851
Accuracy is 0.9021739130434783

Elapsed time is 2939.0836856365204.
results = [accs_dict, elapsed_times_dict]
pd.to_pickle(results, '../data/results_news.pkl')
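A minimal sketch for inspecting the saved results later (assumes the pickle written above; the column names here are illustrative):

accs_dict, elapsed_times_dict = pd.read_pickle('../data/results_news.pkl')
summary = pd.DataFrame({
    'mean_accuracy': {k: np.mean(v) for k, v in accs_dict.items()},
    'elapsed_time_sec': elapsed_times_dict,
})
print(summary.sort_values('mean_accuracy', ascending=False))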