#!/usr/bin/env python
# coding: utf-8

# # Segmentation-free Word embeddings using sembei package
#
# Script export of a Jupyter notebook.  The `get_ipython()` calls below are
# the exported form of notebook magics, so this file must be run inside an
# IPython session (e.g. `ipython script.py` or `%run`), not plain `python`.

# In[1]:

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')
# Font family is set explicitly; the queries and corpus below are Japanese,
# so presumably this is needed to render Japanese labels -- confirm the font
# is installed on the target machine.
matplotlib.rc('font', family=['IPAexGothic'])
get_ipython().run_line_magic('matplotlib', 'inline')

import sembei as sb


# In[2]:

# Read the corpus into one string, turning every newline into a space.
# NOTE: errors='ignore' silently drops bytes that are not valid UTF-8.
# The path is a placeholder and must be edited before running.
with open('/path/to/jawiki-latest-pages-articles_text_10M.txt',
          mode='r', encoding='utf-8', errors='ignore') as f:
    # Iterate the file object directly instead of f.readlines(): same lines,
    # without first materialising the whole file as a list.
    corpus_str = ''.join(line.replace('\n', ' ') for line in f)

# Bare expression: displayed as cell output in a notebook, no effect in a script.
len(corpus_str)


# In[3]:

# Peek at a 100-character slice of the corpus (notebook display only).
corpus_str[100000:100100]


# In[4]:

# Exported `%%time` cell.  The cell source is passed as a string and executed
# by IPython; it defines `n_extract_tuple`, `vocabulary_all` and
# `size_vocabulary_all`, and `vocabulary_all` is consumed by the next cell.
# NOTE(review): each (n, k) pair in n_extract_tuple presumably means "keep the
# top k n-grams of length n" -- confirm against the sembei documentation.
get_ipython().run_cell_magic(
    'time', '',
    'n_extract_tuple = [(1, 3000), (2, 100000), (3, 100000), (4, 100000), (5, 100000),\n'
    ' (6, 70000), (7, 10000), (8, 10000)]\n'
    '\n'
    'vocabulary_all = sb.utils.ngram.extract_topn_ngram_lossycounting(\n'
    ' corpus_str, width_ngram=len(n_extract_tuple), n_extract_tuple=n_extract_tuple,\n'
    ' epsilon=1e-7, support_threshold=1e-7, n_processes=20)\n'
    ' \n'
    'size_vocabulary_all = len(vocabulary_all)\n')


# In[5]:

# Build the embedding model over the raw corpus and the extracted vocabulary.
sembei = sb.embed.Sembei(corpus_str, vocabulary=vocabulary_all,
                         dim=200, n_iter_rsvd=6, wide_window=False)


# In[6]:

# Timed: construct the co-occurrence matrix.
get_ipython().run_cell_magic(
    'time', '',
    'sembei.construct_cooccurrence_matrix(n_cores=4, n_chunk=40, n_chunk_pool=10)\n')


# In[7]:

# Timed: run the model's compute step.
get_ipython().run_cell_magic('time', '', 'sembei.compute()\n')


# In[8]:

# Fetch the vectors and show a random sample of 10 rows (notebook display only).
# NOTE(review): `.sample(n=10)` suggests a pandas object is returned -- confirm.
vectors = sembei.get_vectors(gamma=1e-6)
vectors.sample(n=10)


# In[9]:

# Query strings, space-separated; the result is displayed as cell output.
query_list = '鉄腕アトム 生成 確率 プログラム 倒す 数学 江戸時代 中国'.split(' ')
sb.utils.show.get_topn_df(sembei, query_list)


# In[ ]: