#!/usr/bin/env python
# coding: utf-8

# # Segmentation-free word embeddings using the sembei package

# In[1]:


import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
# Plot configuration: ggplot theme, IPAexGothic as the font family
# (presumably so Japanese text renders in figures), and inline figure output.
plt.style.use('ggplot')
plt.rc('font', family=['IPAexGothic'])
get_ipython().run_line_magic('matplotlib', 'inline')

import sembei as sb


# In[2]:


# Location of the plain-text corpus; edit this to point at the local copy.
CORPUS_PATH = '/path/to/jawiki-latest-pages-articles_text_10M.txt'

# Load the whole corpus as one string with every newline turned into a space.
# errors='ignore' silently drops bytes that are not valid UTF-8 (presumably
# because the file was truncated mid-character -- TODO confirm).
with open(CORPUS_PATH, mode='r', encoding='utf-8', errors='ignore') as f:
    # A single read + replace yields the same string as joining per-line
    # replacements, without first materialising a list of every line.
    corpus_str = f.read().replace('\n', ' ')

# Corpus length in characters (displayed as the cell output).
len(corpus_str)


# In[3]:


# Show a 100-character window from the corpus as a quick look at the loaded text.
preview_start = 100000
preview_length = 100
corpus_str[preview_start:preview_start + preview_length]


# In[4]:


# Vocabulary extraction, executed under the %%time cell magic so its run time is reported.
# The cell body (passed below as a string) does three things:
#   1. Defines n_extract_tuple, a list of eight (n, count) pairs for n = 1..8
#      -- presumably (n-gram length, number of top n-grams to keep); TODO confirm.
#   2. Calls sb.utils.ngram.extract_topn_ngram_lossycounting on corpus_str with
#      width_ngram=len(n_extract_tuple) (i.e. 8), epsilon=1e-7,
#      support_threshold=1e-7 and n_processes=20, storing the result in vocabulary_all.
#   3. Stores len(vocabulary_all) in size_vocabulary_all.
# NOTE(review): n_processes=20 is hard-coded; adjust it to the machine running this.
get_ipython().run_cell_magic('time', '', 'n_extract_tuple = [(1, 3000), (2, 100000), (3, 100000), (4, 100000), (5, 100000),\n                   (6, 70000), (7, 10000), (8, 10000)]\n\nvocabulary_all = sb.utils.ngram.extract_topn_ngram_lossycounting(\n    corpus_str, width_ngram=len(n_extract_tuple), n_extract_tuple=n_extract_tuple,\n    epsilon=1e-7, support_threshold=1e-7, n_processes=20)\n    \nsize_vocabulary_all = len(vocabulary_all)\n')


# In[5]:


# Create the Sembei model over the raw corpus string and the extracted vocabulary.
# dim is presumably the embedding dimensionality and n_iter_rsvd the number of
# randomized-SVD iterations -- TODO confirm against the sembei documentation.
sembei = sb.embed.Sembei(
    corpus_str,
    vocabulary=vocabulary_all,
    dim=200,
    n_iter_rsvd=6,
    wide_window=False,
)


# In[6]:


# Calls sembei.construct_cooccurrence_matrix under %%time so its run time is reported.
# n_cores=4, n_chunk=40 and n_chunk_pool=10 are passed straight through; presumably
# they control parallelism and how the corpus is chunked -- TODO confirm.
get_ipython().run_cell_magic('time', '', 'sembei.construct_cooccurrence_matrix(n_cores=4, n_chunk=40, n_chunk_pool=10)\n')


# In[7]:


# Calls sembei.compute() under %%time so its run time is reported.
# NOTE(review): presumably this is the step that derives the embeddings from the
# co-occurrence matrix -- confirm against the sembei documentation.
get_ipython().run_cell_magic('time', '', 'sembei.compute()\n')


# In[8]:


# Fetch the vectors from the model with gamma=1e-6 (the meaning of gamma is not
# visible here -- see the sembei documentation).
vectors = sembei.get_vectors(gamma=1e-6)
# Display a random sample of 10 entries as a spot check. No seed is passed, so the
# selection differs between runs. Presumably `vectors` is a pandas object -- TODO confirm.
vectors.sample(n=10)


# In[9]:


# Space-separated probe words: Astro Boy, generation, probability, program,
# defeat, mathematics, Edo period, China.
query_text = '鉄腕アトム 生成 確率 プログラム 倒す 数学 江戸時代 中国'
query_list = query_text.split(' ')

# Pass the model and the queries to sembei's top-n helper and display its result.
sb.utils.show.get_topn_df(sembei, query_list)


# In[ ]: