#!/usr/bin/env python
# coding: utf-8

# コンピュータに単語の意味を理解させるためには。
# - シソーラスによる手法
# - カウントベースの手法
# - 推論ベースの手法 (word2vec) (これは次章)
# 
# ## 2.2 シソーラス
# 
# シソーラスとは、同じ意味の単語（類義語）が同じグループに分類されている辞書のこと。
# この手法の問題は、人手で辞書を作成しなければならないこと。
# - 時代の変化に対応するのが困難
# - 人の作業コストが高い
# - 単語の細かなニュアンスを表現できない
# 

# ## 2.3 カウントベースの手法
# 
# - コーパスとは？自然言語処理の研究やアプリケーションのために、目的をもって収集された大量のテキストデータ。
#   - Wikipedia, Google News, シェイクスピア, 夏目漱石
# 
# ここでは、"You say goodby and I say hello." という文章を使用する。

# In[1]:


# Normalise the sample sentence: lower-case it and insert a space before the
# period so that the period becomes a separate token when split on whitespace.
text = 'You say goodby and I say hello.'.lower().replace('.', ' .')
# Bare expression: shows the value when run as a notebook cell.
text


# In[3]:


# Tokenise the normalised sentence on runs of whitespace.
words = str.split(text)
# Bare expression: shows the token list when run as a notebook cell.
words


# 次に、Pythonのディクショナリを作成して、単語にIDを振ることにする。最後に、文章をIDリストに変換する。

# In[10]:


# Give every distinct word an integer ID in order of first appearance
# (the next free ID is always the current size of the mapping).
word_to_id = {}
for token in words:
    word_to_id.setdefault(token, len(word_to_id))
# Inverse lookup: ID -> word.
id_to_word = {idx: token for token, idx in word_to_id.items()}
print(id_to_word)
print(word_to_id)

import numpy as np
# Encode the sentence as a NumPy array of word IDs.
corpus = np.array([word_to_id[token] for token in words])
# Bare expression: shows the array when run as a notebook cell.
corpus


# In[13]:


import numpy as np

def preprocess(text):
    """Convert a sentence into a word-ID corpus plus vocabulary lookups.

    The text is lower-cased, every '.' gets a leading space so that it is
    split off as its own token, and the result is split on whitespace.
    IDs are assigned in order of first appearance, starting at 0.

    :param text: input string
    :return: tuple ``(corpus, word_to_id, id_to_word)``; ``corpus`` is a
        NumPy array of word IDs, ``word_to_id`` maps word -> ID and
        ``id_to_word`` maps ID -> word
    """
    tokens = text.lower().replace('.', ' .').split()

    word_to_id = {}
    for token in tokens:
        # len() is evaluated before insertion, so it is the next free ID;
        # setdefault leaves already-known words untouched.
        word_to_id.setdefault(token, len(word_to_id))
    id_to_word = {idx: token for token, idx in word_to_id.items()}

    corpus = np.array([word_to_id[token] for token in tokens])
    return corpus, word_to_id, id_to_word

# Build the ID corpus and both vocabulary lookups for the sample sentence.
text = 'You say goodby and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text=text)


# ### 2.3.2 単語の分散表現
# 単語を単語として表現するのではなく、よりコンパクトで理にかなったベクトルとして表現することを、「単語の分散表現」と呼ぶ。
# ### 2.3.3 分布仮説
# 「単語の意味は、周囲の単語によって形成される」という仮説を「分布仮説」と呼ぶ。
# - コンテキスト
# 注目する単語に対して、その周囲に存在する単語を「コンテキスト」と呼ぶ。
# - ウィンドウサイズ
# 注目する単語に対する、コンテキストのサイズ。左右の2つの単語までコンテキストに含むなら、ウィンドウサイズは2である。
# 
# ### 2.3.4 共起行列
# 単語をベクトルで表す方法として素直な方法は、周囲の単語をカウントすること。
# 例えば上記の例であれば、7つの単語が登場しているので、行列として周囲の単語をカウントする。

# In[14]:


# Preprocess the sample sentence and show the resulting ID sequence and the
# ID -> word table. The result must be bound to `corpus` (not a misspelled
# name) so that the printed corpus is the one computed from this sentence.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)
print(id_to_word)


# In[24]:


def create_co_matrix(corpus, vocab_size, window_size=1):
    """Count how often each word appears within the context window of another.

    :param corpus: sequence of word IDs
    :param vocab_size: number of rows/columns of the resulting matrix
    :param window_size: how many positions to the left and to the right of
        each word count as its context
    :return: ``(vocab_size, vocab_size)`` int32 array whose entry ``[a, b]``
        is the number of times word ``b`` occurred in the context of word ``a``
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for pos, center_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # Visit the neighbour `offset` steps to the left, then the one
            # to the right, skipping positions that fall outside the corpus.
            for neighbour_pos in (pos - offset, pos + offset):
                if 0 <= neighbour_pos < n_tokens:
                    counts[center_id, corpus[neighbour_pos]] += 1

    return counts

# Bare expression: shows the co-occurrence matrix of the sample corpus when
# run as a notebook cell (default window size).
create_co_matrix(corpus, vocab_size=len(id_to_word))


# ### 2.3.5 ベクトル間の類似度
# ベクトル間の類似度を計測する方法は様々あるが、ここでは、「コサイン類似度」を使用する。
# 
# 下記の`cos_similarity`の実装において、epsを指定しているのはゼロ除算を避けるため。

# In[26]:


def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of two vectors.

    :param x: first vector (NumPy array)
    :param y: second vector (NumPy array)
    :param eps: small constant added to each norm so that an all-zero
        vector does not cause a division by zero
    :return: dot product of the two (approximately) unit-length vectors
    """
    x_norm = np.sqrt(np.sum(x**2)) + eps
    y_norm = np.sqrt(np.sum(y**2)) + eps
    return np.dot(x / x_norm, y / y_norm)

# Build the co-occurrence matrix of the sample corpus and compare the rows
# (word vectors) of 'you' and 'i'.
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
c0, c1 = C[word_to_id['you']], C[word_to_id['i']]
print(cos_similarity(c0, c1))


# 上記の結果から、'you'と'i'の類似度は0.70...となり、比較的高いことが分かる。
# 
# ### 2.3.6 類似単語のランキング表示
# 

# In[27]:


def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are most similar to the query word.

    Similarity is measured with ``cos_similarity`` between the query's row
    of ``word_matrix`` and every other row. Results are printed in
    descending order of similarity; the query word itself is skipped.

    :param query: word to look up
    :param word_to_id: dict mapping word -> ID
    :param id_to_word: dict mapping ID -> word
    :param word_matrix: array whose row ``i`` is the vector of word ID ``i``
    :param top: stop after this many printed results
    :return: None (output is printed)
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Similarity of every vocabulary word to the query, stored as float64.
    vocab_size = len(id_to_word)
    similarity = np.array(
        [cos_similarity(word_matrix[i], query_vec) for i in range(vocab_size)],
        dtype=np.float64,
    )

    # Sorting the negated scores ascending yields the most similar first.
    shown = 0
    for idx in np.argsort(-similarity):
        word = id_to_word[idx]
        if word == query:
            continue
        print(' %s %s' % (word, similarity[idx]))

        shown += 1
        if shown >= top:
            return
        
# Rank the words of the sample corpus by similarity to 'you', using the raw
# co-occurrence rows as word vectors.
most_similar('you', word_to_id, id_to_word, word_matrix=C, top=5)


# 上記の手法では、'goodbye' や 'hello' に類似度があるのは感覚とズレがあるため、これを改善する。
# 
# ## 2.4 カウントベース手法の改善
# 
# ### 2.4.1 相互情報量
# 相互情報量というのは、単語xと単語yの発生確率と、xyが同時に共起する確率から以下のように表現される。
# 
# [tex: PMI(x,y) = \log_2\dfrac{P(x,y)}{P(x)P(y)}]
# 
# これを使ってCorpusから相互情報量の行列を作成する。
# 
# 

# In[28]:


def ppmi(C, verbose=False, eps=1e-8):
    """Build the PPMI (positive pointwise mutual information) matrix.

    With ``N`` the sum of all entries of ``C`` and ``S`` its column sums,
    each entry is ``max(0, log2(C[i, j] * N / (S[i] * S[j]) + eps))``.

    NOTE(review): the column sum ``S[i]`` is used as the count of row word
    ``i``, which presumably relies on ``C`` being square and symmetric.

    :param C: co-occurrence matrix (2-D NumPy array)
    :param verbose: if true, print progress while the matrix is filled
    :param eps: small constant added inside the logarithm so that
        ``log2(0)`` is never evaluated
    :return: float32 array with the same shape as ``C``
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    # Report about every 1% of the cells. For matrices with fewer than 100
    # cells `total // 100` is 0, which would make the modulo below raise
    # ZeroDivisionError, so the interval is clamped to at least 1.
    report_every = max(1, total // 100)
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            # Keep only positive PMI; negative values are clipped to 0.
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M

# Convert the co-occurrence counts to PPMI and print both matrices for
# comparison. C holds co-occurrence counts, so it is labelled as such
# (it is not a covariance matrix).
W = ppmi(C)

# Print arrays with 3 digits of precision.
np.set_printoptions(precision=3)
print('co-occurrence matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)


# PPMI行列を作成したが、この作成には時間がかかる。また、0となる要素が多い（疎な行列である）ので、次にベクトルの次元削減を行う。
# 
# ### 2.4.2 次元削減
# 
# 次元削減を行う手法の一つとして、特異値分解(Singular Value Decomposition:SVD)を行う。
# 
# [tex: X = USV^{T}]

# In[31]:


# SVDによる次元削減

# Factorise the PPMI matrix with NumPy's full SVD.
# NOTE(review): per the NumPy documentation the third return value is the
# transposed right-singular-vector matrix (V^T) - confirm before using `V`.
U, S, V = np.linalg.svd(W)
# Show the vector of word ID 0 in each representation, one per line:
# raw co-occurrence counts, PPMI, and the SVD-transformed vector.
print(C[0], W[0], U[0], sep='\n')


# 各単語を2次元のベクトルで表し、それをグラフにプロットする。
# 
# 'i' と 'you', 'goodbye' と 'hello'が近いので、ある程度直観に近い。

# In[34]:


import matplotlib.pyplot as plt

# Label every word at the position given by the first two columns of its
# row in U, then draw the points themselves semi-transparently.
for word, word_id in word_to_id.items():
    plt.annotate(word, xy=(U[word_id, 0], U[word_id, 1]))
plt.scatter(x=U[:, 0], y=U[:, 1], alpha=0.5)


# ### 2.4.4 PTBデータセット
# 
# PTBデータセットは、Penn Treebankと呼ばれるコーパス。本格的なコーパスである。

# In[35]:


# coding: utf-8
import sys
sys.path.append('..')
from dataset import ptb


# Load the 'train' split of the PTB data set together with its lookups.
corpus, word_to_id, id_to_word = ptb.load_data('train')

# Corpus length and its first 30 word IDs.
print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
# The words stored under the first three IDs.
for sample_id in (0, 1, 2):
    print('id_to_word[%d]:' % sample_id, id_to_word[sample_id])
print()
# The IDs of a few example words.
for sample_word in ('car', 'happy', 'lexus'):
    print("word_to_id['%s']:" % sample_word, word_to_id[sample_word])


# ### 2.4.5 PTBデータセットでの評価
# 
# PTBデータセットを使ってカウントベースの手法を評価する。
# SVDは自前のものを使ってもよいが、高速化するためにsklearnモジュールを使用する。
# 

# In[36]:


# coding: utf-8
import sys
sys.path.append('..')
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb


# Settings: context window radius and number of vector dimensions to keep.
window_size, wordvec_size = 2, 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

print('counting  co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)

print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # Preferred path: scikit-learn's randomized (truncated) SVD, limited to
    # `wordvec_size` components.
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(
        W,
        n_components=wordvec_size,
        n_iter=5,
        random_state=None,
    )
except ImportError:
    # Fallback when the import fails: NumPy's full SVD (slow).
    U, S, V = np.linalg.svd(W)

# Use the first `wordvec_size` columns of U as the word vectors.
word_vecs = U[:, :wordvec_size]

# Print the 5 most similar words for each query word.
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


# sklearnの`randomized_svd()`というメソッドを使用する。Truncated SVDを使用し、乱数を使うので実行結果は毎回異なる。