#!/usr/bin/env python
# coding: utf-8

# # Implementation of Eigenwords (One-Step CCA)

# ## Requirements
# * anaconda3 (>=4.1.1)
# * Download the `text8` corpus and unzip it into this directory (so that `./text8` exists).
#
# $\newcommand{\mt}[1]{\boldsymbol{\mathrm{#1}}}$

# In[1]:

import gc
import datetime as dt
from collections import Counter

import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.utils.extmath import randomized_svd
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'Cython')

datetime_start = dt.datetime.today()

# In[2]:

path_corpus = './text8'

# tuning parameters
max_size_vocabulary = 20000
dim_embedding = 200
size_window = 4

# In[3]:

# Load a text file
with open(path_corpus, 'r') as f:
    txt = f.read()
txt = txt[1:].split(' ')  # text8 begins with a space, so drop it before splitting

# Retrieve vocabulary (index 0, the empty string, is reserved for out-of-vocabulary words)
vocabulary = ['']
counter = Counter(txt)
for word, n in counter.most_common():
    if len(vocabulary) >= max_size_vocabulary:
        break
    vocabulary.append(word)

print('# of word types : {0}'.format(len(counter.items())))
print('Size of vocabulary : {0}'.format(len(vocabulary)))
print('\nTop 100 frequently-used words:\n ' + ' '.join(vocabulary[0:100]))

# In[4]:

size_vocabulary = len(vocabulary)
vocab_to_id = dict(zip(vocabulary, range(size_vocabulary)))
id_tokens = np.array([vocab_to_id[t] if t in vocab_to_id else 0 for t in txt], dtype=np.int64)

# In[5]:

get_ipython().run_cell_magic('cython', '', '\nimport numpy as np\ncimport numpy as np\nfrom scipy import sparse\n\ndef construct_matrices(long[:] id_tokens, int size_vocabulary, int size_window):\n    cdef:\n        long long i_token, n_pushed = 0, n_tokens = len(id_tokens)\n        long long id_word, id_context, i_offset, offset\n        np.ndarray[np.int64_t, ndim=1] offsets = np.array(\n            [i for i in range(-size_window, size_window + 1) if i != 0], dtype=int)\n        np.ndarray[np.int64_t, ndim=1] VTV_diag = np.zeros(size_vocabulary, dtype=int)\n        np.ndarray[np.int64_t, ndim=1] CTC_diag = np.zeros(2*size_window*size_vocabulary, dtype=int)\n        np.ndarray[np.int64_t, ndim=1] VTC_row_index = np.empty(2*size_window*n_tokens, dtype=int)  # undesirable\n        np.ndarray[np.int64_t, ndim=1] VTC_col_index = np.empty(2*size_window*n_tokens, dtype=int)  # undesirable\n\n    for i_token in range(n_tokens):\n        id_word = id_tokens[i_token]\n        VTV_diag[id_word] += 1\n\n        for i_offset in range(2*size_window):\n            offset = offsets[i_offset]\n            if not (0 <= i_token + offset < n_tokens): continue\n            id_context = id_tokens[i_token + offset] + i_offset * size_vocabulary\n\n            CTC_diag[id_context] += 1\n            VTC_row_index[n_pushed] = id_word\n            VTC_col_index[n_pushed] = id_context\n            n_pushed += 1\n\n    VTC_values = [1] * n_pushed\n    VTC = sparse.csc_matrix(\n        (VTC_values, (VTC_row_index[range(n_pushed)], VTC_col_index[range(n_pushed)])),\n        shape=(size_vocabulary, 2*size_window*size_vocabulary))\n\n    return (VTV_diag, CTC_diag, VTC)\n')
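# The `%%cython` cell above is exported by nbconvert as a single escaped string, which is hard to read.
# The plain-Python sketch below is added here for readability only; it is my own equivalent (not part of
# the original notebook, and not used by later cells). It builds the same three objects, just much more
# slowly: the diagonal of V^T V (word counts), the diagonal of C^T C (context counts, one block of
# columns per window offset), and the sparse co-occurrence matrix V^T C.

def construct_matrices_py(id_tokens, size_vocabulary, size_window):
    """Slow, plain-Python reference for the Cython `construct_matrices` above (illustrative only)."""
    offsets = [o for o in range(-size_window, size_window + 1) if o != 0]
    n_tokens = len(id_tokens)
    VTV_diag = np.zeros(size_vocabulary, dtype=np.int64)
    CTC_diag = np.zeros(2 * size_window * size_vocabulary, dtype=np.int64)
    rows, cols = [], []
    for i_token in range(n_tokens):
        id_word = id_tokens[i_token]
        VTV_diag[id_word] += 1                           # diag of V^T V: word occurrence counts
        for i_offset, offset in enumerate(offsets):
            if not (0 <= i_token + offset < n_tokens):
                continue                                 # skip contexts that fall outside the corpus
            # each window offset gets its own block of `size_vocabulary` context columns
            id_context = id_tokens[i_token + offset] + i_offset * size_vocabulary
            CTC_diag[id_context] += 1                    # diag of C^T C: context occurrence counts
            rows.append(id_word)
            cols.append(id_context)
    # duplicate (row, col) pairs are summed by the sparse constructor, giving co-occurrence counts
    VTC = sparse.csc_matrix((np.ones(len(rows), dtype=np.int64), (rows, cols)),
                            shape=(size_vocabulary, 2 * size_window * size_vocabulary))
    return VTV_diag, CTC_diag, VTC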
# In[6]:

VTV_diag, CTC_diag, VTC = construct_matrices(id_tokens, size_vocabulary, size_window)
gc.collect()

# $$
# \begin{align}
# \mt{\mathcal{C}}_{VV} = \sqrt{\mt{V}^\top \mt{V}}, \quad
# \mt{\mathcal{C}}_{VC} = \sqrt{\mt{V}^\top \mt{C}}, \quad
# [\mt{\mathcal{C}}_{CC}]_{i, j} = \delta_{i, j} \cdot \left[\sqrt{\mt{C}^\top \mt{C}}\right]_{i, j}
# \end{align}
# $$
#
# The optimal solution of One-Step CCA can be obtained via the SVD of $\mt{A} = \mt{\mathcal{C}}_{VV}^{-1/2} \mt{\mathcal{C}}_{VC} \mt{\mathcal{C}}_{CC}^{-1/2}$.
# ($\sqrt{\cdot}$ denotes the element-wise square root, and $\delta$ is the Kronecker delta.)

# In[7]:

# C_VV and C_CC are diagonal, so their inverse square roots are the raw counts raised to the power -1/4
VTV_diag_h = VTV_diag.astype('double')**(-1/4)
CTC_diag_h = CTC_diag.astype('double')**(-1/4)
A = sparse.diags(VTV_diag_h) @ VTC.power(1/2) @ sparse.diags(CTC_diag_h)
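# A tiny dense sanity check of the same scaling (my own illustration, not part of the original notebook;
# the `_toy_*` names and values are made up). Row and column sums stand in for the word and context counts.

_toy_vtc = np.array([[4.0, 1.0, 0.0],
                     [1.0, 2.0, 1.0]])      # stand-in for V^T C on a 2-word, 3-context toy example
_toy_vtv_diag = _toy_vtc.sum(axis=1)        # stand-in for diag(V^T V)
_toy_ctc_diag = _toy_vtc.sum(axis=0)        # stand-in for diag(C^T C)
_toy_A = np.diag(_toy_vtv_diag ** -0.25) @ np.sqrt(_toy_vtc) @ np.diag(_toy_ctc_diag ** -0.25)
_toy_U, _toy_S, _toy_Vt = np.linalg.svd(_toy_A)   # _toy_U plays the role of U computed below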
# In[8]:

A

# In[9]:

plt.spy(A, markersize=0.005, alpha=0.3)

# In[10]:

U, S_diag, V = randomized_svd(A, n_components=dim_embedding, n_iter=3)

# In[11]:

print('Elapsed time :', dt.datetime.today() - datetime_start)

# In[12]:

plt.plot(S_diag, 'b+')
plt.xlim(-2, )
plt.yscale('log')
plt.xlabel('$i$', fontsize=20)
plt.ylabel(r'singular value $\sigma_i$', fontsize=20)

# In[13]:

vectors = pd.DataFrame(normalize(sparse.diags(VTV_diag_h) @ U), index=vocabulary)

# In[14]:

query = 'godzilla'
similarities = pd.Series(vectors.values @ vectors.loc[query, :].values, index=vocabulary)
similarities.sort_values(ascending=False).head(10)
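# The rows of `vectors` are L2-normalized, so the dot products above are cosine similarities.
# Wrapping the query in a small helper makes it easy to try other words; the function name
# `most_similar` is mine and is not part of the original notebook.

def most_similar(word, k=10):
    """Return the k vocabulary words whose embeddings are closest (by cosine similarity) to `word`."""
    sims = pd.Series(vectors.values @ vectors.loc[word, :].values, index=vocabulary)
    return sims.sort_values(ascending=False).head(k)

most_similar(query)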
# In[15]:

# biplot
margin_plot = 0.1
vectors_plot = vectors.iloc[np.arange(0, 1000, 3), :]

pca = PCA(n_components=2)
X = pca.fit_transform(vectors_plot)

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
for i in range(X.shape[0]):
    ax.text(X[i, 0], X[i, 1], vectors_plot.index[i])
_ = ax.axis([min(X[:, 0]) - margin_plot, max(X[:, 0]) + margin_plot,
             min(X[:, 1]) - margin_plot, max(X[:, 1]) + margin_plot])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')

# ## Reference
# * Paramveer S. Dhillon, Dean P. Foster, and Lyle H. Ungar. 2015. **Eigenwords: Spectral word embeddings**. Journal of Machine Learning Research, 16:3035–3078.

# ## License
# Copyright (c) 2017 Takamasa Oshikiri
# This software is released under the MIT License.
#