from __future__ import absolute_import, division, print_function
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
print(tf.__version__)
1.12.0
# Arbitrary sentences whose word vectors we will learn and analyze
sentences = ["나 고양이 좋다",
             "나 강아지 좋다",
             "나 동물 좋다",
             "강아지 고양이 동물",
             "여자친구 고양이 강아지 좋다",
             "고양이 생선 우유 좋다",
             "강아지 생선 싫다 우유 좋다",
             "강아지 고양이 눈 좋다",
             "나 여자친구 좋다",
             "여자친구 나 싫다",
             "여자친구 나 영화 책 음악 좋다",
             "나 게임 만화 애니 좋다",
             "고양이 강아지 싫다",
             "강아지 고양이 좋다"]
# Join all the sentences, split on whitespace, and build a list of the unique words.
word_sequence = ' '.join(sentences).split()
word_list = list(set(word_sequence))
word_list.sort()
print(word_sequence)
print(word_list)
['나', '고양이', '좋다', '나', '강아지', '좋다', '나', '동물', '좋다', '강아지', '고양이', '동물', '여자친구', '고양이', '강아지', '좋다', '고양이', '생선', '우유', '좋다', '강아지', '생선', '싫다', '우유', '좋다', '강아지', '고양이', '눈', '좋다', '나', '여자친구', '좋다', '여자친구', '나', '싫다', '여자친구', '나', '영화', '책', '음악', '좋다', '나', '게임', '만화', '애니', '좋다', '고양이', '강아지', '싫다', '강아지', '고양이', '좋다']
['강아지', '게임', '고양이', '나', '눈', '동물', '만화', '생선', '싫다', '애니', '여자친구', '영화', '우유', '음악', '좋다', '책']
# Working with numbers is much easier than working with strings,
# so instead of the words themselves we use their indices in the word list.
# Build an associative array (word -> index) so each word can be looked up by name.
word_dic = {w: i for i, w in enumerate(word_list)}
print(word_dic)
{'강아지': 0, '게임': 1, '고양이': 2, '나': 3, '눈': 4, '동물': 5, '만화': 6, '생선': 7, '싫다': 8, '애니': 9, '여자친구': 10, '영화': 11, '우유': 12, '음악': 13, '좋다': 14, '책': 15}
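For reading results back later, one could also keep the inverse mapping from index to word. This is a small optional helper that is not part of the original notebook and is not used by the code below:
# Optional helper (not used below): map an index back to its word.
index_to_word = {i: w for w, i in word_dic.items()}
print(index_to_word[14])  # '좋다', given the word_dic printed above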
def preprocessor(sequences, word_dic, window_size):
    # Build skip-gram (center, context) index pairs from a flat token sequence.
    # Only positions with a full window on both sides are used as center words.
    context = []
    for idx in range(window_size, len(sequences) - window_size):
        center_word = word_dic.get(sequences[idx])
        context_words = [word_dic.get(sequences[idx + offset])
                         for offset in range(-window_size, window_size + 1) if offset != 0]
        for token in context_words:
            context.append([center_word, token])
    return context
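As a quick illustration (not part of the original notebook), running the function on a short five-token sequence with window_size = 2 leaves exactly one valid center position, which gets paired with its four neighbours:
# Illustrative only: a five-token sequence has a single valid center position (idx = 2).
toy = preprocessor(sequences = ['나', '고양이', '좋다', '나', '강아지'],
                   word_dic = word_dic, window_size = 2)
print(toy)  # [[14, 3], [14, 2], [14, 3], [14, 0]] with the word_dic above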
batch = preprocessor(sequences = word_sequence, word_dic = word_dic, window_size = 2)
center_words = np.array(batch)[:,0]
target_words = np.array(batch)[:,[1]]
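center_words is a rank-1 array of center-word indices, while target_words keeps an extra axis because tf.nn.nce_loss expects its labels with shape [batch_size, num_true]. With 52 tokens and window_size = 2 there are 48 center positions, each paired with 4 neighbours, i.e. 192 pairs; this matches the total_step of 24 printed below for batch_size = 8. A quick sanity check (illustrative):
# Sanity check: 192 skip-gram pairs in total.
print(center_words.shape, target_words.shape)  # expected: (192,) (192, 1)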
class Word2vec:
    def __init__(self, center_words, target_words, vocab_size,
                 embedding_dim = 2, num_sampled = 10):
        self._center_words = center_words
        self._target_words = target_words

        # Embedding matrix: one embedding_dim-dimensional vector per vocabulary word.
        with tf.variable_scope('embeddings'):
            self._embeddings = tf.get_variable(name = 'lookup_table', shape = [vocab_size, embedding_dim],
                                               dtype = tf.float32, initializer = tf.truncated_normal_initializer())
            self._selected_embed = tf.nn.embedding_lookup(params = self._embeddings, ids = self._center_words)

        # Noise-contrastive estimation (NCE) loss as a cheap substitute for a full softmax
        # over the vocabulary; num_sampled negative classes are drawn per example.
        with tf.variable_scope('nce'):
            nce_weights = tf.get_variable('weights', shape = [vocab_size, embedding_dim], dtype = tf.float32,
                                          initializer = tf.truncated_normal_initializer())
            nce_biases = tf.get_variable('biases', initializer = tf.zeros(shape = [vocab_size]))
            self.nce_loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights, biases = nce_biases,
                                                          labels = self._target_words, inputs = self._selected_embed,
                                                          num_sampled = num_sampled, num_classes = vocab_size))

    def get_wordvector(self, sess, word_dic, word):
        # Look up the embedding of a single word by feeding its index
        # directly into the center-word tensor (TF1 allows feeding non-placeholder tensors).
        idx = word_dic.get(word)
        feed_get_wordvector = {self._center_words : [idx]}
        return sess.run(self._selected_embed, feed_dict = feed_get_wordvector)
# hyper-parameters
epochs = 200
batch_size = 8
learning_rate = .001
total_step = int(len(batch) / batch_size)
print(total_step)
24
## create input pipeline with tf.data
dataset = tf.data.Dataset.from_tensor_slices((center_words, target_words))
dataset = dataset.shuffle(buffer_size = 32)
dataset = dataset.batch(batch_size = batch_size)
iterator = dataset.make_initializable_iterator()
x_data, y_data = iterator.get_next()
sgram = Word2vec(center_words = x_data, target_words = y_data, vocab_size = len(word_dic))
# create training op
opt = tf.train.AdamOptimizer(learning_rate = learning_rate)
# var_list is left at its default (None), so all trainable variables are optimized
training_op = opt.minimize(loss = sgram.nce_loss)
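A quick way to confirm which variables the optimizer will update (an illustrative check, not in the original notebook): the embedding lookup table plus the NCE weights and biases.
# Illustrative check of the trainable variables in the graph.
for v in tf.trainable_variables():
    print(v.name, v.shape)
# embeddings/lookup_table:0 (16, 2)
# nce/weights:0 (16, 2)
# nce/biases:0 (16,)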
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
tr_loss_hist = []
for epoch in range(epochs):
    # Re-initialize the iterator so each epoch makes one pass over the shuffled dataset.
    sess.run(iterator.initializer)
    avg_tr_loss = 0
    total_step = 0

    try:
        while True:
            _, tr_loss = sess.run(fetches = [training_op, sgram.nce_loss])
            avg_tr_loss += tr_loss
            total_step += 1
    except tf.errors.OutOfRangeError:
        pass

    avg_tr_loss /= total_step
    tr_loss_hist.append(avg_tr_loss)
    if epoch % 20 == 0:
        print('epoch : {:3}, tr_loss : {:.2f}'.format(epoch, avg_tr_loss))
epoch : 0, tr_loss : 11.64
epoch : 20, tr_loss : 8.82
epoch : 40, tr_loss : 6.86
epoch : 60, tr_loss : 5.09
epoch : 80, tr_loss : 4.15
epoch : 100, tr_loss : 3.39
epoch : 120, tr_loss : 3.11
epoch : 140, tr_loss : 3.16
epoch : 160, tr_loss : 3.00
epoch : 180, tr_loss : 3.04
plt.plot(tr_loss_hist)
[<matplotlib.lines.Line2D at 0x7fe0f00b1198>]
for word in word_list:
    tmp = sgram.get_wordvector(sess = sess, word_dic = word_dic, word = word)
    x, y = tmp[0][0], tmp[0][1]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
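As an extra step beyond the original notebook, the learned embedding matrix can be pulled out of the session and queried for nearest neighbours by cosine similarity. This sketch reads the private _embeddings attribute purely for illustration:
# Illustrative extra: cosine-similarity neighbours from the learned embeddings.
emb = sess.run(sgram._embeddings)  # shape: [vocab_size, embedding_dim] = [16, 2]
emb_norm = emb / np.linalg.norm(emb, axis = 1, keepdims = True)

def most_similar(word, topn = 3):
    # Cosine similarity of every word to the query word, most similar first.
    sims = emb_norm.dot(emb_norm[word_dic[word]])
    order = np.argsort(-sims)
    return [(word_list[i], float(sims[i])) for i in order if word_list[i] != word][:topn]

print(most_similar('고양이'))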