#!/usr/bin/env python
# coding: utf-8

# # 1. RNN의 문제점
# 
# 
#     1.1 BPTT(BackPropagation Through TIme)의 문제
#         저번 시간에 다루었던 BPTT는 아래의 그림과 같이 모든 타임스텝마다 처음부터 끝까지 역전파를 한다.

# ![BPTT.PNG](attachment:BPTT.PNG)

#          하지만 시간이 길 경우, 역전파의 길이는 길어진다. 즉, 깊은 네트워크가 형성된다.
#          깊은 네트워크는 Gradient Vanishing & Exploding 문제가 발생할 가능성이 크다.
#          깊은 네트워크는 역전파의 계산량이 많아지기 때문에 학습시간이 오래 걸린다

#     1.2 장기 의존성(Long_Term Dependency) 문제
#         RNN은 타임 스텝 t에서 이전 타임 스텝(t - 1)의 상태를 입력으로 받는 구조 --> 이전의 정보가 현재의 타임 스텝 t에 영향을 줄 수 있음
#         장기 의존성: 이론적으로 모든 이전 타임 스텝이 영향을 주지만, 타임 스텝이 길어질수록 앞쪽의 타임 스텝은 영향을 주지 못함

# ![GV.PNG](attachment:GV.PNG)

# # 2. LSTM
#     
#     LSTM(Long Short-Term Memory)는 1995년에 제안된 구조로써 RNN의 장기 의존성 문제를 해결하고 학습 속도를 높임
#     모든 RNN은 신경망의 반복적인 모듈 체인의 형태를 가짐
#     LSTM에는 모듈 체인과 같은 구조가 있지만 1개의 신경 네트워크 계층을 갖는 대신 4개의 신경 네트워크 계층을 가짐

# ![%EA%B7%B8%EB%A6%BC1.png](attachment:%EA%B7%B8%EB%A6%BC1.png)

#     LSTM의 과정
#         LSTM는 두개의 input data ht와 ct가 있음,  ct는 장기적인 기억, ht는 단기적인 기억을 저장
#         LSTM의 핵심은 ct에서 기억할 부분, 삭제할 부분, 그리고 읽어 들일 부분을 학습하는 것
#         ct는 셀의 왼쪽에서 오른쪽으로 통과하면서 forget gate와 input gate를 거침으로써 기억을 일부 잃고 얻음
#         과정을 거친 ct는 다시 tanh함수로 전달되어 ht와 yt를 만드는데 기반이 됨

# ![overview.PNG](attachment:overview.PNG)

#      forget gate layer
#         
#         어떤 정보를 버릴지 선택하는 과정
#         ft에 의해 제어되며 장기 상태 ct를 얼마나 삭제할지 제어
#     
#     input gate layer
#         
#         it에 의해 제어되며 gt의 어느 부분이 장기 상태ct에 더해져야 하는지 제어
#         기존 RNN의 셀과 같은 형태를 취함
#         
#      output gate layer
#          ot는 장기 상태 ct의 어느 부분을 읽어서 ht와 yt로 출력해야 하는지 제어

# # LSTM 예제
# 
#        파란색 박스가 입력값이고, 빨간색 박스가 우리가 원하는 출력값입니다.
#        1~4번째 음표를 데이터로 5번째 음표를 라벨값으로 학습을 시킵니다.
#        다음에는 2~5번째 음표를 데이터로 6번째 음표를 라벨값으로 학습을 시킵니다.
#        이후 한 음표씩 넘어가면서 노래 끝까지 학습시킵니다.

# ![%EC%95%85%EB%B3%B4%20LSTM.PNG](attachment:%EC%95%85%EB%B3%B4%20LSTM.PNG)

# In[62]:


# Note-code vocabulary. The list is written in index order, so a code's
# position in the list is its integer index.
code2idx = dict(zip(
    ['c4', 'd4', 'e4', 'f4', 'g4', 'a4', 'b4',
     'c8', 'd8', 'e8', 'f8', 'g8', 'a8', 'b8'],
    range(14),
))

# Inverse table (integer index -> note code), derived from the forward table.
idx2code = {index: code for code, index in code2idx.items()}


# In[63]:


import numpy as np

def seq2dataset(seq, window_size, mapping=None):
    """Slice a sequence into overlapping, integer-encoded windows.

    Every row holds ``window_size + 1`` consecutive items of ``seq``; callers
    in this file use the first ``window_size`` columns as the model input and
    the last column as the label.

    Parameters
    ----------
    seq : sequence
        Items that are keys of ``mapping`` (note codes in this file).
    window_size : int
        Number of input items per row.
    mapping : dict, optional
        Item -> integer index table. Defaults to the module-level ``code2idx``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seq) - window_size, window_size + 1)``. When
        ``seq`` is too short for a single window, an empty integer array of
        shape ``(0, window_size + 1)`` is returned so that column slicing on
        the result keeps working.

    Raises
    ------
    KeyError
        If an item of ``seq`` is not a key of ``mapping``.
    """
    if mapping is None:
        # Resolved at call time, exactly like the original global lookup.
        mapping = code2idx
    width = window_size + 1
    rows = [
        [mapping[item] for item in seq[start:start + width]]
        for start in range(len(seq) - window_size)
    ]
    if not rows:
        return np.empty((0, width), dtype=int)
    return np.array(rows)


# In[64]:


# Melody used as training data, one note code per element.
# The line breaks are only a layout aid; the list is a single flat sequence.
seq = [
    'g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'd8', 'e8', 'f8', 'g8', 'g8', 'g4',
    'g8', 'e8', 'e8', 'e8', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4',
    'd8', 'd8', 'd8', 'd8', 'd8', 'e8', 'f4', 'e8', 'e8', 'e8', 'e8', 'e8', 'f8', 'g4',
    'g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4',
]

# Each row: 4 consecutive notes followed by the note that comes next.
dataset = seq2dataset(seq, 4)

print(dataset.shape)
print(dataset)


# In[65]:


# 0. 사용할 패키지 불러오기
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils

# Fix NumPy's global random seed so NumPy-driven randomness repeats between runs.
# NOTE(review): this seeds NumPy only; whether the Keras backend draws its
# initial weights from this generator depends on the backend in use — confirm.
np.random.seed(5)

# Loss-history callback
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records the training loss at the end of every epoch.

    The recorded values are kept in ``self.losses`` (a list, one entry per
    ``on_epoch_end`` call). The list is created by the constructor, so the
    callback is usable immediately; ``init()`` is retained so that existing
    code calling ``history.init()`` keeps working and can reset the history.
    """

    def __init__(self):
        super().__init__()
        self.init()

    def init(self):
        """Reset the recorded loss history to an empty list."""
        self.losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append ``logs['loss']`` (``None`` when absent) to ``self.losses``.

        ``batch`` keeps its original name for backward compatibility; it is
        not read by this method.
        """
        # Avoid a shared mutable default and tolerate logs=None.
        logs = logs or {}
        self.losses.append(logs.get('loss'))

# Dataset-building helper
def seq2dataset(seq, window_size, mapping=None):
    """Slice a sequence into overlapping, integer-encoded windows.

    Every row holds ``window_size + 1`` consecutive items of ``seq``; callers
    in this file use the first ``window_size`` columns as the model input and
    the last column as the label.

    Parameters
    ----------
    seq : sequence
        Items that are keys of ``mapping`` (note codes in this file).
    window_size : int
        Number of input items per row.
    mapping : dict, optional
        Item -> integer index table. Defaults to the module-level ``code2idx``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seq) - window_size, window_size + 1)``. When
        ``seq`` is too short for a single window, an empty integer array of
        shape ``(0, window_size + 1)`` is returned so that column slicing on
        the result keeps working.

    Raises
    ------
    KeyError
        If an item of ``seq`` is not a key of ``mapping``.
    """
    if mapping is None:
        # Resolved at call time, exactly like the original global lookup.
        mapping = code2idx
    width = window_size + 1
    rows = [
        [mapping[item] for item in seq[start:start + width]]
        for start in range(len(seq) - window_size)
    ]
    if not rows:
        return np.empty((0, width), dtype=int)
    return np.array(rows)

# 1. 데이터 준비하기
        
# Note-code dictionaries. The list is written in index order, so a code's
# position in the list is its integer index.

code2idx = dict(zip(
    ['c4', 'd4', 'e4', 'f4', 'g4', 'a4', 'b4',
     'c8', 'd8', 'e8', 'f8', 'g8', 'a8', 'b8'],
    range(14),
))

# Inverse table (integer index -> note code), derived from the forward table.
idx2code = {index: code for code, index in code2idx.items()}

# Sequence data: the melody as a flat list of note codes

seq = ['g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'd8', 'e8', 'f8', 'g8', 'g8', 'g4',
       'g8', 'e8', 'e8', 'e8', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4',
       'd8', 'd8', 'd8', 'd8', 'd8', 'e8', 'f4', 'e8', 'e8', 'e8', 'e8', 'e8', 'f8', 'g4',
       'g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4']

# 2. Build the dataset: rows of 4 input notes followed by the next note

dataset = seq2dataset(seq, window_size = 4)

print(dataset.shape)

# Split into inputs (X: first 4 columns) and labels (Y: fifth column)
x_train = dataset[:,0:4]
y_train = dataset[:,4]

# Largest index in code2idx; used to scale inputs into [0, 1]
max_idx_value = 13

# Normalise the inputs
x_train = x_train / float(max_idx_value)

# Reshape to (samples, time steps, features). The sample count is taken from
# the data instead of being hard-coded, so a different melody length works.
x_train = np.reshape(x_train, (x_train.shape[0], 4, 1))

# One-hot encode the labels.
# NOTE(review): no num_classes is passed, so the vector width is decided by
# to_categorical from the labels present — confirm this is intended.
y_train = np_utils.to_categorical(y_train)

one_hot_vec_size = y_train.shape[1]

print("one hot encoding vector size is ", one_hot_vec_size)

# 3. Build the model: one LSTM layer (128 units) over windows of shape
# (4 time steps, 1 feature), followed by a softmax over the one-hot classes.
model = Sequential()
model.add(LSTM(128, input_shape = (4, 1)))
model.add(Dense(one_hot_vec_size, activation='softmax'))

# 4. Configure training
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = LossHistory() # create the loss-history callback
history.init()          # LossHistory.init() creates the empty `losses` list

# 5. Train the model; `history` collects the loss after every epoch
model.fit(x_train, y_train, epochs=2000, batch_size=14, verbose=2, callbacks=[history])

# 6. Inspect the training history
try:
    # Inline plotting is only available when running under IPython/Jupyter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Plain `python` run: get_ipython is undefined, so keep matplotlib's
    # default backend instead of crashing after training has finished.
    pass
import matplotlib.pyplot as plt

# One point per epoch, as recorded by the LossHistory callback.
plt.plot(history.losses)
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

# 7. Evaluate the model (on the training data itself)
scores = model.evaluate(x_train, y_train)
print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))

# 8. Use the model

pred_count = 50 # maximum number of notes to predict

# One-step prediction: every prediction is made from a true 4-note window

seq_out = ['g8', 'e8', 'e4', 'f8']
pred_out = model.predict(x_train)

# Never index past the available predictions, even if the dataset has fewer
# than pred_count rows.
for i in range(min(pred_count, len(pred_out))):
    idx = np.argmax(pred_out[i]) # class-probability vector -> class index
    seq_out.append(idx2code[idx]) # seq_out is the final score, so store the note code

print("one step prediction : ", seq_out)

# Full-song prediction: start from the first 4 notes and feed every predicted
# note back in as input for the next step.

seq_out = ['g8', 'e8', 'e4', 'f8']
# Normalised copy of the seed notes; this is the sliding model input.
seq_in = [code2idx[it] / float(max_idx_value) for it in seq_out]

for i in range(pred_count):
    # (samples, time steps, features)
    sample_in = np.reshape(np.array(seq_in), (1, 4, 1))
    pred_out = model.predict(sample_in)
    idx = np.argmax(pred_out)
    seq_out.append(idx2code[idx])
    # Slide the window: drop the oldest note, append the new prediction.
    seq_in = seq_in[1:] + [idx / float(max_idx_value)]

print("full song prediction : ", seq_out)


# # 상태유지 LSTM

# ![%EC%83%81%ED%83%9C%EC%9C%A0%EC%A7%80%20LSTM.PNG](attachment:%EC%83%81%ED%83%9C%EC%9C%A0%EC%A7%80%20LSTM.PNG)

# In[12]:


# 0. 사용할 패키지 불러오기
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import np_utils

# Fix NumPy's global random seed so NumPy-driven randomness repeats between runs.
# NOTE(review): this seeds NumPy only; whether the Keras backend draws its
# initial weights from this generator depends on the backend in use — confirm.
np.random.seed(5)

# Loss-history callback
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records the training loss at the end of every epoch.

    The recorded values are kept in ``self.losses`` (a list, one entry per
    ``on_epoch_end`` call). The list is created by the constructor, so the
    callback is usable immediately; ``init()`` is retained so that existing
    code calling ``history.init()`` keeps working and can reset the history.
    """

    def __init__(self):
        super().__init__()
        self.init()

    def init(self):
        """Reset the recorded loss history to an empty list."""
        self.losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append ``logs['loss']`` (``None`` when absent) to ``self.losses``.

        ``batch`` keeps its original name for backward compatibility; it is
        not read by this method.
        """
        # Avoid a shared mutable default and tolerate logs=None.
        logs = logs or {}
        self.losses.append(logs.get('loss'))

# Dataset-building helper
def seq2dataset(seq, window_size, mapping=None):
    """Slice a sequence into overlapping, integer-encoded windows.

    Every row holds ``window_size + 1`` consecutive items of ``seq``; callers
    in this file use the first ``window_size`` columns as the model input and
    the last column as the label.

    Parameters
    ----------
    seq : sequence
        Items that are keys of ``mapping`` (note codes in this file).
    window_size : int
        Number of input items per row.
    mapping : dict, optional
        Item -> integer index table. Defaults to the module-level ``code2idx``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seq) - window_size, window_size + 1)``. When
        ``seq`` is too short for a single window, an empty integer array of
        shape ``(0, window_size + 1)`` is returned so that column slicing on
        the result keeps working.

    Raises
    ------
    KeyError
        If an item of ``seq`` is not a key of ``mapping``.
    """
    if mapping is None:
        # Resolved at call time, exactly like the original global lookup.
        mapping = code2idx
    width = window_size + 1
    rows = [
        [mapping[item] for item in seq[start:start + width]]
        for start in range(len(seq) - window_size)
    ]
    if not rows:
        return np.empty((0, width), dtype=int)
    return np.array(rows)

# 1. 데이터 준비하기

# Note-code dictionaries. The list is written in index order, so a code's
# position in the list is its integer index.

code2idx = dict(zip(
    ['c4', 'd4', 'e4', 'f4', 'g4', 'a4', 'b4',
     'c8', 'd8', 'e8', 'f8', 'g8', 'a8', 'b8'],
    range(14),
))

# Inverse table (integer index -> note code), derived from the forward table.
idx2code = {index: code for code, index in code2idx.items()}

# Sequence data: the melody as a flat list of note codes

seq = ['g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'd8', 'e8', 'f8', 'g8', 'g8', 'g4',
       'g8', 'e8', 'e8', 'e8', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4',
       'd8', 'd8', 'd8', 'd8', 'd8', 'e8', 'f4', 'e8', 'e8', 'e8', 'e8', 'e8', 'f8', 'g4',
       'g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4']

# 2. Build the dataset: rows of 4 input notes followed by the next note

dataset = seq2dataset(seq, window_size = 4)

print(dataset.shape)

# Split into inputs (X: first 4 columns) and labels (Y: fifth column)
x_train = dataset[:,0:4]
y_train = dataset[:,4]

# Largest index in code2idx; used to scale inputs into [0, 1]
max_idx_value = 13

# Normalise the inputs
x_train = x_train / float(max_idx_value)

# Reshape to (samples, time steps, features). The sample count is taken from
# the data instead of being hard-coded, so a different melody length works.
x_train = np.reshape(x_train, (x_train.shape[0], 4, 1))

# One-hot encode the labels.
# NOTE(review): no num_classes is passed, so the vector width is decided by
# to_categorical from the labels present — confirm this is intended.
y_train = np_utils.to_categorical(y_train)

one_hot_vec_size = y_train.shape[1]

print("one hot encoding vector size is ", one_hot_vec_size)

# 3. Build the model: a stateful LSTM with a fixed batch shape of
# (1 sample, 4 time steps, 1 feature), followed by a softmax layer.
model = Sequential()
model.add(LSTM(128, batch_input_shape = (1, 4, 1), stateful=True))
model.add(Dense(one_hot_vec_size, activation='softmax'))
    
# 4. Configure training
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 5. Train the model
num_epochs = 2000

history = LossHistory() # create the loss-history callback

history.init()          # LossHistory.init() creates the empty `losses` list

# Epochs are driven manually (one fit() call per epoch) so that the LSTM state
# can be reset after each pass; shuffle=False keeps the samples in song order.
for epoch_idx in range(num_epochs):
    print ('epochs : ' + str(epoch_idx) )
    model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2, shuffle=False, callbacks=[history]) # one pass over all samples, one sample per batch
    model.reset_states()
    
# 6. Inspect the training history
try:
    # Inline plotting is only available when running under IPython/Jupyter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Plain `python` run: get_ipython is undefined, so keep matplotlib's
    # default backend instead of crashing after training has finished.
    pass
import matplotlib.pyplot as plt

# One point per epoch, as recorded by the LossHistory callback.
plt.plot(history.losses)
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

# 7. Evaluate the model (on the training data itself)
scores = model.evaluate(x_train, y_train, batch_size=1)
print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
model.reset_states()  # clear the state accumulated during evaluation

# 8. Use the model

pred_count = 50 # maximum number of notes to predict

# One-step prediction: every prediction is made from a true 4-note window

seq_out = ['g8', 'e8', 'e4', 'f8']
pred_out = model.predict(x_train, batch_size=1)

# Never index past the available predictions, even if the dataset has fewer
# than pred_count rows.
for i in range(min(pred_count, len(pred_out))):
    idx = np.argmax(pred_out[i]) # class-probability vector -> class index
    seq_out.append(idx2code[idx]) # seq_out is the final score, so store the note code

model.reset_states()  # clear the state accumulated during prediction

print("one step prediction : ", seq_out)

# Full-song prediction: start from the first 4 notes and feed every predicted
# note back in as input for the next step.

seq_out = ['g8', 'e8', 'e4', 'f8']
# Normalised copy of the seed notes; this is the sliding model input.
seq_in = [code2idx[it] / float(max_idx_value) for it in seq_out]

for i in range(pred_count):
    # (samples, time steps, features)
    sample_in = np.reshape(np.array(seq_in), (1, 4, 1))
    pred_out = model.predict(sample_in)
    idx = np.argmax(pred_out)
    seq_out.append(idx2code[idx])
    # Slide the window: drop the oldest note, append the new prediction.
    seq_in = seq_in[1:] + [idx / float(max_idx_value)]

# Clear the LSTM state once the whole song has been generated.
model.reset_states()

print("full song prediction : ", seq_out)


# # 핍홀
# 
#     핍홀은 2000년에 Recurrent Nets that Time and Count 논문에서 제안한 LSTM의 변종
#     기존의 LSTM에서 sigmoid layer는 ht와 xt만 입력으로 받지만 핍홀에서는 ct-1도 입력으로 받는다.
#     이를 통해 좀 더 많은 맥락을 인식할 수 있다.

# ![peephole.png](attachment:peephole.png)

# In[66]:


import tensorflow as tf
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
 
 
# Set the TensorFlow graph-level seed so that runs can be reproduced.
# This matters when tuning hyperparameters: if results fluctuate from run to
# run, it is hard to tell which change caused an improvement.
tf.set_random_seed(777)
 
 
# Standardization (z-score)
def data_standardization(x):
    """Return ``x`` shifted to zero mean and divided by its standard deviation.

    The mean and standard deviation are taken over all elements of ``x``
    (no axis argument), so a 2-D input is standardised as a whole.
    """
    values = np.asarray(x)
    centered = values - values.mean()
    return centered / values.std()
 
# Min-Max scaling
# Very small or very large raw values can hinder training, so values are
# rescaled into roughly the 0..1 range using the global minimum and maximum.
def min_max_scaling(x):
    """Rescale ``x`` to about [0, 1] using its global min and max.

    A small constant (1e-7) is added to the denominator so that a constant
    input does not cause a division by zero.
    """
    arr = np.asarray(x)
    lowest = arr.min()
    span = arr.max() - lowest + 1e-7
    return (arr - lowest) / span
 
# Undo min-max scaling.
# Given the original (un-normalised) data ``org_x`` and scaled values ``x``,
# return ``x`` mapped back onto the original value range.
def reverse_min_max_scaling(org_x, x):
    """Map scaled values ``x`` back to the range of ``org_x``.

    Uses the same global min, max and 1e-7 offset as ``min_max_scaling``.
    """
    reference = np.asarray(org_x)
    scaled = np.asarray(x)
    lowest = reference.min()
    span = reference.max() - lowest + 1e-7
    return scaled * span + lowest
 
 
# Hyperparameters
input_data_column_cnt = 6  # number of input columns (variables)
output_data_column_cnt = 1 # number of output columns
 
seq_length = 28            # length of one input sequence (time steps per sample)
rnn_cell_hidden_dim = 20   # hidden (output) size of each cell
forget_bias = 1.0          # forget-gate bias (default 1.0)
num_stacked_layers = 1     # number of stacked LSTM layers
keep_prob = 1.0            # fraction of outputs to keep when applying dropout
 
epoch_num = 1000           # number of epochs (passes over the whole training set)
learning_rate = 0.01       # learning rate
 
 
# Load the data.
stock_file_name = 'AMZN.csv' # Amazon stock price data file
encoding = 'euc-kr' # text encoding of the CSV
names = ['Date','Open','High','Low','Close','Adj Close','Volume']
raw_dataframe = pd.read_csv(stock_file_name, names=names, encoding=encoding) # load the CSV with pandas
raw_dataframe.info() # print a summary of the loaded frame

# Drop the date column in place
# (same effect as raw_dataframe.drop('Date', axis=1, inplace=True)).
del raw_dataframe['Date']

# Skip row 0 and convert the price/volume strings to floating point.
# `np.float` was only an alias of the builtin float and has been removed from
# NumPy; np.float64 is the same dtype and works on every NumPy version.
stock_info = raw_dataframe.values[1:].astype(np.float64)
print("stock_info.shape: ", stock_info.shape)
print("stock_info[0]: ", stock_info[0])
 
 
# Preprocessing
# Prices and volume differ greatly in magnitude, so each is normalised separately.
 
# Normalise the price-like columns:
# from ['Open','High','Low','Close','Adj Close','Volume'] take everything up to
# 'Adj Close', i.e. all columns except the last one (Volume).
price = stock_info[:,:-1]
norm_price = min_max_scaling(price) # one global min/max over all price columns
print("price.shape: ", price.shape)
print("price[0]: ", price[0])
print("norm_price[0]: ", norm_price[0])
print("="*100) # visual separator
 
# Normalise the volume column:
# take only the last column. Note the slice is [:, -1:] rather than [:, -1],
# which keeps a 2-D column so it can be concatenated with the prices below.
volume = stock_info[:,-1:]
norm_volume = min_max_scaling(volume) # normalise volume on its own scale
print("volume.shape: ", volume.shape)
print("volume[0]: ", volume[0])
print("norm_volume[0]: ", norm_volume[0])
print("="*100) # visual separator
 
# Keep the rows and append the volume column to the right of the price columns.
x = np.concatenate((norm_price, norm_volume), axis=1) # axis=1: join column-wise
print("x.shape: ", x.shape)
print("x[0]: ", x[0])    # first row of x
print("x[-1]: ", x[-1])  # last row of x
print("="*100) # visual separator
 
# Target: the second-to-last column of x, which is 'Adj Close' given the
# column order above. Indexing with [-2] (a list) keeps y two-dimensional.
y = x[:, [-2]]
print("y[0]: ",y[0])     # first target value
print("y[-1]: ",y[-1])   # last target value
 
 
dataX = [] # input sequences
dataY = [] # targets

# Slide a window of seq_length rows over x; the target is the y value of the
# row that immediately follows the window.
for i in range(0, len(y) - seq_length):
    _x = x[i : i+seq_length]
    _y = y[i + seq_length] # the next value (the label)
    # Compare by value: `is` on an int literal tests identity, relies on an
    # interpreter implementation detail and raises a SyntaxWarning.
    if i == 0:
        print(_x, "->", _y) # show only the first sample
    dataX.append(_x)
    dataY.append(_y)
 
 
# Train/test split (chronological, no shuffling)
# The first 70% of the samples are used for training ...
train_size = int(len(dataY) * 0.7)
# ... and the remaining 30% for testing.
test_size = len(dataY) - train_size
 
# Slice out the training data
trainX = np.array(dataX[0:train_size])
trainY = np.array(dataY[0:train_size])
 
# Slice out the test data
testX = np.array(dataX[train_size:len(dataX)])
testY = np.array(dataY[train_size:len(dataY)])
 
 
# TensorFlow placeholders
# X: input sequences, Y: target values
X = tf.placeholder(tf.float32, [None, seq_length, input_data_column_cnt])
print("X: ", X)
Y = tf.placeholder(tf.float32, [None, 1])
print("Y: ", Y)
 
# Placeholders used only to compute the evaluation metric (RMSE)
targets = tf.placeholder(tf.float32, [None, 1])
print("targets: ", targets)
 
predictions = tf.placeholder(tf.float32, [None, 1])
print("predictions: ", predictions)
 
 
# Model (LSTM network) construction
def lstm_cell():
    """Create one peephole LSTM cell, wrapped in dropout when ``keep_prob < 1``.

    Reads the module-level settings ``rnn_cell_hidden_dim`` (cell output
    size), ``forget_bias`` (bias added to the forget gate) and ``keep_prob``
    (output keep probability for dropout). ``state_is_tuple=True`` makes the
    cell accept and return its state as a tuple rather than one concatenated
    tensor.
    """
    base_cell = tf.contrib.rnn.LSTMCell(
        num_units=rnn_cell_hidden_dim,
        forget_bias=forget_bias,
        state_is_tuple=True,
        activation=tf.nn.softsign,
        use_peepholes=True,
    )
    if keep_prob < 1.0:
        # Dropout is only applied to the cell output.
        return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
    return base_cell
 
# Build num_stacked_layers cells; they are combined into a MultiRNNCell only
# when more than one layer is requested, otherwise a single fresh cell is used.
stackedRNNs = [lstm_cell() for _ in range(num_stacked_layers)]
multi_cells = tf.contrib.rnn.MultiRNNCell(stackedRNNs, state_is_tuple=True) if num_stacked_layers > 1 else lstm_cell()
 
# Unroll the RNN cell(s) (LSTM here) over the input sequence
hypothesis, _states = tf.nn.dynamic_rnn(multi_cells, X, dtype=tf.float32)
print("hypothesis: ", hypothesis)
 
# Note the [:, -1]: only the output at the last time step is used.
# Several past trading days are used to predict a single next value,
# i.e. this is a MANY-TO-ONE setup.
hypothesis = tf.contrib.layers.fully_connected(hypothesis[:, -1], output_data_column_cnt, activation_fn=tf.identity)
 
 
# Loss: sum of squared errors over the batch (reduce_sum, not a mean)
loss = tf.reduce_sum(tf.square(hypothesis - Y))
# Optimiser: Adam
optimizer = tf.train.AdamOptimizer(learning_rate)
# optimizer = tf.train.RMSPropOptimizer(learning_rate) # author's note: did not work well with this LSTM
 
train = optimizer.minimize(loss)
 
# RMSE (Root Mean Square Error): square root of the mean squared difference
# between the `targets` and `predictions` placeholders.
# rmse = tf.sqrt(tf.reduce_mean(tf.square(targets-predictions))) # equivalent form
rmse = tf.sqrt(tf.reduce_mean(tf.squared_difference(targets, predictions)))
 
 
train_error_summary = [] # training RMSE, recorded at each checkpoint
test_error_summary = []  # test RMSE, recorded at each checkpoint
test_predict = ''        # placeholder; replaced by the test predictions inside the loop
 
sess = tf.Session()
sess.run(tf.global_variables_initializer())
 
# Training
start_time = datetime.datetime.now() # record the start time
print('학습을 시작합니다...')
for epoch in range(epoch_num):
    # One optimisation step on the whole training set per epoch.
    _, _loss = sess.run([train, loss], feed_dict={X: trainX, Y: trainY})
    if ((epoch+1) % 100 == 0) or (epoch == epoch_num-1): # every 100th epoch, or the last one
        # RMSE on the training data
        train_predict = sess.run(hypothesis, feed_dict={X: trainX})
        train_error = sess.run(rmse, feed_dict={targets: trainY, predictions: train_predict})
        train_error_summary.append(train_error)
 
        # RMSE on the test data
        test_predict = sess.run(hypothesis, feed_dict={X: testX})
        test_error = sess.run(rmse, feed_dict={targets: testY, predictions: test_predict})
        test_error_summary.append(test_error)
        
        # Report the current errors and their gap
        print("epoch: {}, train_error(A): {}, test_error(B): {}, B-A: {}".format(epoch+1, train_error, test_error, test_error-train_error))
        
end_time = datetime.datetime.now() # record the end time
elapsed_time = end_time - start_time # total elapsed time
print('elapsed_time:',elapsed_time)
print('elapsed_time per epoch:',elapsed_time/epoch_num)
 
 
# Print the hyperparameters and final errors on a single line
# (end='' suppresses the newline until the last print).
print('input_data_column_cnt:', input_data_column_cnt, end='')
print(',output_data_column_cnt:', output_data_column_cnt, end='')
 
print(',seq_length:', seq_length, end='')
print(',rnn_cell_hidden_dim:', rnn_cell_hidden_dim, end='')
print(',forget_bias:', forget_bias, end='')
print(',num_stacked_layers:', num_stacked_layers, end='')
print(',keep_prob:', keep_prob, end='')
 
print(',epoch_num:', epoch_num, end='')
print(',learning_rate:', learning_rate, end='')
 
print(',train_error:', train_error_summary[-1], end='')
print(',test_error:', test_error_summary[-1], end='')
print(',min_test_error:', np.min(test_error_summary))
 
# Figure 1: training vs. test RMSE at each recorded checkpoint
plt.figure(1)
plt.plot(train_error_summary, 'gold')
plt.plot(test_error_summary, 'b')
plt.xlabel('Epoch(x100)')
plt.ylabel('Root Mean Square Error')
 
# Figure 2: test targets (red) vs. the last test predictions (blue);
# both are in normalised units.
plt.figure(2)
plt.plot(testY, 'r')
plt.plot(test_predict, 'b')
plt.xlabel('Time Period')
plt.ylabel('Stock Price')
plt.show()
 
 
# Take the most recent seq_length rows as a single-sample batch
recent_data = np.array([x[len(x)-seq_length : ]])
print("recent_data.shape:", recent_data.shape)
print("recent_data:", recent_data)
 
# Predict the next value from the most recent window
test_predict = sess.run(hypothesis, feed_dict={X: recent_data})
 
print("test_predict", test_predict[0])
# Map the normalised prediction back to price units using the range of `price`
test_predict = reverse_min_max_scaling(price,test_predict)
print("Tomorrow's stock price", test_predict[0]) # print the predicted price
# LSTM RNN을 이용하여 아마존 주가 예측하기|작성자 똑똑이


# # EX 8

# In[1]:


import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(42)


# In[2]:


# Load the training CSV.
rain2 = pd.read_csv('train.csv')


# In[3]:


# Notebook display: first 30 rows.
rain2.head(n=30)


# In[4]:


# Drop rows whose 'Ref' value is missing.
train_df = rain2.dropna(subset=['Ref'])


# In[5]:


train_df.describe()


# In[6]:


# Count the remaining missing values per column.
train_df.isna().sum()


# In[6]:


# Replace the remaining missing values with 0 and re-check.
train_df = train_df.fillna(0)
train_df.isna().sum()


# In[7]:


train_seq = train_df.groupby(['Id']) ## group the rows by Id with groupby


# In[8]:


# Rows per Id; the displayed tuple is (number of Ids, longest group).
train_seq_size = train_seq.size()
train_seq_size.count(), train_seq_size.max()


# In[9]:


# Size the arrays from the grouped data instead of hard-coding the counts
# observed in one particular train.csv.
n_train_sequences = len(train_seq_size)          # number of Id groups
max_train_seq_len = int(train_seq_size.max())    # longest group (rows per Id)

# Zero-padded inputs: (groups, longest sequence, 22 features); one target per group.
X = np.zeros((n_train_sequences, max_train_seq_len, 22))
y = np.zeros((n_train_sequences, 1))


# In[10]:


i = 0
for i, (name, group) in enumerate(train_seq, start=1):
    # d holds all rows of one Id group.
    d = group.values
    # Per the original notes: column 0 is Id, columns 1..22 are features and
    # column 23 is the target.
    # Copy the features into the first d.shape[0] time steps; the rest stays 0.
    X[i - 1, :d.shape[0], 0:22] = d[:, 1:23]
    # The target is read from the first row of the group.
    y[i - 1, 0] = d[0, 23]
print(i)  # number of groups processed


# In[11]:


def feed_gen(X, y, batch_size=1024):
    """Yield shuffled ``(X_batch, y_batch)`` pairs forever.

    The sample order is a random permutation of ``range(len(X))``. Batches are
    consecutive slices of that permutation; the final batch of a pass may be
    shorter than ``batch_size``. Once a pass is complete a new permutation is
    drawn and the next pass starts.
    """
    total = len(X)
    order = np.random.permutation(total)  # shuffled sample indices
    cursor = 0
    while True:
        stop = min(cursor + batch_size, total)  # never run past the data
        picked = order[cursor:stop]
        yield X[picked], y[picked]
        if stop >= total:
            # End of a pass: reshuffle and start over.
            order = np.random.permutation(total)
            cursor = 0
        else:
            cursor = stop


# In[12]:


from sklearn.model_selection import train_test_split
# Hold out 10% of the sequences for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


# In[13]:


batch_size = 1024
# Batches needed to cover each set once. np.ceil returns a float, so convert
# to int: the step counts are whole numbers of batches.
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))
validation_steps = int(np.ceil(X_test.shape[0] / batch_size))
steps_per_epoch, validation_steps


# In[14]:


# Endless shuffled batch generators for training and validation.
train_gen = feed_gen(X_train, y_train, batch_size)
val_gen = feed_gen(X_test, y_test, batch_size)


# In[15]:


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# One LSTM layer (35 units) over sequences of 22 features; the time dimension
# is left unspecified (None). A single linear output unit gives the prediction.
model = Sequential()
model.add(LSTM(35, input_shape=(None, 22)))
model.add(Dense(1))

# Mean absolute error loss, RMSprop optimiser.
model.compile(loss='mae', optimizer='rmsprop')


# In[16]:


# Train from the batch generators; steps_per_epoch / validation_steps say how
# many batches are drawn per epoch from each generator.
# NOTE(review): fit_generator may be deprecated in the installed TensorFlow
# release — confirm before upgrading.
history = model.fit_generator(train_gen,
                              steps_per_epoch=steps_per_epoch,
                              epochs=100,
                              validation_data=val_gen,
                              validation_steps=validation_steps)


# In[19]:


# Load the test CSV and replace missing values with 0.
rain2_test = pd.read_csv('test.csv')
test_df = rain2_test.fillna(0)


# In[20]:


test_df.head()


# In[21]:


# Group the test rows by Id; the displayed tuple is
# (number of Ids, longest group).
test_seq = test_df.groupby(['Id'])
test_seq_size = test_seq.size()
test_seq_size.count(), test_seq_size.max()


# In[24]:


# Size the array from the grouped test data instead of hard-coding the counts
# observed in one particular test.csv.
X_test = np.zeros((len(test_seq_size), int(test_seq_size.max()), 22))


# In[25]:


i = 0
for i, (name, group) in enumerate(test_seq, start=1):
    # d holds all rows of one Id group.
    d = group.values
    # Per the original notes: column 0 is Id and columns 1..22 are features.
    # Copy the features into the first d.shape[0] time steps; the rest stays 0.
    X_test[i - 1, :d.shape[0], 0:22] = d[:, 1:23]
print(i)  # number of groups processed


# In[29]:


# Predict one value per test sequence.
pred = model.predict(X_test)


# In[30]:


# Prepend a 1-based running index as the Id column and write "Id,Expected" rows.
# NOTE(review): this assumes the test Ids are exactly 1..N in the same order as
# the groupby iteration — verify against test.csv.
pred_with_index = np.hstack((np.arange(1, pred.shape[0]+1).reshape(-1,1), pred))
np.savetxt("test_prediction.csv", pred_with_index, "%d,%f", 
           delimiter=",", header="Id,Expected", comments="")


# <hr>
# <h1>3.GRU Cell(Gated Recurrent Unit)</h1>
# <hr>
# 
# GRU cell은 2014년 조경현 등의 논문에서 제안됨
# 
# [참조]
# Cho, K., Van Merriënboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., & Bengio, Y. (2014). 
# Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078.<br>
# 
# GRU 셀은 그림에서 확인할 수 있는 것과 같이 LSTM 셀의 간소화된 버전이고 유사하게 작동함
# - LSTM의 장점을 유지하면서도 계산 복잡성을 낮춘 셀 구조
# - Gradient vanishing/ Explosion 문제를 극복했다는 점에서 LSTM과 유사하지만 LSTM에서 게이트의 일부를 생략함
# 
# <hr>
# 
# <h2>LSTM Cell과 GRU Cell의 차이점</h2>
# 
# <h3>LSTM cell(Long Short-Term Memory)</h3>
#     
# ![image.png](attachment:image.png)
# 
# <hr/>
# 

# 
# <h3>GRU cell(Gated Recurrent Unit)</h3>
# 
# ![image.png](attachment:image.png)
# 
# LSTM에서의 변경 점
# 1. LSTM의 상태 Vector인 c(t), h(t)가 h(t)로 통합
# 
# 2. 게이트 제어기 f(t), i(t)가 z(t)로 통합되며 z(t)는 Update gate임
#     - z(t)는 Forget Gate, Input Gate를 모두 제어 
#     - z(t)가 1을 출력하면 Forget Gate가 열리고 Input Gate가 닫히며
#     - z(t)가 0 이면 Forget Gate가 닫히고 Input Gate가 열림.
#     > 즉, 이전 (t-1)의 기억이 저장될 때 마다 타임스탭 t의 입력이 삭제됨.
#     
#     
# 3. 출력 게이트가 없음 > 전체 상태 벡터가 매 Time Step 마다 출력됨
# 4. 이전상태의 어느 부분이 출력될지 제어하는 새로운 게이트 제어기 r(t) 존재함
# 
# <hr>

# ![image.png](attachment:image.png)

# 
# GRU state vector 계산 식
# ![image.png](attachment:image.png)
# 
# z(t),r(t)는 각각 update, reset gate를 의미<br>
# Update, reset gate에서는 활성화 함수로 sigmoid 함수를 사용
# 
# 두 게이트 모두 현 시점의 입력값(x(t))와 직전 시점 은닉층 값(h(t-1))을 반영하여 구함
# 
# W는 각각 입력값과 은닉층 값을 선형결합하는 Parameter(가중치)<br>
# update, reset gate의 활성화 함수는 시그모이드 이므로 0~1사이의 범위를 갖음
# 
# 기억에 관련된 과정
# - 현 시점(t)에서 기억해 둘 만한 정보를 g(t)로 정의
# - g(t)는 현 시점 정보(W * x(t))와 과거정보(W * h(t-1))를 반영하되, 과거 정보를 얼마나 반영할지는 reset gate 값에 의존함
# 
# 
# r(t) 값이 0이면 과거 정보를 모두 잊고 1이면 과거의 정보를 모두 갖으며 r(t)값에 상관없이 현재 정보는 반영됨
# 
# 위의 활성화 함수 tanh의 경우 -1 ~ 1 사이의 범위를 갖음
# 
# 현재정보 h(t)에서 기억할만한 정보 g(t) 를 얼마나 조합할지 결정하는 것은 z(t) 즉 Update gate임
# 
# - z(t)가 1이라면 과거정보를 모두 잊고, 현재 정보 만을 기억함
# - z(t)가 0이라면 과거정보는 모두 기억하지만, 현재 정보는 모두 무시 

# <hr>
# <h3>TensorFlow GRU Cell 만드는 방법</h3>

# In[17]:


# Number of units in the GRU cell.
n_neurons = 5

# Create a GRU cell with the TF1 contrib API.
# NOTE(review): tf.contrib is not available in TensorFlow 2.x — confirm the
# TensorFlow version this notebook targets.
gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)


# <hr>
# <h3>GRU Code Example</h3>
# 
# GRU Code Example 1 -> 덧셈에 대한 학습 방식
# 
# 1) c = a + b <br/>
# 2) 숫자를 역 bitstring으로 전환<br/>
# 3) 더 할 때 두 bitstring을 더해서 계산함<br/>
# 4) 숫자에서 오른쪽부터 시작하여 합계가 10보다 크면 일정한 숫자를 갖게 됨 <br/>
# -> 기억할 수 있으므로 GRU에 적용 가능
# 따라서, 역 bitstring화 한 숫자의 덧셈에 학습이 가능하다.
# 
# <hr>

# 각 라이브러리를 Import

# In[68]:


import tensorflow as tf
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from IPython import display
try:
    # The magic name is 'matplotlib' and its argument is 'inline'; the exported
    # call had an empty magic name, which IPython cannot resolve.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter: keep matplotlib's default backend.
    pass
# NOTE: this rebinds `random` to the standard-library module, replacing the
# `numpy.random` binding imported above.
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # silence FutureWarning messages


# <hr>
# Dataset 생성

# In[69]:


def as_bytes(num, final_size):
    """Convert an integer to a reversed bit list (least significant bit first).

    Despite the name, the result is a list of bits, not bytes.

    Arguments
    ---------
    num: int
        The number to convert.
    final_size: int
        The length of the returned bit list. Bits of ``num`` beyond this
        length are dropped; shorter numbers are padded with zeros.

    Returns
    -------
    list
        ``final_size`` bits of ``num``, lowest bit first.

    Examples
    --------
    >>> as_bytes(3, 4)
    [1, 1, 0, 0]
    >>> as_bytes(3, 5)
    [1, 1, 0, 0, 0]
    """
    res = []
    for _ in range(final_size):
        # Peel off the lowest remaining bit.
        num, bit = divmod(num, 2)
        res.append(bit)
    return res
    
def generate_example(num_bits):
    """Generate one random addition example ``a + b = c`` as reversed bit lists.

    Both summands are drawn uniformly from ``0 .. 2**(num_bits - 1) - 1``
    (inclusive), so their sum always fits in ``num_bits`` bits.

    Arguments
    ---------
    num_bits: int
        The number of bits to use (at least 1).

    Returns
    -------
    a: list
        The first term (represented as reversed bitstring) of the addition.
    b: list
        The second term (represented as reversed bitstring) of the addition.
    c: list
        The addition (a + b) represented as reversed bitstring.

    Raises
    ------
    ValueError
        If ``num_bits`` is smaller than 1.

    Examples
    --------
    >>> a, b, c = generate_example(3)
    >>> len(a), len(b), len(c)
    (3, 3, 3)
    >>> # Reversed bitstrings: e.g. a = b = [0, 1, 0] (2 + 2) gives c = [0, 0, 1] (4).
    """
    # Bind the standard-library generator explicitly: its randint() includes
    # the upper bound, and the result no longer depends on whether the
    # module-level name `random` currently refers to numpy.random or stdlib.
    import random as _stdlib_random

    if num_bits < 1:
        raise ValueError("num_bits must be at least 1")
    upper = 2 ** (num_bits - 1) - 1
    a = _stdlib_random.randint(0, upper)
    b = _stdlib_random.randint(0, upper)
    res = a + b
    return (as_bytes(a,  num_bits),
            as_bytes(b,  num_bits),
            as_bytes(res, num_bits))

def generate_batch(num_bits, batch_size):
    """Generates instances of the addition problem.
    
    Arguments
    ---------
    num_bits: int
        The number of bits to use for each number.
    batch_size: int
        The number of examples to generate.
    
    Returns
    -------
    x: np.array
        Two numbers to be added represented as bits (in reversed order).
        Shape: (batch_size, num_bits, 2), indexed as [i, b, n]
        Where:
            i is example idx in batch.
            b is bit index (reversed order, as produced by generate_example).
            n is 0 for the first summand and 1 for the second summand.
    y: np.array
        The result of the addition.
        Shape: (batch_size, num_bits, 1), indexed as [i, b, n]
        Where:
            i is example idx in batch.
            b is bit index (reversed order, as produced by generate_example).
            n is always 0 since there is only one result.
    """
    # np.empty leaves the arrays uninitialised; the loop below assigns every
    # [i, :, n] slice, so no uninitialised values remain.
    x = np.empty((batch_size, num_bits, 2))
    y = np.empty((batch_size, num_bits, 1))

    for i in range(batch_size):
        a, b, r = generate_example(num_bits)
        x[i, :, 0] = a
        x[i, :, 1] = b
        y[i, :, 0] = r
    return x, y


# <hr>
# Configuration - batch_size와 time_size를 조절

# In[ ]:


# Examples per batch, and bits (time steps) per number.
batch_size, time_size = 100, 5

# Draw 100 random 5-bit addition examples each for the training and test sets.
X_train, Y_train = generate_batch(num_bits=time_size, batch_size=batch_size)
X_test, Y_test = generate_batch(num_bits=time_size, batch_size=batch_size)


# <hr>
# GRU Model을 정의 & state vector를 통해 모델 생성

# In[71]:


class GRU:
    """Implementation of a Gated Recurrent Unit (GRU) as described in [1].
    
    [1] Chung, J., Gulcehre, C., Cho, K., & Bengio, Y. (2014). Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555.
    
    The constructor builds the whole graph: the trainable weights, the input
    placeholder (``input_layer``) and the hidden states for every time step
    (``h_t``).
    
    Arguments
    ---------
    input_dimensions: int
        The size of the input vectors (x_t).
    hidden_size: int
        The size of the hidden layer vectors (h_t).
    dtype: obj
        The datatype used for the variables, the input placeholder and the
        initial state (optional).
    """
    
    def __init__(self, input_dimensions, hidden_size, dtype=tf.float64):
        self.input_dimensions = input_dimensions
        self.hidden_size = hidden_size
        
        def init_variable(shape, name):
            # Every trainable tensor uses the same small truncated-normal initialisation
            return tf.Variable(
                tf.truncated_normal(dtype=dtype, shape=shape, mean=0, stddev=0.01), name=name)
        
        # Weights for input vectors of shape (input_dimensions, hidden_size)
        input_shape = (self.input_dimensions, self.hidden_size)
        self.Wr = init_variable(input_shape, 'Wr')
        self.Wz = init_variable(input_shape, 'Wz')
        self.Wh = init_variable(input_shape, 'Wh')
        
        # Weights for hidden vectors of shape (hidden_size, hidden_size)
        hidden_shape = (self.hidden_size, self.hidden_size)
        self.Ur = init_variable(hidden_shape, 'Ur')
        self.Uz = init_variable(hidden_shape, 'Uz')
        self.Uh = init_variable(hidden_shape, 'Uh')
        
        # Biases for hidden vectors of shape (hidden_size,)
        bias_shape = (self.hidden_size,)
        self.br = init_variable(bias_shape, 'br')
        self.bz = init_variable(bias_shape, 'bz')
        self.bh = init_variable(bias_shape, 'bh')
        
        # Define the input layer placeholder: (batch, time, input_dimensions).
        # It must share the dtype of the weights, otherwise tf.matmul rejects the operands.
        self.input_layer = tf.placeholder(dtype=dtype, shape=(None, None, input_dimensions), name='input')
        
        # Put the time-dimension upfront for the scan operator
        self.x_t = tf.transpose(self.input_layer, [1, 0, 2], name='x_t')
        # Multiplying the first time step by an all-zero matrix yields a zero
        # initial state whose batch dimension follows the fed input.
        self.h_0 = tf.matmul(self.x_t[0, :, :], tf.zeros(dtype=dtype, shape=(input_dimensions, hidden_size)), name='h_0')
        
        # Perform the scan operator
        self.h_t_transposed = tf.scan(self.forward_pass, self.x_t, initializer=self.h_0, name='h_t_transposed')
        
        # Transpose the result back
        self.h_t = tf.transpose(self.h_t_transposed, [1, 0, 2], name='h_t')

    def forward_pass(self, h_tm1, x_t):
        """Perform a forward pass for one time step.
        
        Arguments
        ---------
        h_tm1: tensor
            The hidden state at the previous timestep (h_{t-1}).
        x_t: tensor
            The input at the current timestep.
        
        Returns
        -------
        The hidden state at the current timestep (h_t).
        """
        # Definitions of z_t and r_t
        z_t = tf.sigmoid(tf.matmul(x_t, self.Wz) + tf.matmul(h_tm1, self.Uz) + self.bz)
        r_t = tf.sigmoid(tf.matmul(x_t, self.Wr) + tf.matmul(h_tm1, self.Ur) + self.br)
        
        # Definition of h~_t
        h_proposal = tf.tanh(tf.matmul(x_t, self.Wh) + tf.matmul(tf.multiply(r_t, h_tm1), self.Uh) + self.bh)
        
        # Compute the next hidden state
        h_t = tf.multiply(1 - z_t, h_tm1) + tf.multiply(z_t, h_proposal)
        
        return h_t


# <hr>
# Model을 초기화 하고 Train 하는 코드

# In[72]:


#%% (3) Initialize and train the model.

# The input has 2 dimensions: dimension 0 is reserved for the first term and dimension 1 is reserved for the second term
input_dimensions = 2

# Arbitrary number for the size of the hidden state
hidden_size = 16

# Initialize a session
session = tf.Session()

# Create a new instance of the GRU model
gru = GRU(input_dimensions, hidden_size)

# Add an additional linear layer on top of each of the hidden state outputs
W_output = tf.Variable(tf.truncated_normal(dtype=tf.float64, shape=(hidden_size, 1), mean=0, stddev=0.01))
b_output = tf.Variable(tf.truncated_normal(dtype=tf.float64, shape=(1,), mean=0, stddev=0.01))
output = tf.map_fn(lambda h_t: tf.matmul(h_t, W_output) + b_output, gru.h_t)

# Create a placeholder for the expected output
expected_output = tf.placeholder(dtype=tf.float64, shape=(batch_size, time_size, 1), name='expected_output')

# Just use quadratic loss, averaged over the batch
loss = tf.reduce_sum(0.5 * tf.pow(output - expected_output, 2)) / float(batch_size)

# Use the Adam optimizer for training
train_step = tf.train.AdamOptimizer().minimize(loss)

# Initialize all the variables
init_variables = tf.global_variables_initializer()
session.run(init_variables)

# Initialize the losses
train_losses = []
validation_losses = []
# Operands of the manual check below and the last predicted sum; a and b are
# only replaced once the model has predicted a + b correctly.
a = 1024
b = 16
y=0
# Perform all the iterations
for epoch in range(10000):
    # Run one optimisation step on the training set, then evaluate the loss on the test set
    _, train_loss = session.run([train_step, loss], feed_dict={gru.input_layer: X_train, expected_output: Y_train})
    validation_loss = session.run(loss, feed_dict={gru.input_layer: X_test, expected_output: Y_test})
    
    # Log the losses
    train_losses += [train_loss]
    validation_losses += [validation_loss]
    
    # Display an update every 100 iterations
    if epoch % 100 == 0:
        plt.plot(train_losses, '-b', label='Train loss')
        plt.plot(validation_losses, '-r', label='Validation loss')
        plt.legend(loc=0)
        plt.title('Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.show()
        print('Iteration: %d, train loss: %.4f, test loss: %.4f' % (epoch, train_loss, validation_loss))
    
        #%% (4) Manually evaluate the model.
        if y==a+b:
            # The previous prediction was correct: draw two new numbers a and b
            # and let the model compute a + b
            a = random.randrange(1, 1024)
            b = random.randrange(1, 256)

        # The model is independent of the sequence length! Now we can test the model on even longer bitstrings
        bitstring_length = 20

        # Create the feature vectors: one row per bit, one column per operand, wrapped in a batch of size 1
        X_custom_sample = np.vstack([as_bytes(a, bitstring_length), as_bytes(b, bitstring_length)]).T
        X_custom = np.zeros((1,) + X_custom_sample.shape)
        X_custom[0, :, :] = X_custom_sample

        # Make a prediction by using the model
        y_predicted = session.run(output, feed_dict={gru.input_layer: X_custom})
        # Just use a linear class separator at 0.5
        y_bits = 1 * (y_predicted > 0.5)[0, :, 0]
        # Join and reverse the bitstring
        y_bitstr = ''.join([str(int(bit)) for bit in y_bits.tolist()])[::-1]
        # Convert the found bitstring to a number
        y = int(y_bitstr, 2)
        print("a : " + str(a)+ ", b : "+str(b) + ", y : " + str(y))


# <hr>

# # 자연어 처리

# 최근 대부분의 NLP(Natural language processing) 응용은 RNN을 기반으로 함
# RNN 기반 자연어 처리 기술은 기계 번역, 자동 요약 등에 사용되어 지고 있음

# 특히 기계 번역에 대해서는 Tensorflow의 Word2Vec, Seq2Seq tutorial에 잘 설명되어 있음

# ## Word Embedding

# ![image.png](attachment:image.png)*<b><center>오디오, 이미지, 텍스트 데이터 차이</center></b>*

# 과거 텍스트 분석에서는 단어 하나에 하나의 인덱스 정수를 할당하는 Bag of Words 방법이 주로 사용되어 왔다.
# 
# 단어 분류 벡터의 크기는 데이터셋에 존재하는 단어의 가짓수만큼 이루어진다.
# 
# 데이터셋에 "I","You","He","She", "am","are","is", "a","an", "boy","girl" 의 11개의 단어가 존재할 경우 아래와 같이 인덱스 정수가 할당된다.

# "I" : 0, "You" :1, "He" : 2, "She" : 3
# "am" : 4, "are" : 5, "is" : 6, "a" : 7
# "an" : 8, "boy" : 9, "girl" : 10

# 이러한 방식대로 인덱싱된 단어를 표현하는 데에는, 단어별로 해당하는 인덱스의 값만 1이고 나머지는 모두 0의 값을 가지는 one-hot 벡터와 같은 희소 벡터(Sparse vector) 형태가 사용될 수 있다.

# ex) "am" = [0,0,0,0,1,0,0,0,0,0,0]

# 데이터셋에 존재하는 단어의 가짓수가 십수개정도라면 희소 벡터로 표현하는 방법을 고려해볼 만 하다. 하지만 그러한 경우는 한정적이고 대부분의 경우 수천, 수만개 이상의 단어를 고려해야 하기 때문에 공간적 낭비를 야기시킨다.
# 
# 단어의 가짓수가 N개일 경우에 N개의 차원을 가지는 벡터를 사용하지 않고, 아래와 같이 실수를 사용하여 더 적은 차원으로 표현한다면 상대적으로 공간적 이득을 취할 수 있을 것이다.

# "I" : [0.1, 0.5], "You" :[0.2,0.4], "He" : [0.2,0.6], "She" : [0.2,0.3]
# "am" : [0.5,0.2], "are" : [0.5,0.3], "is" : [0.5,0.4], "a" : [0.7,0.1]
# "an" : [0.7,0.2] , "boy" : [0.9,0.1], "girl" : [0.9,0.2]

# 이러한 표현 방식을 밀집 벡터(Dense vector)라고 한다.
# 
# 데이터셋에 존재하는 단어들을 밀집 벡터 형태로 표현하는 방법을 워드 임베딩(Word embedding)이라고 하고, 워드 임베딩의 결과물을 임베딩 벡터라고 한다.
# 
# 임베딩 벡터와 one-hot 벡터의 특징 비교는 아래 표에서 확인할 수 있다

# | 구분 | One-hot 벡터 | 임베딩 벡터 |
# |:--------|:--------:|--------:|
# | 차원 | 고차원(N개, 단어의 가짓수) | 저차원(임베딩 시에 지정) |
# | 표현 방법 | 수동, 사용자가 일일이 설정 | 훈련 데이터로부터 학습하여 표현 |
# | 값의 타입 | 정수형, 1과 0 | 실수 |

# 아래 그림과 같이 비슷한 의미의 단어들이 비슷한 벡터로 임베딩 된다면 벡터들의 합, 차 연산을 통하여 연관성이 있는 단어 찾기, 동일한 관계에 있는 단어 찾기 등에 활용될 수 있을 것이다. 

# ![image.png](attachment:image.png)*<b><center>워드 임베딩 예시</center></b>*

# ![image.png](attachment:image.png)*<b><center>워드 임베딩 한국어 사이트 http://word2vec.kr/search/?query=</center></b>*

# 워드 임베딩에 가장 많이 쓰이는 알고리즘은 word2vec 알고리즘으로, 단어 데이터셋을 학습하여 비슷한 단어들이 비슷한 벡터값을 가지도록 임베딩하는 알고리즘이다.
# 
# word2vec의 기본 아이디어는 비슷한 의미를 가지는 단어들은 문장 내에서 비슷한 위치에 존재하는것에서 시작된다.

# You shall know a word by the company it keeps. - J.R. Firth (1957)

# ### Fetch data

# In[2]:


from six.moves import urllib

import tensorflow as tf
import numpy as np
import errno
import os
import zipfile

WORDS_PATH = "datasets/words"
WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'

def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):
    """Return the word list stored in the first member of the corpus archive.

    The archive is kept as ``words.zip`` inside ``words_path`` (created when
    missing) and is only downloaded from ``words_url`` when that file does
    not exist yet. The first archive member is decoded as ASCII and split
    on whitespace.
    """
    os.makedirs(words_path, exist_ok=True)
    archive_file = os.path.join(words_path, "words.zip")

    # Reuse a previously downloaded archive when there is one
    already_cached = os.path.exists(archive_file)
    if not already_cached:
        urllib.request.urlretrieve(words_url, archive_file)

    with zipfile.ZipFile(archive_file) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)

    text = raw_bytes.decode("ascii")
    return text.split()


# In[3]:


# Download (if needed) and load the corpus as a list of words
words = fetch_words_data()


# In[4]:


# Notebook cell: display the word list
words


# In[5]:


# Notebook cell: display the number of words in the corpus
len(words)


# ### Build the dictionary

# In[6]:


from collections import Counter

vocabulary_size = 50000
# Add words to the vocabulary in order of frequency; index 0 is reserved for "UNK"
vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1)
vocabulary = np.array([word for word, _ in vocabulary])
# Map every vocabulary word to its index
dictionary = {word: code for code, word in enumerate(vocabulary)}
# Encode the corpus as vocabulary indices; words missing from the dictionary map to 0 ("UNK")
data = np.array([dictionary.get(word, 0) for word in words])


# In[7]:


# Notebook cell: show the first nine words next to their indices
" ".join(words[:9]), data[:9]


# In[8]:


# Notebook cell: decode a hard-coded list of indices back into words
" ".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])


# ### Generate batches

# In[10]:


from collections import deque

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global ``data`` sequence.

    A window of ``2 * skip_window + 1`` consecutive entries slides over
    ``data`` starting at the global ``data_index`` (which is advanced and
    wraps around at the end of ``data``). For every window position the
    centre entry is emitted ``num_skips`` times, each time paired with a
    different randomly chosen other entry of the window.

    Returns a tuple ``(batch, labels)``: ``batch`` is an int32 array of shape
    ``[batch_size]`` with the centre entries and ``labels`` an int32 array of
    shape ``[batch_size, 1]`` with the paired context entries.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    window_size = 2 * skip_window + 1  # [ skip_window centre skip_window ]
    centres = np.ndarray(shape=[batch_size], dtype=np.int32)
    contexts = np.ndarray(shape=[batch_size, 1], dtype=np.int32)

    # Pre-fill the sliding window
    window = deque(maxlen=window_size)
    for _ in range(window_size):
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    row = 0
    for _position in range(batch_size // num_skips):
        # Start on the centre slot, which is always excluded from the draw
        chosen = skip_window
        excluded = [skip_window]
        for _pair in range(num_skips):
            # Redraw until a slot that has not been used for this centre comes up
            while chosen in excluded:
                chosen = np.random.randint(0, window_size)
            excluded.append(chosen)
            centres[row] = window[skip_window]
            contexts[row, 0] = window[chosen]
            row += 1
        # Slide the window one entry to the right
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return centres, contexts


# In[11]:


# Fix NumPy's random seed so the sampled context words are repeatable
np.random.seed(42)


# In[12]:


# Start reading the corpus from its beginning and draw a small demo batch
# (batch_size=8, num_skips=2, skip_window=1)
data_index = 0
batch, labels = generate_batch(8, 2, 1)


# In[13]:


# Notebook cell: show the centre-word indices together with the words they stand for
batch, [vocabulary[word] for word in batch]


# In[14]:


# Notebook cell: show the context-word indices together with the words they stand for
labels, [vocabulary[word] for word in labels[:, 0]]


# ### Build the model

# In[15]:


batch_size = 128
# NOTE: embedding_size is reassigned to 150 in a later cell, before the embedding variable is created.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# valid_size distinct word ids drawn from range(valid_window)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

# Learning rate handed to the optimizer
learning_rate = 0.01


# In[20]:


# reset_graph is defined outside this section; presumably it clears the default graph — verify
reset_graph()

# Input data: one context-word id per example, plus the fixed validation word ids.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)


# In[21]:


# These values override the vocabulary_size / embedding_size set in earlier cells
vocabulary_size = 50000
embedding_size = 150

# Embedding matrix: one row per vocabulary word, initialised uniformly in [-1, 1).
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)


# In[22]:


# Centre-word ids of a batch and the embedding rows they select
train_inputs = tf.placeholder(tf.int32, shape=[None])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)


# In[23]:


# Construct the variables for the NCE loss: one weight row and one bias per vocabulary word
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch.
# tf.nn.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
# NOTE(review): the arguments are passed positionally — confirm their order
# against the nce_loss signature of the installed TensorFlow version.
loss = tf.reduce_mean(
    tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
                   num_sampled, vocabulary_size))

# Construct the Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# Compute the cosine similarity between the validation words and all embeddings:
# rows are scaled to unit L2 norm, so the matrix product below yields cosines.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Add variable initializer.
init = tf.global_variables_initializer()


# In[24]:


num_steps = 10001

with tf.Session() as session:
    init.run()

    # Running sum of the loss since the last report
    average_loss = 0
    for step in range(num_steps):
        # "\r" rewrites the same console line with the current step
        print("\rIteration: {}".format(step), end="\t")
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

        # We perform one update step by evaluating the training op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 the sum holds a single batch loss, so it is reported undivided
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = vocabulary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Sort by descending similarity and skip position 0 (expected to be the word itself)
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = vocabulary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

    # Keep the unit-length embeddings as a NumPy array once training is done
    final_embeddings = normalized_embeddings.eval()


# In[25]:


# Persist the trained embeddings to disk
np.save("./my_final_embeddings.npy", final_embeddings)


# In[26]:


# Reload the embeddings from disk (lets later cells run without retraining)
final_embeddings = np.load("./my_final_embeddings.npy")


# ### Plot the embeddings

# In[27]:


def plot_with_labels(low_dim_embs, labels):
    """Scatter-plot 2-D points and write the matching label next to each one.

    ``low_dim_embs`` holds one (x, y) row per point and must have at least
    as many rows as there are entries in ``labels``; row ``i`` is annotated
    with ``labels[i]``.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # size in inches

    # Placement shared by every annotation
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)


# In[28]:


from sklearn.manifold import TSNE

# Project the embeddings of the first plot_only vocabulary words to 2-D with t-SNE and plot them
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
# NOTE: this rebinds the global name labels to the list of plotted words
labels = [vocabulary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)


# In[29]:


def get_embedding_vector(word) :
    """Return the trained embedding vector of ``word``.

    The word is looked up in the full ``dictionary`` (word -> row index)
    rather than in the list of plotted words, so every vocabulary word
    resolves to its own vector. Words outside the vocabulary fall back to
    row 0, the "UNK" entry.
    """
    return final_embeddings[dictionary.get(word, 0)]
def get_word_from_embedding_vector(vector) :
    """Return the vocabulary word whose embedding is closest to ``vector``.

    ``final_embeddings`` is taken from the unit-length (normalized)
    embedding matrix, so the row with the largest dot product is the row
    with the highest cosine similarity to ``vector``.
    """
    scores = np.dot(final_embeddings, vector)
    return vocabulary[int(np.argmax(scores))]


# In[30]:


# Look up and print the embedding vectors of two related words
book = get_embedding_vector("book")
books = get_embedding_vector("books")
print(book)
print(books)


# ## 기계 번역을 위한 인코더-디코더 네트워크

# RNN을 이용한 seq2seq 모델은 길이가 다른 여러 시퀀스를 입력하여 여러 시퀀스를 출력한다. 
# 
# seq2seq 모델의 대표적인 응용이 기계 번역인데, 하나의 문장(시퀀스)를 입력받고 다른 언어로 된 문장(시퀀스)를 출력한다.

# ![image.png](attachment:image.png)*<b><center>인코더-디코더 모델을 이용한 기계 번역</center></b>*

# <li>인코더 : 입력된 문장 토큰열을 그 문장의 의미/스타일 등의 요약 정보를 담고 있는 수치 벡터로 변환한다.</li>
# *인코더에 입력되는 문장의 토큰 순서를 뒤집어서 만든 수치 벡터를 디코더에 전달해주면 성능이 향상되는것으로 나타났다.
# <li>디코더 : 문장의 요약 정보를 담고 있는 수치 벡터를 디코딩 대상 언어 문장 토큰열로 변환한다. 각 토큰별로 변환될 수 있는 단어들의 확률값이 출력되고 Softmax를 통과시켜 최대 확률을 가지는 단어를 출력한다.</li>

# ### Implementation
# 본 구현은 연습문제 9번의 해답이다.

# #### Load dataset

# In[34]:


import os
import pickle
import copy
import numpy as np

def load_data(path):
    """Read the UTF-8 text file at ``path`` and return its whole content as one string."""
    with open(os.path.join(path), mode='r', encoding='utf-8') as handle:
        return handle.read()


# 데이터 셋은 WMT 10 French-English corpus의 축소 버전 사용

# In[35]:


# Source (English) and target (French) corpus files
source_path = 'data/small_vocab_en'
target_path = 'data/small_vocab_fr'
# Read both files completely into memory as strings
source_text = load_data(source_path)
target_text = load_data(target_path)


# #### Explore the data

# 데이터셋의 구성 확인

# In[36]:


import numpy as np
from collections import Counter

print('Dataset Brief Stats')
# Unique whitespace-separated tokens of the raw English text (no lower-casing or other cleanup)
print('* number of unique words in English sample sentences: {}\
        [this is roughly measured/without any preprocessing]'.format(len(Counter(source_text.split()))))
print()

# One sentence per line
english_sentences = source_text.split('\n')
print('* English sentences')
print('\t- number of sentences: {}'.format(len(english_sentences)))
print('\t- avg. number of words in a sentence: {}'.format(np.average([len(sentence.split()) for sentence in english_sentences])))

french_sentences = target_text.split('\n')
print('* French sentences')
print('\t- number of sentences: {} [data integrity check / should have the same number]'.format(len(french_sentences)))
print('\t- avg. number of words in a sentence: {}'.format(np.average([len(sentence.split()) for sentence in french_sentences])))
print()

# Show the first few sentence pairs side by side
sample_sentence_range = (0, 5)
side_by_side_sentences = list(zip(english_sentences, french_sentences))[sample_sentence_range[0]:sample_sentence_range[1]]
print('* Sample sentences range from {} to {}'.format(sample_sentence_range[0], sample_sentence_range[1]))

for index, sentence in enumerate(side_by_side_sentences):
    en_sent, fr_sent = sentence
    print('[{}-th] sentence'.format(index+1))
    print('\tEN: {}'.format(en_sent))
    print('\tFR: {}'.format(fr_sent))
    print()


# #### Preprocessing

# Create lookup table
# 
# 두가지 종류의 매핑 테이블 생성
# 
# vocab_to_int -> (Key,value) ==  (unique word string, its unique index) : 분류기 학습 및 입력값의 임베딩 벡터 변환에 사용 -> (1)
# 
# int_to_vocab -> (Key,value) ==  (its unique index, unique word string) : 출력값의 단어 변환을 위한 lookup table -> (2)

# ![image.png](attachment:image.png)

# In[37]:


CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

def create_lookup_tables(text):
    """Build the word <-> id lookup tables for ``text``.

    :param text: raw corpus; words are separated by any whitespace
    :return: tuple (vocab_to_int, int_to_vocab)
             vocab_to_int maps every special token in CODES and every unique
             word of ``text`` to a unique integer id;
             int_to_vocab is the inverse mapping.
    """
    # Make a list of unique words. It is sorted so the ids are the same on
    # every run (plain set order depends on string hashing), and tokens equal
    # to a special token are left out so the reserved ids cannot be overwritten.
    vocab = sorted(set(text.split()) - set(CODES))

    # (1)
    # starts with a copy of the special tokens (CODES itself is left untouched)
    vocab_to_int = dict(CODES)

    # the index (v_i) starts right after the special tokens
    # (the 2nd arg in enumerate() specifies the starting index)
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    # (2)
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab


# Text to Word Ids
# 
# Lookup table의 인덱스 값을 기준으로 raw data(문자열)을 인덱스 값으로 변환
# 변환해주지 않으면 하나의 문장은 row가 문장, column이 인덱스 값인 2차원 배열 형태로 저장되어야 함

# ![image.png](attachment:image.png)

# In[38]:


def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
        Convert two parallel corpora from words to word ids, sentence by sentence.

        1st, 2nd args: raw string text to be converted (one sentence per line,
                       words separated by single spaces; empty tokens are skipped)
        3rd, 4th args: lookup tables for 1st and 2nd args respectively

        Every target sentence additionally gets the '<EOS>' id appended, which
        tells the decoder when to stop generating.

        return: A tuple of lists (source_id_text, target_id_text) converted;
                each element is a list of id lists, one per sentence

        raises: KeyError for a word missing from its lookup table,
                IndexError when target_text has fewer lines than source_text
    """
    # make a list of sentences (extraction)
    source_sentences = source_text.split("\n")
    target_sentences = target_text.split("\n")

    eos_id = target_vocab_to_int['<EOS>']

    source_text_id = []
    target_text_id = []

    # the source drives the iteration (# of sentences in source&target is expected to be the same)
    for i, source_sentence in enumerate(source_sentences):
        target_sentence = target_sentences[i]

        source_token_id = [source_vocab_to_int[token]
                           for token in source_sentence.split(" ") if token != ""]
        target_token_id = [target_vocab_to_int[token]
                           for token in target_sentence.split(" ") if token != ""]

        # put <EOS> token at the end of the target sentence
        target_token_id.append(eos_id)

        source_text_id.append(source_token_id)
        target_text_id.append(target_token_id)

    return source_text_id, target_text_id


# Preprocess and save the data

# In[39]:


def preprocess_and_save_data(source_path, target_path, text_to_ids):
    """Preprocess both corpora and pickle the result to 'preprocess.p'.

    :param source_path: path of the source-language text file
    :param target_path: path of the target-language text file
    :param text_to_ids: callable taking (source_text, target_text,
                        source_vocab_to_int, target_vocab_to_int) and returning
                        the two texts converted to word ids

    The pickled object is the tuple
    ((source_ids, target_ids),
     (source_vocab_to_int, target_vocab_to_int),
     (source_int_to_vocab, target_int_to_vocab)).
    """
    # load original data
    source_text = load_data(source_path)
    target_text = load_data(target_path)

    # to the lower case
    source_text = source_text.lower()
    target_text = target_text.lower()

    # create lookup tables for both corpora
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    # create list of sentences whose words are represented in index
    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)

    # Save data for later use; the context manager guarantees the file is
    # flushed and closed even when pickling fails.
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text, target_text),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)


# 데이터 전처리 수행

# In[40]:


# Run the preprocessing once; the result is written to 'preprocess.p'
preprocess_and_save_data(source_path, target_path, text_to_ids)


# In[41]:


import pickle

def load_preprocess():
    """Unpickle and return the object stored in 'preprocess.p' (current working directory)."""
    with open('preprocess.p', mode='rb') as in_file:
        restored = pickle.load(in_file)
    return restored


# In[42]:


import numpy as np

# Restore the id-encoded corpora and the word -> id tables; the id -> word tables are not needed here
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = load_preprocess()


# In[40]:


from distutils.version import LooseVersion
import warnings
import tensorflow as tf
from tensorflow.python.layers.core import Dense

# Check TensorFlow Version
# NOTE(review): LooseVersion comes from distutils, which newer Python releases
# no longer ship — confirm the target interpreter still provides it.
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU: a falsy device name is treated as "no GPU available"
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))


# ### Build the Network model
# 
# 인코더 모델과 디코더 모델 두가지 서브모델들로 이루어진 sequence to sequence 모델 생성
# 
# RNN 구조로 구성된 인코더는 raw 데이터를 받아 neural representation 형태로 결과값을 출력하고, 이것이 디코더의 입력으로 사용되어 결과값을 출력하게 된다.
# 
# 
# 아래와 같은 과정을 통하여 인코더-디코더 모델을 정의하고 학습 및 추론에 이용할 수 있다.
# 
# <li>(1) 인코더 모델의 입력 파라미터 정의</li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : enc_dec_model_inputs()
# <li>(2) 인코더 모델 형성 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : encoding_layer
# <li>(3) 디코더 모델의 입력 파라미터 정의 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : enc_dec_model_inputs(), process_decoder_input(),
# <li>(4) Training을 위한 디코더 모델 형성 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : decoding_layer_train()
# <li>(5) Inference을 위한 디코더 모델 형성 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : decoding_layer_infer()
# <li>(6) 디코더 모델 통합 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : decoding_layer()
# <li>(7) 인코더-디코더 모델 통합 </li>
# &nbsp;&nbsp;&nbsp;&nbsp;해당 함수 : seq2seq_model()
# <li>(8) 모델 학습 및 검증 </li>
# 
# 

# ### Input (1), (3)

# In[43]:


def enc_dec_model_inputs():
    """Create the placeholders that feed token ids into the encoder-decoder graph.

    :return: tuple (inputs, targets, target_sequence_length, max_target_len)
             inputs / targets are rank-2 int32 placeholders with dynamic dimensions,
             target_sequence_length is a rank-1 int32 placeholder,
             max_target_len is the maximum of target_sequence_length.
    """
    id_matrix_shape = [None, None]
    inputs = tf.placeholder(tf.int32, id_matrix_shape, name='input')
    targets = tf.placeholder(tf.int32, id_matrix_shape, name='targets')

    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    # The longest target length is derived inside the graph
    longest_target = tf.reduce_max(target_sequence_length)

    return inputs, targets, target_sequence_length, longest_target


# In[44]:


def hyperparam_inputs():
    """Create the scalar float32 placeholders for the tunable hyperparameters.

    :return: tuple (lr_rate, keep_prob) — the learning rate and the keep
             probability used for dropout
    """
    lr_rate, keep_prob = [tf.placeholder(tf.float32, name=tag)
                          for tag in ('lr_rate', 'keep_prob')]
    return lr_rate, keep_prob


# In[45]:


def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for decoding: the last column of every row is
    dropped and the '<GO>' id is put in front of every row.
    :return: Preprocessed target data
    """
    # The <GO> token marks the point where the translation starts
    start_token = target_vocab_to_int['<GO>']

    # tf.strided_slice(tensor, begin, end, strides):
    # keep rows [0, batch_size) and every column except the last one
    truncated = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])

    # tf.fill builds a tensor filled with one scalar: a column of <GO> ids, one per row
    go_column = tf.fill([batch_size, 1], start_token)

    # tf.concat joins the two tensors along axis 1
    return tf.concat([go_column, truncated], 1)


# ### Encoding (2)

# 인코딩 모델은 임베딩 계층과 RNN 계층으로 구성된다.
# 
#     임베딩 계층은 tf.contrib.layers.embed_sequence()으로 구성하였다.
#     RNN 계층은 tf.contrib.rnn.LSTMCell(),tf.contrib.rnn.DropoutWrapper(), tf.contrib.rnn.MultiRNNCell() 함수를 사용하여 구성하였다.

# In[46]:


def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_vocab_size, 
                   encoding_embedding_size):
    """
    Build the encoder: an embedding layer followed by stacked LSTM cells.
    :return: tuple (RNN output, RNN state)
    """
    embedded_inputs = tf.contrib.layers.embed_sequence(rnn_inputs,
                                                       vocab_size=source_vocab_size,
                                                       embed_dim=encoding_embedding_size)

    # Stack num_layers LSTM cells, each wrapped with dropout.
    # keep_prob is passed positionally to DropoutWrapper.
    layers = []
    for _ in range(num_layers):
        lstm = tf.contrib.rnn.LSTMCell(rnn_size)
        layers.append(tf.contrib.rnn.DropoutWrapper(lstm, keep_prob))
    # MultiRNNCell chains several RNN cells into one multi-layer cell
    stacked_cells = tf.contrib.rnn.MultiRNNCell(layers)

    # Run the stacked cells over the embedded inputs; yields (outputs, state)
    return tf.nn.dynamic_rnn(stacked_cells, embedded_inputs, dtype=tf.float32)


# ### Decoding

# 디코딩 모델은 학습 단계와 추론 단계에서 서로 다른 프로세스가 이루어진다. 
# 학습 단계에서는 타겟 데이터에 정해진 라벨 대로 정해진 값이 다음 스텝으로 전달되지만, 추론 단계에서는 매 스탭마다 결정된 동적인 값을 전달받는다.

# ![image.png](attachment:image.png)

# 두 단계가 서로 다른 방식으로 임베딩 데이터를 사용하므로 디코딩 계층을 만드는 함수를 각각 생성한다.

# ### Decoding - 학습 단계 (4)

# 학습 단계에서는 입력에 따른 사전 정의된 임베딩 값을 사용한다. 
# 
# <a href="https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/TrainingHelper">tf.contrib.seq2seq.TrainingHelper()</a> 함수를 사용하여 입력 값을 전달한다.
# 
# RNN 학습 과정에 사용되는 helper 함수로 단순히 입력 값을 읽어오고, 다음 스텝에 사용될수 있도록 해당하는 인덱스값을 리턴한다.

# In[47]:


def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
    """
    Create the training process of the decoding layer.
    The decoder starts from encoder_state and is fed dec_embed_input at every step.
    :return: BasicDecoderOutput containing training logits and sample_id
    """
    # Dropout is applied to the outputs of the decoder cell
    dropout_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob=keep_prob)

    # The helper reads the next decoder input from the embedded target sequence
    feed_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, target_sequence_length)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(dropout_cell, feed_helper,
                                                    encoder_state, output_layer)

    # Unroll the decoder; only the first element of the result (the outputs) is returned
    decoded = tf.contrib.seq2seq.dynamic_decode(train_decoder,
                                                impute_finished=True,
                                                maximum_iterations=max_summary_length)
    return decoded[0]


# ### Decoding - 추론 단계 (5)

# 추론 단계에서는 매 스텝마다 생성되는 결과물을 재 입력 받아야 하기 때문에 동적으로 임베딩 계층을 통과시켜야 한다.
# <a href="https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/GreedyEmbeddingHelper">tf.contrib.seq2seq.GreedyEmbeddingHelper()</a> 함수를 사용하여 현재 스텝의 결과물을 임베딩 계층에 통과시켜 다음 입력으로 사용될 수 있도록 한다.

# In[48]:


def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    """
    Create the inference process of the decoding layer.
    Every sequence starts with start_of_sequence_id; each produced token is
    embedded with dec_embeddings and fed back as the next input.
    vocab_size is accepted but not used by this function.
    :return: BasicDecoderOutput containing inference logits and sample_id
    """
    dropout_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob=keep_prob)

    # One start token per sequence in the batch
    start_tokens = tf.fill([batch_size], start_of_sequence_id)
    greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,
                                                             start_tokens,
                                                             end_of_sequence_id)

    infer_decoder = tf.contrib.seq2seq.BasicDecoder(dropout_cell, greedy_helper,
                                                    encoder_state, output_layer)

    # Unroll the decoder; only the first element of the result (the outputs) is returned
    decoded = tf.contrib.seq2seq.dynamic_decode(infer_decoder,
                                                impute_finished=True,
                                                maximum_iterations=max_target_sequence_length)
    return decoded[0]


# ### Decoding 계층 통합 (6)

# In[49]:


def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size):
    """
    Assemble the decoder: target embeddings, a stacked LSTM cell, a shared
    output projection, and both the training and the inference branches.

    The inference branch is built under the same "decode" variable scope with
    ``reuse=True`` and receives the same cell, embeddings and output layer
    objects as the training branch.

    :param dec_input: target token ids fed to the embedding lookup
    :param encoder_state: encoder state passed to both decoder branches
    :param target_sequence_length: forwarded to the training branch
    :param max_target_sequence_length: forwarded to both branches
    :param rnn_size: number of units of each LSTM cell
    :param num_layers: number of stacked LSTM cells
    :param target_vocab_to_int: target vocabulary; must contain '<GO>' and '<EOS>'
    :param target_vocab_size: ignored; the size is re-derived from ``target_vocab_to_int``
    :param batch_size: forwarded to the inference branch
    :param keep_prob: forwarded to both branches
    :param decoding_embedding_size: width of the target embedding vectors
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # The vocabulary size always comes from the mapping itself.
    vocab_size = len(target_vocab_to_int)

    embedding_shape = [vocab_size, decoding_embedding_size]
    dec_embeddings = tf.Variable(tf.random_uniform(embedding_shape))
    embedded_dec_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    lstm_layers = []
    for _ in range(num_layers):
        lstm_layers.append(tf.contrib.rnn.LSTMCell(rnn_size))
    stacked_cell = tf.contrib.rnn.MultiRNNCell(lstm_layers)

    with tf.variable_scope("decode"):
        output_layer = tf.layers.Dense(vocab_size)
        train_output = decoding_layer_train(
            encoder_state, stacked_cell, embedded_dec_input,
            target_sequence_length, max_target_sequence_length,
            output_layer, keep_prob)

    with tf.variable_scope("decode", reuse=True):
        infer_output = decoding_layer_infer(
            encoder_state=encoder_state,
            dec_cell=stacked_cell,
            dec_embeddings=dec_embeddings,
            start_of_sequence_id=target_vocab_to_int['<GO>'],
            end_of_sequence_id=target_vocab_to_int['<EOS>'],
            max_target_sequence_length=max_target_sequence_length,
            vocab_size=vocab_size,
            output_layer=output_layer,
            batch_size=batch_size,
            keep_prob=keep_prob)

    return (train_output, infer_output)


# ### Build the Seq2seq model (7)

# 1~6번 과정에서 정의한 함수들을 통합하여 seq2seq 모델 생성을 위한 함수를 정의한다.

# In[50]:


def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    """
    Wire the encoder, the decoder-input preprocessing and the decoder together.

    :param input_data: source token ids passed to the encoder
    :param target_data: target token ids, preprocessed into the decoder input
    :param keep_prob: keep probability forwarded to the encoder and the decoder
    :param batch_size: batch size forwarded to preprocessing and the decoder
    :param target_sequence_length: forwarded to the decoder
    :param max_target_sentence_length: forwarded to the decoder as its maximum length
    :param source_vocab_size: forwarded to the encoder
    :param target_vocab_size: forwarded to the decoder
    :param enc_embedding_size: encoder embedding width
    :param dec_embedding_size: decoder embedding width
    :param rnn_size: RNN size shared by encoder and decoder
    :param num_layers: layer count shared by encoder and decoder
    :param target_vocab_to_int: target vocabulary mapping
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # Only the encoder state is consumed; its per-step outputs are discarded.
    _, enc_states = encoding_layer(
        input_data, rnn_size, num_layers, keep_prob,
        source_vocab_size, enc_embedding_size)

    dec_input = process_decoder_input(target_data, target_vocab_to_int, batch_size)

    training_branch, inference_branch = decoding_layer(
        dec_input=dec_input,
        encoder_state=enc_states,
        target_sequence_length=target_sequence_length,
        max_target_sequence_length=max_target_sentence_length,
        rnn_size=rnn_size,
        num_layers=num_layers,
        target_vocab_to_int=target_vocab_to_int,
        target_vocab_size=target_vocab_size,
        batch_size=batch_size,
        keep_prob=keep_prob,
        decoding_embedding_size=dec_embedding_size)

    return training_branch, inference_branch


# ### Training

# 모델 생성 및 학습을 위한 하이퍼 파라미터 설정
# 

# In[51]:


# Print progress every `display_step` batches.
display_step = 300

# Training schedule: number of passes over the data and examples per batch.
epochs, batch_size = 13, 128

# Recurrent network shape: units per cell and number of stacked cells.
rnn_size, num_layers = 128, 3

# Embedding widths for the source (encoder) and target (decoder) vocabularies.
encoding_embedding_size = 150
decoding_embedding_size = 150

# Optimizer step size and dropout keep probability used while training.
learning_rate, keep_probability = 0.001, 0.5


# In[52]:


# Checkpoint prefix the Saver writes to once training has finished.
save_path = 'checkpoints/dev'
# Only the integer-encoded texts and the vocab->id maps are needed here; the third item is discarded.
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = load_preprocess()
# NOTE(review): this is computed from the *source* sentences and is not referenced again while
# building the graph below (the graph uses the max_target_sequence_length tensor returned by
# enc_dec_model_inputs()) — confirm whether it is still needed.
max_target_sentence_length = max([len(sentence) for sentence in source_int_text])

# Build everything inside a dedicated graph so it can be run by its own session.
train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, target_sequence_length, max_target_sequence_length = enc_dec_model_inputs()
    lr, keep_prob = hyperparam_inputs()
    
    # The model receives the source ids reversed along the last axis.
    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)
    
    # Give the two outputs stable names ('logits', 'predictions'); the translate step later
    # looks up 'predictions:0' by name after reloading the graph.
    # Note that inference_logits is rebound to the sampled ids, not to logits.
    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # https://www.tensorflow.org/api_docs/python/tf/sequence_mask
    # - Returns a mask tensor representing the first N positions of each cell.
    # The float mask is used as the per-position weight of the sequence loss.
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function - weighted softmax cross entropy
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping: every gradient value is clipped to [-1, 1]; variables without a
        # gradient (None) are skipped.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


# ### Padding source and target sequence

# ![image.png](attachment:image.png)<center><b>데이터 패딩</b></center>

# In[53]:


def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length.

    :param sentence_batch: sequence of sentences, each a sequence of token ids
    :param pad_int: id appended to the shorter sentences
    :return: a new list of lists, all as long as the longest sentence; the
        input sentences are not modified. An empty batch yields an empty list.
    """
    # default=0 keeps an empty batch from raising ValueError in max().
    longest = max((len(sentence) for sentence in sentence_batch), default=0)
    # list(...) copies the row and lets tuples (or other sequences) be padded too.
    return [list(sentence) + [pad_int] * (longest - len(sentence))
            for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Yield padded (sources, targets, source lengths, target lengths) batches.

    Only full batches are produced; a trailing remainder smaller than
    ``batch_size`` is dropped. The yielded lengths are measured on the rows
    *after* padding, so within one batch they all equal the padded width.

    :param sources: source sentences as lists of token ids
    :param targets: target sentences as lists of token ids
    :param batch_size: number of sentence pairs per batch
    :param source_pad_int: padding id for the source side
    :param target_pad_int: padding id for the target side
    """
    full_batches = len(sources) // batch_size
    for offset in range(0, full_batches * batch_size, batch_size):
        window = slice(offset, offset + batch_size)
        source_rows = sources[window]
        target_rows = targets[window]

        padded_sources = np.array(pad_sentence_batch(source_rows, source_pad_int))
        padded_targets = np.array(pad_sentence_batch(target_rows, target_pad_int))

        # Lengths fed to the *_length inputs of the model (taken from the padded rows).
        target_lengths = [len(row) for row in padded_targets]
        source_lengths = [len(row) for row in padded_sources]

        yield padded_sources, padded_targets, source_lengths, target_lengths


# In[52]:


def get_accuracy(target, logits):
    """
    Return the fraction of positions where ``target`` and ``logits`` agree.

    Both arguments are 2-D arrays of ids. The narrower one is right-padded
    with zeros along axis 1 so that both have the same width before they are
    compared element-wise.
    """
    width = max(target.shape[1], logits.shape[1])

    def _widen(matrix):
        # Right-pad the second axis with zeros up to the common width.
        missing = width - matrix.shape[1]
        if not missing:
            return matrix
        return np.pad(matrix, [(0, 0), (0, missing)], 'constant')

    matches = np.equal(_widen(target), _widen(logits))
    return np.mean(matches)

# Split data to training and validation sets:
# the first `batch_size` sentence pairs are held out for validation, the rest are used for training.
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = source_int_text[:batch_size]
valid_target = target_int_text[:batch_size]
# The validation slice holds exactly one batch, so take the first batch from the generator.
(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             source_vocab_to_int['<PAD>'],
                                                                                                             target_vocab_to_int['<PAD>']))                                                                                                  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_vocab_to_int['<PAD>'],
                            target_vocab_to_int['<PAD>'])):

            # One optimization step with dropout at the training keep probability.
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 keep_prob: keep_probability})


            # Every `display_step` batches (except batch 0) report accuracy using the
            # inference output, fed with keep_prob=1.0.
            if batch_i % display_step == 0 and batch_i > 0:
                batch_train_logits = sess.run(
                    inference_logits,
                    {input_data: source_batch,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})

                batch_valid_logits = sess.run(
                    inference_logits,
                    {input_data: valid_sources_batch,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)

                # NOTE(review): the batch total printed here is derived from len(source_int_text),
                # which still includes the validation slice held out above.
                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))

    # Save Model (once, after all epochs) under the save_path prefix
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')


# In[55]:


def save_params(params):
    """Pickle ``params`` into the file ``params.p``, replacing any previous content."""
    with open('params.p', mode='wb') as handle:
        pickle.dump(params, handle)


def load_params():
    """Return the object previously pickled into the file ``params.p``."""
    # NOTE: unpickling can execute arbitrary code; only load a params.p you wrote yourself.
    with open('params.p', 'rb') as handle:
        restored = pickle.load(handle)
    return restored


# In[54]:


# Save parameters for checkpoint: persist the checkpoint prefix (save_path) to params.p
# so that it can be read back with load_params() before restoring the model.
save_params(save_path)


# ### Checkpoint

# In[56]:


import tensorflow as tf
import numpy as np

# Reload the vocab->id and id->vocab maps (the encoded texts are not needed here)
# and the checkpoint prefix that was stored in params.p.
_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = load_preprocess()
load_path = load_params()


# ### Translate

# In[60]:


def sentence_to_seq(sentence, vocab_to_int):
    """Convert a sentence into a list of word ids.

    The sentence is split on single spaces; every piece found in
    ``vocab_to_int`` is replaced by its id and every other piece (including
    the empty strings produced by consecutive spaces) by the id of '<UNK>'.
    """
    # '<UNK>' is only looked up when an unknown word is actually encountered.
    return [
        vocab_to_int[word] if word in vocab_to_int else vocab_to_int['<UNK>']
        for word in sentence.split(" ")
    ]

# Sentence to translate; it is replaced by its list of source word ids right below.
translate_sentence = 'i like apple .'

translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    # Fetch the tensors needed for inference by name from the restored graph.
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    # The sentence is repeated batch_size times and only row 0 of the result is kept
    # (presumably because the inference branch was built for a fixed batch size — confirm).
    # The target length fed is twice the number of source tokens.
    translate_logits = sess.run(logits, {input_data: [translate_sentence]*batch_size,
                                         target_sequence_length: [len(translate_sentence)*2]*batch_size,
                                         keep_prob: 1.0})[0]

# Show the input ids/words and the predicted ids/words side by side.
print('Input')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in translate_logits]))
print('  French Words: {}'.format(" ".join([target_int_to_vocab[i] for i in translate_logits])))


# ### 연습문제 해답

# #### 1 시퀀스 투 시퀀스 RNN을 사용한 어플리케이션에는 어떤 것들이 있나요? 시퀀스-투-벡터 RNN과 벡터-투-시퀀스 RNN은 어떤가요?

# Sequence to sequence RNN application : 날씨 예측, 기계 번역, 비디오 캡션 생성, 스피치 투 텍스트, 음악 생성, 노래의 화음 식별
# 
# 시퀀스 투 벡터 RNN : 음악 샘플을 장르로 구분하기, 책 후기에 대한 감성 분석, 뇌에 심은 인공칩에서 읽은 데이터를 기반으로 실어증 환자가 생각하는 단어 예측하기, 사용자의 영화 시청 이력을 바탕으로 보고 싶어 할 영화의 확률 예측하기
# <br>
# 벡터 투 시퀀스 RNN : 이미지 캡션 생성, 현재 아티스트를 기반으로 음악 플레이리스트 생성, 일련의 파라미터를 기반으로 한 멜로디 생성, 사진 속에서 보행자 위치 찾기

# #### 2 왜 자동 번역에 시퀀스-투-시퀀스 RNN 대신 인코더- 디코더 RNN을 사용하나요?

# 일반적으로 문장을 한번에 단어 하나씩 번역하면 결과가 매우 좋지 않음
# - 예를 들어 프랑스 문장 'Je vous en prie'는 'You are welcome'을 의미함
# 하지만 이를 한 단어씩 번역하면 'I you in pray'가 되어 버림. 따라서 먼저 전체 문장을 읽고 난 다음에 번역하는 것이 훨씬 좋음
# 보통의 시퀀스-투-시퀀스 RNN은 첫 단어를 읽은 후 즉시 문장을 번역하기 시작하지만 인코더-디코더 RNN은 먼저 전체 문장을 읽고 난 다음에 번역을 함
# 이는 다음에 말할 것이 확실하지 않을 때마다 침묵을 출력하는 시퀀스-투-시퀀스 RNN으로 생각할 수도 있습니다.

# #### 3 동영상을 분류하기 위해 합성곱 신경망과 RNN을 어떻게 연결할 수 있나요?

# 화면 내용을 기초로 동영상을 분류하려면 초당 한 프레임을 받아 각 프레임을 합성곱 신경망에 통과시키고 이 CNN의 출력을 시퀀스-투 벡터 RNN에 주입하고 마지막에 소프트맥스 층을 통과시켜 모든 클래스에 대한 확률을 구하는 구조를 생각해볼 수 있음.<br>
# 
# 훈련을 위해서는 크로스 엔트로피를 비용 함수로 사용하면 됩니다. 분류에 오디오도 사용하려면 매 초의 오디오를 스펙트럼 사진으로 변환하고 이 사진을 CNN에 주입한 다음 이 CNN의 출력을 RNN에 주입함

# #### 4 static_rnn() 대신 dynamic_rnn()을 사용하여 RNN을 구축할 때의 장점은 무엇인가요?

# 1. 메모리 부족 에러를 피하기 위해 역전파하는동안 GPU 메모리를 CPU 메모리로 대체할 수 있는 while_loop()연산을 기반으로 함
# 2. 입력과 출력에 하나의 텐서를 사용하기 때문에 텐서의 리스트를 사용하는 것보다 사용하기 편리함 
#     - stack, unstack, transpose 연산이 필요 없음
# 3. 더 작은 그래프를 만들기 때문에 텐서보드에서 확인하기 쉬움 

# #### 5 가변 길이 입력 시퀀스를 어떻게 다룰 수 있나요? 가변 길이 출력 시퀀스는 어떤가요?

# 가변 길이 입력 시퀀스를 다루기 위한 가장 간단한 방법은 
# 1. static_rnn()이나 dynamic_rnn()함수를 호출할 때 sequence_length 매개변수를 설정하는 것 
# 2. 가장 큰 입력의 크기에 맞추기 위해 작은 입력값으로 패딩을 추가하는 것
# 
# 가변 길이의 출력 시퀀스를 다루기 위해서는 
# 1. 출력 시퀀스의 길이를 미리 알고 있다면 sequence_length 매개변수를 사용할 수 있음 
# 2. 출력 시퀀스의 길이를 미리 알지 못하면 패딩 트릭을 사용할 수 있음 
#     - 즉 항상 같은 크기의 시퀀스를 출력하고, EOS 토큰 이후의 출력은 무시합니다.

# #### 6 여러 GPU에 심층 RNN의 훈련과 실행을 분산시키는 일반적인 방법은 무엇인가요?

# 여러 GPU에 심층 RNN의 훈련과 실행을 분산시키기 위한 일반적인 방법은 각각의 층을 다른 GPU에 배치하는 것입니다.

# #### 7 임베딩된 레버 문법

# - 먼저 문법에 맞는 문자열을 생성하는 함수가 필요
# - 이 문법은 각 상태에서 가능한 전이 상태의 리스트임
# - 해당 변환은 출력할 문자열과 다음 상태를 지정함

# In[ ]:


# In[87]:


from random import choice, seed

# Seed both pseudo-random generators (random and numpy) for consistent output
seed(42)
np.random.seed(42)

# A grammar is a list indexed by state. Each state lists its possible transitions as
# (production, next_state) pairs; next_state None means the string is complete.
default_reber_grammar = [
    [("B", 1)],           # (state 0) =B=>(state 1)
    [("T", 2), ("P", 3)], # (state 1) =T=>(state 2) or =P=>(state 3)
    [("S", 2), ("X", 4)], # (state 2) =S=>(state 2) or =X=>(state 4)
    [("T", 3), ("V", 5)], # and so on..
    [("X", 3), ("S", 6)],
    [("P", 4), ("V", 6)],
    [("E", None)]]        # (state 6) =E=>(terminal state)

# In the embedded grammar a production may itself be a grammar (a list): states 2 and 3
# each emit a whole default Reber string, followed by "T" (via state 4) or "P" (via state 5).
embedded_reber_grammar = [
    [("B", 1)],
    [("T", 2), ("P", 3)],
    [(default_reber_grammar, 4)],
    [(default_reber_grammar, 5)],
    [("T", 6)],
    [("P", 6)],
    [("E", None)]]

def generate_string(grammar):
    """Generate one random string that follows ``grammar``.

    Starting in state 0, a transition ``(production, next_state)`` is picked
    at random for the current state until ``next_state`` is ``None``. A
    production that is itself a grammar (a list) is expanded recursively.

    :param grammar: list of states, each a list of (production, next_state) pairs
    :return: the concatenation of all emitted productions
    """
    pieces = []
    current = 0
    while current is not None:
        symbol, current = choice(grammar[current])
        # A list-valued production is a nested grammar: emit one full string of it.
        pieces.append(generate_string(grammar=symbol) if isinstance(symbol, list) else symbol)
    return "".join(pieces)


# - Default Reber grammar에 맞는 문자열 만드는 것을 확인

# In[88]:


# Show 25 strings generated from the default Reber grammar, separated by spaces.
print(*(generate_string(default_reber_grammar) for _ in range(25)), end=" ")


# - Embedding Reber grammar에 맞는 문자열 만드는 것을 확인

# In[89]:


# Show 25 strings generated from the embedded Reber grammar, separated by spaces.
print(*(generate_string(embedded_reber_grammar) for _ in range(25)), end=" ")


# - 문법을 따르지 않는 문자열을 만들 함수 생성
# - 문법을 따르는 문자열을 만든 후 하나의 문자만 변경

# In[90]:


def generate_corrupted_string(grammar, chars="BEPSTVX"):
    """Generate a string that follows ``grammar`` and then corrupt one character.

    A valid string is generated, a random position is drawn with numpy's
    generator, and the character at that position is replaced by a different
    character picked from ``chars`` with ``random.choice``.

    :param grammar: grammar passed to ``generate_string``
    :param chars: alphabet the replacement character is drawn from
    :return: the valid string with exactly one character replaced
    """
    good_string = generate_string(grammar)
    index = np.random.randint(len(good_string))
    good_char = good_string[index]
    # Sort the candidates: the iteration order of a set of strings depends on
    # per-process hash randomization, so picking from an unsorted set would not
    # be reproducible across interpreter runs even with seeded generators.
    candidates = sorted(set(chars) - {good_char})
    bad_char = choice(candidates)
    return good_string[:index] + bad_char + good_string[index + 1:]


# - 여러개의 잘못된 문자열 생성

# In[91]:


# Show 25 corrupted embedded-Reber strings, separated by spaces.
print(*(generate_corrupted_string(embedded_reber_grammar) for _ in range(25)), end=" ")


# - 문자열을 바로 RNN에 주입할 수는 없음
# - 먼저 벡터로 바꾸어야 함
# - 각 벡터는 one-hot 인코딩을 사용하여 하나의 문자를 나타냄
# - 예를 들어, 벡터 [1, 0, 0, 0, 0, 0, 0]는 문자 "B"를 나타내고 벡터 [0, 1, 0, 0, 0, 0, 0]는 문자 "E"를 나타내는 식
# - 이런 원-핫 벡터의 연속으로 문자열을 바꾸는 함수로 생성
# - 문자열이 n_steps보다 짧으면 0 벡터로 패딩됨
# (나중에, 텐서플로에게 각 문자열의 실제 길이를 sequence_length 매개변수로 전달할 것).

# In[92]:


def string_to_one_hot_vectors(string, n_steps, chars="BEPSTVX"):
    """Encode ``string`` as an ``(n_steps, len(chars))`` int32 one-hot matrix.

    Row ``i`` has a single 1 in the column of ``string[i]`` (its position in
    ``chars``); rows past the end of the string stay all-zero padding.

    :param string: text to encode; every character must occur in ``chars``
    :param n_steps: number of rows; must be at least ``len(string)``
    :param chars: alphabet defining the column order
    """
    column_of = {symbol: column for column, symbol in enumerate(chars)}
    one_hot = np.zeros((n_steps, len(chars)), dtype=np.int32)
    # Explicit integer dtype so that an empty string still gives a valid index array.
    columns = np.array([column_of[symbol] for symbol in string], dtype=np.intp)
    one_hot[np.arange(len(columns)), columns] = 1
    return one_hot


# In[93]:


# Example: encode a 9-character string into 12 steps; the last 3 rows remain all-zero padding.
string_to_one_hot_vectors("BTBTXSETE", 12)


# - 50%의 올바른 문자열과 50%의 잘못된 문자열로 이루어진 데이터 셋을 생성

# In[94]:


def generate_dataset(size):
    """Build a shuffled dataset of valid and corrupted embedded Reber strings.

    ``size // 2`` strings follow the grammar (label 1) and the remaining ones
    are corrupted (label 0). All strings are one-hot encoded and zero-padded
    to the length of the longest string in the dataset.

    :param size: total number of examples
    :return: tuple ``(X, seq_length, y)`` — one-hot inputs, the unpadded
        length of every string, and the labels as a column of 0/1 — all
        shuffled with the same random permutation
    """
    n_good = size // 2
    n_bad = size - n_good

    # Valid strings are generated first, corrupted ones second.
    good_strings = [generate_string(embedded_reber_grammar) for _ in range(n_good)]
    bad_strings = [generate_corrupted_string(embedded_reber_grammar) for _ in range(n_bad)]
    all_strings = good_strings + bad_strings

    lengths = [len(text) for text in all_strings]
    n_steps = max(lengths)

    X = np.array([string_to_one_hot_vectors(text, n_steps) for text in all_strings])
    seq_length = np.array(lengths)
    y = np.array([[1]] * n_good + [[0]] * n_bad)

    # One permutation shared by all three arrays keeps examples aligned.
    order = np.random.permutation(size)
    return X[order], seq_length[order], y[order]


# In[95]:


# Training set of 10,000 examples: one-hot inputs, unpadded string lengths, and 0/1 labels.
X_train, l_train, y_train = generate_dataset(10000)


# - 첫번째 training instance 확인

# In[45]:


# Display the first training instance (its one-hot matrix) as the notebook cell output.
X_train[0]


# 데이터 세트에 가장 긴 문자열이 존재하기 때문에 0값이 많은 것을 확인 할 수 있음.
# 
# - 이제 좋은 문자열을 식별하기 위해 RNN을 만들 준비가 됨
# 
# MNIST 이미지를 분류하기 위해 앞서 만들었던 시퀀스 classifier 만들기.
# 
# 주의사항
# 1. 입력 문자열은 가변 길이 이므로 dynamic_rnn 함수를 호출할 때 sequence_length를 지정해야함.
# 2. binary classifier이므로 각 입력 문자열이 올바른 문자열일 추정 로그 확률(logit)을 출력하는 출력 뉴런 하나만 필요함.

# In[98]:


# Start from a clean graph (reset_graph is defined elsewhere in this file).
reset_graph()

# One input feature per possible character (one-hot) and a single output logit.
possible_chars = "BEPSTVX"
n_inputs = len(possible_chars)
n_neurons = 30
n_outputs = 1

# NOTE: this rebinds the module-level learning_rate set earlier for the seq2seq model.
learning_rate = 0.02
momentum = 0.95

# X: [batch, steps, n_inputs]; seq_length: unpadded length per example; y: [batch, 1] labels.
X = tf.placeholder(tf.float32, [None, None, n_inputs], name="X")
seq_length = tf.placeholder(tf.int32, [None], name="seq_length")
y = tf.placeholder(tf.float32, [None, 1], name="y")

# Single GRU layer; sequence_length is passed so each example is processed up to its own length.
gru_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(gru_cell, X, dtype=tf.float32,
                                    sequence_length=seq_length)

# The final state is projected to one logit; logit > 0 corresponds to sigmoid(logit) > 0.5.
logits = tf.layers.dense(states, n_outputs, name="logits")
y_pred = tf.cast(tf.greater(logits, 0.), tf.float32, name="y_pred")
y_proba = tf.nn.sigmoid(logits, name="y_proba")

# Binary cross entropy computed from the logits, averaged over the batch.
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=momentum,
                                       use_nesterov=True)
training_op = optimizer.minimize(loss)

# Accuracy: fraction of examples whose thresholded prediction equals the label.
correct = tf.equal(y_pred, y, name="correct")
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
saver = tf.train.Saver()


# In[99]:


# Separately generated set of 5,000 examples, used to measure validation accuracy.
X_val, l_val, y_val = generate_dataset(5000)


# In[100]:


n_epochs = 50
# NOTE: this rebinds the module-level batch_size used earlier by the seq2seq cells.
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        # Split inputs, lengths and labels into the same number of aligned mini-batches.
        X_batches = np.array_split(X_train, len(X_train) // batch_size)
        l_batches = np.array_split(l_train, len(l_train) // batch_size)
        y_batches = np.array_split(y_train, len(y_train) // batch_size)
        for X_batch, l_batch, y_batch in zip(X_batches, l_batches, y_batches):
            loss_val, _ = sess.run(
                [loss, training_op],
                feed_dict={X: X_batch, seq_length: l_batch, y: y_batch})
        # Training accuracy (and the loss printed below) refer to the last mini-batch of the
        # epoch only; validation accuracy is computed on the whole validation set.
        acc_train = accuracy.eval(feed_dict={X: X_batch, seq_length: l_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_val, seq_length: l_val, y: y_val})
        print("{:4d}  Train loss: {:.4f}, accuracy: {:.2f}%  Validation accuracy: {:.2f}%".format(
            epoch, loss_val, 100 * acc_train, 100 * acc_val))
        # Checkpoint after every epoch, always to the same path.
        saver.save(sess, "./my_reber_classifier")


# 이제 RNN을 두 개의 까다로운 문자열로 테스트 해보기
# 첫번째는 나쁜것이고 두번째는 좋은것.
# 두 문자열은 끝에서 두 번째 문자만 다르며, RNN이 이를 올바르게 구분한다면 두 번째 문자가 
# 항상 끝에서 두 번째 문자와 같아야 한다는 패턴을 알아차렸다는 것을 보여줌
# 
# - 이를 위해서 상당히 긴 단기 메모리가 필요함 - GRU Cell을 사용하는 이유

# In[102]:


# Two strings that differ only in their second-to-last character ("T" vs "P").
test_strings = [
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"]
# Unpadded lengths, and one-hot matrices padded to the longest test string.
l_test = np.array([len(s) for s in test_strings])
max_length = l_test.max()
X_test = [string_to_one_hot_vectors(s, n_steps=max_length)
          for s in test_strings]

# Restore the weights saved during training and evaluate the sigmoid output for both strings.
with tf.Session() as sess:
    saver.restore(sess, "./my_reber_classifier")
    y_proba_val = y_proba.eval(feed_dict={X: X_test, seq_length: l_test})

print()
print("Estimated probability that these are Reber strings:")
for index, string in enumerate(test_strings):
    print("{}: {:.2f}%".format(string, 100 * y_proba_val[index][0]))


# 참조 자료
# <list>
# <li>https://www.data-blogger.com/2017/08/27/gru-implementation-tensorflow/</li>
#     <li>순환 신경망 LSTM, GRU 설명 : https://excelsior-cjh.tistory.com/185</li>
#     <li>GRU Wiki : https://en.wikipedia.org/wiki/Gated_recurrent_unit</li>
#     <li>순환 신경망 모델 만들기 :https://ratsgo.github.io/natural%20language%20processing/2017/03/09/rnnlstm/</li>
#     <li>LSTM example : https://m.blog.naver.com/PostView.nhn?blogId=wideeyed&logNo=221158850266&proxyReferer=https%3A%2F%2Fwww.google.com%2F</li>
#     <li>LSTM example : https://tykimos.github.io/2017/04/09/RNN_Layer_Talk/</li>
#     <li>워드 임베딩 : https://datascienceschool.net/view-notebook/6927b0906f884a67b0da9310d3a581ee/</li>
#     <li>워드 임베딩 : https://dreamgonfly.github.io/machine/learning,/natural/language/processing/2017/08/16/word2vec_explained.html</li>
# <li>희소 벡터, 밀집 벡터 : https://wikidocs.net/33520 </li>
#     <li>기계 번역 : https://github.com/tensorflow/nmt#introduction</li>
#     <li>FR/EN 기계 번역 https://github.com/denisb411/seq2seq-NMT-tensorflow</li>
#     <li> 기계 번역 코드 상세 설명 : https://github.com/deep-diver/EN-FR-MLT-tensorflow/blob/master/dlnd_language_translationv2.ipynb </li>