You can view this notebook in the Jupyter notebook viewer (nbviewer.jupyter.org) or run it in Google Colab (colab.research.google.com) via the links below.
View in Jupyter Notebook Viewer | Run in Google Colab |
from IPython.display import Image
Image(url='https://git.io/JLdVm', width=700)
Image(url='https://git.io/JLdVO', width=700)
Image(url='https://git.io/JLdV3', width=700)
Image(url='https://git.io/JLdVs', width=700)
Image(url='https://git.io/JLdVC', width=700)
Image(url='https://git.io/JLdVW', width=700)
Image(url='https://git.io/JLdV8', width=700)
import tensorflow as tf
tf.random.set_seed(1)
rnn_layer = tf.keras.layers.SimpleRNN(
    units=2, use_bias=True,
    return_sequences=True)
rnn_layer.build(input_shape=(None, None, 5))

w_xh, w_oo, b_h = rnn_layer.weights

print('W_xh shape:', w_xh.shape)
print('W_oo shape:', w_oo.shape)
print('b_h shape:', b_h.shape)
W_xh shape: (5, 2)
W_oo shape: (2, 2)
b_h shape: (2,)
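For reference, the recurrence these three weight tensors implement can be written compactly (this mirrors the manual computation below; $o_{t-1}$ denotes the previous output, initialized to a zero vector at $t=0$):

$o_t = \tanh\left(x_t W_{xh} + o_{t-1} W_{oo} + b_h\right)$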
x_seq = tf.convert_to_tensor(
    [[1.0]*5, [2.0]*5, [3.0]*5],
    dtype=tf.float32)

## Output of SimpleRNN:
output = rnn_layer(tf.reshape(x_seq, shape=(1, 3, 5)))

## Computing the output manually:
out_man = []
for t in range(len(x_seq)):
    xt = tf.reshape(x_seq[t], (1, 5))
    print('Time step {} =>'.format(t))
    print('    Input           :', xt.numpy())

    ht = tf.matmul(xt, w_xh) + b_h
    print('    Hidden          :', ht.numpy())

    if t > 0:
        prev_o = out_man[t-1]
    else:
        prev_o = tf.zeros(shape=(ht.shape))

    ot = ht + tf.matmul(prev_o, w_oo)
    ot = tf.math.tanh(ot)
    out_man.append(ot)
    print('    Output (manual) :', ot.numpy())
    print('    SimpleRNN output:', output[0][t].numpy())
    print()
Time step 0 =>
    Input           : [[1. 1. 1. 1. 1.]]
    Hidden          : [[0.96635884 0.9771539 ]]
    Output (manual) : [[0.7470998 0.7518313]]
    SimpleRNN output: [0.7470998 0.7518313]

Time step 1 =>
    Input           : [[2. 2. 2. 2. 2.]]
    Hidden          : [[1.9327177 1.9543078]]
    Output (manual) : [[0.99347186 0.89415705]]
    SimpleRNN output: [0.99347186 0.89415705]

Time step 2 =>
    Input           : [[3. 3. 3. 3. 3.]]
    Hidden          : [[2.8990765 2.9314618]]
    Output (manual) : [[0.99937034 0.9767568 ]]
    SimpleRNN output: [0.99937034 0.9767568 ]
Image(url='https://git.io/JLdV4', width=700)
Image(url='https://git.io/JLdVR', width=700)
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
# If you are running this notebook on Colab, execute the following code.
!mkdir ../ch08
!wget https://github.com/rickiepark/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz -O ../ch08/movie_data.csv.gz
--2023-11-10 06:02:41--  https://github.com/rickiepark/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2023-11-10 06:02:41--  https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘../ch08/movie_data.csv.gz’

../ch08/movie_data. 100%[===================>]  25.29M   142MB/s    in 0.2s

2023-11-10 06:02:41 (142 MB/s) - ‘../ch08/movie_data.csv.gz’ saved [26521894/26521894]
import os
import gzip
import shutil
with gzip.open('../ch08/movie_data.csv.gz', 'rb') as f_in, open('movie_data.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.tail()
| | review | sentiment |
|---|---|---|
| 49995 | OK, lets start with the best. the building. al... | 0 |
| 49996 | The British 'heritage film' industry is out of... | 0 |
| 49997 | I don't even know where to begin on this one. ... | 0 |
| 49998 | Richard Tyler is a little boy who is scared of... | 0 |
| 49999 | I waited long to watch this movie. Also becaus... | 1 |
# Step 1: Create the dataset

target = df.pop('sentiment')

ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.values, target.values))

## Inspection:
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])
b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(
50000, reshuffle_each_iteration=False)
ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)
tfds.deprecated.text.Tokenizer: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/Tokenizer
tfds.deprecated.text.TokenTextEncoder: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/TokenTextEncoder

## Step 2: Find the unique tokens (words)
from collections import Counter

try:
    tokenizer = tfds.features.text.Tokenizer()
except AttributeError:
    tokenizer = tfds.deprecated.text.Tokenizer()

token_counts = Counter()

for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    token_counts.update(tokens)

print('Vocabulary size:', len(token_counts))
Vocabulary size: 87007
## Step 3: Encode the unique tokens as integers

try:
    encoder = tfds.features.text.TokenTextEncoder(token_counts)
except AttributeError:
    encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
example_str = 'This is an example!'
encoder.encode(example_str)
[232, 9, 270, 1123]
## Step 3-A: Define the transformation function
def encode(text_tensor, label):
    text = text_tensor.numpy()[0]
    encoded_text = encoder.encode(text)
    return encoded_text, label

## Step 3-B: Wrap the encoding function in a TensorFlow op
def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label],
                          Tout=(tf.int64, tf.int64))
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)
tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(5):
    print('Sequence length:', example[0].shape)

example
Sequence length: (24,)
Sequence length: (179,)
Sequence length: (262,)
Sequence length: (535,)
Sequence length: (130,)
(<tf.Tensor: shape=(130,), dtype=int64, numpy= array([ 579, 1296, 32, 425, 40, 763, 9267, 65, 280, 308, 6, 481, 155, 473, 2, 3, 684, 9, 781, 176, 959, 730, 3917, 67, 9905, 13, 277, 24, 35, 371, 16368, 6, 14, 17231, 29, 187, 1651, 489, 503, 480, 143, 32, 270, 5851, 2402, 13, 3592, 3443, 425, 3313, 256, 257, 1577, 117, 8, 698, 270, 564, 56, 8, 42, 7517, 2629, 820, 25, 60, 79, 343, 32, 645, 14, 528, 241, 32, 1980, 8, 56, 8, 42, 1364, 573, 5183, 43, 12, 3870, 32, 312, 642, 251, 1401, 17232, 8, 698, 257, 750, 2, 9, 76, 235, 8, 42, 235, 840, 666, 258, 17233, 419, 32, 17234, 585, 420, 840, 25, 40, 13, 14, 198, 266, 623, 173, 179, 4103, 216, 25, 616, 14185, 186, 35, 16250, 120])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
# Raises an error

BATCH_SIZE = 32
train_data = all_encoded_data.batch(BATCH_SIZE)

next(iter(train_data))

# This code raises an error:
# .batch() cannot be applied to this dataset, because its elements
# (the encoded reviews) have different lengths.
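For illustration, a minimal sketch (not part of the original notebook, and assuming the encoded ds_train from above) that catches the shape-mismatch error raised when variable-length sequences are batched directly:

# Hypothetical check: batching variable-length encoded reviews directly fails,
# because tf.data requires every element in a batch to have the same shape.
try:
    next(iter(ds_train.batch(32)))
except tf.errors.InvalidArgumentError as err:
    print('Batching failed as expected:', str(err)[:60], '...')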
## Take a small subset of the data
ds_subset = ds_train.take(8)
for example in ds_subset:
    print('Individual size:', example[0].shape)

## Divide the subset into batches
ds_batched = ds_subset.padded_batch(
    4, padded_shapes=([-1], []))

for batch in ds_batched:
    print('Batch dimension:', batch[0].shape)
Individual size: (119,)
Individual size: (688,)
Individual size: (308,)
Individual size: (204,)
Individual size: (326,)
Individual size: (240,)
Individual size: (127,)
Individual size: (453,)
Batch dimension: (4, 688)
Batch dimension: (4, 453)
## Create batched datasets
train_data = ds_train.padded_batch(
32, padded_shapes=([-1],[]))
valid_data = ds_valid.padded_batch(
32, padded_shapes=([-1],[]))
test_data = ds_test.padded_batch(
32, padded_shapes=([-1],[]))
input_dim: the number of words, i.e., the maximum integer index + 1.
output_dim: the size of the embedding vectors.
input_length: the length of the (padded) sequences.
    For example, if 'This is an example' is encoded as [0, 0, 0, 0, 0, 0, 3, 1, 8, 9],
    then input_length is 10.

When this layer is called, it receives integer values as input, and the embedding layer converts each integer into a real-valued vector of size [output_dim]:

  * If the input shape is [BATCH_SIZE], the output shape will be [BATCH_SIZE, output_dim].
  * If the input shape is [BATCH_SIZE, 10], the output shape will be [BATCH_SIZE, 10, output_dim].

Image(url='https://git.io/JLdV0', width=700)
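As a quick check of the shape rule above, a minimal sketch (the layer and inputs here are illustrative only, not part of the model built below):

# Minimal sketch: Embedding maps each integer index to an output_dim-sized
# vector, appending one trailing dimension to the input shape.
import numpy as np
import tensorflow as tf

emb = tf.keras.layers.Embedding(input_dim=100, output_dim=6)
print(emb(np.array([3, 1, 8, 9])).shape)             # (4, 6)
print(emb(np.zeros((32, 10), dtype='int32')).shape)  # (32, 10, 6)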
from tensorflow.keras.layers import Embedding
model = tf.keras.Sequential()
model.add(Embedding(input_dim=100,
output_dim=6,
input_length=20,
name='embed-layer'))
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embed-layer (Embedding) (None, 20, 6) 600 ================================================================= Total params: 600 (2.34 KB) Trainable params: 600 (2.34 KB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
Keras RNN layers:

  * tf.keras.layers.SimpleRNN(units, return_sequences=False)
  * tf.keras.layers.LSTM(..)
  * tf.keras.layers.GRU(..)
  * tf.keras.layers.Bidirectional()

Deciding on return_sequences=?:

  * Every recurrent layer that feeds into another recurrent layer must be set to return_sequences=True.
  * For the last recurrent layer, it depends on the task: use return_sequences=True when the full output sequence is needed, and return_sequences=False when only the final output is used (as in the classification models below). A shape sketch follows this list.
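A minimal sketch of the difference (layer sizes here are arbitrary and only meant to show the output shapes):

# Minimal sketch: return_sequences=True emits the hidden state at every
# time step; return_sequences=False emits only the state of the last step.
import tensorflow as tf

x = tf.random.normal((8, 15, 32))  # (batch, time steps, features)
print(tf.keras.layers.SimpleRNN(16, return_sequences=True)(x).shape)   # (8, 15, 16)
print(tf.keras.layers.SimpleRNN(16, return_sequences=False)(x).shape)  # (8, 16)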
## Build an RNN model with SimpleRNN layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense
model = Sequential()
model.add(Embedding(1000, 32))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1))
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, None, 32) 32000 simple_rnn_1 (SimpleRNN) (None, None, 32) 2080 simple_rnn_2 (SimpleRNN) (None, 32) 2080 dense (Dense) (None, 1) 33 ================================================================= Total params: 36193 (141.38 KB) Trainable params: 36193 (141.38 KB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
## Build an RNN model with LSTM layers
from tensorflow.keras.layers import LSTM
model = Sequential()
model.add(Embedding(10000, 32))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1))
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, None, 32) 320000 lstm (LSTM) (None, None, 32) 8320 lstm_1 (LSTM) (None, 32) 8320 dense_1 (Dense) (None, 1) 33 ================================================================= Total params: 336673 (1.28 MB) Trainable params: 336673 (1.28 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
## Build an RNN model with GRU layers
from tensorflow.keras.layers import GRU
model = Sequential()
model.add(Embedding(10000, 32))
model.add(GRU(32, return_sequences=True))
model.add(GRU(32))
model.add(Dense(1))
model.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, None, 32) 320000 gru (GRU) (None, None, 32) 6336 gru_1 (GRU) (None, 32) 6336 dense_2 (Dense) (None, 1) 33 ================================================================= Total params: 332705 (1.27 MB) Trainable params: 332705 (1.27 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
embedding_dim = 20
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)
## Build the model
bi_lstm_model = tf.keras.Sequential([
tf.keras.layers.Embedding(
input_dim=vocab_size,
output_dim=embedding_dim,
name='embed-layer'),
tf.keras.layers.Bidirectional(
tf.keras.layers.LSTM(64, name='lstm-layer'),
name='bidir-lstm'),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
bi_lstm_model.summary()
## Compile and train:
bi_lstm_model.compile(
optimizer=tf.keras.optimizers.Adam(1e-3),
loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
metrics=['accuracy'])
history = bi_lstm_model.fit(
train_data,
validation_data=valid_data,
epochs=10)
## Evaluate on the test data
test_results = bi_lstm_model.evaluate(test_data)
print('Test accuracy: {:.2f}%'.format(test_results[1]*100))
Model: "sequential_4" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embed-layer (Embedding) (None, None, 20) 1740180 bidir-lstm (Bidirectional) (None, 128) 43520 dense_3 (Dense) (None, 64) 8256 dense_4 (Dense) (None, 1) 65 ================================================================= Total params: 1792021 (6.84 MB) Trainable params: 1792021 (6.84 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________ Epoch 1/10 625/625 [==============================] - 145s 220ms/step - loss: 0.6439 - accuracy: 0.5989 - val_loss: 0.6189 - val_accuracy: 0.6930 Epoch 2/10 625/625 [==============================] - 44s 71ms/step - loss: 0.4293 - accuracy: 0.8034 - val_loss: 0.4391 - val_accuracy: 0.8138 Epoch 3/10 625/625 [==============================] - 45s 72ms/step - loss: 0.2220 - accuracy: 0.9161 - val_loss: 0.4540 - val_accuracy: 0.8336 Epoch 4/10 625/625 [==============================] - 45s 71ms/step - loss: 0.1189 - accuracy: 0.9625 - val_loss: 0.5119 - val_accuracy: 0.8344 Epoch 5/10 625/625 [==============================] - 45s 71ms/step - loss: 0.0674 - accuracy: 0.9798 - val_loss: 0.5506 - val_accuracy: 0.8444 Epoch 6/10 625/625 [==============================] - 46s 74ms/step - loss: 0.0503 - accuracy: 0.9854 - val_loss: 0.6473 - val_accuracy: 0.8124 Epoch 7/10 625/625 [==============================] - 45s 72ms/step - loss: 0.0418 - accuracy: 0.9875 - val_loss: 0.7181 - val_accuracy: 0.8300 Epoch 8/10 625/625 [==============================] - 44s 70ms/step - loss: 0.0475 - accuracy: 0.9854 - val_loss: 0.8262 - val_accuracy: 0.7650 Epoch 9/10 625/625 [==============================] - 45s 72ms/step - loss: 0.0304 - accuracy: 0.9910 - val_loss: 0.7237 - val_accuracy: 0.8272 Epoch 10/10 625/625 [==============================] - 44s 70ms/step - loss: 0.0156 - accuracy: 0.9958 - val_loss: 0.8161 - val_accuracy: 0.8180 782/782 [==============================] - 34s 44ms/step - loss: 0.7992 - accuracy: 0.8179 테스트 정확도: 81.79%
if not os.path.exists('models'):
    os.mkdir('models')

bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.h5')
/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py:3079: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`. saving_api.save_model(
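As the warning suggests, the native Keras format could be used instead; a minimal sketch (the .keras file name here is illustrative):

# Minimal sketch: save in the native Keras format rather than legacy HDF5,
# as recommended by the warning above.
bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.keras')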
def preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=None,
    batch_size=32):

    ## Step 1: (the dataset has already been created)

    ## Step 2: Find the unique tokens
    try:
        tokenizer = tfds.features.text.Tokenizer()
    except AttributeError:
        tokenizer = tfds.deprecated.text.Tokenizer()
    token_counts = Counter()

    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)

    print('Vocabulary size:', len(token_counts))

    ## Step 3: Encode the texts
    try:
        encoder = tfds.features.text.TokenTextEncoder(token_counts)
    except AttributeError:
        encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

    def encode(text_tensor, label):
        text = text_tensor.numpy()[0]
        encoded_text = encoder.encode(text)
        if max_seq_length is not None:
            encoded_text = encoded_text[-max_seq_length:]
        return encoded_text, label

    def encode_map_fn(text, label):
        return tf.py_function(encode, inp=[text, label],
                              Tout=(tf.int64, tf.int64))

    ds_train = ds_raw_train.map(encode_map_fn)
    ds_valid = ds_raw_valid.map(encode_map_fn)
    ds_test = ds_raw_test.map(encode_map_fn)

    ## Step 4: Create batched datasets
    train_data = ds_train.padded_batch(
        batch_size, padded_shapes=([-1], []))

    valid_data = ds_valid.padded_batch(
        batch_size, padded_shapes=([-1], []))

    test_data = ds_test.padded_batch(
        batch_size, padded_shapes=([-1], []))

    return (train_data, valid_data,
            test_data, len(token_counts))
def build_rnn_model(embedding_dim, vocab_size,
                    recurrent_type='SimpleRNN',
                    n_recurrent_units=64,
                    n_recurrent_layers=1,
                    bidirectional=True):

    tf.random.set_seed(1)

    # Build the model
    model = tf.keras.Sequential()

    model.add(
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            name='embed-layer')
    )

    for i in range(n_recurrent_layers):
        return_sequences = (i < n_recurrent_layers-1)

        if recurrent_type == 'SimpleRNN':
            recurrent_layer = SimpleRNN(
                units=n_recurrent_units,
                return_sequences=return_sequences,
                name='simprnn-layer-{}'.format(i))
        elif recurrent_type == 'LSTM':
            recurrent_layer = LSTM(
                units=n_recurrent_units,
                return_sequences=return_sequences,
                name='lstm-layer-{}'.format(i))
        elif recurrent_type == 'GRU':
            recurrent_layer = GRU(
                units=n_recurrent_units,
                return_sequences=return_sequences,
                name='gru-layer-{}'.format(i))

        if bidirectional:
            recurrent_layer = Bidirectional(
                recurrent_layer, name='bidir-'+recurrent_layer.name)

        model.add(recurrent_layer)

    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    return model
from tensorflow.keras.layers import Bidirectional
batch_size = 32
embedding_dim = 20
max_seq_length = 100
train_data, valid_data, test_data, n = preprocess_datasets(
ds_raw_train, ds_raw_valid, ds_raw_test,
max_seq_length=max_seq_length,
batch_size=batch_size
)
vocab_size = n + 2
rnn_model = build_rnn_model(
embedding_dim, vocab_size,
recurrent_type='SimpleRNN',
n_recurrent_units=64,
n_recurrent_layers=1,
bidirectional=True)
rnn_model.summary()
Vocabulary size: 58063
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embed-layer (Embedding)     (None, None, 20)          1161300
 bidir-simprnn-layer-0 (Bid  (None, 128)               10880
 irectional)
 dense_5 (Dense)             (None, 64)                8256
 dense_6 (Dense)             (None, 1)                 65
=================================================================
Total params: 1180501 (4.50 MB)
Trainable params: 1180501 (4.50 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
metrics=['accuracy'])
history = rnn_model.fit(
train_data,
validation_data=valid_data,
epochs=10)
Epoch 1/10
625/625 [==============================] - 179s 282ms/step - loss: 0.6877 - accuracy: 0.5437 - val_loss: 0.5733 - val_accuracy: 0.7176
Epoch 2/10
625/625 [==============================] - 138s 221ms/step - loss: 0.4811 - accuracy: 0.7659 - val_loss: 0.4919 - val_accuracy: 0.7892
Epoch 3/10
625/625 [==============================] - 138s 221ms/step - loss: 0.2658 - accuracy: 0.8935 - val_loss: 0.6333 - val_accuracy: 0.6632
Epoch 4/10
625/625 [==============================] - 138s 221ms/step - loss: 0.1993 - accuracy: 0.9232 - val_loss: 0.5770 - val_accuracy: 0.8108
Epoch 5/10
625/625 [==============================] - 137s 219ms/step - loss: 0.1453 - accuracy: 0.9499 - val_loss: 0.6287 - val_accuracy: 0.7758
Epoch 6/10
625/625 [==============================] - 138s 221ms/step - loss: 0.0857 - accuracy: 0.9726 - val_loss: 0.7515 - val_accuracy: 0.7644
Epoch 7/10
625/625 [==============================] - 138s 220ms/step - loss: 0.0500 - accuracy: 0.9849 - val_loss: 0.9449 - val_accuracy: 0.6744
Epoch 8/10
625/625 [==============================] - 142s 227ms/step - loss: 0.1503 - accuracy: 0.9416 - val_loss: 1.0362 - val_accuracy: 0.6554
Epoch 9/10
625/625 [==============================] - 138s 220ms/step - loss: 0.1151 - accuracy: 0.9556 - val_loss: 0.8802 - val_accuracy: 0.7124
Epoch 10/10
625/625 [==============================] - 139s 222ms/step - loss: 0.0735 - accuracy: 0.9754 - val_loss: 0.7908 - val_accuracy: 0.7338
results = rnn_model.evaluate(test_data)
782/782 [==============================] - 33s 42ms/step - loss: 0.7589 - accuracy: 0.7413
print('Test accuracy: {:.2f}%'.format(results[1]*100))
Test accuracy: 74.13%
batch_size = 32
embedding_dim = 20
max_seq_length = None
train_data, valid_data, test_data, n = preprocess_datasets(
ds_raw_train, ds_raw_valid, ds_raw_test,
max_seq_length=max_seq_length,
batch_size=batch_size
)
vocab_size = n + 2
rnn_model = build_rnn_model(
embedding_dim, vocab_size,
recurrent_type='SimpleRNN',
n_recurrent_units=64,
n_recurrent_layers=1,
bidirectional=False)
rnn_model.summary()
Vocabulary size: 87007
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embed-layer (Embedding)     (None, None, 20)          1740180
 simprnn-layer-0 (SimpleRNN  (None, 64)                5440
 )
 dense_7 (Dense)             (None, 64)                4160
 dense_8 (Dense)             (None, 1)                 65
=================================================================
Total params: 1749845 (6.68 MB)
Trainable params: 1749845 (6.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
metrics=['accuracy'])
history = rnn_model.fit(
train_data,
validation_data=valid_data,
epochs=10)
Epoch 1/10
625/625 [==============================] - 584s 932ms/step - loss: 0.7014 - accuracy: 0.5005 - val_loss: 0.6964 - val_accuracy: 0.4984
Epoch 2/10
625/625 [==============================] - 527s 843ms/step - loss: 0.6991 - accuracy: 0.4997 - val_loss: 0.6929 - val_accuracy: 0.5126
Epoch 3/10
625/625 [==============================] - 525s 841ms/step - loss: 0.6967 - accuracy: 0.4943 - val_loss: 0.6929 - val_accuracy: 0.5126
Epoch 4/10
625/625 [==============================] - 521s 833ms/step - loss: 0.6956 - accuracy: 0.4969 - val_loss: 0.6930 - val_accuracy: 0.5128
Epoch 5/10
625/625 [==============================] - 523s 837ms/step - loss: 0.6949 - accuracy: 0.4981 - val_loss: 0.6929 - val_accuracy: 0.5136
Epoch 6/10
625/625 [==============================] - 524s 839ms/step - loss: 0.6944 - accuracy: 0.5021 - val_loss: 0.6927 - val_accuracy: 0.5138
Epoch 7/10
625/625 [==============================] - 521s 834ms/step - loss: 0.6931 - accuracy: 0.5062 - val_loss: 0.6922 - val_accuracy: 0.5136
Epoch 8/10
625/625 [==============================] - 526s 841ms/step - loss: 0.6913 - accuracy: 0.5061 - val_loss: 0.6918 - val_accuracy: 0.5134
Epoch 9/10
625/625 [==============================] - 532s 851ms/step - loss: 0.6862 - accuracy: 0.5076 - val_loss: 0.6953 - val_accuracy: 0.5144
Epoch 10/10
625/625 [==============================] - 539s 862ms/step - loss: 0.6813 - accuracy: 0.5101 - val_loss: 0.7008 - val_accuracy: 0.5090
imdb_bldr = tfds.builder('imdb_reviews')
print(imdb_bldr.info)
imdb_bldr.download_and_prepare()
datasets = imdb_bldr.as_dataset(shuffle_files=False)
datasets.keys()
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir=PosixGPath('/tmp/tmp0ldve2zatfds'),
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=Unknown size,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitInfo num_examples=25000, num_shards=1>,
        'unsupervised': <SplitInfo num_examples=50000, num_shards=1>,
    },
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word Vectors for Sentiment Analysis},
      booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
      month     = {June},
      year      = {2011},
      address   = {Portland, Oregon, USA},
      publisher = {Association for Computational Linguistics},
      pages     = {142--150},
      url       = {http://www.aclweb.org/anthology/P11-1015}
    }""",
)
Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...
Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
dict_keys([Split('train'), Split('test'), Split('unsupervised')])
imdb_train = datasets['train']
imdb_test = datasets['test']
tfds.deprecated.text.Tokenizer: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/Tokenizer
tfds.deprecated.text.TokenTextEncoder: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/TokenTextEncoder

vocab_set = {'a', 'b', 'c', 'd'}
encoder = tfds.deprecated.text.TokenTextEncoder(vocab_set)
print(encoder)
print(encoder.encode(b'a b c d, , : .'))
print(encoder.encode(b'a b c d e f g h i z'))
<TokenTextEncoder vocab_size=6>
[1, 4, 2, 3]
[1, 4, 2, 3, 5, 5, 5, 5, 5, 5]
TOP_K = 200
MAX_LEN = 10
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])
sequences = tokenizer.texts_to_sequences(['this is an example', 'je suis en forme '])
print(sequences)
tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)
[[1, 2, 3, 4], [5, 6, 7, 8]]
array([[0, 0, 0, 0, 0, 0, 1, 2, 3, 4],
       [0, 0, 0, 0, 0, 0, 5, 6, 7, 8]], dtype=int32)
TOP_K = 20000
MAX_LEN = 500
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(
[example['text'].numpy().decode('utf-8')
for example in imdb_train])
x_train = tokenizer.texts_to_sequences(
[example['text'].numpy().decode('utf-8')
for example in imdb_train])
print(len(x_train))
x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
x_train, maxlen=MAX_LEN)
print(x_train_padded.shape)
25000
(25000, 500)
from tensorflow.keras.layers import Embedding
tf.random.set_seed(1)
embed = Embedding(input_dim=100, output_dim=4)
inp_arr = np.array([1, 98, 5, 6, 67, 45])
tf.print(embed(inp_arr))
tf.print(embed(inp_arr).shape)
tf.print(embed(np.array([1])))
[[0.0285978206 -0.0122499242 -0.0394328944 -0.0145259611]
 [-0.016297698 0.0204829238 0.0123164877 0.0303565152]
 [-0.0233924985 -0.0107878074 -0.00653867796 0.0383420847]
 [0.0457952954 -0.0139975436 0.000184893608 0.0398626]
 [-0.00592473894 0.0485283621 0.0443233289 -0.005304683]
 [-0.0153851733 0.0370714702 -0.0447837822 0.0241911896]]
TensorShape([6, 4])
[[0.0285978206 -0.0122499242 -0.0394328944 -0.0145259611]]