Previous RTE corpora: [Lai and Hockenmaier, 2014; Jimenez et al., 2014; Zhao et al., 2014; Beltagy et al., 2015; etc.]
Stanford Natural Language Inference (SNLI) corpus: [Bowman et al., 2015]
Same LSTM encodes premise and hypothesis
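A minimal sketch of what sharing one LSTM across both sentences looks like, using tf.nn.dynamic_rnn (introduced later in this section) and explicit variable reuse; sizes and names are illustrative assumptions:

import numpy as np
import tensorflow as tf

with tf.Graph().as_default():
    input_size = 2; hidden_size = 3; batch_size = 5; max_length = 7
    premise_seq = tf.placeholder(tf.float32, [None, None, input_size], "premise_seq")
    hypothesis_seq = tf.placeholder(tf.float32, [None, None, input_size], "hypothesis_seq")
    cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
    with tf.variable_scope("encoder"):
        _, premise_state = tf.nn.dynamic_rnn(cell, premise_seq, dtype=tf.float32)
    with tf.variable_scope("encoder", reuse=True):  # same weights for both sentences
        _, hypothesis_state = tf.nn.dynamic_rnn(cell, hypothesis_seq, dtype=tf.float32)
    # premise_state.h and hypothesis_state.h are the two sentence vectors
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        h_p, h_h = sess.run([premise_state.h, hypothesis_state.h], {
            premise_seq: np.random.randn(batch_size, max_length, input_size),
            hypothesis_seq: np.random.randn(batch_size, max_length, input_size)})
        print(h_p.shape, h_h.shape)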
You can’t cram the meaning of a whole
%&!$# sentence into a single $&!#* vector!
-- Raymond J. Mooney
[Bowman et al., 2015]
import numpy as np
import tensorflow as tf

with tf.Graph().as_default():
    def mlp(input_vector, layers=3, hidden_dim=200, output_dim=3):
        # assumes len(input_vector) == hidden_dim
        # [input_size] => [input_size x 1] (column vector)
        tmp = tf.expand_dims(input_vector, 1)
        for i in range(layers):
            W = tf.get_variable(
                "W_" + str(i), [hidden_dim, hidden_dim])
            # tanh(W·x)
            tmp = tf.tanh(tf.matmul(W, tmp))
        W = tf.get_variable(
            "W_" + str(layers), [output_dim, hidden_dim])
        # [output_dim x 1] => [output_dim]
        return tf.squeeze(tf.matmul(W, tmp))

    premise = tf.placeholder(tf.float32, [None], "premise")
    hypothesis = tf.placeholder(tf.float32, [None], "hypothesis")
    output = tf.nn.softmax(mlp(tf.concat([premise, hypothesis], axis=0)))

    # in practice: outputs of an LSTM
    v1 = np.random.rand(100); v2 = np.random.rand(100)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(output, {premise: v1, hypothesis: v2}))
[ 0.28892419 0.30673885 0.40433696]
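For training, one would typically not apply tf.nn.softmax and take its log, but use the fused, numerically more stable cross-entropy op on the unnormalized scores. A minimal sketch; the batched logits and labels placeholders are assumptions, not part of the original:

import tensorflow as tf

with tf.Graph().as_default():
    # stand-ins for a batch of mlp(...) outputs before the softmax
    logits = tf.placeholder(tf.float32, [None, 3], "logits")
    labels = tf.placeholder(tf.float32, [None, 3], "labels")  # one-hot gold labels
    # fused softmax + cross-entropy, averaged over the batch
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))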
Model | k | θW+M | θM | Train | Dev | Test |
---|---|---|---|---|---|---|
LSTM [Bowman et al.] | 100 | $\approx$10M | 221k | 84.4 | - | 77.6 |
Classifier [Bowman et al.] | - | - | - | 99.7 | - | 78.2 |
Conditional Encoding | 159 | 3.9M | 252k | 84.4 | 83.0 | 81.4 |
Attention | 100 | 3.9M | 242k | 85.4 | 83.2 | 82.3 |
Word-by-word Attention | 100 | 3.9M | 252k | 85.3 | 83.7 | 83.5 |

(k: hidden dimension; θW+M / θM: number of parameters with / without word embeddings; Train/Dev/Test: accuracy in %)
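Conditional encoding, the first improvement in the table, drops the single shared encoder: one LSTM reads the premise, and its final state initializes a second LSTM (with separate weights) that reads the hypothesis. A minimal sketch, assuming already-embedded input sequences; names and sizes are illustrative:

import numpy as np
import tensorflow as tf

with tf.Graph().as_default():
    input_size = 2; hidden_size = 3; batch_size = 5; max_length = 7
    premise_seq = tf.placeholder(tf.float32, [None, None, input_size], "premise_seq")
    hypothesis_seq = tf.placeholder(tf.float32, [None, None, input_size], "hypothesis_seq")
    with tf.variable_scope("premise"):
        cell_p = tf.nn.rnn_cell.LSTMCell(hidden_size)
        _, premise_state = tf.nn.dynamic_rnn(cell_p, premise_seq, dtype=tf.float32)
    with tf.variable_scope("hypothesis"):
        cell_h = tf.nn.rnn_cell.LSTMCell(hidden_size)
        # the hypothesis LSTM starts from the premise's final state
        _, hypothesis_state = tf.nn.dynamic_rnn(
            cell_h, hypothesis_seq, initial_state=premise_state)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(hypothesis_state.h, {
            premise_seq: np.random.randn(batch_size, max_length, input_size),
            hypothesis_seq: np.random.randn(batch_size, max_length, input_size)}))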
with tf.Graph().as_default():
    x = tf.get_variable("param", [])
    loss = -tf.log(tf.sigmoid(x))  # dummy example
    optim = tf.train.AdamOptimizer(learning_rate=0.1)
    min_op = optim.minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(x.assign(1.5))
        for i in range(10):
            print(sess.run([min_op, loss], {})[1])
0.201413
0.183901
0.167832
0.153134
0.139727
0.127529
0.116455
0.106421
0.0973447
0.0891449
with tf.Graph().as_default():
    x = tf.get_variable("param", [])
    loss = -tf.log(tf.sigmoid(x))  # dummy example
    optim = tf.train.AdamOptimizer(learning_rate=0.1)
    gradients = optim.compute_gradients(loss)
    capped_gradients = \
        [(tf.clip_by_value(grad, -0.1, 0.1), var) for grad, var in gradients]
    min_op = optim.apply_gradients(capped_gradients)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer()); sess.run(x.assign(1.5))
        for i in range(100):
            if i % 10 == 0:
                grads = sess.run([min_op, gradients, capped_gradients], {})[1:]
                print(" => ".join([str(grad[0][0]) for grad in grads]))
-0.182426 => -0.1
-0.167982 => -0.1
-0.154465 => -0.1
-0.141851 => -0.1
-0.130109 => -0.1
-0.119203 => -0.1
-0.109097 => -0.1
-0.0997508 => -0.0997508
-0.0911243 => -0.0911243
-0.0832129 => -0.0832129
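For RNNs it is common to clip by the joint norm of all gradients rather than clipping each value separately; a minimal sketch with tf.clip_by_global_norm (the threshold 5.0 is an arbitrary assumption):

import tensorflow as tf

with tf.Graph().as_default():
    x = tf.get_variable("param", [])
    loss = -tf.log(tf.sigmoid(x))  # same dummy example as above
    optim = tf.train.AdamOptimizer(learning_rate=0.1)
    gradients, variables = zip(*optim.compute_gradients(loss))
    # rescale all gradients together so their global L2 norm is at most 5.0
    clipped, global_norm = tf.clip_by_global_norm(gradients, 5.0)
    min_op = optim.apply_gradients(list(zip(clipped, variables)))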
For RNNs: tf.nn.rnn_cell.DropoutWrapper
with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None], "input")
    # keep each value with probability 0.7; kept values are scaled by 1/0.7
    x_dropped = tf.nn.dropout(x, 0.7)
    with tf.Session() as sess:
        print(sess.run(x_dropped, {x: np.random.rand(6)}))
[ 0. 0.40834624 1.31100929 0. 1.08233535 0. ]
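The tf.nn.rnn_cell.DropoutWrapper mentioned above applies the same idea to an RNN cell's inputs and/or outputs; a minimal sketch:

import tensorflow as tf

with tf.Graph().as_default():
    cell = tf.nn.rnn_cell.LSTMCell(3)
    # dropout on the cell's inputs and outputs, keep probability 0.7
    cell = tf.nn.rnn_cell.DropoutWrapper(
        cell, input_keep_prob=0.7, output_keep_prob=0.7)
    # use `cell` with tf.nn.dynamic_rnn as usual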
For hyperparameter optimization, random search often outperforms grid search [Bergstra and Bengio, 2012].
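A minimal sketch of such a random search; the parameter ranges and the train_and_evaluate routine are illustrative assumptions:

import numpy as np

for trial in range(10):
    # sample each hyperparameter at random instead of walking a fixed grid
    config = {
        "learning_rate": 10 ** np.random.uniform(-4, -1),  # log-uniform
        "keep_prob": np.random.uniform(0.5, 1.0),
        "hidden_dim": int(np.random.choice([64, 128, 256])),
    }
    print(config)
    # train_and_evaluate(config)  # hypothetical training routine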
with tf.Graph().as_default():
    vocab_size = 4; embedding_size = 3
    W = tf.get_variable("W", [vocab_size, embedding_size], trainable=False)
    W = W.assign(np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]))
    seq = tf.placeholder(tf.int64, [None], "seq")
    seq_embedded = tf.nn.embedding_lookup(W, seq)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(seq_embedded, {seq: [0, 3, 2, 3, 1]})[2])
[ 6. 7. 8.]
with tf.Graph().as_default():
    vocab_size = 4; embedding_size = 3; input_size = 2
    W = tf.get_variable("W", [vocab_size, embedding_size], trainable=False)
    W = W.assign(np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]))
    seq = tf.placeholder(tf.int64, [None], "seq")
    # project the fixed embeddings to input_size with a trainable linear layer
    seq_embedded = tf.contrib.layers.linear(
        tf.nn.embedding_lookup(W, seq), input_size)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(seq_embedded, {seq: [0, 3, 2, 3, 1]})[2])
[-13.46509552 2.99458957]
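A common pattern for pretrained word vectors (an illustrative sketch, not from the original): feed them through a placeholder once at initialization time, so the large matrix is not serialized as a constant in the graph definition.

import numpy as np
import tensorflow as tf

with tf.Graph().as_default():
    vocab_size = 4; embedding_size = 3
    pretrained = tf.placeholder(tf.float32, [vocab_size, embedding_size])
    W = tf.get_variable("W", [vocab_size, embedding_size], trainable=False)
    init_W = W.assign(pretrained)  # run once with the loaded vectors
    seq = tf.placeholder(tf.int64, [None], "seq")
    seq_embedded = tf.nn.embedding_lookup(W, seq)
    with tf.Session() as sess:
        # e.g. vectors loaded from word2vec/GloVe
        sess.run(init_W, {pretrained: np.arange(12, dtype=np.float32).reshape(4, 3)})
        print(sess.run(seq_embedded, {seq: [0, 3, 2, 3, 1]})[2])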
dynamic_rnn
with tf.Graph().as_default():
    input_size = 2; output_size = 3; batch_size = 5; max_length = 7
    cell = tf.nn.rnn_cell.LSTMCell(output_size)
    input_embedded = tf.placeholder(tf.float32, [None, None, input_size], "input_embedded")
    input_length = tf.placeholder(tf.int64, [None], "input_length")
    outputs, states = tf.nn.dynamic_rnn(
        cell, input_embedded, sequence_length=input_length, dtype=tf.float32)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(states, {
            input_embedded: np.random.randn(batch_size, max_length, input_size),
            input_length: np.random.randint(1, max_length, batch_size)
        }).h)
[[-0.02504479  0.01778306  0.06259364]
 [-0.39973924  0.15913689 -0.24309666]
 [ 0.1272437  -0.11015443  0.27061945]
 [-0.16756612  0.08985849 -0.09469282]
 [-0.31299666  0.21844    -0.0656506 ]]
tf.nn.bidirectional_dynamic_rnn
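tf.nn.bidirectional_dynamic_rnn runs a forward and a backward cell over the same sequence and returns their outputs and final states as (forward, backward) pairs; a minimal sketch in the style of the example above (variable names are illustrative):

import numpy as np
import tensorflow as tf

with tf.Graph().as_default():
    input_size = 2; output_size = 3; batch_size = 5; max_length = 7
    cell_fw = tf.nn.rnn_cell.LSTMCell(output_size)
    cell_bw = tf.nn.rnn_cell.LSTMCell(output_size)
    input_embedded = tf.placeholder(tf.float32, [None, None, input_size], "input_embedded")
    input_length = tf.placeholder(tf.int64, [None], "input_length")
    # outputs and final states come as (forward, backward) pairs
    (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, input_embedded, sequence_length=input_length, dtype=tf.float32)
    # concatenate both directions: [batch x time x 2*output_size]
    outputs = tf.concat([out_fw, out_bw], axis=2)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs, {
            input_embedded: np.random.randn(batch_size, max_length, input_size),
            input_length: np.random.randint(1, max_length, batch_size)}).shape)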