3. Dynamic CNN for Modeling Sentences (Kalchbrenner et al., 2014)

  • Simple CNN architectures for sentence modeling, such as the one used in Kim (2014), can struggle to capture long-distance dependencies
  • To address this, Kalchbrenner et al. proposed the Dynamic Convolutional Neural Network (DCNN) in their 2014 paper (Kalchbrenner, N., Grefenstette, E., & Blunsom, P. (2014). A convolutional neural network for modelling sentences. arXiv preprint arXiv:1404.2188.)
    • "The network uses Dynamic k-Max Pooling, a global pooling operation over linear sequences. The network handles input sentences of varying length and induces a feature graph over the sentence that is capable of explicitly capturing short and long-range relations." (A sketch of the dynamic pooling schedule follows below; the implementations in this notebook keep k fixed.)

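The "dynamic" in DCNN refers to the pooling parameter: in the paper, the k used at an intermediate convolutional layer depends on the sentence length and on the layer's depth, while the topmost layer pools with a fixed k_top. A minimal sketch of that schedule, assuming the formula k_l = max(k_top, ceil((L - l) / L * s)) from the paper (the function name dynamic_k is only illustrative; the models in this notebook use a fixed k):

import math

def dynamic_k(layer_index, num_conv_layers, sentence_length, k_top = 3):
    # k_l = max(k_top, ceil((L - l) / L * s)), where l is the index of the current
    # convolutional layer, L the total number of convolutional layers,
    # and s the length of the input sentence
    return max(k_top, math.ceil((num_conv_layers - layer_index) / num_conv_layers * sentence_length))

# e.g. with L = 3 convolutional layers, an 18-word sentence and k_top = 3:
# dynamic_k(1, 3, 18) -> 12, dynamic_k(2, 3, 18) -> 6, and the top layer pools with k_top = 3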

In [35]:
from keras.preprocessing import sequence
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.datasets import imdb

Defining the k-max pooling layer

In [1]:
from keras.engine import Layer, InputSpec
import tensorflow as tf

class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[1] * self.k))

    def call(self, inputs):
        # inputs are already permuted to (batch, channels, time) outside the layer,
        # so top_k is taken along the last (time) dimension;
        # it returns two tensors [values, indices], of which we keep the values
        top_k = tf.nn.top_k(inputs, k=self.k, sorted=True, name=None)[0]
        # flatten to (batch, channels * k) so the output matches compute_output_shape
        return tf.reshape(top_k, (tf.shape(inputs)[0], -1))
Using TensorFlow backend.
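As a quick sanity check (not part of the original notebook, and relying on the Input/Model imports from the cell above), the layer can be wrapped in a tiny model to confirm that its output shape agrees with compute_output_shape:

# hypothetical shape check: a (batch, channels, time) input of shape (None, 4, 10)
# pooled with k = 3 should flatten to (None, 4 * 3) = (None, 12)
check_input = Input(shape = (4, 10))
check_output = KMaxPooling(k = 3)(check_input)
print(Model(check_input, check_output).output_shape)   # expected: (None, 12)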

Load Dataset

  • Load the IMDB movie review dataset bundled with Keras
In [3]:
num_words = 5000
max_len = 300
embedding_dim = 50
In [26]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(25000, 300) (25000, 300) (25000,) (25000,)
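As an optional aside (not in the original notebook), the integer sequences can be mapped back to words with imdb.get_word_index(); keras.datasets.imdb reserves indices 0-2 for padding, start and out-of-vocabulary tokens, hence the offset of 3:

# decode the first training review for inspection; '?' marks reserved or out-of-vocabulary indices
word_index = imdb.get_word_index()
index_word = {idx + 3: word for word, idx in word_index.items()}
print(' '.join(index_word.get(i, '?') for i in X_train[0] if i != 0))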

1. Basic Dynamic CNN

  • Basic DCNN structure with only one feature map and a single convolution/pooling layer (the wide-convolution arithmetic is sketched below)
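A note on the "wide" convolution used below: padding the 300-step input with ZeroPadding1D(29) on both sides and convolving with a kernel of width 30 gives n + m - 1 = 329 output steps, as in the paper's wide convolution. The arithmetic (illustrative only):

n, m = max_len, 30            # sequence length and kernel width
pad = m - 1                   # ZeroPadding1D(29) pads 29 timesteps on each side
print(n + 2 * pad - m + 1)    # 300 + 58 - 30 + 1 = 329, i.e. n + m - 1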
In [44]:
def basic_dynamic_cnn(k = 5):
    model = Sequential()
    # Embedding each word
    model.add(Embedding(num_words, embedding_dim, input_length = max_len))
    # Wide convolution
    model.add(ZeroPadding1D(29))
    model.add(Conv1D(embedding_dim, 30, activation = 'relu'))
    # k-max pooling
    model.add(Permute((2, 1)))
    model.add(KMaxPooling(k))
    model.add(Reshape((k, -1)))
    model.add(Flatten())
    model.add(Dense(1, activation = 'sigmoid'))
    
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model
In [42]:
basic_dynamic_cnn_model = basic_dynamic_cnn()
basic_dynamic_cnn_model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_13 (Embedding)     (None, 300, 50)           250000    
_________________________________________________________________
zero_padding1d_13 (ZeroPaddi (None, 358, 50)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 329, 50)           75050     
_________________________________________________________________
permute_7 (Permute)          (None, 50, 329)           0         
_________________________________________________________________
k_max_pooling_9 (KMaxPooling (None, 250)               0         
_________________________________________________________________
reshape_7 (Reshape)          (None, 5, 50)             0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 251       
=================================================================
Total params: 325,301
Trainable params: 325,301
Non-trainable params: 0
_________________________________________________________________
In [ ]:
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = basic_dynamic_cnn_model.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_split = 0.2, batch_size = 200)
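Optionally, the returned history object can be used to plot the training curves (assuming matplotlib is available; with this version of Keras the accuracy keys are 'acc' and 'val_acc'):

import matplotlib.pyplot as plt

plt.plot(history.history['acc'], label = 'train accuracy')
plt.plot(history.history['val_acc'], label = 'validation accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()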
In [48]:
basic_dynamic_cnn_best_model = basic_dynamic_cnn()
basic_dynamic_cnn_best_model.load_weights('best_model.hdf5')
basic_dynamic_cnn_best_model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
results = basic_dynamic_cnn_best_model.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
24832/25000 [============================>.] - ETA: 0sTest accuracy:  0.87084
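As a small usage example (illustrative), the reloaded model can also score individual reviews; values close to 1 indicate positive sentiment:

# predict sentiment probabilities for the first few test reviews
preds = basic_dynamic_cnn_best_model.predict(X_test[:5])
print(preds.ravel())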

2. DCNN with two convolution (pooling) layers

  • Perform wide convolution and k-max pooling twice (the resulting sequence lengths are traced below)
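Sequence-length bookkeeping for the two stages, mirroring the model summary further below (a sketch under the default arguments k1 = 20, k2 = 10, ksize1 = 20, ksize2 = 30):

s0 = max_len                        # 300 input timesteps
s1 = s0 + 2 * (20 - 1) - 20 + 1     # first wide convolution (kernel 20) -> 319
s2 = 20                             # first k-max pooling keeps k1 = 20 values per channel
s3 = s2 + 2 * (30 - 1) - 30 + 1     # second wide convolution (kernel 30) -> 49
s4 = 10                             # second k-max pooling keeps k2 = 10 values per channel
print(s1, s3)                       # 319 49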
In [79]:
# separate k and kernel size for each convolution/pooling stage
def two_conv_dynamic_cnn(k1 = 20, k2 = 10, ksize1 = 20, ksize2 = 30):
    inputs = Input(shape = (X_train.shape[-1],))
    embed = Embedding(num_words, embedding_dim, input_length = max_len)(inputs)
    padded = ZeroPadding1D(ksize1 - 1)(embed)
    conv1 = Conv1D(embedding_dim, ksize1, activation = 'relu')(padded)
    permuted = Permute((2,1))(conv1)
    kmaxpool1 = KMaxPooling(k1)(permuted)
    kmaxpool1 = Reshape((k1, -1))(kmaxpool1)
    padded = ZeroPadding1D(ksize2 - 1)(kmaxpool1)
    conv2 = Conv1D(embedding_dim, ksize2, activation = 'relu')(padded)
    permuted = Permute((2,1))(conv2)
    kmaxpool2 = KMaxPooling(k2)(permuted)
    kmaxpool2 = Reshape((k2, -1))(kmaxpool2)
    flattened = Flatten()(kmaxpool2)
    outputs = Dense(1, activation = 'sigmoid')(flattened)
    
    model = Model(inputs = inputs, outputs = outputs)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model
In [76]:
two_conv_dynamic_cnn_model = two_conv_dynamic_cnn()
two_conv_dynamic_cnn_model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_13 (InputLayer)        (None, 300)               0         
_________________________________________________________________
embedding_28 (Embedding)     (None, 300, 50)           250000    
_________________________________________________________________
zero_padding1d_30 (ZeroPaddi (None, 338, 50)           0         
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 319, 50)           50050     
_________________________________________________________________
permute_21 (Permute)         (None, 50, 319)           0         
_________________________________________________________________
k_max_pooling_23 (KMaxPoolin (None, 1000)              0         
_________________________________________________________________
reshape_21 (Reshape)         (None, 20, 50)            0         
_________________________________________________________________
zero_padding1d_31 (ZeroPaddi (None, 78, 50)            0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 49, 50)            75050     
_________________________________________________________________
permute_22 (Permute)         (None, 50, 49)            0         
_________________________________________________________________
k_max_pooling_24 (KMaxPoolin (None, 500)               0         
_________________________________________________________________
reshape_22 (Reshape)         (None, 10, 50)            0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 500)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 501       
=================================================================
Total params: 375,601
Trainable params: 375,601
Non-trainable params: 0
_________________________________________________________________
In [ ]:
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = two_conv_dynamic_cnn_model.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_split = 0.2, batch_size = 200)
In [80]:
two_conv_dynamic_cnn_best_model = two_conv_dynamic_cnn()
two_conv_dynamic_cnn_best_model.load_weights('best_model.hdf5')
two_conv_dynamic_cnn_best_model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
results = two_conv_dynamic_cnn_best_model.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
24960/25000 [============================>.] - ETA: 0sTest accuracy:  0.8832

3. DCNN with two feature maps

  • DCNN with two feature maps (each going through two convolution/pooling stages), concatenated at the end of the model
  • This implementation is the closest to the original model described in the paper (a note on the paper's folding operation follows below)
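One ingredient of the paper that is still omitted here (besides the dynamic pooling schedule) is folding, which sums every pair of adjacent embedding rows in a feature map before k-max pooling. A minimal Lambda-based sketch, assuming it would be applied to a (batch, time, channels) tensor before the Permute; it is not used in the model below:

# folding: sum adjacent embedding dimensions, halving the channel axis
folding = Lambda(lambda x: x[:, :, ::2] + x[:, :, 1::2])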
In [84]:
def two_feature_map_dynamic_cnn(k1 = 20, k2 = 10, ksize1 = 20, ksize2 = 30):
    inputs = Input(shape = (X_train.shape[-1],))
    embed = Embedding(num_words, embedding_dim, input_length = max_len)(inputs)
    conv_results = []
    # two feature maps using for loop
    for i in range(2):
        padded = ZeroPadding1D(ksize1 - 1)(embed)
        conv1 = Conv1D(embedding_dim, ksize1, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv1)
        kmaxpool1 = KMaxPooling(k1)(permuted)
        kmaxpool1 = Reshape((k1, -1))(kmaxpool1)
        padded = ZeroPadding1D(ksize2 - 1)(kmaxpool1)
        conv2 = Conv1D(embedding_dim, ksize2, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv2)
        kmaxpool2 = KMaxPooling(k2)(permuted)
        kmaxpool2 = Reshape((k2, -1))(kmaxpool2)
        flattened = Flatten()(kmaxpool2)
        conv_results.append(flattened)
    conv_result = concatenate(conv_results)
    outputs = Dense(1, activation = 'sigmoid')(conv_result)
    
    model = Model(inputs = inputs, outputs = outputs)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model
In [82]:
two_feature_map_dynamic_cnn_model = two_feature_map_dynamic_cnn()
two_feature_map_dynamic_cnn_model.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_15 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 300, 50)      250000      input_15[0][0]                   
__________________________________________________________________________________________________
zero_padding1d_34 (ZeroPadding1 (None, 338, 50)      0           embedding_30[0][0]               
__________________________________________________________________________________________________
zero_padding1d_36 (ZeroPadding1 (None, 338, 50)      0           embedding_30[0][0]               
__________________________________________________________________________________________________
conv1d_33 (Conv1D)              (None, 319, 50)      50050       zero_padding1d_34[0][0]          
__________________________________________________________________________________________________
conv1d_35 (Conv1D)              (None, 319, 50)      50050       zero_padding1d_36[0][0]          
__________________________________________________________________________________________________
permute_25 (Permute)            (None, 50, 319)      0           conv1d_33[0][0]                  
__________________________________________________________________________________________________
permute_27 (Permute)            (None, 50, 319)      0           conv1d_35[0][0]                  
__________________________________________________________________________________________________
k_max_pooling_27 (KMaxPooling)  (None, 1000)         0           permute_25[0][0]                 
__________________________________________________________________________________________________
k_max_pooling_29 (KMaxPooling)  (None, 1000)         0           permute_27[0][0]                 
__________________________________________________________________________________________________
reshape_25 (Reshape)            (None, 20, 50)       0           k_max_pooling_27[0][0]           
__________________________________________________________________________________________________
reshape_27 (Reshape)            (None, 20, 50)       0           k_max_pooling_29[0][0]           
__________________________________________________________________________________________________
zero_padding1d_35 (ZeroPadding1 (None, 78, 50)       0           reshape_25[0][0]                 
__________________________________________________________________________________________________
zero_padding1d_37 (ZeroPadding1 (None, 78, 50)       0           reshape_27[0][0]                 
__________________________________________________________________________________________________
conv1d_34 (Conv1D)              (None, 49, 50)       75050       zero_padding1d_35[0][0]          
__________________________________________________________________________________________________
conv1d_36 (Conv1D)              (None, 49, 50)       75050       zero_padding1d_37[0][0]          
__________________________________________________________________________________________________
permute_26 (Permute)            (None, 50, 49)       0           conv1d_34[0][0]                  
__________________________________________________________________________________________________
permute_28 (Permute)            (None, 50, 49)       0           conv1d_36[0][0]                  
__________________________________________________________________________________________________
k_max_pooling_28 (KMaxPooling)  (None, 500)          0           permute_26[0][0]                 
__________________________________________________________________________________________________
k_max_pooling_30 (KMaxPooling)  (None, 500)          0           permute_28[0][0]                 
__________________________________________________________________________________________________
reshape_26 (Reshape)            (None, 10, 50)       0           k_max_pooling_28[0][0]           
__________________________________________________________________________________________________
reshape_28 (Reshape)            (None, 10, 50)       0           k_max_pooling_30[0][0]           
__________________________________________________________________________________________________
flatten_13 (Flatten)            (None, 500)          0           reshape_26[0][0]                 
__________________________________________________________________________________________________
flatten_14 (Flatten)            (None, 500)          0           reshape_28[0][0]                 
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 1000)         0           flatten_13[0][0]                 
                                                                 flatten_14[0][0]                 
__________________________________________________________________________________________________
dense_13 (Dense)                (None, 1)            1001        concatenate_1[0][0]              
==================================================================================================
Total params: 501,201
Trainable params: 501,201
Non-trainable params: 0
__________________________________________________________________________________________________
In [ ]:
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = two_feature_map_dynamic_cnn_model.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_split = 0.2, batch_size = 200)
In [85]:
two_feature_map_dynamic_cnn_best_model = two_feature_map_dynamic_cnn()
two_feature_map_dynamic_cnn_best_model.load_weights('best_model.hdf5')
two_feature_map_dynamic_cnn_best_model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
results = two_feature_map_dynamic_cnn_best_model.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
24960/25000 [============================>.] - ETA: 0sTest accuracy:  0.88092