In [ ]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_matching/ant/main')
Mounted at /content/gdrive
In [ ]:
!pip install transformers
Collecting transformers
  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
     |████████████████████████████████| 1.3MB 9.2MB/s 
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
Collecting sentencepiece!=0.1.92
  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
     |████████████████████████████████| 1.1MB 49.5MB/s 
Collecting tokenizers==0.9.2
  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
     |████████████████████████████████| 2.9MB 49.9MB/s 
Collecting sacremoses
  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
     |████████████████████████████████| 890kB 51.2MB/s 
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)
Requirement already satisfied: protobuf in /usr/local/lib/python3.6/dist-packages (from transformers) (3.12.4)
Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.17.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf->transformers) (50.3.0)
Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... done
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=fae6552b20e3b306378d88a6dcbf802a46322ecf8a93ef6a8c8b19d31e489256
  Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
Successfully built sacremoses
Installing collected packages: sentencepiece, tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.43 sentencepiece-0.1.94 tokenizers-0.9.2 transformers-3.4.0
In [ ]:
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics import classification_report

import os
import json
import time
import logging
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())
TensorFlow Version 2.3.0
WARNING:tensorflow:From <ipython-input-3-2c65ccb3f96b>:14: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True
In [ ]:
params = {
  'pretrain_path': 'bert-base-chinese',
  'train_path': '../data/train.json',
  'test_path': '../data/dev.json',
  'batch_size': 32,
  'max_len': 128,
  'buffer_size': 34334,
  'init_lr': 1e-5,
  'max_lr': 4e-5,
  'n_epochs': 12,
  'num_patience': 5,
}

tokenizer = BertTokenizer.from_pretrained(params['pretrain_path'],
                                          lowercase = True,
                                          add_special_tokens = True)

In [ ]:
# stream data from text files
def data_generator(f_path, params):
  with open(f_path) as f:
    print('Reading', f_path)
    for line in f:
      line = json.loads(line.rstrip())
      text1, text2, label = line['sentence1'], line['sentence2'], line['label']
      if len(text1) + len(text2) + 3 > params['max_len']:
        _max_len = (params['max_len'] - 3) // 2
        text1 = text1[:_max_len]
        text2 = text2[:_max_len]
      text1 = list(text1)
      text2 = list(text2)
      text = ['[CLS]'] + text1 + ['[SEP]'] + text2 + ['[SEP]']
      seg = [0] + [0] * len(text1) + [0] + [1] * len(text2) + [1]
      text = tokenizer.convert_tokens_to_ids(text)
      yield (text, seg), int(label)


def dataset(is_training, params):
  _shapes = (([None], [None]), ())
  _types = ((tf.int32, tf.int32), tf.int32)
  _pads = ((0, 0), -1)
  
  if is_training:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['train_path'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.shuffle(params['buffer_size'])
    ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  else:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['test_path'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.padded_batch(params['batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  
  return ds
In [ ]:
# input stream ids check
(text, seg), _ = next(data_generator(params['train_path'], params))
print(text)
print(seg)
Reading ../data/train.json
[101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
In [ ]:
class BertFinetune(tf.keras.Model):
  def __init__(self, params):
    super(BertFinetune, self).__init__()
    self.bert = TFBertModel.from_pretrained(params['pretrain_path'],
                                            trainable = True)
    self.drop_1 = tf.keras.layers.Dropout(.1)
    self.fc = tf.keras.layers.Dense(300, tf.nn.swish, name='down_stream/fc')
    self.drop_2 = tf.keras.layers.Dropout(.1)
    self.out = tf.keras.layers.Dense(1, name='down_stream/out')

  def call(self, bert_inputs, training):
    bert_inputs = [tf.cast(inp, tf.int32) for inp in bert_inputs]
    x = self.bert(bert_inputs, training=training)
    x = x[1]
    x = self.drop_1(x, training=training)
    x = self.fc(x)
    x = self.drop_2(x, training=training)
    x = self.out(x)
    x = tf.squeeze(x, 1)
    return x
In [ ]:
model = BertFinetune(params)
model.build([[None, None], [None, None], [None, None]])
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])


Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
[('tf_bert_model/bert/embeddings/word_embeddings/weight:0',
  TensorShape([21128, 768])),
 ('tf_bert_model/bert/embeddings/position_embeddings/embeddings:0',
  TensorShape([512, 768])),
 ('tf_bert_model/bert/embeddings/token_type_embeddings/embeddings:0',
  TensorShape([2, 768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/gamma:0', TensorShape([768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/beta:0', TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._0/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._0/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._1/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._1/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._1/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._1/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._2/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._2/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._2/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._2/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._3/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._3/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._3/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._3/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._4/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._4/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._4/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._4/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._5/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._5/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._5/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._5/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._6/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._6/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._6/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._6/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._7/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._7/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._7/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._7/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._8/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._8/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._8/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._8/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._9/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._9/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._9/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._9/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._10/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._10/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._10/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._10/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/self/value/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/output/dense/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/attention/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/intermediate/dense/kernel:0',
  TensorShape([768, 3072])),
 ('tf_bert_model/bert/encoder/layer_._11/intermediate/dense/bias:0',
  TensorShape([3072])),
 ('tf_bert_model/bert/encoder/layer_._11/output/dense/kernel:0',
  TensorShape([3072, 768])),
 ('tf_bert_model/bert/encoder/layer_._11/output/dense/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/gamma:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/beta:0',
  TensorShape([768])),
 ('tf_bert_model/bert/pooler/dense/kernel:0', TensorShape([768, 768])),
 ('tf_bert_model/bert/pooler/dense/bias:0', TensorShape([768])),
 ('down_stream/fc/kernel:0', TensorShape([768, 300])),
 ('down_stream/fc/bias:0', TensorShape([300])),
 ('down_stream/out/kernel:0', TensorShape([300, 1])),
 ('down_stream/out/bias:0', TensorShape([1]))]
In [9]:
step_size = 2 * params['buffer_size'] // params['batch_size']
decay_lr = tfa.optimizers.Triangular2CyclicalLearningRate(
  initial_learning_rate = params['init_lr'],
  maximal_learning_rate = params['max_lr'],
  step_size = step_size,)
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0

best_acc = .0
count = 0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

for _ in range(params['n_epochs']):
  # TRAINING
  for ((text, seg), labels) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      logits = model([text, tf.sign(text), seg], training=True)
      labels = tf.cast(labels, tf.float32)
      num_neg = tf.reduce_sum(tf.cast(tf.equal(labels, 0.), tf.float32)).numpy()
      num_pos = tf.reduce_sum(labels).numpy()
      if num_pos == 0.:
        pos_weight = 1.
      else:
        pos_weight = num_neg / num_pos
      loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
        labels = labels,
        logits = logits,
        pos_weight = pos_weight))
      
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, 5.)
    optim.apply_gradients(zip(grads, model.trainable_variables))
    
    if global_step % 100 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
        global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1
  
  # EVALUATION
  m = tf.keras.metrics.Accuracy()
  intent_true = []
  intent_pred = []

  for ((text, seg), labels) in dataset(is_training=False, params=params):
    logits = model([text, tf.sign(text), seg], training=False)
    y_pred = tf.cast(tf.math.greater_equal(logits, .5), tf.int32)
    m.update_state(y_true=labels, y_pred=y_pred)
    intent_true += labels.numpy().flatten().tolist()
    intent_pred += y_pred.numpy().flatten().tolist()

  acc = m.result().numpy()
  logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))

  logger.info('\n'+classification_report(y_true = intent_true,
                                         y_pred = intent_pred,
                                         labels = [0, 1],
                                         target_names = ['Not Matched', 'Matched'],
                                         digits = 3))

  if acc > best_acc:
    best_acc = acc
    # you can save model here
    count = 0
  else:
    count += 1
  logger.info("Best Accuracy: {:.3f}".format(best_acc))

  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break
Reading ../data/train.json
INFO:tensorflow:Step 0 | Loss: 1.3252 | Spent: 8.7 secs | LR: 0.000010
INFO:tensorflow:Step 100 | Loss: 0.6572 | Spent: 49.9 secs | LR: 0.000011
INFO:tensorflow:Step 200 | Loss: 0.7912 | Spent: 53.1 secs | LR: 0.000013
INFO:tensorflow:Step 300 | Loss: 0.9696 | Spent: 51.4 secs | LR: 0.000014
INFO:tensorflow:Step 400 | Loss: 1.0542 | Spent: 52.0 secs | LR: 0.000016
INFO:tensorflow:Step 500 | Loss: 0.8474 | Spent: 50.0 secs | LR: 0.000017
INFO:tensorflow:Step 600 | Loss: 0.8511 | Spent: 53.0 secs | LR: 0.000018
INFO:tensorflow:Step 700 | Loss: 1.0404 | Spent: 49.5 secs | LR: 0.000020
INFO:tensorflow:Step 800 | Loss: 0.9121 | Spent: 51.3 secs | LR: 0.000021
INFO:tensorflow:Step 900 | Loss: 0.8106 | Spent: 52.0 secs | LR: 0.000023
INFO:tensorflow:Step 1000 | Loss: 0.6001 | Spent: 50.1 secs | LR: 0.000024
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.717
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.806     0.776     0.791      2978
     Matched      0.540     0.584     0.561      1338

    accuracy                          0.717      4316
   macro avg      0.673     0.680     0.676      4316
weighted avg      0.723     0.717     0.720      4316

INFO:tensorflow:Best Accuracy: 0.717
Reading ../data/train.json
INFO:tensorflow:Step 1100 | Loss: 0.7251 | Spent: 78.9 secs | LR: 0.000025
INFO:tensorflow:Step 1200 | Loss: 0.7493 | Spent: 51.7 secs | LR: 0.000027
INFO:tensorflow:Step 1300 | Loss: 0.8684 | Spent: 50.7 secs | LR: 0.000028
INFO:tensorflow:Step 1400 | Loss: 0.6927 | Spent: 50.9 secs | LR: 0.000030
INFO:tensorflow:Step 1500 | Loss: 1.3541 | Spent: 51.1 secs | LR: 0.000031
INFO:tensorflow:Step 1600 | Loss: 0.7615 | Spent: 49.2 secs | LR: 0.000032
INFO:tensorflow:Step 1700 | Loss: 0.6191 | Spent: 52.9 secs | LR: 0.000034
INFO:tensorflow:Step 1800 | Loss: 0.8407 | Spent: 50.7 secs | LR: 0.000035
INFO:tensorflow:Step 1900 | Loss: 1.0092 | Spent: 50.3 secs | LR: 0.000037
INFO:tensorflow:Step 2000 | Loss: 0.8386 | Spent: 52.4 secs | LR: 0.000038
INFO:tensorflow:Step 2100 | Loss: 0.6420 | Spent: 50.9 secs | LR: 0.000039
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.728
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.797     0.812     0.804      2978
     Matched      0.563     0.539     0.551      1338

    accuracy                          0.728      4316
   macro avg      0.680     0.676     0.678      4316
weighted avg      0.724     0.728     0.726      4316

INFO:tensorflow:Best Accuracy: 0.728
Reading ../data/train.json
INFO:tensorflow:Step 2200 | Loss: 0.4928 | Spent: 79.6 secs | LR: 0.000039
INFO:tensorflow:Step 2300 | Loss: 0.7312 | Spent: 52.0 secs | LR: 0.000038
INFO:tensorflow:Step 2400 | Loss: 0.5542 | Spent: 50.9 secs | LR: 0.000036
INFO:tensorflow:Step 2500 | Loss: 1.0056 | Spent: 49.9 secs | LR: 0.000035
INFO:tensorflow:Step 2600 | Loss: 0.4767 | Spent: 50.6 secs | LR: 0.000034
INFO:tensorflow:Step 2700 | Loss: 0.4871 | Spent: 51.1 secs | LR: 0.000032
INFO:tensorflow:Step 2800 | Loss: 0.4422 | Spent: 52.0 secs | LR: 0.000031
INFO:tensorflow:Step 2900 | Loss: 0.8468 | Spent: 50.7 secs | LR: 0.000029
INFO:tensorflow:Step 3000 | Loss: 0.5801 | Spent: 51.4 secs | LR: 0.000028
INFO:tensorflow:Step 3100 | Loss: 0.5261 | Spent: 52.0 secs | LR: 0.000027
INFO:tensorflow:Step 3200 | Loss: 0.4898 | Spent: 54.2 secs | LR: 0.000025
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.727
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.830     0.761     0.794      2978
     Matched      0.551     0.653     0.598      1338

    accuracy                          0.727      4316
   macro avg      0.690     0.707     0.696      4316
weighted avg      0.743     0.727     0.733      4316

INFO:tensorflow:Best Accuracy: 0.728
Reading ../data/train.json
INFO:tensorflow:Step 3300 | Loss: 0.4399 | Spent: 78.1 secs | LR: 0.000024
INFO:tensorflow:Step 3400 | Loss: 0.3858 | Spent: 49.9 secs | LR: 0.000022
INFO:tensorflow:Step 3500 | Loss: 0.6778 | Spent: 49.9 secs | LR: 0.000021
INFO:tensorflow:Step 3600 | Loss: 0.3714 | Spent: 50.3 secs | LR: 0.000020
INFO:tensorflow:Step 3700 | Loss: 0.4103 | Spent: 50.1 secs | LR: 0.000018
INFO:tensorflow:Step 3800 | Loss: 0.6524 | Spent: 50.8 secs | LR: 0.000017
INFO:tensorflow:Step 3900 | Loss: 0.4187 | Spent: 51.8 secs | LR: 0.000015
INFO:tensorflow:Step 4000 | Loss: 0.2999 | Spent: 50.7 secs | LR: 0.000014
INFO:tensorflow:Step 4100 | Loss: 0.4433 | Spent: 49.7 secs | LR: 0.000013
INFO:tensorflow:Step 4200 | Loss: 0.3272 | Spent: 49.3 secs | LR: 0.000011
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.724
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.813     0.778     0.795      2978
     Matched      0.550     0.602     0.575      1338

    accuracy                          0.724      4316
   macro avg      0.682     0.690     0.685      4316
weighted avg      0.732     0.724     0.727      4316

INFO:tensorflow:Best Accuracy: 0.728
Reading ../data/train.json
INFO:tensorflow:Step 4300 | Loss: 0.1611 | Spent: 77.8 secs | LR: 0.000010
INFO:tensorflow:Step 4400 | Loss: 0.3984 | Spent: 49.6 secs | LR: 0.000011
INFO:tensorflow:Step 4500 | Loss: 0.3907 | Spent: 50.8 secs | LR: 0.000011
INFO:tensorflow:Step 4600 | Loss: 0.3650 | Spent: 50.8 secs | LR: 0.000012
INFO:tensorflow:Step 4700 | Loss: 0.3201 | Spent: 48.6 secs | LR: 0.000013
INFO:tensorflow:Step 4800 | Loss: 0.4865 | Spent: 49.9 secs | LR: 0.000014
INFO:tensorflow:Step 4900 | Loss: 0.4885 | Spent: 49.2 secs | LR: 0.000014
INFO:tensorflow:Step 5000 | Loss: 0.2418 | Spent: 49.1 secs | LR: 0.000015
INFO:tensorflow:Step 5100 | Loss: 0.2088 | Spent: 49.2 secs | LR: 0.000016
INFO:tensorflow:Step 5200 | Loss: 0.2184 | Spent: 52.0 secs | LR: 0.000016
INFO:tensorflow:Step 5300 | Loss: 0.3686 | Spent: 50.8 secs | LR: 0.000017
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.738
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.826     0.787     0.806      2978
     Matched      0.571     0.630     0.599      1338

    accuracy                          0.738      4316
   macro avg      0.698     0.709     0.702      4316
weighted avg      0.747     0.738     0.742      4316

INFO:tensorflow:Best Accuracy: 0.738
Reading ../data/train.json
INFO:tensorflow:Step 5400 | Loss: 0.3405 | Spent: 78.5 secs | LR: 0.000018
INFO:tensorflow:Step 5500 | Loss: 0.1906 | Spent: 49.7 secs | LR: 0.000018
INFO:tensorflow:Step 5600 | Loss: 0.3011 | Spent: 48.9 secs | LR: 0.000019
INFO:tensorflow:Step 5700 | Loss: 0.2265 | Spent: 50.5 secs | LR: 0.000020
INFO:tensorflow:Step 5800 | Loss: 0.2994 | Spent: 49.6 secs | LR: 0.000021
INFO:tensorflow:Step 5900 | Loss: 0.2881 | Spent: 50.3 secs | LR: 0.000021
INFO:tensorflow:Step 6000 | Loss: 0.2155 | Spent: 49.3 secs | LR: 0.000022
INFO:tensorflow:Step 6100 | Loss: 0.1441 | Spent: 52.1 secs | LR: 0.000023
INFO:tensorflow:Step 6200 | Loss: 0.4761 | Spent: 48.9 secs | LR: 0.000023
INFO:tensorflow:Step 6300 | Loss: 0.5735 | Spent: 49.2 secs | LR: 0.000024
INFO:tensorflow:Step 6400 | Loss: 0.3052 | Spent: 50.8 secs | LR: 0.000025
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.723
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.812     0.778     0.795      2978
     Matched      0.548     0.600     0.573      1338

    accuracy                          0.723      4316
   macro avg      0.680     0.689     0.684      4316
weighted avg      0.731     0.723     0.726      4316

INFO:tensorflow:Best Accuracy: 0.738
Reading ../data/train.json
INFO:tensorflow:Step 6500 | Loss: 0.3314 | Spent: 75.6 secs | LR: 0.000025
INFO:tensorflow:Step 6600 | Loss: 0.4986 | Spent: 47.5 secs | LR: 0.000024
INFO:tensorflow:Step 6700 | Loss: 0.1254 | Spent: 50.1 secs | LR: 0.000023
INFO:tensorflow:Step 6800 | Loss: 0.4651 | Spent: 50.9 secs | LR: 0.000022
INFO:tensorflow:Step 6900 | Loss: 0.2374 | Spent: 49.8 secs | LR: 0.000022
INFO:tensorflow:Step 7000 | Loss: 0.5111 | Spent: 50.8 secs | LR: 0.000021
INFO:tensorflow:Step 7100 | Loss: 0.1521 | Spent: 48.5 secs | LR: 0.000020
INFO:tensorflow:Step 7200 | Loss: 0.1952 | Spent: 50.5 secs | LR: 0.000020
INFO:tensorflow:Step 7300 | Loss: 0.6061 | Spent: 51.0 secs | LR: 0.000019
INFO:tensorflow:Step 7400 | Loss: 0.4945 | Spent: 49.4 secs | LR: 0.000018
INFO:tensorflow:Step 7500 | Loss: 0.1564 | Spent: 49.5 secs | LR: 0.000018
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.728
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.791     0.824     0.807      2978
     Matched      0.568     0.516     0.541      1338

    accuracy                          0.728      4316
   macro avg      0.680     0.670     0.674      4316
weighted avg      0.722     0.728     0.725      4316

INFO:tensorflow:Best Accuracy: 0.738
Reading ../data/train.json
INFO:tensorflow:Step 7600 | Loss: 0.0759 | Spent: 75.3 secs | LR: 0.000017
INFO:tensorflow:Step 7700 | Loss: 0.0131 | Spent: 49.6 secs | LR: 0.000016
INFO:tensorflow:Step 7800 | Loss: 0.0189 | Spent: 50.1 secs | LR: 0.000015
INFO:tensorflow:Step 7900 | Loss: 0.4068 | Spent: 49.2 secs | LR: 0.000015
INFO:tensorflow:Step 8000 | Loss: 0.2296 | Spent: 50.2 secs | LR: 0.000014
INFO:tensorflow:Step 8100 | Loss: 0.1677 | Spent: 50.5 secs | LR: 0.000013
INFO:tensorflow:Step 8200 | Loss: 0.0434 | Spent: 50.9 secs | LR: 0.000013
INFO:tensorflow:Step 8300 | Loss: 0.0947 | Spent: 48.3 secs | LR: 0.000012
INFO:tensorflow:Step 8400 | Loss: 0.4036 | Spent: 50.0 secs | LR: 0.000011
INFO:tensorflow:Step 8500 | Loss: 0.1414 | Spent: 51.7 secs | LR: 0.000011
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.729
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.791     0.826     0.808      2978
     Matched      0.570     0.513     0.540      1338

    accuracy                          0.729      4316
   macro avg      0.680     0.670     0.674      4316
weighted avg      0.722     0.729     0.725      4316

INFO:tensorflow:Best Accuracy: 0.738
Reading ../data/train.json
INFO:tensorflow:Step 8600 | Loss: 0.2018 | Spent: 75.0 secs | LR: 0.000010
INFO:tensorflow:Step 8700 | Loss: 0.1929 | Spent: 50.0 secs | LR: 0.000010
INFO:tensorflow:Step 8800 | Loss: 0.2421 | Spent: 50.9 secs | LR: 0.000011
INFO:tensorflow:Step 8900 | Loss: 0.0066 | Spent: 49.4 secs | LR: 0.000011
INFO:tensorflow:Step 9000 | Loss: 0.3299 | Spent: 49.9 secs | LR: 0.000011
INFO:tensorflow:Step 9100 | Loss: 0.0182 | Spent: 51.5 secs | LR: 0.000012
INFO:tensorflow:Step 9200 | Loss: 0.0908 | Spent: 50.3 secs | LR: 0.000012
INFO:tensorflow:Step 9300 | Loss: 0.3083 | Spent: 48.5 secs | LR: 0.000013
INFO:tensorflow:Step 9400 | Loss: 0.2490 | Spent: 50.0 secs | LR: 0.000013
INFO:tensorflow:Step 9500 | Loss: 0.0784 | Spent: 49.1 secs | LR: 0.000013
INFO:tensorflow:Step 9600 | Loss: 0.0908 | Spent: 48.1 secs | LR: 0.000014
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.727
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.783     0.835     0.808      2978
     Matched      0.569     0.485     0.524      1338

    accuracy                          0.727      4316
   macro avg      0.676     0.660     0.666      4316
weighted avg      0.717     0.727     0.720      4316

INFO:tensorflow:Best Accuracy: 0.738
Reading ../data/train.json
INFO:tensorflow:Step 9700 | Loss: 0.1900 | Spent: 76.5 secs | LR: 0.000014
INFO:tensorflow:Step 9800 | Loss: 0.1188 | Spent: 51.2 secs | LR: 0.000014
INFO:tensorflow:Step 9900 | Loss: 0.0741 | Spent: 47.9 secs | LR: 0.000015
INFO:tensorflow:Step 10000 | Loss: 0.4289 | Spent: 49.5 secs | LR: 0.000015
INFO:tensorflow:Step 10100 | Loss: 0.0227 | Spent: 49.2 secs | LR: 0.000015
INFO:tensorflow:Step 10200 | Loss: 0.0279 | Spent: 51.9 secs | LR: 0.000016
INFO:tensorflow:Step 10300 | Loss: 0.3914 | Spent: 51.8 secs | LR: 0.000016
INFO:tensorflow:Step 10400 | Loss: 0.0434 | Spent: 50.1 secs | LR: 0.000016
INFO:tensorflow:Step 10500 | Loss: 0.0574 | Spent: 49.6 secs | LR: 0.000017
INFO:tensorflow:Step 10600 | Loss: 0.1978 | Spent: 50.6 secs | LR: 0.000017
INFO:tensorflow:Step 10700 | Loss: 0.2863 | Spent: 49.4 secs | LR: 0.000017
Reading ../data/dev.json
INFO:tensorflow:Evaluation: Testing Accuracy: 0.728
INFO:tensorflow:
              precision    recall  f1-score   support

 Not Matched      0.801     0.805     0.803      2978
     Matched      0.562     0.556     0.559      1338

    accuracy                          0.728      4316
   macro avg      0.681     0.680     0.681      4316
weighted avg      0.727     0.728     0.727      4316

INFO:tensorflow:Best Accuracy: 0.738
5 times not improve the best result, therefore stop training