In [1]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
In [11]:
!pip install -q tqdm bert-for-tf2
In [2]:
import os
import sys
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf

# tf.config.set_visible_devices([], 'GPU')  # disable GPU
In [3]:
print("Tensorflow: ", tf.__version__)
print("Python: ", sys.version)
print("GPU: ", tf.test.is_gpu_available())
assert sys.version_info.major == 3 and sys.version_info.minor == 6  # required by clipper benchmark
Tensorflow:  2.1.0
Python:  3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 21:14:29) 
[GCC 7.3.0]
WARNING:tensorflow:From <ipython-input-3-7068fd7facab>:3: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU:  True
In [5]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
from tensorflow import keras
import os
import re
In [6]:
from tensorflow import keras
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in tqdm(os.listdir(directory), desc=os.path.basename(directory)):
    with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz", 
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
        extract=True)

    train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                           "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                          "aclImdb", "test"))

    return train_df, test_df

Let's use the MovieReviewData class below, to prepare/encode the data for feeding into our BERT model, by:

  • tokenizing the text
  • trim or pad it to a max_seq_len length
  • append the special tokens [CLS] and [SEP]
  • convert the string tokens to numerical IDs using the original model's token encoding from vocab.txt
In [7]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights


class MovieReviewData:
    DATA_COLUMN = "sentence"
    LABEL_COLUMN = "polarity"

    def __init__(self, tokenizer: FullTokenizer, sample_size=None, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_seq_len = 0
        train, test = download_and_load_datasets()
        
        train, test = map(lambda df: df.reindex(df[MovieReviewData.DATA_COLUMN].str.len().sort_values().index), 
                          [train, test])
                
        if sample_size is not None:
            train, test = train.head(sample_size), test.head(sample_size)
            # train, test = map(lambda df: df.sample(sample_size), [train, test])
        
        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad, 
                                                       [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[MovieReviewData.DATA_COLUMN], row[MovieReviewData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)

    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)

A tweak

Because of a tf.train.load_checkpoint limitation requiring list permissions on the google storage bucket, we need to copy the pre-trained BERT weights locally.

In [8]:
asset_path = 'asset'
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir    = os.path.join(asset_path, bert_model_name)
bert_ckpt_file   = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
In [9]:
%%bash

if [ ! -f asset/uncased_L-12_H-768_A-12.zip ]; then
    curl -o asset/uncased_L-12_H-768_A-12.zip --create-dirs https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
fi
if [ ! -d asset/uncased_L-12_H-768_A-12 ]; then
    unzip asset/uncased_L-12_H-768_A-12.zip -d asset/
fi
Archive:  asset/uncased_L-12_H-768_A-12.zip
   creating: asset/uncased_L-12_H-768_A-12/
  inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: asset/uncased_L-12_H-768_A-12/vocab.txt  
  inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: asset/uncased_L-12_H-768_A-12/bert_config.json  
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  388M  100  388M    0     0  3564k      0  0:01:51  0:01:51 --:--:-- 3948k

Preparing the Data

Now let's fetch and prepare the data by taking the first max_seq_len tokenens after tokenizing with the BERT tokenizer, und use sample_size examples for both training and testing.

To keep training fast, we'll take a sample of about 2500 train and test examples, respectively, and use the first 128 tokens only (transformers memory and computation requirements scale quadraticly with the sequence length - so with a TPU you might use max_seq_len=512, but on a GPU this would be too slow, and you will have to use a very small batch_sizes to fit the model into the GPU memory).

In [10]:
%%time

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(tokenizer, 
                       sample_size=10*128*2, #10*128*2
                       max_seq_len=128)
pos: 100%|██████████| 12500/12500 [00:00<00:00, 19259.77it/s]
neg: 100%|██████████| 12500/12500 [00:00<00:00, 19675.69it/s]
pos: 100%|██████████| 12500/12500 [00:00<00:00, 19420.16it/s]
neg: 100%|██████████| 12500/12500 [00:00<00:00, 18046.21it/s]
100%|██████████| 2.56k/2.56k [00:02<00:00, 879it/s]  
100%|██████████| 2.56k/2.56k [00:02<00:00, 866it/s]  
max seq_len 178
CPU times: user 20 s, sys: 5.05 s, total: 25.1 s
Wall time: 25.5 s
In [9]:
print("            train_x", data.train_x.shape)
print("train_x_token_types", data.train_x_token_types.shape)
print("            train_y", data.train_y.shape)

print("             test_x", data.test_x.shape)

print("        max_seq_len", data.max_seq_len)
            train_x (2560, 128)
train_x_token_types (2560, 128)
            train_y (2560,)
             test_x (2560, 128)
        max_seq_len 128

Adapter BERT

If we decide to use adapter-BERT we need some helpers for freezing the original BERT layers.

In [16]:
def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
        l_bert.embeddings_layer.trainable = False


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(
                math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

    return learning_rate_scheduler

Creating a model

Now let's create a classification model using adapter-BERT, which is clever way of reducing the trainable parameter count, by freezing the original BERT weights, and adapting them with two FFN bottlenecks (i.e. adapter_size bellow) in every BERT layer.

N.B. The commented out code below show how to feed a token_type_ids/segment_ids sequence (which is not needed in our case).

In [13]:
def create_model(max_seq_len, adapter_size=64):
    """Creates a classification model."""

    #adapter_size = 64  # see - arXiv:1902.00751

    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = adapter_size
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    # token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")
    # output         = bert([input_ids, token_type_ids])
    output         = bert(input_ids)

    print("bert shape", output.shape)
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=2, activation="softmax")(logits)

    # model = keras.Model(inputs=[input_ids, token_type_ids], outputs=logits)
    # model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # load the pre-trained model weights
    load_stock_weights(bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(bert)

    model.compile(optimizer=keras.optimizers.Adam(),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()

    return model

Train

In [14]:
adapter_size = None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)
bert shape (None, 128, 768)
Done loading 196 BERT weights from: asset/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fd9f4052470> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 128, 768)          108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538      
=================================================================
Total params: 109,482,242
Trainable params: 109,482,242
Non-trainable params: 0
_________________________________________________________________
In [21]:
%%time

log_dir = ".log/movie_reviews/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

total_epoch_count = 2
model.fit(x=data.train_x, y=data.train_y,
          validation_split=0.1,
          batch_size=12,
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                    end_learn_rate=1e-7,
                                                    warmup_epoch_count=20,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback])
Train on 2304 samples, validate on 256 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 5.000000000000001e-07.
Epoch 1/2
2304/2304 [==============================] - 121s 52ms/sample - loss: 0.4103 - acc: 0.9058 - val_loss: 0.4349 - val_acc: 0.8633

Epoch 00002: LearningRateScheduler reducing learning rate to 1.0000000000000002e-06.
Epoch 2/2
2304/2304 [==============================] - 121s 53ms/sample - loss: 0.4053 - acc: 0.9054 - val_loss: 0.4415 - val_acc: 0.8594
CPU times: user 4min 31s, sys: 43.2 s, total: 5min 14s
Wall time: 4min 2s
Out[21]:
<tensorflow.python.keras.callbacks.History at 0x7fd8bc9238d0>
In [22]:
model.save_weights('./movie_reviews.h5', overwrite=True)
In [23]:
%%time

_, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)

print("train acc", train_acc)
print(" test acc", test_acc)
2560/2560 [==============================] - 35s 14ms/sample - loss: 0.3835 - acc: 0.9270
2560/2560 [==============================] - 33s 13ms/sample - loss: 0.4088 - acc: 0.8992
train acc 0.92695314
 test acc 0.89921874
CPU times: user 1min 7s, sys: 222 ms, total: 1min 8s
Wall time: 1min 7s

Evaluation

To evaluate the trained model, let's load the saved weights in a new model instance, and evaluate.

In [12]:
model = create_model(data.max_seq_len, adapter_size=None)
model.load_weights("./movie_reviews.h5")
bert shape (None, 128, 768)
Done loading 196 BERT weights from: asset/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7ff6b45c4160> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 128, 768)          108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538      
=================================================================
Total params: 109,482,242
Trainable params: 109,482,242
Non-trainable params: 0
_________________________________________________________________
In [17]:
%%time 

# _, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)

# print("train acc", train_acc)
print(" test acc", test_acc)
2560/2560 [==============================] - 35s 14ms/sample - loss: 0.3903 - acc: 0.9230
 test acc 0.9230469
CPU times: user 34.6 s, sys: 113 ms, total: 34.7 s
Wall time: 34.6 s

Prediction

For prediction, we need to prepare the input text the same way as we did for training - tokenize, adding the special [CLS] and [SEP] token at begin and end of the token sequence, and pad to match the model input shape.

In [25]:
%%time
CLASSES = ["negative","positive"]
max_seq_len = 128
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!",
]

inputs = pd.DataFrame(pred_sentences)

pred_tokens    = map(tokenizer.tokenize, inputs.to_numpy()[:, 0].tolist())
pred_tokens    = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len-len(tids)), pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))

res = model(pred_token_ids).numpy().argmax(axis=-1)
[CLASSES[i] for i in res]
CPU times: user 150 ms, sys: 7.46 ms, total: 158 ms
Wall time: 177 ms
Out[25]:
['negative', 'negative', 'positive', 'positive']

Build & Save bentoml service

In [29]:
%%writefile bentoml_service.py

import bentoml
import tensorflow as tf
import numpy as np
import pandas as pd

from bentoml.artifact import (
    TensorflowSavedModelArtifact, PickleArtifact,
)
from bentoml.adapters import DataframeInput


CLASSES = ["negative","positive"]
max_seq_len = 128

try:
    tf.config.set_visible_devices([], 'GPU')  # disable GPU, required when served in docker
except:
    pass


@bentoml.env(pip_dependencies=['tensorflow', 'bert-for-tf2'])
@bentoml.artifacts([TensorflowSavedModelArtifact('model'), PickleArtifact('tokenizer')])
class Service(bentoml.BentoService):

    def tokenize(self, inputs: pd.DataFrame):
        tokenizer = self.artifacts.tokenizer
        if isinstance(inputs, pd.DataFrame):
            inputs = inputs.to_numpy()[:, 0].tolist()
        else:
            inputs = inputs.tolist()  # for predict_clipper
        pred_tokens = map(tokenizer.tokenize, inputs)
        pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
        pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
        pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)), pred_token_ids)
        pred_token_ids = tf.constant(list(pred_token_ids), dtype=tf.int32)
        return pred_token_ids

    @bentoml.api(input=DataframeInput(), mb_max_latency=3000, mb_max_batch_size=20)
    def predict(self, inputs):
        model = self.artifacts.model
        pred_token_ids = self.tokenize(inputs)
        res = model(pred_token_ids).numpy().argmax(axis=-1)
        return [CLASSES[i] for i in res]
Overwriting bentoml_service.py
In [30]:
from bentoml_service import Service

bento_svc = Service()
bento_svc.pack("model", model)
bento_svc.pack("tokenizer", tokenizer)
saved_path = bento_svc.save()
[2020-07-28 15:11:02,575] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle.
WARNING:tensorflow:Skipping full serialization of Keras layer <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7fd96c59ab70>, because it is not built.
WARNING:tensorflow:From /opt/anaconda3/envs/bentoml-dev-py36/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1786: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /tmp/bentoml-temp-r9yxj9ku/Service/artifacts/model_saved_model/assets
[2020-07-28 15:12:13,764] INFO - Detect BentoML installed in development model, copying local BentoML module file to target saved bundle path
running sdist
running egg_info
writing BentoML.egg-info/PKG-INFO
writing dependency_links to BentoML.egg-info/dependency_links.txt
writing entry points to BentoML.egg-info/entry_points.txt
writing requirements to BentoML.egg-info/requires.txt
writing top-level names to BentoML.egg-info/top_level.txt
reading manifest file 'BentoML.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
warning: no previously-included files matching '*~' found anywhere in distribution
warning: no previously-included files matching '*.pyo' found anywhere in distribution
warning: no previously-included files matching '.git' found anywhere in distribution
warning: no previously-included files matching '.ipynb_checkpoints' found anywhere in distribution
warning: no previously-included files matching '__pycache__' found anywhere in distribution
warning: no directories found matching 'bentoml/server/static'
warning: no directories found matching 'bentoml/yatai/web/dist'
no previously-included directories found matching 'e2e_tests'
no previously-included directories found matching 'tests'
no previously-included directories found matching 'benchmark'
writing manifest file 'BentoML.egg-info/SOURCES.txt'
running check
creating BentoML-0.8.3+42.gb8d36b6
creating BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
creating BentoML-0.8.3+42.gb8d36b6/bentoml
creating BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
creating BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
creating BentoML-0.8.3+42.gb8d36b6/bentoml/cli
creating BentoML-0.8.3+42.gb8d36b6/bentoml/clipper
creating BentoML-0.8.3+42.gb8d36b6/bentoml/configuration
creating BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
creating BentoML-0.8.3+42.gb8d36b6/bentoml/handlers
creating BentoML-0.8.3+42.gb8d36b6/bentoml/marshal
creating BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
creating BentoML-0.8.3+42.gb8d36b6/bentoml/server
creating BentoML-0.8.3+42.gb8d36b6/bentoml/utils
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/__pycache__
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions/__pycache__
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator
copying files to BentoML-0.8.3+42.gb8d36b6...
copying LICENSE -> BentoML-0.8.3+42.gb8d36b6
copying MANIFEST.in -> BentoML-0.8.3+42.gb8d36b6
copying README.md -> BentoML-0.8.3+42.gb8d36b6
copying pyproject.toml -> BentoML-0.8.3+42.gb8d36b6
copying setup.cfg -> BentoML-0.8.3+42.gb8d36b6
copying setup.py -> BentoML-0.8.3+42.gb8d36b6
copying versioneer.py -> BentoML-0.8.3+42.gb8d36b6
copying BentoML.egg-info/PKG-INFO -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying BentoML.egg-info/SOURCES.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying BentoML.egg-info/dependency_links.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying BentoML.egg-info/entry_points.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying BentoML.egg-info/requires.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying BentoML.egg-info/top_level.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info
copying bentoml/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml
copying bentoml/_version.py -> BentoML-0.8.3+42.gb8d36b6/bentoml
copying bentoml/exceptions.py -> BentoML-0.8.3+42.gb8d36b6/bentoml
copying bentoml/service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml
copying bentoml/service_env.py -> BentoML-0.8.3+42.gb8d36b6/bentoml
copying bentoml/adapters/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/base_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/base_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/clipper_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/dataframe_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/dataframe_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/default_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/fastai_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/file_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/json_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/json_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/legacy_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/legacy_json_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/multi_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/pytorch_tensor_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/tensorflow_tensor_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/tensorflow_tensor_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/adapters/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters
copying bentoml/artifact/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/fastai2_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/fastai_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/fasttext_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/h2o_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/json_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/keras_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/lightgbm_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/onnx_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/pickle_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/pytorch_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/sklearn_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/spacy_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/text_file_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/tf_savedmodel_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/artifact/xgboost_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact
copying bentoml/cli/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/aws_lambda.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/aws_sagemaker.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/azure_functions.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/bento_management.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/bento_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/click_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/deployment.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/cli/yatai_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli
copying bentoml/clipper/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/clipper
copying bentoml/configuration/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration
copying bentoml/configuration/configparser.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration
copying bentoml/configuration/default_bentoml.cfg -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration
copying bentoml/configuration/__pycache__/__init__.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/configuration/__pycache__/__init__.cpython-37.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/configuration/__pycache__/__init__.cpython-38.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/configuration/__pycache__/configparser.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/configuration/__pycache__/configparser.cpython-37.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/configuration/__pycache__/configparser.cpython-38.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__
copying bentoml/handlers/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/handlers
copying bentoml/marshal/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal
copying bentoml/marshal/dispatcher.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal
copying bentoml/marshal/marshal.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal
copying bentoml/marshal/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal
copying bentoml/saved_bundle/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/bentoml-init.sh -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/bundler.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/docker-entrypoint.sh -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/loader.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/pip_pkg.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/py_module_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/saved_bundle/templates.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle
copying bentoml/server/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/api_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/gunicorn_config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/gunicorn_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/instruments.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/marshal_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/open_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/trace.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/server/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server
copying bentoml/utils/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/alg.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/benchmark.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/cloudpickle.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/dataframe_util.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/flask_ngrok.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/hybridmethod.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/lazy_loader.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/log.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/s3.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/tempdir.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/utils/usage_stats.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils
copying bentoml/yatai/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/alembic.ini -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/db.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/deployment_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/status.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/yatai_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/yatai_service_impl.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai
copying bentoml/yatai/client/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client
copying bentoml/yatai/client/bento_repository_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client
copying bentoml/yatai/client/deployment_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client
copying bentoml/yatai/deployment/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment
copying bentoml/yatai/deployment/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment
copying bentoml/yatai/deployment/store.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment
copying bentoml/yatai/deployment/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment
copying bentoml/yatai/deployment/aws_lambda/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
copying bentoml/yatai/deployment/aws_lambda/download_extra_resources.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
copying bentoml/yatai/deployment/aws_lambda/lambda_app.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
copying bentoml/yatai/deployment/aws_lambda/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
copying bentoml/yatai/deployment/aws_lambda/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda
copying bentoml/yatai/deployment/azure_functions/Dockerfile -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/app_init.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/constants.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/host.json -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/local.settings.json -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/azure_functions/templates.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions
copying bentoml/yatai/deployment/sagemaker/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/deployment/sagemaker/model_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/deployment/sagemaker/nginx.conf -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/deployment/sagemaker/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/deployment/sagemaker/serve -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/deployment/sagemaker/wsgi.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker
copying bentoml/yatai/migrations/README -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations
copying bentoml/yatai/migrations/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations
copying bentoml/yatai/migrations/env.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations
copying bentoml/yatai/migrations/script.py.mako -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations
copying bentoml/yatai/migrations/__pycache__/env.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/__pycache__
copying bentoml/yatai/migrations/versions/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions
copying bentoml/yatai/migrations/versions/a6b00ae45279_add_last_updated_at_for_deployments.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions
copying bentoml/yatai/migrations/versions/__pycache__/a6b00ae45279_add_last_updated_at_for_deployments.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions/__pycache__
copying bentoml/yatai/proto/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/proto/deployment_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/proto/repository_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/proto/status_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/proto/yatai_service_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/proto/yatai_service_pb2_grpc.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto
copying bentoml/yatai/repository/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/repository/base_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/repository/local_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/repository/metadata_store.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/repository/repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/repository/s3_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository
copying bentoml/yatai/validator/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator
copying bentoml/yatai/validator/deployment_pb_validator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator
Writing BentoML-0.8.3+42.gb8d36b6/setup.cfg
UPDATING BentoML-0.8.3+42.gb8d36b6/bentoml/_version.py
set BentoML-0.8.3+42.gb8d36b6/bentoml/_version.py to '0.8.3+42.gb8d36b6'
Creating tar archive
removing 'BentoML-0.8.3+42.gb8d36b6' (and everything under it)
[2020-07-28 15:12:18,720] INFO - BentoService bundle 'Service:20200728151102_B3E065' saved to: /home/bentoml/bentoml/repository/Service/20200728151102_B3E065
In [31]:
print(saved_path)
/home/bentoml/bentoml/repository/Service/20200728151102_B3E065

Serving

Since BERT is a large model, if you met OOM bellow,
you may need to restart this kernel to release the RAM/GRAM used by training model.

In [32]:
bentoml_bundle_path = '/home/bentoml/bentoml/repository/Service/20200728151102_B3E065' # saved_path
In [34]:
# Option 1: serve directly
print(f"bentoml serve-gunicorn {bentoml_bundle_path} --port 5000 --enable-microbatch --workers 1")

!bentoml serve-gunicorn {bentoml_bundle_path} --port 5000 --enable-microbatch --workers 1
bentoml serve-gunicorn /home/bentoml/bentoml/repository/Service/20200728151102_B3E065 --port 5000 --enable-microbatch --workers 1
[2020-07-28 15:14:35,761] INFO - Starting BentoML API server in production mode..
[2020-07-28 15:14:36,563] INFO - Running micro batch service on :5000
[2020-07-28 15:14:36 +0800] [2953201] [INFO] Starting gunicorn 20.0.4
[2020-07-28 15:14:36 +0800] [2952697] [INFO] Starting gunicorn 20.0.4
[2020-07-28 15:14:36 +0800] [2952697] [INFO] Listening at: http://0.0.0.0:60577 (2952697)
[2020-07-28 15:14:36 +0800] [2953201] [INFO] Listening at: http://0.0.0.0:5000 (2953201)
[2020-07-28 15:14:36 +0800] [2952697] [INFO] Using worker: sync
[2020-07-28 15:14:36 +0800] [2953201] [INFO] Using worker: aiohttp.worker.GunicornWebWorker
[2020-07-28 15:14:36 +0800] [2953203] [INFO] Booting worker with pid: 2953203
[2020-07-28 15:14:36 +0800] [2953202] [INFO] Booting worker with pid: 2953202
[2020-07-28 15:14:36,613] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle.
[2020-07-28 15:14:36,631] WARNING - Saved BentoService bundle version mismatch: loading BentoService bundle create with BentoML version 0.8.3, but loading from BentoML version 0.8.3+42.gb8d36b6
[2020-07-28 15:14:36,874] INFO - Micro batch enabled for API `predict`
[2020-07-28 15:14:36,874] INFO - Your system nofile limit is 10000, which means each instance of microbatch service is able to hold this number of connections at same time. You can increase the number of file descriptors for the server process, or launch more microbatch instances to accept more concurrent connection.
[2020-07-28 15:14:37,659] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle.
[2020-07-28 15:14:37,697] WARNING - Saved BentoService bundle version mismatch: loading BentoService bundle create with BentoML version 0.8.3, but loading from BentoML version 0.8.3+42.gb8d36b6
2020-07-28 15:14:39.628530: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-07-28 15:14:39.636462: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-28 15:14:39.636718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1060 computeCapability: 6.1
coreClock: 1.6705GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2020-07-28 15:14:39.637038: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-07-28 15:14:39.639381: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-07-28 15:14:39.641536: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-07-28 15:14:39.642546: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-07-28 15:14:39.644930: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-07-28 15:14:39.646255: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-07-28 15:14:39.650387: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-07-28 15:14:39.650598: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-28 15:14:39.651086: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-28 15:14:39.651293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1697] Adding visible gpu devices: 0
2020-07-28 15:14:40.990800: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2020-07-28 15:14:41.013344: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2699905000 Hz
2020-07-28 15:14:41.013808: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b080b80810 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-07-28 15:14:41.013856: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-07-28 15:14:41.013969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1096] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-28 15:14:41.013981: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102]      
2020-07-28 15:14:41.170231: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-28 15:14:41.170580: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b080be4f10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-07-28 15:14:41.170601: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce GTX 1060, Compute Capability 6.1
^C
[2020-07-28 15:14:49 +0800] [2953201] [INFO] Handling signal: int
[2020-07-28 15:14:49 +0800] [2952697] [INFO] Handling signal: int
[2020-07-28 15:14:49 +0800] [2953202] [INFO] Worker exiting (pid: 2953202)
[2020-07-28 15:14:49 +0800] [2953201] [INFO] Shutting down: Master
[2020-07-28 15:14:49 +0800] [2953203] [INFO] Worker exiting (pid: 2953203)
In [ ]:
# Option 2: serve in docker
!cd {bentoml_bundle_path}
IMG_NAME = bentoml_bundle_path.split('/')[-1].lower()

!docker build --quiet -t {IMG_NAME} {bentoml_bundle_path}
# launch docker instances
!docker run -itd -p 5000:5000 {IMG_NAME}:latest --workers 1 --enable-microbatch

Test the API with requests

In [4]:
%%time
import requests
import pandas as pd

server_url = f"http://127.0.0.1:5000/predict"
method = "POST"
headers = {"content-type": "application/json"}
pred_sentences =  ["The acting was a bit lacking."]
data = pd.DataFrame(pred_sentences).to_json()

r = requests.request(method, server_url, headers=headers, data=data)
print(r.content)
b'["negative"]'
CPU times: user 2.36 ms, sys: 3.26 ms, total: 5.62 ms
Wall time: 29.1 s
In [ ]: