# Copyright 2019 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
BentoML makes moving trained ML models to production easy:
BentoML is a framework for serving, managing, and deploying machine learning models. It is aiming to bridge the gap between Data Science and DevOps, and enable teams to deliver prediction services in a fast, repeatable, and scalable way.
Before reading this example project, be sure to check out the Getting started guide to learn about the basic concepts in BentoML.
A modification of https://github.com/kpe/bert-for-tf2/blob/master/examples/gpu_movie_reviews.ipynb, which is a modification of https://github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb using the Tensorflow 2.0 Keras
!pip install -q bentoml "tqdm==4.32.2" "bert-for-tf2==0.14.5"
import os
import sys
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
# tf.config.set_visible_devices([], 'GPU') # disable GPU
print("Tensorflow: ", tf.__version__)
print("Python: ", sys.version)
print("GPU: ", tf.test.is_gpu_available())
assert sys.version_info.major == 3 and sys.version_info.minor == 6 # required by clipper benchmark
Tensorflow: 2.1.0 Python: 3.6.10 |Anaconda, Inc.| (default, Jan 7 2020, 21:14:29) [GCC 7.3.0] WARNING:tensorflow:From <ipython-input-3-7068fd7facab>:3: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version. Instructions for updating: Use `tf.config.list_physical_devices('GPU')` instead. GPU: True
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
from tensorflow import keras
import os
import re
from tensorflow import keras
import os
import re
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
data = {}
data["sentence"] = []
data["sentiment"] = []
for file_path in tqdm(os.listdir(directory), desc=os.path.basename(directory)):
with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
data["sentence"].append(f.read())
data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
pos_df = load_directory_data(os.path.join(directory, "pos"))
neg_df = load_directory_data(os.path.join(directory, "neg"))
pos_df["polarity"] = 1
neg_df["polarity"] = 0
return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
dataset = tf.keras.utils.get_file(
fname="aclImdb.tar.gz",
origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
extract=True)
train_df = load_dataset(os.path.join(os.path.dirname(dataset),
"aclImdb", "train"))
test_df = load_dataset(os.path.join(os.path.dirname(dataset),
"aclImdb", "test"))
return train_df, test_df
Let's use the MovieReviewData
class below, to prepare/encode
the data for feeding into our BERT model, by:
max_seq_len
length[CLS]
and [SEP]
ID
s using the original model's token encoding from vocab.txt
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
class MovieReviewData:
DATA_COLUMN = "sentence"
LABEL_COLUMN = "polarity"
def __init__(self, tokenizer: FullTokenizer, sample_size=None, max_seq_len=1024):
self.tokenizer = tokenizer
self.sample_size = sample_size
self.max_seq_len = 0
train, test = download_and_load_datasets()
train, test = map(lambda df: df.reindex(df[MovieReviewData.DATA_COLUMN].str.len().sort_values().index),
[train, test])
if sample_size is not None:
train, test = train.head(sample_size), test.head(sample_size)
# train, test = map(lambda df: df.sample(sample_size), [train, test])
((self.train_x, self.train_y),
(self.test_x, self.test_y)) = map(self._prepare, [train, test])
print("max seq_len", self.max_seq_len)
self.max_seq_len = min(self.max_seq_len, max_seq_len)
((self.train_x, self.train_x_token_types),
(self.test_x, self.test_x_token_types)) = map(self._pad,
[self.train_x, self.test_x])
def _prepare(self, df):
x, y = [], []
with tqdm(total=df.shape[0], unit_scale=True) as pbar:
for ndx, row in df.iterrows():
text, label = row[MovieReviewData.DATA_COLUMN], row[MovieReviewData.LABEL_COLUMN]
tokens = self.tokenizer.tokenize(text)
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
self.max_seq_len = max(self.max_seq_len, len(token_ids))
x.append(token_ids)
y.append(int(label))
pbar.update()
return np.array(x), np.array(y)
def _pad(self, ids):
x, t = [], []
token_type_ids = [0] * self.max_seq_len
for input_ids in ids:
input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
x.append(np.array(input_ids))
t.append(token_type_ids)
return np.array(x), np.array(t)
Because of a tf.train.load_checkpoint
limitation requiring list permissions on the google storage bucket, we need to copy the pre-trained BERT weights locally.
asset_path = 'asset'
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join(asset_path, bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
%%bash
if [ ! -f asset/uncased_L-12_H-768_A-12.zip ]; then
curl -o asset/uncased_L-12_H-768_A-12.zip --create-dirs https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
fi
if [ ! -d asset/uncased_L-12_H-768_A-12 ]; then
unzip asset/uncased_L-12_H-768_A-12.zip -d asset/
fi
Archive: asset/uncased_L-12_H-768_A-12.zip creating: asset/uncased_L-12_H-768_A-12/ inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.meta inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001 inflating: asset/uncased_L-12_H-768_A-12/vocab.txt inflating: asset/uncased_L-12_H-768_A-12/bert_model.ckpt.index inflating: asset/uncased_L-12_H-768_A-12/bert_config.json
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 388M 100 388M 0 0 3564k 0 0:01:51 0:01:51 --:--:-- 3948k
Now let's fetch and prepare the data by taking the first max_seq_len
tokenens after tokenizing with the BERT tokenizer, und use sample_size
examples for both training and testing.
To keep training fast, we'll take a sample of about 2500 train and test examples, respectively, and use the first 128 tokens only (transformers memory and computation requirements scale quadraticly with the sequence length - so with a TPU you might use max_seq_len=512
, but on a GPU this would be too slow, and you will have to use a very small batch_size
s to fit the model into the GPU memory).
%%time
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(tokenizer,
sample_size=10*128*2, #10*128*2
max_seq_len=128)
pos: 100%|██████████| 12500/12500 [00:00<00:00, 19259.77it/s] neg: 100%|██████████| 12500/12500 [00:00<00:00, 19675.69it/s] pos: 100%|██████████| 12500/12500 [00:00<00:00, 19420.16it/s] neg: 100%|██████████| 12500/12500 [00:00<00:00, 18046.21it/s] 100%|██████████| 2.56k/2.56k [00:02<00:00, 879it/s] 100%|██████████| 2.56k/2.56k [00:02<00:00, 866it/s]
max seq_len 178 CPU times: user 20 s, sys: 5.05 s, total: 25.1 s Wall time: 25.5 s
print(" train_x", data.train_x.shape)
print("train_x_token_types", data.train_x_token_types.shape)
print(" train_y", data.train_y.shape)
print(" test_x", data.test_x.shape)
print(" max_seq_len", data.max_seq_len)
train_x (2560, 128) train_x_token_types (2560, 128) train_y (2560,) test_x (2560, 128) max_seq_len 128
If we decide to use adapter-BERT we need some helpers for freezing the original BERT layers.
def flatten_layers(root_layer):
if isinstance(root_layer, keras.layers.Layer):
yield root_layer
for layer in root_layer._layers:
for sub_layer in flatten_layers(layer):
yield sub_layer
def freeze_bert_layers(l_bert):
"""
Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
"""
for layer in flatten_layers(l_bert):
if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
layer.trainable = True
elif len(layer._layers) == 0:
layer.trainable = False
l_bert.embeddings_layer.trainable = False
def create_learning_rate_scheduler(max_learn_rate=5e-5,
end_learn_rate=1e-7,
warmup_epoch_count=10,
total_epoch_count=90):
def lr_scheduler(epoch):
if epoch < warmup_epoch_count:
res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
else:
res = max_learn_rate*math.exp(
math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
return float(res)
learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)
return learning_rate_scheduler
Now let's create a classification model using adapter-BERT, which is clever way of reducing the trainable parameter count, by freezing the original BERT weights, and adapting them with two FFN bottlenecks (i.e. adapter_size
bellow) in every BERT layer.
N.B. The commented out code below show how to feed a token_type_ids
/segment_ids
sequence (which is not needed in our case).
def create_model(max_seq_len, adapter_size=64):
"""Creates a classification model."""
#adapter_size = 64 # see - arXiv:1902.00751
# create the bert layer
with tf.io.gfile.GFile(bert_config_file, "r") as reader:
bc = StockBertConfig.from_json_string(reader.read())
bert_params = map_stock_config_to_params(bc)
bert_params.adapter_size = adapter_size
bert = BertModelLayer.from_params(bert_params, name="bert")
input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
# token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")
# output = bert([input_ids, token_type_ids])
output = bert(input_ids)
print("bert shape", output.shape)
cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
cls_out = keras.layers.Dropout(0.5)(cls_out)
logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
logits = keras.layers.Dropout(0.5)(logits)
logits = keras.layers.Dense(units=2, activation="softmax")(logits)
# model = keras.Model(inputs=[input_ids, token_type_ids], outputs=logits)
# model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
model = keras.Model(inputs=input_ids, outputs=logits)
model.build(input_shape=(None, max_seq_len))
# load the pre-trained model weights
load_stock_weights(bert, bert_ckpt_file)
# freeze weights if adapter-BERT is used
if adapter_size is not None:
freeze_bert_layers(bert)
model.compile(optimizer=keras.optimizers.Adam(),
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])
model.summary()
return model
adapter_size = None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)
bert shape (None, 128, 768) Done loading 196 BERT weights from: asset/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fd9f4052470> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0] Unused weights from checkpoint: bert/embeddings/token_type_embeddings bert/pooler/dense/bias bert/pooler/dense/kernel cls/predictions/output_bias cls/predictions/transform/LayerNorm/beta cls/predictions/transform/LayerNorm/gamma cls/predictions/transform/dense/bias cls/predictions/transform/dense/kernel cls/seq_relationship/output_bias cls/seq_relationship/output_weights Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_ids (InputLayer) [(None, 128)] 0 _________________________________________________________________ bert (BertModelLayer) (None, 128, 768) 108890112 _________________________________________________________________ lambda (Lambda) (None, 768) 0 _________________________________________________________________ dropout (Dropout) (None, 768) 0 _________________________________________________________________ dense (Dense) (None, 768) 590592 _________________________________________________________________ dropout_1 (Dropout) (None, 768) 0 _________________________________________________________________ dense_1 (Dense) (None, 2) 1538 ================================================================= Total params: 109,482,242 Trainable params: 109,482,242 Non-trainable params: 0 _________________________________________________________________
%%time
log_dir = ".log/movie_reviews/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
total_epoch_count = 2
model.fit(x=data.train_x, y=data.train_y,
validation_split=0.1,
batch_size=12,
shuffle=True,
epochs=total_epoch_count,
callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
end_learn_rate=1e-7,
warmup_epoch_count=20,
total_epoch_count=total_epoch_count),
keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
tensorboard_callback])
Train on 2304 samples, validate on 256 samples Epoch 00001: LearningRateScheduler reducing learning rate to 5.000000000000001e-07. Epoch 1/2 2304/2304 [==============================] - 121s 52ms/sample - loss: 0.4103 - acc: 0.9058 - val_loss: 0.4349 - val_acc: 0.8633 Epoch 00002: LearningRateScheduler reducing learning rate to 1.0000000000000002e-06. Epoch 2/2 2304/2304 [==============================] - 121s 53ms/sample - loss: 0.4053 - acc: 0.9054 - val_loss: 0.4415 - val_acc: 0.8594 CPU times: user 4min 31s, sys: 43.2 s, total: 5min 14s Wall time: 4min 2s
<tensorflow.python.keras.callbacks.History at 0x7fd8bc9238d0>
model.save_weights('./movie_reviews.h5', overwrite=True)
%%time
_, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)
print("train acc", train_acc)
print(" test acc", test_acc)
2560/2560 [==============================] - 35s 14ms/sample - loss: 0.3835 - acc: 0.9270 2560/2560 [==============================] - 33s 13ms/sample - loss: 0.4088 - acc: 0.8992 train acc 0.92695314 test acc 0.89921874 CPU times: user 1min 7s, sys: 222 ms, total: 1min 8s Wall time: 1min 7s
To evaluate the trained model, let's load the saved weights in a new model instance, and evaluate.
model = create_model(data.max_seq_len, adapter_size=None)
model.load_weights("./movie_reviews.h5")
bert shape (None, 128, 768) Done loading 196 BERT weights from: asset/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7ff6b45c4160> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0] Unused weights from checkpoint: bert/embeddings/token_type_embeddings bert/pooler/dense/bias bert/pooler/dense/kernel cls/predictions/output_bias cls/predictions/transform/LayerNorm/beta cls/predictions/transform/LayerNorm/gamma cls/predictions/transform/dense/bias cls/predictions/transform/dense/kernel cls/seq_relationship/output_bias cls/seq_relationship/output_weights Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_ids (InputLayer) [(None, 128)] 0 _________________________________________________________________ bert (BertModelLayer) (None, 128, 768) 108890112 _________________________________________________________________ lambda (Lambda) (None, 768) 0 _________________________________________________________________ dropout (Dropout) (None, 768) 0 _________________________________________________________________ dense (Dense) (None, 768) 590592 _________________________________________________________________ dropout_1 (Dropout) (None, 768) 0 _________________________________________________________________ dense_1 (Dense) (None, 2) 1538 ================================================================= Total params: 109,482,242 Trainable params: 109,482,242 Non-trainable params: 0 _________________________________________________________________
%%time
# _, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)
# print("train acc", train_acc)
print(" test acc", test_acc)
2560/2560 [==============================] - 35s 14ms/sample - loss: 0.3903 - acc: 0.9230 test acc 0.9230469 CPU times: user 34.6 s, sys: 113 ms, total: 34.7 s Wall time: 34.6 s
For prediction, we need to prepare the input text the same way as we did for training - tokenize, adding the special [CLS]
and [SEP]
token at begin and end of the token sequence, and pad to match the model input shape.
%%time
CLASSES = ["negative","positive"]
max_seq_len = 128
pred_sentences = [
"That movie was absolutely awful",
"The acting was a bit lacking",
"The film was creative and surprising",
"Absolutely fantastic!",
]
inputs = pd.DataFrame(pred_sentences)
pred_tokens = map(tokenizer.tokenize, inputs.to_numpy()[:, 0].tolist())
pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len-len(tids)), pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))
res = model(pred_token_ids).numpy().argmax(axis=-1)
[CLASSES[i] for i in res]
CPU times: user 150 ms, sys: 7.46 ms, total: 158 ms Wall time: 177 ms
['negative', 'negative', 'positive', 'positive']
%%writefile bentoml_service.py
import bentoml
import tensorflow as tf
import numpy as np
import pandas as pd
from typing import List
from bentoml.frameworks.tensorflow import TensorflowSavedModelArtifact
from bentoml.service.artifacts.common import PickleArtifact
from bentoml.adapters import DataframeInput
CLASSES = ["negative","positive"]
max_seq_len = 128
try:
tf.config.set_visible_devices([], 'GPU') # disable GPU, required when served in docker
except:
pass
@bentoml.env(pip_packages=['tensorflow', 'bert-for-tf2'])
@bentoml.artifacts([TensorflowSavedModelArtifact('model'), PickleArtifact('tokenizer')])
class BertService(bentoml.BentoService):
def tokenize(self, inputs: pd.DataFrame):
tokenizer = self.artifacts.tokenizer
if isinstance(inputs, pd.DataFrame):
inputs = inputs.to_numpy()[:, 0].tolist()
else:
inputs = inputs.tolist() # for predict_clipper
pred_tokens = map(tokenizer.tokenize, inputs)
pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)), pred_token_ids)
pred_token_ids = tf.constant(list(pred_token_ids), dtype=tf.int32)
return pred_token_ids
@bentoml.api(input=DataframeInput(), mb_max_latency=3000, mb_max_batch_size=20, batch=True)
def predict(self, inputs: pd.DataFrame) -> List[str]:
model = self.artifacts.model
pred_token_ids = self.tokenize(inputs)
res = model(pred_token_ids).numpy().argmax(axis=-1)
return [CLASSES[i] for i in res]
Overwriting bentoml_service.py
from bentoml_service import Service
bento_svc = Service()
bento_svc.pack("model", model)
bento_svc.pack("tokenizer", tokenizer)
saved_path = bento_svc.save()
[2020-07-28 15:11:02,575] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle. WARNING:tensorflow:Skipping full serialization of Keras layer <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7fd96c59ab70>, because it is not built. WARNING:tensorflow:From /opt/anaconda3/envs/bentoml-dev-py36/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1786: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version. Instructions for updating: If using Keras pass *_constraint arguments to layers. INFO:tensorflow:Assets written to: /tmp/bentoml-temp-r9yxj9ku/Service/artifacts/model_saved_model/assets [2020-07-28 15:12:13,764] INFO - Detect BentoML installed in development model, copying local BentoML module file to target saved bundle path running sdist running egg_info writing BentoML.egg-info/PKG-INFO writing dependency_links to BentoML.egg-info/dependency_links.txt writing entry points to BentoML.egg-info/entry_points.txt writing requirements to BentoML.egg-info/requires.txt writing top-level names to BentoML.egg-info/top_level.txt reading manifest file 'BentoML.egg-info/SOURCES.txt' reading manifest template 'MANIFEST.in'
warning: no previously-included files matching '*~' found anywhere in distribution warning: no previously-included files matching '*.pyo' found anywhere in distribution warning: no previously-included files matching '.git' found anywhere in distribution warning: no previously-included files matching '.ipynb_checkpoints' found anywhere in distribution warning: no previously-included files matching '__pycache__' found anywhere in distribution warning: no directories found matching 'bentoml/server/static' warning: no directories found matching 'bentoml/yatai/web/dist' no previously-included directories found matching 'e2e_tests' no previously-included directories found matching 'tests' no previously-included directories found matching 'benchmark'
writing manifest file 'BentoML.egg-info/SOURCES.txt' running check creating BentoML-0.8.3+42.gb8d36b6 creating BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info creating BentoML-0.8.3+42.gb8d36b6/bentoml creating BentoML-0.8.3+42.gb8d36b6/bentoml/adapters creating BentoML-0.8.3+42.gb8d36b6/bentoml/artifact creating BentoML-0.8.3+42.gb8d36b6/bentoml/cli creating BentoML-0.8.3+42.gb8d36b6/bentoml/clipper creating BentoML-0.8.3+42.gb8d36b6/bentoml/configuration creating BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ creating BentoML-0.8.3+42.gb8d36b6/bentoml/handlers creating BentoML-0.8.3+42.gb8d36b6/bentoml/marshal creating BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle creating BentoML-0.8.3+42.gb8d36b6/bentoml/server creating BentoML-0.8.3+42.gb8d36b6/bentoml/utils creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/__pycache__ creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions/__pycache__ creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository creating BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator copying files to BentoML-0.8.3+42.gb8d36b6... copying LICENSE -> BentoML-0.8.3+42.gb8d36b6 copying MANIFEST.in -> BentoML-0.8.3+42.gb8d36b6 copying README.md -> BentoML-0.8.3+42.gb8d36b6 copying pyproject.toml -> BentoML-0.8.3+42.gb8d36b6 copying setup.cfg -> BentoML-0.8.3+42.gb8d36b6 copying setup.py -> BentoML-0.8.3+42.gb8d36b6 copying versioneer.py -> BentoML-0.8.3+42.gb8d36b6 copying BentoML.egg-info/PKG-INFO -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying BentoML.egg-info/SOURCES.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying BentoML.egg-info/dependency_links.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying BentoML.egg-info/entry_points.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying BentoML.egg-info/requires.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying BentoML.egg-info/top_level.txt -> BentoML-0.8.3+42.gb8d36b6/BentoML.egg-info copying bentoml/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml copying bentoml/_version.py -> BentoML-0.8.3+42.gb8d36b6/bentoml copying bentoml/exceptions.py -> BentoML-0.8.3+42.gb8d36b6/bentoml copying bentoml/service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml copying bentoml/service_env.py -> BentoML-0.8.3+42.gb8d36b6/bentoml copying bentoml/adapters/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/base_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/base_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/clipper_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/dataframe_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/dataframe_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/default_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/fastai_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/file_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/json_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/json_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/legacy_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/legacy_json_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/multi_image_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/pytorch_tensor_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/tensorflow_tensor_input.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/tensorflow_tensor_output.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/adapters/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/adapters copying bentoml/artifact/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/fastai2_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/fastai_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/fasttext_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/h2o_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/json_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/keras_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/lightgbm_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/onnx_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/pickle_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/pytorch_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/sklearn_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/spacy_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/text_file_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/tf_savedmodel_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/artifact/xgboost_model_artifact.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/artifact copying bentoml/cli/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/aws_lambda.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/aws_sagemaker.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/azure_functions.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/bento_management.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/bento_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/click_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/deployment.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/cli/yatai_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/cli copying bentoml/clipper/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/clipper copying bentoml/configuration/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration copying bentoml/configuration/configparser.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration copying bentoml/configuration/default_bentoml.cfg -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration copying bentoml/configuration/__pycache__/__init__.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/configuration/__pycache__/__init__.cpython-37.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/configuration/__pycache__/__init__.cpython-38.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/configuration/__pycache__/configparser.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/configuration/__pycache__/configparser.cpython-37.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/configuration/__pycache__/configparser.cpython-38.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/configuration/__pycache__ copying bentoml/handlers/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/handlers copying bentoml/marshal/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal copying bentoml/marshal/dispatcher.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal copying bentoml/marshal/marshal.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal copying bentoml/marshal/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/marshal copying bentoml/saved_bundle/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/bentoml-init.sh -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/bundler.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/docker-entrypoint.sh -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/loader.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/pip_pkg.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/py_module_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/saved_bundle/templates.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/saved_bundle copying bentoml/server/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/api_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/gunicorn_config.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/gunicorn_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/instruments.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/marshal_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/open_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/trace.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/server/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/server copying bentoml/utils/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/alg.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/benchmark.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/cloudpickle.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/dataframe_util.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/flask_ngrok.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/hybridmethod.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/lazy_loader.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/log.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/s3.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/tempdir.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/utils/usage_stats.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/utils copying bentoml/yatai/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/alembic.ini -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/db.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/deployment_utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/status.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/yatai_service.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/yatai_service_impl.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai copying bentoml/yatai/client/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client copying bentoml/yatai/client/bento_repository_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client copying bentoml/yatai/client/deployment_api.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/client copying bentoml/yatai/deployment/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment copying bentoml/yatai/deployment/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment copying bentoml/yatai/deployment/store.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment copying bentoml/yatai/deployment/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment copying bentoml/yatai/deployment/aws_lambda/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda copying bentoml/yatai/deployment/aws_lambda/download_extra_resources.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda copying bentoml/yatai/deployment/aws_lambda/lambda_app.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda copying bentoml/yatai/deployment/aws_lambda/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda copying bentoml/yatai/deployment/aws_lambda/utils.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/aws_lambda copying bentoml/yatai/deployment/azure_functions/Dockerfile -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/app_init.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/constants.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/host.json -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/local.settings.json -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/azure_functions/templates.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/azure_functions copying bentoml/yatai/deployment/sagemaker/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/deployment/sagemaker/model_server.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/deployment/sagemaker/nginx.conf -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/deployment/sagemaker/operator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/deployment/sagemaker/serve -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/deployment/sagemaker/wsgi.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/deployment/sagemaker copying bentoml/yatai/migrations/README -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations copying bentoml/yatai/migrations/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations copying bentoml/yatai/migrations/env.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations copying bentoml/yatai/migrations/script.py.mako -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations copying bentoml/yatai/migrations/__pycache__/env.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/__pycache__ copying bentoml/yatai/migrations/versions/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions copying bentoml/yatai/migrations/versions/a6b00ae45279_add_last_updated_at_for_deployments.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions copying bentoml/yatai/migrations/versions/__pycache__/a6b00ae45279_add_last_updated_at_for_deployments.cpython-36.pyc -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/migrations/versions/__pycache__ copying bentoml/yatai/proto/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/proto/deployment_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/proto/repository_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/proto/status_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/proto/yatai_service_pb2.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/proto/yatai_service_pb2_grpc.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/proto copying bentoml/yatai/repository/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/repository/base_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/repository/local_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/repository/metadata_store.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/repository/repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/repository/s3_repository.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/repository copying bentoml/yatai/validator/__init__.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator copying bentoml/yatai/validator/deployment_pb_validator.py -> BentoML-0.8.3+42.gb8d36b6/bentoml/yatai/validator Writing BentoML-0.8.3+42.gb8d36b6/setup.cfg UPDATING BentoML-0.8.3+42.gb8d36b6/bentoml/_version.py set BentoML-0.8.3+42.gb8d36b6/bentoml/_version.py to '0.8.3+42.gb8d36b6' Creating tar archive removing 'BentoML-0.8.3+42.gb8d36b6' (and everything under it) [2020-07-28 15:12:18,720] INFO - BentoService bundle 'Service:20200728151102_B3E065' saved to: /home/bentoml/bentoml/repository/Service/20200728151102_B3E065
print(saved_path)
/home/bentoml/bentoml/repository/Service/20200728151102_B3E065
To start a REST API model server with the BentoService saved above, use the bentoml serve command:
Since BERT is a large model, if you met OOM bellow,
you may need to restart this kernel to release the RAM/GRAM used by training model.
bentoml_bundle_path = '/home/bentoml/bentoml/repository/Service/20200728151102_B3E065' # saved_path
# Option 1: serve directly
print(f"bentoml serve-gunicorn {bentoml_bundle_path} --port 5000 --enable-microbatch --workers 1")
!bentoml serve-gunicorn {bentoml_bundle_path} --port 5000 --enable-microbatch --workers 1
bentoml serve-gunicorn /home/bentoml/bentoml/repository/Service/20200728151102_B3E065 --port 5000 --enable-microbatch --workers 1 [2020-07-28 15:14:35,761] INFO - Starting BentoML API server in production mode.. [2020-07-28 15:14:36,563] INFO - Running micro batch service on :5000 [2020-07-28 15:14:36 +0800] [2953201] [INFO] Starting gunicorn 20.0.4 [2020-07-28 15:14:36 +0800] [2952697] [INFO] Starting gunicorn 20.0.4 [2020-07-28 15:14:36 +0800] [2952697] [INFO] Listening at: http://0.0.0.0:60577 (2952697) [2020-07-28 15:14:36 +0800] [2953201] [INFO] Listening at: http://0.0.0.0:5000 (2953201) [2020-07-28 15:14:36 +0800] [2952697] [INFO] Using worker: sync [2020-07-28 15:14:36 +0800] [2953201] [INFO] Using worker: aiohttp.worker.GunicornWebWorker [2020-07-28 15:14:36 +0800] [2953203] [INFO] Booting worker with pid: 2953203 [2020-07-28 15:14:36 +0800] [2953202] [INFO] Booting worker with pid: 2953202 [2020-07-28 15:14:36,613] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle. [2020-07-28 15:14:36,631] WARNING - Saved BentoService bundle version mismatch: loading BentoService bundle create with BentoML version 0.8.3, but loading from BentoML version 0.8.3+42.gb8d36b6 [2020-07-28 15:14:36,874] INFO - Micro batch enabled for API `predict` [2020-07-28 15:14:36,874] INFO - Your system nofile limit is 10000, which means each instance of microbatch service is able to hold this number of connections at same time. You can increase the number of file descriptors for the server process, or launch more microbatch instances to accept more concurrent connection. [2020-07-28 15:14:37,659] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle. [2020-07-28 15:14:37,697] WARNING - Saved BentoService bundle version mismatch: loading BentoService bundle create with BentoML version 0.8.3, but loading from BentoML version 0.8.3+42.gb8d36b6 2020-07-28 15:14:39.628530: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 2020-07-28 15:14:39.636462: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-07-28 15:14:39.636718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: pciBusID: 0000:01:00.0 name: GeForce GTX 1060 computeCapability: 6.1 coreClock: 1.6705GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s 2020-07-28 15:14:39.637038: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1 2020-07-28 15:14:39.639381: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10 2020-07-28 15:14:39.641536: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10 2020-07-28 15:14:39.642546: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10 2020-07-28 15:14:39.644930: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10 2020-07-28 15:14:39.646255: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10 2020-07-28 15:14:39.650387: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7 2020-07-28 15:14:39.650598: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-07-28 15:14:39.651086: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-07-28 15:14:39.651293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1697] Adding visible gpu devices: 0 2020-07-28 15:14:40.990800: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA 2020-07-28 15:14:41.013344: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2699905000 Hz 2020-07-28 15:14:41.013808: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b080b80810 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2020-07-28 15:14:41.013856: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2020-07-28 15:14:41.013969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1096] Device interconnect StreamExecutor with strength 1 edge matrix: 2020-07-28 15:14:41.013981: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102] 2020-07-28 15:14:41.170231: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-07-28 15:14:41.170580: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b080be4f10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2020-07-28 15:14:41.170601: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): GeForce GTX 1060, Compute Capability 6.1 ^C [2020-07-28 15:14:49 +0800] [2953201] [INFO] Handling signal: int [2020-07-28 15:14:49 +0800] [2952697] [INFO] Handling signal: int [2020-07-28 15:14:49 +0800] [2953202] [INFO] Worker exiting (pid: 2953202) [2020-07-28 15:14:49 +0800] [2953201] [INFO] Shutting down: Master [2020-07-28 15:14:49 +0800] [2953203] [INFO] Worker exiting (pid: 2953203)
If you are running this notebook from Google Colab, you can start the dev server with --run-with-ngrok
option, to gain acccess to the API endpoint via a public endpoint managed by ngrok:
!bentoml serve BertService --run-with-ngrok
Open http://127.0.0.1:5000 to see more information about the REST APIs server in your browser.
Navigate to parent directory of the notebook(so you have reference to the test.jpg
image), and run the following curl
command to send the image to REST API server and get a prediction result:
curl -i \
--request POST \
--header "Content-Type: application/json" \
--data '{"0":{"0":"The acting was a bit lacking."}}' \
localhost:5000/predict
%%time
import requests
import pandas as pd
server_url = f"http://127.0.0.1:5000/predict"
method = "POST"
headers = {"content-type": "application/json"}
pred_sentences = ["The acting was a bit lacking."]
data = pd.DataFrame(pred_sentences).to_json()
r = requests.request(method, server_url, headers=headers, data=data)
print(r.content)
b'["negative"]' CPU times: user 2.36 ms, sys: 3.26 ms, total: 5.62 ms Wall time: 29.1 s
# Option 2: serve in docker
!cd {bentoml_bundle_path}
IMG_NAME = bentoml_bundle_path.split('/')[-1].lower()
!docker build --quiet -t {IMG_NAME} {bentoml_bundle_path}
# launch docker instances
!docker run -itd -p 5000:5000 {IMG_NAME}:latest --workers 1 --enable-microbatch
One common way of distributing this model API server for production deployment, is via Docker containers. And BentoML provides a convenient way to do that.
Note that docker is not available in Google Colab. You will need to download and run this notebook locally to try out this containerization with docker feature.
If you already have docker configured, simply run the follow command to product a docker container serving the IrisClassifier prediction service created above:
!bentoml containerize BertService:latest
!docker run -p 5000:5000 bertservice
bentoml.load is the API for loading a BentoML packaged model in python:
from bentoml import load
service = load(saved_path)
print(service.predict([["The acting was a bit lacking."]]))
BentoML cli supports loading and running a packaged model from CLI. With the DataframeInput adapter, the CLI command supports reading input Dataframe data from CLI argument or local csv or json files:
!bentoml run BertService:latest predict --input '{"0":{"0":"The acting was a bit lacking."}}'
If you are at a small team with limited engineering or DevOps resources, try out automated deployment with BentoML CLI, currently supporting AWS Lambda, AWS SageMaker, and Azure Functions:
If the cloud platform you are working with is not on the list above, try out these step-by-step guide on manually deploying BentoML packaged model to cloud platforms:
Lastly, if you have a DevOps or ML Engineering team who's operating a Kubernetes or OpenShift cluster, use the following guides as references for implementating your deployment strategy: