#!/usr/bin/env python
# coding: utf-8

# # BentoML Example: ONNX GPU Serving
# 
# BentoML makes moving trained ML models to production easy:
# 
# * Package models trained with any ML framework and reproduce them for model serving in production
# * Deploy anywhere for online API serving or offline batch serving
# * High-performance API model server with adaptive micro-batching support
# * Central hub for managing models and the deployment process via Web UI and APIs
# * Modular and flexible design, making it adaptable to your infrastructure
# 
# BentoML is a framework for serving, managing, and deploying machine learning models. It aims to bridge the gap between Data Science and DevOps, and to enable teams to deliver prediction services in a fast, repeatable, and scalable way. Before reading this example project, be sure to check out the Getting Started guide to learn about the basic concepts in BentoML.
# 
# This notebook demonstrates how to export your PyTorch model to ONNX and serve it with BentoML, building a Docker image with GPU support. Please refer to the [GPU Serving guide](https://docs.bentoml.org/en/latest/guides/gpu_serving.html) for more information.
# 
# This is an extension of [BentoML's PyTorch with GPU Serving](https://github.com/bentoml/gallery/blob/master/pytorch/news-classification-gpu/news-classification.ipynb). Please go through that tutorial before moving forward.

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


get_ipython().system('pip install -q bentoml torch==1.8.1+cu111 torchtext==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html onnxruntime-gpu onnx')


# In[3]:


get_ipython().system('cp -r ../../pytorch/news-classification-gpu/model/ .')


# In[4]:


import torch
from torch import nn
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

from bentoml import BentoService, api, env, artifacts
from bentoml.adapters import JsonInput, JsonOutput
from bentoml.frameworks.onnx import OnnxModelArtifact
from bentoml.service.artifacts.pickle import PickleArtifact

import onnx
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument


# ## Convert our PyTorch model to ONNX format
# 
# We need to define our PyTorch model and some helper functions; refer to [BentoML's PyTorch with GPU Serving](https://github.com/bentoml/gallery/blob/master/pytorch/news-classification-gpu/news-classification.ipynb) for details.

# In[5]:


# https://www.onnxruntime.ai/python/auto_examples/plot_common_errors.html
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMSIZE = 64


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.offsets = torch.tensor([0]).to(device)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

    def forward(self, text):
        embedded = self.embedding(text, offsets=self.offsets)
        return self.fc(embedded)


def get_tokenizer_vocab(dataset=AG_NEWS, tokenizer_fn='basic_english', root_data_dir='dataset'):
    print('Getting tokenizer and vocab...')
    tokenizer = get_tokenizer(tokenizer_fn)
    train_ = dataset(root=root_data_dir, split='train')
    counter = Counter()
    for (label, line) in train_:
        counter.update(tokenizer(line))
    vocab = Vocab(counter, min_freq=1)
    return tokenizer, vocab


def get_model_params(vocab):
    print('Setup model params...')
    train_iter = AG_NEWS(root='dataset', split='train')
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    return vocab_size, EMSIZE, num_class
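# As an optional sanity check (not part of the original walkthrough; the sample headline below is
# made up), we can encode a sentence with the tokenizer/vocab pair returned by
# `get_tokenizer_vocab` and confirm that the model will receive a 1-D `LongTensor` of token indices:

# In[ ]:


sample_tokenizer, sample_vocab = get_tokenizer_vocab()
sample_text = "Stocks rally as markets shrug off rate fears"
sample_ids = torch.tensor([sample_vocab[token] for token in sample_tokenizer(sample_text)],
                          dtype=torch.int64)
print(sample_ids.shape, sample_ids.dtype)  # one index per token, dtype torch.int64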
# ## Define our BentoService
# 
# Please refer to our [GPU Serving guide](https://docs.bentoml.org/en/latest/guides/gpu_serving.html) to set up your environment correctly.
# 
# We will be using the Docker image provided by *BentoML*, `bentoml/model-server:0.12.1-py38-gpu`, to prepare our CUDA-enabled image.
# 
# Since `onnxruntime.InferenceSession` only accepts numpy arrays as inputs (see the [ONNX Runtime API](https://www.onnxruntime.ai/python/api_summary) for more information), we need to convert our `torch.Tensor` to a numpy array with `to_numpy` below. `.detach()` is called first so that the conversion also works for tensors with `requires_grad=True`.

# In[6]:


get_ipython().run_cell_magic('writefile', 'bento_svc.py', '\nimport torch\nfrom bentoml import BentoService, api, env, artifacts\nfrom bentoml.adapters import JsonInput, JsonOutput\nfrom bentoml.frameworks.onnx import OnnxModelArtifact\nfrom bentoml.service.artifacts.pickle import PickleArtifact\nfrom onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument\n\n\ndevice = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")\n\n\ndef get_pipeline(tokenizer, vocab):\n    print(\'Setup pipeline...\')\n    text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]\n    label_pipeline = lambda x: int(x) - 1\n    return text_pipeline, label_pipeline\n\n\ndef to_numpy(tensor):\n    return tensor.detach().cpu().clone().numpy() if tensor.requires_grad else tensor.cpu().clone().numpy()\n\n\n@env(infer_pip_packages=False, pip_packages=[\'onnxruntime-gpu\'], requirements_txt_file="./requirements.txt", docker_base_image="bentoml/model-server:0.12.1-py38-gpu")\n@artifacts(\n    [OnnxModelArtifact(\'model\', backend=\'onnxruntime-gpu\'), PickleArtifact(\'tokenizer\'), PickleArtifact(\'vocab\')])\nclass OnnxService(BentoService):\n    def __init__(self):\n        super().__init__()\n        self.news_label = {1: \'World\',\n                           2: \'Sports\',\n                           3: \'Business\',\n                           4: \'Sci/Tec\'}\n\n    def classify_categories(self, sentence):\n        text_pipeline, _ = get_pipeline(self.artifacts.tokenizer, self.artifacts.vocab)\n        text = to_numpy(torch.tensor(text_pipeline(sentence)).to(device))\n        tensor_name = self.artifacts.model.get_inputs()[0].name\n        output_name = self.artifacts.model.get_outputs()[0].name\n        onnx_inputs = {tensor_name: text}\n        print(f\'providers: {self.artifacts.model.get_providers()}\')\n\n        try:\n            r = self.artifacts.model.run([output_name], onnx_inputs)[0]\n            return r.argmax(1).item() + 1\n        except (RuntimeError, InvalidArgument) as e:\n            print(f"ERROR with shape: {onnx_inputs[tensor_name].shape} - {e}")\n\n    @api(input=JsonInput(), output=JsonOutput())\n    def predict(self, parsed_json):\n        sentence = parsed_json.get(\'text\')\n        return {\'categories\': self.news_label[self.classify_categories(sentence)]}\n')


# ## Pack our BentoService

# In[7]:


from bento_svc import OnnxService

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
onnx_model_path = "model/pytorch_model.onnx"

tokenizer, vocab = get_tokenizer_vocab()
vocab_size, embedding_size, num_class = get_model_params(vocab)

model = TextClassificationModel(vocab_size, embedding_size, num_class).to(device)
model.load_state_dict(torch.load("model/pytorch_model.pt"))
model.eval()
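
# (Optional sanity check, not part of the original flow.) Run the eager model once on a
# random batch of token ids to confirm it executes on `device` and yields a single row of
# class logits before we export it to ONNX:
with torch.no_grad():
    dummy_check = torch.randint(0, vocab_size, (32,), dtype=torch.int64).to(device)
    print(model(dummy_check).shape)  # one row of logits: torch.Size([1, num_class])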
# Convert our dummy inputs to torch.cuda.LongTensor.
print("\nExporting torch model to onnx...")
inp = torch.rand(vocab_size).long().to(device)

# Set dynamic_axes to vocab_size since the length of a news piece can vary. Make sure the axis
# names in dynamic_axes match the ones used for the dummy inputs: since vocab_size is the size
# of our dummy input, the dynamic_axes parameters follow it as shown below.
torch.onnx.export(model, inp, onnx_model_path,
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=["input"],
                  output_names=["output"],
                  dynamic_axes={"input": {0: "vocab_size"}, "output": {0: "vocab_size"}})


# In[8]:


print("\nLoading model to check...")
onnx_model = onnx.load(onnx_model_path)

# check_model returns nothing if our ONNX model is valid.
onnx.checker.check_model(onnx_model)

bento_svc = OnnxService()
bento_svc.pack("model", onnx_model_path)
bento_svc.pack("tokenizer", tokenizer)
bento_svc.pack("vocab", vocab)
saved_path = bento_svc.save()


# ## REST API Model Serving
# 
# To start a REST API model server with the BentoService saved above, use the `serve` command:

# In[9]:


get_ipython().system('bentoml serve OnnxService:latest')


# In[10]:


get_ipython().system('nvidia-smi')


# If you are running this notebook from Google Colab, start the dev server with the `--run-with-ngrok` option to gain access to the API endpoint via a public endpoint managed by [ngrok](https://ngrok.com/):

# In[ ]:


get_ipython().system('bentoml serve OnnxService:latest --run-with-ngrok')
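# While the dev server above is running, the `predict` endpoint (exposed under the API name on
# BentoML's default port 5000) can be exercised from another process. A minimal sketch using
# `requests` (the headline below is made up):

# In[ ]:


import requests

response = requests.post(
    "http://localhost:5000/predict",
    json={"text": "Wall Street rallies as tech earnings beat expectations"},
)
print(response.json())  # {'categories': '<predicted news label>'}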
# ## Containerize our model server with Docker
# 
# One common way of distributing this model API server for production deployment is via Docker containers, and BentoML provides a convenient way to do that.
# 
# Note that Docker is not available in Google Colab. You will need to download and run this notebook locally to try out this containerization feature.
# 
# If you already have Docker configured, simply run the following command to produce a Docker container serving the OnnxService GPU prediction service created above:

# In[11]:


get_ipython().system('bentoml containerize OnnxService:latest -t onnx-service-gpu:latest')


# In[12]:


get_ipython().system('docker run --gpus all --device /dev/nvidia0 --device /dev/nvidiactl --device /dev/nvidia-modeset --device /dev/nvidia-uvm --device /dev/nvidia-uvm-tools -p 5000:5000 onnx-service-gpu')


# ## Deployment Options
# 
# If you are on a small team with limited engineering or DevOps resources, try out automated deployment with the BentoML CLI, currently supporting AWS Lambda, AWS SageMaker, and Azure Functions:
# - [AWS Lambda Deployment Guide](https://docs.bentoml.org/en/latest/deployment/aws_lambda.html)
# - [AWS SageMaker Deployment Guide](https://docs.bentoml.org/en/latest/deployment/aws_sagemaker.html)
# - [Azure Functions Deployment Guide](https://docs.bentoml.org/en/latest/deployment/azure_functions.html)
# 
# If the cloud platform you are working with is not on the list above, try out these step-by-step guides on manually deploying a BentoML packaged model to cloud platforms:
# - [AWS ECS Deployment](https://docs.bentoml.org/en/latest/deployment/aws_ecs.html)
# - [Google Cloud Run Deployment](https://docs.bentoml.org/en/latest/deployment/google_cloud_run.html)
# - [Azure container instance Deployment](https://docs.bentoml.org/en/latest/deployment/azure_container_instance.html)
# - [Heroku Deployment](https://docs.bentoml.org/en/latest/deployment/heroku.html)
# 
# Lastly, if you have a DevOps or ML Engineering team operating a Kubernetes or OpenShift cluster, use the following guides as references for implementing your deployment strategy:
# - [Kubernetes Deployment](https://docs.bentoml.org/en/latest/deployment/kubernetes.html)
# - [Knative Deployment](https://docs.bentoml.org/en/latest/deployment/knative.html)
# - [Kubeflow Deployment](https://docs.bentoml.org/en/latest/deployment/kubeflow.html)
# - [KFServing Deployment](https://docs.bentoml.org/en/latest/deployment/kfserving.html)
# - [Clipper.ai Deployment Guide](https://docs.bentoml.org/en/latest/deployment/clipper.html)

# In[ ]:
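# Finally, if the containerized service above reports `CPUExecutionProvider` instead of
# `CUDAExecutionProvider` in its logs, it is worth confirming that Docker itself can see the GPU.
# A common quick check (the CUDA base image tag below is only an example; any CUDA-enabled image
# works) is:

# In[ ]:


get_ipython().system('docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi')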