#!/usr/bin/env python # coding: utf-8 # # BentoML Example: ONNX GPU Serving # # BentoML makes moving trained ML models to production easy: # # Package models trained with any ML framework and reproduce them for model serving in production # Deploy anywhere for online API serving or offline batch serving # High-Performance API model server with adaptive micro-batching support # Central hub for managing models and deployment process via Web UI and APIs # Modular and flexible design making it adaptable to your infrastructure # # BentoML is a framework for serving, managing, and deploying machine learning models. It aims to bridge the gap between Data Science and DevOps, and enable teams to deliver prediction services in a fast, repeatable, and scalable way. Before reading this example project, be sure to check out the Getting started guide to learn about the basic concepts in BentoML. # # This notebook demonstrates how to export your PyTorch model to ONNX and serve it with BentoML, building a Docker image that has GPU support. Please refer to the [GPU Serving guide](https://docs.bentoml.org/en/latest/guides/gpu_serving.html) for more information. # # This is an extension of [BentoML's PyTorch with GPU Serving](https://github.com/bentoml/gallery/blob/master/pytorch/news-classification-gpu/news-classification.ipynb). Please refer to that tutorial before going forward. 
1\n return text_pipeline, label_pipeline\n\ndef to_numpy(tensor):\n return tensor.detach().cpu().clone().numpy() if tensor.requires_grad else tensor.cpu().clone().numpy()\n\n\n@env(infer_pip_packages=False, pip_packages=[\'onnxruntime-gpu\'], requirements_txt_file="./requirements.txt", docker_base_image="bentoml/model-server:0.12.1-py38-gpu")\n@artifacts(\n [OnnxModelArtifact(\'model\', backend=\'onnxruntime-gpu\'), PickleArtifact(\'tokenizer\'), PickleArtifact(\'vocab\')])\nclass OnnxService(BentoService):\n def __init__(self):\n super().__init__()\n self.news_label = {1: \'World\',\n 2: \'Sports\',\n 3: \'Business\',\n 4: \'Sci/Tec\'}\n\n def classify_categories(self, sentence):\n text_pipeline, _ = get_pipeline(self.artifacts.tokenizer, self.artifacts.vocab)\n text = to_numpy(torch.tensor(text_pipeline(sentence)).to(device))\n tensor_name = self.artifacts.model.get_inputs()[0].name\n output_name = self.artifacts.model.get_outputs()[0].name\n onnx_inputs = {tensor_name: text}\n print(f\'providers: {self.artifacts.model.get_providers()}\')\n\n try:\n r = self.artifacts.model.run([output_name], onnx_inputs)[0]\n return r.argmax(1).item() + 1\n except (RuntimeError, InvalidArgument) as e:\n print(f"ERROR with shape: {onnx_inputs[tensor_name].shape} - {e}")\n\n @api(input=JsonInput(), output=JsonOutput())\n def predict(self, parsed_json):\n sentence = parsed_json.get(\'text\')\n return {\'categories\': self.news_label[self.classify_categories(sentence)]}\n') # ## Pack our BentoService # In[7]: from bento_svc import OnnxService device = torch.device("cuda" if torch.cuda.is_available() else "cpu") onnx_model_path = "model/pytorch_model.onnx" tokenizer, vocab = get_tokenizer_vocab() vocab_size, embedding_size, num_class = get_model_params(vocab) model = TextClassificationModel(vocab_size, embedding_size, num_class).to(device) model.load_state_dict(torch.load("model/pytorch_model.pt")) model.eval() # convert our dummy inputs to torch.cuda.LongTensor print("\nExporting 
torch model to onnx...") inp = torch.rand(vocab_size).long().to(device) # set our dynamic_axes to vocab_size since our inputs for news piece can vary. Users have to make sure that variables name in dynamic_axes match our dummy ones # e.g: since we define our vocab_size as our size for dummy inputs, dynamic_axes parameters have to follow as shown below. torch.onnx.export(model, inp, onnx_model_path, export_params=True, opset_version=11, do_constant_folding=True, input_names=["input"], output_names=["output"], dynamic_axes={"input": {0: "vocab_size"}, "output": {0: "vocab_size"}}) # In[8]: print("\n Loading model to check...") onnx_model = onnx.load(onnx_model_path) onnx.checker.check_model(onnx_model) # check will returns nothing if our ONNX model is valid. bento_svc = OnnxService() bento_svc.pack("model", onnx_model_path) bento_svc.pack("tokenizer", tokenizer) bento_svc.pack("vocab", vocab) saved_path = bento_svc.save() # ## REST API Model Serving # # To start a REST API model server with the BentoService save above, use the `serve` command: # In[9]: get_ipython().system('bentoml serve OnnxService:latest') # In[10]: get_ipython().system('nvidia-smi') # If you are running this notebook from Google Colab, start the dev server with `--run-with-ngrok` option to gain access to the API endpoint via a public endpoint managed by [ngrok](https://ngrok.com/): # In[ ]: get_ipython().system('bentoml serve PyTorchFashionClassifier:latest --run-with-ngrok') # ## Containerize our model server with Docker # # One common way of distributing this model API server for production deployment, is via Docker containers. And BentoML provides a convenient way to do that. # # Note that docker is not available in Google Colab. You will need to download and run this notebook locally to try out this containerization with docker feature. 
# # If you already have docker configured, simply run the following command to produce a Docker container serving the OnnxService GPU prediction service created above: # In[11]: get_ipython().system('bentoml containerize OnnxService:latest -t onnx-service-gpu:latest') # In[12]: get_ipython().system('docker run --gpus all --device /dev/nvidia0 --device /dev/nvidiactl --device /dev/nvidia-modeset --device /dev/nvidia-uvm --device /dev/nvidia-uvm-tools -p 5000:5000 onnx-service-gpu') # ## Deployment Options # # If you are on a small team with limited engineering or DevOps resources, try out automated deployment with BentoML CLI, currently supporting AWS Lambda, AWS SageMaker, and Azure Functions: # - [AWS Lambda Deployment Guide](https://docs.bentoml.org/en/latest/deployment/aws_lambda.html) # - [AWS SageMaker Deployment Guide](https://docs.bentoml.org/en/latest/deployment/aws_sagemaker.html) # - [Azure Functions Deployment Guide](https://docs.bentoml.org/en/latest/deployment/azure_functions.html) # # If the cloud platform you are working with is not on the list above, try out these step-by-step guides on manually deploying a BentoML packaged model to cloud platforms: # - [AWS ECS Deployment](https://docs.bentoml.org/en/latest/deployment/aws_ecs.html) # - [Google Cloud Run Deployment](https://docs.bentoml.org/en/latest/deployment/google_cloud_run.html) # - [Azure container instance Deployment](https://docs.bentoml.org/en/latest/deployment/azure_container_instance.html) # - [Heroku Deployment](https://docs.bentoml.org/en/latest/deployment/heroku.html) # # Lastly, if you have a DevOps or ML Engineering team that operates a Kubernetes or OpenShift cluster, use the following guides as references for implementing your deployment strategy: # - [Kubernetes Deployment](https://docs.bentoml.org/en/latest/deployment/kubernetes.html) # - [Knative Deployment](https://docs.bentoml.org/en/latest/deployment/knative.html) # - [Kubeflow 
Deployment](https://docs.bentoml.org/en/latest/deployment/kubeflow.html) # - [KFServing Deployment](https://docs.bentoml.org/en/latest/deployment/kfserving.html) # - [Clipper.ai Deployment Guide](https://docs.bentoml.org/en/latest/deployment/clipper.html) # In[ ]: