Add notebook for ONNX export with MXNet 1.3 (aws#454)

laurenyu · web-flow · commit 8b4403f5e1c0 · 2018-11-07T15:14:41.000-08:00
diff --git a/sagemaker-python-sdk/mxnet_onnx_export/mnist.py b/sagemaker-python-sdk/mxnet_onnx_export/mnist.py
@@ -0,0 +1,128 @@
+import argparse
+import gzip
+import json
+import logging
+import os
+import tempfile
+import shutil
+import struct
+
+import mxnet as mx
+from mxnet.contrib import onnx as onnx_mxnet
+import numpy as np
+
+from sagemaker_mxnet_container.training_utils import scheduler_host
+
+
+def load_data(path):
+    with gzip.open(find_file(path, "labels.gz")) as flbl:
+        struct.unpack(">II", flbl.read(8))
+        labels = np.fromstring(flbl.read(), dtype=np.int8)
+    with gzip.open(find_file(path, "images.gz")) as fimg:
+        _, _, rows, cols = struct.unpack(">IIII", fimg.read(16))
+        images = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(labels), rows, cols)
+        images = images.reshape(images.shape[0], 1, 28, 28).astype(np.float32) / 255
+    return labels, images
+
+
+def find_file(root_path, file_name):
+    for root, dirs, files in os.walk(root_path):
+        if file_name in files:
+            return os.path.join(root, file_name)
+
+
+def build_graph():
+    data = mx.sym.var('data')
+    data = mx.sym.flatten(data=data)
+    fc1 = mx.sym.FullyConnected(data=data, num_hidden=128)
+    act1 = mx.sym.Activation(data=fc1, act_type="relu")
+    fc2 = mx.sym.FullyConnected(data=act1, num_hidden=64)
+    act2 = mx.sym.Activation(data=fc2, act_type="relu")
+    fc3 = mx.sym.FullyConnected(data=act2, num_hidden=10)
+    return mx.sym.SoftmaxOutput(data=fc3, name='softmax')
+
+
+def get_training_context(num_gpus):
+    if num_gpus:
+        return [mx.gpu(i) for i in range(num_gpus)]
+    else:
+        return mx.cpu()
+
+
+def train(batch_size, epochs, learning_rate, num_gpus, training_channel, testing_channel,
+          hosts, current_host, model_dir):
+    (train_labels, train_images) = load_data(training_channel)
+    (test_labels, test_images) = load_data(testing_channel)
+
+    # Data parallel training - shard the data so each host
+    # only trains on a subset of the total data.
+    shard_size = len(train_images) // len(hosts)
+    for i, host in enumerate(hosts):
+        if host == current_host:
+            start = shard_size * i
+            end = start + shard_size
+            break
+
+    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end], batch_size,
+                                   shuffle=True)
+    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)
+
+    logging.getLogger().setLevel(logging.DEBUG)
+
+    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'
+
+    mlp_model = mx.mod.Module(symbol=build_graph(),
+                              context=get_training_context(num_gpus))
+    mlp_model.fit(train_iter,
+                  eval_data=val_iter,
+                  kvstore=kvstore,
+                  optimizer='sgd',
+                  optimizer_params={'learning_rate': learning_rate},
+                  eval_metric='acc',
+                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
+                  num_epoch=epochs)
+
+    if current_host == scheduler_host(hosts):
+        save(model_dir, mlp_model)
+
+
+def save(model_dir, model):
+    tmp_dir = tempfile.mkdtemp()
+
+    symbol_file = os.path.join(tmp_dir, 'model-symbol.json')
+    params_file = os.path.join(tmp_dir, 'model-0000.params')
+
+    model.symbol.save(symbol_file)
+    model.save_params(params_file)
+
+    data_shapes = [[dim for dim in data_desc.shape] for data_desc in model.data_shapes]
+    output_path = os.path.join(model_dir, 'model.onnx')
+
+    onnx_mxnet.export_model(symbol_file, params_file, data_shapes, np.float32, output_path)
+
+    shutil.rmtree(tmp_dir)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--batch-size', type=int, default=100)
+    parser.add_argument('--epochs', type=int, default=10)
+    parser.add_argument('--learning-rate', type=float, default=0.1)
+
+    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
+    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
+    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
+
+    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
+    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
+
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    num_gpus = int(os.environ['SM_NUM_GPUS'])
+
+    train(args.batch_size, args.epochs, args.learning_rate, num_gpus, args.train, args.test,
+          args.hosts, args.current_host, args.model_dir)
diff --git a/sagemaker-python-sdk/mxnet_onnx_export/mxnet_onnx_export.ipynb b/sagemaker-python-sdk/mxnet_onnx_export/mxnet_onnx_export.ipynb
@@ -0,0 +1,175 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exporting ONNX Models with MXNet\n",
+    "\n",
+    "The [Open Neural Network Exchange](https://onnx.ai/) (ONNX) is an open format for representing deep learning models with its extensible computation graph model and definitions of built-in operators and standard data types. Starting with MXNet 1.3, models trained using MXNet can now be saved as ONNX models.\n",
+    "\n",
+    "In this example, we will show how to train a model on Amazon SageMaker and save it as an ONNX model. This notebook is based on the [MXNet MNIST notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/mxnet_mnist/mxnet_mnist.ipynb) and the [MXNet example for exporting to ONNX](https://mxnet.incubator.apache.org/tutorials/onnx/export_mxnet_to_onnx.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setup\n",
+    "\n",
+    "First we need to define a few variables that will be needed later in the example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import boto3\n",
+    "\n",
+    "from sagemaker import get_execution_role\n",
+    "from sagemaker.session import Session\n",
+    "\n",
+    "# AWS region\n",
+    "region = boto3.Session().region_name\n",
+    "\n",
+    "# S3 bucket for saving code and model artifacts.\n",
+    "# Feel free to specify a different bucket here if you wish.\n",
+    "bucket = Session().default_bucket()\n",
+    "\n",
+    "# Location to save your custom code in tar.gz format.\n",
+    "custom_code_upload_location = 's3://{}/customcode/mxnet'.format(bucket)\n",
+    "\n",
+    "# Location where results of model training are saved.\n",
+    "model_artifacts_location = 's3://{}/artifacts'.format(bucket)\n",
+    "\n",
+    "# IAM execution role that gives SageMaker access to resources in your AWS account.\n",
+    "# We can use the SageMaker Python SDK to get the role from our notebook environment. \n",
+    "role = get_execution_role()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The training script\n",
+    "\n",
+    "The ``mnist.py`` script provides all the code we need for training a SageMaker model and exporting it to ONNX. The script we will use is adapted from the Apache MXNet [MNIST tutorial](https://mxnet.incubator.apache.org/tutorials/python/mnist.html)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pygmentize mnist.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exporting to ONNX\n",
+    "\n",
+    "The important part of this script can be found in the `save` method. This is where the ONNX model is exported:\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "\n",
+    "from mxnet.contrib import onnx as onnx_mxnet\n",
+    "import numpy as np\n",
+    "\n",
+    "def save(model_dir, model):\n",
+    "    symbol_file = os.path.join(model_dir, 'model-symbol.json')\n",
+    "    params_file = os.path.join(model_dir, 'model-0000.params')\n",
+    "\n",
+    "    model.symbol.save(symbol_file)\n",
+    "    model.save_params(params_file)\n",
+    "\n",
+    "    data_shapes = [[dim for dim in data_desc.shape] for data_desc in model.data_shapes]\n",
+    "    output_path = os.path.join(model_dir, 'model.onnx')\n",
+    "    \n",
+    "    onnx_mxnet.export_model(symbol_file, params_file, data_shapes, np.float32, output_path)\n",
+    "```\n",
+    "\n",
+    "The last line in that method, `onnx_mxnet.export_model`, is what saves the model in the ONNX format. You can see that we pass the following arguments:\n",
+    "\n",
+    "* `symbol_file`: path to the saved input symbol file\n",
+    "* `params_file`: path to the saved input params file\n",
+    "* `data_shapes`: list of the input shapes\n",
+    "* `np.float32`: input data type\n",
+    "* `output_path`: path to save the generated ONNX file\n",
+    "\n",
+    "For more information, see the [MXNet Documentation](https://mxnet.incubator.apache.org/api/python/contrib/onnx.html#mxnet.contrib.onnx.mx2onnx.export_model.export_model)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Training the model\n",
+    "\n",
+    "With the training script written to export an ONNX model, the rest of the training process looks like any other Amazon SageMaker training job using MXNet. For a more in-depth explanation of these steps, see the [MXNet MNIST notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/mxnet_mnist/mxnet_mnist.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.mxnet import MXNet\n",
+    "\n",
+    "mnist_estimator = MXNet(entry_point='mnist.py',\n",
+    "                        role=role,\n",
+    "                        output_path=model_artifacts_location,\n",
+    "                        code_location=custom_code_upload_location,\n",
+    "                        train_instance_count=1,\n",
+    "                        train_instance_type='ml.m4.xlarge',\n",
+    "                        framework_version='1.3.0',\n",
+    "                        hyperparameters={'learning-rate': 0.1})\n",
+    "\n",
+    "train_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/train'.format(region)\n",
+    "test_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/test'.format(region)\n",
+    "\n",
+    "mnist_estimator.fit({'train': train_data_location, 'test': test_data_location})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Next steps\n",
+    "\n",
+    "Now that we have an ONNX model, we can deploy it to an endpoint in the same way we do in the [MXNet MNIST notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/mxnet_mnist/mxnet_mnist.ipynb).\n",
+    "\n",
+    "For examples on how to write a `model_fn` to load the ONNX model, please refer to:\n",
+    "* the [MXNet ONNX Super Resolution notebook](https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/mxnet_onnx_superresolution)\n",
+    "* the [MXNet documentation](https://mxnet.incubator.apache.org/api/python/contrib/onnx.html#mxnet.contrib.onnx.onnx2mx.import_model.import_model)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_mxnet_p36",
+   "language": "python",
+   "name": "conda_mxnet_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}