sync (#63) #64

Closed
wants to merge 1 commit into from
1 change: 1 addition & 0 deletions .gitignore
@@ -20,3 +20,4 @@ examples/tensorflow/distributed_mnist/data
 doc/_build
 **/.DS_Store
 venv/
+*~
189 changes: 110 additions & 79 deletions README.rst

Large diffs are not rendered by default.

126 changes: 126 additions & 0 deletions examples/cli/host/data/model.json
@@ -0,0 +1,126 @@
{
  "nodes": [
    {
      "op": "null",
      "name": "data",
      "inputs": []
    },
    {
      "op": "null",
      "name": "sequential0_dense0_weight",
      "attr": {
        "__dtype__": "0",
        "__lr_mult__": "1.0",
        "__shape__": "(128, 0)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "null",
      "name": "sequential0_dense0_bias",
      "attr": {
        "__dtype__": "0",
        "__init__": "zeros",
        "__lr_mult__": "1.0",
        "__shape__": "(128,)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "FullyConnected",
      "name": "sequential0_dense0_fwd",
      "attr": {"num_hidden": "128"},
      "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]]
    },
    {
      "op": "Activation",
      "name": "sequential0_dense0_relu_fwd",
      "attr": {"act_type": "relu"},
      "inputs": [[3, 0, 0]]
    },
    {
      "op": "null",
      "name": "sequential0_dense1_weight",
      "attr": {
        "__dtype__": "0",
        "__lr_mult__": "1.0",
        "__shape__": "(64, 0)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "null",
      "name": "sequential0_dense1_bias",
      "attr": {
        "__dtype__": "0",
        "__init__": "zeros",
        "__lr_mult__": "1.0",
        "__shape__": "(64,)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "FullyConnected",
      "name": "sequential0_dense1_fwd",
      "attr": {"num_hidden": "64"},
      "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0]]
    },
    {
      "op": "Activation",
      "name": "sequential0_dense1_relu_fwd",
      "attr": {"act_type": "relu"},
      "inputs": [[7, 0, 0]]
    },
    {
      "op": "null",
      "name": "sequential0_dense2_weight",
      "attr": {
        "__dtype__": "0",
        "__lr_mult__": "1.0",
        "__shape__": "(10, 0)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "null",
      "name": "sequential0_dense2_bias",
      "attr": {
        "__dtype__": "0",
        "__init__": "zeros",
        "__lr_mult__": "1.0",
        "__shape__": "(10,)",
        "__wd_mult__": "1.0"
      },
      "inputs": []
    },
    {
      "op": "FullyConnected",
      "name": "sequential0_dense2_fwd",
      "attr": {"num_hidden": "10"},
      "inputs": [[8, 0, 0], [9, 0, 0], [10, 0, 0]]
    }
  ],
  "arg_nodes": [0, 1, 2, 5, 6, 9, 10],
  "node_row_ptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
  "heads": [[11, 0, 0]],
  "attrs": {"mxnet_version": ["int", 1100]}
}
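This file is the serialized MXNet symbol graph for the three-layer MLP that define_network() builds in examples/cli/train/script.py: Dense(128, relu), Dense(64, relu), Dense(10). The 0 entries in the stored weight shapes, e.g. "(128, 0)", are deferred input dimensions. As a hedged sketch (not part of this diff; the file path is an assumption), the graph can be inspected with MXNet 1.x like so:

import mxnet as mx

sym = mx.sym.load('model.json')    # reconstruct the symbol graph
print(sym.list_arguments())        # 'data' plus each layer's weight and bias

# The deferred 0 axes are resolved from the input shape: 784 for a
# flattened 28x28 MNIST image.
arg_shapes, out_shapes, aux_shapes = sym.infer_shape(data=(1, 784))
print(out_shapes)                  # [(1, 10)] -- ten output classes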
Binary file added examples/cli/host/data/model.params
Binary file not shown.
3 changes: 3 additions & 0 deletions examples/cli/host/run_hosting_example.sh
@@ -0,0 +1,3 @@
#!/bin/bash

sagemaker mxnet host --role-name <your-sagemaker-execution-role>
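(The sagemaker command is the console script this PR registers in setup.py under entry_points; see the setup.py hunk below. The <your-sagemaker-execution-role> placeholder is intentionally left unfilled and should be replaced with the name of an IAM role that SageMaker can assume.)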
41 changes: 41 additions & 0 deletions examples/cli/host/script.py
@@ -0,0 +1,41 @@
from __future__ import print_function

import json
import mxnet as mx
from mxnet import gluon


def model_fn(model_dir):
    """
    Load the Gluon model. Called once when the hosting service starts.

    :param model_dir: The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    symbol = mx.sym.load('%s/model.json' % model_dir)
    outputs = mx.symbol.softmax(data=symbol, name='softmax_label')
    inputs = mx.sym.var('data')
    param_dict = gluon.ParameterDict('model_')
    net = gluon.SymbolBlock(outputs, inputs, param_dict)
    net.load_params('%s/model.params' % model_dir, ctx=mx.cpu())
    return net


def transform_fn(net, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param net: The Gluon model.
    :param data: The request payload.
    :param input_content_type: The request content type.
    :param output_content_type: The (desired) response content type.
    :return: response payload and content type.
    """
    # we can use content types to vary input/output handling, but
    # here we just assume json for both
    parsed = json.loads(data)
    nda = mx.nd.array(parsed)
    output = net(nda)
    prediction = mx.nd.argmax(output, axis=1)
    response_body = json.dumps(prediction.asnumpy().tolist())
    return response_body, output_content_type
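A hedged local smoke test for these handlers (not part of the diff), assuming script.py is importable from the working directory and that the model.json and model.params produced by the training example sit in ./data:

import json

from script import model_fn, transform_fn   # hypothetical local import

net = model_fn('./data')
payload = json.dumps([[0.0] * 784])         # one flattened, zeroed MNIST image
body, content_type = transform_fn(net, payload,
                                  'application/json', 'application/json')
print(body)   # a one-element list with the predicted class index, e.g. [5.0]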
4 binary files not shown (file names not rendered in this view).
10 changes: 10 additions & 0 deletions examples/cli/train/download_training_data.py
@@ -0,0 +1,10 @@
from mxnet import gluon


def download_training_data():
    gluon.data.vision.MNIST('./data/training', train=True)
    gluon.data.vision.MNIST('./data/training', train=False)


if __name__ == "__main__":
    download_training_data()
7 changes: 7 additions & 0 deletions examples/cli/train/hyperparameters.json
@@ -0,0 +1,7 @@
{
  "batch_size": 100,
  "epochs": 10,
  "learning_rate": 0.1,
  "momentum": 0.9,
  "log_interval": 100
}
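These keys mirror the hyperparameters.get(...) defaults in examples/cli/train/script.py, so values can be tuned here without editing the training code; any key omitted from this file simply falls back to the script's default.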
4 changes: 4 additions & 0 deletions examples/cli/train/run_training_example.sh
@@ -0,0 +1,4 @@
#!/bin/bash

python ./download_training_data.py
sagemaker mxnet train --role-name <your-sagemaker-execution-role>
118 changes: 118 additions & 0 deletions examples/cli/train/script.py
@@ -0,0 +1,118 @@
import logging
import time

import mxnet as mx
import numpy as np
from mxnet import gluon, autograd
from mxnet.gluon import nn

logger = logging.getLogger(__name__)


def train(channel_input_dirs, hyperparameters, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor
    # training to the current container environment, but here we just use a
    # simple CPU context.
    ctx = mx.cpu()

    # retrieve the hyperparameters we set (with some defaults)
    batch_size = hyperparameters.get('batch_size', 100)
    epochs = hyperparameters.get('epochs', 10)
    learning_rate = hyperparameters.get('learning_rate', 0.1)
    momentum = hyperparameters.get('momentum', 0.9)
    log_interval = hyperparameters.get('log_interval', 100)

    training_data = channel_input_dirs['training']

    # Load training and validation data. We use the gluon.data.vision.MNIST
    # class because of its built-in MNIST pre-processing logic, but point it at
    # the location where SageMaker placed the data files, so it doesn't
    # download them again.
    train_data = get_train_data(training_data, batch_size)
    val_data = get_val_data(training_data, batch_size)

    # define the network
    net = define_network()

    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': learning_rate, 'momentum': momentum})
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    for epoch in range(epochs):
        # reset the metric at the beginning of each epoch
        metric.reset()
        btic = time.time()
        for i, (data, label) in enumerate(train_data):
            # copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            # Start recording the computation graph with record().
            # Recorded graphs can then be differentiated with backward().
            with autograd.record():
                output = net(data)
                L = loss(output, label)
                L.backward()
            # take a gradient step with batch_size equal to data.shape[0]
            trainer.step(data.shape[0])
            # update the metric with this batch's results
            metric.update([label], [output])

            if i % log_interval == 0 and i > 0:
                name, acc = metric.get()
                logger.info('[Epoch %d Batch %d] Training: %s=%f, %f samples/s' %
                            (epoch, i, name, acc, batch_size / (time.time() - btic)))

            btic = time.time()

        name, acc = metric.get()
        logger.info('[Epoch %d] Training: %s=%f' % (epoch, name, acc))

        name, val_acc = test(ctx, net, val_data)
        logger.info('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))

    return net


def save(net, model_dir):
    # save the model: export the symbolic graph and the trained parameters
    y = net(mx.sym.var('data'))
    y.save('%s/model.json' % model_dir)
    net.collect_params().save('%s/model.params' % model_dir)


def define_network():
    net = nn.Sequential()
    with net.name_scope():
        net.add(nn.Dense(128, activation='relu'))
        net.add(nn.Dense(64, activation='relu'))
        net.add(nn.Dense(10))
    return net


def input_transformer(data, label):
    # flatten the 28x28 image and scale pixel values to [0, 1]
    data = data.reshape((-1,)).astype(np.float32) / 255
    return data, label


def get_train_data(data_dir, batch_size):
    return gluon.data.DataLoader(
        gluon.data.vision.MNIST(data_dir, train=True, transform=input_transformer),
        batch_size=batch_size, shuffle=True, last_batch='discard')


def get_val_data(data_dir, batch_size):
    return gluon.data.DataLoader(
        gluon.data.vision.MNIST(data_dir, train=False, transform=input_transformer),
        batch_size=batch_size, shuffle=False)


def test(ctx, net, val_data):
    metric = mx.metric.Accuracy()
    for data, label in val_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        metric.update([label], [output])
    return metric.get()
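A hedged sketch (not part of the PR) of driving this script locally, assuming download_training_data.py has already populated ./data/training and that this module is importable as script:

import json

from script import train, save   # hypothetical local import

with open('hyperparameters.json') as f:
    hyperparameters = json.load(f)

net = train(channel_input_dirs={'training': './data/training'},
            hyperparameters=hyperparameters)
save(net, '.')   # writes ./model.json and ./model.params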
7 changes: 6 additions & 1 deletion setup.py
@@ -1,9 +1,10 @@
 import os
-from setuptools import setup, find_packages
 from glob import glob
 from os.path import basename
 from os.path import splitext
+
+from setuptools import setup, find_packages
 
 
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
@@ -36,4 +37,8 @@ def read(fname):
     extras_require={
         'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',
                  'mock', 'tensorflow>=1.3.0', 'contextlib2']},
+
+    entry_points={
+        'console_scripts': ['sagemaker=sagemaker.cli.main:main'],
+    }
 )
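The new entry_points block is what backs the sagemaker command used by both example shell scripts: installing the package (e.g. pip install .) creates a sagemaker executable that dispatches to sagemaker.cli.main:main.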
4 changes: 2 additions & 2 deletions src/sagemaker/amazon/amazon_estimator.py
@@ -28,8 +28,8 @@ class AmazonAlgorithmEstimatorBase(EstimatorBase):
     """Base class for Amazon first-party Estimator implementations. This class isn't intended
     to be instantiated directly."""
 
-    feature_dim = hp('feature_dim', (validation.isint, validation.gt(0)))
-    mini_batch_size = hp('mini_batch_size', (validation.isint, validation.gt(0)))
+    feature_dim = hp('feature_dim', validation.gt(0), data_type=int)
+    mini_batch_size = hp('mini_batch_size', validation.gt(0), data_type=int)
 
     def __init__(self, role, train_instance_count, train_instance_type, data_location=None, **kwargs):
         """Initialize an AmazonAlgorithmEstimatorBase.
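For context: the change replaces a tuple of validators (with an explicit isint check) by a single range validator plus a data_type that performs the integer coercion. Below is a hedged, self-contained sketch of the descriptor pattern this implies; the class is a hypothetical stand-in, not the SDK's actual Hyperparameter implementation:

class hp(object):
    # minimal data-descriptor sketch: coerce with data_type, then validate
    def __init__(self, name, validate=lambda _: True, data_type=str):
        self.name = name
        self.validate = validate
        self.data_type = data_type

    def __set__(self, obj, value):
        value = self.data_type(value)       # e.g. int('5') -> 5
        if not self.validate(value):
            raise ValueError('invalid value for %s: %s' % (self.name, value))
        obj.__dict__[self.name] = value

    def __get__(self, obj, objtype=None):
        return self if obj is None else obj.__dict__.get(self.name)


def gt(minimum):
    # mirrors validation.gt from the diff
    return lambda value: value > minimum


class Example(object):
    feature_dim = hp('feature_dim', gt(0), data_type=int)


e = Example()
e.feature_dim = '784'   # coerced to int 784, passes gt(0)
# e.feature_dim = 0     # would raise ValueError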