|
256 | 256 | "source": [
|
257 | 257 | "### Building and registering the container\n",
|
258 | 258 | "\n",
|
259 | | - "The following shell code shows how to build the container image using `docker build` and push the container image to ECR using `docker push`. This code is also available as the shell script `container/build-and-push.sh`, which you can run as `build-and-push.sh tensorflow_cifar10_example` to build the image `tensorflow_cifar10_example`. \n", |
| 259 | + "The following shell code shows how to build the container image using `docker build` and push the container image to ECR using `docker push`. This code is also available as the shell script `container/build-and-push.sh`, which you can run as `build-and-push.sh tensorflow-cifar10-example` to build the image `tensorflow-cifar10-example`. \n", |
260 | 260 | "\n",
|
261 | 261 | "This code looks for an ECR repository in the account you're using and the current default region (if you're using a SageMaker notebook instance, this is the region where the notebook instance was created). If the repository doesn't exist, the script will create it."
|
262 | 262 | ]
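For reference, the repository lookup described above can also be sketched in Python with boto3 rather than the AWS CLI calls the script uses. This is a minimal illustration, not part of the notebook; it assumes default credentials and region, and the `ensure_repository` helper name is hypothetical:

```python
import boto3

def ensure_repository(repo_name):
    """Create the ECR repository if it does not already exist (hypothetical helper)."""
    ecr = boto3.client('ecr')  # picks up the default region, e.g. the notebook instance's region
    try:
        ecr.describe_repositories(repositoryNames=[repo_name])
    except ecr.exceptions.RepositoryNotFoundException:
        ecr.create_repository(repositoryName=repo_name)

ensure_repository('tensorflow-cifar10-example')
```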
|
|
270 | 270 | "%%sh\n",
|
271 | 271 | "\n",
|
272 | 272 | "# The name of our algorithm\n",
|
273 | | - "algorithm_name=tensorflow_cifar10_example\n", |
| 273 | + "algorithm_name=tensorflow-cifar10-example\n", |
274 | 274 | "\n",
|
275 | 275 | "cd container\n",
|
276 | 276 | "\n",
|
|
321 | 321 | "source": [
|
322 | 322 | "## Download the CIFAR-10 dataset\n",
|
323 | 323 | "Our training algorithm is expecting our training data to be in the file format of [TFRecords](https://www.tensorflow.org/guide/datasets), which is a simple record-oriented binary format that many TensorFlow applications use for training data.\n",
|
324 | | - "Below is a python script from the official TensorFlow CIFAR10 example, which downloads the CIFAR-10 dataset and converts them into TFRecords." |
| 324 | + "Below is a Python script adapted from the [official TensorFlow CIFAR-10 example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator), which downloads the CIFAR-10 dataset and converts them into TFRecords." |
325 | 325 | ]
|
326 | 326 | },
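As a quick way to see what the conversion produces, the records can be read back with TensorFlow's `tf.data` API. A minimal sketch, assuming the TensorFlow 1.x graph-mode API of this notebook's era and the `/tmp/cifar-10-data` output path used below; the feature keys depend on the conversion script:

```python
import tensorflow as tf

# Read one serialized example from the converted training split.
dataset = tf.data.TFRecordDataset('/tmp/cifar-10-data/train.tfrecords')
next_record = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    serialized = sess.run(next_record)

# Parse the raw bytes into a tf.train.Example and list its feature keys.
example = tf.train.Example.FromString(serialized)
print(list(example.features.feature.keys()))
```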
|
327 | 327 | {
|
|
335 | 335 | },
|
336 | 336 | {
|
337 | 337 | "cell_type": "code",
|
338 | | - "execution_count": null, |
| 338 | + "execution_count": 2, |
339 | 339 | "metadata": {},
|
340 | | - "outputs": [], |
| 340 | + "outputs": [ |
| 341 | + { |
| 342 | + "name": "stdout", |
| 343 | + "output_type": "stream", |
| 344 | + "text": [ |
| 345 | + "eval.tfrecords\ttrain.tfrecords validation.tfrecords\r\n" |
| 346 | + ] |
| 347 | + } |
| 348 | + ], |
341 | 349 | "source": [
|
342 | | - "! ls /tmp/cifar-10-data " |
| 350 | + "# There should be three tfrecords. (eval, train, validation)\n", |
| 351 | + "! ls /tmp/cifar-10-data" |
343 | 352 | ]
|
344 | 353 | },
|
345 | 354 | {
|
|
359 | 368 | },
|
360 | 369 | {
|
361 | 370 | "cell_type": "code",
|
362 | | - "execution_count": 5, |
| 371 | + "execution_count": 3, |
363 | 372 | "metadata": {},
|
364 | 373 | "outputs": [],
|
365 | 374 | "source": [
|
|
387 | 396 | },
|
388 | 397 | {
|
389 | 398 | "cell_type": "code",
|
390 | | - "execution_count": null, |
| 399 | + "execution_count": 4, |
391 | 400 | "metadata": {},
|
392 | | - "outputs": [], |
| 401 | + "outputs": [ |
| 402 | + { |
| 403 | + "name": "stdout", |
| 404 | + "output_type": "stream", |
| 405 | + "text": [ |
| 406 | + "SageMaker instance route table setup is ok. We are good to go.\r\n", |
| 407 | + "SageMaker instance routing for Docker is ok. We are good to go!\r\n" |
| 408 | + ] |
| 409 | + } |
| 410 | + ], |
393 | 411 | "source": [
|
| 412 | + "# Lets set up our SageMaker notebook instance for local mode.\n", |
394 | 413 | "!/bin/bash ./utils/setup.sh"
|
395 | 414 | ]
|
396 | 415 | },
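Once the script above has configured Docker networking, local mode is just a matter of passing `local` as the instance type, so the SDK runs the training container on this machine via Docker. A minimal sketch under that assumption, using the v1-era `Estimator` interface shown later in this notebook (`role` and `hyperparameters` are assumed to be defined earlier):

```python
from sagemaker.estimator import Estimator

# 'local' tells the SageMaker Python SDK to run the container here with Docker,
# instead of launching a training job on an EC2 instance.
estimator = Estimator(role=role,
                      train_instance_count=1,
                      train_instance_type='local',
                      image_name='tensorflow-cifar10-example',
                      hyperparameters=hyperparameters)

# In local mode the data can be read straight from the local filesystem.
estimator.fit('file:///tmp/cifar-10-data')
```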
|
|
428 | 447 | },
|
429 | 448 | {
|
430 | 449 | "cell_type": "code",
|
431 | | - "execution_count": 11, |
| 450 | + "execution_count": null, |
| 451 | + "metadata": {}, |
| 452 | + "outputs": [], |
| 453 | + "source": [ |
| 454 | + "! pip install opencv-python" |
| 455 | + ] |
| 456 | + }, |
| 457 | + { |
| 458 | + "cell_type": "code", |
| 459 | + "execution_count": 6, |
432 | 460 | "metadata": {},
|
433 | 461 | "outputs": [
|
434 | 462 | {
|
435 | 463 | "name": "stdout",
|
436 | 464 | "output_type": "stream",
|
437 | 465 | "text": [
|
438 | | - "Collecting opencv-python\n", |
439 | | - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/53/e0/21c8964fa8ef50842ebefaa7346a3cf0e37b56c8ecd97ed6bd2dbe577705/opencv_python-3.4.2.17-cp36-cp36m-manylinux1_x86_64.whl (25.0MB)\n", |
440 | | - "\u001b[K 100% |████████████████████████████████| 25.0MB 2.1MB/s eta 0:00:01\n", |
441 | | - "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from opencv-python) (1.14.5)\n", |
442 | | - "\u001b[31mdistributed 1.21.8 requires msgpack, which is not installed.\u001b[0m\n", |
443 | | - "Installing collected packages: opencv-python\n", |
444 | | - "Successfully installed opencv-python-3.4.2.17\n", |
445 | | - "\u001b[33mYou are using pip version 10.0.1, however version 18.0 is available.\n", |
446 | | - "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" |
| 466 | + "\u001b[36malgo-1-L58J2_1 |\u001b[0m 172.18.0.1 - - [03/Aug/2018:22:32:52 +0000] \"POST /invocations HTTP/1.1\" 200 229 \"-\" \"-\"\r\n" |
447 | 467 | ]
|
| 468 | + }, |
| 469 | + { |
| 470 | + "data": { |
| 471 | + "text/plain": [ |
| 472 | + "{'predictions': [{'probabilities': [2.29861e-05,\n", |
| 473 | + " 0.0104983,\n", |
| 474 | + " 0.147974,\n", |
| 475 | + " 0.01538,\n", |
| 476 | + " 0.0478089,\n", |
| 477 | + " 0.00164997,\n", |
| 478 | + " 0.758483,\n", |
| 479 | + " 0.0164191,\n", |
| 480 | + " 0.00125304,\n", |
| 481 | + " 0.000510801],\n", |
| 482 | + " 'classes': 6}]}" |
| 483 | + ] |
| 484 | + }, |
| 485 | + "execution_count": 6, |
| 486 | + "metadata": {}, |
| 487 | + "output_type": "execute_result" |
448 | 488 | }
|
449 | 489 | ],
|
450 | | - "source": [ |
451 | | - "! pip install opencv-python" |
452 | | - ] |
453 | | - }, |
454 | | - { |
455 | | - "cell_type": "code", |
456 | | - "execution_count": null, |
457 | | - "metadata": {}, |
458 | | - "outputs": [], |
459 | 490 | "source": [
|
460 | 491 | "import cv2\n",
|
461 | 492 | "import numpy\n",
|
|
484 | 515 | },
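The `classes` field in the response above is an index into the standard CIFAR-10 label order, so `6` corresponds to `frog`. A small illustrative helper (not part of the notebook) for turning a prediction into a readable label:

```python
# Standard CIFAR-10 label order.
CIFAR10_LABELS = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                  'dog', 'frog', 'horse', 'ship', 'truck']

def label_of(prediction):
    """Map one prediction dict from the endpoint response to its class label."""
    return CIFAR10_LABELS[prediction['classes']]

# e.g. label_of({'classes': 6, 'probabilities': [...]}) returns 'frog'
```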
|
485 | 516 | {
|
486 | 517 | "cell_type": "code",
|
487 | | - "execution_count": 25, |
| 518 | + "execution_count": null, |
488 | 519 | "metadata": {},
|
489 | | - "outputs": [ |
490 | | - { |
491 | | - "name": "stderr", |
492 | | - "output_type": "stream", |
493 | | - "text": [ |
494 | | - "INFO:sagemaker:Deleting endpoint with name: tensorflow_cifar10_example-2018-08-03-18-06-55-168\n" |
495 | | - ] |
496 | | - }, |
497 | | - { |
498 | | - "name": "stdout", |
499 | | - "output_type": "stream", |
500 | | - "text": [ |
501 | | - "Gracefully stopping... (press Ctrl+C again to force)\n", |
502 | | - "Stopping tmp3n0u5hj2_algo-1-HCRIC_1 ... \r\n", |
503 | | - "\u001b[1A\u001b[2K\r", |
504 | | - "Stopping tmp3n0u5hj2_algo-1-HCRIC_1 ... \u001b[32mdone\u001b[0m\r", |
505 | | - "\u001b[1B" |
506 | | - ] |
507 | | - } |
508 | | - ], |
| 520 | + "outputs": [], |
509 | 521 | "source": [
|
510 | 522 | "predictor.delete_endpoint()"
|
511 | 523 | ]
|
|
523 | 535 | },
|
524 | 536 | {
|
525 | 537 | "cell_type": "code",
|
526 | | - "execution_count": 26, |
| 538 | + "execution_count": null, |
527 | 539 | "metadata": {},
|
528 | 540 | "outputs": [],
|
529 | 541 | "source": [
|
|
542 | 554 | },
|
543 | 555 | {
|
544 | 556 | "cell_type": "code",
|
545 | | - "execution_count": 27, |
| 557 | + "execution_count": null, |
546 | 558 | "metadata": {},
|
547 | 559 | "outputs": [],
|
548 | 560 | "source": [
|
|
564 | 576 | },
|
565 | 577 | {
|
566 | 578 | "cell_type": "code",
|
567 | | - "execution_count": 28, |
| 579 | + "execution_count": null, |
568 | 580 | "metadata": {},
|
569 | 581 | "outputs": [],
|
570 | 582 | "source": [
|
|
580 | 592 | "## Training on SageMaker\n",
|
581 | 593 | "Training a model on SageMaker with the Python SDK is done in a way that is similar to the way we trained it locally. This is done by changing our train_instance_type from `local` to one of our [supported EC2 instance types](https://aws.amazon.com/sagemaker/pricing/instance-types/).\n",
|
582 | 594 | "\n",
|
583 | | - "In addition, we must now specify the ECR image URL, which we just pushed above. Be sure to replace the string within the Estimator parameter, image_name.\n", |
| 595 | + "In addition, we must now specify the ECR image URL, which we just pushed above.\n", |
| 596 | + "\n", |
| 597 | + "Finally, our local training dataset has to be in Amazon S3 and the S3 URL to our dataset is passed into the `fit()` call.\n", |
| 598 | + "\n", |
| 599 | + "Let's first fetch our ECR image url that corresponds to the image we just built and pushed." |
| 600 | + ] |
| 601 | + }, |
| 602 | + { |
| 603 | + "cell_type": "code", |
| 604 | + "execution_count": null, |
| 605 | + "metadata": {}, |
| 606 | + "outputs": [], |
| 607 | + "source": [ |
| 608 | + "import boto3\n", |
| 609 | + "\n", |
| 610 | + "client = boto3.client('sts')\n", |
| 611 | + "account = client.get_caller_identity()['Account']\n", |
584 | 612 | "\n",
|
585 | | - "Finally, our local training dataset has to be in Amazon S3 and the S3 URL to our dataset is passed into the `fit()` call." |
| 613 | + "my_session = boto3.session.Session()\n", |
| 614 | + "region = my_session.region_name\n", |
| 615 | + "\n", |
| 616 | + "algorithm_name = 'tensorflow-cifar10-example'\n", |
| 617 | + "\n", |
| 618 | + "ecr_image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name)\n", |
| 619 | + "\n", |
| 620 | + "print(ecr_image)" |
586 | 621 | ]
|
587 | 622 | },
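Before calling `fit()`, the TFRecords produced earlier also need to be in S3. One way to get them there is the SageMaker session's upload helper, sketched here; the key prefix is illustrative and the session's default bucket is assumed:

```python
import sagemaker

sess = sagemaker.Session()

# Upload the local TFRecords to the session's default S3 bucket.
data_location = sess.upload_data('/tmp/cifar-10-data',
                                 key_prefix='data/tensorflow-cifar10-example')

print(data_location)  # s3://<default-bucket>/data/tensorflow-cifar10-example
```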
|
588 | 623 | {
|
|
600 | 635 | "estimator = Estimator(role=role,\n",
|
601 | 636 | " train_instance_count=1,\n",
|
602 | 637 | " train_instance_type=instance_type,\n",
|
603 | | - " image_name='ecr-image',\n", |
| 638 | + " image_name=ecr_image,\n", |
604 | 639 | " hyperparameters=hyperparameters)\n",
|
605 | 640 | "\n",
|
606 | 641 | "estimator.fit(data_location)\n",
|
|
610 | 645 | },
|
611 | 646 | {
|
612 | 647 | "cell_type": "code",
|
613 | | - "execution_count": 33, |
| 648 | + "execution_count": null, |
614 | 649 | "metadata": {},
|
615 | | - "outputs": [ |
616 | | - { |
617 | | - "data": { |
618 | | - "text/plain": [ |
619 | | - "{'predictions': [{'probabilities': [0.115806,\n", |
620 | | - " 0.119459,\n", |
621 | | - " 0.028497,\n", |
622 | | - " 0.348986,\n", |
623 | | - " 0.102692,\n", |
624 | | - " 0.0354596,\n", |
625 | | - " 0.0917221,\n", |
626 | | - " 0.00540253,\n", |
627 | | - " 0.121872,\n", |
628 | | - " 0.0301034],\n", |
629 | | - " 'classes': 3}]}" |
630 | | - ] |
631 | | - }, |
632 | | - "execution_count": 33, |
633 | | - "metadata": {}, |
634 | | - "output_type": "execute_result" |
635 | | - } |
636 | | - ], |
| 650 | + "outputs": [], |
637 | 651 | "source": [
|
638 | 652 | "image = cv2.imread(\"data/cat.png\", 1)\n",
|
639 | 653 | "\n",
|
|
662 | 676 | },
|
663 | 677 | {
|
664 | 678 | "cell_type": "code",
|
665 | | - "execution_count": 25, |
| 679 | + "execution_count": null, |
666 | 680 | "metadata": {},
|
667 | | - "outputs": [ |
668 | | - { |
669 | | - "name": "stdout", |
670 | | - "output_type": "stream", |
671 | | - "text": [ |
672 | | - "b'{\\n \"predictions\": [\\n {\\n \"classes\": 3,\\n \"probabilities\": [0.122724, 0.0958609, 0.0519071, 0.272535, 0.097384, 0.0535893, 0.0905842, 0.0250508, 0.123435, 0.0669298]\\n }\\n ]\\n}'\n" |
673 | | - ] |
674 | | - } |
675 | | - ], |
| 681 | + "outputs": [], |
676 | 682 | "source": [
|
677 | 683 | "import json\n",
|
678 | 684 | "\n",
|
|
681 | 687 | "endpoint_name = predictor.endpoint\n",
|
682 | 688 | "\n",
|
683 | 689 | "response = client.invoke_endpoint(EndpointName=endpoint_name, Body=json.dumps(data))\n",
|
684 | | - "response_body = response['Body'].decode('utf-8')\n", |
| 690 | + "response_body = response['Body']\n", |
685 | 691 | "\n",
|
686 | | - "print(response_body.read())" |
| 692 | + "print(response_body.read().decode('utf-8'))" |
687 | 693 | ]
|
688 | 694 | },
|
689 | 695 | {
|
|