Merge branch 'master' into fix-localmode

icywang86rui · web-flow · commit 09f76e391c0c · 2018-10-15T09:44:59.000-07:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,7 @@ CHANGELOG
 * feature: Local Mode: Add support for Batch Inference
 * feature: Add timestamp to secondary status in training job output
 * bug-fix: Local Mode: Set correct default values for additional_volumes and additional_env_vars
+* enhancement: Local Mode: Support nvidia-docker2 natively
 
 1.11.2
 ======
diff --git a/setup.py b/setup.py
@@ -44,8 +44,8 @@ def read(fname):
       ],
 
       # Declare minimal set for installation
-      install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3>=1.2',
-                        'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5'],
+      install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3 >=1.21, <1.23',
+                        'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5', 'docker-compose>=1.21.0'],
 
       extras_require={
           'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',
diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py
@@ -362,8 +362,8 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
         }
 
         content = {
-            # Some legacy hosts only support the 2.1 format.
-            'version': '2.1',
+            # Use version 2.3 as a minimum so that we can specify the runtime
+            'version': '2.3',
             'services': services,
             'networks': {
                 'sagemaker-local': {'name': 'sagemaker-local'}
@@ -415,6 +415,11 @@ def _create_docker_host(self, host, environment, optml_subdirs, command, volumes
             }
         }
 
+        # For GPU support, pass in nvidia as the runtime. This is equivalent
+        # to setting --runtime=nvidia on the docker command line.
+        if self.instance_type == 'local_gpu':
+            host_config['runtime'] = 'nvidia'
+
         if command == 'serve':
             serving_port = sagemaker.utils.get_config_value('local.serving_port',
                                                             self.sagemaker_session.config) or 8080
diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py
@@ -368,8 +368,9 @@ def test_mxnet_local_data_local_script():
         fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
 
 
-@pytest.mark.continuous_testing
 def test_local_transform_mxnet(sagemaker_local_session, tmpdir):
+    local_mode_lock_fd = open(LOCK_PATH, 'w')
+    local_mode_lock = local_mode_lock_fd.fileno()
     data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
     script_path = os.path.join(data_path, 'mnist.py')
 
@@ -392,7 +393,13 @@ def test_local_transform_mxnet(sagemaker_local_session, tmpdir):
     output_path = 'file://%s' % (str(tmpdir))
     transformer = mx.transformer(1, 'local', assemble_with='Line', max_payload=1,
                                  strategy='SingleRecord', output_path=output_path)
+
+    # Since Local Mode uses the same port for serving, we need a lock in order
+    # to allow concurrent test execution.
+    fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
     transformer.transform(transform_input, content_type='text/csv', split_type='Line')
     transformer.wait()
+    time.sleep(5)
+    fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
 
     assert os.path.exists(os.path.join(str(tmpdir), 'data.csv.out'))
diff --git a/tests/unit/test_image.py b/tests/unit/test_image.py
@@ -334,6 +334,27 @@ def test_train_local_code(download_folder, _cleanup, popen, _stream_output,
                 assert '%s:/opt/ml/shared' % shared_folder_path in volumes
 
 
+def test_container_has_gpu_support(tmpdir, sagemaker_session):
+    instance_count = 1
+    image = 'my-image'
+    sagemaker_container = _SageMakerContainer('local_gpu', instance_count, image,
+                                              sagemaker_session=sagemaker_session)
+
+    docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
+    assert 'runtime' in docker_host
+    assert docker_host['runtime'] == 'nvidia'
+
+
+def test_container_does_not_enable_nvidia_docker_for_cpu_containers(tmpdir, sagemaker_session):
+    instance_count = 1
+    image = 'my-image'
+    sagemaker_container = _SageMakerContainer('local', instance_count, image,
+                                              sagemaker_session=sagemaker_session)
+
+    docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
+    assert 'runtime' not in docker_host
+
+
 @patch('sagemaker.local.image._HostingContainer.run')
 @patch('shutil.copy')
 @patch('shutil.copytree')