Merge branch 'master' into PT113Release

Qingzi-Lan · web-flow · commit b1c3e0ce5f81 · 2023-05-03T12:18:07.000-07:00
diff --git a/doc/api/training/sdp_versions/latest.rst b/doc/api/training/sdp_versions/latest.rst
@@ -26,7 +26,7 @@ depending on the version of the library you use.
    <https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel-use-api.html#data-parallel-use-python-skd-api>`_
    for more information.
 
-For versions between 1.4.0 and 1.7.0 (Latest)
+For versions between 1.4.0 and 1.8.0 (Latest)
 =============================================
 
 .. toctree::
diff --git a/doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst b/doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst
@@ -5,39 +5,64 @@ Release Notes
 #############
 
 New features, bug fixes, and improvements are regularly made to the SageMaker
-distributed data parallel library.
+data parallelism library.
 
-SageMaker Distributed Data Parallel 1.7.0 Release Notes
+SageMaker Distributed Data Parallel 1.8.0 Release Notes
 =======================================================
 
-*Date: Feb. 10. 2023*
+*Date: Apr. 17. 2023*
 
 **Currency Updates**
 
-* Added support for PyTorch 1.13.1.
+* Added support for PyTorch 2.0.0.
 
 **Migration to AWS Deep Learning Containers**
 
 This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
 
-- PyTorch 1.13.1 DLC
+- PyTorch 2.0.0 DLC
 
   .. code::
 
-    763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
+    763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker
 
 Binary file of this version of the library for custom container users:
 
   .. code::
 
-    https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.13.1/cu117/2023-01-09/smdistributed_dataparallel-1.7.0-cp39-cp39-linux_x86_64.whl
+    https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.0.0/cu118/2023-03-20/smdistributed_dataparallel-1.8.0-cp310-cp310-linux_x86_64.whl
 
 
 ----
 
 Release History
 ===============
 
+SageMaker Distributed Data Parallel 1.7.0 Release Notes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+*Date: Feb. 10. 2023*
+
+**Currency Updates**
+
+* Added support for PyTorch 1.13.1.
+
+**Migration to AWS Deep Learning Containers**
+
+This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
+
+- PyTorch 1.13.1 DLC
+
+  .. code::
+
+    763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
+
+Binary file of this version of the library for custom container users:
+
+  .. code::
+
+    https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.13.1/cu117/2023-01-09/smdistributed_dataparallel-1.7.0-cp39-cp39-linux_x86_64.whl
+
 SageMaker Distributed Data Parallel 1.6.0 Release Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/frameworks/pytorch/using_pytorch.rst b/doc/frameworks/pytorch/using_pytorch.rst
@@ -892,7 +892,7 @@ see `For versions 1.1 and lower <#for-versions-1.1-and-lower>`_.
     |               |--inference.py
     |               |--requirements.txt
 
-Where ``requirments.txt`` is an optional file that specifies dependencies on third-party libraries.
+Where ``requirements.txt`` is an optional file that specifies dependencies on third-party libraries.
 
 Create a ``PyTorchModel`` object
 --------------------------------
diff --git a/src/sagemaker/image_uri_config/djl-neuronx.json b/src/sagemaker/image_uri_config/djl-neuronx.json
@@ -0,0 +1,35 @@
+{
+    "scope": ["inference"],
+    "versions": {
+        "0.22.1": {
+            "registries": {
+                "af-south-1": "626614931356",
+                "ap-east-1": "871362719292",
+                "ap-northeast-1": "763104351884",
+                "ap-northeast-2": "763104351884",
+                "ap-northeast-3": "364406365360",
+                "ap-south-1": "763104351884",
+                "ap-southeast-1": "763104351884",
+                "ap-southeast-2": "763104351884",
+                "ap-southeast-3": "907027046896",
+                "ca-central-1": "763104351884",
+                "cn-north-1": "727897471807",
+                "cn-northwest-1": "727897471807",
+                "eu-central-1": "763104351884",
+                "eu-north-1": "763104351884",
+                "eu-west-1": "763104351884",
+                "eu-west-2": "763104351884",
+                "eu-west-3": "763104351884",
+                "eu-south-1": "692866216735",
+                "me-south-1": "217643126080",
+                "sa-east-1": "763104351884",
+                "us-east-1": "763104351884",
+                "us-east-2": "763104351884",
+                "us-west-1": "763104351884",
+                "us-west-2": "763104351884"
+            },
+            "repository": "djl-inference",
+            "tag_prefix": "0.22.1-neuronx-sdk2.9.0"
+        }
+    }
+}
diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py
@@ -731,7 +731,7 @@ def map(self, func, *iterables):
         futures = map(self.submit, itertools.repeat(func), *iterables)
         return [future.result() for future in futures]
 
-    def shutdown(self):
+    def shutdown(self, wait=True):
-        """Prevent more function executions to be submitted to this executor."""
+        """Prevent more function executions from being submitted to this executor."""
         with self._state_condition:
             self._shutdown = True
@@ -742,15 +742,15 @@ def shutdown(self):
             self._state_condition.notify_all()
 
         if self._workers is not None:
-            self._workers.shutdown(wait=True)
+            self._workers.shutdown(wait)
 
     def __enter__(self):
         """Create an executor instance and return it"""
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        """Make sure the executor instance is shutdown."""
+        """Make sure the executor instance is shut down."""
-        self.shutdown()
+        self.shutdown(wait=False)
         return False
 
     @staticmethod
diff --git a/tests/unit/sagemaker/image_uris/test_djl.py b/tests/unit/sagemaker/image_uris/test_djl.py
@@ -43,13 +43,17 @@
 }
 DJL_DEEPSPEED_VERSIONS = ["0.21.0", "0.20.0", "0.19.0"]
 DJL_FASTERTRANSFORMER_VERSIONS = ["0.21.0"]
+DJL_NEURONX_VERSIONS = ["0.22.1"]
 DJL_VERSIONS_TO_FRAMEWORK = {
     "0.19.0": {"djl-deepspeed": "deepspeed0.7.3-cu113"},
     "0.20.0": {"djl-deepspeed": "deepspeed0.7.5-cu116"},
     "0.21.0": {
         "djl-deepspeed": "deepspeed0.8.3-cu117",
         "djl-fastertransformer": "fastertransformer5.3.0-cu117",
     },
+    "0.22.1": {
+        "djl-neuronx": "neuronx-sdk2.9.0",
+    },
 }
 
 
@@ -65,6 +69,12 @@ def test_djl_fastertransformer(region, version):
     _test_djl_uris(region, version, "djl-fastertransformer")
 
 
+@pytest.mark.parametrize("region", ACCOUNTS.keys())
+@pytest.mark.parametrize("version", DJL_NEURONX_VERSIONS)
+def test_djl_neuronx(region, version):
+    _test_djl_uris(region, version, "djl-neuronx")
+
+
 def _test_djl_uris(region, version, djl_framework):
     uri = image_uris.retrieve(framework=djl_framework, region=region, version=version)
     expected = expected_uris.djl_framework_uri(
diff --git a/tests/unit/sagemaker/remote_function/test_client.py b/tests/unit/sagemaker/remote_function/test_client.py
@@ -509,6 +509,11 @@ def test_executor_submit_happy_case(mock_start, mock_job_settings, parallelism):
         future_3 = e.submit(job_function, 9, 10, c=11, d=12)
         future_4 = e.submit(job_function, 13, 14, c=15, d=16)
 
+    future_1.wait()
+    future_2.wait()
+    future_3.wait()
+    future_4.wait()
+
     mock_start.assert_has_calls(
         [
             call(ANY, job_function, (1, 2), {"c": 3, "d": 4}, None),
@@ -517,10 +522,6 @@ def test_executor_submit_happy_case(mock_start, mock_job_settings, parallelism):
             call(ANY, job_function, (13, 14), {"c": 15, "d": 16}, None),
         ]
     )
-    mock_job_1.describe.assert_called()
-    mock_job_2.describe.assert_called()
-    mock_job_3.describe.assert_called()
-    mock_job_4.describe.assert_called()
 
     assert future_1.done()
     assert future_2.done()
@@ -545,14 +546,15 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
             future_1 = e.submit(job_function, 1, 2, c=3, d=4)
             future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
+    future_1.wait()
+    future_2.wait()
+
     mock_start.assert_has_calls(
         [
             call(ANY, job_function, (1, 2), {"c": 3, "d": 4}, run_info),
             call(ANY, job_function, (5, 6), {"c": 7, "d": 8}, run_info),
         ]
     )
-    mock_job_1.describe.assert_called()
-    mock_job_2.describe.assert_called()
 
     assert future_1.done()
     assert future_2.done()
@@ -562,14 +564,15 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
             future_3 = e.submit(job_function, 9, 10, c=11, d=12)
             future_4 = e.submit(job_function, 13, 14, c=15, d=16)
 
+    future_3.wait()
+    future_4.wait()
+
     mock_start.assert_has_calls(
         [
             call(ANY, job_function, (9, 10), {"c": 11, "d": 12}, run_info),
             call(ANY, job_function, (13, 14), {"c": 15, "d": 16}, run_info),
         ]
     )
-    mock_job_3.describe.assert_called()
-    mock_job_4.describe.assert_called()
 
     assert future_3.done()
     assert future_4.done()
@@ -621,7 +624,7 @@ def test_executor_fails_to_start_job(mock_start, *args):
 
     with pytest.raises(TypeError):
         future_1.result()
-    print(future_2._state)
+    future_2.wait()
     assert future_2.done()
 
 
@@ -678,6 +681,8 @@ def test_executor_describe_job_throttled_temporarily(mock_start, *args):
         # submit second job
         future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
+    future_1.wait()
+    future_2.wait()
     assert future_1.done()
     assert future_2.done()
 
@@ -697,9 +702,9 @@ def test_executor_describe_job_failed_permanently(mock_start, *args):
         future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
     with pytest.raises(RuntimeError):
-        future_1.done()
+        future_1.result()
     with pytest.raises(RuntimeError):
-        future_2.done()
+        future_2.result()
 
 
 @pytest.mark.parametrize(