Skip to content

Commit b1c3e0c

Browse files
authored
Merge branch 'master' into PT113Release
2 parents ac747e5 + 5b01d66 commit b1c3e0c

File tree

7 files changed

+98
-23
lines changed

7 files changed

+98
-23
lines changed

doc/api/training/sdp_versions/latest.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ depending on the version of the library you use.
2626
<https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel-use-api.html#data-parallel-use-python-skd-api>`_
2727
for more information.
2828

29-
For versions between 1.4.0 and 1.7.0 (Latest)
29+
For versions between 1.4.0 and 1.8.0 (Latest)
3030
=============================================
3131

3232
.. toctree::

doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,64 @@ Release Notes
55
#############
66

77
New features, bug fixes, and improvements are regularly made to the SageMaker
8-
distributed data parallel library.
8+
data parallelism library.
99

10-
SageMaker Distributed Data Parallel 1.7.0 Release Notes
10+
SageMaker Distributed Data Parallel 1.8.0 Release Notes
1111
=======================================================
1212

13-
*Date: Feb. 10. 2023*
13+
*Date: Apr. 17. 2023*
1414

1515
**Currency Updates**
1616

17-
* Added support for PyTorch 1.13.1.
17+
* Added support for PyTorch 2.0.0.
1818

1919
**Migration to AWS Deep Learning Containers**
2020

2121
This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
2222

23-
- PyTorch 1.13.1 DLC
23+
- PyTorch 2.0.0 DLC
2424

2525
.. code::
2626
27-
763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
27+
763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker
2828
2929
Binary file of this version of the library for custom container users:
3030

3131
.. code::
3232
33-
https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.13.1/cu117/2023-01-09/smdistributed_dataparallel-1.7.0-cp39-cp39-linux_x86_64.whl
33+
https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.0.0/cu118/2023-03-20/smdistributed_dataparallel-1.8.0-cp310-cp310-linux_x86_64.whl
3434
3535
3636
----
3737

3838
Release History
3939
===============
4040

41+
SageMaker Distributed Data Parallel 1.7.0 Release Notes
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43+
44+
*Date: Feb. 10. 2023*
45+
46+
**Currency Updates**
47+
48+
* Added support for PyTorch 1.13.1.
49+
50+
**Migration to AWS Deep Learning Containers**
51+
52+
This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
53+
54+
- PyTorch 1.13.1 DLC
55+
56+
.. code::
57+
58+
763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
59+
60+
Binary file of this version of the library for custom container users:
61+
62+
.. code::
63+
64+
https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.13.1/cu117/2023-01-09/smdistributed_dataparallel-1.7.0-cp39-cp39-linux_x86_64.whl
65+
4166
SageMaker Distributed Data Parallel 1.6.0 Release Notes
4267
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4368

doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ see `For versions 1.1 and lower <#for-versions-1.1-and-lower>`_.
892892
| |--inference.py
893893
| |--requirements.txt
894894

895-
Where ``requirments.txt`` is an optional file that specifies dependencies on third-party libraries.
895+
Where ``requirements.txt`` is an optional file that specifies dependencies on third-party libraries.
896896

897897
Create a ``PyTorchModel`` object
898898
--------------------------------
src/sagemaker/image_uri_config/djl-neuronx.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"scope": ["inference"],
3+
"versions": {
4+
"0.22.1": {
5+
"registries": {
6+
"af-south-1": "626614931356",
7+
"ap-east-1": "871362719292",
8+
"ap-northeast-1": "763104351884",
9+
"ap-northeast-2": "763104351884",
10+
"ap-northeast-3": "364406365360",
11+
"ap-south-1": "763104351884",
12+
"ap-southeast-1": "763104351884",
13+
"ap-southeast-2": "763104351884",
14+
"ap-southeast-3": "907027046896",
15+
"ca-central-1": "763104351884",
16+
"cn-north-1": "727897471807",
17+
"cn-northwest-1": "727897471807",
18+
"eu-central-1": "763104351884",
19+
"eu-north-1": "763104351884",
20+
"eu-west-1": "763104351884",
21+
"eu-west-2": "763104351884",
22+
"eu-west-3": "763104351884",
23+
"eu-south-1": "692866216735",
24+
"me-south-1": "217643126080",
25+
"sa-east-1": "763104351884",
26+
"us-east-1": "763104351884",
27+
"us-east-2": "763104351884",
28+
"us-west-1": "763104351884",
29+
"us-west-2": "763104351884"
30+
},
31+
"repository": "djl-inference",
32+
"tag_prefix": "0.22.1-neuronx-sdk2.9.0"
33+
}
34+
}
35+
}

src/sagemaker/remote_function/client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,7 @@ def map(self, func, *iterables):
731731
futures = map(self.submit, itertools.repeat(func), *iterables)
732732
return [future.result() for future in futures]
733733

734-
def shutdown(self):
734+
def shutdown(self, wait=True):
735735
"""Prevent more function executions to be submitted to this executor."""
736736
with self._state_condition:
737737
self._shutdown = True
@@ -742,15 +742,15 @@ def shutdown(self):
742742
self._state_condition.notify_all()
743743

744744
if self._workers is not None:
745-
self._workers.shutdown(wait=True)
745+
self._workers.shutdown(wait)
746746

747747
def __enter__(self):
748748
"""Create an executor instance and return it"""
749749
return self
750750

751751
def __exit__(self, exc_type, exc_val, exc_tb):
752752
"""Make sure the executor instance is shutdown."""
753-
self.shutdown()
753+
self.shutdown(wait=False)
754754
return False
755755

756756
@staticmethod

tests/unit/sagemaker/image_uris/test_djl.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,17 @@
4343
}
4444
DJL_DEEPSPEED_VERSIONS = ["0.21.0", "0.20.0", "0.19.0"]
4545
DJL_FASTERTRANSFORMER_VERSIONS = ["0.21.0"]
46+
DJL_NEURONX_VERSIONS = ["0.22.1"]
4647
DJL_VERSIONS_TO_FRAMEWORK = {
4748
"0.19.0": {"djl-deepspeed": "deepspeed0.7.3-cu113"},
4849
"0.20.0": {"djl-deepspeed": "deepspeed0.7.5-cu116"},
4950
"0.21.0": {
5051
"djl-deepspeed": "deepspeed0.8.3-cu117",
5152
"djl-fastertransformer": "fastertransformer5.3.0-cu117",
5253
},
54+
"0.22.1": {
55+
"djl-neuronx": "neuronx-sdk2.9.0",
56+
},
5357
}
5458

5559

@@ -65,6 +69,12 @@ def test_djl_fastertransformer(region, version):
6569
_test_djl_uris(region, version, "djl-fastertransformer")
6670

6771

72+
@pytest.mark.parametrize("region", ACCOUNTS.keys())
73+
@pytest.mark.parametrize("version", DJL_NEURONX_VERSIONS)
74+
def test_djl_neuronx(region, version):
75+
_test_djl_uris(region, version, "djl-neuronx")
76+
77+
6878
def _test_djl_uris(region, version, djl_framework):
6979
uri = image_uris.retrieve(framework=djl_framework, region=region, version=version)
7080
expected = expected_uris.djl_framework_uri(

tests/unit/sagemaker/remote_function/test_client.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,11 @@ def test_executor_submit_happy_case(mock_start, mock_job_settings, parallelism):
509509
future_3 = e.submit(job_function, 9, 10, c=11, d=12)
510510
future_4 = e.submit(job_function, 13, 14, c=15, d=16)
511511

512+
future_1.wait()
513+
future_2.wait()
514+
future_3.wait()
515+
future_4.wait()
516+
512517
mock_start.assert_has_calls(
513518
[
514519
call(ANY, job_function, (1, 2), {"c": 3, "d": 4}, None),
@@ -517,10 +522,6 @@ def test_executor_submit_happy_case(mock_start, mock_job_settings, parallelism):
517522
call(ANY, job_function, (13, 14), {"c": 15, "d": 16}, None),
518523
]
519524
)
520-
mock_job_1.describe.assert_called()
521-
mock_job_2.describe.assert_called()
522-
mock_job_3.describe.assert_called()
523-
mock_job_4.describe.assert_called()
524525

525526
assert future_1.done()
526527
assert future_2.done()
@@ -545,14 +546,15 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
545546
future_1 = e.submit(job_function, 1, 2, c=3, d=4)
546547
future_2 = e.submit(job_function, 5, 6, c=7, d=8)
547548

549+
future_1.wait()
550+
future_2.wait()
551+
548552
mock_start.assert_has_calls(
549553
[
550554
call(ANY, job_function, (1, 2), {"c": 3, "d": 4}, run_info),
551555
call(ANY, job_function, (5, 6), {"c": 7, "d": 8}, run_info),
552556
]
553557
)
554-
mock_job_1.describe.assert_called()
555-
mock_job_2.describe.assert_called()
556558

557559
assert future_1.done()
558560
assert future_2.done()
@@ -562,14 +564,15 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
562564
future_3 = e.submit(job_function, 9, 10, c=11, d=12)
563565
future_4 = e.submit(job_function, 13, 14, c=15, d=16)
564566

567+
future_3.wait()
568+
future_4.wait()
569+
565570
mock_start.assert_has_calls(
566571
[
567572
call(ANY, job_function, (9, 10), {"c": 11, "d": 12}, run_info),
568573
call(ANY, job_function, (13, 14), {"c": 15, "d": 16}, run_info),
569574
]
570575
)
571-
mock_job_3.describe.assert_called()
572-
mock_job_4.describe.assert_called()
573576

574577
assert future_3.done()
575578
assert future_4.done()
@@ -621,7 +624,7 @@ def test_executor_fails_to_start_job(mock_start, *args):
621624

622625
with pytest.raises(TypeError):
623626
future_1.result()
624-
print(future_2._state)
627+
future_2.wait()
625628
assert future_2.done()
626629

627630

@@ -678,6 +681,8 @@ def test_executor_describe_job_throttled_temporarily(mock_start, *args):
678681
# submit second job
679682
future_2 = e.submit(job_function, 5, 6, c=7, d=8)
680683

684+
future_1.wait()
685+
future_2.wait()
681686
assert future_1.done()
682687
assert future_2.done()
683688

@@ -697,9 +702,9 @@ def test_executor_describe_job_failed_permanently(mock_start, *args):
697702
future_2 = e.submit(job_function, 5, 6, c=7, d=8)
698703

699704
with pytest.raises(RuntimeError):
700-
future_1.done()
705+
future_1.result()
701706
with pytest.raises(RuntimeError):
702-
future_2.done()
707+
future_2.result()
703708

704709

705710
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)