feature: Add PyTorch 1.13.1 to SDK (#3587)

Mike Schneider · Shibo Xing · web-flow · commit 25c49d497872 · 2023-01-25T16:53:36.000-08:00
Co-authored-by: Shibo Xing <shibox@amazon.com>
diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py
@@ -131,6 +131,7 @@
         "1.12",
         "1.12.0",
         "1.12.1",
+        "1.13.1",
     ],
 }
 
@@ -143,6 +144,7 @@
     "1.12",
     "1.12.0",
     "1.12.1",
+    "1.13.1",
 ]
 
 
diff --git a/src/sagemaker/image_uri_config/pytorch.json b/src/sagemaker/image_uri_config/pytorch.json
@@ -74,7 +74,8 @@
             "1.9": "1.9.1",
             "1.10": "1.10.2",
             "1.11": "1.11.0",
-            "1.12": "1.12.1"
+            "1.12": "1.12.1",
+            "1.13": "1.13.1"
         },
         "versions": {
             "0.4.0": {
@@ -783,6 +784,42 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-inference"
+            },
+            "1.13.1": {
+                "py_versions": [
+                    "py39"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-northeast-3": "364406365360",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ap-southeast-3": "907027046896",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-east-1": "446045086412",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-isob-east-1": "094389454867",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-inference"
             }
         }
     },
@@ -855,7 +892,8 @@
             "1.9": "1.9.1",
             "1.10": "1.10.2",
             "1.11": "1.11.0",
-            "1.12": "1.12.1"
+            "1.12": "1.12.1",
+            "1.13": "1.13.1"
         },
         "versions": {
             "0.4.0": {
@@ -1520,6 +1558,42 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-training"
+            },
+            "1.13.1": {
+                "py_versions": [
+                    "py39"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-northeast-3": "364406365360",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ap-southeast-3": "907027046896",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-east-1": "446045086412",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-isob-east-1": "094389454867",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-training"
             }
         }
     }
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -86,6 +86,8 @@
     "huggingface_training_compiler",
 )
 
+PYTORCH_RENEWED_GPU = "ml.g4dn.xlarge"
+
 
 def pytest_addoption(parser):
     parser.addoption("--sagemaker-client-config", action="store", default=None)
@@ -221,22 +223,26 @@ def mxnet_eia_latest_py_version():
 
 @pytest.fixture(scope="module", params=["py2", "py3"])
 def pytorch_training_py_version(pytorch_training_version, request):
-    if Version(pytorch_training_version) < Version("1.5.0"):
-        return request.param
+    if Version(pytorch_training_version) >= Version("1.13"):
+        return "py39"
     elif Version(pytorch_training_version) >= Version("1.9"):
         return "py38"
-    else:
+    elif Version(pytorch_training_version) >= Version("1.5.0"):
         return "py3"
+    else:
+        return request.param
 
 
 @pytest.fixture(scope="module", params=["py2", "py3"])
 def pytorch_inference_py_version(pytorch_inference_version, request):
-    if Version(pytorch_inference_version) < Version("1.4.0"):
-        return request.param
+    if Version(pytorch_inference_version) >= Version("1.13"):
+        return "py39"
     elif Version(pytorch_inference_version) >= Version("1.9"):
         return "py38"
-    else:
+    elif Version(pytorch_inference_version) >= Version("1.4.0"):
         return "py3"
+    else:
+        return request.param
 
 
 @pytest.fixture(scope="module")
@@ -252,9 +258,13 @@ def huggingface_pytorch_training_py_version(huggingface_pytorch_training_version
 
 
 @pytest.fixture(scope="module")
-def huggingface_training_compiler_pytorch_version(huggingface_training_compiler_version):
+def huggingface_training_compiler_pytorch_version(
+    huggingface_training_compiler_version,
+):
     versions = _huggingface_base_fm_version(
-        huggingface_training_compiler_version, "pytorch", "huggingface_training_compiler"
+        huggingface_training_compiler_version,
+        "pytorch",
+        "huggingface_training_compiler",
     )
     if not versions:
         pytest.skip(
@@ -265,9 +275,13 @@ def huggingface_training_compiler_pytorch_version(huggingface_training_compiler_
 
 
 @pytest.fixture(scope="module")
-def huggingface_training_compiler_tensorflow_version(huggingface_training_compiler_version):
+def huggingface_training_compiler_tensorflow_version(
+    huggingface_training_compiler_version,
+):
     versions = _huggingface_base_fm_version(
-        huggingface_training_compiler_version, "tensorflow", "huggingface_training_compiler"
+        huggingface_training_compiler_version,
+        "tensorflow",
+        "huggingface_training_compiler",
     )
     if not versions:
         pytest.skip(
@@ -289,19 +303,25 @@ def huggingface_training_compiler_tensorflow_py_version(
 
 
 @pytest.fixture(scope="module")
-def huggingface_training_compiler_pytorch_py_version(huggingface_training_compiler_pytorch_version):
+def huggingface_training_compiler_pytorch_py_version(
+    huggingface_training_compiler_pytorch_version,
+):
     return "py38"
 
 
 @pytest.fixture(scope="module")
-def huggingface_pytorch_latest_training_py_version(huggingface_training_pytorch_latest_version):
+def huggingface_pytorch_latest_training_py_version(
+    huggingface_training_pytorch_latest_version,
+):
     return (
         "py38" if Version(huggingface_training_pytorch_latest_version) >= Version("1.9") else "py36"
     )
 
 
 @pytest.fixture(scope="module")
-def huggingface_pytorch_latest_inference_py_version(huggingface_inference_pytorch_latest_version):
+def huggingface_pytorch_latest_inference_py_version(
+    huggingface_inference_pytorch_latest_version,
+):
     return (
         "py38"
         if Version(huggingface_inference_pytorch_latest_version) >= Version("1.9")
@@ -477,7 +497,8 @@ def pytorch_ddp_py_version():
 
 
 @pytest.fixture(
-    scope="module", params=["1.10", "1.10.0", "1.10.2", "1.11", "1.11.0", "1.12", "1.12.0"]
+    scope="module",
+    params=["1.10", "1.10.0", "1.10.2", "1.11", "1.11.0", "1.12", "1.12.0"],
 )
 def pytorch_ddp_framework_version(request):
     return request.param
@@ -511,6 +532,23 @@ def gpu_instance_type(sagemaker_session, request):
         return "ml.p3.2xlarge"
 
 
+@pytest.fixture()
+def gpu_pytorch_instance_type(sagemaker_session, request):
+    if "pytorch_inference_version" in request.fixturenames:
+        fw_version = request.getfixturevalue("pytorch_inference_version")
+    else:
+        fw_version = request.param
+
+    region = sagemaker_session.boto_session.region_name
+    if region in NO_P3_REGIONS:
+        if Version(fw_version) >= Version("1.13"):
+            return PYTORCH_RENEWED_GPU
+        else:
+            return "ml.p2.xlarge"
+    else:
+        return "ml.p3.2xlarge"
+
+
 @pytest.fixture(scope="session")
 def gpu_instance_type_list(sagemaker_session, request):
     region = sagemaker_session.boto_session.region_name
diff --git a/tests/unit/sagemaker/image_uris/test_dlc_frameworks.py b/tests/unit/sagemaker/image_uris/test_dlc_frameworks.py
@@ -18,6 +18,7 @@
 from tests.unit.sagemaker.image_uris import expected_uris
 
 INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.p2.xlarge", "gpu"))
+RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.g4dn.xlarge", "gpu"))
 REGION = "us-west-2"
 
 DLC_ACCOUNT = "763104351884"
@@ -70,7 +71,12 @@ def _test_image_uris(
         "image_scope": scope,
     }
 
-    for instance_type, processor in INSTANCE_TYPES_AND_PROCESSORS:
+    TYPES_AND_PROCESSORS = INSTANCE_TYPES_AND_PROCESSORS
+    if framework == "pytorch" and Version(fw_version) >= Version("1.13"):
+        """Handle P2 deprecation"""
+        TYPES_AND_PROCESSORS = RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS
+
+    for instance_type, processor in TYPES_AND_PROCESSORS:
         uri = image_uris.retrieve(region=REGION, instance_type=instance_type, **base_args)
 
         expected = expected_fn(processor=processor, **expected_fn_args)
diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py
@@ -912,6 +912,7 @@ def test_validate_smdataparallel_args_not_raises():
         ("ml.p3.16xlarge", "pytorch", "1.12.0", "py38", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.12.1", "py38", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.12", "py38", smdataparallel_enabled),
+        ("ml.p3.16xlarge", "pytorch", "1.13.1", "py39", smdataparallel_enabled),
         ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py37", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "tensorflow", "2.4.3", "py3", smdataparallel_enabled_custom_mpi),
@@ -932,6 +933,7 @@ def test_validate_smdataparallel_args_not_raises():
         ("ml.p3.16xlarge", "pytorch", "1.11.0", "py38", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "pytorch", "1.12.0", "py38", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "pytorch", "1.12.1", "py38", smdataparallel_enabled_custom_mpi),
+        ("ml.p3.16xlarge", "pytorch", "1.13.1", "py39", smdataparallel_enabled_custom_mpi),
     ]
     for instance_type, framework_name, framework_version, py_version, distribution in good_args:
         fw_utils._validate_smdataparallel_args(
diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py
@@ -302,7 +302,12 @@ def test_create_model_with_custom_image(name_from_base, sagemaker_session):
 @patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME)
 @patch("time.time", return_value=TIME)
 def test_pytorch(
-    time, name_from_base, sagemaker_session, pytorch_inference_version, pytorch_inference_py_version
+    time,
+    name_from_base,
+    sagemaker_session,
+    pytorch_inference_version,
+    pytorch_inference_py_version,
+    gpu_pytorch_instance_type,
 ):
     pytorch = PyTorch(
         entry_point=SCRIPT_PATH,
@@ -339,24 +344,29 @@ def test_pytorch(
         REGION,
         version=pytorch_inference_version,
         py_version=pytorch_inference_py_version,
-        instance_type=GPU,
+        instance_type=gpu_pytorch_instance_type,
         image_scope="inference",
     )
 
-    actual_environment = model.prepare_container_def(GPU)
+    actual_environment = model.prepare_container_def(gpu_pytorch_instance_type)
     submit_directory = actual_environment["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"]
     model_url = actual_environment["ModelDataUrl"]
     expected_environment = _get_environment(submit_directory, model_url, expected_image_uri)
     assert actual_environment == expected_environment
 
     assert "cpu" in model.prepare_container_def(CPU)["Image"]
-    predictor = pytorch.deploy(1, GPU)
+    predictor = pytorch.deploy(1, gpu_pytorch_instance_type)
     assert isinstance(predictor, PyTorchPredictor)
 
 
 @patch("sagemaker.utils.repack_model", MagicMock())
 @patch("sagemaker.utils.create_tar_file", MagicMock())
-def test_model(sagemaker_session, pytorch_inference_version, pytorch_inference_py_version):
+def test_model(
+    sagemaker_session,
+    pytorch_inference_version,
+    pytorch_inference_py_version,
+    gpu_pytorch_instance_type,
+):
     model = PyTorchModel(
         MODEL_DATA,
         role=ROLE,
@@ -365,21 +375,22 @@ def test_model(sagemaker_session, pytorch_inference_version, pytorch_inference_p
         py_version=pytorch_inference_py_version,
         sagemaker_session=sagemaker_session,
     )
-    predictor = model.deploy(1, GPU)
+    predictor = model.deploy(1, gpu_pytorch_instance_type)
     assert isinstance(predictor, PyTorchPredictor)
 
 
 @patch("sagemaker.utils.create_tar_file", MagicMock())
 @patch("sagemaker.utils.repack_model")
-def test_mms_model(repack_model, sagemaker_session):
+@pytest.mark.parametrize("gpu_pytorch_instance_type", ["1.2"], indirect=True)
+def test_mms_model(repack_model, sagemaker_session, gpu_pytorch_instance_type):
     PyTorchModel(
         MODEL_DATA,
         role=ROLE,
         entry_point=SCRIPT_PATH,
         sagemaker_session=sagemaker_session,
         framework_version="1.2",
         py_version="py3",
-    ).deploy(1, GPU)
+    ).deploy(1, gpu_pytorch_instance_type)
 
     repack_model.assert_called_with(
         dependencies=[],
@@ -428,6 +439,7 @@ def test_model_custom_serialization(
     sagemaker_session,
     pytorch_inference_version,
     pytorch_inference_py_version,
+    gpu_pytorch_instance_type,
 ):
     model = PyTorchModel(
         MODEL_DATA,
@@ -441,7 +453,7 @@ def test_model_custom_serialization(
     custom_deserializer = Mock()
     predictor = model.deploy(
         1,
-        GPU,
+        gpu_pytorch_instance_type,
         serializer=custom_serializer,
         deserializer=custom_deserializer,
     )

Original file line number	Diff line number	Diff line change
`@@ -131,6 +131,7 @@`
`131`	`131`	`"1.12",`
`132`	`132`	`"1.12.0",`
`133`	`133`	`"1.12.1",`
	`134`	`+ "1.13.1",`
`134`	`135`	`],`
`135`	`136`	`}`
`136`	`137`
`@@ -143,6 +144,7 @@`
`143`	`144`	`"1.12",`
`144`	`145`	`"1.12.0",`
`145`	`146`	`"1.12.1",`
	`147`	`+ "1.13.1",`
`146`	`148`	`]`
`147`	`149`
`148`	`150`