diff --git a/src/sagemaker/image_uri_config/pytorch-training-compiler.json b/src/sagemaker/image_uri_config/pytorch-training-compiler.json index fd7df875a3..719c7671ab 100644 --- a/src/sagemaker/image_uri_config/pytorch-training-compiler.json +++ b/src/sagemaker/image_uri_config/pytorch-training-compiler.json @@ -4,7 +4,8 @@ "gpu" ], "version_aliases": { - "1.12": "1.12.0" + "1.12": "1.12.0", + "1.13": "1.13.1" }, "versions": { "1.12.0": { @@ -35,6 +36,35 @@ "us-west-2": "763104351884" }, "repository": "pytorch-trcomp-training" + }, + "1.13.1": { + "py_versions": [ + "py39" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "pytorch-trcomp-training" } } } diff --git a/tests/conftest.py b/tests/conftest.py index 208cdcb221..0ba2dc8e16 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -318,6 +318,16 @@ def huggingface_pytorch_latest_training_py_version( ) +@pytest.fixture(scope="module") +def pytorch_training_compiler_py_version( + pytorch_training_compiler_version, +): + return "py39" if Version(pytorch_training_compiler_version) > Version("1.12") else "py38" + + +# TODO: Create a fixture to get the latest py version from TRCOMP image_uri. + + @pytest.fixture(scope="module") def huggingface_pytorch_latest_inference_py_version( huggingface_inference_pytorch_latest_version, diff --git a/tests/integ/test_training_compiler.py b/tests/integ/test_training_compiler.py index 724cd8890c..10bd809bc4 100644 --- a/tests/integ/test_training_compiler.py +++ b/tests/integ/test_training_compiler.py @@ -150,7 +150,7 @@ def test_pytorch( with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): hf = PyTorch( - py_version="py38", + py_version="py39", source_dir=os.path.join(DATA_DIR, "huggingface_byoc"), entry_point="run_glue.py", role="SageMakerRole", @@ -216,7 +216,10 @@ def test_huggingface_tensorflow( @pytest.mark.release def test_tensorflow( - sagemaker_session, gpu_instance_type, tensorflow_training_latest_version, imagenet_val_set + sagemaker_session, + gpu_instance_type, + tensorflow_training_latest_version, + imagenet_val_set, ): """ Test the TensorFlow estimator diff --git a/tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py b/tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py index 0b3f0e8de6..8bbed0bbec 100644 --- a/tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py +++ b/tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py @@ -83,12 +83,12 @@ def fixture_sagemaker_session(): return session -def _get_full_gpu_image_uri(version, instance_type, training_compiler_config): +def _get_full_gpu_image_uri(version, instance_type, training_compiler_config, py_version): return image_uris.retrieve( "pytorch-training-compiler", REGION, version=version, - py_version="py38", + py_version=py_version, instance_type=instance_type, image_scope="training", container_version=None, @@ -96,9 +96,13 @@ def _get_full_gpu_image_uri(version, instance_type, training_compiler_config): ) -def _create_train_job(version, instance_type, training_compiler_config, instance_count=1): +def _create_train_job( + version, instance_type, training_compiler_config, py_version, instance_count=1 +): return { - "image_uri": _get_full_gpu_image_uri(version, instance_type, training_compiler_config), + "image_uri": _get_full_gpu_image_uri( + version, instance_type, training_compiler_config, py_version + ), "input_mode": "File", "input_config": [ { @@ -303,7 +307,12 @@ def test_unsupported_distribution( @patch("time.time", return_value=TIME) @pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES) def test_pytorchxla_distribution( - time, name_from_base, sagemaker_session, pytorch_training_compiler_version, instance_class + time, + name_from_base, + sagemaker_session, + pytorch_training_compiler_version, + instance_class, + pytorch_training_compiler_py_version, ): if Version(pytorch_training_compiler_version) < Version("1.12"): pytest.skip("This test is intended for PyTorch 1.12 and above") @@ -311,7 +320,7 @@ def test_pytorchxla_distribution( instance_type = f"ml.{instance_class}.xlarge" pt = PyTorch( - py_version="py38", + py_version=pytorch_training_compiler_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -333,7 +342,11 @@ def test_pytorchxla_distribution( assert boto_call_names == ["resource"] expected_train_args = _create_train_job( - pytorch_training_compiler_version, instance_type, compiler_config, instance_count=2 + pytorch_training_compiler_version, + instance_type, + compiler_config, + pytorch_training_compiler_py_version, + instance_count=2, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -357,13 +370,17 @@ def test_pytorchxla_distribution( @patch("time.time", return_value=TIME) @pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES) def test_default_compiler_config( - time, name_from_base, sagemaker_session, pytorch_training_compiler_version, instance_class + time, + name_from_base, + sagemaker_session, + pytorch_training_compiler_version, + instance_class, + pytorch_training_compiler_py_version, ): compiler_config = TrainingCompilerConfig() instance_type = f"ml.{instance_class}.xlarge" - pt = PyTorch( - py_version="py38", + py_version=pytorch_training_compiler_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -384,7 +401,10 @@ def test_default_compiler_config( assert boto_call_names == ["resource"] expected_train_args = _create_train_job( - pytorch_training_compiler_version, instance_type, compiler_config + pytorch_training_compiler_version, + instance_type, + compiler_config, + pytorch_training_compiler_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -406,12 +426,16 @@ def test_default_compiler_config( @patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) @patch("time.time", return_value=TIME) def test_debug_compiler_config( - time, name_from_base, sagemaker_session, pytorch_training_compiler_version + time, + name_from_base, + sagemaker_session, + pytorch_training_compiler_version, + pytorch_training_compiler_py_version, ): compiler_config = TrainingCompilerConfig(debug=True) pt = PyTorch( - py_version="py38", + py_version=pytorch_training_compiler_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -432,7 +456,10 @@ def test_debug_compiler_config( assert boto_call_names == ["resource"] expected_train_args = _create_train_job( - pytorch_training_compiler_version, INSTANCE_TYPE, compiler_config + pytorch_training_compiler_version, + INSTANCE_TYPE, + compiler_config, + pytorch_training_compiler_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -454,12 +481,16 @@ def test_debug_compiler_config( @patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) @patch("time.time", return_value=TIME) def test_disable_compiler_config( - time, name_from_base, sagemaker_session, pytorch_training_compiler_version + time, + name_from_base, + sagemaker_session, + pytorch_training_compiler_version, + pytorch_training_compiler_py_version, ): compiler_config = TrainingCompilerConfig(enabled=False) pt = PyTorch( - py_version="py38", + py_version=pytorch_training_compiler_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -480,7 +511,10 @@ def test_disable_compiler_config( assert boto_call_names == ["resource"] expected_train_args = _create_train_job( - pytorch_training_compiler_version, INSTANCE_TYPE, compiler_config + pytorch_training_compiler_version, + INSTANCE_TYPE, + compiler_config, + pytorch_training_compiler_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -508,7 +542,10 @@ def test_attach(sagemaker_session, compiler_enabled, debug_enabled): "py38-cu113-ubuntu20.04" ) returned_job_description = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "AlgorithmSpecification": { + "TrainingInputMode": "File", + "TrainingImage": training_image, + }, "HyperParameters": { "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', "sagemaker_program": '"iris-dnn-classifier.py"', @@ -530,7 +567,10 @@ def test_attach(sagemaker_session, compiler_enabled, debug_enabled): "TrainingJobName": "trcomp", "TrainingJobStatus": "Completed", "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/trcomp", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/trcomp"}, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://place/output/trcomp", + }, "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, } sagemaker_session.sagemaker_client.describe_training_job = Mock(