Skip to content

feature: Adding support for SageMaker Training Compiler PyTorch 1.13 #3629

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Feb 14, 2023
Merged
32 changes: 31 additions & 1 deletion src/sagemaker/image_uri_config/pytorch-training-compiler.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"gpu"
],
"version_aliases": {
"1.12": "1.12.0"
"1.12": "1.12.0",
"1.13": "1.13.1"
},
"versions": {
"1.12.0": {
Expand Down Expand Up @@ -35,6 +36,35 @@
"us-west-2": "763104351884"
},
"repository": "pytorch-trcomp-training"
},
"1.13.1": {
"py_versions": [
"py39"
],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ca-central-1": "763104351884",
"eu-central-1": "763104351884",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "pytorch-trcomp-training"
}
}
}
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,16 @@ def huggingface_pytorch_latest_training_py_version(
)


@pytest.fixture(scope="module")
def pytorch_training_compiler_py_version(
    pytorch_training_compiler_version,
):
    """Return the Python version tag to use with the given Training Compiler PyTorch version.

    Yields ``"py39"`` when ``pytorch_training_compiler_version`` compares
    strictly greater than ``1.12``; otherwise ``"py38"``.
    """
    # NOTE(review): assuming ``Version`` follows PEP 440 ordering
    # (packaging.version.Version), "1.12.0" is equal to "1.12" and maps to
    # py38, while any later 1.12.x patch release would map to py39 -- confirm
    # that is intended if such a release is ever added.
    newer_than_1_12 = Version(pytorch_training_compiler_version) > Version("1.12")
    if newer_than_1_12:
        return "py39"
    return "py38"


# TODO: Create a fixture that derives the latest py version from the Training Compiler (TRCOMP) image_uri config.


@pytest.fixture(scope="module")
def huggingface_pytorch_latest_inference_py_version(
huggingface_inference_pytorch_latest_version,
Expand Down
7 changes: 5 additions & 2 deletions tests/integ/test_training_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def test_pytorch(
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):

hf = PyTorch(
py_version="py38",
py_version="py39",
source_dir=os.path.join(DATA_DIR, "huggingface_byoc"),
entry_point="run_glue.py",
role="SageMakerRole",
Expand Down Expand Up @@ -216,7 +216,10 @@ def test_huggingface_tensorflow(

@pytest.mark.release
def test_tensorflow(
sagemaker_session, gpu_instance_type, tensorflow_training_latest_version, imagenet_val_set
sagemaker_session,
gpu_instance_type,
tensorflow_training_latest_version,
imagenet_val_set,
):
"""
Test the TensorFlow estimator
Expand Down
78 changes: 59 additions & 19 deletions tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,22 +83,26 @@ def fixture_sagemaker_session():
return session


def _get_full_gpu_image_uri(version, instance_type, training_compiler_config):
def _get_full_gpu_image_uri(version, instance_type, training_compiler_config, py_version):
return image_uris.retrieve(
"pytorch-training-compiler",
REGION,
version=version,
py_version="py38",
py_version=py_version,
instance_type=instance_type,
image_scope="training",
container_version=None,
training_compiler_config=training_compiler_config,
)


def _create_train_job(version, instance_type, training_compiler_config, instance_count=1):
def _create_train_job(
version, instance_type, training_compiler_config, py_version, instance_count=1
):
return {
"image_uri": _get_full_gpu_image_uri(version, instance_type, training_compiler_config),
"image_uri": _get_full_gpu_image_uri(
version, instance_type, training_compiler_config, py_version
),
"input_mode": "File",
"input_config": [
{
Expand Down Expand Up @@ -303,15 +307,20 @@ def test_unsupported_distribution(
@patch("time.time", return_value=TIME)
@pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES)
def test_pytorchxla_distribution(
time, name_from_base, sagemaker_session, pytorch_training_compiler_version, instance_class
time,
name_from_base,
sagemaker_session,
pytorch_training_compiler_version,
instance_class,
pytorch_training_compiler_py_version,
):
if Version(pytorch_training_compiler_version) < Version("1.12"):
pytest.skip("This test is intended for PyTorch 1.12 and above")
compiler_config = TrainingCompilerConfig()
instance_type = f"ml.{instance_class}.xlarge"

pt = PyTorch(
py_version="py38",
py_version=pytorch_training_compiler_py_version,
entry_point=SCRIPT_PATH,
role=ROLE,
sagemaker_session=sagemaker_session,
Expand All @@ -333,7 +342,11 @@ def test_pytorchxla_distribution(
assert boto_call_names == ["resource"]

expected_train_args = _create_train_job(
pytorch_training_compiler_version, instance_type, compiler_config, instance_count=2
pytorch_training_compiler_version,
instance_type,
compiler_config,
pytorch_training_compiler_py_version,
instance_count=2,
)
expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
expected_train_args["enable_sagemaker_metrics"] = False
Expand All @@ -357,13 +370,17 @@ def test_pytorchxla_distribution(
@patch("time.time", return_value=TIME)
@pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES)
def test_default_compiler_config(
time, name_from_base, sagemaker_session, pytorch_training_compiler_version, instance_class
time,
name_from_base,
sagemaker_session,
pytorch_training_compiler_version,
instance_class,
pytorch_training_compiler_py_version,
):
compiler_config = TrainingCompilerConfig()
instance_type = f"ml.{instance_class}.xlarge"

pt = PyTorch(
py_version="py38",
py_version=pytorch_training_compiler_py_version,
entry_point=SCRIPT_PATH,
role=ROLE,
sagemaker_session=sagemaker_session,
Expand All @@ -384,7 +401,10 @@ def test_default_compiler_config(
assert boto_call_names == ["resource"]

expected_train_args = _create_train_job(
pytorch_training_compiler_version, instance_type, compiler_config
pytorch_training_compiler_version,
instance_type,
compiler_config,
pytorch_training_compiler_py_version,
)
expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
expected_train_args["enable_sagemaker_metrics"] = False
Expand All @@ -406,12 +426,16 @@ def test_default_compiler_config(
@patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME)
@patch("time.time", return_value=TIME)
def test_debug_compiler_config(
time, name_from_base, sagemaker_session, pytorch_training_compiler_version
time,
name_from_base,
sagemaker_session,
pytorch_training_compiler_version,
pytorch_training_compiler_py_version,
):
compiler_config = TrainingCompilerConfig(debug=True)

pt = PyTorch(
py_version="py38",
py_version=pytorch_training_compiler_py_version,
entry_point=SCRIPT_PATH,
role=ROLE,
sagemaker_session=sagemaker_session,
Expand All @@ -432,7 +456,10 @@ def test_debug_compiler_config(
assert boto_call_names == ["resource"]

expected_train_args = _create_train_job(
pytorch_training_compiler_version, INSTANCE_TYPE, compiler_config
pytorch_training_compiler_version,
INSTANCE_TYPE,
compiler_config,
pytorch_training_compiler_py_version,
)
expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
expected_train_args["enable_sagemaker_metrics"] = False
Expand All @@ -454,12 +481,16 @@ def test_debug_compiler_config(
@patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME)
@patch("time.time", return_value=TIME)
def test_disable_compiler_config(
time, name_from_base, sagemaker_session, pytorch_training_compiler_version
time,
name_from_base,
sagemaker_session,
pytorch_training_compiler_version,
pytorch_training_compiler_py_version,
):
compiler_config = TrainingCompilerConfig(enabled=False)

pt = PyTorch(
py_version="py38",
py_version=pytorch_training_compiler_py_version,
entry_point=SCRIPT_PATH,
role=ROLE,
sagemaker_session=sagemaker_session,
Expand All @@ -480,7 +511,10 @@ def test_disable_compiler_config(
assert boto_call_names == ["resource"]

expected_train_args = _create_train_job(
pytorch_training_compiler_version, INSTANCE_TYPE, compiler_config
pytorch_training_compiler_version,
INSTANCE_TYPE,
compiler_config,
pytorch_training_compiler_py_version,
)
expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
expected_train_args["enable_sagemaker_metrics"] = False
Expand Down Expand Up @@ -508,7 +542,10 @@ def test_attach(sagemaker_session, compiler_enabled, debug_enabled):
"py38-cu113-ubuntu20.04"
)
returned_job_description = {
"AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
"AlgorithmSpecification": {
"TrainingInputMode": "File",
"TrainingImage": training_image,
},
"HyperParameters": {
"sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
"sagemaker_program": '"iris-dnn-classifier.py"',
Expand All @@ -530,7 +567,10 @@ def test_attach(sagemaker_session, compiler_enabled, debug_enabled):
"TrainingJobName": "trcomp",
"TrainingJobStatus": "Completed",
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/trcomp",
"OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/trcomp"},
"OutputDataConfig": {
"KmsKeyId": "",
"S3OutputPath": "s3://place/output/trcomp",
},
"TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
}
sagemaker_session.sagemaker_client.describe_training_job = Mock(
Expand Down