Skip to content

feature: Add PyTorch 1.13.1 to SDK #3587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f34555b
add PT 1.13.1 Inference
Jan 12, 2023
d4cdf64
chore: add PT-1.13.1 for tr
Jan 23, 2023
67eb6b2
Merge pull request #2 from ShiboXing/add-pt1.13.1-training
Jan 23, 2023
bf203e3
Merge branch 'master' into add-pt1.13.1-inference
Jan 24, 2023
238f1b8
test: update pytorch training py_version configuration
Jan 24, 2023
9b25f3f
solve py38 issue and training addition
Jan 24, 2023
1a6ad3c
refactor get py version
Jan 24, 2023
4189054
added comma
Jan 24, 2023
d598c07
adding PT 1.13.1 to test_fw_utils
Jan 24, 2023
9109fa1
adding PT 1.13.1 for smdataparallel.
Jan 24, 2023
ac5471f
remove 1.13 and only use 1.13.1 in test_fw_utils
Jan 24, 2023
8cb53db
Merge branch 'master' into add-pt1.13.1-inference
Jan 25, 2023
a6ad8ba
Merge branch 'master' into add-pt1.13.1-inference
Jan 25, 2023
42d8b6f
Merge branch 'add-pt1.13.1-inference' of https://github.com/xncqr/sag…
Jan 25, 2023
53e1a83
fix: p2 issue for test_pytorch_training
Jan 25, 2023
32f37d1
fix: 1.4.0 missing py_version for inference
Jan 25, 2023
d3a7822
fix: p2 error in unit test with a fixture
Jan 25, 2023
3968264
Merge pull request #3 from ShiboXing/add-pt1.13.1-training
Jan 25, 2023
44e0f18
fix: format with black
Jan 25, 2023
3099993
Merge pull request #4 from ShiboXing/add-pt1.13.1-training
Jan 25, 2023
5f1d22a
fix: remove us-isob-east-1 for the latest fw
Jan 25, 2023
62dce63
Merge pull request #5 from ShiboXing/add-pt1.13.1-training
Jan 25, 2023
dcef8d4
add isob to PT 1.13.1 training and inference
Jan 25, 2023
b0cbaf0
correct PT FW version for python logic
Jan 25, 2023
64bb0e8
remove py2 from PT 1.4.0 Inference. accidental add
Jan 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/sagemaker/fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
"1.12",
"1.12.0",
"1.12.1",
"1.13.1",
],
}

Expand All @@ -143,6 +144,7 @@
"1.12",
"1.12.0",
"1.12.1",
"1.13.1",
]


Expand Down
78 changes: 76 additions & 2 deletions src/sagemaker/image_uri_config/pytorch.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@
"1.9": "1.9.1",
"1.10": "1.10.2",
"1.11": "1.11.0",
"1.12": "1.12.1"
"1.12": "1.12.1",
"1.13": "1.13.1"
},
"versions": {
"0.4.0": {
Expand Down Expand Up @@ -783,6 +784,42 @@
"us-west-2": "763104351884"
},
"repository": "pytorch-inference"
},
"1.13.1": {
"py_versions": [
"py39"
],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ap-southeast-3": "907027046896",
"ca-central-1": "763104351884",
"cn-north-1": "727897471807",
"cn-northwest-1": "727897471807",
"eu-central-1": "763104351884",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-gov-east-1": "446045086412",
"us-gov-west-1": "442386744353",
"us-iso-east-1": "886529160074",
"us-isob-east-1": "094389454867",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "pytorch-inference"
}
}
},
Expand Down Expand Up @@ -855,7 +892,8 @@
"1.9": "1.9.1",
"1.10": "1.10.2",
"1.11": "1.11.0",
"1.12": "1.12.1"
"1.12": "1.12.1",
"1.13": "1.13.1"
},
"versions": {
"0.4.0": {
Expand Down Expand Up @@ -1520,6 +1558,42 @@
"us-west-2": "763104351884"
},
"repository": "pytorch-training"
},
"1.13.1": {
"py_versions": [
"py39"
],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ap-southeast-3": "907027046896",
"ca-central-1": "763104351884",
"cn-north-1": "727897471807",
"cn-northwest-1": "727897471807",
"eu-central-1": "763104351884",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-gov-east-1": "446045086412",
"us-gov-west-1": "442386744353",
"us-iso-east-1": "886529160074",
"us-isob-east-1": "094389454867",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "pytorch-training"
}
}
}
Expand Down
66 changes: 52 additions & 14 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@
"huggingface_training_compiler",
)

PYTORCH_RENEWED_GPU = "ml.g4dn.xlarge"


def pytest_addoption(parser):
parser.addoption("--sagemaker-client-config", action="store", default=None)
Expand Down Expand Up @@ -221,22 +223,26 @@ def mxnet_eia_latest_py_version():

@pytest.fixture(scope="module", params=["py2", "py3"])
def pytorch_training_py_version(pytorch_training_version, request):
    """Return the Python version tag matching a PyTorch *training* framework version.

    Checks newest versions first (they are exercised most often):
      * >= 1.13   -> "py39"
      * >= 1.9    -> "py38"
      * >= 1.5.0  -> "py3"
      * older     -> the parametrized value ("py2" or "py3")
    """
    if Version(pytorch_training_version) >= Version("1.13"):
        return "py39"
    elif Version(pytorch_training_version) >= Version("1.9"):
        return "py38"
    elif Version(pytorch_training_version) >= Version("1.5.0"):
        return "py3"
    else:
        return request.param
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a significant change in default behavior of this function pytorch_training_py_version.

  1. Insert the following code after line 225 in original code.
    if Version(pytorch_training_version) >= Version("1.13"):
    return "py39"
  2. The default return value of the function should still be set to "py3".

Copy link
Author

@ghost ghost Jan 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It isn't. The difference is smoother logic processing with the same results as before, plus the addition of py39 for 1.13 and above. The old logic was in reverse: it checked lower versions before newer ones. This order has fewer logic hits, since newer versions are used more often than older ones. Anything under 1.5.0 still gets request.param; between 1.5.0 and 1.9 uses py3; 1.9 and above uses py38, same as before. The new case is 1.13 and above, which uses py39. This is valid and tested. Also, this is training, and the code is training's code, which is used with inference as well.
Did notice it should be 1.5.0 and not just 1.5 — fixing that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I agree that this is a refactoring and doesn't change the code meaning. And sorry about the wrong lower version. The Version("1.5") and Version("1.5.0") have the same effect though.



@pytest.fixture(scope="module", params=["py2", "py3"])
def pytorch_inference_py_version(pytorch_inference_version, request):
    """Return the Python version tag matching a PyTorch *inference* framework version.

    Same ladder as the training fixture, except the lowest cutoff is
    1.4.0 for inference (training uses 1.5.0):
      * >= 1.13   -> "py39"
      * >= 1.9    -> "py38"
      * >= 1.4.0  -> "py3"
      * older     -> the parametrized value ("py2" or "py3")
    """
    if Version(pytorch_inference_version) >= Version("1.13"):
        return "py39"
    elif Version(pytorch_inference_version) >= Version("1.9"):
        return "py38"
    elif Version(pytorch_inference_version) >= Version("1.4.0"):
        return "py3"
    else:
        return request.param


@pytest.fixture(scope="module")
Expand All @@ -252,9 +258,13 @@ def huggingface_pytorch_training_py_version(huggingface_pytorch_training_version


@pytest.fixture(scope="module")
def huggingface_training_compiler_pytorch_version(huggingface_training_compiler_version):
def huggingface_training_compiler_pytorch_version(
huggingface_training_compiler_version,
):
versions = _huggingface_base_fm_version(
huggingface_training_compiler_version, "pytorch", "huggingface_training_compiler"
huggingface_training_compiler_version,
"pytorch",
"huggingface_training_compiler",
)
if not versions:
pytest.skip(
Expand All @@ -265,9 +275,13 @@ def huggingface_training_compiler_pytorch_version(huggingface_training_compiler_


@pytest.fixture(scope="module")
def huggingface_training_compiler_tensorflow_version(huggingface_training_compiler_version):
def huggingface_training_compiler_tensorflow_version(
huggingface_training_compiler_version,
):
versions = _huggingface_base_fm_version(
huggingface_training_compiler_version, "tensorflow", "huggingface_training_compiler"
huggingface_training_compiler_version,
"tensorflow",
"huggingface_training_compiler",
)
if not versions:
pytest.skip(
Expand All @@ -289,19 +303,25 @@ def huggingface_training_compiler_tensorflow_py_version(


@pytest.fixture(scope="module")
def huggingface_training_compiler_pytorch_py_version(huggingface_training_compiler_pytorch_version):
def huggingface_training_compiler_pytorch_py_version(
huggingface_training_compiler_pytorch_version,
):
return "py38"


@pytest.fixture(scope="module")
def huggingface_pytorch_latest_training_py_version(huggingface_training_pytorch_latest_version):
def huggingface_pytorch_latest_training_py_version(
huggingface_training_pytorch_latest_version,
):
return (
"py38" if Version(huggingface_training_pytorch_latest_version) >= Version("1.9") else "py36"
)


@pytest.fixture(scope="module")
def huggingface_pytorch_latest_inference_py_version(huggingface_inference_pytorch_latest_version):
def huggingface_pytorch_latest_inference_py_version(
huggingface_inference_pytorch_latest_version,
):
return (
"py38"
if Version(huggingface_inference_pytorch_latest_version) >= Version("1.9")
Expand Down Expand Up @@ -477,7 +497,8 @@ def pytorch_ddp_py_version():


@pytest.fixture(
scope="module", params=["1.10", "1.10.0", "1.10.2", "1.11", "1.11.0", "1.12", "1.12.0"]
scope="module",
params=["1.10", "1.10.0", "1.10.2", "1.11", "1.11.0", "1.12", "1.12.0"],
)
def pytorch_ddp_framework_version(request):
return request.param
Expand Down Expand Up @@ -511,6 +532,23 @@ def gpu_instance_type(sagemaker_session, request):
return "ml.p3.2xlarge"


@pytest.fixture()
def gpu_pytorch_instance_type(sagemaker_session, request):
    """Pick a GPU instance type appropriate for the PyTorch version under test.

    The framework version comes from the ``pytorch_inference_version`` fixture
    when it participates in the test, otherwise from this fixture's own param.
    Regions listed in ``NO_P3_REGIONS`` fall back to p2 — or to the renewed
    GPU type for PyTorch >= 1.13 (P2 deprecation) — while all other regions
    use p3.
    """
    if "pytorch_inference_version" in request.fixturenames:
        framework_version = request.getfixturevalue("pytorch_inference_version")
    else:
        framework_version = request.param

    current_region = sagemaker_session.boto_session.region_name
    if current_region not in NO_P3_REGIONS:
        return "ml.p3.2xlarge"
    if Version(framework_version) >= Version("1.13"):
        return PYTORCH_RENEWED_GPU
    return "ml.p2.xlarge"


@pytest.fixture(scope="session")
def gpu_instance_type_list(sagemaker_session, request):
region = sagemaker_session.boto_session.region_name
Expand Down
8 changes: 7 additions & 1 deletion tests/unit/sagemaker/image_uris/test_dlc_frameworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from tests.unit.sagemaker.image_uris import expected_uris

INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.p2.xlarge", "gpu"))
RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.g4dn.xlarge", "gpu"))
REGION = "us-west-2"

DLC_ACCOUNT = "763104351884"
Expand Down Expand Up @@ -70,7 +71,12 @@ def _test_image_uris(
"image_scope": scope,
}

for instance_type, processor in INSTANCE_TYPES_AND_PROCESSORS:
TYPES_AND_PROCESSORS = INSTANCE_TYPES_AND_PROCESSORS
if framework == "pytorch" and Version(fw_version) >= Version("1.13"):
"""Handle P2 deprecation"""
TYPES_AND_PROCESSORS = RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS

for instance_type, processor in TYPES_AND_PROCESSORS:
uri = image_uris.retrieve(region=REGION, instance_type=instance_type, **base_args)

expected = expected_fn(processor=processor, **expected_fn_args)
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/test_fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,7 @@ def test_validate_smdataparallel_args_not_raises():
("ml.p3.16xlarge", "pytorch", "1.12.0", "py38", smdataparallel_enabled),
("ml.p3.16xlarge", "pytorch", "1.12.1", "py38", smdataparallel_enabled),
("ml.p3.16xlarge", "pytorch", "1.12", "py38", smdataparallel_enabled),
("ml.p3.16xlarge", "pytorch", "1.13.1", "py39", smdataparallel_enabled),
("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
("ml.p3.16xlarge", "tensorflow", "2.4.1", "py37", smdataparallel_enabled_custom_mpi),
("ml.p3.16xlarge", "tensorflow", "2.4.3", "py3", smdataparallel_enabled_custom_mpi),
Expand All @@ -932,6 +933,7 @@ def test_validate_smdataparallel_args_not_raises():
("ml.p3.16xlarge", "pytorch", "1.11.0", "py38", smdataparallel_enabled_custom_mpi),
("ml.p3.16xlarge", "pytorch", "1.12.0", "py38", smdataparallel_enabled_custom_mpi),
("ml.p3.16xlarge", "pytorch", "1.12.1", "py38", smdataparallel_enabled_custom_mpi),
("ml.p3.16xlarge", "pytorch", "1.13.1", "py39", smdataparallel_enabled_custom_mpi),
]
for instance_type, framework_name, framework_version, py_version, distribution in good_args:
fw_utils._validate_smdataparallel_args(
Expand Down
30 changes: 21 additions & 9 deletions tests/unit/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,12 @@ def test_create_model_with_custom_image(name_from_base, sagemaker_session):
@patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME)
@patch("time.time", return_value=TIME)
def test_pytorch(
time, name_from_base, sagemaker_session, pytorch_inference_version, pytorch_inference_py_version
time,
name_from_base,
sagemaker_session,
pytorch_inference_version,
pytorch_inference_py_version,
gpu_pytorch_instance_type,
):
pytorch = PyTorch(
entry_point=SCRIPT_PATH,
Expand Down Expand Up @@ -339,24 +344,29 @@ def test_pytorch(
REGION,
version=pytorch_inference_version,
py_version=pytorch_inference_py_version,
instance_type=GPU,
instance_type=gpu_pytorch_instance_type,
image_scope="inference",
)

actual_environment = model.prepare_container_def(GPU)
actual_environment = model.prepare_container_def(gpu_pytorch_instance_type)
submit_directory = actual_environment["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"]
model_url = actual_environment["ModelDataUrl"]
expected_environment = _get_environment(submit_directory, model_url, expected_image_uri)
assert actual_environment == expected_environment

assert "cpu" in model.prepare_container_def(CPU)["Image"]
predictor = pytorch.deploy(1, GPU)
predictor = pytorch.deploy(1, gpu_pytorch_instance_type)
assert isinstance(predictor, PyTorchPredictor)


@patch("sagemaker.utils.repack_model", MagicMock())
@patch("sagemaker.utils.create_tar_file", MagicMock())
def test_model(sagemaker_session, pytorch_inference_version, pytorch_inference_py_version):
def test_model(
sagemaker_session,
pytorch_inference_version,
pytorch_inference_py_version,
gpu_pytorch_instance_type,
):
model = PyTorchModel(
MODEL_DATA,
role=ROLE,
Expand All @@ -365,21 +375,22 @@ def test_model(sagemaker_session, pytorch_inference_version, pytorch_inference_p
py_version=pytorch_inference_py_version,
sagemaker_session=sagemaker_session,
)
predictor = model.deploy(1, GPU)
predictor = model.deploy(1, gpu_pytorch_instance_type)
assert isinstance(predictor, PyTorchPredictor)


@patch("sagemaker.utils.create_tar_file", MagicMock())
@patch("sagemaker.utils.repack_model")
def test_mms_model(repack_model, sagemaker_session):
@pytest.mark.parametrize("gpu_pytorch_instance_type", ["1.2"], indirect=True)
def test_mms_model(repack_model, sagemaker_session, gpu_pytorch_instance_type):
PyTorchModel(
MODEL_DATA,
role=ROLE,
entry_point=SCRIPT_PATH,
sagemaker_session=sagemaker_session,
framework_version="1.2",
py_version="py3",
).deploy(1, GPU)
).deploy(1, gpu_pytorch_instance_type)

repack_model.assert_called_with(
dependencies=[],
Expand Down Expand Up @@ -428,6 +439,7 @@ def test_model_custom_serialization(
sagemaker_session,
pytorch_inference_version,
pytorch_inference_py_version,
gpu_pytorch_instance_type,
):
model = PyTorchModel(
MODEL_DATA,
Expand All @@ -441,7 +453,7 @@ def test_model_custom_serialization(
custom_deserializer = Mock()
predictor = model.deploy(
1,
GPU,
gpu_pytorch_instance_type,
serializer=custom_serializer,
deserializer=custom_deserializer,
)
Expand Down