Skip to content

Commit 4b88ea8

Browse files
authored
feat: add TensorFlow and PyTorch version for SM Training Compiler and expand to regular regions (#3045)
update tf version for training compiler and expand to regular regions
1 parent fd753c3 commit 4b88ea8

File tree

6 files changed

+629
-18
lines changed

6 files changed

+629
-18
lines changed

src/sagemaker/image_uri_config/huggingface-training-compiler.json

+64-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"training": {
33
"processors": ["gpu"],
44
"version_aliases": {
5-
"4.11": "4.11.0"
5+
"4.11": "4.11.0",
6+
"4.17": "4.17.0"
67
},
78
"versions": {
89
"4.11.0": {
@@ -32,6 +33,68 @@
3233
"repository": "huggingface-tensorflow-trcomp-training",
3334
"container_version": {"gpu":"cu112-ubuntu18.04"}
3435
}
36+
},
37+
"4.17.0": {
38+
"version_aliases": {
39+
"pytorch1.10": "pytorch1.10.2",
40+
"tensorflow2.6": "tensorflow2.6.3"
41+
},
42+
"pytorch1.10.2": {
43+
"py_versions": ["py38"],
44+
"registries": {
45+
"af-south-1": "626614931356",
46+
"ap-east-1": "871362719292",
47+
"ap-northeast-1": "763104351884",
48+
"ap-northeast-2": "763104351884",
49+
"ap-northeast-3": "364406365360",
50+
"ap-south-1": "763104351884",
51+
"ap-southeast-1": "763104351884",
52+
"ap-southeast-2": "763104351884",
53+
"ca-central-1": "763104351884",
54+
"eu-central-1": "763104351884",
55+
"eu-north-1": "763104351884",
56+
"eu-south-1": "692866216735",
57+
"eu-west-1": "763104351884",
58+
"eu-west-2": "763104351884",
59+
"eu-west-3": "763104351884",
60+
"me-south-1": "217643126080",
61+
"sa-east-1": "763104351884",
62+
"us-east-1": "763104351884",
63+
"us-east-2": "763104351884",
64+
"us-west-1": "763104351884",
65+
"us-west-2": "763104351884"
66+
},
67+
"repository": "huggingface-pytorch-trcomp-training",
68+
"container_version": {"gpu":"cu113-ubuntu20.04"}
69+
},
70+
"tensorflow2.6.3": {
71+
"py_versions": ["py38"],
72+
"registries": {
73+
"af-south-1": "626614931356",
74+
"ap-east-1": "871362719292",
75+
"ap-northeast-1": "763104351884",
76+
"ap-northeast-2": "763104351884",
77+
"ap-northeast-3": "364406365360",
78+
"ap-south-1": "763104351884",
79+
"ap-southeast-1": "763104351884",
80+
"ap-southeast-2": "763104351884",
81+
"ca-central-1": "763104351884",
82+
"eu-central-1": "763104351884",
83+
"eu-north-1": "763104351884",
84+
"eu-south-1": "692866216735",
85+
"eu-west-1": "763104351884",
86+
"eu-west-2": "763104351884",
87+
"eu-west-3": "763104351884",
88+
"me-south-1": "217643126080",
89+
"sa-east-1": "763104351884",
90+
"us-east-1": "763104351884",
91+
"us-east-2": "763104351884",
92+
"us-west-1": "763104351884",
93+
"us-west-2": "763104351884"
94+
},
95+
"repository": "huggingface-tensorflow-trcomp-training",
96+
"container_version": {"gpu":"cu112-ubuntu20.04"}
97+
}
3598
}
3699
}
37100
}

tests/conftest.py

+9
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,15 @@ def huggingface_training_compiler_tensorflow_version(huggingface_training_compil
248248
)[0]
249249

250250

251+
@pytest.fixture(scope="module")
252+
def huggingface_training_compiler_py_version(huggingface_training_compiler_tensorflow_version):
253+
return (
254+
"py37"
255+
if Version(huggingface_training_compiler_tensorflow_version) < Version("2.6")
256+
else "py38"
257+
)
258+
259+
251260
@pytest.fixture(scope="module")
252261
def huggingface_pytorch_latest_training_py_version(huggingface_training_pytorch_latest_version):
253262
return (

tests/integ/__init__.py

+20
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,26 @@
158158
]
159159
# TODO: SM Training Compiler team to add all supported regions.
160160
TRAINING_COMPILER_SUPPORTED_REGIONS = [
161+
"af-south-1",
162+
"ap-east-1",
163+
"ap-northeast-1",
164+
"ap-northeast-2",
165+
"ap-northeast-3",
166+
"ap-south-1",
167+
"ap-southeast-1",
168+
"ap-southeast-2",
169+
"ca-central-1",
170+
"eu-central-1",
171+
"eu-north-1",
172+
"eu-south-1",
173+
"eu-west-1",
174+
"eu-west-2",
175+
"eu-west-3",
176+
"me-south-1",
177+
"sa-east-1",
178+
"us-east-1",
179+
"us-east-2",
180+
"us-west-1",
161181
"us-west-2",
162182
]
163183
# Data parallelism needs to be tested with p3.16xlarge.

tests/integ/test_training_compiler.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_huggingface_tensorflow(
8888
data_path = os.path.join(DATA_DIR, "huggingface")
8989

9090
hf = HuggingFace(
91-
py_version="py37",
91+
py_version="py38",
9292
entry_point=os.path.join(data_path, "run_tf.py"),
9393
role="SageMakerRole",
9494
transformers_version=huggingface_training_compiler_latest_version,

tests/unit/sagemaker/training_compiler/test_huggingface_compiler.py renamed to tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,9 @@ def test_unsupported_BYOC(
158158
):
159159
byoc = (
160160
"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
161-
"1.9.0-"
162-
"transformers4.10.2-gpu-"
163-
"py38-cu111-ubuntu20.04"
161+
"1.10.2-"
162+
"transformers4.17.0-gpu-"
163+
"py38-cu113-ubuntu20.04"
164164
)
165165
with pytest.raises(ValueError):
166166
HuggingFace(
@@ -451,9 +451,9 @@ def test_attach(
451451
):
452452
training_image = (
453453
"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
454-
"1.9.0-"
455-
"transformers4.10.2-gpu-"
456-
"py38-cu111-ubuntu20.04"
454+
"1.10.2-"
455+
"transformers4.17.0-gpu-"
456+
"py38-cu113-ubuntu20.04"
457457
)
458458
returned_job_description = {
459459
"AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
@@ -462,7 +462,7 @@ def test_attach(
462462
"sagemaker_program": '"iris-dnn-classifier.py"',
463463
"sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
464464
"sagemaker_container_log_level": '"logging.INFO"',
465-
"sagemaker_job_name": '"hopper"',
465+
"sagemaker_job_name": '"trcomp"',
466466
"training_steps": "100",
467467
"sagemaker_region": '"us-east-1"',
468468
TrainingCompilerConfig.HP_ENABLE_COMPILER: json.dumps(compiler_enabled),
@@ -475,27 +475,27 @@ def test_attach(
475475
"InstanceType": "ml.p3.2xlarge",
476476
},
477477
"StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
478-
"TrainingJobName": "hopper",
478+
"TrainingJobName": "trcomp",
479479
"TrainingJobStatus": "Completed",
480-
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/hopper",
481-
"OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/hopper"},
480+
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/trcomp",
481+
"OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/trcomp"},
482482
"TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
483483
}
484484
sagemaker_session.sagemaker_client.describe_training_job = Mock(
485485
name="describe_training_job", return_value=returned_job_description
486486
)
487487

488-
estimator = HuggingFace.attach(training_job_name="hopper", sagemaker_session=sagemaker_session)
489-
assert estimator.latest_training_job.job_name == "hopper"
488+
estimator = HuggingFace.attach(training_job_name="trcomp", sagemaker_session=sagemaker_session)
489+
assert estimator.latest_training_job.job_name == "trcomp"
490490
assert estimator.py_version == "py38"
491-
assert estimator.framework_version == "4.10.2"
492-
assert estimator.pytorch_version == "1.9.0"
491+
assert estimator.framework_version == "4.17.0"
492+
assert estimator.pytorch_version == "1.10.2"
493493
assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
494494
assert estimator.instance_count == 1
495495
assert estimator.max_run == 24 * 60 * 60
496496
assert estimator.input_mode == "File"
497-
assert estimator.base_job_name == "hopper"
498-
assert estimator.output_path == "s3://place/output/hopper"
497+
assert estimator.base_job_name == "trcomp"
498+
assert estimator.output_path == "s3://place/output/trcomp"
499499
assert estimator.output_kms_key == ""
500500
assert estimator.hyperparameters()["training_steps"] == "100"
501501
assert estimator.hyperparameters()[TrainingCompilerConfig.HP_ENABLE_COMPILER] == json.dumps(

0 commit comments

Comments
 (0)