feat: add TensorFlow and PyTorch versions for SM Training Compiler and expand to regular regions (#3045)

access2rohit · web-flow · commit 4b88ea8a0e83 · 2022-04-18T18:01:02.000-07:00
Update the TensorFlow and PyTorch versions for the training compiler and expand to regular regions
diff --git a/src/sagemaker/image_uri_config/huggingface-training-compiler.json b/src/sagemaker/image_uri_config/huggingface-training-compiler.json
@@ -2,7 +2,8 @@
     "training": {
         "processors": ["gpu"],
         "version_aliases": {
-            "4.11": "4.11.0"
+            "4.11": "4.11.0",
+            "4.17": "4.17.0"
         },
         "versions": {
             "4.11.0": {
@@ -32,6 +33,68 @@
                     "repository": "huggingface-tensorflow-trcomp-training",
                     "container_version": {"gpu":"cu112-ubuntu18.04"}
                 }
+            },
+            "4.17.0": {
+                "version_aliases": {
+                    "pytorch1.10": "pytorch1.10.2",
+                    "tensorflow2.6": "tensorflow2.6.3"
+                },
+                "pytorch1.10.2": {
+                    "py_versions": ["py38"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ca-central-1": "763104351884",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-pytorch-trcomp-training",
+                    "container_version": {"gpu":"cu113-ubuntu20.04"}
+                },
+                "tensorflow2.6.3": {
+                    "py_versions": ["py38"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ca-central-1": "763104351884",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-tensorflow-trcomp-training",
+                    "container_version": {"gpu":"cu112-ubuntu20.04"}
+                }
             }
         }
     }
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -248,6 +248,15 @@ def huggingface_training_compiler_tensorflow_version(huggingface_training_compil
     )[0]
 
 
+@pytest.fixture(scope="module")
+def huggingface_training_compiler_py_version(huggingface_training_compiler_tensorflow_version):
+    return (
+        "py37"
+        if Version(huggingface_training_compiler_tensorflow_version) < Version("2.6")
+        else "py38"
+    )
+
+
 @pytest.fixture(scope="module")
 def huggingface_pytorch_latest_training_py_version(huggingface_training_pytorch_latest_version):
     return (
diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py
@@ -158,6 +158,26 @@
 ]
 # TODO: SM Training Compiler team to add all supported regions.
 TRAINING_COMPILER_SUPPORTED_REGIONS = [
+    "af-south-1",
+    "ap-east-1",
+    "ap-northeast-1",
+    "ap-northeast-2",
+    "ap-northeast-3",
+    "ap-south-1",
+    "ap-southeast-1",
+    "ap-southeast-2",
+    "ca-central-1",
+    "eu-central-1",
+    "eu-north-1",
+    "eu-south-1",
+    "eu-west-1",
+    "eu-west-2",
+    "eu-west-3",
+    "me-south-1",
+    "sa-east-1",
+    "us-east-1",
+    "us-east-2",
+    "us-west-1",
     "us-west-2",
 ]
 # Data parallelism need to be tested with p3.16xlarge.
diff --git a/tests/integ/test_training_compiler.py b/tests/integ/test_training_compiler.py
@@ -88,7 +88,7 @@ def test_huggingface_tensorflow(
         data_path = os.path.join(DATA_DIR, "huggingface")
 
         hf = HuggingFace(
-            py_version="py37",
+            py_version="py38",
             entry_point=os.path.join(data_path, "run_tf.py"),
             role="SageMakerRole",
             transformers_version=huggingface_training_compiler_latest_version,
diff --git a/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py b/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py
@@ -158,9 +158,9 @@ def test_unsupported_BYOC(
 ):
     byoc = (
         "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
-        "1.9.0-"
-        "transformers4.10.2-gpu-"
-        "py38-cu111-ubuntu20.04"
+        "1.10.2-"
+        "transformers4.17.0-gpu-"
+        "py38-cu113-ubuntu20.04"
     )
     with pytest.raises(ValueError):
         HuggingFace(
@@ -451,9 +451,9 @@ def test_attach(
 ):
     training_image = (
         "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
-        "1.9.0-"
-        "transformers4.10.2-gpu-"
-        "py38-cu111-ubuntu20.04"
+        "1.10.2-"
+        "transformers4.17.0-gpu-"
+        "py38-cu113-ubuntu20.04"
     )
     returned_job_description = {
         "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
@@ -462,7 +462,7 @@ def test_attach(
             "sagemaker_program": '"iris-dnn-classifier.py"',
             "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
             "sagemaker_container_log_level": '"logging.INFO"',
-            "sagemaker_job_name": '"hopper"',
+            "sagemaker_job_name": '"trcomp"',
             "training_steps": "100",
             "sagemaker_region": '"us-east-1"',
             TrainingCompilerConfig.HP_ENABLE_COMPILER: json.dumps(compiler_enabled),
@@ -475,27 +475,27 @@ def test_attach(
             "InstanceType": "ml.p3.2xlarge",
         },
         "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
-        "TrainingJobName": "hopper",
+        "TrainingJobName": "trcomp",
         "TrainingJobStatus": "Completed",
-        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/hopper",
-        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/hopper"},
+        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/trcomp",
+        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/trcomp"},
         "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
     }
     sagemaker_session.sagemaker_client.describe_training_job = Mock(
         name="describe_training_job", return_value=returned_job_description
     )
 
-    estimator = HuggingFace.attach(training_job_name="hopper", sagemaker_session=sagemaker_session)
-    assert estimator.latest_training_job.job_name == "hopper"
+    estimator = HuggingFace.attach(training_job_name="trcomp", sagemaker_session=sagemaker_session)
+    assert estimator.latest_training_job.job_name == "trcomp"
     assert estimator.py_version == "py38"
-    assert estimator.framework_version == "4.10.2"
-    assert estimator.pytorch_version == "1.9.0"
+    assert estimator.framework_version == "4.17.0"
+    assert estimator.pytorch_version == "1.10.2"
     assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
     assert estimator.instance_count == 1
     assert estimator.max_run == 24 * 60 * 60
     assert estimator.input_mode == "File"
-    assert estimator.base_job_name == "hopper"
-    assert estimator.output_path == "s3://place/output/hopper"
+    assert estimator.base_job_name == "trcomp"
+    assert estimator.output_path == "s3://place/output/trcomp"
     assert estimator.output_kms_key == ""
     assert estimator.hyperparameters()["training_steps"] == "100"
     assert estimator.hyperparameters()[TrainingCompilerConfig.HP_ENABLE_COMPILER] == json.dumps(
diff --git a/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py b/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py