From f7e91ef01cec82e3e158f2f3e8d7d8e7f8351f82 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Tue, 14 Sep 2021 16:14:59 -0700 Subject: [PATCH 01/13] added pytorch 1.8.1 for supporting huggingface --- .../image_uri_config/huggingface.json | 79 +++++++++++++++++-- src/sagemaker/image_uris.py | 27 ++++++- 2 files changed, 100 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index 17d4b38c81..0d6cf52892 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -178,7 +178,8 @@ "us-west-1": "763104351884", "us-west-2": "763104351884" }, - "repository": "huggingface-pytorch-training" + "repository": "huggingface-pytorch-training", + "container_version": {"gpu":"cu110-ubuntu18.04"} }, "pytorch1.7.1": { "py_versions": ["py36"], @@ -209,7 +210,40 @@ "us-west-1": "763104351884", "us-west-2": "763104351884" }, - "repository": "huggingface-pytorch-training" + "repository": "huggingface-pytorch-training", + "container_version": {"gpu":"cu110-ubuntu18.04"} + }, + "pytorch1.8.1": { + "py_versions": ["py36"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": 
"763104351884" + }, + "repository": "huggingface-pytorch-training", + "container_version": {"gpu":"cu111-ubuntu18.04"} }, "tensorflow2.4.1": { "py_versions": ["py37"], @@ -240,7 +274,8 @@ "us-west-1": "763104351884", "us-west-2": "763104351884" }, - "repository": "huggingface-tensorflow-training" + "repository": "huggingface-tensorflow-training", + "container_version": {"gpu":"cu110-ubuntu18.04"} } } } @@ -286,7 +321,40 @@ "us-west-1": "763104351884", "us-west-2": "763104351884" }, - "repository": "huggingface-pytorch-inference" + "repository": "huggingface-pytorch-inference", + "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" } + }, + "pytorch1.8.1": { + "py_versions": ["py36"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-pytorch-inference", + "container_version": {"gpu":"cu111-ubuntu18.04", "cpu":"ubuntu18.04" } }, "tensorflow2.4.1": { "py_versions": ["py37"], @@ -317,7 +385,8 @@ "us-west-1": "763104351884", "us-west-2": "763104351884" }, - "repository": "huggingface-tensorflow-inference" + "repository": "huggingface-tensorflow-inference", + "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" } } } } diff --git 
a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 3d511403ed..f02974cb48 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -17,9 +17,12 @@ import logging import os import re +import pdb from sagemaker import utils from sagemaker.spark import defaults +from sagemaker.spark import defaults + logger = logging.getLogger(__name__) @@ -39,7 +42,10 @@ def retrieve( distribution=None, base_framework_version=None, ): + """Retrieves the ECR URI for the Docker image matching the given arguments. + Ideally this function should not be called directly, rather it should be called from the + fit() function inside framework estimator. Args: framework (str): The name of the framework or algorithm. @@ -56,7 +62,11 @@ def retrieve( image_scope (str): The image type, i.e. what it is used for. Valid values: "training", "inference", "eia". If ``accelerator_type`` is set, ``image_scope`` is ignored. - container_version (str): the version of docker image + container_version (str): the version of docker image. + Ideally the value of parameter is should be created inside the framework. + For custom use, see the list of supported container versions: + https://github.com/aws/deep-learning-containers/blob/master/available_images.md + (default: None). distribution (dict): A dictionary with information on how to run distributed training (default: None). @@ -66,10 +76,12 @@ def retrieve( Raises: ValueError: If the combination of arguments specified is not supported. 
""" + config = _config_for_framework_and_scope(framework, image_scope, accelerator_type) original_version = version version = _validate_version_and_set_if_needed(version, config, framework) version_config = config["versions"][_version_for_config(version, config)] + if framework == HUGGING_FACE_FRAMEWORK: if version_config.get("version_aliases"): full_base_framework_version = version_config["version_aliases"].get( @@ -79,9 +91,12 @@ def retrieve( _validate_arg(full_base_framework_version, list(version_config.keys()), "base framework") version_config = version_config.get(full_base_framework_version) + py_version = _validate_py_version_and_set_if_needed(py_version, version_config, framework) version_config = version_config.get(py_version) or version_config + + registry = _registry_from_region(region, version_config["registries"]) hostname = utils._botocore_resolver().construct_endpoint("ecr", region)["hostname"] @@ -90,12 +105,16 @@ def retrieve( processor = _processor( instance_type, config.get("processors") or version_config.get("processors") ) + #if container version is available in .json file, utilize that + if "container_version" in version_config.keys(): + container_version = version_config['container_version'][processor] if framework == HUGGING_FACE_FRAMEWORK: pt_or_tf_version = ( re.compile("^(pytorch|tensorflow)(.*)$").match(base_framework_version).group(2) ) tag_prefix = f"{pt_or_tf_version}-transformers{original_version}" + else: tag_prefix = version_config.get("tag_prefix", version) @@ -105,6 +124,8 @@ def retrieve( py_version, container_version, ) + + if _should_auto_select_container_version(instance_type, distribution): container_versions = { "tensorflow-2.3-gpu-py37": "cu110-ubuntu18.04-v3", @@ -119,8 +140,12 @@ def retrieve( "pytorch-1.6.0-gpu-py36": "cu110-ubuntu18.04", "pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3", "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04", + "pytorch-1.8.1-gpu-py3": "cu111-ubuntu18.04" } + + key = "-".join([framework, tag]) + if 
key in container_versions: tag = "-".join([tag, container_versions[key]]) From a81cfbf8458526318ffd724d16233a7505218cb8 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Wed, 15 Sep 2021 10:23:08 -0700 Subject: [PATCH 02/13] fix: add pytorch 1.8.1 for huggingface --- src/sagemaker/image_uris.py | 5 +---- tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index f02974cb48..42dee9dff1 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -17,12 +17,9 @@ import logging import os import re -import pdb from sagemaker import utils from sagemaker.spark import defaults -from sagemaker.spark import defaults - logger = logging.getLogger(__name__) @@ -106,7 +103,7 @@ def retrieve( instance_type, config.get("processors") or version_config.get("processors") ) #if container version is available in .json file, utilize that - if "container_version" in version_config.keys(): + if version_config.get("container_version"): container_version = version_config['container_version'][processor] if framework == HUGGING_FACE_FRAMEWORK: diff --git a/tests/conftest.py b/tests/conftest.py index aa07ae0926..7db583c62f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -400,7 +400,7 @@ def _huggingface_base_fm_version(huggingface_version, base_fw, fixture_prefix): if len(original_version.split(".")) == 2: base_fw_version = ".".join(base_fw_version.split(".")[:-1]) versions.append(base_fw_version) - return versions + return sorted(versions,reverse=True) def _generate_huggingface_base_fw_latest_versions( From e95e0ced5a9d831050eb6a831fe11d1592fc092c Mon Sep 17 00:00:00 2001 From: Tabassum Date: Wed, 15 Sep 2021 14:07:44 -0700 Subject: [PATCH 03/13] fix: add pytorch 1.8.1 for huggingface --- src/sagemaker/image_uris.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 42dee9dff1..5ecc5090a3 100644 
--- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -122,7 +122,6 @@ def retrieve( container_version, ) - if _should_auto_select_container_version(instance_type, distribution): container_versions = { "tensorflow-2.3-gpu-py37": "cu110-ubuntu18.04-v3", @@ -136,8 +135,7 @@ def retrieve( "pytorch-1.6-gpu-py36": "cu110-ubuntu18.04-v3", "pytorch-1.6.0-gpu-py36": "cu110-ubuntu18.04", "pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3", - "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04", - "pytorch-1.8.1-gpu-py3": "cu111-ubuntu18.04" + "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04" } From be2c875cde12b37d1976d95f6a0d69e62e6d6c29 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Wed, 15 Sep 2021 15:43:39 -0700 Subject: [PATCH 04/13] refactored code for flake --- src/sagemaker/image_uris.py | 9 +++------ tests/conftest.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 5ecc5090a3..9cf021a142 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -41,6 +41,7 @@ def retrieve( ): """Retrieves the ECR URI for the Docker image matching the given arguments. + Ideally this function should not be called directly, rather it should be called from the fit() function inside framework estimator. 
@@ -88,12 +89,8 @@ def retrieve( _validate_arg(full_base_framework_version, list(version_config.keys()), "base framework") version_config = version_config.get(full_base_framework_version) - py_version = _validate_py_version_and_set_if_needed(py_version, version_config, framework) version_config = version_config.get(py_version) or version_config - - - registry = _registry_from_region(region, version_config["registries"]) hostname = utils._botocore_resolver().construct_endpoint("ecr", region)["hostname"] @@ -102,7 +99,8 @@ def retrieve( processor = _processor( instance_type, config.get("processors") or version_config.get("processors") ) - #if container version is available in .json file, utilize that + + # if container version is available in .json file, utilize that if version_config.get("container_version"): container_version = version_config['container_version'][processor] @@ -138,7 +136,6 @@ def retrieve( "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04" } - key = "-".join([framework, tag]) if key in container_versions: diff --git a/tests/conftest.py b/tests/conftest.py index 7db583c62f..4438ac4c53 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -400,7 +400,7 @@ def _huggingface_base_fm_version(huggingface_version, base_fw, fixture_prefix): if len(original_version.split(".")) == 2: base_fw_version = ".".join(base_fw_version.split(".")[:-1]) versions.append(base_fw_version) - return sorted(versions,reverse=True) + return sorted(versions, reverse=True) def _generate_huggingface_base_fw_latest_versions( From 5ea64ccc6f728a5457ef3bd54c82d35647c61246 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Wed, 15 Sep 2021 15:45:07 -0700 Subject: [PATCH 05/13] refactored code for docstyle --- src/sagemaker/image_uris.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 9cf021a142..b1447ae60f 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -39,7 +39,6 @@ def retrieve( 
distribution=None, base_framework_version=None, ): - """Retrieves the ECR URI for the Docker image matching the given arguments. Ideally this function should not be called directly, rather it should be called from the From 6c134b985570d27cb5d5fac4c48848bb6c9ee2bc Mon Sep 17 00:00:00 2001 From: Tabassum Date: Thu, 16 Sep 2021 16:47:30 -0700 Subject: [PATCH 06/13] fix: add alias for pytorch 1.8 --- src/sagemaker/image_uri_config/huggingface.json | 1 + src/sagemaker/image_uris.py | 4 ++-- tests/unit/sagemaker/huggingface/test_estimator.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index 0d6cf52892..7596303b80 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -147,6 +147,7 @@ "version_aliases": { "pytorch1.6": "pytorch1.6.0", "pytorch1.7": "pytorch1.7.1", + "pytorch1.8": "pytorch1.8.1", "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.6.0": { diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index b1447ae60f..7d04cc7288 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -101,7 +101,7 @@ def retrieve( # if container version is available in .json file, utilize that if version_config.get("container_version"): - container_version = version_config['container_version'][processor] + container_version = version_config["container_version"][processor] if framework == HUGGING_FACE_FRAMEWORK: pt_or_tf_version = ( @@ -132,7 +132,7 @@ def retrieve( "pytorch-1.6-gpu-py36": "cu110-ubuntu18.04-v3", "pytorch-1.6.0-gpu-py36": "cu110-ubuntu18.04", "pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3", - "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04" + "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04", } key = "-".join([framework, tag]) diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py index 
10a1032d78..608f3997b1 100644 --- a/tests/unit/sagemaker/huggingface/test_estimator.py +++ b/tests/unit/sagemaker/huggingface/test_estimator.py @@ -18,7 +18,6 @@ import pytest from mock import MagicMock, Mock, patch - from sagemaker.huggingface import HuggingFace from .huggingface_utils import get_full_gpu_image_uri, GPU_INSTANCE_TYPE, REGION @@ -219,6 +218,7 @@ def test_huggingface( huggingface_training_version, huggingface_pytorch_training_version, ): + hf = HuggingFace( py_version="py36", entry_point=SCRIPT_PATH, From 015888c69e559c3f0861542274c313538083c039 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Mon, 20 Sep 2021 16:07:26 -0600 Subject: [PATCH 07/13] update to master --- .coverage.a07817ac2fd0.ant.amazon.com.80183.433182 | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .coverage.a07817ac2fd0.ant.amazon.com.80183.433182 diff --git a/.coverage.a07817ac2fd0.ant.amazon.com.80183.433182 b/.coverage.a07817ac2fd0.ant.amazon.com.80183.433182 new file mode 100644 index 0000000000..e69de29bb2 From 90097d34c8c1edf685f61a8423de7d7c4c6b7313 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Mon, 20 Sep 2021 16:12:10 -0600 Subject: [PATCH 08/13] add version alias for pytorch1.8 in huggingface.json --- src/sagemaker/image_uri_config/huggingface.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index 7596303b80..c14c4c28a0 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -291,6 +291,7 @@ "4.6.1": { "version_aliases": { "pytorch1.7": "pytorch1.7.1", + "pytorch1.8": "pytorch1.8.1", "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.7.1": { From 613fa6aef67f9614b53eb3467fcb094b3de2c671 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Mon, 20 Sep 2021 16:57:35 -0600 Subject: [PATCH 09/13] remove pytorch1.8 version alias added in PATCH 08/13 from huggingface.json --- src/sagemaker/image_uri_config/huggingface.json | 1
- 1 file changed, 1 deletion(-) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index c14c4c28a0..7596303b80 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -291,7 +291,6 @@ "4.6.1": { "version_aliases": { "pytorch1.7": "pytorch1.7.1", - "pytorch1.8": "pytorch1.8.1", "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.7.1": { From 28f5ac1b5aeef87b4bf8a5962c92ca1c0e55146f Mon Sep 17 00:00:00 2001 From: Tabassum Date: Tue, 21 Sep 2021 11:31:39 -0600 Subject: [PATCH 10/13] removed empty file, corrected grammar --- .coverage.a07817ac2fd0.ant.amazon.com.80183.433182 | 0 src/sagemaker/image_uris.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 .coverage.a07817ac2fd0.ant.amazon.com.80183.433182 diff --git a/.coverage.a07817ac2fd0.ant.amazon.com.80183.433182 b/.coverage.a07817ac2fd0.ant.amazon.com.80183.433182 deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 7d04cc7288..56397564a4 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -60,7 +60,7 @@ def retrieve( Valid values: "training", "inference", "eia". If ``accelerator_type`` is set, ``image_scope`` is ignored. container_version (str): the version of docker image. - Ideally the value of parameter is should be created inside the framework. + Ideally the value of parameter should be created inside the framework. For custom use, see the list of supported container versions: https://github.com/aws/deep-learning-containers/blob/master/available_images.md (default: None). 
From 194025a66dc9b3cf8e9620b290fe50e36430c73a Mon Sep 17 00:00:00 2001 From: Tabassum Date: Tue, 21 Sep 2021 13:36:29 -0600 Subject: [PATCH 11/13] removed empty line in test_estimator --- tests/unit/sagemaker/huggingface/test_estimator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py index 608f3997b1..64c4e5e613 100644 --- a/tests/unit/sagemaker/huggingface/test_estimator.py +++ b/tests/unit/sagemaker/huggingface/test_estimator.py @@ -218,7 +218,6 @@ def test_huggingface( huggingface_training_version, huggingface_pytorch_training_version, ): - hf = HuggingFace( py_version="py36", entry_point=SCRIPT_PATH, From 86b70beee24117d40fab1880ba0b3cdad80c7db5 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Tue, 21 Sep 2021 13:41:16 -0600 Subject: [PATCH 12/13] removed empty line in test_estimator --- tests/unit/sagemaker/huggingface/test_estimator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py index 64c4e5e613..4c21020fd2 100644 --- a/tests/unit/sagemaker/huggingface/test_estimator.py +++ b/tests/unit/sagemaker/huggingface/test_estimator.py @@ -19,7 +19,6 @@ import pytest from mock import MagicMock, Mock, patch from sagemaker.huggingface import HuggingFace - from .huggingface_utils import get_full_gpu_image_uri, GPU_INSTANCE_TYPE, REGION From 40db98eb5dcf0c5fa68d0f7815561b17f33bef48 Mon Sep 17 00:00:00 2001 From: Tabassum Date: Tue, 21 Sep 2021 13:41:49 -0600 Subject: [PATCH 13/13] restored blank lines between import groups in test_estimator --- tests/unit/sagemaker/huggingface/test_estimator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py index 4c21020fd2..10a1032d78 100644 --- a/tests/unit/sagemaker/huggingface/test_estimator.py +++
b/tests/unit/sagemaker/huggingface/test_estimator.py @@ -18,7 +18,9 @@ import pytest from mock import MagicMock, Mock, patch + from sagemaker.huggingface import HuggingFace + from .huggingface_utils import get_full_gpu_image_uri, GPU_INSTANCE_TYPE, REGION