fix: add pytorch 1.8.1 for huggingface (aws#2642)

jeniyat · Tabassum · ahsan-z-khan · EthanShouhanCheng · commit 3bb893f5e35f · 2022-01-11T13:26:16.000-08:00
* added pytorch 1.8.1 for supporting huggingface

* fix: add pytorch 1.8.1 for huggingface

* fix: add pytorch 1.8.1 for huggingface

* refactored code for flake

* refactored code for docstyle

* fix: add alias for pytorch 1.8

* update to master

* add version alias for pytorch1.8 in huggingface.json

* add version alias for pytorch1.8 in huggingface.json

* removed empty file, corrected grammar

* removed empty line in test_estimator

* removed empty line in test_estimator

* removed empty line in test_estimator

Co-authored-by: Tabassum <jeniyat@amazon.com>
Co-authored-by: Ahsan Khan <ahsan.al.zaki@gmail.com>
Co-authored-by: Shreya Pandit <shreya.pandit@pillpack.com>
diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json
@@ -147,6 +147,7 @@
                 "version_aliases": {
                     "pytorch1.6": "pytorch1.6.0",
                     "pytorch1.7": "pytorch1.7.1",
+                    "pytorch1.8": "pytorch1.8.1",
                     "tensorflow2.4": "tensorflow2.4.1"
                 },
                 "pytorch1.6.0": {
@@ -178,7 +179,8 @@
                         "us-west-1": "763104351884",
                         "us-west-2": "763104351884"
                     },
-                    "repository": "huggingface-pytorch-training"
+                    "repository": "huggingface-pytorch-training",
+                    "container_version": {"gpu":"cu110-ubuntu18.04"}
                 },
                 "pytorch1.7.1": {
                     "py_versions": ["py36"],
@@ -209,7 +211,40 @@
                         "us-west-1": "763104351884",
                         "us-west-2": "763104351884"
                     },
-                    "repository": "huggingface-pytorch-training"
+                    "repository": "huggingface-pytorch-training",
+                    "container_version": {"gpu":"cu110-ubuntu18.04"}
+                },
+                "pytorch1.8.1": {
+                    "py_versions": ["py36"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ca-central-1": "763104351884",
+                        "cn-north-1": "727897471807",
+                        "cn-northwest-1": "727897471807",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-gov-west-1": "442386744353",
+                        "us-iso-east-1": "886529160074",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-pytorch-training",
+                    "container_version": {"gpu":"cu111-ubuntu18.04"}
                 },
                 "tensorflow2.4.1": {
                     "py_versions": ["py37"],
@@ -240,7 +275,8 @@
                         "us-west-1": "763104351884",
                         "us-west-2": "763104351884"
                     },
-                    "repository": "huggingface-tensorflow-training"
+                    "repository": "huggingface-tensorflow-training",
+                    "container_version": {"gpu":"cu110-ubuntu18.04"}
                 }
             }
         }
@@ -286,7 +322,40 @@
                         "us-west-1": "763104351884",
                         "us-west-2": "763104351884"
                     },
-                    "repository": "huggingface-pytorch-inference"
+                    "repository": "huggingface-pytorch-inference",
+                    "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" }
+                },
+                "pytorch1.8.1": {
+                    "py_versions": ["py36"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ca-central-1": "763104351884",
+                        "cn-north-1": "727897471807",
+                        "cn-northwest-1": "727897471807",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-gov-west-1": "442386744353",
+                        "us-iso-east-1": "886529160074",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-pytorch-inference",
+                    "container_version": {"gpu":"cu111-ubuntu18.04", "cpu":"ubuntu18.04" }
                 },
                 "tensorflow2.4.1": {
                     "py_versions": ["py37"],
@@ -317,7 +386,8 @@
                         "us-west-1": "763104351884",
                         "us-west-2": "763104351884"
                     },
-                    "repository": "huggingface-tensorflow-inference"
+                    "repository": "huggingface-tensorflow-inference",
+                    "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" }
                 }
             }
         }
diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py
@@ -41,6 +41,9 @@ def retrieve(
 ):
     """Retrieves the ECR URI for the Docker image matching the given arguments.
 
+    Ideally this function should not be called directly, rather it should be called from the
+    fit() function inside framework estimator.
+
     Args:
         framework (str): The name of the framework or algorithm.
         region (str): The AWS region.
@@ -56,7 +59,11 @@ def retrieve(
         image_scope (str): The image type, i.e. what it is used for.
             Valid values: "training", "inference", "eia". If ``accelerator_type`` is set,
             ``image_scope`` is ignored.
-        container_version (str): the version of docker image
+        container_version (str): the version of docker image.
+            Ideally the value of parameter should be created inside the framework.
+            For custom use, see the list of supported container versions:
+            https://github.com/aws/deep-learning-containers/blob/master/available_images.md
+            (default: None).
         distribution (dict): A dictionary with information on how to run distributed training
             (default: None).
 
@@ -66,10 +73,12 @@ def retrieve(
     Raises:
         ValueError: If the combination of arguments specified is not supported.
     """
+
     config = _config_for_framework_and_scope(framework, image_scope, accelerator_type)
     original_version = version
     version = _validate_version_and_set_if_needed(version, config, framework)
     version_config = config["versions"][_version_for_config(version, config)]
+
     if framework == HUGGING_FACE_FRAMEWORK:
         if version_config.get("version_aliases"):
             full_base_framework_version = version_config["version_aliases"].get(
@@ -81,7 +90,6 @@ def retrieve(
 
     py_version = _validate_py_version_and_set_if_needed(py_version, version_config, framework)
     version_config = version_config.get(py_version) or version_config
-
     registry = _registry_from_region(region, version_config["registries"])
     hostname = utils._botocore_resolver().construct_endpoint("ecr", region)["hostname"]
 
@@ -91,11 +99,16 @@ def retrieve(
         instance_type, config.get("processors") or version_config.get("processors")
     )
 
+    # if container version is available in .json file, utilize that
+    if version_config.get("container_version"):
+        container_version = version_config["container_version"][processor]
+
     if framework == HUGGING_FACE_FRAMEWORK:
         pt_or_tf_version = (
             re.compile("^(pytorch|tensorflow)(.*)$").match(base_framework_version).group(2)
         )
         tag_prefix = f"{pt_or_tf_version}-transformers{original_version}"
+
     else:
         tag_prefix = version_config.get("tag_prefix", version)
 
@@ -105,6 +118,7 @@ def retrieve(
         py_version,
         container_version,
     )
+
     if _should_auto_select_container_version(instance_type, distribution):
         container_versions = {
             "tensorflow-2.3-gpu-py37": "cu110-ubuntu18.04-v3",
@@ -120,7 +134,9 @@ def retrieve(
             "pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3",
             "pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04",
         }
+
         key = "-".join([framework, tag])
+
         if key in container_versions:
             tag = "-".join([tag, container_versions[key]])
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -400,7 +400,7 @@ def _huggingface_base_fm_version(huggingface_version, base_fw, fixture_prefix):
             if len(original_version.split(".")) == 2:
                 base_fw_version = ".".join(base_fw_version.split(".")[:-1])
             versions.append(base_fw_version)
-    return versions
+    return sorted(versions, reverse=True)
 
 
 def _generate_huggingface_base_fw_latest_versions(