Skip to content

fix: add pytorch 1.8.1 for huggingface #2642

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Sep 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 75 additions & 5 deletions src/sagemaker/image_uri_config/huggingface.json
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
"version_aliases": {
"pytorch1.6": "pytorch1.6.0",
"pytorch1.7": "pytorch1.7.1",
"pytorch1.8": "pytorch1.8.1",
"tensorflow2.4": "tensorflow2.4.1"
},
"pytorch1.6.0": {
Expand Down Expand Up @@ -178,7 +179,8 @@
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-pytorch-training"
"repository": "huggingface-pytorch-training",
"container_version": {"gpu":"cu110-ubuntu18.04"}
},
"pytorch1.7.1": {
"py_versions": ["py36"],
Expand Down Expand Up @@ -209,7 +211,40 @@
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-pytorch-training"
"repository": "huggingface-pytorch-training",
"container_version": {"gpu":"cu110-ubuntu18.04"}
},
"pytorch1.8.1": {
"py_versions": ["py36"],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ca-central-1": "763104351884",
"cn-north-1": "727897471807",
"cn-northwest-1": "727897471807",
"eu-central-1": "763104351884",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-gov-west-1": "442386744353",
"us-iso-east-1": "886529160074",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-pytorch-training",
"container_version": {"gpu":"cu111-ubuntu18.04"}
},
"tensorflow2.4.1": {
"py_versions": ["py37"],
Expand Down Expand Up @@ -240,7 +275,8 @@
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-tensorflow-training"
"repository": "huggingface-tensorflow-training",
"container_version": {"gpu":"cu110-ubuntu18.04"}
}
}
}
Expand Down Expand Up @@ -286,7 +322,40 @@
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-pytorch-inference"
"repository": "huggingface-pytorch-inference",
"container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" }
},
"pytorch1.8.1": {
"py_versions": ["py36"],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ca-central-1": "763104351884",
"cn-north-1": "727897471807",
"cn-northwest-1": "727897471807",
"eu-central-1": "763104351884",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-gov-west-1": "442386744353",
"us-iso-east-1": "886529160074",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-pytorch-inference",
"container_version": {"gpu":"cu111-ubuntu18.04", "cpu":"ubuntu18.04" }
},
"tensorflow2.4.1": {
"py_versions": ["py37"],
Expand Down Expand Up @@ -317,7 +386,8 @@
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "huggingface-tensorflow-inference"
"repository": "huggingface-tensorflow-inference",
"container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" }
}
}
}
Expand Down
20 changes: 18 additions & 2 deletions src/sagemaker/image_uris.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def retrieve(
):
"""Retrieves the ECR URI for the Docker image matching the given arguments.

Ideally this function should not be called directly; rather, it should be called from the
fit() function inside the framework estimator.

Args:
framework (str): The name of the framework or algorithm.
region (str): The AWS region.
Expand All @@ -56,7 +59,11 @@ def retrieve(
image_scope (str): The image type, i.e. what it is used for.
Valid values: "training", "inference", "eia". If ``accelerator_type`` is set,
``image_scope`` is ignored.
container_version (str): the version of docker image
container_version (str): The version of the Docker image.
Ideally, the value of this parameter should be created inside the framework.
For custom use, see the list of supported container versions:
https://github.com/aws/deep-learning-containers/blob/master/available_images.md
(default: None).
distribution (dict): A dictionary with information on how to run distributed training
(default: None).

Expand All @@ -66,10 +73,12 @@ def retrieve(
Raises:
ValueError: If the combination of arguments specified is not supported.
"""

config = _config_for_framework_and_scope(framework, image_scope, accelerator_type)
original_version = version
version = _validate_version_and_set_if_needed(version, config, framework)
version_config = config["versions"][_version_for_config(version, config)]

if framework == HUGGING_FACE_FRAMEWORK:
if version_config.get("version_aliases"):
full_base_framework_version = version_config["version_aliases"].get(
Expand All @@ -81,7 +90,6 @@ def retrieve(

py_version = _validate_py_version_and_set_if_needed(py_version, version_config, framework)
version_config = version_config.get(py_version) or version_config

registry = _registry_from_region(region, version_config["registries"])
hostname = utils._botocore_resolver().construct_endpoint("ecr", region)["hostname"]

Expand All @@ -91,11 +99,16 @@ def retrieve(
instance_type, config.get("processors") or version_config.get("processors")
)

# if container version is available in .json file, utilize that
if version_config.get("container_version"):
container_version = version_config["container_version"][processor]

if framework == HUGGING_FACE_FRAMEWORK:
pt_or_tf_version = (
re.compile("^(pytorch|tensorflow)(.*)$").match(base_framework_version).group(2)
)
tag_prefix = f"{pt_or_tf_version}-transformers{original_version}"

else:
tag_prefix = version_config.get("tag_prefix", version)

Expand All @@ -105,6 +118,7 @@ def retrieve(
py_version,
container_version,
)

if _should_auto_select_container_version(instance_type, distribution):
container_versions = {
"tensorflow-2.3-gpu-py37": "cu110-ubuntu18.04-v3",
Expand All @@ -120,7 +134,9 @@ def retrieve(
"pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3",
"pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04",
}

key = "-".join([framework, tag])

if key in container_versions:
tag = "-".join([tag, container_versions[key]])

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def _huggingface_base_fm_version(huggingface_version, base_fw, fixture_prefix):
if len(original_version.split(".")) == 2:
base_fw_version = ".".join(base_fw_version.split(".")[:-1])
versions.append(base_fw_version)
return versions
return sorted(versions, reverse=True)


def _generate_huggingface_base_fw_latest_versions(
Expand Down