From 8e29874e7658381bf6f6c64b1022c6febb8baf73 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Thu, 4 Nov 2021 09:24:55 +0100 Subject: [PATCH 1/6] added new HuggingFace DLCs --- .../image_uri_config/huggingface.json | 150 +++++++++++++++++- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index ba84436a90..3a0f292440 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -6,7 +6,9 @@ "4.5": "4.5.0", "4.6": "4.6.1", "4.10": "4.10.2", - "4.11": "4.11.0" + "4.11": "4.11.0", + "4.12": "4.12.3" + }, "versions": { "4.4.2": { @@ -416,8 +418,7 @@ "repository": "huggingface-tensorflow-training", "container_version": {"gpu": "cu112-ubuntu18.04"} } - } - , + }, "4.11.0": { "version_aliases": { "pytorch1.9": "pytorch1.9.0", @@ -487,6 +488,76 @@ "repository": "huggingface-tensorflow-training", "container_version": {"gpu": "cu112-ubuntu18.04"} } + }, + "4.12.3": { + "version_aliases": { + "pytorch1.9": "pytorch1.9.1", + "tensorflow2.5": "tensorflow2.5.1" + }, + "pytorch1.9.1": { + "py_versions": ["py38"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-pytorch-training", + "container_version": {"gpu": "cu111-ubuntu20.04"} + }, + "tensorflow2.5.1": { + "py_versions": ["py37"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-tensorflow-training", + "container_version": {"gpu": "cu112-ubuntu18.04"} + } } } }, @@ -496,7 +567,8 @@ "version_aliases": { "4.6": "4.6.1", "4.10": "4.10.2", - "4.11": "4.11.0" + "4.11": "4.11.0", + "4.12": "4.12.3" }, "versions": { "4.6.1": { @@ -806,6 +878,76 @@ "repository": "huggingface-tensorflow-inference", "container_version": {"gpu": "cu112-ubuntu18.04", "cpu": "ubuntu18.04" } } + }, + "4.12.3": { + "version_aliases": { + "pytorch1.9": "pytorch1.9.1", + "tensorflow2.5": "tensorflow2.5.1" + }, + "pytorch1.9.1": { + "py_versions": ["py38"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-pytorch-inference", + "container_version": {"gpu": "cu111-ubuntu20.04", "cpu": "ubuntu20.04" } + }, + "tensorflow2.5.1": { + "py_versions": ["py37"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-tensorflow-inference", + "container_version": {"gpu": "cu112-ubuntu18.04", "cpu": "ubuntu18.04" } + } } } } From 13138ea58b03de5b87eba6eab6f5606082990d6c Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Thu, 4 Nov 2021 09:27:06 +0100 Subject: [PATCH 2/6] removed break --- src/sagemaker/image_uri_config/huggingface.json | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index 3a0f292440..039f0d54d8 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -8,7 +8,6 @@ "4.10": "4.10.2", "4.11": "4.11.0", "4.12": "4.12.3" - }, "versions": { "4.4.2": { From cb2d374d83e58cdfd932e53d498c4256cd82a357 Mon Sep 17 00:00:00 2001 From: philschmid Date: Tue, 9 Nov 2021 08:30:01 +0100 Subject: [PATCH 3/6] saved with correct format --- tests/data/huggingface/run_tf.py | 12 ++++-------- tests/integ/test_huggingface.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/data/huggingface/run_tf.py b/tests/data/huggingface/run_tf.py index 2af0eb04b0..e96b5c6597 100644 --- a/tests/data/huggingface/run_tf.py +++ b/tests/data/huggingface/run_tf.py @@ -57,10 +57,8 @@ ) train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - train_features = { - x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) - for x in ["input_ids", "attention_mask"] - } + train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]} + tf_train_dataset = tf.data.Dataset.from_tensor_slices( (train_features, train_dataset["label"]) ).batch(args.per_device_train_batch_size) @@ -71,10 +69,8 @@ ) test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - test_features = { - x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) - for x in ["input_ids", "attention_mask"] - } + test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]} + tf_test_dataset = tf.data.Dataset.from_tensor_slices( (test_features, test_dataset["label"]) ).batch(args.per_device_eval_batch_size) diff --git a/tests/integ/test_huggingface.py b/tests/integ/test_huggingface.py index e8bce98cf6..607a20befd 100644 --- a/tests/integ/test_huggingface.py +++ b/tests/integ/test_huggingface.py @@ -158,7 +158,7 @@ def test_huggingface_inference( huggingface_pytorch_latest_inference_py_version, ): env = { - "HF_MODEL_ID": "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english", + "HF_MODEL_ID": "philschmid/tiny-distilbert-classification", "HF_TASK": "text-classification", } endpoint_name = unique_name_from_base("test-hf-inference") From 856a94998ae24f988485bf2fe1ad760a227d7f29 Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 11 Nov 2021 09:25:33 +0100 Subject: [PATCH 4/6] added condition for test to also work with lower transformers version --- tests/data/huggingface/run_tf.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/data/huggingface/run_tf.py b/tests/data/huggingface/run_tf.py index e96b5c6597..db2335f9ef 100644 --- a/tests/data/huggingface/run_tf.py +++ b/tests/data/huggingface/run_tf.py @@ -7,6 +7,7 @@ from datasets import load_dataset from transformers import AutoTokenizer, TFAutoModelForSequenceClassification +import transformers if __name__ == "__main__": @@ -57,8 +58,15 @@ ) train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]} - + if transformers.__version__ > "4.12.0": + train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]} + else: + train_features = { + x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) + for x in ["input_ids", "attention_mask"] + } + + tf_train_dataset = tf.data.Dataset.from_tensor_slices( (train_features, train_dataset["label"]) ).batch(args.per_device_train_batch_size) @@ -69,7 +77,14 @@ ) test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]} + if transformers.__version__ > "4.12.0": + test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]} + else: + test_features = { + x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) + for x in ["input_ids", "attention_mask"] + } + tf_test_dataset = tf.data.Dataset.from_tensor_slices( (test_features, test_dataset["label"]) From a5c6012517da7de493651d4dbee3567e647fc0be Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 11 Nov 2021 09:52:27 +0100 Subject: [PATCH 5/6] make black happy --- tests/data/huggingface/run_tf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/data/huggingface/run_tf.py b/tests/data/huggingface/run_tf.py index db2335f9ef..7370b11a76 100644 --- a/tests/data/huggingface/run_tf.py +++ b/tests/data/huggingface/run_tf.py @@ -65,8 +65,7 @@ x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) for x in ["input_ids", "attention_mask"] } - - + tf_train_dataset = tf.data.Dataset.from_tensor_slices( (train_features, train_dataset["label"]) ).batch(args.per_device_train_batch_size) @@ -85,7 +84,6 @@ for x in ["input_ids", "attention_mask"] } - tf_test_dataset = tf.data.Dataset.from_tensor_slices( (test_features, test_dataset["label"]) ).batch(args.per_device_eval_batch_size) From 100749891b4d877aeb7e12f3027b14458948f681 Mon Sep 17 00:00:00 2001 From: Navin Soni Date: Fri, 12 Nov 2021 03:12:19 +0000 Subject: [PATCH 6/6] refactor feature generation --- tests/data/huggingface/run_tf.py | 35 +++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tests/data/huggingface/run_tf.py b/tests/data/huggingface/run_tf.py index 7370b11a76..811c98053c 100644 --- a/tests/data/huggingface/run_tf.py +++ b/tests/data/huggingface/run_tf.py @@ -4,10 +4,21 @@ import time import tensorflow as tf +import transformers from datasets import load_dataset - from transformers import AutoTokenizer, TFAutoModelForSequenceClassification -import transformers + + +def _get_dataset_features(dataset, tokenizer, columns=[]): + if transformers.__version__ > "4.12.0": + features = {x: dataset[x] for x in columns} + else: + features = { + x: dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) + for x in columns + } + + return features if __name__ == "__main__": @@ -58,13 +69,9 @@ ) train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - if transformers.__version__ > "4.12.0": - train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]} - else: - train_features = { - x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) - for x in ["input_ids", "attention_mask"] - } + train_features = _get_dataset_features( + train_dataset, tokenizer, columns=["input_ids", "attention_mask"] + ) tf_train_dataset = tf.data.Dataset.from_tensor_slices( (train_features, train_dataset["label"]) @@ -76,13 +83,9 @@ ) test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) - if transformers.__version__ > "4.12.0": - test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]} - else: - test_features = { - x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) - for x in ["input_ids", "attention_mask"] - } + test_features = _get_dataset_features( + test_dataset, tokenizer, columns=["input_ids", "attention_mask"] + ) tf_test_dataset = tf.data.Dataset.from_tensor_slices( (test_features, test_dataset["label"])