|
19 | 19 |
|
# LMI (Large Model Inference) container versions exercised by the LMI tests.
LMI_VERSIONS = ["0.24.0"]

# Device type ("gpu" for CUDA instances, "inf" for AWS Inferentia/Neuron)
# -> HuggingFace TGI / Optimum version -> full image-tag suffix of the
# huggingface-pytorch-tgi-inference ECR repository.
HF_VERSIONS_MAPPING = {
    "gpu": {
        "0.6.0": "2.0.0-tgi0.6.0-gpu-py39-cu118-ubuntu20.04",
        "0.8.2": "2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04",
        "0.9.3": "2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04",
        "1.0.3": "2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04",
        "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04",
        "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04",
        "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04",
        "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04",
    },
    "inf": {
        "0.0.16": "2.1.1-optimum0.0.16-neuronx-py310-ubuntu22.04",
    },
}
|
31 | 36 |
|
32 | 37 |
|
@pytest.mark.parametrize(
    "load_config", ["huggingface-llm.json", "huggingface-llm-neuronx.json"], indirect=True
)
def test_huggingface_uris(load_config):
    """Check get_huggingface_llm_image_uri for every version/region in the config.

    Runs once per parametrized config file: the GPU (TGI) config and the
    Neuron (optimum-neuronx) config. For each version listed in the config
    and each region registry, the URI produced by the SDK must match the
    expected URI built from HF_VERSIONS_MAPPING.
    """
    versions = load_config["inference"]["versions"]
    # The first listed processor ("gpu" or "inf") selects both the backend
    # name passed to the SDK and the sub-mapping of expected image tags.
    device = load_config["inference"]["processors"][0]
    backend = "huggingface-neuronx" if device == "inf" else "huggingface"
    for version, version_config in versions.items():
        # .items() avoids re-indexing load_config and the per-region
        # ACCOUNTS[region] lookup the original did inside the loop.
        for region, account in version_config["registries"].items():
            uri = get_huggingface_llm_image_uri(backend, region=region, version=version)
            expected = expected_uris.huggingface_llm_framework_uri(
                "huggingface-pytorch-tgi-inference",
                account,
                version,
                HF_VERSIONS_MAPPING[device][version],
                region=region,
            )
            assert expected == uri
|
|
0 commit comments