change: Enhance model builder selection logic to include model size #4429

Merged: 20 commits (Mar 6, 2024)
1 change: 1 addition & 0 deletions doc/requirements.txt
@@ -4,3 +4,4 @@ docutils==0.15.2
packaging==20.9
jinja2==3.1.3
schema==0.7.5
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/huggingface_requirements.txt
@@ -0,0 +1 @@
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/test_requirements.txt
@@ -39,3 +39,4 @@ tritonclient[http]<2.37.0
onnx==1.14.1
# tf2onnx==1.15.1
nbformat>=5.9,<6
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions setup.py
@@ -79,6 +79,7 @@ def read_requirements(filename):
"feature-processor": read_requirements(
"requirements/extras/feature-processor_requirements.txt"
),
"huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
}
# Meta dependency groups
extras["all"] = [item for group in extras.values() for item in group]
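The new "huggingface" extras group makes the accelerate dependency opt-in: installing it, e.g. pip install "sagemaker[huggingface]", pulls in accelerate>=0.24.1,<=0.27.0, which the model builder changes below use to estimate model memory size.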
78 changes: 77 additions & 1 deletion src/sagemaker/serve/builder/model_builder.py
@@ -20,9 +20,11 @@

from pathlib import Path

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.base_predictor import PredictorBase
from sagemaker.djl_inference import defaults
from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
from sagemaker.deserializers import JSONDeserializer, TorchTensorDeserializer
from sagemaker.serve.builder.schema_builder import SchemaBuilder
@@ -41,6 +43,7 @@
from sagemaker.serve.utils import task
from sagemaker.serve.utils.exceptions import TaskNotFoundException
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
from sagemaker.serve.detector.image_detector import (
auto_detect_container,
_detect_framework_and_version,
@@ -67,6 +70,9 @@
ModelServer.DJL_SERVING,
}

MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer


# pylint: disable=attribute-defined-outside-init
@dataclass
@@ -569,7 +575,7 @@ def wrapper(*args, **kwargs):
# It supports two modes of deployment
# 1/ SageMaker Endpoint
# 2/ Local launch with container
def build(
def build( # pylint: disable=R0911
self,
mode: Type[Mode] = None,
role_arn: str = None,
Expand Down Expand Up @@ -625,6 +631,13 @@ def build(

if model_task == "text-generation": # pylint: disable=R1705
return self._build_for_tgi()
elif self._can_fit_on_single_gpu():
return self._build_for_transformers()
elif (
self.model in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES
or self.model in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES
):
return self._build_for_djl()
else:
return self._build_for_transformers()
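
The new selection order in build() is: text-generation models go to TGI; any model whose estimated size fits on a single GPU of the target instance goes to the transformers server; DeepSpeed- or FasterTransformer-recommended architectures go to DJL; everything else falls back to transformers. A minimal sketch of driving this path, with an illustrative model id, instance type, and role ARN (none of these are from this PR):

from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder

sample_input = {"inputs": "Hello I'm a [MASK] model."}
sample_output = [{"score": 0.11, "token_str": "fashion", "sequence": "hello i'm a fashion model."}]

# bert-base-uncased is not a text-generation model, so build() estimates its
# size via accelerate and, when it fits on one GPU of the target instance,
# selects the transformers serving path.
builder = ModelBuilder(
    model="bert-base-uncased",
    schema_builder=SchemaBuilder(sample_input, sample_output),
    instance_type="ml.g5.2xlarge",  # illustrative GPU instance
)
model = builder.build(role_arn="arn:aws:iam::111122223333:role/SageMakerRole")  # illustrative ARN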

@@ -696,3 +709,66 @@ def _schema_builder_init(self, model_task: str):
self.schema_builder = SchemaBuilder(sample_inputs, sample_outputs)
except ValueError:
raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")

def _total_inference_model_size_mib(self):
"""Calculates the model size from HF accelerate

This function gets the model size from accelerate. It also adds a
padding and converts to size MiB. When performing inference, expect
to add up to an additional 20% to the given model size as found by EleutherAI.
"""
dtypes = self.env_vars.get("dtypes", "float32")
parser = estimate_command_parser()
args = parser.parse_args([self.model, "--dtypes", dtypes])

output = gather_data(
args
) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"

if output is None:
raise ValueError(f"Could not get Model size for {self.model}")

total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
logger.info("Total memory size MIB: %s", total_memory_size_mib)
return total_memory_size_mib
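
For reference, a standalone sketch of the same estimation outside ModelBuilder; the model id is illustrative, and gather_data's row layout is as the inline comment above describes (it fetches the model config, so it needs network access):

from accelerate.commands.estimate import estimate_command_parser, gather_data

MIB_CONVERSION_FACTOR = 1 / 2**20  # == 0.00000095367431640625 (bytes -> MiB)
MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% inference buffer

parser = estimate_command_parser()
args = parser.parse_args(["bert-base-uncased", "--dtypes", "float32"])  # illustrative model id
output = gather_data(args)  # rows of (dtype, largest layer, total size in bytes, training size w/ Adam)

total_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
# e.g. a ~440 MB fp32 checkpoint: 1.2 * 440e6 / 2**20 ~= 503 MiB
print(f"Buffered inference footprint: {total_mib:.0f} MiB")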

def _can_fit_on_single_gpu(self) -> bool:
"""Check whether the model can fit on a single GPU

Returns True if the estimated inference model size is at most the memory
of a single GPU on the target instance, else False.
"""
try:
single_gpu_size_mib = self._try_fetch_gpu_info()
total_size_mib = self._total_inference_model_size_mib()
if total_size_mib <= single_gpu_size_mib:
logger.info(
"Total inference model size MiB %s, single GPU size for instance MiB %s",
total_size_mib,
single_gpu_size_mib,
)
return True
return False
except ValueError:
logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
return False

def _try_fetch_gpu_info(self):
"""Get GPU info

This function gets the GPU info or fallback to set the size of a single GPU
"""
try:
gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
return gpu_info[1] / gpu_info[0]
except ValueError:
pass
try:
gpu_fallback = _get_gpu_info_fallback(
self.instance_type, self.sagemaker_session.boto_region_name
)
logger.info("GPU fallback picked up %s", gpu_fallback)
return gpu_fallback[1] / gpu_fallback[0]
except ValueError:
raise ValueError(
f"Unable to determine single GPU size for instance: [{self.instance_type}]"
)
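
A quick worked example of the per-GPU division above, with assumed figures (an ml.g5.12xlarge with 4 GPUs and 96 GiB of total GPU memory):

gpu_info = (4, 98304)  # (GPU count, total GPU memory in MiB) -- assumed figures
single_gpu_size_mib = gpu_info[1] / gpu_info[0]  # 24576.0 MiB per GPU

Any buffered model estimate at or under 24576 MiB would then pass the _can_fit_on_single_gpu check.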
2 changes: 1 addition & 1 deletion src/sagemaker/serve/schema/task.json
@@ -2,7 +2,7 @@
"fill-mask": {
"sample_inputs": {
"properties": {
"inputs": "Paris is the <mask> of France.",
"inputs": "Paris is the [MASK] of France.",
"parameters": {}
}
},
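The task.json fix matters because BERT-style tokenizers use [MASK] as their mask token, while <mask> is the RoBERTa-style spelling, so the old sample input would not tokenize as a mask for BERT models. A quick check with transformers (model id illustrative):

from transformers import pipeline

fill = pipeline("fill-mask", model="bert-base-uncased")
print(fill("Paris is the [MASK] of France.")[0]["token_str"])  # expected top token: "capital"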
184 changes: 184 additions & 0 deletions tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
@@ -0,0 +1,184 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode
import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)
from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)

model_id = "bert-base-uncased"

sample_input = {"inputs": "Hello I'm a [MASK] model."}

sample_output = [
{
"score": 0.10731109976768494,
"token": 4827,
"token_str": "fashion",
"sequence": "hello i'm a fashion model.",
},
{
"score": 0.08774465322494507,
"token": 2535,
"token_str": "role",
"sequence": "hello i'm a role model.",
},
{
"score": 0.05338414013385773,
"token": 2047,
"token_str": "new",
"sequence": "hello i'm a new model.",
},
{
"score": 0.04667224362492561,
"token": 3565,
"token_str": "super",
"sequence": "hello i'm a super model.",
},
{
"score": 0.027096163481473923,
"token": 2986,
"token_str": "fine",
"sequence": "hello i'm a fine model.",
},
]


@pytest.fixture
def model_input():
return {"inputs": "The man worked as a [MASK]."}


@pytest.fixture
def model_builder_model_schema_builder():
return ModelBuilder(
model_path=HF_DIR, model=model_id, schema_builder=SchemaBuilder(sample_input, sample_output)
)


@pytest.fixture
def model_builder(request):
return request.getfixturevalue(request.param)


@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_single_GPU(
sagemaker_session, model_builder, model_input, **kwargs
):
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
caught_ex = None
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
instance_type=kwargs["instance_type"],
initial_instance_count=1,
)
logger.info("Endpoint successfully deployed.")
prediction = predictor.predict(model_input)
assert prediction is not None

endpoint_name = predictor.endpoint_name
sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
"EndpointConfigName"
]
actual_instance_type = sagemaker_client.describe_endpoint_config(
EndpointConfigName=endpoint_config_name
)["ProductionVariants"][0]["InstanceType"]
assert kwargs["instance_type"] == actual_instance_type
except Exception as e:
caught_ex = e
finally:
cleanup_model_resources(
sagemaker_session=model_builder.sagemaker_session,
model_name=model.name,
endpoint_name=model.endpoint_name,
)
if caught_ex:
logger.exception(caught_ex)
assert (
False
), f"Exception {caught_ex} was thrown when running model builder single GPU test"


@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_multi_GPU(
sagemaker_session, model_builder, model_input, **kwargs
):
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
caught_ex = None
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
instance_type=kwargs["instance_type"],
initial_instance_count=1,
)
logger.info("Endpoint successfully deployed.")
prediction = predictor.predict(model_input)
assert prediction is not None

endpoint_name = predictor.endpoint_name
sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
"EndpointConfigName"
]
actual_instance_type = sagemaker_client.describe_endpoint_config(
EndpointConfigName=endpoint_config_name
)["ProductionVariants"][0]["InstanceType"]
assert kwargs["instance_type"] == actual_instance_type
except Exception as e:
caught_ex = e
finally:
cleanup_model_resources(
sagemaker_session=model_builder.sagemaker_session,
model_name=model.name,
endpoint_name=model.endpoint_name,
)
if caught_ex:
logger.exception(caught_ex)
assert (
False
), f"Exception {caught_ex} was thrown when running model builder multi GPU test"
20 changes: 13 additions & 7 deletions tests/integ/sagemaker/serve/test_serve_transformers.py
@@ -15,15 +15,15 @@
import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode

import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)

from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)
@@ -67,7 +67,7 @@


@pytest.fixture
def input():
def model_input():
return {"inputs": "The man worked as a [MASK]."}


@@ -87,11 +87,14 @@ def model_builder(request):

@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
reason="Testing feature",
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_pytorch_transformers_sagemaker_endpoint(
sagemaker_session, model_builder, gpu_instance_type, input
sagemaker_session, model_builder, model_input, **kwargs
):
logger.info("Running in SAGEMAKER_ENDPOINT mode...")
caught_ex = None
@@ -106,9 +109,12 @@ def test_pytorch_transformers_sagemaker_endpoint(
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
predictor = model.deploy(instance_type=gpu_instance_type, initial_instance_count=1)
predictor = model.deploy(
instance_type=kwargs["instance_type"], initial_instance_count=2
)
logger.info("Endpoint successfully deployed.")
predictor.predict(input)
predictor.predict(model_input)
assert predictor is not None
except Exception as e:
caught_ex = e
finally: