change: Enhance model builder selection logic to include model size #4429

Merged: 20 commits (Mar 6, 2024)
1 change: 1 addition & 0 deletions doc/requirements.txt
@@ -4,3 +4,4 @@ docutils==0.15.2
packaging==20.9
jinja2==3.1.3
schema==0.7.5
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/huggingface_requirements.txt
@@ -0,0 +1 @@
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/test_requirements.txt
@@ -39,3 +39,4 @@ tritonclient[http]<2.37.0
onnx==1.14.1
# tf2onnx==1.15.1
nbformat>=5.9,<6
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions setup.py
@@ -79,6 +79,7 @@ def read_requirements(filename):
"feature-processor": read_requirements(
"requirements/extras/feature-processor_requirements.txt"
),
"huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
}
# Meta dependency groups
extras["all"] = [item for group in extras.values() for item in group]
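The new "huggingface" extras group makes the accelerate dependency opt-in: installing it, e.g. pip install "sagemaker[huggingface]", pulls in accelerate>=0.24.1,<=0.27.0, which the model builder changes below use to estimate model memory size.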
78 changes: 77 additions & 1 deletion src/sagemaker/serve/builder/model_builder.py
@@ -20,9 +20,11 @@

from pathlib import Path

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.base_predictor import PredictorBase
from sagemaker.djl_inference import defaults
from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
from sagemaker.deserializers import JSONDeserializer, TorchTensorDeserializer
from sagemaker.serve.builder.schema_builder import SchemaBuilder
@@ -41,6 +43,7 @@
from sagemaker.serve.utils import task
from sagemaker.serve.utils.exceptions import TaskNotFoundException
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
from sagemaker.serve.detector.image_detector import (
auto_detect_container,
_detect_framework_and_version,
@@ -67,6 +70,9 @@
ModelServer.DJL_SERVING,
}

MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer


# pylint: disable=attribute-defined-outside-init
@dataclass
@@ -569,7 +575,7 @@ def wrapper(*args, **kwargs):
# It supports two modes of deployment
# 1/ SageMaker Endpoint
# 2/ Local launch with container
def build(
def build( # pylint: disable=R0911
self,
mode: Type[Mode] = None,
role_arn: str = None,
Expand Down Expand Up @@ -625,6 +631,13 @@ def build(

if model_task == "text-generation": # pylint: disable=R1705
return self._build_for_tgi()
elif self._can_fit_on_single_gpu():
return self._build_for_transformers()
elif (
self.model in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES
or self.model in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES
):
return self._build_for_djl()
else:
return self._build_for_transformers()
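
The new selection order in build() is: text-generation models go to TGI; any model whose estimated size fits on a single GPU of the target instance goes to the transformers server; DeepSpeed- or FasterTransformer-recommended architectures go to DJL; everything else falls back to transformers. A minimal sketch of driving this path, with an illustrative model id, instance type, and role ARN (none of these are from this PR):

from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder

sample_input = {"inputs": "Hello I'm a [MASK] model."}
sample_output = [{"score": 0.11, "token_str": "fashion", "sequence": "hello i'm a fashion model."}]

# bert-base-uncased is not a text-generation model, so build() estimates its
# size via accelerate and, when it fits on one GPU of the target instance,
# selects the transformers serving path.
builder = ModelBuilder(
    model="bert-base-uncased",
    schema_builder=SchemaBuilder(sample_input, sample_output),
    instance_type="ml.g5.2xlarge",  # illustrative GPU instance
)
model = builder.build(role_arn="arn:aws:iam::111122223333:role/SageMakerRole")  # illustrative ARN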

@@ -696,3 +709,66 @@ def _schema_builder_init(self, model_task: str):
self.schema_builder = SchemaBuilder(sample_inputs, sample_outputs)
except ValueError:
raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")

def _total_inference_model_size_mib(self):
"""Calculates the model size from HF accelerate

This function gets the model size from accelerate. It also adds a
padding and converts to size MiB. When performing inference, expect
to add up to an additional 20% to the given model size as found by EleutherAI.
"""
dtypes = self.env_vars.get("dtypes", "float32")
parser = estimate_command_parser()
args = parser.parse_args([self.model, "--dtypes", dtypes])

output = gather_data(
args
) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"

if output is None:
raise ValueError(f"Could not get Model size for {self.model}")

total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
logger.info("Total memory size MIB: %s", total_memory_size_mib)
return total_memory_size_mib
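
For reference, a standalone sketch of the same estimation outside ModelBuilder; the model id is illustrative, and gather_data's row layout is as the inline comment above describes (it fetches the model config, so it needs network access):

from accelerate.commands.estimate import estimate_command_parser, gather_data

MIB_CONVERSION_FACTOR = 1 / 2**20  # == 0.00000095367431640625 (bytes -> MiB)
MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% inference buffer

parser = estimate_command_parser()
args = parser.parse_args(["bert-base-uncased", "--dtypes", "float32"])  # illustrative model id
output = gather_data(args)  # rows of (dtype, largest layer, total size in bytes, training size w/ Adam)

total_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
# e.g. a ~440 MB fp32 checkpoint: 1.2 * 440e6 / 2**20 ~= 503 MiB
print(f"Buffered inference footprint: {total_mib:.0f} MiB")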

def _can_fit_on_single_gpu(self) -> bool:
"""Check whether the model can fit on a single GPU

Returns True if the estimated inference model size is at most the memory
of a single GPU on the target instance, else False.
"""
try:
single_gpu_size_mib = self._try_fetch_gpu_info()
total_size_mib = self._total_inference_model_size_mib()
if total_size_mib <= single_gpu_size_mib:
logger.info(
"Total inference model size MiB %s, single GPU size for instance MiB %s",
total_size_mib,
single_gpu_size_mib,
)
return True
return False
except ValueError:
logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
return False

def _try_fetch_gpu_info(self):
"""Get GPU info

This function gets the GPU info or fallback to set the size of a single GPU
"""
try:
gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
return gpu_info[1] / gpu_info[0]
except ValueError:
pass
try:
gpu_fallback = _get_gpu_info_fallback(
self.instance_type, self.sagemaker_session.boto_region_name
)
logger.info("GPU fallback picked up %s", gpu_fallback)
return gpu_fallback[1] / gpu_fallback[0]
except ValueError:
raise ValueError(
f"Unable to determine single GPU size for instance: [{self.instance_type}]"
)
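
A quick worked example of the per-GPU division above, with assumed figures (an ml.g5.12xlarge with 4 GPUs and 96 GiB of total GPU memory):

gpu_info = (4, 98304)  # (GPU count, total GPU memory in MiB) -- assumed figures
single_gpu_size_mib = gpu_info[1] / gpu_info[0]  # 24576.0 MiB per GPU

Any buffered model estimate at or under 24576 MiB would then pass the _can_fit_on_single_gpu check.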
2 changes: 1 addition & 1 deletion src/sagemaker/serve/schema/task.json
@@ -2,7 +2,7 @@
"fill-mask": {
"sample_inputs": {
"properties": {
"inputs": "Paris is the <mask> of France.",
"inputs": "Paris is the [MASK] of France.",
"parameters": {}
}
},
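The task.json fix matters because BERT-style tokenizers use [MASK] as their mask token, while <mask> is the RoBERTa-style spelling, so the old sample input would not tokenize as a mask for BERT models. A quick check with transformers (model id illustrative):

from transformers import pipeline

fill = pipeline("fill-mask", model="bert-base-uncased")
print(fill("Paris is the [MASK] of France.")[0]["token_str"])  # expected top token: "capital"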
184 changes: 184 additions & 0 deletions tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
@@ -0,0 +1,184 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode
import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)
from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)

model_id = "bert-base-uncased"

sample_input = {"inputs": "Hello I'm a [MASK] model."}

sample_output = [
{
"score": 0.10731109976768494,
"token": 4827,
"token_str": "fashion",
"sequence": "hello i'm a fashion model.",
},
{
"score": 0.08774465322494507,
"token": 2535,
"token_str": "role",
"sequence": "hello i'm a role model.",
},
{
"score": 0.05338414013385773,
"token": 2047,
"token_str": "new",
"sequence": "hello i'm a new model.",
},
{
"score": 0.04667224362492561,
"token": 3565,
"token_str": "super",
"sequence": "hello i'm a super model.",
},
{
"score": 0.027096163481473923,
"token": 2986,
"token_str": "fine",
"sequence": "hello i'm a fine model.",
},
]


@pytest.fixture
def model_input():
return {"inputs": "The man worked as a [MASK]."}


@pytest.fixture
def model_builder_model_schema_builder():
return ModelBuilder(
model_path=HF_DIR, model=model_id, schema_builder=SchemaBuilder(sample_input, sample_output)
)


@pytest.fixture
def model_builder(request):
return request.getfixturevalue(request.param)


@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_single_GPU(
sagemaker_session, model_builder, model_input, **kwargs
):
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
caught_ex = None
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
instance_type=kwargs["instance_type"],
initial_instance_count=1,
)
logger.info("Endpoint successfully deployed.")
prediction = predictor.predict(model_input)
assert prediction is not None

endpoint_name = predictor.endpoint_name
sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
"EndpointConfigName"
]
actual_instance_type = sagemaker_client.describe_endpoint_config(
EndpointConfigName=endpoint_config_name
)["ProductionVariants"][0]["InstanceType"]
assert kwargs["instance_type"] == actual_instance_type
except Exception as e:
caught_ex = e
finally:
cleanup_model_resources(
sagemaker_session=model_builder.sagemaker_session,
model_name=model.name,
endpoint_name=model.endpoint_name,
)
if caught_ex:
logger.exception(caught_ex)
assert (
False
), f"Exception {caught_ex} was thrown when running model builder single GPU test"


@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_multi_GPU(
sagemaker_session, model_builder, model_input, **kwargs
):
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
caught_ex = None
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
instance_type=kwargs["instance_type"],
initial_instance_count=1,
)
logger.info("Endpoint successfully deployed.")
prediction = predictor.predict(model_input)
assert prediction is not None

endpoint_name = predictor.endpoint_name
sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
"EndpointConfigName"
]
actual_instance_type = sagemaker_client.describe_endpoint_config(
EndpointConfigName=endpoint_config_name
)["ProductionVariants"][0]["InstanceType"]
assert kwargs["instance_type"] == actual_instance_type
except Exception as e:
caught_ex = e
finally:
cleanup_model_resources(
sagemaker_session=model_builder.sagemaker_session,
model_name=model.name,
endpoint_name=model.endpoint_name,
)
if caught_ex:
logger.exception(caught_ex)
assert (
False
), f"Exception {caught_ex} was thrown when running model builder multi GPU test"
20 changes: 13 additions & 7 deletions tests/integ/sagemaker/serve/test_serve_transformers.py
@@ -15,15 +15,15 @@
import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode

import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)

from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)
@@ -67,7 +67,7 @@


@pytest.fixture
def input():
def model_input():
return {"inputs": "The man worked as a [MASK]."}


@@ -87,11 +87,14 @@ def model_builder(request):

@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
reason="Testing feature",
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_pytorch_transformers_sagemaker_endpoint(
sagemaker_session, model_builder, gpu_instance_type, input
sagemaker_session, model_builder, model_input, **kwargs
):
logger.info("Running in SAGEMAKER_ENDPOINT mode...")
caught_ex = None
@@ -106,9 +109,12 @@ def test_pytorch_transformers_sagemaker_endpoint(
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
predictor = model.deploy(instance_type=gpu_instance_type, initial_instance_count=1)
predictor = model.deploy(
instance_type=kwargs["instance_type"], initial_instance_count=2
)
logger.info("Endpoint successfully deployed.")
predictor.predict(input)
predictor.predict(model_input)
assert predictor is not None
except Exception as e:
caught_ex = e
finally: