Commit f47e40a

change: Enhance model builder selection logic to include model size

1 parent 84989bb commit f47e40a
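This commit teaches ModelBuilder.build() to factor a Hugging Face model's estimated memory footprint into serving-backend selection: text-generation models still route to TGI; a model whose padded size fits on a single GPU of the target instance routes to the transformers backend; models matching DJL's DeepSpeed or FasterTransformer recommended architectures route to DJL Serving; everything else falls back to transformers. The size estimate comes from accelerate's memory estimator with a 20% inference-time buffer, which is why pinned accelerate and huggingface_hub dependencies are added below.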

File tree

7 files changed: +453 −8 lines changed
requirements/extras/huggingface_requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+accelerate>=0.24.1,<=0.27.0
+huggingface_hub>=0.14.0,<=0.18.0

requirements/extras/test_requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -39,3 +39,5 @@ tritonclient[http]<2.37.0
 onnx==1.14.1
 # tf2onnx==1.15.1
 nbformat>=5.9,<6
+accelerate>=0.24.1,<=0.27.0
+huggingface_hub>=0.14.0,<=0.18.0

setup.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ def read_requirements(filename):
     "feature-processor": read_requirements(
         "requirements/extras/feature-processor_requirements.txt"
     ),
+    "huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
 }
 # Meta dependency groups
 extras["all"] = [item for group in extras.values() for item in group]

src/sagemaker/serve/builder/model_builder.py

Lines changed: 80 additions & 1 deletion
@@ -20,9 +20,11 @@
 
 from pathlib import Path
 
+from accelerate.commands.estimate import estimate_command_parser, gather_data
 from sagemaker import Session
 from sagemaker.model import Model
 from sagemaker.base_predictor import PredictorBase
+from sagemaker.djl_inference import defaults
 from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
 from sagemaker.deserializers import JSONDeserializer, TorchTensorDeserializer
 from sagemaker.serve.builder.schema_builder import SchemaBuilder
@@ -41,6 +43,7 @@
 from sagemaker.serve.utils import task
 from sagemaker.serve.utils.exceptions import TaskNotFoundException
 from sagemaker.serve.utils.predictors import _get_local_mode_predictor
+from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
 from sagemaker.serve.detector.image_detector import (
     auto_detect_container,
     _detect_framework_and_version,
@@ -67,6 +70,9 @@
     ModelServer.DJL_SERVING,
 }
 
+MIB_CONVERSION_FACTOR = 0.00000095367431640625  # bytes -> MiB (1 / 2**20)
+MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% buffer
+
 
 # pylint: disable=attribute-defined-outside-init
 @dataclass
@@ -569,7 +575,7 @@ def wrapper(*args, **kwargs):
     # It supports two modes of deployment
     # 1/ SageMaker Endpoint
     # 2/ Local launch with container
-    def build(
+    def build(  # pylint: disable=R0911
         self,
         mode: Type[Mode] = None,
         role_arn: str = None,
@@ -618,13 +624,20 @@ def build(
             hf_model_md = get_huggingface_model_metadata(
                 self.model, self.env_vars.get("HUGGING_FACE_HUB_TOKEN")
             )
 
             model_task = hf_model_md.get("pipeline_tag")
             if self.schema_builder is None and model_task:
                 self._schema_builder_init(model_task)
 
             if model_task == "text-generation":  # pylint: disable=R1705
                 return self._build_for_tgi()
+            elif self._can_fit_on_single_gpu():
+                return self._build_for_transformers()
+            elif (
+                self.model in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES
+                or self.model in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES
+            ):
+                return self._build_for_djl()
             else:
                 return self._build_for_transformers()
 
@@ -696,3 +709,69 @@ def _schema_builder_init(self, model_task: str):
             self.schema_builder = SchemaBuilder(sample_inputs, sample_outputs)
         except ValueError:
             raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")
+
+    def _total_inference_model_size_mib(self):
+        """Calculates the model size from HF accelerate
+
+        This function gets the model size from accelerate. It also adds
+        padding and converts the size to MiB. When performing inference,
+        expect up to an additional 20% on top of the estimated size, as
+        found by EleutherAI.
+        """
+        dtypes = self.env_vars.get("dtypes", "float32")
+
+        try:
+            parser = estimate_command_parser()
+            args = parser.parse_args([self.model, "--dtypes", dtypes])
+        except ValueError:
+            logger.error("Args specified incorrectly for model %s", self.model)
+            raise
+
+        # gather_data rows: dtype, largest layer, total size (bytes), training size using Adam
+        output = gather_data(args)
+
+        total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
+        logger.info("Total memory size MiB: %s", total_memory_size_mib)
+        return total_memory_size_mib
+
+    def _can_fit_on_single_gpu(self) -> bool:
+        """Check if the model can fit on a single GPU
+
+        Returns True if the total model size is <= the memory size of a
+        single GPU on the target instance, else False.
+        """
+        try:
+            single_gpu_size_mib = self._try_fetch_gpu_info()
+            total_size_mib = self._total_inference_model_size_mib()
+            logger.info(
+                "Total inference model size MiB %s, single GPU size for instance MiB %s",
+                total_size_mib,
+                single_gpu_size_mib,
+            )
+            return total_size_mib <= single_gpu_size_mib
+        except ValueError:
+            logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
+            return False
+
+    def _try_fetch_gpu_info(self):
+        """Get the memory size, in MiB, of a single GPU on the target instance
+
+        Queries the instance's GPU info, falling back to a static lookup
+        table if the query fails.
+        """
+        try:
+            gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
+            logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
+            return gpu_info[1] / gpu_info[0]
+        except ValueError:
+            pass
+        try:
+            gpu_fallback = _get_gpu_info_fallback(
+                self.instance_type, self.sagemaker_session.boto_region_name
+            )
+            logger.info("GPU fallback picked up %s", gpu_fallback)
+            return gpu_fallback[1] / gpu_fallback[0]
+        except ValueError:
+            raise ValueError(
+                f"Unable to determine single GPU size for instance: [{self.instance_type}]"
+            )
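For intuition, here is a minimal standalone sketch (not part of the commit) of the size check the new helpers perform, using the same accelerate estimator API the diff imports. The model id and the single-GPU memory figure are illustrative assumptions, not values hard-coded by the change:

# Sketch of the new size check, assuming accelerate>=0.24.1 is installed.
# "bert-base-uncased" and the 15360 MiB figure (one NVIDIA T4) are examples.
from accelerate.commands.estimate import estimate_command_parser, gather_data

MIB_CONVERSION_FACTOR = 0.00000095367431640625  # bytes -> MiB (1 / 2**20)
MEMORY_BUFFER_MULTIPLIER = 1.2  # pad by 20% for inference-time overhead

parser = estimate_command_parser()
args = parser.parse_args(["bert-base-uncased", "--dtypes", "float32"])
output = gather_data(args)  # rows: dtype, largest layer, total size (bytes), training size

total_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
single_gpu_mib = 15360  # assumed single-GPU memory of the target instance

print(f"Padded model size: {total_mib:.0f} MiB; fits on one GPU: {total_mib <= single_gpu_mib}")

Note the per-GPU math in _try_fetch_gpu_info: it divides the instance's total GPU memory by its GPU count, so an ml.g4dn.12xlarge (four T4s) yields one quarter of the instance total rather than the full pool.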
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import pytest
+from sagemaker.serve import Mode
+from sagemaker.serve.builder.model_builder import ModelBuilder
+from sagemaker.serve.builder.schema_builder import SchemaBuilder
+from tests.integ.sagemaker.serve.constants import (
+    HF_DIR,
+    PYTHON_VERSION_IS_NOT_310,
+    SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
+)
+from tests.integ.timeout import timeout
+from tests.integ.utils import cleanup_model_resources
+import logging
+
+logger = logging.getLogger(__name__)
+
+model_id = "bert-base-uncased"
+
+sample_input = {"inputs": "Hello I'm a [MASK] model."}
+
+sample_output = [
+    {
+        "score": 0.10731109976768494,
+        "token": 4827,
+        "token_str": "fashion",
+        "sequence": "hello i'm a fashion model.",
+    },
+    {
+        "score": 0.08774465322494507,
+        "token": 2535,
+        "token_str": "role",
+        "sequence": "hello i'm a role model.",
+    },
+    {
+        "score": 0.05338414013385773,
+        "token": 2047,
+        "token_str": "new",
+        "sequence": "hello i'm a new model.",
+    },
+    {
+        "score": 0.04667224362492561,
+        "token": 3565,
+        "token_str": "super",
+        "sequence": "hello i'm a super model.",
+    },
+    {
+        "score": 0.027096163481473923,
+        "token": 2986,
+        "token_str": "fine",
+        "sequence": "hello i'm a fine model.",
+    },
+]
+
+
+@pytest.fixture
+def model_input():
+    return {"inputs": "The man worked as a [MASK]."}
+
+
+@pytest.fixture
+def model_builder_model_schema_builder():
+    return ModelBuilder(
+        model_path=HF_DIR, model=model_id, schema_builder=SchemaBuilder(sample_input, sample_output)
+    )
+
+
+@pytest.fixture
+def model_builder(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.mark.skipif(
+    PYTHON_VERSION_IS_NOT_310,
+    reason="Testing feature",
+)
+@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
+def test_non_text_generation_model_single_GPU(sagemaker_session, model_builder, model_input):
+    iam_client = sagemaker_session.boto_session.client("iam")
+    role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
+    caught_ex = None
+    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
+        try:
+            model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
+            logger.info("Running in SAGEMAKER_ENDPOINT mode")
+            predictor = model.deploy(
+                mode=Mode.SAGEMAKER_ENDPOINT,
+                instance_type="ml.g4dn.xlarge",
+                initial_instance_count=1,
+            )
+            logger.info("Endpoint successfully deployed.")
+            prediction = predictor.predict(model_input)
+            assert prediction is not None
+
+            endpoint_name = predictor.endpoint_name
+            sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
+            endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
+                "EndpointConfigName"
+            ]
+            actual_instance_type = sagemaker_client.describe_endpoint_config(
+                EndpointConfigName=endpoint_config_name
+            )["ProductionVariants"][0]["InstanceType"]
+            assert "ml.g4dn.xlarge" == actual_instance_type
+        except Exception as e:
+            caught_ex = e
+        finally:
+            cleanup_model_resources(
+                sagemaker_session=model_builder.sagemaker_session,
+                model_name=model.name,
+                endpoint_name=model.endpoint_name,
+            )
+            if caught_ex:
+                logger.exception(caught_ex)
+                assert (
+                    False
+                ), f"Exception {caught_ex} was thrown when running model builder single GPU test"
+
+
+@pytest.mark.skipif(
+    PYTHON_VERSION_IS_NOT_310,
+    reason="Testing feature",
+)
+@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
+def test_non_text_generation_model_multi_GPU(sagemaker_session, model_builder, model_input):
+    iam_client = sagemaker_session.boto_session.client("iam")
+    role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
+    caught_ex = None
+    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
+        try:
+            model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
+            logger.info("Running in SAGEMAKER_ENDPOINT mode")
+            predictor = model.deploy(
+                mode=Mode.SAGEMAKER_ENDPOINT,
+                instance_type="ml.g4dn.12xlarge",
+                initial_instance_count=1,
+            )
+            logger.info("Endpoint successfully deployed.")
+            prediction = predictor.predict(model_input)
+            assert prediction is not None
+
+            endpoint_name = predictor.endpoint_name
+            sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
+            endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
+                "EndpointConfigName"
+            ]
+            actual_instance_type = sagemaker_client.describe_endpoint_config(
+                EndpointConfigName=endpoint_config_name
+            )["ProductionVariants"][0]["InstanceType"]
+            assert "ml.g4dn.12xlarge" == actual_instance_type
+        except Exception as e:
+            caught_ex = e
+        finally:
+            cleanup_model_resources(
+                sagemaker_session=model_builder.sagemaker_session,
+                model_name=model.name,
+                endpoint_name=model.endpoint_name,
+            )
+            if caught_ex:
+                logger.exception(caught_ex)
+                assert (
+                    False
+                ), f"Exception {caught_ex} was thrown when running model builder multi GPU test"
