Skip to content

Commit 94f609e

Browse files
authored
Merge branch 'master' into processing-job-codeartifact-support
2 parents 5d46457 + 156f6ea commit 94f609e

21 files changed

+1926
-357
lines changed

requirements/extras/test_requirements.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,7 @@ pyvis==0.2.1
2323
pandas>=1.3.5,<1.5
2424
scikit-learn==1.3.0
2525
cloudpickle==2.2.1
26-
scipy==1.10.1
27-
urllib3>=1.26.8,<3.0.0
28-
docker>=5.0.2,<7.0.0
2926
PyYAML==6.0
30-
pyspark==3.3.1
31-
sagemaker-feature-store-pyspark-3.3
3227
# TODO find workaround
3328
xgboost>=1.6.2,<=1.7.6
3429
pillow>=10.0.1,<=11
@@ -39,4 +34,3 @@ tritonclient[http]<2.37.0
3934
onnx==1.14.1
4035
# tf2onnx==1.15.1
4136
nbformat>=5.9,<6
42-
accelerate>=0.24.1,<=0.27.0

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ def read_requirements(filename):
8585
extras["all"] = [item for group in extras.values() for item in group]
8686
# Tests specific dependencies (do not need to be included in 'all')
8787
test_dependencies = read_requirements("requirements/extras/test_requirements.txt")
88+
# test dependencies are a superset of testing and extra dependencies
89+
test_dependencies.extend(extras["all"])
8890
# remove torch and torchvision if python version is not 3.10
8991
if sys.version_info.minor != 10:
9092
test_dependencies = [

src/sagemaker/clarify.py

Lines changed: 561 additions & 12 deletions
Large diffs are not rendered by default.

src/sagemaker/serve/builder/model_builder.py

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
from pathlib import Path
2222

23-
from accelerate.commands.estimate import estimate_command_parser, gather_data
2423
from sagemaker import Session
2524
from sagemaker.model import Model
2625
from sagemaker.base_predictor import PredictorBase
@@ -43,7 +42,11 @@
4342
from sagemaker.serve.utils import task
4443
from sagemaker.serve.utils.exceptions import TaskNotFoundException
4544
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
46-
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
45+
from sagemaker.serve.utils.hardware_detector import (
46+
_get_gpu_info,
47+
_get_gpu_info_fallback,
48+
_total_inference_model_size_mib,
49+
)
4750
from sagemaker.serve.detector.image_detector import (
4851
auto_detect_container,
4952
_detect_framework_and_version,
@@ -70,11 +73,8 @@
7073
ModelServer.DJL_SERVING,
7174
}
7275

73-
MIB_CONVERSION_FACTOR = 0.00000095367431640625
74-
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer
75-
7676

77-
# pylint: disable=attribute-defined-outside-init
77+
# pylint: disable=attribute-defined-outside-init, disable=E1101
7878
@dataclass
7979
class ModelBuilder(Triton, DJL, JumpStart, TGI, Transformers):
8080
"""Class that builds a deployable model.
@@ -719,39 +719,22 @@ def _schema_builder_init(self, model_task: str):
719719
except ValueError:
720720
raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")
721721

722-
def _total_inference_model_size_mib(self):
723-
"""Calculates the model size from HF accelerate
724-
725-
This function gets the model size from accelerate. It also adds a
726-
padding and converts to size MiB. When performing inference, expect
727-
to add up to an additional 20% to the given model size as found by EleutherAI.
728-
"""
729-
dtypes = self.env_vars.get("dtypes", "float32")
730-
parser = estimate_command_parser()
731-
args = parser.parse_args([self.model, "--dtypes", dtypes])
732-
733-
output = gather_data(
734-
args
735-
) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
736-
737-
if output is None:
738-
raise ValueError(f"Could not get Model size for {self.model}")
739-
740-
total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
741-
logger.info("Total memory size MIB: %s", total_memory_size_mib)
742-
return total_memory_size_mib
743-
744722
def _can_fit_on_single_gpu(self) -> Type[bool]:
745723
"""Check if model can fit on a single GPU
746724
747725
If the size of the model is <= single gpu memory size, returns True else False
748726
"""
749727
try:
750728
single_gpu_size_mib = self._try_fetch_gpu_info()
751-
if self._total_inference_model_size_mib() <= single_gpu_size_mib:
729+
if (
730+
_total_inference_model_size_mib(self.model, self.env_vars.get("dtypes", "float32"))
731+
<= single_gpu_size_mib
732+
):
752733
logger.info(
753734
"Total inference model size MIB %s, single GPU size for instance MIB %s",
754-
self._total_inference_model_size_mib(),
735+
_total_inference_model_size_mib(
736+
self.model, self.env_vars.get("dtypes", "float32")
737+
),
755738
single_gpu_size_mib,
756739
)
757740
return True

src/sagemaker/serve/builder/schema_builder.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,18 @@ def _get_inverse(self, obj):
208208

209209
def __repr__(self):
210210
"""Placeholder docstring"""
211+
if hasattr(self, "input_serializer") and hasattr(self, "output_serializer"):
212+
return (
213+
f"SchemaBuilder(\n"
214+
f"input_serializer={self.input_serializer}\n"
215+
f"output_serializer={self.output_serializer}\n"
216+
f"input_deserializer={self.input_deserializer._deserializer}\n"
217+
f"output_deserializer={self.output_deserializer._deserializer})"
218+
)
211219
return (
212220
f"SchemaBuilder(\n"
213-
f"input_serializer={self.input_serializer}\n"
214-
f"output_serializer={self.output_serializer}\n"
215-
f"input_deserializer={self.input_deserializer._deserializer}\n"
216-
f"output_deserializer={self.output_deserializer._deserializer})"
221+
f"custom_input_translator={self.custom_input_translator}\n"
222+
f"custom_output_translator={self.custom_output_translator}\n"
217223
)
218224

219225
def generate_marshalling_map(self) -> dict:

src/sagemaker/serve/utils/hardware_detector.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@
1818

1919
from botocore.exceptions import ClientError
2020

21+
from accelerate.commands.estimate import estimate_command_parser, gather_data
2122
from sagemaker import Session
23+
from sagemaker.model import Model
2224
from sagemaker import instance_types_gpu_info
2325

2426
logger = logging.getLogger(__name__)
2527

2628

29+
MIB_CONVERSION_FACTOR = 0.00000095367431640625
30+
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer
31+
32+
2733
def _get_gpu_info(instance_type: str, session: Session) -> Tuple[int, int]:
2834
"""Get GPU info for the provided instance
2935
@@ -108,3 +114,24 @@ def _format_instance_type(instance_type: str) -> str:
108114

109115
ec2_instance = ".".join(split_instance)
110116
return ec2_instance
117+
118+
119+
def _total_inference_model_size_mib(model: str, dtype: str) -> float:
    """Estimate the memory needed to run inference on ``model``, in MiB.

    The raw model size comes from the Hugging Face accelerate ``estimate``
    command helpers. It is then padded by ``MEMORY_BUFFER_MULTIPLIER`` (when
    performing inference, expect to add up to an additional 20% to the given
    model size, as found by EleutherAI) and converted to MiB with
    ``MIB_CONVERSION_FACTOR``.

    Args:
        model: Model identifier forwarded as the positional argument of the
            accelerate estimate parser. It has to be a string because it is
            handed to ``argparse``.
        dtype: Value forwarded to the parser's ``--dtypes`` option,
            for example ``"float32"``.

    Returns:
        The padded model size in MiB. This is a float, not an int.

    Raises:
        ValueError: If accelerate returns no size data for ``model``.
    """
    args = estimate_command_parser().parse_args([model, "--dtypes", dtype])

    # Each row is: "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
    output = gather_data(args)

    # An empty result is as unusable as a missing one; fail with the same
    # explicit error instead of an IndexError on the lookup below.
    if not output:
        raise ValueError(f"Could not get Model size for {model}")

    total_size_bytes = output[0][2]
    total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * total_size_bytes * MIB_CONVERSION_FACTOR
    logger.info("Total memory size MIB: %s", total_memory_size_mib)
    return total_memory_size_mib

src/sagemaker/session.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def __init__(
189189
sagemaker_runtime_client=None,
190190
sagemaker_featurestore_runtime_client=None,
191191
default_bucket=None,
192-
settings=SessionSettings(),
192+
settings=None,
193193
sagemaker_metrics_client=None,
194194
sagemaker_config: dict = None,
195195
default_bucket_prefix: str = None,
@@ -260,7 +260,7 @@ def __init__(
260260
self.resource_group_tagging_client = None
261261
self._config = None
262262
self.lambda_client = None
263-
self.settings = settings
263+
self.settings = settings if settings else SessionSettings()
264264

265265
self._initialize(
266266
boto_session=boto_session,

tests/integ/sagemaker/serve/test_serve_pt_happy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ def model_builder(request):
181181
# ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"
182182

183183

184-
@pytest.mark.skip(reason="Failing test. Fix is pending.")
185184
@pytest.mark.skipif(
186185
PYTHON_VERSION_IS_NOT_310, # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
187186
reason="The goal of these test are to test the serving components of our feature",
@@ -222,8 +221,10 @@ def test_happy_pytorch_sagemaker_endpoint(
222221
)
223222
if caught_ex:
224223
logger.exception(caught_ex)
224+
ignore_if_worker_dies = "Worker died." in str(caught_ex)
225+
# https://github.com/pytorch/serve/issues/3032
225226
assert (
226-
False
227+
ignore_if_worker_dies
227228
), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"
228229

229230

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import os
16+
17+
import pytest
18+
from botocore.config import Config
19+
20+
from tests.integ import DATA_DIR
21+
from sagemaker import Session, get_execution_role
22+
23+
CUSTOM_S3_OBJECT_KEY_PREFIX = "session-default-prefix"
24+
25+
26+
# Create a sagemaker_session in workflow scope to prevent race condition
27+
# with other tests. Some other tests may change the session `settings`.
28+
@pytest.fixture(scope="module")
def sagemaker_session_for_pipeline(
    sagemaker_client_config,
    boto_session,
):
    """Build a module-scoped ``Session`` dedicated to the workflow tests.

    Args:
        sagemaker_client_config: Mapping of keyword arguments for the boto
            ``sagemaker`` client. A ``config`` entry with 10 retry attempts is
            added when the mapping does not already provide one.
        boto_session: Boto session used to create the client and the Session.

    Returns:
        A ``Session`` with an empty ``sagemaker_config`` and
        ``CUSTOM_S3_OBJECT_KEY_PREFIX`` as its default bucket prefix.
    """
    # Work on a copy so the injected fixture value is not mutated for other
    # tests that receive the same dict.
    client_kwargs = dict(sagemaker_client_config)
    client_kwargs.setdefault("config", Config(retries=dict(max_attempts=10)))

    # After setdefault() the kwargs always contain "config", so the former
    # "else None" fallback could never be taken; the client is always built.
    sagemaker_client = boto_session.client("sagemaker", **client_kwargs)

    return Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_config={},
        default_bucket_prefix=CUSTOM_S3_OBJECT_KEY_PREFIX,
    )
46+
47+
48+
@pytest.fixture(scope="module")
def smclient(sagemaker_session):
    """Provide a ``sagemaker`` client created from the session's boto session."""
    boto_session = sagemaker_session.boto_session
    return boto_session.client("sagemaker")
51+
52+
53+
@pytest.fixture(scope="module")
def role(sagemaker_session_for_pipeline):
    """Provide the result of ``get_execution_role`` for the pipeline session."""
    execution_role = get_execution_role(sagemaker_session_for_pipeline)
    return execution_role
56+
57+
58+
@pytest.fixture(scope="module")
def region_name(sagemaker_session_for_pipeline):
    """Provide the region name of the pipeline session's boto session."""
    boto_session = sagemaker_session_for_pipeline.boto_session
    return boto_session.region_name
61+
62+
63+
@pytest.fixture(scope="module")
def script_dir():
    """Provide the path of the ``sklearn_processing`` directory under ``DATA_DIR``."""
    processing_dir = os.path.join(DATA_DIR, "sklearn_processing")
    return processing_dir

tests/integ/sagemaker/workflow/test_experiment.py

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
from tests.integ.sagemaker.workflow.helpers import wait_pipeline_execution
2121
from sagemaker.processing import ProcessingInput
22-
from sagemaker.session import get_execution_role
2322
from sagemaker.sklearn.processing import SKLearnProcessor
2423
from sagemaker.dataset_definition.inputs import DatasetDefinition, AthenaDatasetDefinition
2524
from sagemaker.workflow.execution_variables import ExecutionVariables
@@ -33,33 +32,13 @@
3332
from tests.integ import DATA_DIR
3433

3534

36-
@pytest.fixture(scope="module")
37-
def region_name(sagemaker_session):
38-
return sagemaker_session.boto_session.region_name
39-
40-
41-
@pytest.fixture(scope="module")
42-
def role(sagemaker_session):
43-
return get_execution_role(sagemaker_session)
44-
45-
46-
@pytest.fixture(scope="module")
47-
def script_dir():
48-
return os.path.join(DATA_DIR, "sklearn_processing")
49-
50-
5135
@pytest.fixture
5236
def pipeline_name():
5337
return f"my-pipeline-{int(time.time() * 10**7)}"
5438

5539

5640
@pytest.fixture
57-
def smclient(sagemaker_session):
58-
return sagemaker_session.boto_session.client("sagemaker")
59-
60-
61-
@pytest.fixture
62-
def athena_dataset_definition(sagemaker_session):
41+
def athena_dataset_definition(sagemaker_session_for_pipeline):
6342
return DatasetDefinition(
6443
local_path="/opt/ml/processing/input/add",
6544
data_distribution_type="FullyReplicated",
@@ -69,15 +48,15 @@ def athena_dataset_definition(sagemaker_session):
6948
database="default",
7049
work_group="workgroup",
7150
query_string='SELECT * FROM "default"."s3_test_table_$STAGE_$REGIONUNDERSCORED";',
72-
output_s3_uri=f"s3://{sagemaker_session.default_bucket()}/add",
51+
output_s3_uri=f"s3://{sagemaker_session_for_pipeline.default_bucket()}/add",
7352
output_format="JSON",
7453
output_compression="GZIP",
7554
),
7655
)
7756

7857

7958
def test_pipeline_execution_with_default_experiment_config(
80-
sagemaker_session,
59+
sagemaker_session_for_pipeline,
8160
smclient,
8261
role,
8362
sklearn_latest_version,
@@ -99,7 +78,7 @@ def test_pipeline_execution_with_default_experiment_config(
9978
instance_type=cpu_instance_type,
10079
instance_count=instance_count,
10180
command=["python3"],
102-
sagemaker_session=sagemaker_session,
81+
sagemaker_session=sagemaker_session_for_pipeline,
10382
base_job_name="test-sklearn",
10483
)
10584

@@ -113,7 +92,7 @@ def test_pipeline_execution_with_default_experiment_config(
11392
name=pipeline_name,
11493
parameters=[instance_count],
11594
steps=[step_sklearn],
116-
sagemaker_session=sagemaker_session,
95+
sagemaker_session=sagemaker_session_for_pipeline,
11796
)
11897

11998
try:
@@ -142,7 +121,7 @@ def test_pipeline_execution_with_default_experiment_config(
142121

143122

144123
def test_pipeline_execution_with_custom_experiment_config(
145-
sagemaker_session,
124+
sagemaker_session_for_pipeline,
146125
smclient,
147126
role,
148127
sklearn_latest_version,
@@ -164,7 +143,7 @@ def test_pipeline_execution_with_custom_experiment_config(
164143
instance_type=cpu_instance_type,
165144
instance_count=instance_count,
166145
command=["python3"],
167-
sagemaker_session=sagemaker_session,
146+
sagemaker_session=sagemaker_session_for_pipeline,
168147
base_job_name="test-sklearn",
169148
)
170149

@@ -185,7 +164,7 @@ def test_pipeline_execution_with_custom_experiment_config(
185164
trial_name=Join(on="-", values=["my-trial", ExecutionVariables.PIPELINE_EXECUTION_ID]),
186165
),
187166
steps=[step_sklearn],
188-
sagemaker_session=sagemaker_session,
167+
sagemaker_session=sagemaker_session_for_pipeline,
189168
)
190169

191170
try:

0 commit comments

Comments
 (0)