Skip to content

Add telemetry metrics on usage of default images for ModelBuilder #4430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/sagemaker/serve/builder/djl_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def __init__(self):
self.mode = None
self.model_server = None
self.image_uri = None
self._is_custom_image_uri = False
self.image_config = None
self.vpc_config = None
self._original_deploy = None
Expand Down
4 changes: 3 additions & 1 deletion src/sagemaker/serve/builder/jumpstart_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def __init__(self):
self.mode = None
self.model_server = None
self.image_uri = None
self._is_custom_image_uri = False
self.vpc_config = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the added VPC config for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for feature parity. This config has already exposed to djl/tgi/torchserve/tri

self._original_deploy = None
self.secret_key = None
self.js_model_config = None
Expand Down Expand Up @@ -94,7 +96,7 @@ def _is_jumpstart_model_id(self) -> bool:

def _create_pre_trained_js_model(self) -> Type[Model]:
"""Placeholder docstring"""
pysdk_model = JumpStartModel(self.model)
pysdk_model = JumpStartModel(self.model, vpc_config=self.vpc_config)
pysdk_model.sagemaker_session = self.sagemaker_session

self._original_deploy = pysdk_model.deploy
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/serve/builder/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,8 @@ def build(

self.serve_settings = self._get_serve_setting()

self._is_custom_image_uri = self.image_uri is None

if isinstance(self.model, str):
if self._is_jumpstart_model_id():
return self._build_for_jumpstart()
Expand Down
1 change: 1 addition & 0 deletions src/sagemaker/serve/builder/tgi_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self):
self.mode = None
self.model_server = None
self.image_uri = None
self._is_custom_image_uri = False
self.image_config = None
self.vpc_config = None
self._original_deploy = None
Expand Down
4 changes: 4 additions & 0 deletions src/sagemaker/serve/builder/transformers_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def __init__(self):
self.mode = None
self.model_server = None
self.image_uri = None
self._is_custom_image_uri = False
self.vpc_config = None
self._original_deploy = None
self.hf_model_config = None
self._default_data_type = None
Expand Down Expand Up @@ -111,6 +113,7 @@ def _create_transformers_model(self) -> Type[Model]:
py_version=self.py_version,
transformers_version=base_hf_version,
pytorch_version=self.pytorch_version,
vpc_config=self.vpc_config,
)
elif "keras" in hf_model_md.get("tags") or "tensorflow" in hf_model_md.get("tags"):
self.tensorflow_version = self._get_supported_version(
Expand All @@ -126,6 +129,7 @@ def _create_transformers_model(self) -> Type[Model]:
py_version=self.py_version,
transformers_version=base_hf_version,
tensorflow_version=self.tensorflow_version,
vpc_config=self.vpc_config,
)

if self.mode == Mode.LOCAL_CONTAINER:
Expand Down
24 changes: 23 additions & 1 deletion src/sagemaker/serve/utils/telemetry_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
from sagemaker import Session, exceptions
from sagemaker.serve.mode.function_pointers import Mode
from sagemaker.serve.utils.exceptions import ModelBuilderException
from sagemaker.serve.utils.types import ModelServer
from sagemaker.serve.utils.types import ModelServer, ImageUriOption
from sagemaker.serve.validations.check_image_uri import is_1p_image_uri
from sagemaker.user_agent import SDK_VERSION

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -62,11 +63,13 @@ def wrapper(self, *args, **kwargs):
caught_ex = None

image_uri_tail = self.image_uri.split("/")[1]
image_uri_option = _get_image_uri_option(self.image_uri, self._is_custom_image_uri)
extra = (
f"{func_name}"
f"&x-modelServer={MODEL_SERVER_TO_CODE[str(self.model_server)]}"
f"&x-imageTag={image_uri_tail}"
f"&x-sdkVersion={SDK_VERSION}"
f"&x-defaultImageUsage={image_uri_option}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curious, is "default" important to the naming here or can we just go with "imageUsage"

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or "imageType"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, this metric is to capture whether customers provide their own image or not, the further differentiation on custom image or custom 1p image is not that important.

Copy link
Collaborator

@gwang111 gwang111 Feb 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack sound good. posed the question from a string len perspective. 1024 is max i believe. So if there are ways to cut down on length without losing the meaningfulness we should try and do so

)

if self.model_server == ModelServer.DJL_SERVING or self.model_server == ModelServer.TGI:
Expand Down Expand Up @@ -201,3 +204,22 @@ def _get_region_or_default(session):
return session.boto_session.region_name
except Exception: # pylint: disable=W0703
return "us-west-2"


def _get_image_uri_option(image_uri: str, is_custom_image: bool) -> int:
"""Detect whether default values are used for ModelBuilder

Args:
image_uri (str): Image uri used by ModelBuilder.
is_custom_image: (bool): Boolean indicating whether customer provides with custom image.
Returns:
bool: Integer code of image option types.
"""

if not is_custom_image:
return ImageUriOption.DEFAULT_IMAGE.value

if is_1p_image_uri(image_uri):
return ImageUriOption.CUSTOM_1P_IMAGE.value

return ImageUriOption.CUSTOM_IMAGE.value
12 changes: 12 additions & 0 deletions src/sagemaker/serve/utils/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,15 @@ def __str__(self) -> str:
INFERENTIA_1 = 3
INFERENTIA_2 = 4
GRAVITON = 5


class ImageUriOption(Enum):
"""Enum type for image uri options"""

def __str__(self) -> str:
"""Convert enum to string"""
return str(self.name)

CUSTOM_IMAGE = 1
CUSTOM_1P_IMAGE = 2
DEFAULT_IMAGE = 3
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import unittest
from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.mode.function_pointers import Mode
from tests.unit.sagemaker.serve.constants import MOCK_VPC_CONFIG

from sagemaker.serve.utils.predictors import TransformersLocalModePredictor

Expand Down Expand Up @@ -74,6 +75,7 @@ def test_build_deploy_for_transformers_local_container_and_remote_container(
model=mock_model_id,
schema_builder=mock_schema_builder,
mode=Mode.LOCAL_CONTAINER,
vpc_config=MOCK_VPC_CONFIG,
)

builder._prepare_for_mode = MagicMock()
Expand All @@ -85,6 +87,7 @@ def test_build_deploy_for_transformers_local_container_and_remote_container(
builder.modes[str(Mode.LOCAL_CONTAINER)] = MagicMock()
predictor = model.deploy(model_data_download_timeout=1800)

assert model.vpc_config == MOCK_VPC_CONFIG
assert builder.env_vars["MODEL_LOADING_TIMEOUT"] == "1800"
assert isinstance(predictor, TransformersLocalModePredictor)

Expand Down
38 changes: 38 additions & 0 deletions tests/unit/sagemaker/serve/utils/test_telemetry_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_construct_url,
)
from sagemaker.serve.utils.exceptions import ModelBuilderException, LocalModelOutOfMemoryException
from sagemaker.serve.utils.types import ImageUriOption
from sagemaker.user_agent import SDK_VERSION

MOCK_SESSION = Mock()
Expand Down Expand Up @@ -71,6 +72,7 @@ def test_capture_telemetry_decorator_djl_success(self, mock_send_telemetry):
mock_model_builder = ModelBuilderMock()
mock_model_builder.serve_settings.telemetry_opt_out = False
mock_model_builder.image_uri = MOCK_DJL_CONTAINER
mock_model_builder._is_custom_image_uri = False
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.DJL_SERVING
Expand All @@ -85,6 +87,37 @@ def test_capture_telemetry_decorator_djl_success(self, mock_send_telemetry):
"&x-modelServer=4"
"&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118"
f"&x-sdkVersion={SDK_VERSION}"
f"&x-defaultImageUsage={ImageUriOption.DEFAULT_IMAGE.value}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
f"&x-latency={latency}"
)

mock_send_telemetry.assert_called_once_with(
"1", 2, MOCK_SESSION, None, None, expected_extra_str
)

@patch("sagemaker.serve.utils.telemetry_logger._send_telemetry")
def test_capture_telemetry_decorator_djl_success_with_custom_image(self, mock_send_telemetry):
mock_model_builder = ModelBuilderMock()
mock_model_builder.serve_settings.telemetry_opt_out = False
mock_model_builder.image_uri = MOCK_DJL_CONTAINER
mock_model_builder._is_custom_image_uri = True
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.DJL_SERVING
mock_model_builder.sagemaker_session.endpoint_arn = MOCK_ENDPOINT_ARN

mock_model_builder.mock_deploy()

args = mock_send_telemetry.call_args.args
latency = str(args[5]).split("latency=")[1]
expected_extra_str = (
f"{MOCK_FUNC_NAME}"
"&x-modelServer=4"
"&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118"
f"&x-sdkVersion={SDK_VERSION}"
f"&x-defaultImageUsage={ImageUriOption.CUSTOM_1P_IMAGE.value}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
f"&x-latency={latency}"
Expand All @@ -99,6 +132,7 @@ def test_capture_telemetry_decorator_tgi_success(self, mock_send_telemetry):
mock_model_builder = ModelBuilderMock()
mock_model_builder.serve_settings.telemetry_opt_out = False
mock_model_builder.image_uri = MOCK_TGI_CONTAINER
mock_model_builder._is_custom_image_uri = False
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.TGI
Expand All @@ -113,6 +147,7 @@ def test_capture_telemetry_decorator_tgi_success(self, mock_send_telemetry):
"&x-modelServer=6"
"&x-imageTag=huggingface-pytorch-inference:2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04"
f"&x-sdkVersion={SDK_VERSION}"
f"&x-defaultImageUsage={ImageUriOption.DEFAULT_IMAGE.value}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
f"&x-latency={latency}"
Expand All @@ -127,6 +162,7 @@ def test_capture_telemetry_decorator_no_call_when_disabled(self, mock_send_telem
mock_model_builder = ModelBuilderMock()
mock_model_builder.serve_settings.telemetry_opt_out = True
mock_model_builder.image_uri = MOCK_DJL_CONTAINER
mock_model_builder._is_custom_image_uri = False
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.model_server = ModelServer.DJL_SERVING

Expand All @@ -139,6 +175,7 @@ def test_capture_telemetry_decorator_handle_exception_success(self, mock_send_te
mock_model_builder = ModelBuilderMock()
mock_model_builder.serve_settings.telemetry_opt_out = False
mock_model_builder.image_uri = MOCK_DJL_CONTAINER
mock_model_builder._is_custom_image_uri = False
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.DJL_SERVING
Expand All @@ -158,6 +195,7 @@ def test_capture_telemetry_decorator_handle_exception_success(self, mock_send_te
"&x-modelServer=4"
"&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118"
f"&x-sdkVersion={SDK_VERSION}"
f"&x-defaultImageUsage={ImageUriOption.DEFAULT_IMAGE.value}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
f"&x-latency={latency}"
Expand Down