Skip to content

Telemetry metrics #4414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Feb 14, 2024
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ env/
tests/data/**/_repack_model.py
tests/data/experiment/sagemaker-dev-1.0.tar.gz
src/sagemaker/serve/tmp_workspace
src/sagemaker/image_uri_config/pysdk_version.json
15 changes: 15 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,21 @@ def read_version():
return read("VERSION").strip()


def pysdk_version():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems a little strange to include in the setup.py. Is this the best way to get the PySDK version?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am still looking for a better way. Any suggestion here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gwang111 and @makungaj1 We don't need to create a function just to get the SDK version. We can do something like this:

import importlib_metadata

SDK_VERSION=importlib_metadata.version("sagemaker")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tagging @mohanasudhan to check whether he approves of this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could try this:

SDK_VERSION = importlib_metadata.version("sagemaker")

SDK_VERSION = importlib_metadata.version("sagemaker")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious, why do we need a function to pull the version? What is the problem with the option listed above?

"""Persists Sagemaker Python SDK Version in Config"""
content = '{"version": "' + read_version() + '"}'
with open(
os.path.join(
os.path.dirname(__file__), "src", "sagemaker", "image_uri_config", "pysdk_version.json"
),
"w",
) as v:
v.write(content)


pysdk_version()


def read_requirements(filename):
"""Reads requirements file which lists package dependencies.

Expand Down
13 changes: 11 additions & 2 deletions src/sagemaker/serve/utils/telemetry_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
import logging
import requests

from sagemaker import Session
from sagemaker import Session, exceptions
from sagemaker.serve.mode.function_pointers import Mode
from sagemaker.serve.utils.exceptions import ModelBuilderException
from sagemaker.serve.utils.types import ModelServer
from sagemaker.utils import pysdk_version

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -63,11 +64,15 @@ def wrapper(self, *args, **kwargs):
f"{func_name}"
f"&x-modelServer={MODEL_SERVER_TO_CODE[str(self.model_server)]}"
f"&x-imageTag={image_uri_tail}"
f"&x-pySdkVersion={pysdk_version()}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Create the constant SDK_VERSION and then reuse it wherever needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)

if self.model_server == ModelServer.DJL_SERVING or self.model_server == ModelServer.TGI:
extra += f"&x-modelName={self.model}"

if self.sagemaker_session.endpoint_arn:
extra += f"&x-endpointArn={self.sagemaker_session.endpoint_arn}"

try:
response = func(self, *args, **kwargs)
if not self.serve_settings.telemetry_opt_out:
Expand All @@ -79,7 +84,11 @@ def wrapper(self, *args, **kwargs):
None,
extra,
)
except ModelBuilderException as e:
except (
ModelBuilderException,
exceptions.CapacityError,
exceptions.UnexpectedStatusException,
) as e:
if not self.serve_settings.telemetry_opt_out:
_send_telemetry(
"0",
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ def __init__(
# sagemaker_config is validated and initialized inside :func:`_initialize`,
# so if default_bucket is None and the sagemaker_config has a default S3 bucket configured,
# _default_bucket_name_override will be set again inside :func:`_initialize`.
self.endpoint_arn = None
self._default_bucket = None
self._default_bucket_name_override = default_bucket
# this may also be set again inside :func:`_initialize` if it is None
Expand Down Expand Up @@ -5054,6 +5055,7 @@ def wait_for_endpoint(self, endpoint, poll=DEFAULT_EP_POLL, live_logging=False):
poll=EP_LOGGER_POLL,
)
status = desc["EndpointStatus"]
self.endpoint_arn = desc["EndpointArn"]

if status != "InService":
reason = desc.get("FailureReason", None)
Expand Down
7 changes: 7 additions & 0 deletions src/sagemaker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1489,3 +1489,10 @@ def format_tags(tags: Tags) -> List[TagsDict]:
return [{"Key": str(k), "Value": str(v)} for k, v in tags.items()]

return tags


def pysdk_version() -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need this either, as suggested above.

"""Returns the current Sagemaker Python SDK Version"""
v_path = os.path.join(os.path.dirname(__file__), "image_uri_config", "pysdk_version.json")
with open(v_path) as v:
return json.load(v).get("version")
14 changes: 14 additions & 0 deletions tests/unit/sagemaker/serve/utils/test_telemetry_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_construct_url,
)
from sagemaker.serve.utils.exceptions import ModelBuilderException, LocalModelOutOfMemoryException
from sagemaker.utils import pysdk_version

MOCK_SESSION = Mock()
MOCK_FUNC_NAME = "Mock.deploy"
Expand All @@ -32,6 +33,10 @@
)
MOCK_HUGGINGFACE_ID = "meta-llama/Llama-2-7b-hf"
MOCK_EXCEPTION = LocalModelOutOfMemoryException("mock raise ex")
MOCK_ENDPOINT_ARN = (
"arn:aws:sagemaker:us-west-2:123456789012:endpoint/huggingface-pytorch-tgi-inference-2024-02-06"
"-04-06-23-819"
)


class ModelBuilderMock:
Expand Down Expand Up @@ -72,14 +77,17 @@ def test_capture_telemetry_decorator_djl_success(self, mock_send_telemetry):
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.DJL_SERVING
mock_model_builder.sagemaker_session.endpoint_arn = MOCK_ENDPOINT_ARN

mock_model_builder.mock_deploy()

expected_extra_str = (
f"{MOCK_FUNC_NAME}"
"&x-modelServer=4"
"&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118"
f"&x-pySdkVersion={pysdk_version()}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
)
mock_send_telemetry.assert_called_once_with(
"1", 2, MOCK_SESSION, None, None, expected_extra_str
Expand All @@ -93,14 +101,17 @@ def test_capture_telemetry_decorator_tgi_success(self, mock_send_telemetry):
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.TGI
mock_model_builder.sagemaker_session.endpoint_arn = MOCK_ENDPOINT_ARN

mock_model_builder.mock_deploy()

expected_extra_str = (
f"{MOCK_FUNC_NAME}"
"&x-modelServer=6"
"&x-imageTag=huggingface-pytorch-inference:2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04"
f"&x-pySdkVersion={pysdk_version()}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
)
mock_send_telemetry.assert_called_once_with(
"1", 2, MOCK_SESSION, None, None, expected_extra_str
Expand All @@ -126,6 +137,7 @@ def test_capture_telemetry_decorator_handle_exception_success(self, mock_send_te
mock_model_builder.model = MOCK_HUGGINGFACE_ID
mock_model_builder.mode = Mode.LOCAL_CONTAINER
mock_model_builder.model_server = ModelServer.DJL_SERVING
mock_model_builder.sagemaker_session.endpoint_arn = MOCK_ENDPOINT_ARN

mock_exception = Mock()
mock_exception_obj = MOCK_EXCEPTION
Expand All @@ -138,7 +150,9 @@ def test_capture_telemetry_decorator_handle_exception_success(self, mock_send_te
f"{MOCK_FUNC_NAME}"
"&x-modelServer=4"
"&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118"
f"&x-pySdkVersion={pysdk_version()}"
f"&x-modelName={MOCK_HUGGINGFACE_ID}"
f"&x-endpointArn={MOCK_ENDPOINT_ARN}"
)
mock_send_telemetry.assert_called_once_with(
"0",
Expand Down
79 changes: 58 additions & 21 deletions tests/unit/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -3192,7 +3192,9 @@ def test_create_model_from_job_with_vpc_config_override(sagemaker_session):

def test_endpoint_from_production_variants(sagemaker_session):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Could we use a full ARN like arn:aws:sagemaker:us-west-2:123456789012:endpoint/test?

)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -3487,7 +3489,7 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection(
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT_CONFIG

sagemaker_session.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService"}
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
Expand Down Expand Up @@ -3555,7 +3557,7 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection_parti
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT_CONFIG

sagemaker_session.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService"}
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.g5.xlarge"),
Expand Down Expand Up @@ -3619,7 +3621,7 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection_no_km
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT_CONFIG

sagemaker_session.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService"}
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.g5.xlarge"),
Expand Down Expand Up @@ -3726,7 +3728,9 @@ def test_create_endpoint_config_with_explainer_config(sagemaker_session):

def test_endpoint_from_production_variants_with_tags(sagemaker_session):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -3757,7 +3761,9 @@ def test_endpoint_from_production_variants_with_combined_sagemaker_config_inject
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT_ENDPOINT_CONFIG_COMBINED

ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -3801,7 +3807,9 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection_tags(
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT

ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -3830,7 +3838,9 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection_tags(

def test_endpoint_from_production_variants_with_accelerator_type(sagemaker_session):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge", accelerator_type=ACCELERATOR_TYPE),
sagemaker.production_variant("B", "p299.4096xlarge", accelerator_type=ACCELERATOR_TYPE),
Expand Down Expand Up @@ -3861,7 +3871,9 @@ def test_endpoint_from_production_variants_with_accelerator_type_sagemaker_confi
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT

ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge", accelerator_type=ACCELERATOR_TYPE),
sagemaker.production_variant("B", "p299.4096xlarge", accelerator_type=ACCELERATOR_TYPE),
Expand Down Expand Up @@ -3892,7 +3904,9 @@ def test_endpoint_from_production_variants_with_serverless_inference_config(
sagemaker_session,
):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant(
"A", "ml.p2.xlarge", serverless_inference_config=SERVERLESS_INFERENCE_CONFIG
Expand Down Expand Up @@ -3929,7 +3943,9 @@ def test_endpoint_from_production_variants_with_serverless_inference_config_sage
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT

ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant(
"A", "ml.p2.xlarge", serverless_inference_config=SERVERLESS_INFERENCE_CONFIG
Expand Down Expand Up @@ -3964,7 +3980,9 @@ def test_endpoint_from_production_variants_with_serverless_inference_config_sage

def test_endpoint_from_production_variants_with_async_config(sagemaker_session):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -4000,7 +4018,9 @@ def test_endpoint_from_production_variants_with_async_config_sagemaker_config_in
sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_ENDPOINT

ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -4037,7 +4057,9 @@ def test_endpoint_from_production_variants_with_clarify_explainer_config(
sagemaker_session,
):
ims = sagemaker_session
ims.sagemaker_client.describe_endpoint = Mock(return_value={"EndpointStatus": "InService"})
ims.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
pvs = [
sagemaker.production_variant("A", "ml.p2.xlarge"),
sagemaker.production_variant("B", "p299.4096xlarge"),
Expand Down Expand Up @@ -4069,7 +4091,7 @@ def test_endpoint_from_production_variants_with_clarify_explainer_config(

def test_update_endpoint_succeed(sagemaker_session):
sagemaker_session.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "InService"}
return_value={"EndpointStatus": "InService", "EndpointArn": "arn:aws:sagemaker:"}
)
endpoint_name = "some-endpoint"
endpoint_config = "some-endpoint-config"
Expand All @@ -4079,7 +4101,7 @@ def test_update_endpoint_succeed(sagemaker_session):

def test_update_endpoint_no_wait(sagemaker_session):
sagemaker_session.sagemaker_client.describe_endpoint = Mock(
return_value={"EndpointStatus": "Updating"}
return_value={"EndpointStatus": "Updating", "EndpointArn": "arn:aws:sagemaker:"}
)
endpoint_name = "some-endpoint"
endpoint_config = "some-endpoint-config"
Expand Down Expand Up @@ -6136,7 +6158,10 @@ def test_upload_data_default_bucket_and_prefix_combinations(

def test_is_inference_component_based_endpoint_affirmative(sagemaker_session):

describe_endpoint_response = {"EndpointConfigName": "some-endpoint-config"}
describe_endpoint_response = {
"EndpointConfigName": "some-endpoint-config",
"EndpointArn": "arn:aws:sagemaker:",
}
describe_endpoint_config_response = {
"ExecutionRoleArn": "some-role-arn",
"ProductionVariants": [{"VariantName": "AllTraffic"}],
Expand All @@ -6160,7 +6185,10 @@ def test_is_inference_component_based_endpoint_affirmative(sagemaker_session):

def test_is_inference_component_based_endpoint_negative_no_role(sagemaker_session):

describe_endpoint_response = {"EndpointConfigName": "some-endpoint-config"}
describe_endpoint_response = {
"EndpointConfigName": "some-endpoint-config",
"EndpointArn": "arn:aws:sagemaker:",
}
describe_endpoint_config_response = {
"ProductionVariants": [{"VariantName": "AllTraffic"}],
}
Expand All @@ -6183,7 +6211,10 @@ def test_is_inference_component_based_endpoint_negative_no_role(sagemaker_sessio

def test_is_inference_component_based_endpoint_positive_multiple_variants(sagemaker_session):

describe_endpoint_response = {"EndpointConfigName": "some-endpoint-config"}
describe_endpoint_response = {
"EndpointConfigName": "some-endpoint-config",
"EndpointArn": "arn:aws:sagemaker:",
}
describe_endpoint_config_response = {
"ExecutionRoleArn": "some-role-arn",
"ProductionVariants": [{"VariantName": "AllTraffic1"}, {"VariantName": "AllTraffic2"}],
Expand All @@ -6207,7 +6238,10 @@ def test_is_inference_component_based_endpoint_positive_multiple_variants(sagema

def test_is_inference_component_based_endpoint_negative_no_variants(sagemaker_session):

describe_endpoint_response = {"EndpointConfigName": "some-endpoint-config"}
describe_endpoint_response = {
"EndpointConfigName": "some-endpoint-config",
"EndpointArn": "arn:aws:sagemaker:",
}
describe_endpoint_config_response = {
"ExecutionRoleArn": "some-role-arn",
"ProductionVariants": [],
Expand All @@ -6231,7 +6265,10 @@ def test_is_inference_component_based_endpoint_negative_no_variants(sagemaker_se

def test_is_inference_component_based_endpoint_negative_model_name_present(sagemaker_session):

describe_endpoint_response = {"EndpointConfigName": "some-endpoint-config"}
describe_endpoint_response = {
"EndpointConfigName": "some-endpoint-config",
"EndpointArn": "arn:aws:sagemaker:",
}
describe_endpoint_config_response = {
"ExecutionRoleArn": "some-role-arn",
"ProductionVariants": [
Expand Down
Loading