Skip to content

Lzao/debugger rule removal #3379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sagemaker/debugger/profiler_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
framework_profile_params: Optional[FrameworkProfile] = None,
disable_profiler: Optional[FrameworkProfile] = False,
):
"""Initialize a ``ProfilerConfig`` instance.

Expand Down Expand Up @@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
self.s3_output_path = s3_output_path
self.system_monitor_interval_millis = system_monitor_interval_millis
self.framework_profile_params = framework_profile_params
self.disable_profiler = disable_profiler

def _to_request_dict(self):
"""Generate a request dictionary using the parameters provided when initializing the object.
Expand All @@ -91,6 +93,8 @@ def _to_request_dict(self):
if self.s3_output_path is not None:
profiler_config_request["S3OutputPath"] = self.s3_output_path

profiler_config_request["DisableProfiler"] = self.disable_profiler

if self.system_monitor_interval_millis is not None:
profiler_config_request[
"ProfilingIntervalInMilliseconds"
Expand Down
23 changes: 13 additions & 10 deletions src/sagemaker/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,26 +937,29 @@ def _prepare_collection_configs(self):
def _prepare_profiler_for_training(self):
"""Set necessary values and do basic validations in profiler config and profiler rules.

When user explicitly set rules to an empty list, default profiler rule won't be enabled.
Default profiler rule will be enabled in supported regions when either:
1. user doesn't specify any rules, i.e., rules=None; or
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
No default profiler rule will be used. The user needs to specify rules explicitly
"""
if self.disable_profiler:
if self.profiler_config:
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
if self.profiler_config and not self.profiler_config.disable_profiler:
raise RuntimeError(
"profiler_config.disable_profiler cannot be False"
+ " when disable_profiler is True."
)
if self.profiler_rules:
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
if self.profiler_config is None:
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
if self.rules is None or (self.rules and not self.profiler_rules):
self.profiler_rules = [get_default_profiler_rule()]
self.profiler_rules = []

if self.profiler_config and not self.profiler_config.s3_output_path:
self.profiler_config.s3_output_path = self.output_path

self.profiler_rule_configs = self._prepare_profiler_rules()
# if profiler_config is still None, it means the job has profiler disabled
if self.profiler_config is None:
self.profiler_config = ProfilerConfig(disable_profiler=True)

def _prepare_profiler_rules(self):
"""Set any necessary values in profiler rules, if they are provided."""
Expand Down Expand Up @@ -1047,7 +1050,7 @@ def latest_job_profiler_artifacts_path(self):
error_message="""Cannot get the profiling output artifacts path.
The Estimator is not associated with a training job."""
)
if self.profiler_config is not None:
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
return os.path.join(
self.profiler_config.s3_output_path,
self.latest_training_job.name,
Expand Down Expand Up @@ -1893,8 +1896,8 @@ def enable_default_profiling(self):
else:
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)

self.profiler_rules = [get_default_profiler_rule()]
self.profiler_rule_configs = self._prepare_profiler_rules()
self.profiler_rules = []
self.profiler_rule_configs = []

_TrainingJob.update(
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()
Expand Down
32 changes: 0 additions & 32 deletions tests/integ/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from __future__ import absolute_import

import os
import re
import time
import uuid

Expand All @@ -22,7 +21,6 @@
from sagemaker.debugger import (
DebuggerHookConfig,
FrameworkProfile,
get_rule_container_image_uri,
ProfilerConfig,
ProfilerRule,
Rule,
Expand Down Expand Up @@ -103,13 +101,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
)
assert job_description.get("ProfilingStatus") == "Enabled"

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

with pytest.raises(ValueError) as error:
mx.enable_default_profiling()
assert "Debugger monitoring is already enabled." in str(error)
Expand Down Expand Up @@ -160,13 +151,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
assert job_description.get("ProfilingStatus") == "Enabled"

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

mx.update_profiler(
Expand All @@ -178,13 +162,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500

profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}


def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
sagemaker_session,
Expand Down Expand Up @@ -387,13 +364,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
== updated_framework_profile.profiling_parameters
)

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}


def test_mxnet_with_disable_profiler_then_enable_default_profiling(
sagemaker_session,
Expand Down Expand Up @@ -431,12 +401,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
)

job_description = mx.latest_training_job.describe()
assert job_description.get("ProfilerConfig") is None
assert job_description.get("ProfilerRuleConfigurations") is None
assert job_description.get("ProfilingStatus") == "Disabled"

_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

mx.enable_default_profiling()

job_description = mx.latest_training_job.describe()
Expand Down
8 changes: 1 addition & 7 deletions tests/unit/sagemaker/huggingface/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,8 @@ def _create_train_job(version, base_framework_version):
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
8 changes: 1 addition & 7 deletions tests/unit/sagemaker/tensorflow/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
"metric_definitions": None,
"environment": None,
"experiment_config": None,
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,8 @@ def _create_train_job(
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,8 @@ def _create_train_job(
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
4 changes: 4 additions & 0 deletions tests/unit/sagemaker/workflow/test_step_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"dependencies": f'"{dummy_requirements}"',
Expand Down Expand Up @@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"model_archive": '"s3://my-bucket/model.tar.gz"',
Expand Down Expand Up @@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"dependencies": "null",
"inference_script": '"dummy_script.py"',
Expand Down Expand Up @@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
},
"ProfilerConfig": {"DisableProfiler": True},
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
"ResourceConfig": {
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/sagemaker/workflow/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ def test_training_step_base_estimator(sagemaker_session):
"CollectionConfigurations": [],
},
"ProfilerConfig": {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 500,
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
},
Expand Down Expand Up @@ -483,7 +484,7 @@ def test_training_step_tensorflow(sagemaker_session):
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
},
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
},
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
}
Expand Down
28 changes: 0 additions & 28 deletions tests/unit/sagemaker/workflow/test_training_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,6 @@ def test_training_step_with_estimator(
}
step_definition = json.loads(pipeline.definition())["Steps"][0]

# delete profiler rule configurations because of timestamp collision
del step_definition["Arguments"]["ProfilerRuleConfigurations"]
del expected_step_arguments["ProfilerRuleConfigurations"]

assert step_definition == {
"Name": "MyTrainingStep",
"Description": "TrainingStep description",
Expand Down Expand Up @@ -427,18 +423,6 @@ def test_training_step_with_framework_estimator(
del step_args["OutputDataConfig"]["S3OutputPath"]
del step_def["Arguments"]["OutputDataConfig"]["S3OutputPath"]

# trim timestamp so RuleConfigurationName will match
rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
step_args["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
] = rule_config_name_step_args[:-11]
rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
]
step_def["Arguments"]["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
] = rule_config_name_step_def[:-11]

if "sagemaker_s3_output" in step_args["HyperParameters"]:
del step_args["HyperParameters"]["sagemaker_s3_output"]
del step_def["Arguments"]["HyperParameters"]["sagemaker_s3_output"]
Expand Down Expand Up @@ -519,18 +503,6 @@ def test_training_step_with_algorithm_base(algo_estimator, training_input, pipel
del step_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
del step_def["Arguments"]["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]

# trim timestamp so RuleConfigurationName will match
rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
step_args["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
] = rule_config_name_step_args[:-11]
rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
]
step_def["Arguments"]["ProfilerRuleConfigurations"][0][
"RuleConfigurationName"
] = rule_config_name_step_def[:-11]

assert step_def == {
"Name": "MyTrainingStep",
"Type": "Training",
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/sagemaker/workflow/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def test_repack_model_step(estimator):
}
],
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
"ProfilerConfig": {"DisableProfiler": True},
"ResourceConfig": {
"InstanceCount": 1,
"InstanceType": "ml.m5.large",
Expand Down Expand Up @@ -238,6 +239,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
}
],
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
"ProfilerConfig": {"DisableProfiler": True},
"ResourceConfig": {
"InstanceCount": 1,
"InstanceType": "ml.m5.large",
Expand Down
8 changes: 1 addition & 7 deletions tests/unit/test_chainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,8 @@ def _create_train_job(version, py_version):
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Loading