Skip to content

Commit 70c3ca4

Browse files
author
Liu
committed
feature: disable default profiler rules
1 parent 9c445f9 commit 70c3ca4

17 files changed: 25 additions & 208 deletions

src/sagemaker/estimator.py

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -937,32 +937,29 @@ def _prepare_collection_configs(self):
937937
def _prepare_profiler_for_training(self):
938938
"""Set necessary values and do basic validations in profiler config and profiler rules.
939939
940-
When user explicitly set rules to an empty list, default profiler rule won't be enabled.
941-
Default profiler rule will be enabled in supported regions when either:
942-
1. user doesn't specify any rules, i.e., rules=None; or
943-
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
940+
No default profiler rule will be used. The user needs to specify rules explicitly
944941
"""
945942
if self.disable_profiler:
946-
if self.profiler_config and self.profiler_config.disable_profiler == False:
947-
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
943+
if self.profiler_config and not self.profiler_config.disable_profiler:
944+
raise RuntimeError(
945+
"profiler_config.disable_profiler cannot be False"
946+
+ " when disable_profiler is True."
947+
)
948948
if self.profiler_rules:
949949
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
950950
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
951951
if self.profiler_config is None:
952952
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
953953
if self.rules is None or (self.rules and not self.profiler_rules):
954-
self.profiler_rules = [get_default_profiler_rule()]
954+
self.profiler_rules = []
955955

956956
if self.profiler_config and not self.profiler_config.s3_output_path:
957957
self.profiler_config.s3_output_path = self.output_path
958958

959959
self.profiler_rule_configs = self._prepare_profiler_rules()
960960
# if profiler_config is still None, it means the job has profiler disabled
961961
if self.profiler_config is None:
962-
# self.profiler_config = ProfilerConfig(disable_profiler=True)
963-
self.profiler_config = ProfilerConfig(
964-
s3_output_path=self.output_path, disable_profiler=True
965-
)
962+
self.profiler_config = ProfilerConfig(disable_profiler=True)
966963

967964
def _prepare_profiler_rules(self):
968965
"""Set any necessary values in profiler rules, if they are provided."""
@@ -1053,7 +1050,7 @@ def latest_job_profiler_artifacts_path(self):
10531050
error_message="""Cannot get the profiling output artifacts path.
10541051
The Estimator is not associated with a training job."""
10551052
)
1056-
if self.profiler_config is not None and self.profiler_config.disable_profiler == False:
1053+
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
10571054
return os.path.join(
10581055
self.profiler_config.s3_output_path,
10591056
self.latest_training_job.name,
@@ -1899,8 +1896,8 @@ def enable_default_profiling(self):
18991896
else:
19001897
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
19011898

1902-
self.profiler_rules = [get_default_profiler_rule()]
1903-
self.profiler_rule_configs = self._prepare_profiler_rules()
1899+
self.profiler_rules = []
1900+
self.profiler_rule_configs = []
19041901

19051902
_TrainingJob.update(
19061903
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()

tests/integ/test_profiler.py

Lines changed: 2 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import os
16-
import re
1716
import time
1817
import uuid
1918

@@ -22,7 +21,6 @@
2221
from sagemaker.debugger import (
2322
DebuggerHookConfig,
2423
FrameworkProfile,
25-
get_rule_container_image_uri,
2624
ProfilerConfig,
2725
ProfilerRule,
2826
Rule,
@@ -93,8 +91,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
9391
)
9492

9593
job_description = mx.latest_training_job.describe()
96-
# Temporarily added until the service package changes are updated
97-
job_description["ProfilerConfig"]["DisableProfiler"] = False
9894
assert (
9995
job_description["ProfilerConfig"]
10096
== ProfilerConfig(
@@ -103,13 +99,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
10399
)
104100
assert job_description.get("ProfilingStatus") == "Enabled"
105101

106-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
107-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
108-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
109-
mx.sagemaker_session.boto_region_name
110-
)
111-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
112-
113102
with pytest.raises(ValueError) as error:
114103
mx.enable_default_profiling()
115104
assert "Debugger monitoring is already enabled." in str(error)
@@ -155,18 +144,9 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
155144
)
156145

157146
job_description = mx.latest_training_job.describe()
158-
# Temporarily added until the service package changes are updated
159-
job_description["ProfilerConfig"]["DisableProfiler"] = False
160147
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
161148
assert job_description.get("ProfilingStatus") == "Enabled"
162149

163-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
164-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
165-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
166-
mx.sagemaker_session.boto_region_name
167-
)
168-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
169-
170150
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
171151

172152
mx.update_profiler(
@@ -178,13 +158,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
178158
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
179159
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
180160

181-
profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
182-
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
183-
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
184-
mx.sagemaker_session.boto_region_name
185-
)
186-
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
187-
188161

189162
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
190163
sagemaker_session,
@@ -225,8 +198,6 @@ def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
225198
)
226199

227200
job_description = mx.latest_training_job.describe()
228-
# Temporarily added until the service package changes are updated
229-
job_description["ProfilerConfig"]["DisableProfiler"] = False
230201
assert job_description.get("ProfilingStatus") == "Enabled"
231202
assert (
232203
job_description.get("ProfilerConfig")
@@ -298,8 +269,6 @@ def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
298269
)
299270

300271
job_description = mx.latest_training_job.describe()
301-
# Temporarily added until the service package changes are updated
302-
job_description["ProfilerConfig"]["DisableProfiler"] = False
303272
assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
304273
assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
305274
assert job_description.get("ProfilingStatus") == "Enabled"
@@ -387,13 +356,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
387356
== updated_framework_profile.profiling_parameters
388357
)
389358

390-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
391-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
392-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
393-
mx.sagemaker_session.boto_region_name
394-
)
395-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
396-
397359

398360
def test_mxnet_with_disable_profiler_then_enable_default_profiling(
399361
sagemaker_session,
@@ -431,15 +393,11 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
431393
)
432394

433395
job_description = mx.latest_training_job.describe()
434-
# when the profiler is disabled, ProfilerConfig is not None. Temporarily remove this check until the service packages are updated.
435-
# assert job_description.get("ProfilerConfig") is None
436396
assert job_description.get("ProfilerRuleConfigurations") is None
437-
# Temporarily remove this check until the service packages are updated.
438-
# assert job_description.get("ProfilingStatus") == "Disabled"
397+
assert job_description.get("ProfilingStatus") == "Disabled"
439398

440399
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
441-
# profilingStatus is currently wrong, temporarily remove this check until the service packages are updated.
442-
# mx.enable_default_profiling()
400+
mx.enable_default_profiling()
443401

444402
job_description = mx.latest_training_job.describe()
445403
assert job_description["ProfilerConfig"]["S3OutputPath"] == mx.output_path

tests/unit/sagemaker/huggingface/test_estimator.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -142,13 +142,6 @@ def _create_train_job(version, base_framework_version):
142142
"CollectionConfigurations": [],
143143
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
144144
},
145-
"profiler_rule_configs": [
146-
{
147-
"RuleConfigurationName": "ProfilerReport-1510006209",
148-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
149-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
150-
}
151-
],
152145
"profiler_config": {
153146
"DisableProfiler": False,
154147
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

tests/unit/sagemaker/tensorflow/test_estimator.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -135,13 +135,6 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
135135
"metric_definitions": None,
136136
"environment": None,
137137
"experiment_config": None,
138-
"profiler_rule_configs": [
139-
{
140-
"RuleConfigurationName": "ProfilerReport-1510006209",
141-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
142-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
143-
}
144-
],
145138
"profiler_config": {
146139
"DisableProfiler": False,
147140
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -144,13 +144,6 @@ def _create_train_job(
144144
"CollectionConfigurations": [],
145145
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
146146
},
147-
"profiler_rule_configs": [
148-
{
149-
"RuleConfigurationName": "ProfilerReport-1510006209",
150-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
151-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
152-
}
153-
],
154147
"profiler_config": {
155148
"DisableProfiler": False,
156149
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -142,13 +142,6 @@ def _create_train_job(
142142
"CollectionConfigurations": [],
143143
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
144144
},
145-
"profiler_rule_configs": [
146-
{
147-
"RuleConfigurationName": "ProfilerReport-1510006209",
148-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
149-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
150-
}
151-
],
152145
"profiler_config": {
153146
"DisableProfiler": False,
154147
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -144,13 +144,6 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
144144
"CollectionConfigurations": [],
145145
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
146146
},
147-
"profiler_rule_configs": [
148-
{
149-
"RuleConfigurationName": "ProfilerReport-1510006209",
150-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
151-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
152-
}
153-
],
154147
"profiler_config": {
155148
"DisableProfiler": False,
156149
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

tests/unit/sagemaker/workflow/test_step_collections.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -796,7 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
796796
"CollectionConfigurations": [],
797797
"S3OutputPath": f"s3://{BUCKET}/",
798798
},
799-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
799+
"ProfilerConfig": {"DisableProfiler": True},
800800
"HyperParameters": {
801801
"inference_script": '"dummy_script.py"',
802802
"dependencies": f'"{dummy_requirements}"',
@@ -924,7 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
924924
"CollectionConfigurations": [],
925925
"S3OutputPath": f"s3://{BUCKET}/",
926926
},
927-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
927+
"ProfilerConfig": {"DisableProfiler": True},
928928
"HyperParameters": {
929929
"inference_script": '"dummy_script.py"',
930930
"model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1054,7 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
10541054
"CollectionConfigurations": [],
10551055
"S3OutputPath": f"s3://{BUCKET}/",
10561056
},
1057-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
1057+
"ProfilerConfig": {"DisableProfiler": True},
10581058
"HyperParameters": {
10591059
"dependencies": "null",
10601060
"inference_script": '"dummy_script.py"',
@@ -1246,7 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
12461246
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
12471247
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
12481248
},
1249-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
1249+
"ProfilerConfig": {"DisableProfiler": True},
12501250
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
12511251
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
12521252
"ResourceConfig": {

tests/unit/sagemaker/workflow/test_training_step.py

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -307,10 +307,6 @@ def test_training_step_with_estimator(
307307
}
308308
step_definition = json.loads(pipeline.definition())["Steps"][0]
309309

310-
# delete profiler rule configurations because of timestamp collision
311-
del step_definition["Arguments"]["ProfilerRuleConfigurations"]
312-
del expected_step_arguments["ProfilerRuleConfigurations"]
313-
314310
assert step_definition == {
315311
"Name": "MyTrainingStep",
316312
"Description": "TrainingStep description",
@@ -427,18 +423,6 @@ def test_training_step_with_framework_estimator(
427423
del step_args["OutputDataConfig"]["S3OutputPath"]
428424
del step_def["Arguments"]["OutputDataConfig"]["S3OutputPath"]
429425

430-
# trim timestamp so RuleConfigurationName will match
431-
rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
432-
step_args["ProfilerRuleConfigurations"][0][
433-
"RuleConfigurationName"
434-
] = rule_config_name_step_args[:-11]
435-
rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
436-
"RuleConfigurationName"
437-
]
438-
step_def["Arguments"]["ProfilerRuleConfigurations"][0][
439-
"RuleConfigurationName"
440-
] = rule_config_name_step_def[:-11]
441-
442426
if "sagemaker_s3_output" in step_args["HyperParameters"]:
443427
del step_args["HyperParameters"]["sagemaker_s3_output"]
444428
del step_def["Arguments"]["HyperParameters"]["sagemaker_s3_output"]
@@ -519,18 +503,6 @@ def test_training_step_with_algorithm_base(algo_estimator, training_input, pipel
519503
del step_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
520504
del step_def["Arguments"]["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
521505

522-
# trim timestamp so RuleConfigurationName will match
523-
rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
524-
step_args["ProfilerRuleConfigurations"][0][
525-
"RuleConfigurationName"
526-
] = rule_config_name_step_args[:-11]
527-
rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
528-
"RuleConfigurationName"
529-
]
530-
step_def["Arguments"]["ProfilerRuleConfigurations"][0][
531-
"RuleConfigurationName"
532-
] = rule_config_name_step_def[:-11]
533-
534506
assert step_def == {
535507
"Name": "MyTrainingStep",
536508
"Type": "Training",

tests/unit/sagemaker/workflow/test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -157,7 +157,7 @@ def test_repack_model_step(estimator):
157157
}
158158
],
159159
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
160-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
160+
"ProfilerConfig": {"DisableProfiler": True},
161161
"ResourceConfig": {
162162
"InstanceCount": 1,
163163
"InstanceType": "ml.m5.large",
@@ -239,7 +239,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
239239
}
240240
],
241241
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
242-
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
242+
"ProfilerConfig": {"DisableProfiler": True},
243243
"ResourceConfig": {
244244
"InstanceCount": 1,
245245
"InstanceType": "ml.m5.large",

tests/unit/test_chainer.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -150,13 +150,6 @@ def _create_train_job(version, py_version):
150150
"CollectionConfigurations": [],
151151
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
152152
},
153-
"profiler_rule_configs": [
154-
{
155-
"RuleConfigurationName": "ProfilerReport-1510006209",
156-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
157-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
158-
}
159-
],
160153
"profiler_config": {
161154
"DisableProfiler": False,
162155
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

0 commit comments

Comments
 (0)