Skip to content

Commit 92358fe

Browse files
mariumof and Marius Moisescu
authored and committed
feature: Add disable_profiler field in config and propagate changes (#3523)
Co-authored-by: Marius Moisescu <[email protected]>
1 parent 2173d3d commit 92358fe

21 files changed

+211
-243
lines changed

src/sagemaker/debugger/profiler_config.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ def __init__(
3232
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
3333
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
3434
framework_profile_params: Optional[FrameworkProfile] = None,
35+
disable_profiler: Optional[Union[str, PipelineVariable]] = False,
3536
):
3637
"""Initialize a ``ProfilerConfig`` instance.
3738
@@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
7879
self.s3_output_path = s3_output_path
7980
self.system_monitor_interval_millis = system_monitor_interval_millis
8081
self.framework_profile_params = framework_profile_params
82+
self.disable_profiler = disable_profiler
8183

8284
def _to_request_dict(self):
8385
"""Generate a request dictionary using the parameters provided when initializing the object.
@@ -91,6 +93,8 @@ def _to_request_dict(self):
9193
if self.s3_output_path is not None:
9294
profiler_config_request["S3OutputPath"] = self.s3_output_path
9395

96+
profiler_config_request["DisableProfiler"] = self.disable_profiler
97+
9498
if self.system_monitor_interval_millis is not None:
9599
profiler_config_request[
96100
"ProfilingIntervalInMilliseconds"

src/sagemaker/estimator.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -938,26 +938,29 @@ def _prepare_collection_configs(self):
938938
def _prepare_profiler_for_training(self):
939939
"""Set necessary values and do basic validations in profiler config and profiler rules.
940940
941-
When user explicitly set rules to an empty list, default profiler rule won't be enabled.
942-
Default profiler rule will be enabled in supported regions when either:
943-
1. user doesn't specify any rules, i.e., rules=None; or
944-
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
941+
No default profiler rule will be used. The user needs to specify rules explicitly
945942
"""
946943
if self.disable_profiler:
947-
if self.profiler_config:
948-
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
944+
if self.profiler_config and not self.profiler_config.disable_profiler:
945+
raise RuntimeError(
946+
"profiler_config.disable_profiler cannot be False"
947+
+ " when disable_profiler is True."
948+
)
949949
if self.profiler_rules:
950950
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
951951
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
952952
if self.profiler_config is None:
953953
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
954954
if self.rules is None or (self.rules and not self.profiler_rules):
955-
self.profiler_rules = [get_default_profiler_rule()]
955+
self.profiler_rules = []
956956

957957
if self.profiler_config and not self.profiler_config.s3_output_path:
958958
self.profiler_config.s3_output_path = self.output_path
959959

960960
self.profiler_rule_configs = self._prepare_profiler_rules()
961+
# if profiler_config is still None, it means the job has profiler disabled
962+
if self.profiler_config is None:
963+
self.profiler_config = ProfilerConfig(disable_profiler=True)
961964

962965
def _prepare_profiler_rules(self):
963966
"""Set any necessary values in profiler rules, if they are provided."""
@@ -1048,7 +1051,7 @@ def latest_job_profiler_artifacts_path(self):
10481051
error_message="""Cannot get the profiling output artifacts path.
10491052
The Estimator is not associated with a training job."""
10501053
)
1051-
if self.profiler_config is not None:
1054+
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
10521055
return os.path.join(
10531056
self.profiler_config.s3_output_path,
10541057
self.latest_training_job.name,
@@ -1895,8 +1898,8 @@ def enable_default_profiling(self):
18951898
else:
18961899
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
18971900

1898-
self.profiler_rules = [get_default_profiler_rule()]
1899-
self.profiler_rule_configs = self._prepare_profiler_rules()
1901+
self.profiler_rules = []
1902+
self.profiler_rule_configs = []
19001903

19011904
_TrainingJob.update(
19021905
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()

tests/integ/sagemaker/workflow/test_workflow.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1269,8 +1269,6 @@ def test_caching_behavior(
12691269
# create pipeline
12701270
pipeline.create(role)
12711271
definition = json.loads(pipeline.definition())
1272-
# delete profiler config for assertions as it will contain a timestamp
1273-
del definition["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]
12741272

12751273
# verify input path
12761274
expected_abalone_input_path = f"{pipeline_name}/{step_process.name}" f"/input/abalone_data"
@@ -1295,7 +1293,6 @@ def test_caching_behavior(
12951293

12961294
# verify no changes
12971295
definition2 = json.loads(pipeline.definition())
1298-
del definition2["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]
12991296
assert definition == definition2
13001297

13011298
# add dummy file to source_dir
@@ -1306,7 +1303,6 @@ def test_caching_behavior(
13061303

13071304
# verify changes
13081305
definition3 = json.loads(pipeline.definition())
1309-
del definition3["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]
13101306
assert definition != definition3
13111307

13121308
finally:

tests/integ/test_profiler.py

Lines changed: 0 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import os
16-
import re
1716
import time
1817
import uuid
1918

@@ -22,7 +21,6 @@
2221
from sagemaker.debugger import (
2322
DebuggerHookConfig,
2423
FrameworkProfile,
25-
get_rule_container_image_uri,
2624
ProfilerConfig,
2725
ProfilerRule,
2826
Rule,
@@ -93,8 +91,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
9391
)
9492

9593
job_description = mx.latest_training_job.describe()
96-
if "DisableProfiler" in job_description["ProfilerConfig"]:
97-
job_description["ProfilerConfig"].pop("DisableProfiler")
9894
assert (
9995
job_description["ProfilerConfig"]
10096
== ProfilerConfig(
@@ -103,13 +99,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
10399
)
104100
assert job_description.get("ProfilingStatus") == "Enabled"
105101

106-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
107-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
108-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
109-
mx.sagemaker_session.boto_region_name
110-
)
111-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
112-
113102
with pytest.raises(ValueError) as error:
114103
mx.enable_default_profiling()
115104
assert "Debugger monitoring is already enabled." in str(error)
@@ -155,18 +144,9 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
155144
)
156145

157146
job_description = mx.latest_training_job.describe()
158-
if "DisableProfiler" in job_description["ProfilerConfig"]:
159-
job_description["ProfilerConfig"].pop("DisableProfiler")
160147
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
161148
assert job_description.get("ProfilingStatus") == "Enabled"
162149

163-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
164-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
165-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
166-
mx.sagemaker_session.boto_region_name
167-
)
168-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
169-
170150
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
171151

172152
mx.update_profiler(
@@ -178,13 +158,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
178158
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
179159
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
180160

181-
profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
182-
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
183-
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
184-
mx.sagemaker_session.boto_region_name
185-
)
186-
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
187-
188161

189162
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
190163
sagemaker_session,
@@ -225,8 +198,6 @@ def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
225198
)
226199

227200
job_description = mx.latest_training_job.describe()
228-
if "DisableProfiler" in job_description["ProfilerConfig"]:
229-
job_description["ProfilerConfig"].pop("DisableProfiler")
230201
assert job_description.get("ProfilingStatus") == "Enabled"
231202
assert (
232203
job_description.get("ProfilerConfig")
@@ -298,8 +269,6 @@ def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
298269
)
299270

300271
job_description = mx.latest_training_job.describe()
301-
if "DisableProfiler" in job_description["ProfilerConfig"]:
302-
job_description["ProfilerConfig"].pop("DisableProfiler")
303272
assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
304273
assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
305274
assert job_description.get("ProfilingStatus") == "Enabled"
@@ -387,13 +356,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
387356
== updated_framework_profile.profiling_parameters
388357
)
389358

390-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
391-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
392-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
393-
mx.sagemaker_session.boto_region_name
394-
)
395-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
396-
397359

398360
def test_mxnet_with_disable_profiler_then_enable_default_profiling(
399361
sagemaker_session,
@@ -431,12 +393,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
431393
)
432394

433395
job_description = mx.latest_training_job.describe()
434-
assert job_description.get("ProfilerConfig") is None
435396
assert job_description.get("ProfilerRuleConfigurations") is None
436397
assert job_description.get("ProfilingStatus") == "Disabled"
437398

438399
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
439-
440400
mx.enable_default_profiling()
441401

442402
job_description = mx.latest_training_job.describe()

tests/unit/sagemaker/huggingface/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -143,14 +143,8 @@ def _create_train_job(version, base_framework_version):
143143
"CollectionConfigurations": [],
144144
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
145145
},
146-
"profiler_rule_configs": [
147-
{
148-
"RuleConfigurationName": "ProfilerReport-1510006209",
149-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
150-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
151-
}
152-
],
153146
"profiler_config": {
147+
"DisableProfiler": False,
154148
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
155149
},
156150
}

tests/unit/sagemaker/tensorflow/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -136,14 +136,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
136136
"metric_definitions": None,
137137
"environment": None,
138138
"experiment_config": None,
139-
"profiler_rule_configs": [
140-
{
141-
"RuleConfigurationName": "ProfilerReport-1510006209",
142-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
143-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
144-
}
145-
],
146139
"profiler_config": {
140+
"DisableProfiler": False,
147141
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
148142
},
149143
}

tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -145,14 +145,8 @@ def _create_train_job(
145145
"CollectionConfigurations": [],
146146
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
147147
},
148-
"profiler_rule_configs": [
149-
{
150-
"RuleConfigurationName": "ProfilerReport-1510006209",
151-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
152-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
153-
}
154-
],
155148
"profiler_config": {
149+
"DisableProfiler": False,
156150
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
157151
},
158152
}

tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -143,14 +143,8 @@ def _create_train_job(
143143
"CollectionConfigurations": [],
144144
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
145145
},
146-
"profiler_rule_configs": [
147-
{
148-
"RuleConfigurationName": "ProfilerReport-1510006209",
149-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
150-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
151-
}
152-
],
153146
"profiler_config": {
147+
"DisableProfiler": False,
154148
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
155149
},
156150
}

tests/unit/sagemaker/training_compiler/test_pytorch_compiler.py

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -137,14 +137,10 @@ def _create_train_job(version, instance_type, training_compiler_config, instance
137137
"CollectionConfigurations": [],
138138
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
139139
},
140-
"profiler_rule_configs": [
141-
{
142-
"RuleConfigurationName": "ProfilerReport-1510006209",
143-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
144-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
145-
}
146-
],
147-
"profiler_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)},
140+
"profiler_config": {
141+
"DisableProfiler": False,
142+
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
143+
},
148144
}
149145

150146

tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -145,14 +145,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
145145
"CollectionConfigurations": [],
146146
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
147147
},
148-
"profiler_rule_configs": [
149-
{
150-
"RuleConfigurationName": "ProfilerReport-1510006209",
151-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
152-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
153-
}
154-
],
155148
"profiler_config": {
149+
"DisableProfiler": False,
156150
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
157151
},
158152
}

tests/unit/sagemaker/workflow/test_step_collections.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
796796
"CollectionConfigurations": [],
797797
"S3OutputPath": f"s3://{BUCKET}/",
798798
},
799+
"ProfilerConfig": {"DisableProfiler": True},
799800
"HyperParameters": {
800801
"inference_script": '"dummy_script.py"',
801802
"dependencies": f'"{dummy_requirements}"',
@@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
923924
"CollectionConfigurations": [],
924925
"S3OutputPath": f"s3://{BUCKET}/",
925926
},
927+
"ProfilerConfig": {"DisableProfiler": True},
926928
"HyperParameters": {
927929
"inference_script": '"dummy_script.py"',
928930
"model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
10521054
"CollectionConfigurations": [],
10531055
"S3OutputPath": f"s3://{BUCKET}/",
10541056
},
1057+
"ProfilerConfig": {"DisableProfiler": True},
10551058
"HyperParameters": {
10561059
"dependencies": "null",
10571060
"inference_script": '"dummy_script.py"',
@@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
12431246
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
12441247
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
12451248
},
1249+
"ProfilerConfig": {"DisableProfiler": True},
12461250
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
12471251
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
12481252
"ResourceConfig": {

tests/unit/sagemaker/workflow/test_steps.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -329,6 +329,7 @@ def test_training_step_base_estimator(sagemaker_session):
329329
"CollectionConfigurations": [],
330330
},
331331
"ProfilerConfig": {
332+
"DisableProfiler": False,
332333
"ProfilingIntervalInMilliseconds": 500,
333334
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
334335
},
@@ -438,7 +439,7 @@ def test_training_step_tensorflow(sagemaker_session):
438439
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
439440
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
440441
},
441-
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
442+
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
442443
},
443444
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
444445
}

0 commit comments

Comments
 (0)