Skip to content

Commit e236c0d

Browse files
author
Marius Moisescu
committed
Adapted zaoliu's changes
1 parent 1cbfc83 commit e236c0d

18 files changed

+60
-161
lines changed

src/sagemaker/debugger/profiler_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def __init__(
3232
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
3333
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
3434
framework_profile_params: Optional[FrameworkProfile] = None,
35+
disable_profiler: Optional[FrameworkProfile] = False,
3536
):
3637
"""Initialize a ``ProfilerConfig`` instance.
3738
@@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
7879
self.s3_output_path = s3_output_path
7980
self.system_monitor_interval_millis = system_monitor_interval_millis
8081
self.framework_profile_params = framework_profile_params
82+
self.disable_profiler = disable_profiler
8183

8284
def _to_request_dict(self):
8385
"""Generate a request dictionary using the parameters provided when initializing the object.
@@ -91,6 +93,8 @@ def _to_request_dict(self):
9193
if self.s3_output_path is not None:
9294
profiler_config_request["S3OutputPath"] = self.s3_output_path
9395

96+
profiler_config_request["DisableProfiler"] = self.disable_profiler
97+
9498
if self.system_monitor_interval_millis is not None:
9599
profiler_config_request[
96100
"ProfilingIntervalInMilliseconds"

src/sagemaker/estimator.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -938,26 +938,29 @@ def _prepare_collection_configs(self):
938938
def _prepare_profiler_for_training(self):
939939
"""Set necessary values and do basic validations in profiler config and profiler rules.
940940
941-
When user explicitly set rules to an empty list, default profiler rule won't be enabled.
942-
Default profiler rule will be enabled in supported regions when either:
943-
1. user doesn't specify any rules, i.e., rules=None; or
944-
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
941+
No default profiler rule will be used. The user needs to specify rules explicitly.
945942
"""
946943
if self.disable_profiler:
947-
if self.profiler_config:
948-
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
944+
if self.profiler_config and not self.profiler_config.disable_profiler:
945+
raise RuntimeError(
946+
"profiler_config.disable_profiler cannot be False"
947+
+ " when disable_profiler is True."
948+
)
949949
if self.profiler_rules:
950950
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
951951
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
952952
if self.profiler_config is None:
953953
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
954954
if self.rules is None or (self.rules and not self.profiler_rules):
955-
self.profiler_rules = [get_default_profiler_rule()]
955+
self.profiler_rules = []
956956

957957
if self.profiler_config and not self.profiler_config.s3_output_path:
958958
self.profiler_config.s3_output_path = self.output_path
959959

960960
self.profiler_rule_configs = self._prepare_profiler_rules()
961+
# If profiler_config is still None at this point, the profiler is disabled for this job, so set an explicit config with disable_profiler=True.
962+
if self.profiler_config is None:
963+
self.profiler_config = ProfilerConfig(disable_profiler=True)
961964

962965
def _prepare_profiler_rules(self):
963966
"""Set any necessary values in profiler rules, if they are provided."""
@@ -1048,7 +1051,7 @@ def latest_job_profiler_artifacts_path(self):
10481051
error_message="""Cannot get the profiling output artifacts path.
10491052
The Estimator is not associated with a training job."""
10501053
)
1051-
if self.profiler_config is not None:
1054+
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
10521055
return os.path.join(
10531056
self.profiler_config.s3_output_path,
10541057
self.latest_training_job.name,
@@ -1895,8 +1898,8 @@ def enable_default_profiling(self):
18951898
else:
18961899
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
18971900

1898-
self.profiler_rules = [get_default_profiler_rule()]
1899-
self.profiler_rule_configs = self._prepare_profiler_rules()
1901+
self.profiler_rules = []
1902+
self.profiler_rule_configs = []
19001903

19011904
_TrainingJob.update(
19021905
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()

tests/integ/test_profiler.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import os
16-
import re
1716
import time
1817
import uuid
1918

@@ -22,7 +21,6 @@
2221
from sagemaker.debugger import (
2322
DebuggerHookConfig,
2423
FrameworkProfile,
25-
get_rule_container_image_uri,
2624
ProfilerConfig,
2725
ProfilerRule,
2826
Rule,
@@ -103,13 +101,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
103101
)
104102
assert job_description.get("ProfilingStatus") == "Enabled"
105103

106-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
107-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
108-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
109-
mx.sagemaker_session.boto_region_name
110-
)
111-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
112-
113104
with pytest.raises(ValueError) as error:
114105
mx.enable_default_profiling()
115106
assert "Debugger monitoring is already enabled." in str(error)
@@ -160,13 +151,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
160151
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
161152
assert job_description.get("ProfilingStatus") == "Enabled"
162153

163-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
164-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
165-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
166-
mx.sagemaker_session.boto_region_name
167-
)
168-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
169-
170154
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
171155

172156
mx.update_profiler(
@@ -178,13 +162,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
178162
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
179163
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
180164

181-
profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
182-
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
183-
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
184-
mx.sagemaker_session.boto_region_name
185-
)
186-
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
187-
188165

189166
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
190167
sagemaker_session,
@@ -387,13 +364,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
387364
== updated_framework_profile.profiling_parameters
388365
)
389366

390-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
391-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
392-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
393-
mx.sagemaker_session.boto_region_name
394-
)
395-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
396-
397367

398368
def test_mxnet_with_disable_profiler_then_enable_default_profiling(
399369
sagemaker_session,
@@ -431,12 +401,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
431401
)
432402

433403
job_description = mx.latest_training_job.describe()
434-
assert job_description.get("ProfilerConfig") is None
435404
assert job_description.get("ProfilerRuleConfigurations") is None
436405
assert job_description.get("ProfilingStatus") == "Disabled"
437406

438407
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
439-
440408
mx.enable_default_profiling()
441409

442410
job_description = mx.latest_training_job.describe()

tests/unit/sagemaker/huggingface/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,8 @@ def _create_train_job(version, base_framework_version):
143143
"CollectionConfigurations": [],
144144
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
145145
},
146-
"profiler_rule_configs": [
147-
{
148-
"RuleConfigurationName": "ProfilerReport-1510006209",
149-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
150-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
151-
}
152-
],
153146
"profiler_config": {
147+
"DisableProfiler": False,
154148
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
155149
},
156150
}

tests/unit/sagemaker/tensorflow/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -136,14 +136,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
136136
"metric_definitions": None,
137137
"environment": None,
138138
"experiment_config": None,
139-
"profiler_rule_configs": [
140-
{
141-
"RuleConfigurationName": "ProfilerReport-1510006209",
142-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
143-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
144-
}
145-
],
146139
"profiler_config": {
140+
"DisableProfiler": False,
147141
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
148142
},
149143
}

tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,8 @@ def _create_train_job(
145145
"CollectionConfigurations": [],
146146
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
147147
},
148-
"profiler_rule_configs": [
149-
{
150-
"RuleConfigurationName": "ProfilerReport-1510006209",
151-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
152-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
153-
}
154-
],
155148
"profiler_config": {
149+
"DisableProfiler": False,
156150
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
157151
},
158152
}

tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
145145
"CollectionConfigurations": [],
146146
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
147147
},
148-
"profiler_rule_configs": [
149-
{
150-
"RuleConfigurationName": "ProfilerReport-1510006209",
151-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
152-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
153-
}
154-
],
155148
"profiler_config": {
149+
"DisableProfiler": False,
156150
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
157151
},
158152
}

tests/unit/sagemaker/workflow/test_step_collections.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
796796
"CollectionConfigurations": [],
797797
"S3OutputPath": f"s3://{BUCKET}/",
798798
},
799+
"ProfilerConfig": {"DisableProfiler": True},
799800
"HyperParameters": {
800801
"inference_script": '"dummy_script.py"',
801802
"dependencies": f'"{dummy_requirements}"',
@@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
923924
"CollectionConfigurations": [],
924925
"S3OutputPath": f"s3://{BUCKET}/",
925926
},
927+
"ProfilerConfig": {"DisableProfiler": True},
926928
"HyperParameters": {
927929
"inference_script": '"dummy_script.py"',
928930
"model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
10521054
"CollectionConfigurations": [],
10531055
"S3OutputPath": f"s3://{BUCKET}/",
10541056
},
1057+
"ProfilerConfig": {"DisableProfiler": True},
10551058
"HyperParameters": {
10561059
"dependencies": "null",
10571060
"inference_script": '"dummy_script.py"',
@@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
12431246
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
12441247
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
12451248
},
1249+
"ProfilerConfig": {"DisableProfiler": True},
12461250
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
12471251
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
12481252
"ResourceConfig": {

tests/unit/sagemaker/workflow/test_steps.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def test_training_step_base_estimator(sagemaker_session):
329329
"CollectionConfigurations": [],
330330
},
331331
"ProfilerConfig": {
332+
"DisableProfiler": False,
332333
"ProfilingIntervalInMilliseconds": 500,
333334
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
334335
},
@@ -438,7 +439,7 @@ def test_training_step_tensorflow(sagemaker_session):
438439
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
439440
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
440441
},
441-
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
442+
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
442443
},
443444
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
444445
}
@@ -873,7 +874,7 @@ def test_create_model_step_with_model_pipeline(tfo, time, sagemaker_session):
873874
},
874875
{
875876
"Environment": {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"},
876-
"Image": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sparkml-serving:3.3",
877+
"Image": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sparkml-serving:2.4",
877878
"ModelDataUrl": "s3://bucket/model_2.tar.gz",
878879
},
879880
],

tests/unit/sagemaker/workflow/test_training_step.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -401,10 +401,6 @@ def test_training_step_with_estimator(
401401
}
402402
step_definition = json.loads(pipeline.definition())["Steps"][0]
403403

404-
# delete profiler rule configurations because of timestamp collision
405-
del step_definition["Arguments"]["ProfilerRuleConfigurations"]
406-
del step_args["ProfilerRuleConfigurations"]
407-
408404
assert step_definition == {
409405
"Name": "MyTrainingStep",
410406
"Description": "TrainingStep description",

tests/unit/sagemaker/workflow/test_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def test_repack_model_step(estimator):
107107
}
108108
],
109109
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
110+
"ProfilerConfig": {"DisableProfiler": True},
110111
"ResourceConfig": {
111112
"InstanceCount": 1,
112113
"InstanceType": "ml.m5.large",
@@ -188,6 +189,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
188189
}
189190
],
190191
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
192+
"ProfilerConfig": {"DisableProfiler": True},
191193
"ResourceConfig": {
192194
"InstanceCount": 1,
193195
"InstanceType": "ml.m5.large",

tests/unit/test_chainer.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,8 @@ def _create_train_job(version, py_version):
150150
"CollectionConfigurations": [],
151151
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
152152
},
153-
"profiler_rule_configs": [
154-
{
155-
"RuleConfigurationName": "ProfilerReport-1510006209",
156-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
157-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
158-
}
159-
],
160153
"profiler_config": {
154+
"DisableProfiler": False,
161155
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
162156
},
163157
}

0 commit comments

Comments
 (0)