Skip to content

Commit ab9c79d

Browse files
author
Marius Moisescu
committed
Adapted zaoliu's changes
1 parent b2e8b66 commit ab9c79d

18 files changed

+60
-161
lines changed

src/sagemaker/debugger/profiler_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def __init__(
3232
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
3333
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
3434
framework_profile_params: Optional[FrameworkProfile] = None,
35+
disable_profiler: Optional[FrameworkProfile] = False,
3536
):
3637
"""Initialize a ``ProfilerConfig`` instance.
3738
@@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
7879
self.s3_output_path = s3_output_path
7980
self.system_monitor_interval_millis = system_monitor_interval_millis
8081
self.framework_profile_params = framework_profile_params
82+
self.disable_profiler = disable_profiler
8183

8284
def _to_request_dict(self):
8385
"""Generate a request dictionary using the parameters provided when initializing the object.
@@ -91,6 +93,8 @@ def _to_request_dict(self):
9193
if self.s3_output_path is not None:
9294
profiler_config_request["S3OutputPath"] = self.s3_output_path
9395

96+
profiler_config_request["DisableProfiler"] = self.disable_profiler
97+
9498
if self.system_monitor_interval_millis is not None:
9599
profiler_config_request[
96100
"ProfilingIntervalInMilliseconds"

src/sagemaker/estimator.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -937,26 +937,29 @@ def _prepare_collection_configs(self):
937937
def _prepare_profiler_for_training(self):
938938
"""Set necessary values and do basic validations in profiler config and profiler rules.
939939
940-
When user explicitly set rules to an empty list, default profiler rule won't be enabled.
941-
Default profiler rule will be enabled in supported regions when either:
942-
1. user doesn't specify any rules, i.e., rules=None; or
943-
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
940+
No default profiler rule is applied; the user must specify profiler rules explicitly.
944941
"""
945942
if self.disable_profiler:
946-
if self.profiler_config:
947-
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
943+
if self.profiler_config and not self.profiler_config.disable_profiler:
944+
raise RuntimeError(
945+
"profiler_config.disable_profiler cannot be False"
946+
+ " when disable_profiler is True."
947+
)
948948
if self.profiler_rules:
949949
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
950950
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
951951
if self.profiler_config is None:
952952
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
953953
if self.rules is None or (self.rules and not self.profiler_rules):
954-
self.profiler_rules = [get_default_profiler_rule()]
954+
self.profiler_rules = []
955955

956956
if self.profiler_config and not self.profiler_config.s3_output_path:
957957
self.profiler_config.s3_output_path = self.output_path
958958

959959
self.profiler_rule_configs = self._prepare_profiler_rules()
960+
# If profiler_config is still None, profiling is disabled for this job, so send an explicit disabled config.
961+
if self.profiler_config is None:
962+
self.profiler_config = ProfilerConfig(disable_profiler=True)
960963

961964
def _prepare_profiler_rules(self):
962965
"""Set any necessary values in profiler rules, if they are provided."""
@@ -1047,7 +1050,7 @@ def latest_job_profiler_artifacts_path(self):
10471050
error_message="""Cannot get the profiling output artifacts path.
10481051
The Estimator is not associated with a training job."""
10491052
)
1050-
if self.profiler_config is not None:
1053+
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
10511054
return os.path.join(
10521055
self.profiler_config.s3_output_path,
10531056
self.latest_training_job.name,
@@ -1893,8 +1896,8 @@ def enable_default_profiling(self):
18931896
else:
18941897
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
18951898

1896-
self.profiler_rules = [get_default_profiler_rule()]
1897-
self.profiler_rule_configs = self._prepare_profiler_rules()
1899+
self.profiler_rules = []
1900+
self.profiler_rule_configs = []
18981901

18991902
_TrainingJob.update(
19001903
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()

tests/integ/test_profiler.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import os
16-
import re
1716
import time
1817
import uuid
1918

@@ -22,7 +21,6 @@
2221
from sagemaker.debugger import (
2322
DebuggerHookConfig,
2423
FrameworkProfile,
25-
get_rule_container_image_uri,
2624
ProfilerConfig,
2725
ProfilerRule,
2826
Rule,
@@ -103,13 +101,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
103101
)
104102
assert job_description.get("ProfilingStatus") == "Enabled"
105103

106-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
107-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
108-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
109-
mx.sagemaker_session.boto_region_name
110-
)
111-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
112-
113104
with pytest.raises(ValueError) as error:
114105
mx.enable_default_profiling()
115106
assert "Debugger monitoring is already enabled." in str(error)
@@ -160,13 +151,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
160151
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
161152
assert job_description.get("ProfilingStatus") == "Enabled"
162153

163-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
164-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
165-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
166-
mx.sagemaker_session.boto_region_name
167-
)
168-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
169-
170154
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
171155

172156
mx.update_profiler(
@@ -178,13 +162,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
178162
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
179163
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
180164

181-
profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
182-
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
183-
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
184-
mx.sagemaker_session.boto_region_name
185-
)
186-
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
187-
188165

189166
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
190167
sagemaker_session,
@@ -387,13 +364,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
387364
== updated_framework_profile.profiling_parameters
388365
)
389366

390-
profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
391-
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
392-
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
393-
mx.sagemaker_session.boto_region_name
394-
)
395-
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
396-
397367

398368
def test_mxnet_with_disable_profiler_then_enable_default_profiling(
399369
sagemaker_session,
@@ -431,12 +401,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
431401
)
432402

433403
job_description = mx.latest_training_job.describe()
434-
assert job_description.get("ProfilerConfig") is None
435404
assert job_description.get("ProfilerRuleConfigurations") is None
436405
assert job_description.get("ProfilingStatus") == "Disabled"
437406

438407
_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
439-
440408
mx.enable_default_profiling()
441409

442410
job_description = mx.latest_training_job.describe()

tests/unit/sagemaker/huggingface/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,8 @@ def _create_train_job(version, base_framework_version):
142142
"CollectionConfigurations": [],
143143
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
144144
},
145-
"profiler_rule_configs": [
146-
{
147-
"RuleConfigurationName": "ProfilerReport-1510006209",
148-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
149-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
150-
}
151-
],
152145
"profiler_config": {
146+
"DisableProfiler": False,
153147
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
154148
},
155149
}

tests/unit/sagemaker/tensorflow/test_estimator.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,14 +135,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
135135
"metric_definitions": None,
136136
"environment": None,
137137
"experiment_config": None,
138-
"profiler_rule_configs": [
139-
{
140-
"RuleConfigurationName": "ProfilerReport-1510006209",
141-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
142-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
143-
}
144-
],
145138
"profiler_config": {
139+
"DisableProfiler": False,
146140
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
147141
},
148142
}

tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,8 @@ def _create_train_job(
144144
"CollectionConfigurations": [],
145145
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
146146
},
147-
"profiler_rule_configs": [
148-
{
149-
"RuleConfigurationName": "ProfilerReport-1510006209",
150-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
151-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
152-
}
153-
],
154147
"profiler_config": {
148+
"DisableProfiler": False,
155149
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
156150
},
157151
}

tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
144144
"CollectionConfigurations": [],
145145
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
146146
},
147-
"profiler_rule_configs": [
148-
{
149-
"RuleConfigurationName": "ProfilerReport-1510006209",
150-
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
151-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
152-
}
153-
],
154147
"profiler_config": {
148+
"DisableProfiler": False,
155149
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
156150
},
157151
}

tests/unit/sagemaker/workflow/test_step_collections.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
796796
"CollectionConfigurations": [],
797797
"S3OutputPath": f"s3://{BUCKET}/",
798798
},
799+
"ProfilerConfig": {"DisableProfiler": True},
799800
"HyperParameters": {
800801
"inference_script": '"dummy_script.py"',
801802
"dependencies": f'"{dummy_requirements}"',
@@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
923924
"CollectionConfigurations": [],
924925
"S3OutputPath": f"s3://{BUCKET}/",
925926
},
927+
"ProfilerConfig": {"DisableProfiler": True},
926928
"HyperParameters": {
927929
"inference_script": '"dummy_script.py"',
928930
"model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
10521054
"CollectionConfigurations": [],
10531055
"S3OutputPath": f"s3://{BUCKET}/",
10541056
},
1057+
"ProfilerConfig": {"DisableProfiler": True},
10551058
"HyperParameters": {
10561059
"dependencies": "null",
10571060
"inference_script": '"dummy_script.py"',
@@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
12431246
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
12441247
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
12451248
},
1249+
"ProfilerConfig": {"DisableProfiler": True},
12461250
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
12471251
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
12481252
"ResourceConfig": {

tests/unit/sagemaker/workflow/test_steps.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def test_training_step_base_estimator(sagemaker_session):
374374
"CollectionConfigurations": [],
375375
},
376376
"ProfilerConfig": {
377+
"DisableProfiler": False,
377378
"ProfilingIntervalInMilliseconds": 500,
378379
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
379380
},
@@ -483,7 +484,7 @@ def test_training_step_tensorflow(sagemaker_session):
483484
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
484485
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
485486
},
486-
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
487+
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
487488
},
488489
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
489490
}
@@ -918,7 +919,7 @@ def test_create_model_step_with_model_pipeline(tfo, time, sagemaker_session):
918919
},
919920
{
920921
"Environment": {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"},
921-
"Image": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sparkml-serving:3.3",
922+
"Image": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sparkml-serving:2.4",
922923
"ModelDataUrl": "s3://bucket/model_2.tar.gz",
923924
},
924925
],

tests/unit/sagemaker/workflow/test_training_step.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -401,10 +401,6 @@ def test_training_step_with_estimator(
401401
}
402402
step_definition = json.loads(pipeline.definition())["Steps"][0]
403403

404-
# delete profiler rule configurations because of timestamp collision
405-
del step_definition["Arguments"]["ProfilerRuleConfigurations"]
406-
del step_args["ProfilerRuleConfigurations"]
407-
408404
assert step_definition == {
409405
"Name": "MyTrainingStep",
410406
"Description": "TrainingStep description",

tests/unit/sagemaker/workflow/test_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def test_repack_model_step(estimator):
107107
}
108108
],
109109
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
110+
"ProfilerConfig": {"DisableProfiler": True},
110111
"ResourceConfig": {
111112
"InstanceCount": 1,
112113
"InstanceType": "ml.m5.large",
@@ -188,6 +189,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
188189
}
189190
],
190191
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
192+
"ProfilerConfig": {"DisableProfiler": True},
191193
"ResourceConfig": {
192194
"InstanceCount": 1,
193195
"InstanceType": "ml.m5.large",

tests/unit/test_chainer.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,8 @@ def _create_train_job(version, py_version):
150150
"CollectionConfigurations": [],
151151
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
152152
},
153-
"profiler_rule_configs": [
154-
{
155-
"RuleConfigurationName": "ProfilerReport-1510006209",
156-
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
157-
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
158-
}
159-
],
160153
"profiler_config": {
154+
"DisableProfiler": False,
161155
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
162156
},
163157
}

0 commit comments

Comments
 (0)