Skip to content

feature: Add disable_profiler field in config and propagate changes #3523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Dec 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
ab9c79d
Adapted zaoliu's changes
Dec 9, 2022
c9c9f1f
Merge branch 'master' into zaochange
mariumof Dec 9, 2022
75f95eb
One more file to zaochage
Dec 9, 2022
055c0ec
Added empty line
Dec 9, 2022
4e108d0
Ran black
Dec 9, 2022
1ff3160
Re blacked and modified test
Dec 9, 2022
8523702
Added test_pytorch_compiler.py
Dec 9, 2022
fff6a92
Formatted test_pytorch_compiler.py
Dec 9, 2022
20c743c
Updated test_pytorch_compiler
Dec 12, 2022
dfdc52d
Updated test_training_step
Dec 12, 2022
ed21e79
Updated test_workflow and est_training_step
Dec 12, 2022
1599a79
Updated test_training_step
Dec 12, 2022
75f6bc4
Updated test_workflow
Dec 12, 2022
4505a9b
Merge branch 'master' into zaochange
mariumof Dec 13, 2022
8bbe653
Updated profiler_config and test
Dec 13, 2022
6799161
Typo
Dec 13, 2022
4a00ea1
black
Dec 13, 2022
e236c0d
Adapted zaoliu's changes
Dec 9, 2022
7aecb53
One more file to zaochage
Dec 9, 2022
d689cd0
Added empty line
Dec 9, 2022
2f62550
Updated test_estimator
Dec 14, 2022
d2258d3
Re blacked and modified test
Dec 9, 2022
9517cd3
Added test_pytorch_compiler.py
Dec 9, 2022
596c50d
Formatted test_pytorch_compiler.py
Dec 9, 2022
60e3ecf
Updated test_pytorch_compiler
Dec 12, 2022
cb855b1
Updated test_training_step
Dec 12, 2022
d7ae37b
Updated test_workflow and est_training_step
Dec 12, 2022
48b755e
Updated test_training_step
Dec 12, 2022
2ec9f80
Updated test_workflow
Dec 12, 2022
c16a7c0
Updated profiler_config and test
Dec 13, 2022
389babd
Typo
Dec 13, 2022
d33c91d
black
Dec 13, 2022
e8ba707
Updated test_estimator
Dec 14, 2022
32e3c16
Merge branch 'master' into zaochange
mariumof Dec 15, 2022
f8dced3
ran black on test_estimator.py
Dec 15, 2022
cbfa59f
Merge branch 'master' into zaochange
mariumof Dec 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sagemaker/debugger/profiler_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
framework_profile_params: Optional[FrameworkProfile] = None,
disable_profiler: Optional[Union[str, PipelineVariable]] = False,
):
"""Initialize a ``ProfilerConfig`` instance.

Expand Down Expand Up @@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
self.s3_output_path = s3_output_path
self.system_monitor_interval_millis = system_monitor_interval_millis
self.framework_profile_params = framework_profile_params
self.disable_profiler = disable_profiler

def _to_request_dict(self):
"""Generate a request dictionary using the parameters provided when initializing the object.
Expand All @@ -91,6 +93,8 @@ def _to_request_dict(self):
if self.s3_output_path is not None:
profiler_config_request["S3OutputPath"] = self.s3_output_path

profiler_config_request["DisableProfiler"] = self.disable_profiler

if self.system_monitor_interval_millis is not None:
profiler_config_request[
"ProfilingIntervalInMilliseconds"
Expand Down
23 changes: 13 additions & 10 deletions src/sagemaker/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,26 +938,29 @@ def _prepare_collection_configs(self):
def _prepare_profiler_for_training(self):
"""Set necessary values and do basic validations in profiler config and profiler rules.

When user explicitly set rules to an empty list, default profiler rule won't be enabled.
Default profiler rule will be enabled in supported regions when either:
1. user doesn't specify any rules, i.e., rules=None; or
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
No default profiler rule will be used. The user needs to specify rules explicitly
"""
if self.disable_profiler:
if self.profiler_config:
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
if self.profiler_config and not self.profiler_config.disable_profiler:
raise RuntimeError(
"profiler_config.disable_profiler cannot be False"
+ " when disable_profiler is True."
)
if self.profiler_rules:
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
if self.profiler_config is None:
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
if self.rules is None or (self.rules and not self.profiler_rules):
self.profiler_rules = [get_default_profiler_rule()]
self.profiler_rules = []

if self.profiler_config and not self.profiler_config.s3_output_path:
self.profiler_config.s3_output_path = self.output_path

self.profiler_rule_configs = self._prepare_profiler_rules()
# if profiler_config is still None, it means the job has profiler disabled
if self.profiler_config is None:
self.profiler_config = ProfilerConfig(disable_profiler=True)

def _prepare_profiler_rules(self):
"""Set any necessary values in profiler rules, if they are provided."""
Expand Down Expand Up @@ -1048,7 +1051,7 @@ def latest_job_profiler_artifacts_path(self):
error_message="""Cannot get the profiling output artifacts path.
The Estimator is not associated with a training job."""
)
if self.profiler_config is not None:
if self.profiler_config is not None and not self.profiler_config.disable_profiler:
return os.path.join(
self.profiler_config.s3_output_path,
self.latest_training_job.name,
Expand Down Expand Up @@ -1895,8 +1898,8 @@ def enable_default_profiling(self):
else:
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)

self.profiler_rules = [get_default_profiler_rule()]
self.profiler_rule_configs = self._prepare_profiler_rules()
self.profiler_rules = []

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enable_default_profiling method is used while enabling training job mid way training according to this: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-update-monitoring-profiling.html If we are not running rules while enabling, need to update docs and method description

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whom do I talk to?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean the person who handles docs.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

discussed on slack

self.profiler_rule_configs = []

_TrainingJob.update(
self, self.profiler_rule_configs, self.profiler_config._to_request_dict()
Expand Down
4 changes: 0 additions & 4 deletions tests/integ/sagemaker/workflow/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1269,8 +1269,6 @@ def test_caching_behavior(
# create pipeline
pipeline.create(role)
definition = json.loads(pipeline.definition())
# delete profiler config for assertions as it will contain a timestamp
del definition["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]

# verify input path
expected_abalone_input_path = f"{pipeline_name}/{step_process.name}" f"/input/abalone_data"
Expand All @@ -1295,7 +1293,6 @@ def test_caching_behavior(

# verify no changes
definition2 = json.loads(pipeline.definition())
del definition2["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]
assert definition == definition2

# add dummy file to source_dir
Expand All @@ -1306,7 +1303,6 @@ def test_caching_behavior(

# verify changes
definition3 = json.loads(pipeline.definition())
del definition3["Steps"][1]["Arguments"]["ProfilerRuleConfigurations"]
assert definition != definition3

finally:
Expand Down
40 changes: 0 additions & 40 deletions tests/integ/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from __future__ import absolute_import

import os
import re
import time
import uuid

Expand All @@ -22,7 +21,6 @@
from sagemaker.debugger import (
DebuggerHookConfig,
FrameworkProfile,
get_rule_container_image_uri,
ProfilerConfig,
ProfilerRule,
Rule,
Expand Down Expand Up @@ -93,8 +91,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
)

job_description = mx.latest_training_job.describe()
if "DisableProfiler" in job_description["ProfilerConfig"]:
job_description["ProfilerConfig"].pop("DisableProfiler")
assert (
job_description["ProfilerConfig"]
== ProfilerConfig(
Expand All @@ -103,13 +99,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
)
assert job_description.get("ProfilingStatus") == "Enabled"

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

with pytest.raises(ValueError) as error:
mx.enable_default_profiling()
assert "Debugger monitoring is already enabled." in str(error)
Expand Down Expand Up @@ -155,18 +144,9 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
)

job_description = mx.latest_training_job.describe()
if "DisableProfiler" in job_description["ProfilerConfig"]:
job_description["ProfilerConfig"].pop("DisableProfiler")
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
assert job_description.get("ProfilingStatus") == "Enabled"

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

mx.update_profiler(
Expand All @@ -178,13 +158,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500

profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}


def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
sagemaker_session,
Expand Down Expand Up @@ -225,8 +198,6 @@ def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
)

job_description = mx.latest_training_job.describe()
if "DisableProfiler" in job_description["ProfilerConfig"]:
job_description["ProfilerConfig"].pop("DisableProfiler")
assert job_description.get("ProfilingStatus") == "Enabled"
assert (
job_description.get("ProfilerConfig")
Expand Down Expand Up @@ -298,8 +269,6 @@ def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
)

job_description = mx.latest_training_job.describe()
if "DisableProfiler" in job_description["ProfilerConfig"]:
job_description["ProfilerConfig"].pop("DisableProfiler")
assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
assert job_description.get("ProfilingStatus") == "Enabled"
Expand Down Expand Up @@ -387,13 +356,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
== updated_framework_profile.profiling_parameters
)

profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
mx.sagemaker_session.boto_region_name
)
assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}


def test_mxnet_with_disable_profiler_then_enable_default_profiling(
sagemaker_session,
Expand Down Expand Up @@ -431,12 +393,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
)

job_description = mx.latest_training_job.describe()
assert job_description.get("ProfilerConfig") is None
assert job_description.get("ProfilerRuleConfigurations") is None
assert job_description.get("ProfilingStatus") == "Disabled"

_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

mx.enable_default_profiling()

job_description = mx.latest_training_job.describe()
Expand Down
8 changes: 1 addition & 7 deletions tests/unit/sagemaker/huggingface/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,8 @@ def _create_train_job(version, base_framework_version):
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
8 changes: 1 addition & 7 deletions tests/unit/sagemaker/tensorflow/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
"metric_definitions": None,
"environment": None,
"experiment_config": None,
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,14 +145,8 @@ def _create_train_job(
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,8 @@ def _create_train_job(
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,10 @@ def _create_train_job(version, instance_type, training_compiler_config, instance
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)},
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,14 +145,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
"CollectionConfigurations": [],
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
4 changes: 4 additions & 0 deletions tests/unit/sagemaker/workflow/test_step_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"dependencies": f'"{dummy_requirements}"',
Expand Down Expand Up @@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"model_archive": '"s3://my-bucket/model.tar.gz"',
Expand Down Expand Up @@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True},
"HyperParameters": {
"dependencies": "null",
"inference_script": '"dummy_script.py"',
Expand Down Expand Up @@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
},
"ProfilerConfig": {"DisableProfiler": True},
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
"ResourceConfig": {
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/sagemaker/workflow/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ def test_training_step_base_estimator(sagemaker_session):
"CollectionConfigurations": [],
},
"ProfilerConfig": {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 500,
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
},
Expand Down Expand Up @@ -438,7 +439,7 @@ def test_training_step_tensorflow(sagemaker_session):
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
},
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
},
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
}
Expand Down
Loading