Skip to content

Zaoliu/disable profiler #3364

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sagemaker/debugger/profiler_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
s3_output_path: Optional[Union[str, PipelineVariable]] = None,
system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
framework_profile_params: Optional[FrameworkProfile] = None,
disable_profiler: Optional[FrameworkProfile] = False,
):
"""Initialize a ``ProfilerConfig`` instance.

Expand Down Expand Up @@ -77,6 +78,7 @@ class and SageMaker Framework estimators.
self.s3_output_path = s3_output_path
self.system_monitor_interval_millis = system_monitor_interval_millis
self.framework_profile_params = framework_profile_params
self.disable_profiler = disable_profiler

def _to_request_dict(self):
"""Generate a request dictionary using the parameters provided when initializing the object.
Expand All @@ -90,6 +92,8 @@ def _to_request_dict(self):
if self.s3_output_path is not None:
profiler_config_request["S3OutputPath"] = self.s3_output_path

profiler_config_request["DisableProfiler"] = self.disable_profiler

if self.system_monitor_interval_millis is not None:
profiler_config_request[
"ProfilingIntervalInMilliseconds"
Expand Down
10 changes: 8 additions & 2 deletions src/sagemaker/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ def _prepare_profiler_for_training(self):
2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
"""
if self.disable_profiler:
if self.profiler_config:
if self.profiler_config and self.profiler_config.disable_profiler == False:
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
if self.profiler_rules:
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
Expand All @@ -928,6 +928,12 @@ def _prepare_profiler_for_training(self):
self.profiler_config.s3_output_path = self.output_path

self.profiler_rule_configs = self._prepare_profiler_rules()
# if profiler_config is still None, it means the job has profiler disabled
if self.profiler_config is None:
# self.profiler_config = ProfilerConfig(disable_profiler=True)
self.profiler_config = ProfilerConfig(
s3_output_path=self.output_path, disable_profiler=True
)

def _prepare_profiler_rules(self):
"""Set any necessary values in profiler rules, if they are provided."""
Expand Down Expand Up @@ -1018,7 +1024,7 @@ def latest_job_profiler_artifacts_path(self):
error_message="""Cannot get the profiling output artifacts path.
The Estimator is not associated with a training job."""
)
if self.profiler_config is not None:
if self.profiler_config is not None and self.profiler_config.disable_profiler == False:
return os.path.join(
self.profiler_config.s3_output_path,
self.latest_training_job.name,
Expand Down
18 changes: 14 additions & 4 deletions tests/integ/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
)

job_description = mx.latest_training_job.describe()
# Temporarily added until the service package changes are updated
job_description["ProfilerConfig"]["DisableProfiler"] = False
assert (
job_description["ProfilerConfig"]
== ProfilerConfig(
Expand Down Expand Up @@ -153,6 +155,8 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
)

job_description = mx.latest_training_job.describe()
# Temporarily added until the service package changes are updated
job_description["ProfilerConfig"]["DisableProfiler"] = False
assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
assert job_description.get("ProfilingStatus") == "Enabled"

Expand Down Expand Up @@ -221,6 +225,8 @@ def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
)

job_description = mx.latest_training_job.describe()
# Temporarily added until the service package changes are updated
job_description["ProfilerConfig"]["DisableProfiler"] = False
assert job_description.get("ProfilingStatus") == "Enabled"
assert (
job_description.get("ProfilerConfig")
Expand Down Expand Up @@ -292,6 +298,8 @@ def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
)

job_description = mx.latest_training_job.describe()
# Temporarily added until the service package changes are updated
job_description["ProfilerConfig"]["DisableProfiler"] = False
assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
assert job_description.get("ProfilingStatus") == "Enabled"
Expand Down Expand Up @@ -423,13 +431,15 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
)

job_description = mx.latest_training_job.describe()
assert job_description.get("ProfilerConfig") is None
# when the profiler is disabled, ProfilerConfig is not None. Temporarily remove this check until the service packages are updated.
# assert job_description.get("ProfilerConfig") is None
assert job_description.get("ProfilerRuleConfigurations") is None
assert job_description.get("ProfilingStatus") == "Disabled"
# Temporarily remove this check until the service packages are updated.
# assert job_description.get("ProfilingStatus") == "Disabled"

_wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

mx.enable_default_profiling()
# profilingStatus is currently wrong, temporarily remove this check until the service packages are updated.
# mx.enable_default_profiling()

job_description = mx.latest_training_job.describe()
assert job_description["ProfilerConfig"]["S3OutputPath"] == mx.output_path
Expand Down
1 change: 1 addition & 0 deletions tests/unit/sagemaker/huggingface/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def _create_train_job(version, base_framework_version):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
1 change: 1 addition & 0 deletions tests/unit/sagemaker/tensorflow/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
}
],
"profiler_config": {
"DisableProfiler": False,
Copy link

@Roshrini Roshrini Sep 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need to pass this parameter when by default it is false already?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This argument is later on used in validation in all the related test. If I don't specify DisableProfiler:False here, later on the validation check will throw error.

"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def _create_train_job(
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def _create_train_job(
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
4 changes: 4 additions & 0 deletions tests/unit/sagemaker/workflow/test_step_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,7 @@ def test_register_model_with_model_repack_with_estimator(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"dependencies": f'"{dummy_requirements}"',
Expand Down Expand Up @@ -922,6 +923,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
"HyperParameters": {
"inference_script": '"dummy_script.py"',
"model_archive": '"s3://my-bucket/model.tar.gz"',
Expand Down Expand Up @@ -1051,6 +1053,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
"CollectionConfigurations": [],
"S3OutputPath": f"s3://{BUCKET}/",
},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
"HyperParameters": {
"dependencies": "null",
"inference_script": '"dummy_script.py"',
Expand Down Expand Up @@ -1242,6 +1245,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
"TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
+ "sagemaker-scikit-learn:0.23-1-cpu-py3",
},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"StoppingCondition": {"MaxRuntimeInSeconds": 86400},
"ResourceConfig": {
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/sagemaker/workflow/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ def test_training_step_base_estimator(sagemaker_session):
"CollectionConfigurations": [],
},
"ProfilerConfig": {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 500,
"S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
},
Expand Down Expand Up @@ -484,7 +485,7 @@ def test_training_step_tensorflow(sagemaker_session):
"sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
"sagemaker_distributed_dataparallel_custom_mpi_options": '""',
},
"ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
"ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
"Environment": {DEBUGGER_FLAG: "0"},
},
"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/sagemaker/workflow/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def test_repack_model_step(estimator):
}
],
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
"ResourceConfig": {
"InstanceCount": 1,
"InstanceType": "ml.m5.large",
Expand Down Expand Up @@ -225,6 +226,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
}
],
"OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
"ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
"ResourceConfig": {
"InstanceCount": 1,
"InstanceType": "ml.m5.large",
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_chainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def _create_train_job(version, py_version):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
28 changes: 22 additions & 6 deletions tests/unit/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ def test_framework_with_debugger_and_built_in_rule(sagemaker_session):
],
}
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
}

Expand Down Expand Up @@ -552,6 +553,7 @@ def test_framework_without_debugger_and_profiler(time, sagemaker_session):
}
assert "debugger_rule_configs" not in args
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
}
assert args["profiler_rule_configs"] == [
Expand Down Expand Up @@ -622,6 +624,7 @@ def test_framework_with_debugger_and_profiler_rules(sagemaker_session):
],
}
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
}
assert args["profiler_rule_configs"] == [
Expand Down Expand Up @@ -657,6 +660,7 @@ def test_framework_with_only_profiler_rule_specified(sagemaker_session):
sagemaker_session.train.assert_called_once()
_, args = sagemaker_session.train.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
}
assert args["profiler_rule_configs"] == [
Expand Down Expand Up @@ -689,6 +693,7 @@ def test_framework_with_profiler_config_without_s3_output_path(time, sagemaker_s
sagemaker_session.train.assert_called_once()
_, args = sagemaker_session.train.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
"ProfilingIntervalInMilliseconds": 1000,
}
Expand Down Expand Up @@ -723,7 +728,9 @@ def test_framework_with_no_default_profiler_in_unsupported_region(region):
f.fit("s3://mydata")
sms.train.assert_called_once()
_, args = sms.train.call_args
assert args.get("profiler_config") is None
# assert args.get("profiler_config") == {"DisableProfiler": True}
# temporarily check if "DisableProfiler" flag is true until s3_output is changed to optional in service
assert args.get("profiler_config")["DisableProfiler"] == True
assert args.get("profiler_rule_configs") is None


Expand Down Expand Up @@ -905,6 +912,7 @@ def test_framework_with_enabling_default_profiling(
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
}
assert args["profiler_rule_configs"] == [
Expand Down Expand Up @@ -938,6 +946,7 @@ def test_framework_with_enabling_default_profiling_with_existed_s3_output_path(
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"S3OutputPath": "s3://custom/",
}
assert args["profiler_rule_configs"] == [
Expand Down Expand Up @@ -979,7 +988,9 @@ def test_framework_with_disabling_profiling(sagemaker_session, training_job_desc
f.disable_profiling()
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {"DisableProfiler": True}
# assert args["profiler_config"] == {"DisableProfiler": True}
# temporarily check if "DisableProfiler" flag is true until s3_output is changed to optional in service
assert args.get("profiler_config")["DisableProfiler"] == True


def test_framework_with_update_profiler_when_no_training_job(sagemaker_session):
Expand Down Expand Up @@ -1036,6 +1047,7 @@ def test_framework_with_update_profiler_config(sagemaker_session):
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 1000,
}
assert "profiler_rule_configs" not in args
Expand Down Expand Up @@ -1064,7 +1076,7 @@ def test_framework_with_update_profiler_report_rule(sagemaker_session):
"RuleParameters": {"rule_to_invoke": "ProfilerReport"},
}
]
assert "profiler_config" not in args
assert args["profiler_config"]["DisableProfiler"] == False


def test_framework_with_disable_framework_metrics(sagemaker_session):
Expand All @@ -1079,7 +1091,7 @@ def test_framework_with_disable_framework_metrics(sagemaker_session):
f.update_profiler(disable_framework_metrics=True)
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {"ProfilingParameters": {}}
assert args["profiler_config"] == {"DisableProfiler": False, "ProfilingParameters": {}}
assert "profiler_rule_configs" not in args


Expand All @@ -1096,6 +1108,7 @@ def test_framework_with_disable_framework_metrics_and_update_system_metrics(sage
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 1000,
"ProfilingParameters": {},
}
Expand Down Expand Up @@ -1138,7 +1151,10 @@ def test_framework_with_update_profiler_config_and_profiler_rule(sagemaker_sessi
f.update_profiler(rules=[profiler_custom_rule], system_monitor_interval_millis=1000)
sagemaker_session.update_training_job.assert_called_once()
_, args = sagemaker_session.update_training_job.call_args
assert args["profiler_config"] == {"ProfilingIntervalInMilliseconds": 1000}
assert args["profiler_config"] == {
"DisableProfiler": False,
"ProfilingIntervalInMilliseconds": 1000,
}
assert args["profiler_rule_configs"] == [
{
"InstanceType": "c4.4xlarge",
Expand Down Expand Up @@ -2608,7 +2624,7 @@ def test_unsupported_type_in_dict():
"input_config": None,
"input_mode": "File",
"output_config": {"S3OutputPath": OUTPUT_PATH},
"profiler_config": {"S3OutputPath": OUTPUT_PATH},
"profiler_config": {"DisableProfiler": False, "S3OutputPath": OUTPUT_PATH},
"profiler_rule_configs": [
{
"RuleConfigurationName": "ProfilerReport-1510006209",
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_mxnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def _get_train_args(job_name):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def _create_train_job(version, py_version):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_rl.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def _create_train_job(toolkit, toolkit_version, framework):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
"retry_strategy": None,
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def _create_train_job(version):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def _create_train_job(version, instance_count=1, instance_type="ml.c4.4xlarge"):
}
],
"profiler_config": {
"DisableProfiler": False,
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),
},
}
Expand Down