aws · zaoliu-aws · Sep 14, 2022 · Sep 27, 2022 · Nov 30, 2022 · Nov 30, 2022
@@ -32,6 +32,7 @@ def __init__(
         s3_output_path: Optional[Union[str, PipelineVariable]] = None,
         system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
         framework_profile_params: Optional[FrameworkProfile] = None,
+        disable_profiler: Optional[FrameworkProfile] = False,
     ):
         """Initialize a ``ProfilerConfig`` instance.
 
@@ -78,6 +79,7 @@ class and SageMaker Framework estimators.
         self.s3_output_path = s3_output_path
         self.system_monitor_interval_millis = system_monitor_interval_millis
         self.framework_profile_params = framework_profile_params
+        self.disable_profiler = disable_profiler
 
     def _to_request_dict(self):
         """Generate a request dictionary using the parameters provided when initializing the object.
@@ -91,6 +93,8 @@ def _to_request_dict(self):
         if self.s3_output_path is not None:
             profiler_config_request["S3OutputPath"] = self.s3_output_path
 
+        profiler_config_request["DisableProfiler"] = self.disable_profiler
+
         if self.system_monitor_interval_millis is not None:
             profiler_config_request[
                 "ProfilingIntervalInMilliseconds"

@@ -937,26 +937,29 @@ def _prepare_collection_configs(self):
     def _prepare_profiler_for_training(self):
         """Set necessary values and do basic validations in profiler config and profiler rules.
 
-        When user explicitly set rules to an empty list, default profiler rule won't be enabled.
-        Default profiler rule will be enabled in supported regions when either:
-        1. user doesn't specify any rules, i.e., rules=None; or
-        2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
+        No default profiler rule will be used. The user needs to specify rules explicitly
         """
         if self.disable_profiler:
-            if self.profiler_config:
-                raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
+            if self.profiler_config and not self.profiler_config.disable_profiler:
+                raise RuntimeError(
+                    "profiler_config.disable_profiler cannot be False"
+                    + " when disable_profiler is True."
+                )
             if self.profiler_rules:
                 raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
         elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
             if self.profiler_config is None:
                 self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
             if self.rules is None or (self.rules and not self.profiler_rules):
-                self.profiler_rules = [get_default_profiler_rule()]
+                self.profiler_rules = []
 
         if self.profiler_config and not self.profiler_config.s3_output_path:
             self.profiler_config.s3_output_path = self.output_path
 
         self.profiler_rule_configs = self._prepare_profiler_rules()
+        # if profiler_config is still None, it means the job has profiler disabled
+        if self.profiler_config is None:
+            self.profiler_config = ProfilerConfig(disable_profiler=True)
 
     def _prepare_profiler_rules(self):
         """Set any necessary values in profiler rules, if they are provided."""
@@ -1047,7 +1050,7 @@ def latest_job_profiler_artifacts_path(self):
             error_message="""Cannot get the profiling output artifacts path.
         The Estimator is not associated with a training job."""
         )
-        if self.profiler_config is not None:
+        if self.profiler_config is not None and not self.profiler_config.disable_profiler:
             return os.path.join(
                 self.profiler_config.s3_output_path,
                 self.latest_training_job.name,
@@ -1893,8 +1896,8 @@ def enable_default_profiling(self):
         else:
             self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
 
-        self.profiler_rules = [get_default_profiler_rule()]
-        self.profiler_rule_configs = self._prepare_profiler_rules()
+        self.profiler_rules = []
+        self.profiler_rule_configs = []
 
         _TrainingJob.update(
             self, self.profiler_rule_configs, self.profiler_config._to_request_dict()

@@ -13,7 +13,6 @@
 from __future__ import absolute_import
 
 import os
-import re
 import time
 import uuid
 
@@ -22,7 +21,6 @@
 from sagemaker.debugger import (
     DebuggerHookConfig,
     FrameworkProfile,
-    get_rule_container_image_uri,
     ProfilerConfig,
     ProfilerRule,
     Rule,
@@ -103,13 +101,6 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
         )
         assert job_description.get("ProfilingStatus") == "Enabled"
 
-        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
-        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
-        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
-            mx.sagemaker_session.boto_region_name
-        )
-        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
-
         with pytest.raises(ValueError) as error:
             mx.enable_default_profiling()
         assert "Debugger monitoring is already enabled." in str(error)
@@ -160,13 +151,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
         assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
         assert job_description.get("ProfilingStatus") == "Enabled"
 
-        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
-        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
-        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
-            mx.sagemaker_session.boto_region_name
-        )
-        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
-
         _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
 
         mx.update_profiler(
@@ -178,13 +162,6 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
         assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
         assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
 
-        profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
-        assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
-        assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
-            mx.sagemaker_session.boto_region_name
-        )
-        assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
-
 
 def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
     sagemaker_session,
@@ -387,13 +364,6 @@ def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
             == updated_framework_profile.profiling_parameters
         )
 
-        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
-        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
-        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
-            mx.sagemaker_session.boto_region_name
-        )
-        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
-
 
 def test_mxnet_with_disable_profiler_then_enable_default_profiling(
     sagemaker_session,
@@ -431,12 +401,10 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
         )
 
         job_description = mx.latest_training_job.describe()
-        assert job_description.get("ProfilerConfig") is None
         assert job_description.get("ProfilerRuleConfigurations") is None
         assert job_description.get("ProfilingStatus") == "Disabled"
 
         _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
-
         mx.enable_default_profiling()
 
         job_description = mx.latest_training_job.describe()

@@ -142,14 +142,8 @@ def _create_train_job(version, base_framework_version):
             "CollectionConfigurations": [],
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -135,14 +135,8 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
         "metric_definitions": None,
         "environment": None,
         "experiment_config": None,
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -144,14 +144,8 @@ def _create_train_job(
             "CollectionConfigurations": [],
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -142,14 +142,8 @@ def _create_train_job(
             "CollectionConfigurations": [],
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -144,14 +144,8 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
             "CollectionConfigurations": [],
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -796,6 +796,7 @@ def test_register_model_with_model_repack_with_estimator(
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True},
                     "HyperParameters": {
                         "inference_script": '"dummy_script.py"',
                         "dependencies": f'"{dummy_requirements}"',
@@ -923,6 +924,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True},
                     "HyperParameters": {
                         "inference_script": '"dummy_script.py"',
                         "model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1052,6 +1054,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True},
                     "HyperParameters": {
                         "dependencies": "null",
                         "inference_script": '"dummy_script.py"',
@@ -1243,6 +1246,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
                     "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
                     + "sagemaker-scikit-learn:0.23-1-cpu-py3",
                 },
+                "ProfilerConfig": {"DisableProfiler": True},
                 "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                 "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                 "ResourceConfig": {

@@ -374,6 +374,7 @@ def test_training_step_base_estimator(sagemaker_session):
                 "CollectionConfigurations": [],
             },
             "ProfilerConfig": {
+                "DisableProfiler": False,
                 "ProfilingIntervalInMilliseconds": 500,
                 "S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
             },
@@ -483,7 +484,7 @@ def test_training_step_tensorflow(sagemaker_session):
                 "sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
                 "sagemaker_distributed_dataparallel_custom_mpi_options": '""',
             },
-            "ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
+            "ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
         },
         "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
     }

@@ -307,10 +307,6 @@ def test_training_step_with_estimator(
     }
     step_definition = json.loads(pipeline.definition())["Steps"][0]
 
-    # delete profiler rule configurations because of timestamp collision
-    del step_definition["Arguments"]["ProfilerRuleConfigurations"]
-    del expected_step_arguments["ProfilerRuleConfigurations"]
-
     assert step_definition == {
         "Name": "MyTrainingStep",
         "Description": "TrainingStep description",
@@ -427,18 +423,6 @@ def test_training_step_with_framework_estimator(
     del step_args["OutputDataConfig"]["S3OutputPath"]
     del step_def["Arguments"]["OutputDataConfig"]["S3OutputPath"]
 
-    # trim timestamp so RuleConfigurationName will match
-    rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
-    step_args["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ] = rule_config_name_step_args[:-11]
-    rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ]
-    step_def["Arguments"]["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ] = rule_config_name_step_def[:-11]
-
     if "sagemaker_s3_output" in step_args["HyperParameters"]:
         del step_args["HyperParameters"]["sagemaker_s3_output"]
         del step_def["Arguments"]["HyperParameters"]["sagemaker_s3_output"]
@@ -519,18 +503,6 @@ def test_training_step_with_algorithm_base(algo_estimator, training_input, pipel
     del step_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
     del step_def["Arguments"]["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
 
-    # trim timestamp so RuleConfigurationName will match
-    rule_config_name_step_args = step_args["ProfilerRuleConfigurations"][0]["RuleConfigurationName"]
-    step_args["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ] = rule_config_name_step_args[:-11]
-    rule_config_name_step_def = step_def["Arguments"]["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ]
-    step_def["Arguments"]["ProfilerRuleConfigurations"][0][
-        "RuleConfigurationName"
-    ] = rule_config_name_step_def[:-11]
-
     assert step_def == {
         "Name": "MyTrainingStep",
         "Type": "Training",

@@ -157,6 +157,7 @@ def test_repack_model_step(estimator):
                 }
             ],
             "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
+            "ProfilerConfig": {"DisableProfiler": True},
             "ResourceConfig": {
                 "InstanceCount": 1,
                 "InstanceType": "ml.m5.large",
@@ -238,6 +239,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
                 }
             ],
             "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
+            "ProfilerConfig": {"DisableProfiler": True},
             "ResourceConfig": {
                 "InstanceCount": 1,
                 "InstanceType": "ml.m5.large",

@@ -150,14 +150,8 @@ def _create_train_job(version, py_version):
             "CollectionConfigurations": [],
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
-        "profiler_rule_configs": [
-            {
-                "RuleConfigurationName": "ProfilerReport-1510006209",
-                "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest",
-                "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
-            }
-        ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }