aws · zaoliu-aws · Jul 26, 2022 · Sep 14, 2022 · Roshrini · Sep 22, 2022
@@ -32,6 +32,7 @@ def __init__(
         s3_output_path: Optional[Union[str, PipelineVariable]] = None,
         system_monitor_interval_millis: Optional[Union[int, PipelineVariable]] = None,
         framework_profile_params: Optional[FrameworkProfile] = None,
+        disable_profiler: Optional[FrameworkProfile] = False,
     ):
         """Initialize a ``ProfilerConfig`` instance.
 
@@ -77,6 +78,7 @@ class and SageMaker Framework estimators.
         self.s3_output_path = s3_output_path
         self.system_monitor_interval_millis = system_monitor_interval_millis
         self.framework_profile_params = framework_profile_params
+        self.disable_profiler = disable_profiler
 
     def _to_request_dict(self):
         """Generate a request dictionary using the parameters provided when initializing the object.
@@ -90,6 +92,8 @@ def _to_request_dict(self):
         if self.s3_output_path is not None:
             profiler_config_request["S3OutputPath"] = self.s3_output_path
 
+        profiler_config_request["DisableProfiler"] = self.disable_profiler
+
         if self.system_monitor_interval_millis is not None:
             profiler_config_request[
                 "ProfilingIntervalInMilliseconds"

@@ -914,7 +914,7 @@ def _prepare_profiler_for_training(self):
         2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
         """
         if self.disable_profiler:
-            if self.profiler_config:
+            if self.profiler_config and self.profiler_config.disable_profiler == False:
                 raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
             if self.profiler_rules:
                 raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
@@ -928,6 +928,12 @@ def _prepare_profiler_for_training(self):
             self.profiler_config.s3_output_path = self.output_path
 
         self.profiler_rule_configs = self._prepare_profiler_rules()
+        # if profiler_config is still None, it means the job has profiler disabled
+        if self.profiler_config is None:
+            # self.profiler_config = ProfilerConfig(disable_profiler=True)
+            self.profiler_config = ProfilerConfig(
+                s3_output_path=self.output_path, disable_profiler=True
+            )
 
     def _prepare_profiler_rules(self):
         """Set any necessary values in profiler rules, if they are provided."""
@@ -1018,7 +1024,7 @@ def latest_job_profiler_artifacts_path(self):
             error_message="""Cannot get the profiling output artifacts path.
         The Estimator is not associated with a training job."""
         )
-        if self.profiler_config is not None:
+        if self.profiler_config is not None and self.profiler_config.disable_profiler == False:
             return os.path.join(
                 self.profiler_config.s3_output_path,
                 self.latest_training_job.name,

@@ -93,6 +93,8 @@ def test_mxnet_with_default_profiler_config_and_profiler_rule(
         )
 
         job_description = mx.latest_training_job.describe()
+        # Temporarily added until the service package changes are updated
+        job_description["ProfilerConfig"]["DisableProfiler"] = False
         assert (
             job_description["ProfilerConfig"]
             == ProfilerConfig(
@@ -153,6 +155,8 @@ def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
         )
 
         job_description = mx.latest_training_job.describe()
+        # Temporarily added until the service package changes are updated
+        job_description["ProfilerConfig"]["DisableProfiler"] = False
         assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
         assert job_description.get("ProfilingStatus") == "Enabled"
 
@@ -221,6 +225,8 @@ def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
         )
 
         job_description = mx.latest_training_job.describe()
+        # Temporarily added until the service package changes are updated
+        job_description["ProfilerConfig"]["DisableProfiler"] = False
         assert job_description.get("ProfilingStatus") == "Enabled"
         assert (
             job_description.get("ProfilerConfig")
@@ -292,6 +298,8 @@ def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
         )
 
         job_description = mx.latest_training_job.describe()
+        # Temporarily added until the service package changes are updated
+        job_description["ProfilerConfig"]["DisableProfiler"] = False
         assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
         assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
         assert job_description.get("ProfilingStatus") == "Enabled"
@@ -423,13 +431,15 @@ def test_mxnet_with_disable_profiler_then_enable_default_profiling(
         )
 
         job_description = mx.latest_training_job.describe()
-        assert job_description.get("ProfilerConfig") is None
+        # when the profiler is disabled, ProfilerConfig is not None. Temporarily remove this check until the service packages are updated.
+        # assert job_description.get("ProfilerConfig") is None
         assert job_description.get("ProfilerRuleConfigurations") is None
-        assert job_description.get("ProfilingStatus") == "Disabled"
+        #  Temporarily remove this check until the service packages are updated.
+        # assert job_description.get("ProfilingStatus") == "Disabled"
 
         _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)
-
-        mx.enable_default_profiling()
+        # profilingStatus is currently wrong, temporarily remove this check until the service packages are updated.
+        # mx.enable_default_profiling()
 
         job_description = mx.latest_training_job.describe()
         assert job_description["ProfilerConfig"]["S3OutputPath"] == mx.output_path

@@ -150,6 +150,7 @@ def _create_train_job(version, base_framework_version):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -143,6 +143,7 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -152,6 +152,7 @@ def _create_train_job(
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -150,6 +150,7 @@ def _create_train_job(
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -152,6 +152,7 @@ def _create_train_job(framework_version, instance_type, training_compiler_config
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -795,6 +795,7 @@ def test_register_model_with_model_repack_with_estimator(
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
                     "HyperParameters": {
                         "inference_script": '"dummy_script.py"',
                         "dependencies": f'"{dummy_requirements}"',
@@ -922,6 +923,7 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
                     "HyperParameters": {
                         "inference_script": '"dummy_script.py"',
                         "model_archive": '"s3://my-bucket/model.tar.gz"',
@@ -1051,6 +1053,7 @@ def test_register_model_with_model_repack_with_pipeline_model(
                         "CollectionConfigurations": [],
                         "S3OutputPath": f"s3://{BUCKET}/",
                     },
+                    "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
                     "HyperParameters": {
                         "dependencies": "null",
                         "inference_script": '"dummy_script.py"',
@@ -1242,6 +1245,7 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator):
                     "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
                     + "sagemaker-scikit-learn:0.23-1-cpu-py3",
                 },
+                "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": "s3://my-bucket/"},
                 "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                 "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                 "ResourceConfig": {

@@ -375,6 +375,7 @@ def test_training_step_base_estimator(sagemaker_session):
                 "CollectionConfigurations": [],
             },
             "ProfilerConfig": {
+                "DisableProfiler": False,
                 "ProfilingIntervalInMilliseconds": 500,
                 "S3OutputPath": {"Std:Join": {"On": "/", "Values": ["s3:/", "a", "b"]}},
             },
@@ -484,7 +485,7 @@ def test_training_step_tensorflow(sagemaker_session):
                 "sagemaker_instance_type": {"Get": "Parameters.InstanceType"},
                 "sagemaker_distributed_dataparallel_custom_mpi_options": '""',
             },
-            "ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
+            "ProfilerConfig": {"DisableProfiler": False, "S3OutputPath": "s3://my-bucket/"},
             "Environment": {DEBUGGER_FLAG: "0"},
         },
         "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},

@@ -147,6 +147,7 @@ def test_repack_model_step(estimator):
                 }
             ],
             "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
+            "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
             "ResourceConfig": {
                 "InstanceCount": 1,
                 "InstanceType": "ml.m5.large",
@@ -225,6 +226,7 @@ def test_repack_model_step_with_source_dir(estimator, source_dir):
                 }
             ],
             "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
+            "ProfilerConfig": {"DisableProfiler": True, "S3OutputPath": f"s3://{BUCKET}/"},
             "ResourceConfig": {
                 "InstanceCount": 1,
                 "InstanceType": "ml.m5.large",

@@ -158,6 +158,7 @@ def _create_train_job(version, py_version):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -388,6 +388,7 @@ def test_framework_with_debugger_and_built_in_rule(sagemaker_session):
         ],
     }
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
     }
 
@@ -552,6 +553,7 @@ def test_framework_without_debugger_and_profiler(time, sagemaker_session):
     }
     assert "debugger_rule_configs" not in args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
     }
     assert args["profiler_rule_configs"] == [
@@ -622,6 +624,7 @@ def test_framework_with_debugger_and_profiler_rules(sagemaker_session):
         ],
     }
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
     }
     assert args["profiler_rule_configs"] == [
@@ -657,6 +660,7 @@ def test_framework_with_only_profiler_rule_specified(sagemaker_session):
     sagemaker_session.train.assert_called_once()
     _, args = sagemaker_session.train.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
     }
     assert args["profiler_rule_configs"] == [
@@ -689,6 +693,7 @@ def test_framework_with_profiler_config_without_s3_output_path(time, sagemaker_s
     sagemaker_session.train.assert_called_once()
     _, args = sagemaker_session.train.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         "ProfilingIntervalInMilliseconds": 1000,
     }
@@ -723,7 +728,9 @@ def test_framework_with_no_default_profiler_in_unsupported_region(region):
     f.fit("s3://mydata")
     sms.train.assert_called_once()
     _, args = sms.train.call_args
-    assert args.get("profiler_config") is None
+    # assert args.get("profiler_config") == {"DisableProfiler": True}
+    # temporarily check if "DisableProfiler" flag is true until s3_output is changed to optional in service
+    assert args.get("profiler_config")["DisableProfiler"] == True
     assert args.get("profiler_rule_configs") is None
 
 
@@ -905,6 +912,7 @@ def test_framework_with_enabling_default_profiling(
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
     }
     assert args["profiler_rule_configs"] == [
@@ -938,6 +946,7 @@ def test_framework_with_enabling_default_profiling_with_existed_s3_output_path(
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "S3OutputPath": "s3://custom/",
     }
     assert args["profiler_rule_configs"] == [
@@ -979,7 +988,9 @@ def test_framework_with_disabling_profiling(sagemaker_session, training_job_desc
     f.disable_profiling()
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
-    assert args["profiler_config"] == {"DisableProfiler": True}
+    # assert args["profiler_config"] == {"DisableProfiler": True}
+    # temporarily check if "DisableProfiler" flag is true until s3_output is changed to optional in service
+    assert args.get("profiler_config")["DisableProfiler"] == True
 
 
 def test_framework_with_update_profiler_when_no_training_job(sagemaker_session):
@@ -1036,6 +1047,7 @@ def test_framework_with_update_profiler_config(sagemaker_session):
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "ProfilingIntervalInMilliseconds": 1000,
     }
     assert "profiler_rule_configs" not in args
@@ -1064,7 +1076,7 @@ def test_framework_with_update_profiler_report_rule(sagemaker_session):
             "RuleParameters": {"rule_to_invoke": "ProfilerReport"},
         }
     ]
-    assert "profiler_config" not in args
+    assert args["profiler_config"]["DisableProfiler"] == False
 
 
 def test_framework_with_disable_framework_metrics(sagemaker_session):
@@ -1079,7 +1091,7 @@ def test_framework_with_disable_framework_metrics(sagemaker_session):
     f.update_profiler(disable_framework_metrics=True)
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
-    assert args["profiler_config"] == {"ProfilingParameters": {}}
+    assert args["profiler_config"] == {"DisableProfiler": False, "ProfilingParameters": {}}
     assert "profiler_rule_configs" not in args
 
 
@@ -1096,6 +1108,7 @@ def test_framework_with_disable_framework_metrics_and_update_system_metrics(sage
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
     assert args["profiler_config"] == {
+        "DisableProfiler": False,
         "ProfilingIntervalInMilliseconds": 1000,
         "ProfilingParameters": {},
     }
@@ -1138,7 +1151,10 @@ def test_framework_with_update_profiler_config_and_profiler_rule(sagemaker_sessi
     f.update_profiler(rules=[profiler_custom_rule], system_monitor_interval_millis=1000)
     sagemaker_session.update_training_job.assert_called_once()
     _, args = sagemaker_session.update_training_job.call_args
-    assert args["profiler_config"] == {"ProfilingIntervalInMilliseconds": 1000}
+    assert args["profiler_config"] == {
+        "DisableProfiler": False,
+        "ProfilingIntervalInMilliseconds": 1000,
+    }
     assert args["profiler_rule_configs"] == [
         {
             "InstanceType": "c4.4xlarge",
@@ -2608,7 +2624,7 @@ def test_unsupported_type_in_dict():
     "input_config": None,
     "input_mode": "File",
     "output_config": {"S3OutputPath": OUTPUT_PATH},
-    "profiler_config": {"S3OutputPath": OUTPUT_PATH},
+    "profiler_config": {"DisableProfiler": False, "S3OutputPath": OUTPUT_PATH},
     "profiler_rule_configs": [
         {
             "RuleConfigurationName": "ProfilerReport-1510006209",

@@ -167,6 +167,7 @@ def _get_train_args(job_name):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -165,6 +165,7 @@ def _create_train_job(version, py_version):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -160,6 +160,7 @@ def _create_train_job(toolkit, toolkit_version, framework):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
         "retry_strategy": None,

@@ -147,6 +147,7 @@ def _create_train_job(version):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }

@@ -162,6 +162,7 @@ def _create_train_job(version, instance_count=1, instance_type="ml.c4.4xlarge"):
             }
         ],
         "profiler_config": {
+            "DisableProfiler": False,
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
     }
-Original file line number
+Diff line change
@@ Expand Up / @@ -150,6 +150,7 @@ def _create_train_job(version, base_framework_version): @@
                 }
             ],
             "profiler_config": {
+                "DisableProfiler": False,
                 "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
             },
         }
@@ Expand Down @@