Skip to content

Commit 03c2252

Browse files
authored
change: enable smdebug for Horovod (MPI) training setup (#1265)
1 parent 8e1ffb0 commit 03c2252

File tree

2 files changed

+5
-8
lines changed

2 files changed

+5
-8
lines changed

src/sagemaker/tensorflow/estimator.py

+4 -7
Original file line number | Diff line number | Diff line change
@@ -707,22 +707,19 @@ def _script_mode_enabled(self):
707707

708708
def _validate_and_set_debugger_configs(self):
709709
"""
710-
Disable Debugger Hook Config for PS and Horovod as they are not
711-
supported in smdebug 0.4.13, the current latest version of smdebug
710+
Disable Debugger Hook Config for ParameterServer (PS) as it is not
711+
supported in smdebug.
712712
713713
Else, set default HookConfig
714714
"""
715715
ps_enabled = "parameter_server" in self.distributions and self.distributions[
716716
"parameter_server"
717717
].get("enabled", False)
718-
mpi_enabled = "mpi" in self.distributions and self.distributions["mpi"].get(
719-
"enabled", False
720-
)
721-
if ps_enabled or mpi_enabled:
718+
if ps_enabled:
722719
if self.debugger_hook_config is not None or self.debugger_rule_configs is not None:
723720
logger.info(
724721
"Amazon SageMaker Debugger does not currently support "
725-
"Parameter Server and MPI distributions"
722+
"Parameter Server distribution"
726723
)
727724
self.debugger_hook_config = None
728725
self.debugger_rule_configs = None

tests/unit/test_tf_estimator.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ def _create_train_job(
160160
"experiment_config": None,
161161
}
162162

163-
if not ps and not horovod:
163+
if not ps:
164164
conf["debugger_hook_config"] = {
165165
"CollectionConfigurations": [],
166166
"S3OutputPath": "s3://{}/".format(BUCKET_NAME),

0 commit comments

Comments
 (0)