Skip to content

Commit 558e3f8

Browse files
committed
Update warning logs
1 parent c7e7f0d commit 558e3f8

File tree

3 files changed

+11
-8
lines changed

3 files changed

+11
-8
lines changed

doc/frameworks/pytorch/using_pytorch.rst

-1
Original file line number | Diff line number | Diff line change
@@ -238,7 +238,6 @@ but you can also overwrite them.
238238

239239
**Supported backends:**
240240

241-
- ``gloo`` and ``tcp`` for CPU instances
242241
- ``gloo`` and ``nccl`` for GPU instances
243242

244243
Launching a Distributed Training Job

src/sagemaker/fw_utils.py

+10 -6
Original file line number | Diff line number | Diff line change
@@ -1104,31 +1104,35 @@ def validate_smddp_collectives_support(
11041104
# in case image_uri is not set, then both are mandatory
11051105
if framework_version not in SMDDP_COLLECTIVES_SUPPORTED_FRAMEWORK_VERSIONS:
11061106
err_msg += (
1107-
f"Provided framework_version {framework_version} is not supported."
1107+
f"Provided framework_version {framework_version} is not supported. "
11081108
"Please specify one of the supported framework versions:"
11091109
f" {SMDDP_COLLECTIVES_SUPPORTED_FRAMEWORK_VERSIONS}.\n"
11101110
)
11111111
if "py3" not in py_version:
11121112
err_msg += (
1113-
f"Provided py_version {py_version} is not supported."
1113+
f"Provided py_version {py_version} is not supported. "
11141114
"Please specify py_version>=py3.\n"
11151115
)
11161116
if instance_type not in SMDDP_COLLECTIVES_SUPPORTED_INSTANCE_TYPES:
11171117
err_msg += (
1118-
f"Provided instance_type {instance_type} is not supported."
1118+
f"Provided instance_type {instance_type} is not supported. "
11191119
"Please specify one of the supported instance types:"
11201120
f"{SMDDP_COLLECTIVES_SUPPORTED_INSTANCE_TYPES}.\n"
11211121
)
11221122
if instance_count == 1:
11231123
# Communication backend auto is not supported for single-node jobs
11241124
err_msg += (
1125-
"SMDDP Collective backend is not supported for single-node jobs."
1125+
"SMDDP Collective backend is not supported for single-node jobs. "
11261126
"Please increase instance_count to be greater than 1.\n"
11271127
)
11281128
if not err_msg:
11291129
return True
1130-
logger.warning("Could not enable SMDDP Collectives for the training job.\n%s", err_msg)
1131-
logger.warning("Continuing training with NCCL collective backend.\n")
1130+
logger.warning(
1131+
"The system is not compatible or not configured to run SMDDP collectives optimized"
1132+
" for AWS infrastructure.\n%s",
1133+
err_msg,
1134+
)
1135+
logger.warning("Continuing model training with default NCCL communication backend.\n")
11321136
return False
11331137

11341138

tests/conftest.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -476,7 +476,7 @@ def pytorch_ddp_py_version():
476476
return "py3"
477477

478478

479-
@pytest.fixture(scope="module", params=["1.11", "1.11.0", "1.12", "1.12.0"])
479+
@pytest.fixture(scope="module", params=["1.11", "1.11.0", "1.12", "1.12.0", "1.12.1"])
480480
def pytorch_ddp_framework_version(request):
481481
return request.param
482482

0 commit comments

Comments
 (0)