@@ -1104,31 +1104,35 @@ def validate_smddp_collectives_support(
1104
1104
# in case image_uri is not set, then both are mandatory
1105
1105
if framework_version not in SMDDP_COLLECTIVES_SUPPORTED_FRAMEWORK_VERSIONS :
1106
1106
err_msg += (
1107
- f"Provided framework_version { framework_version } is not supported."
1107
+ f"Provided framework_version { framework_version } is not supported. "
1108
1108
"Please specify one of the supported framework versions:"
1109
1109
f" { SMDDP_COLLECTIVES_SUPPORTED_FRAMEWORK_VERSIONS } .\n "
1110
1110
)
1111
1111
if "py3" not in py_version :
1112
1112
err_msg += (
1113
- f"Provided py_version { py_version } is not supported."
1113
+ f"Provided py_version { py_version } is not supported. "
1114
1114
"Please specify py_version>=py3.\n "
1115
1115
)
1116
1116
if instance_type not in SMDDP_COLLECTIVES_SUPPORTED_INSTANCE_TYPES :
1117
1117
err_msg += (
1118
- f"Provided instance_type { instance_type } is not supported."
1118
+ f"Provided instance_type { instance_type } is not supported. "
1119
1119
"Please specify one of the supported instance types:"
1120
1120
f"{ SMDDP_COLLECTIVES_SUPPORTED_INSTANCE_TYPES } .\n "
1121
1121
)
1122
1122
if instance_count == 1 :
1123
1123
# Communication backend auto is not supported for single-node jobs
1124
1124
err_msg += (
1125
- "SMDDP Collective backend is not supported for single-node jobs."
1125
+ "SMDDP Collective backend is not supported for single-node jobs. "
1126
1126
"Please increase instance_count to be greater than 1.\n "
1127
1127
)
1128
1128
if not err_msg :
1129
1129
return True
1130
- logger .warning ("Could not enable SMDDP Collectives for the training job.\n %s" , err_msg )
1131
- logger .warning ("Continuing training with NCCL collective backend.\n " )
1130
+ logger .warning (
1131
+ "The system is not compatible or not configured to run SMDDP collectives optimized"
1132
+ " for AWS infrastructure.\n %s" ,
1133
+ err_msg ,
1134
+ )
1135
+ logger .warning ("Continuing model training with default NCCL communication backend.\n " )
1132
1136
return False
1133
1137
1134
1138
0 commit comments