Commit 36eb70d

Feature: Cluster setup for MultiWorkerMirroredStrategy
1 parent 38db16c commit 36eb70d

5 files changed: +147 -13 lines changed

src/sagemaker_tensorflow_container/training.py

Lines changed: 44 additions & 4 deletions
@@ -27,14 +27,15 @@
 logger = logging.getLogger(__name__)

 SAGEMAKER_PARAMETER_SERVER_ENABLED = "sagemaker_parameter_server_enabled"
+SAGEMAKER_MULTI_WORKER_MIRRORED_ENABLED = "sagemaker_multi_worker_mirrored_enabled"
 MODEL_DIR = "/opt/ml/model"


 def _is_host_master(hosts, current_host):
     return current_host == hosts[0]


-def _build_tf_config(hosts, current_host, ps_task=False):
+def _build_tf_config_for_ps(hosts, current_host, ps_task=False):
     """Builds a dictionary containing cluster information based on number of hosts and number of
     parameter servers.
@@ -84,6 +85,31 @@ def host_addresses(hosts, port=2222):
     return tf_config


+def _build_tf_config_for_mwm(hosts, current_host):
+    """Builds a dictionary containing cluster information based on the number of workers
+    for the Multi Worker Mirrored distribution strategy.
+
+    Args:
+        hosts (list[str]): List of host names in the cluster
+        current_host (str): Current host name
+
+    Returns:
+        dict[str: dict]: A dictionary describing the cluster setup for distributed training.
+            For more information regarding TF_CONFIG:
+            https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details
+    """
+    workers = hosts
+
+    def host_addresses(hosts, port=8890):
+        return ["{}:{}".format(host, port) for host in hosts]
+
+    tf_config = {"cluster": {}, "environment": "cloud"}
+    tf_config["cluster"]["worker"] = host_addresses(workers)
+    tf_config["task"] = {"index": workers.index(current_host), "type": "worker"}
+
+    return tf_config
+
+
 def _run_ps(env, cluster):
     logger.info("Running distributed training job with parameter servers")
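For a concrete sense of what the new helper produces, here is a minimal sketch of its output for a two-host cluster (the algo-1/algo-2 host names are illustrative stand-ins, not taken from this commit):

# Illustrative two-host cluster; SageMaker assigns the real host names at runtime.
hosts = ["algo-1", "algo-2"]
tf_config = _build_tf_config_for_mwm(hosts, current_host="algo-2")
assert tf_config == {
    "cluster": {"worker": ["algo-1:8890", "algo-2:8890"]},
    "environment": "cloud",
    "task": {"index": 1, "type": "worker"},
}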
@@ -135,12 +161,26 @@ def train(env, cmd_args):
     """
     parameter_server_enabled = env.additional_framework_parameters.get(
         SAGEMAKER_PARAMETER_SERVER_ENABLED, False
+    ) and len(env.hosts) > 1
+    multi_worker_mirrored_enabled = env.additional_framework_parameters.get(
+        SAGEMAKER_MULTI_WORKER_MIRRORED_ENABLED, False
     )
-    if len(env.hosts) > 1 and parameter_server_enabled:
+
+    # Setup
+    if parameter_server_enabled:
+
+        tf_config = _build_tf_config_for_ps(hosts=env.hosts, current_host=env.current_host)
+        logger.info("Running distributed training job with parameter servers")
+
+    elif multi_worker_mirrored_enabled:
+
+        tf_config = _build_tf_config_for_mwm(hosts=env.hosts, current_host=env.current_host)
+        logger.info("Running distributed training job with multi_worker_mirrored setup")

-        tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host)

-        logger.info("Running distributed training job with parameter servers")
+    # Run
+    if parameter_server_enabled:
+
         logger.info("Launching parameter server process")
         _run_ps(env, tf_config["cluster"])
         logger.info("Launching worker process")
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Copyright 2017-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+
+import boto3
+import pytest
+from sagemaker.tensorflow import TensorFlow
+from sagemaker.utils import unique_name_from_base
+from six.moves.urllib.parse import urlparse
+
+from timeout import timeout
+
+
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+
+
+def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version):
+    estimator = TensorFlow(
+        entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_sample.py"),
+        role="SageMakerRole",
+        instance_type=instance_type,
+        instance_count=2,
+        image_name=image_uri,
+        framework_version=framework_version,
+        py_version="py3",
+        sagemaker_session=sagemaker_session,
+    )
+    estimator.fit(job_name=unique_name_from_base("test-tf-mwms"))
+    raise NotImplementedError('Yet to add assertions')
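Note that the test above starts a plain two-instance estimator and never sets the sagemaker_multi_worker_mirrored_enabled flag that train() checks, so the MWMS branch would not be exercised as written. A hypothetical sketch of wiring the flag through follows; the hyperparameter spelling is an assumption, not confirmed by this commit:

# Hypothetical: surface the toolkit flag as a "sagemaker_"-prefixed hyperparameter so
# that it appears in env.additional_framework_parameters inside the container.
estimator = TensorFlow(
    entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_sample.py"),
    role="SageMakerRole",
    instance_type=instance_type,
    instance_count=2,
    image_name=image_uri,
    framework_version=framework_version,
    py_version="py3",
    sagemaker_session=sagemaker_session,
    hyperparameters={"sagemaker_multi_worker_mirrored_enabled": True},  # assumed spelling
)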
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from __future__ import absolute_import
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import numpy as np  # needed for the random sample data below
+import tensorflow as tf
+
+
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+    model = tf.keras.Sequential([
+        tf.keras.layers.Dense(2, input_shape=(5,)),
+    ])
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+
+def dataset_fn(ctx):
+    x = np.random.random((2, 5)).astype(np.float32)
+    y = np.random.randint(2, size=(2, 1))
+    dataset = tf.data.Dataset.from_tensor_slices((x, y))
+    return dataset.repeat().batch(1, drop_remainder=True)
+dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)
+
+model.compile(optimizer=optimizer, loss="mse")  # a loss is required for fit()
+model.fit(dist_dataset, epochs=1, steps_per_epoch=5)  # repeated dataset needs an explicit step count
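To make multi-worker runs of this sample easier to debug, the strategy's resolved cluster view can be logged; the lines below are an optional addition using standard tf.distribute APIs, not part of this commit:

# Optional diagnostics: report what this worker resolved from TF_CONFIG.
resolver = strategy.cluster_resolver
print("task_type={} task_id={} replicas_in_sync={}".format(
    resolver.task_type, resolver.task_id, strategy.num_replicas_in_sync
))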

test/unit/test_training.py

Lines changed: 26 additions & 9 deletions
@@ -31,10 +31,14 @@
 CURRENT_HOST = HOST1
 CMD_ARGS = {"some_key": "some_value"}
 CLUSTER_WITH_PS = {
-    "master": ["{}:2222".format(HOST1)],
-    "worker": ["{}:2222".format(HOST2)],
+    "master": ["{}:8890".format(HOST1)],
+    "worker": ["{}:8890".format(HOST2)],
     "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)],
 }
+CLUSTER_WITH_MWMS = {
+    "worker": ["{}:8890".format(host) for host in (HOST1, HOST2)],
+}
+
 MASTER_TASK = {"index": 0, "type": "master"}
 WORKER_TASK = {"index": 0, "type": "worker"}
 PS_TASK_1 = {"index": 0, "type": "ps"}
@@ -205,32 +209,45 @@ def test_train_distributed_no_ps(run, distributed_training_env):
     )


-def test_build_tf_config():
-    assert training._build_tf_config(HOST_LIST, HOST1) == {
+def test_build_tf_config_for_mwms():
+    assert training._build_tf_config_for_mwm(HOST_LIST, HOST1) == {
+        "cluster": CLUSTER_WITH_MWMS,
+        "environment": "cloud",
+        "task": {"index": HOST_LIST.index(HOST1), "type": "worker"},
+    }
+    assert training._build_tf_config_for_mwm(HOST_LIST, HOST2) == {
+        "cluster": CLUSTER_WITH_MWMS,
+        "environment": "cloud",
+        "task": {"index": HOST_LIST.index(HOST2), "type": "worker"},
+    }
+
+
+def test_build_tf_config_for_ps():
+    assert training._build_tf_config_for_ps(HOST_LIST, HOST1) == {
         "cluster": CLUSTER_WITH_PS,
         "environment": "cloud",
         "task": MASTER_TASK,
     }
-    assert training._build_tf_config(HOST_LIST, HOST1, ps_task=True) == {
+    assert training._build_tf_config_for_ps(HOST_LIST, HOST1, ps_task=True) == {
         "cluster": CLUSTER_WITH_PS,
         "environment": "cloud",
         "task": PS_TASK_1,
     }
-    assert training._build_tf_config(HOST_LIST, HOST2) == {
+    assert training._build_tf_config_for_ps(HOST_LIST, HOST2) == {
         "cluster": CLUSTER_WITH_PS,
         "environment": "cloud",
         "task": WORKER_TASK,
     }
-    assert training._build_tf_config(HOST_LIST, HOST2, ps_task=True) == {
+    assert training._build_tf_config_for_ps(HOST_LIST, HOST2, ps_task=True) == {
         "cluster": CLUSTER_WITH_PS,
         "environment": "cloud",
         "task": PS_TASK_2,
     }


-def test_build_tf_config_error():
+def test_build_tf_config_for_ps_error():
     with pytest.raises(ValueError) as error:
-        training._build_tf_config([HOST1], HOST1, ps_task=True)
+        training._build_tf_config_for_ps([HOST1], HOST1, ps_task=True)
     assert "Cannot have a ps task if there are no parameter servers in the cluster" in str(
         error.value
     )
