From 14fde5e7c360f57238d81a159fcf627d70def057 Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 13:24:48 -0700
Subject: [PATCH 1/9] Testing MWMS in TF 2.9.1 with TF Model Garden

---
 buildspec-dlc-cpu-tests.yml                  |  2 +-
 buildspec-dlc-gpu-tests.yml                  |  2 +-
 buildspec-gen-cpu-tests.yml                  |  2 +-
 buildspec-gen-gpu-tests.yml                  |  2 +-
 test/container/2.9.1/Dockerfile.dlc.cpu      |  6 ++
 test/container/2.9.1/Dockerfile.dlc.gpu      |  6 ++
 test/container/2.9.1/Dockerfile.tf.cpu       |  9 +++
 test/container/2.9.1/Dockerfile.tf.gpu       | 13 +++++
 .../sagemaker/test_multi_worker_mirrored.py  | 58 ++++++++++++++++++-
 9 files changed, 95 insertions(+), 5 deletions(-)
 create mode 100644 test/container/2.9.1/Dockerfile.dlc.cpu
 create mode 100644 test/container/2.9.1/Dockerfile.dlc.gpu
 create mode 100644 test/container/2.9.1/Dockerfile.tf.cpu
 create mode 100644 test/container/2.9.1/Dockerfile.tf.gpu

diff --git a/buildspec-dlc-cpu-tests.yml b/buildspec-dlc-cpu-tests.yml
index 7bf062de..9f3f596f 100644
--- a/buildspec-dlc-cpu-tests.yml
+++ b/buildspec-dlc-cpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '2.7.1'
+    FRAMEWORK_VERSION: '2.9.1'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
     ECR_REPO: 'sagemaker-test'
 
diff --git a/buildspec-dlc-gpu-tests.yml b/buildspec-dlc-gpu-tests.yml
index 6266877e..3ad2cf65 100644
--- a/buildspec-dlc-gpu-tests.yml
+++ b/buildspec-dlc-gpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '2.7.1'
+    FRAMEWORK_VERSION: '2.9.1'
     GPU_INSTANCE_TYPE: 'ml.p3.2xlarge'
     ECR_REPO: 'sagemaker-test'
     GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
diff --git a/buildspec-gen-cpu-tests.yml b/buildspec-gen-cpu-tests.yml
index f1f88b3e..4433deb4 100644
--- a/buildspec-gen-cpu-tests.yml
+++ b/buildspec-gen-cpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '2.7.1'
+    FRAMEWORK_VERSION: '2.9.1'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
     ECR_REPO: 'sagemaker-test'
 
diff --git a/buildspec-gen-gpu-tests.yml b/buildspec-gen-gpu-tests.yml
index 441dd269..e9cd04ff 100644
--- a/buildspec-gen-gpu-tests.yml
+++ b/buildspec-gen-gpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '2.7.1'
+    FRAMEWORK_VERSION: '2.9.1'
     GPU_INSTANCE_TYPE: 'ml.p3.16xlarge'
     ECR_REPO: 'sagemaker-test'
     GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
diff --git a/test/container/2.9.1/Dockerfile.dlc.cpu b/test/container/2.9.1/Dockerfile.dlc.cpu
new file mode 100644
index 00000000..855e2458
--- /dev/null
+++ b/test/container/2.9.1/Dockerfile.dlc.cpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-cpu-py39-ubuntu20.04-sagemaker
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/2.9.1/Dockerfile.dlc.gpu b/test/container/2.9.1/Dockerfile.dlc.gpu
new file mode 100644
index 00000000..b468d9f5
--- /dev/null
+++ b/test/container/2.9.1/Dockerfile.dlc.gpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-gpu-py39-cu112-ubuntu20.04-sagemaker
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/2.9.1/Dockerfile.tf.cpu b/test/container/2.9.1/Dockerfile.tf.cpu
new file mode 100644
index 00000000..b18e3cf5
--- /dev/null
+++ b/test/container/2.9.1/Dockerfile.tf.cpu
@@ -0,0 +1,9 @@
+FROM tensorflow/tensorflow:2.9.1
+
+ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
+RUN pip install --no-cache-dir tensorflow-io
+RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
\ No newline at end of file
diff --git a/test/container/2.9.1/Dockerfile.tf.gpu b/test/container/2.9.1/Dockerfile.tf.gpu
new file mode 100644
index 00000000..3adb62f0
--- /dev/null
+++ b/test/container/2.9.1/Dockerfile.tf.gpu
@@ -0,0 +1,13 @@
+FROM tensorflow/tensorflow:2.9.1-gpu
+
+ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
+RUN pip install --no-cache-dir tensorflow-io
+RUN apt-key del 7fa2af80 \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index eeb980fb..435f3d4e 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -21,7 +21,9 @@
 RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
 
 
-def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys):
+def test_keras_example(
+    sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
+):
     estimator = TensorFlow(
         entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_dummy.py"),
         role="SageMakerRole",
@@ -40,3 +42,57 @@ def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framewo
     logs = captured.out + captured.err
     assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs
     assert "TF_CONFIG=" in logs
+
+
+def test_tf_model_garden(
+    sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
+):
+    epochs = 10
+    batch_size = 512
+    train_steps = int(1024 * epochs / batch_size)
+    steps_per_loop = train_steps // 10
+    overrides = (
+        f"runtime.enable_xla=False,"
+        f"runtime.num_gpus=1,"
+        f"runtime.distribution_strategy=multi_worker_mirrored,"
+        f"runtime.mixed_precision_dtype=float16,"
+        f"task.train_data.global_batch_size={batch_size},"
+        f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
+        f"task.train_data.cache=True,"
+        f"trainer.train_steps={train_steps},"
+        f"trainer.steps_per_loop={steps_per_loop},"
+        f"trainer.summary_interval={steps_per_loop},"
+        f"trainer.checkpoint_interval={train_steps},"
+        f"task.model.backbone.type=resnet,"
+        f"task.model.backbone.resnet.model_id=50"
+    )
+    estimator = TensorFlow(
+        git_config={
+            "repo": "https://github.com/tensorflow/models.git",
+            "branch": "v2.9.2",
+        },
+        source_dir=".",
+        entry_point="official/vision/train.py",
+        model_dir=False,
+        instance_type=instance_type,
+        instance_count=2,
+        image_uri=image_uri,
+        hyperparameters={
+            "sagemaker_multi_worker_mirrored_strategy_enabled": True,
+            "experiment": "resnet_imagenet",
+            "config_file": "official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml",
+            "mode": "train",
+            "model_dir": "/opt/ml/model",
+            "params_override": overrides,
+        },
+        max_run=60 * 60 * 1,  # 1 hour
+        role="SageMakerRole",
+    )
+    estimator.fit(
+        inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/validation",
+        job_name=unique_name_from_base("test-tf-mwms"),
+    )
+    captured = capsys.readouterr()
+    logs = captured.out + captured.err
+    assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs
+    assert "TF_CONFIG=" in logs

From c662d1f98ddae82ed51ce695cf36f725eb39aba4 Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 14:15:01 -0700
Subject: [PATCH 2/9] Skipping CPU and lowering batch size

---
 test/integration/sagemaker/test_multi_worker_mirrored.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index 435f3d4e..e394ad94 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -17,6 +17,7 @@
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.utils import unique_name_from_base
+import pytest
 
 
 RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
 
@@ -44,19 +45,20 @@ def test_keras_example(
     assert "TF_CONFIG=" in logs
 
 
+@pytest.mark.skip_cpu
 def test_tf_model_garden(
     sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
 ):
     epochs = 10
-    batch_size = 512
-    train_steps = int(1024 * epochs / batch_size)
+    global_batch_size = 64
+    train_steps = int(1024 * epochs / global_batch_size)
     steps_per_loop = train_steps // 10
     overrides = (
         f"runtime.enable_xla=False,"
         f"runtime.num_gpus=1,"
         f"runtime.distribution_strategy=multi_worker_mirrored,"
         f"runtime.mixed_precision_dtype=float16,"
-        f"task.train_data.global_batch_size={batch_size},"
+        f"task.train_data.global_batch_size={global_batch_size},"
         f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
         f"task.train_data.cache=True,"
         f"trainer.train_steps={train_steps},"

From cb785b3cb2cf2429fd871872920e4e188c7d3984 Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 15:16:05 -0700
Subject: [PATCH 3/9] Reduce Keras verbosity in MWMS tests

---
 test/resources/multi_worker_mirrored/train_dummy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/resources/multi_worker_mirrored/train_dummy.py b/test/resources/multi_worker_mirrored/train_dummy.py
index 7552e019..c347a761 100644
--- a/test/resources/multi_worker_mirrored/train_dummy.py
+++ b/test/resources/multi_worker_mirrored/train_dummy.py
@@ -45,4 +45,4 @@ def build_and_compile_cnn_model():
 
     # Model building/compiling need to be within `strategy.scope()`.
     multi_worker_model = build_and_compile_cnn_model()
-multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)
+multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70, verbose=2)
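
Note: a minimal, self-contained stand-in for what train_dummy.py does. Only the
fit() call and the scope comment are visible in the diff above; the model and
dataset here are assumptions chosen so the sketch runs anywhere:

    import numpy as np
    import tensorflow as tf

    # With no TF_CONFIG in the environment, MultiWorkerMirroredStrategy falls
    # back to a single local worker, so this sketch runs standalone.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        model = tf.keras.Sequential(
            [tf.keras.layers.Dense(10, activation="softmax", input_shape=(784,))]
        )
        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=["accuracy"],
        )

    features = np.random.random((64, 784)).astype("float32")
    labels = np.random.randint(0, 10, size=(64,))
    dataset = tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(32)

    # verbose=2 prints one summary line per epoch instead of an interactive
    # progress bar, which keeps the interleaved per-worker CloudWatch logs short.
    model.fit(dataset, epochs=3, steps_per_epoch=70, verbose=2)
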
From 1706b6ae60b036ac40741c2447543d2bb648e231 Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 15:16:27 -0700
Subject: [PATCH 4/9] Changing MWMS to use port 2222

---
 src/sagemaker_tensorflow_container/training.py | 2 +-
 test/unit/test_training.py                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py
index 090d6d66..4d1965f4 100644
--- a/src/sagemaker_tensorflow_container/training.py
+++ b/src/sagemaker_tensorflow_container/training.py
@@ -103,7 +103,7 @@ def _build_tf_config_for_mwms(hosts, current_host):
     """
     workers = hosts
 
-    def host_addresses(hosts, port=8890):
+    def host_addresses(hosts, port=2222):
         return ["{}:{}".format(host, port) for host in hosts]
 
     tf_config = {"cluster": {}, "environment": "cloud"}
diff --git a/test/unit/test_training.py b/test/unit/test_training.py
index d955a6da..68b72757 100644
--- a/test/unit/test_training.py
+++ b/test/unit/test_training.py
@@ -35,7 +35,7 @@
     "worker": ["{}:2222".format(HOST2)],
     "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)],
 }
-CLUSTER_WITH_MWMS = {"worker": ["{}:8890".format(HOST) for HOST in HOST_LIST]}
+CLUSTER_WITH_MWMS = {"worker": ["{}:2222".format(HOST) for HOST in HOST_LIST]}
 
 MASTER_TASK = {"index": 0, "type": "master"}
 WORKER_TASK = {"index": 0, "type": "worker"}

From 895324be4d702bf144452fb5d3a4d3dc0772c9ae Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 15:16:53 -0700
Subject: [PATCH 5/9] Fix linting

---
 test/integration/sagemaker/test_multi_worker_mirrored.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index e394ad94..a59d861e 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -14,10 +14,10 @@
 
 import os
 
+import pytest
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.utils import unique_name_from_base
-import pytest
 
 
 RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
 
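
Note: a sketch of the TF_CONFIG that _build_tf_config_for_mwms produces after
PATCH 4/9. The host_addresses helper, the tf_config initializer, and the
expected worker/task values come from the diffs and unit-test fixtures above;
the cluster/task assembly in between is a plausible reconstruction, not the
toolkit's verbatim code:

    import json
    import os


    def _build_tf_config_for_mwms(hosts, current_host):
        workers = hosts

        def host_addresses(hosts, port=2222):
            # 2222 matches the port the worker pool already uses in the
            # parameter-server cluster fixtures, replacing 8890.
            return ["{}:{}".format(host, port) for host in hosts]

        tf_config = {"cluster": {}, "environment": "cloud"}
        tf_config["cluster"]["worker"] = host_addresses(workers)
        tf_config["task"] = {"index": workers.index(current_host), "type": "worker"}
        os.environ["TF_CONFIG"] = json.dumps(tf_config)


    # "algo-1"/"algo-2" are the host names SageMaker assigns in a two-node job.
    _build_tf_config_for_mwms(["algo-1", "algo-2"], "algo-1")
    print(os.environ["TF_CONFIG"])
    # {"cluster": {"worker": ["algo-1:2222", "algo-2:2222"]},
    #  "environment": "cloud", "task": {"index": 0, "type": "worker"}}
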
From 5da90ca9eeccb474be85b7782e1d8401d2b7819f Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 17:00:54 -0700
Subject: [PATCH 6/9] Increasing dataset size for MWMS test

---
 .../sagemaker/test_multi_worker_mirrored.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index a59d861e..295269b8 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -49,17 +49,17 @@ def test_tf_model_garden(
     sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
 ):
-    epochs = 10
+    epochs = 1
     global_batch_size = 64
-    train_steps = int(1024 * epochs / global_batch_size)
-    steps_per_loop = train_steps // 10
+    train_steps = int(10**6 * epochs / global_batch_size)
+    steps_per_loop = train_steps // 100
     overrides = (
         f"runtime.enable_xla=False,"
         f"runtime.num_gpus=1,"
         f"runtime.distribution_strategy=multi_worker_mirrored,"
         f"runtime.mixed_precision_dtype=float16,"
         f"task.train_data.global_batch_size={global_batch_size},"
-        f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
+        f"task.train_data.input_path=/opt/ml/input/data/training/train*,"
         f"task.train_data.cache=True,"
         f"trainer.train_steps={train_steps},"
         f"trainer.steps_per_loop={steps_per_loop},"
         f"trainer.summary_interval={steps_per_loop},"
         f"trainer.checkpoint_interval={train_steps},"
@@ -87,11 +87,14 @@ def test_tf_model_garden(
             "model_dir": "/opt/ml/model",
             "params_override": overrides,
         },
-        max_run=60 * 60 * 1,  # 1 hour
+        environment={
+            'NCCL_DEBUG': 'INFO',
+        },
+        max_run=60 * 60 * 12,  # 12 hours
         role="SageMakerRole",
     )
     estimator.fit(
-        inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/validation",
+        inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/train",
         job_name=unique_name_from_base("test-tf-mwms"),
     )
     captured = capsys.readouterr()

From b6fb6eb711ceddfd784a0f970abf60280d10c23f Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 18:09:08 -0700
Subject: [PATCH 7/9] Advancing test dependency versions

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 439cfbc5..279e7ad9 100644
--- a/setup.py
+++ b/setup.py
@@ -38,13 +38,13 @@ def read_version():
     "pytest-rerunfailures",
     "mock",
     "sagemaker[local]>=2",
-    "tensorflow<2.4",
+    "tensorflow>=2.9",
     "docker-compose",
-    "boto3==1.16.34",
+    "boto3",
     "python-dateutil>=2.1,<2.8.1",
-    "botocore==1.19.34",
+    "botocore",
     "requests-mock",
-    "awscli==1.18.194",
+    "awscli",
     "protobuf>=3.20,<3.21"
 ]

From 00f83a5383a39759fafdd376360a402362ea5520 Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 18:49:54 -0700
Subject: [PATCH 8/9] Increasing disk size on training job to test MWMS

---
 test/integration/sagemaker/test_multi_worker_mirrored.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index 295269b8..7512ae66 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -92,6 +92,7 @@ def test_tf_model_garden(
         },
         max_run=60 * 60 * 12,  # 12 hours
         role="SageMakerRole",
+        volume_size=400,
     )
     estimator.fit(
         inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/train",
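
Note: the step arithmetic behind the trainer.* overrides in PATCH 6/9, which
PATCH 9/9 then shrinks so the loop fits on one GPU per node. The epoch sizes
are the nominal example counts used by the test, not measured dataset sizes:

    epochs = 1
    global_batch_size = 64

    for nominal_examples in (10**6, 10**5):  # PATCH 6/9 value, then PATCH 9/9
        train_steps = int(nominal_examples * epochs / global_batch_size)
        steps_per_loop = train_steps // 100
        print(nominal_examples, train_steps, steps_per_loop)

    # 1000000 15625 156  -> summaries every 156 steps, one checkpoint at 15625
    # 100000 1562 15
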
From 9d4a52896387447a9399709fe90fc2eaf8c231aa Mon Sep 17 00:00:00 2001
From: Loki
Date: Tue, 14 Jun 2022 22:44:44 -0700
Subject: [PATCH 9/9] Reducing the size of the training loop to fit in a p3.2xl

---
 test/integration/sagemaker/test_multi_worker_mirrored.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py
index 7512ae66..0472fe32 100644
--- a/test/integration/sagemaker/test_multi_worker_mirrored.py
+++ b/test/integration/sagemaker/test_multi_worker_mirrored.py
@@ -51,7 +51,7 @@
 ):
     epochs = 1
     global_batch_size = 64
-    train_steps = int(10**6 * epochs / global_batch_size)
+    train_steps = int(10**5 * epochs / global_batch_size)
     steps_per_loop = train_steps // 100
     overrides = (
         f"runtime.enable_xla=False,"
@@ -59,7 +59,7 @@
         f"runtime.distribution_strategy=multi_worker_mirrored,"
         f"runtime.mixed_precision_dtype=float16,"
         f"task.train_data.global_batch_size={global_batch_size},"
-        f"task.train_data.input_path=/opt/ml/input/data/training/train*,"
+        f"task.train_data.input_path=/opt/ml/input/data/training/train-000*,"
         f"task.train_data.cache=True,"
         f"trainer.train_steps={train_steps},"
         f"trainer.steps_per_loop={steps_per_loop},"
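
Note: both entry points in this series rely on TensorFlow picking up the
TF_CONFIG that training.py exports before handing control to the script. A
minimal probe built only on standard TensorFlow 2.x API (not code from this
repo) that prints the same "TF_CONFIG=" marker the integration tests grep for:

    import os

    import tensorflow as tf

    print("TF_CONFIG={}".format(os.environ.get("TF_CONFIG", "")))

    # Constructing the strategy joins the worker cluster described by TF_CONFIG;
    # without TF_CONFIG it degrades gracefully to a single local worker.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    print("replicas in sync: {}".format(strategy.num_replicas_in_sync))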