
Commit f814430

fix: black formatting
1 parent 20434df commit f814430

File tree

5 files changed: +74 -53 lines changed


src/sagemaker/fw_utils.py (+33 -22)
@@ -134,14 +134,9 @@
     "1.12.0",
 ]
 
-TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS = [
-    "1.11",
-    "1.11.0"
-]
+TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS = ["1.11", "1.11.0"]
 
-TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = [
-    "torch_distributed"
-]
+TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
 
 SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
 
@@ -710,7 +705,14 @@ def _validate_smdataparallel_args(
 
 
 def validate_distribution(
-    distribution, instance_groups, framework_name, framework_version, py_version, image_uri, entry_point, kwargs
+    distribution,
+    instance_groups,
+    framework_name,
+    framework_version,
+    py_version,
+    image_uri,
+    entry_point,
+    kwargs,
 ):
     """Check if distribution strategy is correctly invoked by the user.
 
@@ -850,9 +852,8 @@ def validate_distribution(
         )
     return distribution
 
-def validate_distribution_for_instance_type(
-    instance_type, distribution
-):
+
+def validate_distribution_for_instance_type(instance_type, distribution):
     """Check if the provided distribution strategy is supported for the instance_type
 
     Args:
@@ -869,11 +870,11 @@ def validate_distribution_for_instance_type(
         distribution_strategy = keys[0]
         if distribution_strategy != "torch_distributed":
             err_msg += (
-                    f"Provided distribution strategy {distribution_strategy} is not supported for"
-                    " Trainium instances.\n"
-                    "Please specify one of the following supported distribution strategies:"
-                    f" {TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES} \n"
-                    )
+                f"Provided distribution strategy {distribution_strategy} is not supported for"
+                " Trainium instances.\n"
+                "Please specify one of the following supported distribution strategies:"
+                f" {TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES} \n"
+            )
     elif len(keys) > 1:
         err_msg += (
             f"Multiple distribution strategies are not supported for Trainium instances.\n"
@@ -884,6 +885,7 @@ def validate_distribution_for_instance_type(
     if err_msg:
         raise ValueError(err_msg)
 
+
 def validate_pytorch_distribution(
     distribution, framework_name, framework_version, py_version, image_uri
 ):
@@ -940,8 +942,15 @@ def validate_pytorch_distribution(
     if err_msg:
         raise ValueError(err_msg)
 
+
 def validate_torch_distributed_distribution(
-    instance_type, distribution, framework_name, framework_version, py_version, image_uri, entry_point,
+    instance_type,
+    distribution,
+    framework_name,
+    framework_version,
+    py_version,
+    image_uri,
+    entry_point,
 ):
     """Check if torch_distributed distribution strategy is correctly invoked by the user.
 
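
For reference, the reflowed signature above matches how this commit's unit tests call the validator; a minimal sketch (the argument values are illustrative, not copied from the tests):

    from sagemaker import fw_utils

    # Expected to pass silently: PyTorch 1.11 on a Trainium instance with a
    # Python entry point is a supported torch_distributed configuration.
    fw_utils.validate_torch_distributed_distribution(
        instance_type="ml.trn1.2xlarge",
        distribution={"torch_distributed": {"enabled": True}},
        framework_name="pytorch",
        framework_version="1.11.0",
        py_version="py3",
        image_uri=None,
        entry_point="train.py",
    )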
@@ -1003,20 +1012,22 @@ def validate_torch_distributed_distribution(
             return
     else:
         err_msg += (
-                f"torch_distributed is currently supported only for trainium instances."
-                " Please refer https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training \
+            f"torch_distributed is currently supported only for trainium instances."
+            " Please refer https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training \
 for information regarding distributed training on non-trainium instances"
         )
 
     # Check entry point type
     if not entry_point.endswith(".py"):
-        err_msg += ("Unsupported entry point type for torch_distributed.\n"
-            "Only python programs (*.py) are supported."
+        err_msg += (
+            "Unsupported entry point type for torch_distributed.\n"
+            "Only python programs (*.py) are supported."
         )
-
+
     if err_msg:
         raise ValueError(err_msg)
 
+
 def python_deprecation_warning(framework, latest_supported_version):
     """Placeholder docstring"""
     return PYTHON_2_DEPRECATION_WARNING.format(
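
As a quick check on the validator reshaped above, this commit's unit tests exercise it with Trainium instance types; a minimal sketch along the same lines (the printed error text comes from the function itself, everything else mirrors the tests):

    from sagemaker.fw_utils import validate_distribution_for_instance_type

    # torch_distributed is the only entry in
    # TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES, so this returns quietly.
    validate_distribution_for_instance_type(
        instance_type="ml.trn1.32xlarge",
        distribution={"torch_distributed": {"enabled": True}},
    )

    # Any other strategy on a trn1 instance raises ValueError.
    try:
        validate_distribution_for_instance_type(
            instance_type="ml.trn1.32xlarge",
            distribution={"mpi": {"enabled": True}},
        )
    except ValueError as err:
        print(err)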

src/sagemaker/pytorch/estimator.py (+2 -2)
@@ -168,7 +168,7 @@ def __init__(
 
                 To learn more, see `Distributed PyTorch Training
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.
-
+
                 **To enable Torch Distributed (Trainium Instances):**
 
                 .. code:: python
@@ -177,7 +177,7 @@ def __init__(
                             "enabled": True
                         }
                     }
-                To learn more, see `Distributed PyTorch Training on Trainium
+                To learn more, see `Distributed PyTorch Training on Trainium
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.
 
                 **To enable MPI:**
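
Pulling the documented snippet together, a hedged end-to-end sketch of enabling torch_distributed on a Trainium estimator (the entry point, role ARN, and instance settings are placeholders; the framework and Python versions follow this commit's test fixtures):

    from sagemaker.pytorch import PyTorch

    estimator = PyTorch(
        entry_point="train.py",  # placeholder training script
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role
        framework_version="1.11.0",
        py_version="py3",
        instance_count=2,
        instance_type="ml.trn1.32xlarge",
        distribution={"torch_distributed": {"enabled": True}},
    )
    estimator.fit()  # add training inputs as appropriate for the placeholder script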

tests/conftest.py (+2 -3)
@@ -452,12 +452,11 @@ def torch_distributed_py_version():
     return "py3"
 
 
-@pytest.fixture(
-    scope="module", params=["1.11.0"]
-)
+@pytest.fixture(scope="module", params=["1.11.0"])
 def torch_distributed_framework_version(request):
     return request.param
 
+
 @pytest.fixture(scope="session")
 def cpu_instance_type(sagemaker_session, request):
     region = sagemaker_session.boto_session.region_name
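
For context, a module-scoped parametrized fixture like the one above runs each requesting test once per param; a self-contained sketch (the test function is hypothetical, only the fixture mirrors conftest.py):

    import pytest

    @pytest.fixture(scope="module", params=["1.11.0"])
    def torch_distributed_framework_version(request):
        return request.param

    def test_framework_version_is_supported(torch_distributed_framework_version):
        # Runs once for each param; here only "1.11.0".
        assert torch_distributed_framework_version in ["1.11", "1.11.0"]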

tests/data/torch_distributed/mnist_mlp_trainium.py (+31 -23)
@@ -16,16 +16,22 @@
 
 # Initialize XLA process group for torchrun
 import torch_xla.distributed.xla_backend
-torch.distributed.init_process_group('xla')
+
+torch.distributed.init_process_group("xla")
 
 # Global constants
 EPOCHS = 4
 WARMUP_STEPS = 2
 BATCH_SIZE = 32
 
 # Load MNIST train dataset
-train_dataset = mnist.MNIST(root=os.path.join('./MNIST_DATA_train', str(xm.get_ordinal())),
-                            train=True, download=True, transform=ToTensor())
+train_dataset = mnist.MNIST(
+    root=os.path.join("./MNIST_DATA_train", str(xm.get_ordinal())),
+    train=True,
+    download=True,
+    transform=ToTensor(),
+)
+
 
 def main():
     # XLA MP: get world size
@@ -34,7 +40,7 @@ def main():
     torch.manual_seed(0)
 
     # Move model to device and declare optimizer and loss function
-    device = 'xla'
+    device = "xla"
     model = MLP().to(device)
     # For multiprocessing, scale up learning rate
     optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size)
@@ -43,45 +49,47 @@ def main():
     # Prepare data loader
     train_sampler = None
     if world_size > 1:
-        train_sampler = DistributedSampler(train_dataset,
-                                           num_replicas=world_size,
-                                           rank=xm.get_ordinal(),
-                                           shuffle=True)
-    train_loader = DataLoader(train_dataset,
-                              batch_size=BATCH_SIZE,
-                              sampler=train_sampler,
-                              shuffle=False if train_sampler else True)
+        train_sampler = DistributedSampler(
+            train_dataset, num_replicas=world_size, rank=xm.get_ordinal(), shuffle=True
+        )
+    train_loader = DataLoader(
+        train_dataset,
+        batch_size=BATCH_SIZE,
+        sampler=train_sampler,
+        shuffle=False if train_sampler else True,
+    )
     # XLA MP: use MpDeviceLoader from torch_xla.distributed
     train_device_loader = pl.MpDeviceLoader(train_loader, device)
 
     # Run the training loop
-    print('----------Training ---------------')
+    print("----------Training ---------------")
     model.train()
     for epoch in range(EPOCHS):
         start = time.time()
-        print(f'Epoch: {epoch}')
+        print(f"Epoch: {epoch}")
         for idx, (train_x, train_label) in enumerate(train_device_loader):
             optimizer.zero_grad()
             train_x = train_x.view(train_x.size(0), -1)
             output = model(train_x)
             loss = loss_fn(output, train_label)
             loss.backward()
-            xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step
-            if idx < WARMUP_STEPS: # skip warmup iterations
+            xm.optimizer_step(optimizer)  # XLA MP: performs grad allreduce and optimizer step
+            if idx < WARMUP_STEPS:  # skip warmup iterations
                 start = time.time()
 
     # Compute statistics for the last epoch
-    interval = idx - WARMUP_STEPS # skip warmup iterations
+    interval = idx - WARMUP_STEPS  # skip warmup iterations
     throughput = interval / (time.time() - start)
     print("Train throughput (iter/sec): {}".format(throughput))
-    print("Final loss is {:0.4f}".format(loss.detach().to('cpu')))
+    print("Final loss is {:0.4f}".format(loss.detach().to("cpu")))
 
     # Save checkpoint for evaluation (xm.save ensures only one process save)
     os.makedirs("checkpoints", exist_ok=True)
-    checkpoint = {'state_dict': model.state_dict()}
-    xm.save(checkpoint,'checkpoints/checkpoint.pt')
+    checkpoint = {"state_dict": model.state_dict()}
+    xm.save(checkpoint, "checkpoints/checkpoint.pt")
+
+    print("----------End Training ---------------")
 
-    print('----------End Training ---------------')
 
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main()
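
Since xm.save writes the checkpoint from a single process, it can be inspected afterwards with plain torch.load; a minimal sketch (loading on CPU is an assumption, the path and dict key come from the script above):

    import torch

    checkpoint = torch.load("checkpoints/checkpoint.pt", map_location="cpu")
    state_dict = checkpoint["state_dict"]
    print(sorted(state_dict.keys()))  # the MLP parameters saved by xm.save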

tests/unit/test_fw_utils.py (+6 -3)
@@ -947,6 +947,7 @@ def test_validate_pytorchddp_raises():
         image_uri=None,
     )
 
+
 def test_validate_torch_distributed_not_raises():
     # Case 1: Framework is not PyTorch
     fw_utils.validate_torch_distributed_distribution(
@@ -979,6 +980,7 @@ def test_validate_torch_distributed_not_raises():
         image_uri="custom-container",
     )
 
+
 def test_validate_torch_distributed_raises():
     torch_distributed_enabled = {"torch_distributed": {"enabled": True}}
     # Case 1: Unsupported framework version
@@ -1001,6 +1003,7 @@ def test_validate_torch_distributed_raises():
         image_uri=None,
     )
 
+
 def test_validate_unsupported_distributions_trainium_raises():
     with pytest.raises(ValueError):
         mpi_enabled = {"mpi": {"enabled": True}}
@@ -1015,17 +1018,17 @@ def test_validate_unsupported_distributions_trainium_raises():
             distribution=mpi_enabled,
             instance_type="ml.trn1.32xlarge",
         )
-
+
     with pytest.raises(ValueError):
         pytorch_ddp_enabled = {"pytorch_ddp": {"enabled": True}}
         fw_utils.validate_distribution_for_instance_type(
             distribution=pytorch_ddp_enabled,
             instance_type="ml.trn1.32xlarge",
        )
-
+
     with pytest.raises(ValueError):
         smdataparallel_enabled = {"smdataparallel": {"enabled": True}}
         fw_utils.validate_distribution_for_instance_type(
             distribution=smdataparallel_enabled,
             instance_type="ml.trn1.32xlarge",
-        )
+        )
