Fix bug in ModelTrainer script mode setup (#1575)

benieric · pintaoz-aws · commit 869b75f3d09d · 2024-12-04T04:38:29.000-08:00
diff --git a/.gitignore b/.gitignore
@@ -34,6 +34,7 @@ env/
 **/_repack_script_launcher.sh
 src/sagemaker/modules/train/container_drivers/sm_train.sh
 src/sagemaker/modules/train/container_drivers/sourcecodeconfig.json
+src/sagemaker/modules/train/container_drivers/distribution.json
 tests/data/**/_repack_model.py
 tests/data/experiment/sagemaker-dev-1.0.tar.gz
 src/sagemaker/serve/tmp_workspace
diff --git a/src/sagemaker/modules/constants.py b/src/sagemaker/modules/constants.py
@@ -26,6 +26,7 @@
 )
 
 SOURCE_CODE_CONFIG_JSON = "sourcecodeconfig.json"
+DISTRIBUTION_JSON = "distribution.json"
 TRAIN_SCRIPT = "sm_train.sh"
 
 DEFAULT_CONTAINER_ENTRYPOINT = ["/bin/bash"]
diff --git a/src/sagemaker/modules/templates.py b/src/sagemaker/modules/templates.py
@@ -77,6 +77,10 @@
 cat /opt/ml/input/data/sm_drivers/sourcecodeconfig.json
 echo
 
+echo "/opt/ml/input/data/sm_drivers/distribution.json"
+cat /opt/ml/input/data/sm_drivers/distribution.json
+echo
+
 echo "Setting up environment variables"
 $SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/scripts/environment.py
 source /opt/ml/input/data/sm_drivers/scripts/sm_training.env
diff --git a/src/sagemaker/modules/train/container_drivers/mpi_driver.py b/src/sagemaker/modules/train/container_drivers/mpi_driver.py
@@ -19,6 +19,7 @@
 from utils import (
     logger,
     read_source_code_config_json,
+    read_distribution_json,
     get_process_count,
     execute_commands,
     write_failure_file,
@@ -55,7 +56,7 @@ def main():
 
     """
     source_code_config = read_source_code_config_json()
-    distribution = source_code_config.get("distribution", {})
+    distribution = read_distribution_json()
     sm_distributed_settings = distribution.get("smdistributed_settings", {})
 
     sm_current_host = os.environ["SM_CURRENT_HOST"]
@@ -73,7 +74,7 @@ def main():
 
         host_list = json.loads(os.environ["SM_HOSTS"])
         host_count = int(os.environ["SM_HOST_COUNT"])
-        process_count = get_process_count(source_code_config)
+        process_count = get_process_count(distribution)
 
         if process_count > 1:
             host_list = ["{}:{}".format(host, process_count) for host in host_list]
diff --git a/src/sagemaker/modules/train/container_drivers/pytorch_driver.py b/src/sagemaker/modules/train/container_drivers/pytorch_driver.py
@@ -20,6 +20,7 @@
 from utils import (
     logger,
     read_source_code_config_json,
+    read_distribution_json,
     get_process_count,
     get_python_executable,
     SM_EFA_NCCL_INSTANCES,
@@ -62,8 +63,9 @@ def setup_env():
 def create_commands():
     """Create the Torch Distributed command to execute"""
     source_code_config = read_source_code_config_json()
+    distribution = read_distribution_json()
 
-    process_count = get_process_count(source_code_config)
+    process_count = get_process_count(distribution)
     host_count = int(os.environ["SM_HOST_COUNT"])
 
     torch_cmd = []
diff --git a/src/sagemaker/modules/train/container_drivers/utils.py b/src/sagemaker/modules/train/container_drivers/utils.py
@@ -36,8 +36,9 @@
 TrainingJob - {os.environ['TRAINING_JOB_NAME']}
 """
 
-USER_CODE_PATH = "/opt/ml/input/data/code"
-SOURCE_CODE_CONFIG_JSON = "/opt/ml/input/data/sm_code/sourcecodeconfig.json"
+USER_CODE_PATH = "/opt/ml/input/data/sm_code"
+SOURCE_CODE_CONFIG_JSON = "/opt/ml/input/data/sm_drivers/sourcecodeconfig.json"
+DISTRIBUTION_JSON = "/opt/ml/input/data/sm_drivers/distribution.json"
 
 SM_EFA_NCCL_INSTANCES = [
     "ml.g4dn.8xlarge",
@@ -67,19 +68,25 @@ def write_failure_file(message: str = DEFAULT_FAILURE_MESSAGE):
 def read_source_code_config_json(source_code_config_file: Dict[str, Any] = SOURCE_CODE_CONFIG_JSON):
     """Read the source code config json file."""
     with open(source_code_config_file, "r") as f:
-        distribution_config = json.load(f)
-    return distribution_config
+        source_code_config_json = json.load(f)
+    return source_code_config_json
 
 
-def get_process_count(source_code_config: Dict[str, Any]) -> int:
+def read_distribution_json(distribution_file: Dict[str, Any] = DISTRIBUTION_JSON):
+    """Read the distribution json file."""
+    with open(distribution_file, "r") as f:
+        distribution_json = json.load(f)
+    return distribution_json
+
+
+def get_process_count(distribution: Dict[str, Any]) -> int:
     """Get the number of processes to run on each node in the training job."""
-    if source_code_config.get("distribution", {}).get("process_count_per_node") is not None:
-        return int(source_code_config["distribution"]["process_count_per_node"])
-    if os.environ.get("SM_NUM_GPUS") is not None:
-        return int(os.environ["SM_NUM_GPUS"])
-    if os.environ.get("SM_NUM_NEURONS") is not None:
-        return int(os.environ["SM_NUM_NEURONS"])
-    return 1  # Default to 1 process per node
+    return (
+        int(distribution.get("process_count_per_node", 0))
+        or int(os.environ.get("SM_NUM_GPUS", 0))
+        or int(os.environ.get("SM_NUM_NEURONS", 0))
+        or 1
+    )
 
 
 def get_python_executable() -> str:
diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py
@@ -70,6 +70,7 @@
     DEFAULT_CONTAINER_ENTRYPOINT,
     DEFAULT_CONTAINER_ARGUMENTS,
     SOURCE_CODE_CONFIG_JSON,
+    DISTRIBUTION_JSON,
 )
 from sagemaker.modules.templates import (
     TRAIN_SCRIPT_TEMPLATE,
@@ -385,6 +386,7 @@ def train(
 
             self._prepare_train_script(
                 source_code_config=self.source_code_config,
+                distribution_config=self.distribution_config,
             )
             if self.distribution_config:
                 smd_modelparallel_parameters = getattr(
@@ -397,6 +399,8 @@ def train(
                         smd_modelparallel_parameters
                     )
             self._write_source_code_config_json(self.source_code_config)
+            if self.distribution_config:
+                self._write_distribution_config_json(self.distribution_config)
 
             # Create an input channel for drivers packaged by the sdk
             sm_drivers_channel = self.create_input_data_channel(SM_DRIVERS, SM_DRIVERS_LOCAL_PATH)
@@ -555,6 +559,14 @@ def _write_source_code_config_json(self, source_code_config: SourceCodeConfig):
         with open(file_path, "w") as f:
             f.write(source_code_config.model_dump_json())
 
+    def _write_distribution_config_json(
+        self, distribution: Union[MPIDistributionConfig, TorchDistributionConfig]
+    ):
+        """Write the distribution configuration to a JSON file."""
+        file_path = os.path.join(SM_DRIVERS_LOCAL_PATH, DISTRIBUTION_JSON)
+        with open(file_path, "w") as f:
+            f.write(distribution.model_dump_json())
+
     def _prepare_train_script(
         self,
         source_code_config: SourceCodeConfig,

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@`
`26`	`26`	`)`
`27`	`27`
`28`	`28`	`SOURCE_CODE_CONFIG_JSON = "sourcecodeconfig.json"`
	`29`	`+DISTRIBUTION_JSON = "distribution.json"`
`29`	`30`	`TRAIN_SCRIPT = "sm_train.sh"`
`30`	`31`
`31`	`32`	`DEFAULT_CONTAINER_ENTRYPOINT = ["/bin/bash"]`