Commit 9c75b2b

nargokul authored and pintaoz-aws committed
Model Trainer Bucket improvements (#1618)
* Model Trainer Bucket improvements
* Address Comments
* Unit test fix
* Unit test fix
* Codestyle
* Codestyle
* Codestyle
* Fixes
* Fixes
* Fixes
* Fixes
* Fixes
1 parent 775a627 commit 9c75b2b

File tree: 2 files changed (+67 -19 lines changed)


src/sagemaker/modules/train/model_trainer.py  (+53 -12)
@@ -135,6 +135,8 @@ class ModelTrainer(BaseModel):
             The SageMakerCore session. For convinience, can be imported like:
             `from sagemaker.modules import Session`.
             If not specified, a new session will be created.
+            If the default bucket for the artifacts needs to be updated, it can be done by
+            passing it in the Session object.
         role (Optional(str)):
             The IAM role ARN for the training job.
             If not specified, the default SageMaker execution role will be used.
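For context on the docstring addition above, a minimal usage sketch follows. It assumes the modules `Session` accepts `default_bucket` and `default_bucket_prefix` keyword arguments (as the classic `sagemaker.session.Session` does); the bucket, prefix, and image URI are placeholders, not values from this change.

# Sketch only: route training artifacts to a custom bucket/prefix via the Session.
# Assumes Session(default_bucket=..., default_bucket_prefix=...) is supported,
# mirroring sagemaker.session.Session; names below are placeholders.
from sagemaker.modules import Session
from sagemaker.modules.train.model_trainer import ModelTrainer

session = Session(
    default_bucket="my-training-bucket",
    default_bucket_prefix="team-a",
)
trainer = ModelTrainer(
    training_image="<training-image-uri>",  # placeholder image URI
    sagemaker_session=session,
)
# Output then defaults to s3://my-training-bucket/team-a/<base_job_name>/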
@@ -173,7 +175,8 @@ class ModelTrainer(BaseModel):
         output_data_config (Optional[OutputDataConfig]):
             The output data configuration. This is used to specify the output data location
             for the training job.
-            If not specified, will default to `s3://<default_bucket>/<base_job_name>/output/`.
+            If not specified in the session, will default to
+            `s3://<default_bucket>/<default_prefix>/<base_job_name>/`.
         input_data_config (Optional[List[Union[Channel, InputData]]]):
             The input data config for the training job.
             Takes a list of Channel or InputData objects. An InputDataSource can be an S3 URI
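A small worked example of the documented default output location (placeholder values only, matching the constants used in the unit tests below):

# Placeholder values; shows how the documented default output location is assembled.
default_bucket = "sagemaker-us-west-2-000000000000"
default_prefix = "sample-prefix"
base_job_name = "dummy-image-job"
print(f"s3://{default_bucket}/{default_prefix}/{base_job_name}")
# s3://sagemaker-us-west-2-000000000000/sample-prefix/dummy-image-job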
@@ -348,7 +351,7 @@ def _populate_intelligent_defaults_from_model_trainer_space(self):
                     configurable_attribute
                 )(
                     **default_config  # pylint: disable=E1134
-                )  # noqa
+                )
                 setattr(self, configurable_attribute, default_config)
 
     def __del__(self):
@@ -461,7 +464,8 @@ def model_post_init(self, __context: Any):
             session = self.sagemaker_session
             base_job_name = self.base_job_name
             self.output_data_config = OutputDataConfig(
-                s3_output_path=f"s3://{session.default_bucket()}/{base_job_name}",
+                s3_output_path=f"s3://{self._fetch_bucket_name_and_prefix(session)}"
+                f"/{base_job_name}",
                 compression_type="GZIP",
                 kms_key_id=None,
             )
@@ -473,6 +477,12 @@ def model_post_init(self, __context: Any):
         if self.training_image:
             logger.info(f"Training image URI: {self.training_image}")
 
+    def _fetch_bucket_name_and_prefix(self, session: Session) -> str:
+        """Helper function to get the bucket name with the corresponding prefix if applicable"""
+        if session.default_bucket_prefix is not None:
+            return f"{session.default_bucket()}/{session.default_bucket_prefix}"
+        return session.default_bucket()
+
     @_telemetry_emitter(feature=Feature.MODEL_TRAINER, func_name="model_trainer.train")
     @validate_call
     def train(
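A quick check of the helper's two branches, using a mock in place of a real Session. This is a sketch, not part of the commit's tests; the call goes through the class because `self` is unused inside the helper, so no ModelTrainer instance is needed.

# Sketch: exercise _fetch_bucket_name_and_prefix with a mocked Session.
from unittest.mock import MagicMock

from sagemaker.modules.train.model_trainer import ModelTrainer

session = MagicMock()
session.default_bucket.return_value = "my-bucket"

session.default_bucket_prefix = "team-a"
assert ModelTrainer._fetch_bucket_name_and_prefix(None, session) == "my-bucket/team-a"

session.default_bucket_prefix = None  # no prefix configured
assert ModelTrainer._fetch_bucket_name_and_prefix(None, session) == "my-bucket"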
@@ -497,12 +507,16 @@ def train(
                 Defaults to True.
         """
         self._populate_intelligent_defaults()
+        current_training_job_name = _get_unique_name(self.base_job_name)
+        input_data_key_prefix = f"{self.base_job_name}/{current_training_job_name}/input"
         if input_data_config:
             self.input_data_config = input_data_config
 
         input_data_config = []
         if self.input_data_config:
-            input_data_config = self._get_input_data_config(self.input_data_config)
+            input_data_config = self._get_input_data_config(
+                self.input_data_config, input_data_key_prefix
+            )
 
         string_hyper_parameters = {}
         if self.hyperparameters:
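The two added lines above give each run its own input area under the base job name. Written out with placeholder names (the unique job-name suffix produced by `_get_unique_name` is illustrative, not its real format):

# Placeholder values; the real suffix comes from _get_unique_name().
base_job_name = "my-job"
current_training_job_name = "my-job-2025-01-01-00-00-00-000"  # illustrative unique name
input_data_key_prefix = f"{base_job_name}/{current_training_job_name}/input"
print(input_data_key_prefix)
# my-job/my-job-2025-01-01-00-00-00-000/input
# Uploaded channels then land under:
#   s3://<default_bucket>[/<default_bucket_prefix>]/my-job/my-job-.../input/<channel_name>/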
@@ -524,7 +538,9 @@ def train(
             # The source code will be mounted at /opt/ml/input/data/sm_code in the container
             if self.source_code.source_dir:
                 source_code_channel = self.create_input_data_channel(
-                    SM_CODE, self.source_code.source_dir
+                    channel_name=SM_CODE,
+                    data_source=self.source_code.source_dir,
+                    key_prefix=input_data_key_prefix,
                 )
                 input_data_config.append(source_code_channel)
 
@@ -542,7 +558,11 @@ def train(
                 self._write_distributed_json(tmp_dir=drivers_dir, distributed=self.distributed)
 
                 # Create an input channel for drivers packaged by the sdk
-                sm_drivers_channel = self.create_input_data_channel(SM_DRIVERS, drivers_dir.name)
+                sm_drivers_channel = self.create_input_data_channel(
+                    channel_name=SM_DRIVERS,
+                    data_source=drivers_dir.name,
+                    key_prefix=input_data_key_prefix,
+                )
                 input_data_config.append(sm_drivers_channel)
 
                 # If source_code is provided, we will always use
@@ -567,7 +587,7 @@ def train(
 
         if self.training_mode == Mode.SAGEMAKER_TRAINING_JOB:
             training_job = TrainingJob.create(
-                training_job_name=_get_unique_name(self.base_job_name),
+                training_job_name=current_training_job_name,
                 algorithm_specification=algorithm_specification,
                 hyper_parameters=string_hyper_parameters,
                 input_data_config=input_data_config,
@@ -621,14 +641,22 @@ def train(
             )
             local_container.train(wait)
 
-    def create_input_data_channel(self, channel_name: str, data_source: DataSourceType) -> Channel:
+    def create_input_data_channel(
+        self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None
+    ) -> Channel:
         """Create an input data channel for the training job.
 
         Args:
             channel_name (str): The name of the input data channel.
             data_source (DataSourceType): The data source for the input data channel.
                 DataSourceType can be an S3 URI string, local file path string,
                 S3DataSource object, or FileSystemDataSource object.
+            key_prefix (Optional[str]): The key prefix to use when uploading data to S3.
+                Only applicable when data_source is a local file path string.
+                If not specified, local data will be uploaded to:
+                    s3://<default_bucket_path>/<base_job_name>/input/<channel_name>/
+                If specified, local data will be uploaded to:
+                    s3://<default_bucket_path>/<key_prefix>/<channel_name>/
         """
         channel = None
         if isinstance(data_source, str):
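To make the documented destinations concrete, a hedged usage sketch; the `InputData` import path and the local directory are assumptions, not part of this diff.

# Sketch only: a local directory is uploaded by the SDK when passed as input data.
# Import path for InputData is assumed; "./data/train" is a placeholder directory.
from sagemaker.modules.configs import InputData

train_channel = InputData(channel_name="train", data_source="./data/train")
# trainer.train(input_data_config=[train_channel])
#
# With the key_prefix supplied by train() (new behavior), the upload lands under:
#   s3://<default_bucket>[/<default_bucket_prefix>]/<base_job_name>/<training_job_name>/input/train/
# Without a key_prefix (direct create_input_data_channel call), it falls back to:
#   s3://<default_bucket>[/<default_bucket_prefix>]/<base_job_name>/input/train/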
@@ -644,6 +672,10 @@ def create_input_data_channel(self, channel_name: str, data_source: DataSourceType) -> Channel:
                     ),
                     input_mode="File",
                 )
+                if key_prefix:
+                    logger.warning(
+                        "key_prefix is only applicable when data_source is a local file path."
+                    )
             elif _is_valid_path(data_source):
                 if self.training_mode == Mode.LOCAL_CONTAINER:
                     channel = Channel(
@@ -657,10 +689,17 @@ def create_input_data_channel(self, channel_name: str, data_source: DataSourceType) -> Channel:
                         input_mode="File",
                     )
                 else:
+                    key_prefix = (
+                        f"{key_prefix}/{channel_name}"
+                        if key_prefix
+                        else f"{self.base_job_name}/input/{channel_name}"
+                    )
+                    if self.sagemaker_session.default_bucket_prefix:
+                        key_prefix = f"{self.sagemaker_session.default_bucket_prefix}/{key_prefix}"
                     s3_uri = self.sagemaker_session.upload_data(
                         path=data_source,
                         bucket=self.sagemaker_session.default_bucket(),
-                        key_prefix=f"{self.base_job_name}/input/{channel_name}",
+                        key_prefix=key_prefix,
                     )
                     channel = Channel(
                         channel_name=channel_name,
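The same resolution, pulled out as a standalone sketch with placeholder values:

# Placeholder values; mirrors the key_prefix resolution in the S3 upload branch above.
base_job_name = "my-job"
channel_name = "train"
key_prefix = "my-job/my-job-2025-01-01-00-00-00-000/input"  # from train(); may be None
default_bucket_prefix = "sample-prefix"  # from the Session; may be None

key = f"{key_prefix}/{channel_name}" if key_prefix else f"{base_job_name}/input/{channel_name}"
if default_bucket_prefix:
    key = f"{default_bucket_prefix}/{key}"
print(key)
# sample-prefix/my-job/my-job-2025-01-01-00-00-00-000/input/train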
@@ -687,7 +726,9 @@ def create_input_data_channel(self, channel_name: str, data_source: DataSourceType) -> Channel:
         return channel
 
     def _get_input_data_config(
-        self, input_data_channels: Optional[List[Union[Channel, InputData]]]
+        self,
+        input_data_channels: Optional[List[Union[Channel, InputData]]],
+        key_prefix: Optional[str] = None,
     ) -> List[Channel]:
         """Get the input data configuration for the training job.
 
@@ -706,7 +747,7 @@ def _get_input_data_config(
                 channels.append(input_data)
             elif isinstance(input_data, InputData):
                 channel = self.create_input_data_channel(
-                    input_data.channel_name, input_data.data_source
+                    input_data.channel_name, input_data.data_source, key_prefix=key_prefix
                 )
                 channels.append(channel)
             else:
@@ -850,7 +891,7 @@ def from_recipe(
             An array of key-value pairs. You can use tags to categorize your AWS resources
             in different ways, for example, by purpose, owner, or environment.
         sagemaker_session (Optional[Session]):
-            The SageMaker session.
+            The SageMakerCore session.
             If not specified, a new session will be created.
         role (Optional[str]):
             The IAM role ARN for the training job.

tests/unit/sagemaker/modules/train/test_model_trainer.py  (+14 -7)
@@ -59,9 +59,10 @@
 DEFAULT_IMAGE = "000000000000.dkr.ecr.us-west-2.amazonaws.com/dummy-image:latest"
 DEFAULT_BUCKET = "sagemaker-us-west-2-000000000000"
 DEFAULT_ROLE = "arn:aws:iam::000000000000:role/test-role"
+DEFAULT_BUCKET_PREFIX = "sample-prefix"
 DEFAULT_COMPUTE_CONFIG = Compute(instance_type=DEFAULT_INSTANCE_TYPE, instance_count=1)
 DEFAULT_OUTPUT_DATA_CONFIG = OutputDataConfig(
-    s3_output_path=f"s3://{DEFAULT_BUCKET}/{DEFAULT_BASE_NAME}",
+    s3_output_path=f"s3://{DEFAULT_BUCKET}/{DEFAULT_BUCKET_PREFIX}/{DEFAULT_BASE_NAME}",
     compression_type="GZIP",
     kms_key_id=None,
 )
@@ -85,6 +86,7 @@ def modules_session():
         session_instance = session_mock.return_value
         session_instance.default_bucket.return_value = DEFAULT_BUCKET
         session_instance.get_caller_identity_arn.return_value = DEFAULT_ROLE
+        session_instance.default_bucket_prefix = DEFAULT_BUCKET_PREFIX
         session_instance.boto_session = MagicMock(spec="boto3.session.Session")
         yield session_instance
 
@@ -170,8 +172,9 @@ def test_train_with_default_params(mock_training_job, model_trainer):
 
 @patch("sagemaker.modules.train.model_trainer.TrainingJob")
 @patch("sagemaker.modules.train.model_trainer.resolve_value_from_config")
+@patch("sagemaker.modules.train.model_trainer.ModelTrainer.create_input_data_channel")
 def test_train_with_intelligent_defaults(
-    mock_resolve_value_from_config, mock_training_job, model_trainer
+    mock_create_input_data_channel, mock_resolve_value_from_config, mock_training_job, model_trainer
 ):
     source_code_path = _simple_path(SAGEMAKER, PYTHON_SDK, MODULES, MODEL_TRAINER, "sourceCode")
 
@@ -229,7 +232,11 @@ def test_train_with_intelligent_defaults_training_job_space(
             max_pending_time_in_seconds=None,
         ),
         output_data_config=OutputDataConfig(
-            s3_output_path="s3://" "sagemaker-us-west-2" "-000000000000/d" "ummy-image-job",
+            s3_output_path="s3://"
+            "sagemaker-us-west-2"
+            "-000000000000/"
+            "sample-prefix/"
+            "dummy-image-job",
             kms_key_id=None,
             compression_type="GZIP",
         ),
@@ -258,7 +265,7 @@ def test_train_with_input_data_channels(mock_get_input_config, mock_training_job
 
         model_trainer.train(input_data_config=mock_input_data_config)
 
-        mock_get_input_config.assert_called_once_with(mock_input_data_config)
+        mock_get_input_config.assert_called_once_with(mock_input_data_config, ANY)
         mock_training_job.create.assert_called_once()
 
 
@@ -309,10 +316,11 @@ def test_train_with_input_data_channels(mock_get_input_config, mock_training_job
     ],
 )
 @patch("sagemaker.modules.train.model_trainer.Session.upload_data")
-def test_create_input_data_channel(mock_upload_data, model_trainer, test_case):
+@patch("sagemaker.modules.train.model_trainer.Session.default_bucket")
+def test_create_input_data_channel(mock_default_bucket, mock_upload_data, model_trainer, test_case):
     expected_s3_uri = f"s3://{DEFAULT_BUCKET}/{DEFAULT_BASE_NAME}-job/input/test"
     mock_upload_data.return_value = expected_s3_uri
-
+    mock_default_bucket.return_value = DEFAULT_BUCKET
     if not test_case["valid"]:
         with pytest.raises(ValueError):
             model_trainer.create_input_data_channel(
@@ -323,7 +331,6 @@ def test_create_input_data_channel(mock_upload_data, model_trainer, test_case):
                 test_case["channel_name"], test_case["data_source"]
             )
             assert channel.channel_name == test_case["channel_name"]
-
             if isinstance(test_case["data_source"], S3DataSource):
                 assert channel.data_source.s3_data_source == test_case["data_source"]
             elif isinstance(test_case["data_source"], FileSystemDataSource):
