
Commit a99ae84

nargokul, benieric, pintaoz-aws, and pravali96 committed
Trainer handshake (#1535)
* Base model trainer (#1521)
* Base model trainer
* flake8
* add testing notebook
* add param validation & set defaults
* Implement simple train method
* feature: support script mode with local train.sh (#1523)
* feature: support script mode with local train.sh
* Stop tracking train.sh and add it to .gitignore
* update message
* make dir if not exist
* fix docs
* fix: docstyle
* Address comments
* fix hyperparams
* Revert pydantic custom error
* pylint
* Image Spec refactoring and updates (#1525)
* Image Spec refactoring and updates
* Unit tests and update function for Image Spec
* Fix hugging face test
* Fix Tests
* Add unit tests for ModelTrainer (#1527)
* Add unit tests for ModelTrainer
* Flake8
* format
* Add example notebook (#1528)
* Add testing notebook
* format
* use smaller data
* remove large dataset
* update
* pylint
* flake8
* ignore docstyle in directories with test
* format
* format
* Add enviornment variable bootstrapping script (#1530)
* Add enviornment variables scripts
* format
* fix comment
* add docstrings
* fix comment
* feature: add utility function to capture local snapshot (#1524)
* local snapshot
* Update pip list command
* Remove function calls
* Address comments
* Address comments
* Change to make Model Trainer return a Model Object
* Fix
* Cleanup
* Support intelligent parameters (#1540)
* Support intelligent parameters
* fix codestyle
* Revert Image Spec (#1541)
* Cleanup ModelTrainer (#1542)
* General image builder (#1546)
* General image builder
* General image builder
* Fix codestyle
* Fix codestyle
* Move location
* Add warnings
* Add integ tests
* Fix integ test
* Fix integ test
* Fix region error
* Add region
* Latest Container Image (#1545)
* Latest Container Image
* Test Fixes
* Parameterized tests and some logic updates
* Test fixes
* Move to Image URI
* Fixes for unit test
* Fixes for unit test
* Fix codestyle error checks
* Cleanup ModelTrainer code (#1552)
* Updates
* feat: add pre-processing and post-processing logic to inference_spec (#1560)
* add pre-processing and post-processing logic to inference_spec
* fix format
* make accept_type and content_type optional
* remove accept_type and content_type from pre/post processing
* correct typo
* Add Distributed Training Support Model Trainer (#1536)
* Add path to set Additional Settings in ModelTrainer (#1555)
* Updates
* Mask Sensitive Env Logs in Container (#1568)
* Cleanup PR
* Codestyle fixes
* Update logic to use model parameter instead of model_path
* Fixes
* Fixes
* Tests
* Codestyle Fixes
* Codestyle Fixes
* Codestyle Fixes
* Codestyle Fixes

---------

Co-authored-by: Erick Benitez-Ramos <[email protected]>
Co-authored-by: pintaoz-aws <[email protected]>
Co-authored-by: Pravali Uppugunduri <[email protected]>
1 parent 694f8e9 commit a99ae84
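
In short, this commit wires ModelTrainer's output into ModelBuilder: train() now keeps the created TrainingJob on the trainer, and ModelBuilder.build() accepts a ModelTrainer, a sagemaker_core TrainingJob, or an Estimator as its model and resolves the S3 model artifacts from whichever is passed. A condensed sketch of the resulting flow, adapted from the notebook added below (the image URI and command are that notebook's example values, and the InferenceSpec/SchemaBuilder wiring shown there is omitted here):

from sagemaker import get_execution_role
from sagemaker.modules.configs import SourceCode
from sagemaker.modules.train.model_trainer import ModelTrainer
from sagemaker.serve import ModelBuilder

role = get_execution_role()
xgboost_image = "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest"

# Train: ModelTrainer creates a TrainingJob and stores it as _latest_training_job.
model_trainer = ModelTrainer(
    training_image=xgboost_image,
    source_code=SourceCode(command="echo 'Hello World' && env"),
)
model_trainer.train()

# Build: ModelBuilder pulls the artifacts location from the trainer's latest job.
model_builder = ModelBuilder(
    model=model_trainer,  # or model_trainer._latest_training_job, or an Estimator
    role_arn=role,
    image_uri=xgboost_image,
    instance_type="ml.c6i.xlarge",
)
model = model_builder.build()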

4 files changed (+195 -15 lines)

New file (+150 -0):

@@ -0,0 +1,150 @@
{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "from sagemaker_core.main.shapes import TrainingJob\n",
    "\n",
    "from sagemaker import Session, get_execution_role\n",
    "\n",
    "sagemaker_session = Session()\n",
    "role = get_execution_role()\n",
    "region = sagemaker_session.boto_region_name\n",
    "bucket = sagemaker_session.default_bucket()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "\n",
    "from sagemaker.modules.configs import SourceCode\n",
    "from sagemaker.modules.train.model_trainer import ModelTrainer\n",
    "\n",
    "xgboost_image = \"433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest\"\n",
    "\n",
    "source_code = SourceCode(\n",
    "    command=\"echo 'Hello World' && env\",\n",
    ")\n",
    "model_trainer = ModelTrainer(\n",
    "    training_image=xgboost_image,\n",
    "    source_code=source_code,\n",
    ")\n",
    "\n",
    "model_trainer.train()"
   ],
   "id": "4b3a4f7d1713685f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "from sagemaker.serve.builder.schema_builder import SchemaBuilder\n",
    "import pandas as pd\n",
    "from xgboost import XGBClassifier\n",
    "from sagemaker.serve.spec.inference_spec import InferenceSpec\n",
    "from sagemaker.serve import ModelBuilder\n",
    "\n",
    "data = {\n",
    "    'Name': ['Alice', 'Bob', 'Charlie']\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "schema_builder = SchemaBuilder(sample_input=df, sample_output=df)\n",
    "\n",
    "\n",
    "class XGBoostSpec(InferenceSpec):\n",
    "    def load(self, model_dir: str):\n",
    "        print(model_dir)\n",
    "        model = XGBClassifier()\n",
    "        model.load_model(model_dir + \"/xgboost-model\")\n",
    "        return model\n",
    "\n",
    "    def invoke(self, input_object: object, model: object):\n",
    "        prediction_probabilities = model.predict_proba(input_object)\n",
    "        predictions = np.argmax(prediction_probabilities, axis=1)\n",
    "        return predictions\n",
    "\n",
    "model_builder = ModelBuilder(\n",
    "    model=model_trainer,  # ModelTrainer object passed onto ModelBuilder directly\n",
    "    role_arn=role,\n",
    "    image_uri=xgboost_image,\n",
    "    inference_spec=XGBoostSpec(),\n",
    "    schema_builder=schema_builder,\n",
    "    instance_type=\"ml.c6i.xlarge\"\n",
    ")\n",
    "model=model_builder.build()\n",
    "predictor=model_builder.deploy()\n",
    "\n",
    "predictor\n",
    "assert model.model_data == model_trainer._latest_training_job.model_artifacts.s3_model_artifacts\n",
    "\n",
    "print(model.model_data)"
   ],
   "id": "295a16ef277257a0",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "training_job: TrainingJob = model_trainer._latest_training_job\n",
    "\n",
    "model_builder = ModelBuilder(\n",
    "    model=training_job,  # Sagemaker core's TrainingJob object passed onto ModelBuilder directly\n",
    "    role_arn=role,\n",
    "    image_uri=xgboost_image,\n",
    "    schema_builder=schema_builder,\n",
    "    inference_spec=XGBoostSpec(),\n",
    "    instance_type=\"ml.c6i.xlarge\"\n",
    ")\n",
    "model=model_builder.build()\n",
    "\n",
    "assert model.model_data == training_job.model_artifacts.s3_model_artifacts\n",
    "\n",
    "print(model.model_data)"
   ],
   "id": "935ea8486278d7b1",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "",
   "id": "757180da84407a1a",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

src/sagemaker/modules/train/model_trainer.py (+9 -5):
@@ -19,11 +19,12 @@
 from tempfile import TemporaryDirectory
 
 from typing import Optional, List, Union, Dict, Any
-from pydantic import BaseModel, ConfigDict, PrivateAttr, validate_call
-
+from sagemaker_core.main import resources
 from sagemaker_core.resources import TrainingJob
 from sagemaker_core.shapes import AlgorithmSpecification
 
+from pydantic import BaseModel, ConfigDict, PrivateAttr, validate_call
+
 from sagemaker import get_execution_role, Session
 from sagemaker.modules.configs import (
     Compute,
@@ -51,6 +52,7 @@
     CheckpointConfig,
     InputData,
 )
+
 from sagemaker.modules.distributed import (
     DistributedRunner,
     TorchrunSMP,
@@ -187,13 +189,17 @@ class ModelTrainer(BaseModel):
     hyperparameters: Optional[Dict[str, Any]] = None
     tags: Optional[List[Tag]] = None
 
+    # Created Artifacts
+    _latest_training_job: Optional[resources.TrainingJob] = None
+
     # Metrics settings
     _enable_sage_maker_metrics_time_series: Optional[bool] = PrivateAttr(default=False)
     _metric_definitions: Optional[List[MetricDefinition]] = PrivateAttr(default=None)
 
     # Debugger settings
     _debug_hook_config: Optional[DebugHookConfig] = PrivateAttr(default=None)
     _debug_rule_configurations: Optional[List[DebugRuleConfiguration]] = PrivateAttr(default=None)
+    _remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None)
     _profiler_config: Optional[ProfilerConfig] = PrivateAttr(default=None)
     _profiler_rule_configurations: Optional[List[ProfilerRuleConfiguration]] = PrivateAttr(
         default=None
@@ -448,11 +454,9 @@ def train(
             infra_check_config=self._infra_check_config,
             session_chaining_config=self._session_chaining_config,
         )
-
+        self._latest_training_job = training_job
         if wait:
             training_job.wait(logs=logs)
-        if logs and not wait:
-            logger.warning("Not displaing the training container logs as 'wait' is set to False.")
 
     def create_input_data_channel(self, channel_name: str, data_source: DataSourceType) -> Channel:
         """Create an input data channel for the training job.

src/sagemaker/serve/builder/model_builder.py (+19 -9):
@@ -24,6 +24,9 @@
 
 from pathlib import Path
 
+from sagemaker_core.main.resources import TrainingJob
+
+from sagemaker.estimator import Estimator
 from sagemaker.enums import Tag
 from sagemaker.jumpstart.accessors import JumpStartS3PayloadAccessor
 from sagemaker.jumpstart.utils import get_jumpstart_content_bucket
@@ -176,8 +179,9 @@ class ModelBuilder(Triton, DJL, JumpStart, TGI, Transformers, TensorflowServing,
            The schema builder can be omitted for HuggingFace models with task types TextGeneration,
            TextClassification, and QuestionAnswering. Omitting SchemaBuilder is in
            beta for FillMask, and AutomaticSpeechRecognition use-cases.
-        model (Optional[Union[object, str]): Model object (with ``predict`` method to perform
-            inference) or a HuggingFace/JumpStart Model ID. Either ``model`` or ``inference_spec``
+        model (Optional[Union[object, str, ModelTrainer, TrainingJob, Estimator]]):
+            Define object from which training artifacts can be extracted.
+            Either ``model`` or ``inference_spec``
            is required for the model builder to build the artifact.
        inference_spec (InferenceSpec): The inference spec file with your customized
            ``invoke`` and ``load`` functions.
@@ -268,14 +272,9 @@ class ModelBuilder(Triton, DJL, JumpStart, TGI, Transformers, TensorflowServing,
     schema_builder: Optional[SchemaBuilder] = field(
         default=None, metadata={"help": "Defines the i/o schema of the model"}
     )
-    model: Optional[Union[object, str]] = field(
+    model: Optional[Union[object, str, "ModelTrainer", TrainingJob, Estimator]] = field(
         default=None,
-        metadata={
-            "help": (
-                'Model object with "predict" method to perform inference '
-                "or HuggingFace/JumpStart Model ID"
-            )
-        },
+        metadata={"help": "Define object from which training artifacts can be extracted"}
     )
     inference_spec: InferenceSpec = field(
         default=None,
@@ -852,13 +851,24 @@ def build( # pylint: disable=R0911
         Returns:
             Type[Model]: A deployable ``Model`` object.
         """
+        from sagemaker.modules.train.model_trainer import ModelTrainer
         self.modes = dict()
 
         if mode:
             self.mode = mode
         if role_arn:
             self.role_arn = role_arn
 
+        if isinstance(self.model, TrainingJob):
+            self.model_path = self.model.model_artifacts.s3_model_artifacts
+            self.model = None
+        elif isinstance(self.model, ModelTrainer):
+            self.model_path = self.model._latest_training_job.model_artifacts.s3_model_artifacts
+            self.model = None
+        elif isinstance(self.model, Estimator):
+            self.model_path = self.model.output_path
+            self.model = None
+
         self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session()
 
         self.sagemaker_session.settings._local_download_dir = self.model_path
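
Per the new branches in build(), a TrainingJob or ModelTrainer supplies model_artifacts.s3_model_artifacts, while an Estimator supplies its output_path. A hedged caller-side sketch of the Estimator branch (xgboost_image, role, bucket, schema_builder, and XGBoostSpec are assumed to be defined as in the notebook above, and the estimator is assumed to have already run fit() so artifacts exist under output_path):

from sagemaker.estimator import Estimator
from sagemaker.serve import ModelBuilder

# Classic Estimator path: build() uses estimator.output_path as the artifacts location.
estimator = Estimator(
    image_uri=xgboost_image,
    role=role,
    instance_count=1,
    instance_type="ml.c6i.xlarge",
    output_path=f"s3://{bucket}/estimator-artifacts",
)
# estimator.fit(...) would run here before handing the estimator to ModelBuilder.

model_builder = ModelBuilder(
    model=estimator,  # same pattern as the ModelTrainer / TrainingJob cases
    role_arn=role,
    image_uri=xgboost_image,
    inference_spec=XGBoostSpec(),
    schema_builder=schema_builder,
    instance_type="ml.c6i.xlarge",
)
model = model_builder.build()
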

tests/unit/sagemaker/modules/train/test_model_trainer.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import pytest
2121
from unittest.mock import patch, MagicMock
2222

23+
from sagemaker_core.main.resources import TrainingJob
24+
2325
from sagemaker.session import Session
2426
from sagemaker.modules.train.model_trainer import ModelTrainer
2527
from sagemaker.modules.constants import (
@@ -316,6 +318,7 @@ def test_debugger_settings(mock_training_job, modules_session):
316318

317319
assert model_trainer._debug_hook_config == debug_hook_config
318320
assert model_trainer._debug_rule_configurations == debug_rule_config
321+
319322
assert model_trainer._profiler_config == profiler_config
320323
assert model_trainer._profiler_rule_configurations == profiler_rule_config
321324
assert model_trainer._tensor_board_output_config == tensor_board_output_config
@@ -485,7 +488,12 @@ def test_train_with_distributed_runner(
485488
assert test_case["distributed_runner"].model_dump(exclude_none=True) == (
486489
json.loads(runner_json_content)
487490
)
488-
491+
assert os.path.exists(expected_source_code_json_path)
492+
with open(expected_source_code_json_path, "r") as f:
493+
source_code_json_content = f.read()
494+
assert test_case["source_code"].model_dump(exclude_none=True) == (
495+
json.loads(source_code_json_content)
496+
)
489497
assert os.path.exists(expected_source_code_json_path)
490498
with open(expected_source_code_json_path, "r") as f:
491499
source_code_json_content = f.read()
@@ -495,3 +503,11 @@ def test_train_with_distributed_runner(
495503
finally:
496504
shutil.rmtree(tmp_dir.name)
497505
assert not os.path.exists(tmp_dir.name)
506+
507+
508+
@patch("sagemaker.modules.train.model_trainer.TrainingJob")
509+
def test_train_stores_created_training_job(mock_training_job, model_trainer):
510+
mock_training_job.create.return_value = TrainingJob(training_job_name="Created-job")
511+
model_trainer.train(wait=False)
512+
assert model_trainer._latest_training_job is not None
513+
assert model_trainer._latest_training_job == TrainingJob(training_job_name="Created-job")
