aws
diff --git a/‎src/sagemaker/modules/local_core/local_container.py
+540 b/‎src/sagemaker/modules/local_core/local_container.py
+540
diff --git a/‎src/sagemaker/modules/templates.py
+1-1 b/‎src/sagemaker/modules/templates.py
+1-1
diff --git a/‎src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb
+1-1 b/‎src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb
+1-1
diff --git a/‎src/sagemaker/modules/testing_notebooks/basic-script-mode/local_training_script.py
+148 b/‎src/sagemaker/modules/testing_notebooks/basic-script-mode/local_training_script.py
+148
diff --git a/‎src/sagemaker/modules/testing_notebooks/local_model_trainer.ipynb
+186 b/‎src/sagemaker/modules/testing_notebooks/local_model_trainer.ipynb
+186
diff --git a/‎src/sagemaker/modules/train/container_drivers/scripts/environment.py
+1-1 b/‎src/sagemaker/modules/train/container_drivers/scripts/environment.py
+1-1
@@ -86,7 +86,7 @@
 
 echo "Setting up environment variables"
 $SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/scripts/environment.py
-source /opt/ml/input/data/sm_drivers/scripts/sm_training.env
+source /opt/ml/input/sm_training.env
 
 {working_dir}
 {install_requirements}
 
@@ -269,7 +269,7 @@
     "    instance_count=1,\n",
     "    instance_type=\"ml.g5.48xlarge\",\n",
     "    volume_size_in_gb=96,\n",
-    "    keep_alive_period_in_seconds=3600\n",
+    "    keep_alive_period_in_seconds=3600,\n",
     ")\n",
     "\n",
     "hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
 
@@ -0,0 +1,148 @@
+# flake8: noqa
+import argparse
+import numpy as np
+import os
+import sys
+import logging
+import json
+import shutil
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from pytorch_model_def import get_model
+
+
+# Module-wide logger: accepts DEBUG and above and echoes records to stdout.
+_stdout_handler = logging.StreamHandler(sys.stdout)
+logger = logging.getLogger(__name__)
+logger.addHandler(_stdout_handler)
+logger.setLevel(logging.DEBUG)
+
+# Root folder the input data is read from, and the folder holding this script.
+data_dir = "/opt/ml/input/data"
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def get_train_data(train_dir):
+    """
+    Load the training arrays stored in ``train_dir`` and return them as tensors.
+
+    Reads ``x_train.npy`` and ``y_train.npy`` (in that order), logs the shape
+    of both arrays, and returns the pair ``(x, y)`` converted with
+    ``torch.from_numpy``.
+    """
+
+    x_array, y_array = (
+        np.load(os.path.join(train_dir, file_name))
+        for file_name in ("x_train.npy", "y_train.npy")
+    )
+    logger.info(f"x train: {x_array.shape}, y train: {y_array.shape}")
+
+    return tuple(map(torch.from_numpy, (x_array, y_array)))
+
+
+def get_test_data(test_dir):
+    """
+    Load the testing arrays stored in ``test_dir`` and return them as tensors.
+
+    Reads ``x_test.npy`` and then ``y_test.npy``, logs the shape of both
+    arrays, and returns the pair ``(x, y)`` converted with
+    ``torch.from_numpy``.
+    """
+
+    x_path = os.path.join(test_dir, "x_test.npy")
+    y_path = os.path.join(test_dir, "y_test.npy")
+
+    x_array = np.load(x_path)
+    y_array = np.load(y_path)
+    logger.info(f"x test: {x_array.shape}, y test: {y_array.shape}")
+
+    x_tensor = torch.from_numpy(x_array)
+    y_tensor = torch.from_numpy(y_array)
+    return x_tensor, y_tensor
+
+
+def model_fn(model_dir):
+    """
+    Load the model for inference
+
+    Builds a fresh model with ``get_model()``, restores the weights stored in
+    ``model.pth`` inside ``model_dir``, and returns the model in eval mode on
+    the selected device (CUDA when available, otherwise CPU).
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = get_model()
+    # map_location remaps the stored tensors onto the device available here, so
+    # weights saved from a GPU run can still be restored on a CPU-only host.
+    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model.to(device)
+
+
+def input_fn(request_body, request_content_type):
+    """
+    Deserialize and prepare the prediction input
+
+    Args:
+        request_body: Serialized request payload; decoded with ``json.loads``.
+        request_content_type: Content type of the request. Only
+            ``"application/json"`` is supported.
+
+    Returns:
+        A tensor built from the decoded JSON payload.
+
+    Raises:
+        ValueError: If ``request_content_type`` is not ``"application/json"``.
+    """
+
+    if request_content_type != "application/json":
+        # Fail here with a clear message instead of implicitly returning None,
+        # which would only surface later as an AttributeError in predict_fn.
+        raise ValueError(f"Unsupported content type: {request_content_type}")
+
+    return torch.tensor(json.loads(request_body))
+
+
+def predict_fn(input_data, model):
+    """
+    Apply model to the incoming request
+
+    Args:
+        input_data: Tensor holding the request input; it is cast to float and
+            moved to the device the model runs on.
+        model: The model to evaluate; it is moved to the selected device (CUDA
+            when available, otherwise CPU) and put in eval mode.
+
+    Returns:
+        The first element of the model output, as a NumPy array.
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    with torch.no_grad():
+        # The input has to live on the same device as the model.
+        output = model(input_data.to(device).float())
+        # Tensor.numpy() only works for CPU tensors, so copy the result back first.
+        return output.cpu().numpy()[0]
+
+
+def train():
+    """
+    Train the PyTorch model
+
+    Loads the arrays from the ``train`` and ``test`` folders under
+    ``data_dir``, fits the model returned by ``get_model()`` with SGD on an MSE
+    loss, prints the test MSE, saves the weights as ``model.pth`` in the model
+    directory and copies the inference sources into its ``code/`` subfolder.
+
+    The model directory is read from ``SM_MODEL_DIR`` (default: ``data/model``
+    next to this script). The folder the inference sources are copied from is
+    read from ``SM_CHANNEL_CODE`` (default: the folder of this script).
+    """
+    # Resolve the device here rather than relying on a module-level global, so
+    # train() also works when this module is imported instead of run as a script.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Directories: train, test and model
+    train_dir = os.path.join(data_dir, "train")
+    test_dir = os.path.join(data_dir, "test")
+    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))
+
+    # Load the training and testing data
+    x_train, y_train = get_train_data(train_dir)
+    x_test, y_test = get_test_data(test_dir)
+    train_ds = TensorDataset(x_train, y_train)
+
+    # Training parameters - used to configure the training loop
+    batch_size = 64
+    epochs = 1
+    learning_rate = 0.1
+    logger.info(
+        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
+    )
+
+    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
+
+    # Define the model, loss function and optimizer
+    model = get_model()
+    model = model.to(device)
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model (epochs are numbered from 1 in the log output)
+    for epoch in range(1, epochs + 1):
+        for x_train_batch, y_train_batch in train_dl:
+            # Batches must be on the same device as the model.
+            x_train_batch = x_train_batch.to(device).float()
+            y_train_batch = y_train_batch.to(device).float()
+            y = model(x_train_batch)
+            loss = criterion(y.flatten(), y_train_batch)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        # Reports the loss of the last batch of the epoch.
+        logger.info(f"epoch: {epoch} -> loss: {loss}")
+
+    # Test the model
+    with torch.no_grad():
+        y = model(x_test.to(device).float()).flatten()
+        mse = ((y - y_test.to(device)) ** 2).sum() / y_test.shape[0]
+    # Tensor.numpy() only works for CPU tensors, so copy the result back first.
+    print("\nTest MSE:", mse.cpu().numpy())
+
+    # Save the model
+    os.makedirs(model_dir, exist_ok=True)
+    torch.save(model.state_dict(), model_dir + "/model.pth")
+    inference_code_path = model_dir + "/code/"
+
+    if not os.path.exists(inference_code_path):
+        os.mkdir(inference_code_path)
+        logger.info("Created a folder at {}!".format(inference_code_path))
+
+    # NOTE(review): assumes custom_script.py and pytorch_model_def.py both exist
+    # in code_dir - TODO confirm against the contents of the source directory.
+    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
+    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
+    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
+    logger.info("Saving models files to {}".format(inference_code_path))
+
+
+# Script entry point: runs the training job when this file is executed directly.
+if __name__ == "__main__":
+    print("Running the training job ...\n")
+
+    # Select the GPU when CUDA is available, otherwise the CPU. The assignment
+    # happens at module scope inside this guard, so the name exists as a
+    # module-level global only when the file is run as a script.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    train()
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "from sagemaker import image_uris\n",
+    "\n",
+    "# Get the absolute path of the root directory\n",
+    "root_dir = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n",
+    "sys.path.insert(0, root_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Local Mode ModelTrainer\n",
+    "\n",
+    "In local mode training, users train their model in a container that runs on their local machine.\n",
+    "You don't need to access any AWS resources unless you want to use data from S3 as input, or pull images from ECR."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Simple Case: Minimal Local ModelTrainer Setup That Executes Commands\n",
+    "When running the model trainer in local mode, you need to have the Docker engine running in your environment.\n",
+    "When you run the following cell for the first time, a SageMaker session will be initiated to pull the image from ECR.\n",
+    "Once the image has been pulled, no further AWS API calls are made."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.modules.train.model_trainer import ModelTrainer, Mode\n",
+    "from sagemaker.modules.configs import SourceCode\n",
+    "from sagemaker.modules.constants import DEFAULT_INSTANCE_TYPE\n",
+    "\n",
+    "hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
+    "\n",
+    "source_code = SourceCode(\n",
+    "    command=\"echo 'Hello World' && env\",\n",
+    ")\n",
+    "model_trainer = ModelTrainer(\n",
+    "    training_image=hugging_face_image,\n",
+    "    source_code=source_code,\n",
+    "    training_mode=Mode.LOCAL_CONTAINER,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Simple Script Mode Case - 1: Training with Local Data\n",
+    "In this example, everything (input, output, training resource) will be in your local environment. You don't need to use your AWS account at all for this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.modules.configs import Compute, InputData, SourceCode\n",
+    "\n",
+    "source_code = SourceCode(\n",
+    "    source_dir=\"basic-script-mode\",\n",
+    "    entry_script=\"local_training_script.py\",\n",
+    ")\n",
+    "\n",
+    "compute = Compute(\n",
+    "    instance_type=\"local_cpu\",\n",
+    "    instance_count=1,\n",
+    ")\n",
+    "\n",
+    "train_data = InputData(\n",
+    "    channel_name=\"train\",\n",
+    "    data_source=\"basic-script-mode/data/train/\",\n",
+    ")\n",
+    "\n",
+    "test_data = InputData(\n",
+    "    channel_name=\"test\",\n",
+    "    data_source=\"basic-script-mode/data/test/\",\n",
+    ")\n",
+    "\n",
+    "model_trainer = ModelTrainer(\n",
+    "    training_image=hugging_face_image,\n",
+    "    source_code=source_code,\n",
+    "    compute=compute,\n",
+    "    input_data_config=[train_data, test_data],\n",
+    "    base_job_name=\"local_mode_single_container_case_1\",\n",
+    "    training_mode=Mode.LOCAL_CONTAINER,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Simple Script Mode Case - 2: Training with Input Data from S3\n",
+    "In this example, the input data is read from S3. You will have to configure your AWS credentials before running this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = InputData(\n",
+    "    channel_name=\"train\", data_source=\"s3://morpheus-bugbash/basic-script-mode/data/train/\"\n",
+    ")\n",
+    "\n",
+    "test_data = InputData(\n",
+    "    channel_name=\"test\", data_source=\"s3://morpheus-bugbash/basic-script-mode/data/test/\"\n",
+    ")\n",
+    "\n",
+    "model_trainer = ModelTrainer(\n",
+    "    training_image=hugging_face_image,\n",
+    "    source_code=source_code,\n",
+    "    compute=compute,\n",
+    "    input_data_config=[train_data, test_data],\n",
+    "    base_job_name=\"local_mode_single_container_case_2\",\n",
+    "    training_mode=Mode.LOCAL_CONTAINER,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py3.10",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -50,7 +50,7 @@
 INPUT_DATA_CONFIG = f"{SM_INPUT_CONFIG_DIR}/inputdataconfig.json"
 HYPERPARAMETERS_CONFIG = f"{SM_INPUT_CONFIG_DIR}/hyperparameters.json"
 
-ENV_OUTPUT_FILE = "/opt/ml/input/data/sm_drivers/scripts/sm_training.env"
+ENV_OUTPUT_FILE = "/opt/ml/input/sm_training.env"
 
 SENSITIVE_KEYWORDS = ["SECRET", "PASSWORD", "KEY", "TOKEN", "PRIVATE", "CREDS", "CREDENTIALS"]
 HIDDEN_VALUE = "******"