Skip to content

Commit 5a37fc5

Browse files
pintaoz-awsbeniericnargokulpravali96
committed
Single container local training (#1556)
* Base model trainer (#1521) * Base model trainer * flake8 * add testing notebook * add param validation & set defaults * Implement simple train method * feature: support script mode with local train.sh (#1523) * feature: support script mode with local train.sh * Stop tracking train.sh and add it to .gitignore * update message * make dir if not exist * fix docs * fix: docstyle * Address comments * fix hyperparams * Revert pydantic custom error * pylint * Image Spec refactoring and updates (#1525) * Image Spec refactoring and updates * Unit tests and update function for Image Spec * Fix hugging face test * Fix Tests * Add unit tests for ModelTrainer (#1527) * Add unit tests for ModelTrainer * Flake8 * format * Add example notebook (#1528) * Add testing notebook * format * use smaller data * remove large dataset * update * pylint * flake8 * ignore docstyle in directories with test * format * format * Add enviornment variable bootstrapping script (#1530) * Add enviornment variables scripts * format * fix comment * add docstrings * fix comment * feature: add utility function to capture local snapshot (#1524) * local snapshot * Update pip list command * Remove function calls * Address comments * Address comments * Support intelligent parameters (#1540) * Support intelligent parameters * fix codestyle * Revert Image Spec (#1541) * Cleanup ModelTrainer (#1542) * General image builder (#1546) * General image builder * General image builder * Fix codestyle * Fix codestyle * Move location * Add warnings * Add integ tests * Fix integ test * Fix integ test * Fix region error * Add region * Latest Container Image (#1545) * Latest Container Image * Test Fixes * Parameterized tests and some logic updates * Test fixes * Move to Image URI * Fixes for unit test * Fixes for unit test * Fix codestyle error checks * Cleanup ModelTrainer code (#1552) * Single container local mode training * Add wait argument * Implement helper funtions * Add helper functions * Fix bugs * Fix codestyle * 
feat: add pre-processing and post-processing logic to inference_spec (#1560) * add pre-processing and post-processing logic to inference_spec * fix format * make accept_type and content_type optional * remove accept_type and content_type from pre/post processing * correct typo * Fix test and codestyle * Add Distributed Training Support Model Trainer (#1536) * Add tests * Add path to set Additional Settings in ModelTrainer (#1555) * Added example notebook * Fix codestyle * Address comments * resolve merge conflict * Support multi container local training (#1576) * Fix codestyle * Mask Sensitive Env Logs in Container (#1568) * Fix bug in script mode setup ModelTrainer (#1575) * Support multi container local training * Merge branch 'single_container_local_training' into multi_container_local_training * Update unit tests --------- Co-authored-by: Erick Benitez-Ramos <[email protected]> * Remove LocalTrainingJob class * Bypass pydantic check * Add example --------- Co-authored-by: Erick Benitez-Ramos <[email protected]> Co-authored-by: Gokul Anantha Narayanan <[email protected]> Co-authored-by: Pravali Uppugunduri <[email protected]>
1 parent e4701e8 commit 5a37fc5

File tree

8 files changed

+1163
-69
lines changed

8 files changed

+1163
-69
lines changed

src/sagemaker/modules/local_core/local_container.py

+540
Large diffs are not rendered by default.

src/sagemaker/modules/templates.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686
8787
echo "Setting up environment variables"
8888
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/scripts/environment.py
89-
source /opt/ml/input/data/sm_drivers/scripts/sm_training.env
89+
source /opt/ml/input/sm_training.env
9090
9191
{working_dir}
9292
{install_requirements}

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@
269269
" instance_count=1,\n",
270270
" instance_type=\"ml.g5.48xlarge\",\n",
271271
" volume_size_in_gb=96,\n",
272-
" keep_alive_period_in_seconds=3600\n",
272+
" keep_alive_period_in_seconds=3600,\n",
273273
")\n",
274274
"\n",
275275
"hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# flake8: noqa
2+
import argparse
3+
import numpy as np
4+
import os
5+
import sys
6+
import logging
7+
import json
8+
import shutil
9+
import torch
10+
import torch.nn as nn
11+
from torch.utils.data import DataLoader, TensorDataset
12+
from pytorch_model_def import get_model
13+
14+
15+
# Module-level logger: emits DEBUG-and-above records to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
_stdout_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(_stdout_handler)

# Absolute path of the directory that contains this script.
_script_path = os.path.abspath(__file__)
current_dir = os.path.dirname(_script_path)

# Root directory under which the "train" and "test" data folders are looked up.
data_dir = "/opt/ml/input/data"
20+
21+
22+
def get_train_data(train_dir):
    """
    Load the training arrays from disk and return them as tensors.

    Reads ``x_train.npy`` and ``y_train.npy`` from ``train_dir``, logs the
    shape of each array, and converts both to torch tensors.

    Args:
        train_dir: Directory that contains ``x_train.npy`` and ``y_train.npy``.

    Returns:
        A ``(features, targets)`` tuple of torch tensors.
    """
    features = np.load(os.path.join(train_dir, "x_train.npy"))
    targets = np.load(os.path.join(train_dir, "y_train.npy"))
    logger.info(f"x train: {features.shape}, y train: {targets.shape}")

    features_tensor = torch.from_numpy(features)
    targets_tensor = torch.from_numpy(targets)
    return features_tensor, targets_tensor
32+
33+
34+
def get_test_data(test_dir):
    """
    Load the testing arrays from disk and return them as tensors.

    Reads ``x_test.npy`` and ``y_test.npy`` from ``test_dir``, logs the
    shape of each array, and converts both to torch tensors.

    Args:
        test_dir: Directory that contains ``x_test.npy`` and ``y_test.npy``.

    Returns:
        A ``(features, targets)`` tuple of torch tensors.
    """
    x_path = os.path.join(test_dir, "x_test.npy")
    y_path = os.path.join(test_dir, "y_test.npy")
    features, targets = np.load(x_path), np.load(y_path)
    logger.info(f"x test: {features.shape}, y test: {targets.shape}")

    return torch.from_numpy(features), torch.from_numpy(targets)
44+
45+
46+
def model_fn(model_dir):
    """
    Load the model for inference.

    Builds a fresh model with ``get_model()``, restores the weights stored in
    ``model.pth`` inside ``model_dir``, switches the model to eval mode and
    moves it to the GPU when CUDA is available (CPU otherwise).

    Args:
        model_dir: Directory that contains the saved ``model.pth`` state dict.

    Returns:
        The model in eval mode, placed on the selected device.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()

    # map_location remaps the stored tensors onto the device that is actually
    # available, so weights saved from a GPU run still load on a CPU-only host.
    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model.to(device)
56+
57+
58+
def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input.

    Args:
        request_body: Serialized request payload. For ``application/json`` it
            is parsed with ``json.loads``.
        request_content_type: Content type of the request. Only
            ``"application/json"`` is supported.

    Returns:
        A torch tensor built from the decoded JSON payload.

    Raises:
        ValueError: If ``request_content_type`` is not ``"application/json"``.
    """
    # Fail with a clear message instead of falling through to an unbound local.
    if request_content_type != "application/json":
        raise ValueError(
            "Unsupported content type: {!r}; expected 'application/json'".format(
                request_content_type
            )
        )

    request = json.loads(request_body)
    return torch.tensor(request)
67+
68+
69+
def predict_fn(input_data, model):
    """
    Apply model to the incoming request.

    Args:
        input_data: Torch tensor with the request input; it is cast to float
            before being passed to the model.
        model: The torch module used for the forward pass.

    Returns:
        The first element of the model output, as a NumPy array.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        # The input must live on the same device as the model.
        output = model(input_data.float().to(device))
    # Tensor.numpy() only works on CPU tensors, so copy back to host first.
    return output.cpu().numpy()[0]
79+
80+
81+
def train():
    """
    Train the PyTorch model.

    Loads the train/test arrays from the ``train`` and ``test`` folders under
    ``data_dir``, trains the model returned by ``get_model()`` with SGD and an
    MSE loss, prints the test MSE, then saves the state dict to
    ``<model_dir>/model.pth`` and copies ``custom_script.py`` and
    ``pytorch_model_def.py`` into ``<model_dir>/code/``.

    ``model_dir`` comes from the ``SM_MODEL_DIR`` environment variable
    (default: ``data/model`` next to this script); the source directory of the
    copied files comes from ``SM_CHANNEL_CODE`` (default: this script's
    directory).
    """
    # Resolve the device here rather than relying on a module-level name that
    # is only assigned when the file is executed as a script.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Directories: train, test and model
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "test")
    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))

    # Load the training and testing data
    x_train, y_train = get_train_data(train_dir)
    x_test, y_test = get_test_data(test_dir)
    train_ds = TensorDataset(x_train, y_train)

    # Training parameters - used to configure the training loop
    batch_size = 64
    epochs = 1
    learning_rate = 0.1
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    # Define the model, loss function and optimizer
    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model (epochs are reported 1-based)
    for epoch in range(1, epochs + 1):
        for x_train_batch, y_train_batch in train_dl:
            # Batches must be on the same device as the model parameters.
            x_train_batch = x_train_batch.to(device)
            y_train_batch = y_train_batch.to(device)

            y = model(x_train_batch.float())
            loss = criterion(y.flatten(), y_train_batch.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Test the model
    with torch.no_grad():
        x_test = x_test.to(device)
        y_test = y_test.to(device)
        y = model(x_test.float()).flatten()
        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
    # Copy to host before converting: Tensor.numpy() rejects CUDA tensors.
    print("\nTest MSE:", mse.cpu().numpy())

    # Save the model
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_dir + "/model.pth")
    inference_code_path = model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.mkdir(inference_code_path)
        logger.info("Created a folder at {}!".format(inference_code_path))

    # Ship the inference code alongside the saved weights.
    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
    logger.info("Saving models files to {}".format(inference_code_path))
141+
142+
143+
if __name__ == "__main__":
    # Script entry point: announce the run, pick a device, then train.
    print("Running the training job ...\n")

    # Module-level device selection: GPU when CUDA is available, otherwise CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    train()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import sys, os\n",
10+
"from sagemaker import image_uris\n",
11+
"\n",
12+
"# Get the absolute path of the root directory\n",
13+
"root_dir = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n",
14+
"sys.path.insert(0, root_dir)"
15+
]
16+
},
17+
{
18+
"cell_type": "markdown",
19+
"metadata": {},
20+
"source": [
21+
"## Local Mode ModelTrainer\n",
22+
"\n",
23+
"In local mode training, you train your model in a container that runs on your local machine.\n",
24+
"You don't need to access any AWS resources unless you want to use data from S3 as input, or pull images from ECR."
25+
]
26+
},
27+
{
28+
"cell_type": "markdown",
29+
"metadata": {},
30+
"source": [
31+
"## Simple Case Minimally Setup Local ModelTrainer and Execute Commands\n",
32+
"When running ModelTrainer in local mode, you need to have the Docker engine running in your environment.\n",
33+
"When you run the following cell for the first time, a SageMaker session will be initiated to pull the image from ECR.\n",
34+
"Once the image has been pulled, no further AWS API calls are made."
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"from sagemaker.modules.train.model_trainer import ModelTrainer, Mode\n",
44+
"from sagemaker.modules.configs import SourceCode\n",
45+
"from sagemaker.modules.constants import DEFAULT_INSTANCE_TYPE\n",
46+
"\n",
47+
"hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
48+
"\n",
49+
"source_code = SourceCode(\n",
50+
" command=\"echo 'Hello World' && env\",\n",
51+
")\n",
52+
"model_trainer = ModelTrainer(\n",
53+
" training_image=hugging_face_image,\n",
54+
" source_code=source_code,\n",
55+
" training_mode=Mode.LOCAL_CONTAINER,\n",
56+
")"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"model_trainer.train()"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"metadata": {},
71+
"source": [
72+
"## Simple Script Mode Case - 1: Training with Local Data\n",
73+
"In this example, everything (input, output, training resource) will be in your local environment. You don't need to use your AWS account at all for this."
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": null,
79+
"metadata": {},
80+
"outputs": [],
81+
"source": [
82+
"from sagemaker.modules.configs import Compute, InputData, SourceCode\n",
83+
"\n",
84+
"source_code = SourceCode(\n",
85+
" source_dir=\"basic-script-mode\",\n",
86+
" entry_script=\"local_training_script.py\",\n",
87+
")\n",
88+
"\n",
89+
"compute = Compute(\n",
90+
" instance_type=\"local_cpu\",\n",
91+
" instance_count=1,\n",
92+
")\n",
93+
"\n",
94+
"train_data = InputData(\n",
95+
" channel_name=\"train\",\n",
96+
" data_source=\"basic-script-mode/data/train/\",\n",
97+
")\n",
98+
"\n",
99+
"test_data = InputData(\n",
100+
" channel_name=\"test\",\n",
101+
" data_source=\"basic-script-mode/data/test/\",\n",
102+
")\n",
103+
"\n",
104+
"model_trainer = ModelTrainer(\n",
105+
" training_image=hugging_face_image,\n",
106+
" source_code=source_code,\n",
107+
" compute=compute,\n",
108+
" input_data_config=[train_data, test_data],\n",
109+
" base_job_name=\"local_mode_single_container_case_1\",\n",
110+
" training_mode=Mode.LOCAL_CONTAINER,\n",
111+
")"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": null,
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"model_trainer.train()"
121+
]
122+
},
123+
{
124+
"cell_type": "markdown",
125+
"metadata": {},
126+
"source": [
127+
"## Simple Script Mode Case - 2: Training with Input Data from S3\n",
128+
"In this example, the input data is read from S3. You will have to configure your AWS credentials before running this."
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": null,
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"train_data = InputData(\n",
138+
" channel_name=\"train\", data_source=\"s3://morpheus-bugbash/basic-script-mode/data/train/\"\n",
139+
")\n",
140+
"\n",
141+
"test_data = InputData(\n",
142+
" channel_name=\"test\", data_source=\"s3://morpheus-bugbash/basic-script-mode/data/test/\"\n",
143+
")\n",
144+
"\n",
145+
"model_trainer = ModelTrainer(\n",
146+
" training_image=hugging_face_image,\n",
147+
" source_code=source_code,\n",
148+
" compute=compute,\n",
149+
" input_data_config=[train_data, test_data],\n",
150+
" base_job_name=\"local_mode_single_container_case_2\",\n",
151+
" training_mode=Mode.LOCAL_CONTAINER,\n",
152+
")"
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": null,
158+
"metadata": {},
159+
"outputs": [],
160+
"source": [
161+
"model_trainer.train()"
162+
]
163+
}
164+
],
165+
"metadata": {
166+
"kernelspec": {
167+
"display_name": "py3.10",
168+
"language": "python",
169+
"name": "python3"
170+
},
171+
"language_info": {
172+
"codemirror_mode": {
173+
"name": "ipython",
174+
"version": 3
175+
},
176+
"file_extension": ".py",
177+
"mimetype": "text/x-python",
178+
"name": "python",
179+
"nbconvert_exporter": "python",
180+
"pygments_lexer": "ipython3",
181+
"version": "3.10.14"
182+
}
183+
},
184+
"nbformat": 4,
185+
"nbformat_minor": 2
186+
}

src/sagemaker/modules/train/container_drivers/scripts/environment.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
INPUT_DATA_CONFIG = f"{SM_INPUT_CONFIG_DIR}/inputdataconfig.json"
5151
HYPERPARAMETERS_CONFIG = f"{SM_INPUT_CONFIG_DIR}/hyperparameters.json"
5252

53-
ENV_OUTPUT_FILE = "/opt/ml/input/data/sm_drivers/scripts/sm_training.env"
53+
ENV_OUTPUT_FILE = "/opt/ml/input/sm_training.env"
5454

5555
SENSITIVE_KEYWORDS = ["SECRET", "PASSWORD", "KEY", "TOKEN", "PRIVATE", "CREDS", "CREDENTIALS"]
5656
HIDDEN_VALUE = "******"

0 commit comments

Comments
 (0)