Skip to content

Commit 2bad8fd

Browse files
beniericmufiAmazonmufaddal-rohawala
committed
feature: support local mode in SageMaker Studio (aws#1300)
* feature: support local mode in SageMaker Studio * chore: fix typo * chore: fix formatting * chore: revert changes for docker compose logs * chore: black-format * change: Use predetermined dns-allow-listed-hostname for Studio Local Support * add support for CodeEditor and JupyterLabs --------- Co-authored-by: Mufaddal Rohawala <[email protected]> Co-authored-by: Mufaddal Rohawala <[email protected]>
1 parent c797f2d commit 2bad8fd

File tree

4 files changed

+313
-30
lines changed

4 files changed

+313
-30
lines changed

src/sagemaker/local/image.py

+54-27
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import sagemaker.utils
4343

4444
CONTAINER_PREFIX = "algo"
45+
STUDIO_HOST_NAME = "sagemaker-local"
4546
DOCKER_COMPOSE_FILENAME = "docker-compose.yaml"
4647
DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = "COMPOSE_HTTP_TIMEOUT"
4748
DOCKER_COMPOSE_HTTP_TIMEOUT = "120"
@@ -50,6 +51,7 @@
5051
REGION_ENV_NAME = "AWS_REGION"
5152
TRAINING_JOB_NAME_ENV_NAME = "TRAINING_JOB_NAME"
5253
S3_ENDPOINT_URL_ENV_NAME = "S3_ENDPOINT_URL"
54+
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
5355

5456
# SELinux Enabled
5557
SELINUX_ENABLED = os.environ.get("SAGEMAKER_LOCAL_SELINUX_ENABLED", "False").lower() in [
@@ -107,10 +109,30 @@ def __init__(
107109
# Since we are using a single docker network, Generate a random suffix to attach to the
108110
# container names. This way multiple jobs can run in parallel.
109111
suffix = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(5))
110-
self.hosts = [
111-
"{}-{}-{}".format(CONTAINER_PREFIX, i, suffix)
112-
for i in range(1, self.instance_count + 1)
113-
]
112+
self.is_studio = sagemaker.local.utils.check_for_studio()
113+
if self.is_studio:
114+
if self.instance_count > 1:
115+
raise NotImplementedError(
116+
"Multi instance Local Mode execution is "
117+
"currently not supported in SageMaker Studio."
118+
)
119+
# For studio use-case, directories need to be created in `~/tmp`, rather than /tmp
120+
home = os.path.expanduser("~")
121+
root_dir = os.path.join(home, "tmp")
122+
if not os.path.isdir(root_dir):
123+
os.mkdir(root_dir)
124+
if self.sagemaker_session.config:
125+
self.sagemaker_session.config["local"]["container_root"] = root_dir
126+
else:
127+
self.sagemaker_session.config = {"local": {"container_root": root_dir}}
128+
# Studio only supports single instance run
129+
self.hosts = [STUDIO_HOST_NAME]
130+
else:
131+
self.hosts = [
132+
"{}-{}-{}".format(CONTAINER_PREFIX, i, suffix)
133+
for i in range(1, self.instance_count + 1)
134+
]
135+
114136
self.container_root = None
115137
self.container = None
116138

@@ -201,22 +223,17 @@ def process(
201223
self._generate_compose_file(
202224
"process", additional_volumes=volumes, additional_env_vars=environment
203225
)
204-
compose_command = self._compose()
205226

206227
if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
207228
_pull_image(self.image)
208229

230+
compose_command = self._compose()
209231
process = subprocess.Popen(
210232
compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
211233
)
212234

213235
try:
214236
_stream_output(process)
215-
except RuntimeError as e:
216-
# _stream_output() doesn't have the command line. We will handle the exception
217-
# which contains the exit code and append the command line to it.
218-
msg = f"Failed to run: {compose_command}"
219-
raise RuntimeError(msg) from e
220237
finally:
221238
# Uploading processing outputs back to Amazon S3.
222239
self._upload_processing_outputs(data_dir, processing_output_config)
@@ -283,22 +300,17 @@ def train(self, input_data_config, output_data_config, hyperparameters, environm
283300
compose_data = self._generate_compose_file(
284301
"train", additional_volumes=volumes, additional_env_vars=training_env_vars
285302
)
286-
compose_command = self._compose()
287303

288304
if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
289305
_pull_image(self.image)
290306

307+
compose_command = self._compose()
291308
process = subprocess.Popen(
292309
compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
293310
)
294311

295312
try:
296313
_stream_output(process)
297-
except RuntimeError as e:
298-
# _stream_output() doesn't have the command line. We will handle the exception
299-
# which contains the exit code and append the command line to it.
300-
msg = "Failed to run: %s, %s" % (compose_command, str(e))
301-
raise RuntimeError(msg)
302314
finally:
303315
artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
304316

@@ -347,6 +359,7 @@ def serve(self, model_dir, environment):
347359
self._generate_compose_file(
348360
"serve", additional_env_vars=environment, additional_volumes=volumes
349361
)
362+
350363
compose_command = self._compose()
351364

352365
self.container = _HostingContainer(compose_command)
@@ -710,6 +723,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
710723
additional_env_var_list = ["{}={}".format(k, v) for k, v in additional_env_vars.items()]
711724
environment.extend(additional_env_var_list)
712725

726+
if self.is_studio:
727+
environment.extend([f"{SM_STUDIO_LOCAL_MODE}=True"])
728+
713729
if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None:
714730
os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT
715731

@@ -723,12 +739,19 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
723739
for h in self.hosts
724740
}
725741

726-
content = {
727-
# Use version 2.3 as a minimum so that we can specify the runtime
728-
"version": "2.3",
729-
"services": services,
730-
"networks": {"sagemaker-local": {"name": "sagemaker-local"}},
731-
}
742+
if self.is_studio:
743+
content = {
744+
# Use version 2.3 as a minimum so that we can specify the runtime
745+
"version": "2.3",
746+
"services": services,
747+
}
748+
else:
749+
content = {
750+
# Use version 2.3 as a minimum so that we can specify the runtime
751+
"version": "2.3",
752+
"services": services,
753+
"networks": {"sagemaker-local": {"name": "sagemaker-local"}},
754+
}
732755

733756
docker_compose_path = os.path.join(self.container_root, DOCKER_COMPOSE_FILENAME)
734757

@@ -810,7 +833,6 @@ def _create_docker_host(
810833
"tty": True,
811834
"volumes": [v.map for v in optml_volumes],
812835
"environment": environment,
813-
"networks": {"sagemaker-local": {"aliases": [host]}},
814836
}
815837

816838
is_train_with_entrypoint = False
@@ -827,14 +849,19 @@ def _create_docker_host(
827849
if self.container_arguments:
828850
host_config["entrypoint"] = host_config["entrypoint"] + self.container_arguments
829851

852+
if self.is_studio:
853+
host_config["network_mode"] = "sagemaker"
854+
else:
855+
host_config["networks"] = {"sagemaker-local": {"aliases": [host]}}
856+
830857
# for GPU support pass in nvidia as the runtime, this is equivalent
831858
# to setting --runtime=nvidia in the docker commandline.
832859
if self.instance_type == "local_gpu":
833860
host_config["deploy"] = {
834861
"resources": {"reservations": {"devices": [{"capabilities": ["gpu"]}]}}
835862
}
836863

837-
if command == "serve":
864+
if not self.is_studio and command == "serve":
838865
serving_port = (
839866
sagemaker.utils.get_config_value(
840867
"local.serving_port", self.sagemaker_session.config
@@ -910,7 +937,7 @@ def __init__(self, command):
910937
"""Creates a new threaded hosting container.
911938
912939
Args:
913-
command:
940+
command (list): docker compose command
914941
"""
915942
Thread.__init__(self)
916943
self.command = command
@@ -987,8 +1014,8 @@ def _stream_output(process):
9871014
sys.stdout.write(stdout)
9881015
exit_code = process.poll()
9891016

990-
if exit_code != 0:
991-
raise RuntimeError("Process exited with code: %s" % exit_code)
1017+
if exit_code not in [0, 130]:
1018+
raise RuntimeError(f"Failed to run: {process.args}. Process exited with code: {exit_code}")
9921019

9931020
return exit_code
9941021

src/sagemaker/local/utils.py

+28
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828

2929
logger = logging.getLogger(__name__)
3030

31+
STUDIO_APP_TYPES = ["KernelGateway", "CodeEditor", "JupyterLab"]
32+
3133

3234
def copy_directory_structure(destination_directory, relative_path):
3335
"""Creates intermediate directory structure for relative_path.
@@ -216,3 +218,29 @@ def get_using_dot_notation(dictionary, keys):
216218
return get_using_dot_notation(inner_dict, rest)
217219
except (KeyError, IndexError, TypeError):
218220
raise ValueError(f"{keys} does not exist in input dictionary.")
221+
222+
223+
def check_for_studio():
    """Report whether the current run environment is SageMaker Studio.

    The decision is based on the ``AppType`` entry of the resource metadata
    file located at ``/opt/ml/metadata/resource-metadata.json``.

    Returns:
        bool: True when the metadata file exists and its ``AppType`` is one of
        ``STUDIO_APP_TYPES``. False when the file is absent, or when it holds
        no (or an empty) ``AppType``.

    Raises:
        NotImplementedError: If the metadata file declares an ``AppType`` that
            is not listed in ``STUDIO_APP_TYPES``.
    """
    metadata_path = "/opt/ml/metadata/resource-metadata.json"

    # Without the metadata file there is nothing that marks this as Studio.
    if not os.path.exists(metadata_path):
        return False

    with open(metadata_path, "r") as handle:
        app_type = json.load(handle).get("AppType")

    # A missing/empty AppType is treated as "not Studio" (the original author
    # notes this is the classic notebook case).
    if not app_type:
        return False

    # Only the app types listed in STUDIO_APP_TYPES may use Local Mode.
    if app_type not in STUDIO_APP_TYPES:
        raise NotImplementedError(
            f"AppType {app_type} in Studio does not support Local Mode."
        )

    return True

0 commit comments

Comments
 (0)