import sagemaker.utils

CONTAINER_PREFIX = "algo"
+STUDIO_HOST_NAME = "sagemaker-local"
DOCKER_COMPOSE_FILENAME = "docker-compose.yaml"
DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = "COMPOSE_HTTP_TIMEOUT"
DOCKER_COMPOSE_HTTP_TIMEOUT = "120"

REGION_ENV_NAME = "AWS_REGION"
TRAINING_JOB_NAME_ENV_NAME = "TRAINING_JOB_NAME"
S3_ENDPOINT_URL_ENV_NAME = "S3_ENDPOINT_URL"
+SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"

# SELinux Enabled
SELINUX_ENABLED = os.environ.get("SAGEMAKER_LOCAL_SELINUX_ENABLED", "False").lower() in [
@@ -107,10 +109,30 @@ def __init__(
        # Since we are using a single docker network, Generate a random suffix to attach to the
        # container names. This way multiple jobs can run in parallel.
        suffix = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(5))
-        self.hosts = [
-            "{}-{}-{}".format(CONTAINER_PREFIX, i, suffix)
-            for i in range(1, self.instance_count + 1)
-        ]
+        self.is_studio = sagemaker.local.utils.check_for_studio()
+        if self.is_studio:
+            if self.instance_count > 1:
+                raise NotImplementedError(
+                    "Multi instance Local Mode execution is "
+                    "currently not supported in SageMaker Studio."
+                )
+            # For studio use-case, directories need to be created in `~/tmp`, rather than /tmp
+            home = os.path.expanduser("~")
+            root_dir = os.path.join(home, "tmp")
+            if not os.path.isdir(root_dir):
+                os.mkdir(root_dir)
+            if self.sagemaker_session.config:
+                self.sagemaker_session.config["local"]["container_root"] = root_dir
+            else:
+                self.sagemaker_session.config = {"local": {"container_root": root_dir}}
+            # Studio only supports single instance run
+            self.hosts = [STUDIO_HOST_NAME]
+        else:
+            self.hosts = [
+                "{}-{}-{}".format(CONTAINER_PREFIX, i, suffix)
+                for i in range(1, self.instance_count + 1)
+            ]
+
        self.container_root = None
        self.container = None
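Note: `check_for_studio()` lives in `sagemaker.local.utils` and is not shown in this hunk. The sketch below is only an illustration of how such a detection helper could work; the metadata path, key name, and app-type values are assumptions, not taken from this diff.

import json
import os

# Hypothetical sketch of a Studio detector (not the SDK's actual implementation).
_ASSUMED_METADATA_PATH = "/opt/ml/metadata/resource-metadata.json"  # assumed location
_ASSUMED_STUDIO_APP_TYPES = ("KernelGateway", "CodeEditor", "JupyterLab")  # assumed values

def check_for_studio_sketch():
    """Best-effort guess at whether we are running inside a SageMaker Studio app."""
    if not os.path.exists(_ASSUMED_METADATA_PATH):
        return False
    with open(_ASSUMED_METADATA_PATH, "r") as f:
        metadata = json.load(f)
    # Studio apps are assumed to report their app type in the resource metadata.
    return metadata.get("AppType") in _ASSUMED_STUDIO_APP_TYPES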
@@ -201,22 +223,17 @@ def process(
        self._generate_compose_file(
            "process", additional_volumes=volumes, additional_env_vars=environment
        )
-        compose_command = self._compose()

        if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
            _pull_image(self.image)

+        compose_command = self._compose()
        process = subprocess.Popen(
            compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )

        try:
            _stream_output(process)
-        except RuntimeError as e:
-            # _stream_output() doesn't have the command line. We will handle the exception
-            # which contains the exit code and append the command line to it.
-            msg = f"Failed to run: {compose_command}"
-            raise RuntimeError(msg) from e
        finally:
            # Uploading processing outputs back to Amazon S3.
            self._upload_processing_outputs(data_dir, processing_output_config)
@@ -283,22 +300,17 @@ def train(self, input_data_config, output_data_config, hyperparameters, environm
        compose_data = self._generate_compose_file(
            "train", additional_volumes=volumes, additional_env_vars=training_env_vars
        )
-        compose_command = self._compose()

        if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
            _pull_image(self.image)

+        compose_command = self._compose()
        process = subprocess.Popen(
            compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )

        try:
            _stream_output(process)
-        except RuntimeError as e:
-            # _stream_output() doesn't have the command line. We will handle the exception
-            # which contains the exit code and append the command line to it.
-            msg = "Failed to run: %s, %s" % (compose_command, str(e))
-            raise RuntimeError(msg)
        finally:
            artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
@@ -347,6 +359,7 @@ def serve(self, model_dir, environment):
        self._generate_compose_file(
            "serve", additional_env_vars=environment, additional_volumes=volumes
        )
+
        compose_command = self._compose()

        self.container = _HostingContainer(compose_command)
@@ -710,6 +723,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
        additional_env_var_list = ["{}={}".format(k, v) for k, v in additional_env_vars.items()]
        environment.extend(additional_env_var_list)

+        if self.is_studio:
+            environment.extend([f"{SM_STUDIO_LOCAL_MODE}=True"])
+
        if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None:
            os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT
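The new `SM_STUDIO_LOCAL_MODE=True` entry only injects an environment variable into the compose services; nothing in this hunk shows how it is consumed. A hypothetical in-container check might read it like this (the handling is illustrative, only the variable name comes from the diff):

import os

# Hypothetical consumer running inside the local-mode container.
if os.environ.get("SM_STUDIO_LOCAL_MODE", "False").lower() == "true":
    # Running under SageMaker Studio Local Mode, e.g. avoid assumptions
    # about a compose-managed "sagemaker-local" network.
    pass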
@@ -723,12 +739,19 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
            for h in self.hosts
        }

-        content = {
-            # Use version 2.3 as a minimum so that we can specify the runtime
-            "version": "2.3",
-            "services": services,
-            "networks": {"sagemaker-local": {"name": "sagemaker-local"}},
-        }
+        if self.is_studio:
+            content = {
+                # Use version 2.3 as a minimum so that we can specify the runtime
+                "version": "2.3",
+                "services": services,
+            }
+        else:
+            content = {
+                # Use version 2.3 as a minimum so that we can specify the runtime
+                "version": "2.3",
+                "services": services,
+                "networks": {"sagemaker-local": {"name": "sagemaker-local"}},
+            }

        docker_compose_path = os.path.join(self.container_root, DOCKER_COMPOSE_FILENAME)
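For illustration only, dumping the two `content` variants with PyYAML shows the practical difference: the Studio branch writes no top-level `networks` section, so Docker Compose will not try to create the `sagemaker-local` network. The service definition below is a made-up placeholder.

import yaml

services = {"sagemaker-local": {"image": "example-image:latest"}}  # placeholder

studio_content = {"version": "2.3", "services": services}
default_content = {
    "version": "2.3",
    "services": services,
    "networks": {"sagemaker-local": {"name": "sagemaker-local"}},
}

# Studio variant: only "version" and "services" keys appear in docker-compose.yaml.
print(yaml.dump(studio_content, default_flow_style=False))
# Default variant additionally declares the "sagemaker-local" network.
print(yaml.dump(default_content, default_flow_style=False))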
@@ -810,7 +833,6 @@ def _create_docker_host(
            "tty": True,
            "volumes": [v.map for v in optml_volumes],
            "environment": environment,
-            "networks": {"sagemaker-local": {"aliases": [host]}},
        }

        is_train_with_entrypoint = False
@@ -827,14 +849,19 @@ def _create_docker_host(
        if self.container_arguments:
            host_config["entrypoint"] = host_config["entrypoint"] + self.container_arguments

+        if self.is_studio:
+            host_config["network_mode"] = "sagemaker"
+        else:
+            host_config["networks"] = {"sagemaker-local": {"aliases": [host]}}
+
        # for GPU support pass in nvidia as the runtime, this is equivalent
        # to setting --runtime=nvidia in the docker commandline.
        if self.instance_type == "local_gpu":
            host_config["deploy"] = {
                "resources": {"reservations": {"devices": [{"capabilities": ["gpu"]}]}}
            }

-        if command == "serve":
+        if not self.is_studio and command == "serve":
            serving_port = (
                sagemaker.utils.get_config_value(
                    "local.serving_port", self.sagemaker_session.config
@@ -910,7 +937,7 @@ def __init__(self, command):
        """Creates a new threaded hosting container.

        Args:
-            command:
+            command (list): docker compose command
        """
        Thread.__init__(self)
        self.command = command
@@ -987,8 +1014,8 @@ def _stream_output(process):
        sys.stdout.write(stdout)
        exit_code = process.poll()

-    if exit_code != 0:
-        raise RuntimeError(" Process exited with code: %s" % exit_code)
+    if exit_code not in [0, 130]:
+        raise RuntimeError(f"Failed to run: {process.args}. Process exited with code: {exit_code}")

    return exit_code
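`subprocess.Popen` exposes the invoked command line as `Popen.args` (Python 3.3+), which is why the callers above no longer need to re-wrap the RuntimeError with the compose command, and exit code 130 (128 + SIGINT, e.g. stopping a local serving container with Ctrl+C) is now treated as a clean shutdown rather than a failure. A small standalone illustration of the new behaviour, using a harmless placeholder command:

import subprocess
import sys

proc = subprocess.Popen(
    [sys.executable, "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
proc.communicate()
exit_code = proc.poll()
if exit_code not in [0, 130]:
    # Popen.args carries the full command line, so the message is self-describing.
    raise RuntimeError(f"Failed to run: {proc.args}. Process exited with code: {exit_code}")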