@@ -3223,14 +3223,11 @@ def create_model_package_from_containers(
3223
3223
3224
3224
def submit (request ):
3225
3225
if model_package_group_name is not None :
3226
- try :
3227
- self .sagemaker_client .describe_model_package_group (
3228
- ModelPackageGroupName = request ["ModelPackageGroupName" ]
3229
- )
3230
- except ClientError :
3231
- self .sagemaker_client .create_model_package_group (
3226
+ _create_resource (
3227
+ lambda : self .sagemaker_client .create_model_package_group (
3232
3228
ModelPackageGroupName = request ["ModelPackageGroupName" ]
3233
3229
)
3230
+ )
3234
3231
return self .sagemaker_client .create_model_package (** request )
3235
3232
3236
3233
return self ._intercept_create_request (
@@ -3918,42 +3915,40 @@ def endpoint_from_model_data(
3918
3915
name = name or name_from_image (image_uri )
3919
3916
model_vpc_config = vpc_utils .sanitize (model_vpc_config )
3920
3917
3921
- if _deployment_entity_exists (
3922
- lambda : self .sagemaker_client .describe_endpoint (EndpointName = name )
3923
- ):
3924
- raise ValueError (
3925
- 'Endpoint with name "{}" already exists; please pick a different name.' .format (name )
3926
- )
3918
+ primary_container = container_def (
3919
+ image_uri = image_uri ,
3920
+ model_data_url = model_s3_location ,
3921
+ env = model_environment_vars ,
3922
+ )
3927
3923
3928
- if not _deployment_entity_exists (
3929
- lambda : self .sagemaker_client .describe_model (ModelName = name )
3930
- ):
3931
- primary_container = container_def (
3932
- image_uri = image_uri ,
3933
- model_data_url = model_s3_location ,
3934
- env = model_environment_vars ,
3935
- )
3936
- self .create_model (
3937
- name = name , role = role , container_defs = primary_container , vpc_config = model_vpc_config
3938
- )
3924
+ self .create_model (
3925
+ name = name , role = role , container_defs = primary_container , vpc_config = model_vpc_config
3926
+ )
3939
3927
3940
3928
data_capture_config_dict = None
3941
3929
if data_capture_config is not None :
3942
3930
data_capture_config_dict = data_capture_config ._to_request_dict ()
3943
3931
3944
- if not _deployment_entity_exists (
3945
- lambda : self .sagemaker_client .describe_endpoint_config (EndpointConfigName = name )
3946
- ):
3947
- self .create_endpoint_config (
3932
+ _create_resource (
3933
+ lambda : self .create_endpoint_config (
3948
3934
name = name ,
3949
3935
model_name = name ,
3950
3936
initial_instance_count = initial_instance_count ,
3951
3937
instance_type = instance_type ,
3952
3938
accelerator_type = accelerator_type ,
3953
3939
data_capture_config_dict = data_capture_config_dict ,
3954
3940
)
3941
+ )
3942
+
3943
+ # to make change backwards compatible
3944
+ response = _create_resource (
3945
+ lambda : self .create_endpoint (endpoint_name = name , config_name = name , wait = wait )
3946
+ )
3947
+ if not response :
3948
+ raise ValueError (
3949
+ 'Endpoint with name "{}" already exists; please pick a different name.' .format (name )
3950
+ )
3955
3951
3956
- self .create_endpoint (endpoint_name = name , config_name = name , wait = wait )
3957
3952
return name
3958
3953
3959
3954
def endpoint_from_production_variants (
@@ -5452,34 +5447,54 @@ def _deployment_entity_exists(describe_fn):
5452
5447
return False
5453
5448
5454
5449
5450
+ def _create_resource (create_fn ):
5451
+ """Call create function and accepts/pass when resource already exists.
5452
+
5453
+ This is a helper function to use an existing resource if found when creating.
5454
+
5455
+ Args:
5456
+ create_fn: Create resource function.
5457
+
5458
+ Returns:
5459
+ (bool): True if new resource was created, False if resource already exists.
5460
+ """
5461
+ try :
5462
+ create_fn ()
5463
+ # create function succeeded, resource does not exist already
5464
+ return True
5465
+ except ClientError as ce :
5466
+ error_code = ce .response ["Error" ]["Code" ]
5467
+ error_message = ce .response ["Error" ]["Message" ]
5468
+ already_exists_exceptions = ["ValidationException" , "ResourceInUse" ]
5469
+ already_exists_msg_patterns = ["Cannot create already existing" , "already exists" ]
5470
+ if not (
5471
+ error_code in already_exists_exceptions
5472
+ and any (p in error_message for p in already_exists_msg_patterns )
5473
+ ):
5474
+ raise ce
5475
+ # no new resource created as resource already exists
5476
+ return False
5477
+
5478
+
5455
5479
def _train_done (sagemaker_client , job_name , last_desc ):
5456
5480
"""Placeholder docstring"""
5457
5481
in_progress_statuses = ["InProgress" , "Created" ]
5458
5482
5459
- for _ in retries (
5460
- max_retry_count = 10 , # 10*30 = 5min
5461
- exception_message_prefix = "Waiting for schedule to leave 'Pending' status" ,
5462
- seconds_to_sleep = 30 ,
5463
- ):
5464
- try :
5465
- desc = sagemaker_client .describe_training_job (TrainingJobName = job_name )
5466
- status = desc ["TrainingJobStatus" ]
5483
+ desc = sagemaker_client .describe_training_job (TrainingJobName = job_name )
5484
+ status = desc ["TrainingJobStatus" ]
5467
5485
5468
- if secondary_training_status_changed (desc , last_desc ):
5469
- print ()
5470
- print (secondary_training_status_message (desc , last_desc ), end = "" )
5471
- else :
5472
- print ("." , end = "" )
5473
- sys .stdout .flush ()
5486
+ if secondary_training_status_changed (desc , last_desc ):
5487
+ print ()
5488
+ print (secondary_training_status_message (desc , last_desc ), end = "" )
5489
+ else :
5490
+ print ("." , end = "" )
5491
+ sys .stdout .flush ()
5474
5492
5475
- if status in in_progress_statuses :
5476
- return desc , False
5493
+ if status in in_progress_statuses :
5494
+ return desc , False
5477
5495
5478
- print ()
5479
- return desc , True
5480
- except botocore .exceptions .ClientError as err :
5481
- if err .response ["Error" ]["Code" ] == "AccessDeniedException" :
5482
- pass
5496
+ print ()
5497
+ return desc , True
5483
5498
5484
5499
5485
5500
def _processing_job_status (sagemaker_client , job_name ):
@@ -5799,19 +5814,54 @@ def _deploy_done(sagemaker_client, endpoint_name):
5799
5814
5800
5815
def _wait_until_training_done (callable_fn , desc , poll = 5 ):
5801
5816
"""Placeholder docstring"""
5802
- job_desc , finished = callable_fn (desc )
5817
+ elapsed_time = 0
5818
+ finished = None
5819
+ job_desc = desc
5803
5820
while not finished :
5804
- time .sleep (poll )
5805
- job_desc , finished = callable_fn (job_desc )
5821
+ try :
5822
+ elapsed_time += poll
5823
+ time .sleep (poll )
5824
+ job_desc , finished = callable_fn (job_desc )
5825
+ except botocore .exceptions .ClientError as err :
5826
+ # For initial 5 mins we accept/pass AccessDeniedException.
5827
+ # The reason is to await tag propagation to avoid false AccessDenied claims for an
5828
+ # access policy based on resource tags, The caveat here is for true AccessDenied
5829
+ # cases the routine will fail after 5 mins
5830
+ if err .response ["Error" ]["Code" ] == "AccessDeniedException" and elapsed_time <= 300 :
5831
+ LOGGER .warning (
5832
+ "Received AccessDeniedException. This could mean the IAM role does not "
5833
+ "have the resource permissions, in which case please add resource access "
5834
+ "and retry. For cases where the role has tag based resource policy, "
5835
+ "continuing to wait for tag propagation.."
5836
+ )
5837
+ continue
5838
+ raise err
5806
5839
return job_desc
5807
5840
5808
5841
5809
5842
def _wait_until (callable_fn , poll = 5 ):
5810
5843
"""Placeholder docstring"""
5811
- result = callable_fn ()
5844
+ elapsed_time = 0
5845
+ result = None
5812
5846
while result is None :
5813
- time .sleep (poll )
5814
- result = callable_fn ()
5847
+ try :
5848
+ elapsed_time += poll
5849
+ time .sleep (poll )
5850
+ result = callable_fn ()
5851
+ except botocore .exceptions .ClientError as err :
5852
+ # For initial 5 mins we accept/pass AccessDeniedException.
5853
+ # The reason is to await tag propagation to avoid false AccessDenied claims for an
5854
+ # access policy based on resource tags, The caveat here is for true AccessDenied
5855
+ # cases the routine will fail after 5 mins
5856
+ if err .response ["Error" ]["Code" ] == "AccessDeniedException" and elapsed_time <= 300 :
5857
+ LOGGER .warning (
5858
+ "Received AccessDeniedException. This could mean the IAM role does not "
5859
+ "have the resource permissions, in which case please add resource access "
5860
+ "and retry. For cases where the role has tag based resource policy, "
5861
+ "continuing to wait for tag propagation.."
5862
+ )
5863
+ continue
5864
+ raise err
5815
5865
return result
5816
5866
5817
5867
0 commit comments