Skip to content

Commit 5bc3ccf

Browse files
fix: Add retry in session.py to check if training is finished (aws#3285)
Co-authored-by: Basil Beirouti <[email protected]>
1 parent 277f818 commit 5bc3ccf

File tree

1 file changed

+22
-12
lines changed

1 file changed

+22
-12
lines changed

src/sagemaker/session.py

+22 -12
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
secondary_training_status_changed,
4242
secondary_training_status_message,
4343
sts_regional_endpoint,
44+
retries,
4445
)
4546
from sagemaker import exceptions
4647
from sagemaker.session_settings import SessionSettings
@@ -4699,21 +4700,30 @@ def _train_done(sagemaker_client, job_name, last_desc):
46994700
"""Placeholder docstring"""
47004701
in_progress_statuses = ["InProgress", "Created"]
47014702

4702-
desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
4703-
status = desc["TrainingJobStatus"]
4703+
for _ in retries(
4704+
max_retry_count=10, # 10*30 = 5min
4705+
exception_message_prefix="Waiting for schedule to leave 'Pending' status",
4706+
seconds_to_sleep=30,
4707+
):
4708+
try:
4709+
desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
4710+
status = desc["TrainingJobStatus"]
47044711

4705-
if secondary_training_status_changed(desc, last_desc):
4706-
print()
4707-
print(secondary_training_status_message(desc, last_desc), end="")
4708-
else:
4709-
print(".", end="")
4710-
sys.stdout.flush()
4712+
if secondary_training_status_changed(desc, last_desc):
4713+
print()
4714+
print(secondary_training_status_message(desc, last_desc), end="")
4715+
else:
4716+
print(".", end="")
4717+
sys.stdout.flush()
47114718

4712-
if status in in_progress_statuses:
4713-
return desc, False
4719+
if status in in_progress_statuses:
4720+
return desc, False
47144721

4715-
print()
4716-
return desc, True
4722+
print()
4723+
return desc, True
4724+
except botocore.exceptions.ClientError as err:
4725+
if err.response["Error"]["Code"] == "AccessDeniedException":
4726+
pass
47174727

47184728

47194729
def _processing_job_status(sagemaker_client, job_name):

0 commit comments

Comments
 (0)