|
41 | 41 | secondary_training_status_changed,
|
42 | 42 | secondary_training_status_message,
|
43 | 43 | sts_regional_endpoint,
|
| 44 | + retries, |
44 | 45 | )
|
45 | 46 | from sagemaker import exceptions
|
46 | 47 | from sagemaker.session_settings import SessionSettings
|
@@ -4699,21 +4700,30 @@ def _train_done(sagemaker_client, job_name, last_desc):
|
4699 | 4700 | """Placeholder docstring"""
|
4700 | 4701 | in_progress_statuses = ["InProgress", "Created"]
|
4701 | 4702 |
|
4702 |
| - desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) |
4703 |
| - status = desc["TrainingJobStatus"] |
| 4703 | + for _ in retries( |
| 4704 | + max_retry_count=10, # 10*30 = 5min |
| 4705 | + exception_message_prefix="Waiting for schedule to leave 'Pending' status", |
| 4706 | + seconds_to_sleep=30, |
| 4707 | + ): |
| 4708 | + try: |
| 4709 | + desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) |
| 4710 | + status = desc["TrainingJobStatus"] |
4704 | 4711 |
|
4705 |
| - if secondary_training_status_changed(desc, last_desc): |
4706 |
| - print() |
4707 |
| - print(secondary_training_status_message(desc, last_desc), end="") |
4708 |
| - else: |
4709 |
| - print(".", end="") |
4710 |
| - sys.stdout.flush() |
| 4712 | + if secondary_training_status_changed(desc, last_desc): |
| 4713 | + print() |
| 4714 | + print(secondary_training_status_message(desc, last_desc), end="") |
| 4715 | + else: |
| 4716 | + print(".", end="") |
| 4717 | + sys.stdout.flush() |
4711 | 4718 |
|
4712 |
| - if status in in_progress_statuses: |
4713 |
| - return desc, False |
| 4719 | + if status in in_progress_statuses: |
| 4720 | + return desc, False |
4714 | 4721 |
|
4715 |
| - print() |
4716 |
| - return desc, True |
| 4722 | + print() |
| 4723 | + return desc, True |
| 4724 | + except botocore.exceptions.ClientError as err: |
| 4725 | + if err.response["Error"]["Code"] == "AccessDeniedException": |
| 4726 | + pass |
4717 | 4727 |
|
4718 | 4728 |
|
4719 | 4729 | def _processing_job_status(sagemaker_client, job_name):
|
|
0 commit comments