23
23
import json
24
24
import six
25
25
import yaml
26
+ import botocore .config
26
27
from botocore .exceptions import ClientError
27
28
28
29
from sagemaker .user_agent import prepend_user_agent
@@ -549,7 +550,7 @@ def get_caller_identity_arn(self):
549
550
role = re .sub (r'^(.+)sts::(\d+):assumed-role/(.+?)/.*$' , r'\1iam::\2:role/\3' , assumed_role )
550
551
return role
551
552
552
- def logs_for_job (self , job_name , wait = False , poll = 5 ): # noqa: C901 - suppress complexity warning for this method
553
+ def logs_for_job (self , job_name , wait = False , poll = 10 ): # noqa: C901 - suppress complexity warning for this method
553
554
"""Display the logs for a given training job, optionally tailing them until the
554
555
job is complete. If the output is a tty or a Jupyter cell, it will be color-coded
555
556
based on which instance the log entry is from.
@@ -569,7 +570,11 @@ def logs_for_job(self, job_name, wait=False, poll=5): # noqa: C901 - suppress c
569
570
570
571
stream_names = [] # The list of log streams
571
572
positions = {} # The current position in each stream, map of stream name -> position
572
- client = self .boto_session .client ('logs' )
573
+
574
+ # Increase retries allowed (from default of 4), as we don't want waiting for a training job
575
+ # to be interrupted by a transient exception.
576
+ config = botocore .config .Config (retries = {'max_attempts' : 15 })
577
+ client = self .boto_session .client ('logs' , config = config )
573
578
log_group = '/aws/sagemaker/TrainingJobs'
574
579
575
580
job_already_completed = True if status == 'Completed' or status == 'Failed' else False
0 commit comments