Skip to content

Test stability - increase retries on CloudWatch Logs client + set training timeout for test_cifar #159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/sagemaker/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import json
import six
import yaml
import botocore.config
from botocore.exceptions import ClientError

from sagemaker.user_agent import prepend_user_agent
Expand Down Expand Up @@ -549,7 +550,7 @@ def get_caller_identity_arn(self):
role = re.sub(r'^(.+)sts::(\d+):assumed-role/(.+?)/.*$', r'\1iam::\2:role/\3', assumed_role)
return role

def logs_for_job(self, job_name, wait=False, poll=5): # noqa: C901 - suppress complexity warning for this method
def logs_for_job(self, job_name, wait=False, poll=10): # noqa: C901 - suppress complexity warning for this method
"""Display the logs for a given training job, optionally tailing them until the
job is complete. If the output is a tty or a Jupyter cell, it will be color-coded
based on which instance the log entry is from.
Expand All @@ -569,7 +570,11 @@ def logs_for_job(self, job_name, wait=False, poll=5): # noqa: C901 - suppress c

stream_names = [] # The list of log streams
positions = {} # The current position in each stream, map of stream name -> position
client = self.boto_session.client('logs')

# Increase the number of retries allowed (from the default of 4) so that waiting for a training job
# is not interrupted by a transient exception.
config = botocore.config.Config(retries={'max_attempts': 15})
client = self.boto_session.client('logs', config=config)
log_group = '/aws/sagemaker/TrainingJobs'

job_already_completed = True if status == 'Completed' or status == 'Failed' else False
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/test_tf_cifar.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_cifar(sagemaker_session, tf_full_version):
estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
framework_version=tf_full_version, training_steps=20, evaluation_steps=5,
train_instance_count=2, train_instance_type='ml.p2.xlarge',
sagemaker_session=sagemaker_session,
sagemaker_session=sagemaker_session, train_max_run=20 * 60,
base_job_name='test-cifar')

inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
Expand Down