Commit 645e10e

Merge branch 'master' into edit-readme
2 parents: 00ebc3a + 6ac8f08

17 files changed: +72 -68 lines

src/sagemaker/local/image.py

Lines changed: 4 additions & 19 deletions

@@ -26,7 +26,6 @@
  import sys
  import tarfile
  import tempfile
- from fcntl import fcntl, F_GETFL, F_SETFL
  from six.moves.urllib.parse import urlparse
  from threading import Thread

@@ -105,7 +104,7 @@ def train(self, input_data_config, hyperparameters):
          compose_command = self._compose()

          _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image)
-         process = subprocess.Popen(compose_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         process = subprocess.Popen(compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

          try:
              _stream_output(process)

@@ -555,34 +554,20 @@ def __init__(self, host_dir, container_dir=None, channel=None):
  def _stream_output(process):
      """Stream the output of a process to stdout

-     This function takes an existing process that will be polled for output. Both stdout and
-     stderr will be polled and both will be sent to sys.stdout.
+     This function takes an existing process that will be polled for output. Only stdout
+     will be polled and sent to sys.stdout.

      Args:
          process(subprocess.Popen): a process that has been started with
-             stdout=PIPE and stderr=PIPE
+             stdout=PIPE and stderr=STDOUT

      Returns (int): process exit code
      """
      exit_code = None

-     # Get the current flags for the stderr file descriptor
-     # And add the NONBLOCK flag to allow us to read even if there is no data.
-     # Since usually stderr will be empty unless there is an error.
-     flags = fcntl(process.stderr, F_GETFL)  # get current process.stderr flags
-     fcntl(process.stderr, F_SETFL, flags | os.O_NONBLOCK)
-
      while exit_code is None:
          stdout = process.stdout.readline().decode("utf-8")
          sys.stdout.write(stdout)
-         try:
-             stderr = process.stderr.readline().decode("utf-8")
-             sys.stdout.write(stderr)
-         except IOError:
-             # If there is nothing to read on stderr we will get an IOError
-             # this is fine.
-             pass
-
          exit_code = process.poll()

      if exit_code != 0:
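
With stderr redirected into stdout via subprocess.STDOUT, a single readline loop now covers both streams and the non-blocking fcntl plumbing becomes unnecessary. A minimal standalone sketch of the same pattern, with a placeholder command not taken from the commit:

    import subprocess
    import sys

    # Placeholder command; any long-running child process works the same way.
    process = subprocess.Popen(['docker-compose', 'up'],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    exit_code = None
    while exit_code is None:
        # stderr is interleaved into the same pipe, so one blocking readline loop suffices.
        line = process.stdout.readline().decode('utf-8')
        sys.stdout.write(line)
        exit_code = process.poll()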

src/sagemaker/tensorflow/README.rst

Lines changed: 24 additions & 8 deletions

@@ -779,8 +779,6 @@ In your ``entry_point`` script, you can use ``PipeModeDataset`` like a ``Dataset

      from sagemaker_tensorflow import PipeModeDataset

-     ds = PipeModeDataset(channel='training', record_format='TFRecord')
-
      features = {
          'data': tf.FixedLenFeature([], tf.string),
          'labels': tf.FixedLenFeature([], tf.int64),

@@ -792,12 +790,13 @@ In your ``entry_point`` script, you can use ``PipeModeDataset`` like a ``Dataset
          'data': tf.decode_raw(parsed['data'], tf.float64)
      }, parsed['labels'])

-     ds = PipeModeDataset(channel='training', record_format='TFRecord')
-     num_epochs = 20
-     ds = ds.repeat(num_epochs)
-     ds = ds.prefetch(10)
-     ds = ds.map(parse, num_parallel_calls=10)
-     ds = ds.batch(64)
+     def train_input_fn(training_dir, hyperparameters):
+         ds = PipeModeDataset(channel='training', record_format='TFRecord')
+         ds = ds.repeat(20)
+         ds = ds.prefetch(10)
+         ds = ds.map(parse, num_parallel_calls=10)
+         ds = ds.batch(64)
+         return ds


  To run training job with Pipe input mode, pass in ``input_mode='Pipe'`` to your TensorFlow Estimator:

@@ -826,6 +825,23 @@ If your TFRecords are compressed, you can train on Gzipped TF Records by passing
  You can learn more about ``PipeModeDataset`` in the sagemaker-tensorflow-extensions repository: https://github.com/aws/sagemaker-tensorflow-extensions


+ Training with MKL-DNN disabled
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ SageMaker TensorFlow CPU images use TensorFlow built with Intel® MKL-DNN optimization.
+
+ In certain cases you might be able to get better performance by disabling this optimization
+ (`for example when using small models <https://github.com/awslabs/amazon-sagemaker-examples/blob/d88d1c19861fb7733941969f5a68821d9da2982e/sagemaker-python-sdk/tensorflow_iris_dnn_classifier_using_estimators/iris_dnn_classifier.py#L7-L9>`_).
+
+ You can disable MKL-DNN optimization for TensorFlow ``1.8.0`` by setting the following two environment variables:
+
+ .. code:: python
+
+     import os
+
+     os.environ['TF_DISABLE_MKL'] = '1'
+     os.environ['TF_DISABLE_POOL_ALLOCATOR'] = '1'
+
  SageMaker TensorFlow Docker containers
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
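
The README excerpt above tells readers to pass input_mode='Pipe' to the TensorFlow Estimator, but the estimator call itself falls outside this hunk. A hedged sketch of what that call looks like with this generation of the SDK; the entry point, role, and instance settings are placeholders:

    from sagemaker.tensorflow import TensorFlow

    # Placeholder entry_point, role, and instance settings, shown only to illustrate input_mode='Pipe'.
    tf_estimator = TensorFlow(entry_point='train.py',
                              role='SageMakerRole',
                              training_steps=10000,
                              evaluation_steps=100,
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              framework_version='1.8.0',
                              input_mode='Pipe')

    tf_estimator.fit('s3://my-bucket/path/to/training/data')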

tests/integ/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -16,6 +16,8 @@
  import os

  DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
+ TRAINING_DEFAULT_TIMEOUT_MINUTES = 20
+ TUNING_DEFAULT_TIMEOUT_MINUTES = 20

  logging.getLogger('boto3').setLevel(logging.INFO)
  logging.getLogger('botocore').setLevel(logging.INFO)
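
These module-level constants replace the hard-coded minutes=15 (and one minutes=5) values in the integration tests below. A hypothetical test skeleton showing how they pair with the existing timeout context manager; the test body is elided:

    from tests.integ import TRAINING_DEFAULT_TIMEOUT_MINUTES
    from tests.integ.timeout import timeout

    def test_example(sagemaker_session):  # hypothetical test, mirrors the diffs that follow
        with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
            pass  # start the training job here; the block is aborted once the shared 20-minute budget elapses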

tests/integ/test_byo_estimator.py

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@
  from sagemaker.amazon.amazon_estimator import registry
  from sagemaker.estimator import Estimator
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


@@ -55,7 +55,7 @@ def test_byo_estimator(sagemaker_session, region):
      image_name = registry(region) + "/factorization-machines:1"
      training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_chainer_train.py

Lines changed: 4 additions & 4 deletions

@@ -22,7 +22,7 @@
  from sagemaker.chainer.estimator import Chainer
  from sagemaker.chainer.model import ChainerModel
  from sagemaker.utils import sagemaker_timestamp
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


@@ -40,7 +40,7 @@ def test_distributed_gpu_training(sagemaker_session, chainer_full_version):


  def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
          data_path = os.path.join(DATA_DIR, 'chainer_mnist')

@@ -101,7 +101,7 @@ def test_async_fit(sagemaker_session):


  def test_failed_training_job(sagemaker_session, chainer_full_version):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
          data_path = os.path.join(DATA_DIR, 'chainer_mnist')

@@ -119,7 +119,7 @@ def test_failed_training_job(sagemaker_session, chainer_full_version):

  def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                              chainer_full_version, wait=True):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):

          script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') if instance_type == 1 else \
              os.path.join(DATA_DIR, 'chainer_mnist', 'distributed_mnist.py')

tests/integ/test_factorization_machines.py

Lines changed: 2 additions & 2 deletions

@@ -22,13 +22,13 @@

  from sagemaker import FactorizationMachines, FactorizationMachinesModel
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_factorization_machines(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_kmeans.py

Lines changed: 2 additions & 2 deletions

@@ -22,13 +22,13 @@

  from sagemaker import KMeans, KMeansModel
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_kmeans(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_knn.py

Lines changed: 2 additions & 2 deletions

@@ -22,13 +22,13 @@

  from sagemaker import KNN, KNNModel
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_knn_regressor(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_lda.py

Lines changed: 2 additions & 2 deletions

@@ -20,14 +20,14 @@
  from sagemaker import LDA, LDAModel
  from sagemaker.amazon.common import read_records
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
  from tests.integ.record_set import prepare_record_set_from_local_files


  @pytest.mark.continuous_testing
  def test_lda(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'lda')
          data_filename = 'nips-train_1.pbr'

tests/integ/test_linear_learner.py

Lines changed: 4 additions & 4 deletions

@@ -23,13 +23,13 @@

  from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
  from sagemaker.utils import name_from_base, sagemaker_timestamp
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_linear_learner(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

@@ -93,7 +93,7 @@ def test_linear_learner(sagemaker_session):


  def test_linear_learner_multiclass(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

@@ -125,7 +125,7 @@ def test_async_linear_learner(sagemaker_session):
      training_job_name = ""
      endpoint_name = 'test-linear-learner-async-{}'.format(sagemaker_timestamp())

-     with timeout(minutes=5):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_mxnet_train.py

Lines changed: 3 additions & 3 deletions

@@ -21,13 +21,13 @@
  from sagemaker.mxnet.estimator import MXNet
  from sagemaker.mxnet.model import MXNetModel
  from sagemaker.utils import sagemaker_timestamp
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.fixture(scope='module')
  def mxnet_training_job(sagemaker_session, mxnet_full_version):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
          data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

@@ -100,7 +100,7 @@ def test_async_fit(sagemaker_session):


  def test_failed_training_job(sagemaker_session, mxnet_full_version):
-     with timeout(minutes=15):
+     with timeout():
          script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
          data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

tests/integ/test_ntm.py

Lines changed: 2 additions & 2 deletions

@@ -20,14 +20,14 @@
  from sagemaker import NTM, NTMModel
  from sagemaker.amazon.common import read_records
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
  from tests.integ.record_set import prepare_record_set_from_local_files


  @pytest.mark.continuous_testing
  def test_ntm(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'ntm')
          data_filename = 'nips-train_1.pbr'

tests/integ/test_pca.py

Lines changed: 2 additions & 2 deletions

@@ -22,13 +22,13 @@

  import sagemaker.amazon.pca
  from sagemaker.utils import name_from_base
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_pca(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
          pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

tests/integ/test_pytorch_train.py

Lines changed: 3 additions & 3 deletions

@@ -19,7 +19,7 @@
  from sagemaker.pytorch.estimator import PyTorch
  from sagemaker.pytorch.model import PyTorchModel
  from sagemaker.utils import sagemaker_timestamp
- from tests.integ import DATA_DIR
+ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

  MNIST_DIR = os.path.join(DATA_DIR, 'pytorch_mnist')

@@ -30,7 +30,7 @@
  @pytest.fixture(scope='module', name='pytorch_training_job')
  def fixture_training_job(sagemaker_session, pytorch_full_version):
      instance_type = 'ml.c4.xlarge'
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)

          pytorch.fit({'training': _upload_training_data(pytorch)})

@@ -103,7 +103,7 @@ def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
  def test_failed_training_job(sagemaker_session, pytorch_full_version):
      script_path = os.path.join(MNIST_DIR, 'failure_script.py')

-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, entry_point=script_path)

          with pytest.raises(ValueError) as e:

tests/integ/test_randomcutforest.py

Lines changed: 2 additions & 1 deletion

@@ -17,12 +17,13 @@

  from sagemaker import RandomCutForest, RandomCutForestModel
  from sagemaker.utils import name_from_base
+ from tests.integ import TRAINING_DEFAULT_TIMEOUT_MINUTES
  from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


  @pytest.mark.continuous_testing
  def test_randomcutforest(sagemaker_session):
-     with timeout(minutes=15):
+     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
          # Generate a thousand 14-dimensional datapoints.
          feature_num = 14
          train_input = np.random.rand(1000, feature_num)
