Skip to content

breaking: require framework_version, py_version for tensorflow #1580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/frameworks/tensorflow/upgrade_from_legacy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ the difference in code would be as follows:
...
source_dir="code",
framework_version="1.10.0",
py_version="py3",
train_instance_type="ml.m4.xlarge",
image_name="520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2",
hyperparameters={
Expand Down
5 changes: 3 additions & 2 deletions doc/frameworks/tensorflow/using_tf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ To run training job with Pipe input mode, pass in ``input_mode='Pipe'`` to your
tf_estimator = TensorFlow(entry_point='tf-train-with-pipemodedataset.py', role='SageMakerRole',
training_steps=10000, evaluation_steps=100,
train_instance_count=1, train_instance_type='ml.p2.xlarge',
framework_version='1.10.0', input_mode='Pipe')
framework_version='1.10.0', py_version='py3', input_mode='Pipe')

tf_estimator.fit('s3://bucket/path/to/training/data')

Expand Down Expand Up @@ -383,7 +383,8 @@ estimator object to create a SageMaker Endpoint:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf-train.py', ..., train_instance_count=1,
train_instance_type='ml.c4.xlarge', framework_version='1.11')
train_instance_type='ml.c4.xlarge', framework_version='1.11',
py_version='py3')

estimator.fit(inputs)

Expand Down
4 changes: 3 additions & 1 deletion src/sagemaker/rl/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,9 @@ def create_model(
)

if self.framework == RLFramework.TENSORFLOW.value:
return TensorFlowModel(framework_version=self.framework_version, **base_args)
return TensorFlowModel(
framework_version=self.framework_version, py_version=PYTHON_VERSION, **base_args
)
if self.framework == RLFramework.MXNET.value:
return MXNetModel(
framework_version=self.framework_version, py_version=PYTHON_VERSION, **extended_args
Expand Down
30 changes: 15 additions & 15 deletions src/sagemaker/tensorflow/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ def __init__(

Args:
py_version (str): Python version you want to use for executing your model training
code (default: 'py2').
code. One of 'py2', 'py3', or 'py37'. Defaults to ``None``. Required unless
``image_name`` is provided.
framework_version (str): TensorFlow version you want to use for executing your model
training code. List of supported versions
training code. Defaults to ``None``. Required unless ``image_name`` is provided.
List of supported versions:
https://github.com/aws/sagemaker-python-sdk#tensorflow-sagemaker-estimators.
If not specified, this will default to 1.11.
model_dir (str): S3 location where the checkpoint data and models can be exported to
during training (default: None). It will be passed in the training script as one of
the command line arguments. If not specified, one is provided based on
Expand All @@ -81,6 +82,10 @@ def __init__(
Examples:
123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0
custom-image:latest.

If ``framework_version`` or ``py_version`` are ``None``, then
``image_name`` is required. If also ``None``, then a ``ValueError``
will be raised.
distributions (dict): A dictionary with information on how to run distributed training
(default: None). Currently we support distributed training with parameter servers
and MPI.
Expand Down Expand Up @@ -114,18 +119,13 @@ def __init__(
:class:`~sagemaker.estimator.Framework` and
:class:`~sagemaker.estimator.EstimatorBase`.
"""
if framework_version is None:
logger.warning(
fw.empty_framework_version_warning(defaults.TF_VERSION, self.LATEST_VERSION)
)
self.framework_version = framework_version or defaults.TF_VERSION

if not py_version:
py_version = "py3" if self._only_python_3_supported() else "py2"
fw.validate_version_or_image_args(framework_version, py_version, image_name)
if py_version == "py2":
logger.warning(
fw.python_deprecation_warning(self.__framework_name__, defaults.LATEST_PY2_VERSION)
)
self.framework_version = framework_version
self.py_version = py_version

if distributions is not None:
logger.warning(fw.parameter_v2_rename_warning("distribution", distributions))
Expand All @@ -136,12 +136,11 @@ def __init__(

if "enable_sagemaker_metrics" not in kwargs:
# enable sagemaker metrics for TF v1.15 or greater:
if fw.is_version_equal_or_higher([1, 15], self.framework_version):
if framework_version and fw.is_version_equal_or_higher([1, 15], framework_version):
kwargs["enable_sagemaker_metrics"] = True

super(TensorFlow, self).__init__(image_name=image_name, **kwargs)

self.py_version = py_version
self.model_dir = model_dir
self.distributions = distributions or {}

Expand All @@ -150,7 +149,7 @@ def __init__(
def _validate_args(self, py_version, framework_version):
"""Placeholder docstring"""

if py_version == "py3":
if py_version:
if framework_version is None:
raise AttributeError(fw.EMPTY_FRAMEWORK_VERSION_ERROR)

Expand All @@ -161,7 +160,7 @@ def _validate_args(self, py_version, framework_version):
)
raise AttributeError(msg)

if self._only_legacy_mode_supported() and self.image_name is None:
if self.image_name is None and self._only_legacy_mode_supported():
legacy_image_uri = fw.create_image_uri(
self.sagemaker_session.boto_region_name,
"tensorflow",
Expand Down Expand Up @@ -294,6 +293,7 @@ def create_model(
role=role or self.role,
container_log_level=self.container_log_level,
framework_version=self.framework_version,
py_version=self.py_version,
sagemaker_session=self.sagemaker_session,
vpc_config=self.get_vpc_config(vpc_config_override),
entry_point=entry_point,
Expand Down
22 changes: 16 additions & 6 deletions src/sagemaker/tensorflow/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@

import sagemaker
from sagemaker.content_types import CONTENT_TYPE_JSON
from sagemaker.fw_utils import create_image_uri
from sagemaker.fw_utils import create_image_uri, validate_version_or_image_args
from sagemaker.predictor import json_serializer, json_deserializer
from sagemaker.tensorflow.defaults import TF_VERSION


class TensorFlowPredictor(sagemaker.RealTimePredictor):
Expand Down Expand Up @@ -138,7 +137,8 @@ def __init__(
role,
entry_point=None,
image=None,
framework_version=TF_VERSION,
framework_version=None,
py_version=None,
container_log_level=None,
predictor_cls=TensorFlowPredictor,
**kwargs
Expand All @@ -158,9 +158,16 @@ def __init__(
hosting. If ``source_dir`` is specified, then ``entry_point``
must point to a file located at the root of ``source_dir``.
image (str): A Docker image URI (default: None). If not specified, a
default image for TensorFlow Serving will be used.
default image for TensorFlow Serving will be used. If
``framework_version`` or ``py_version`` are ``None``, then
``image`` is required. If also ``None``, then a ``ValueError``
will be raised.
framework_version (str): Optional. TensorFlow Serving version you
want to use.
want to use. Defaults to ``None``. Required unless ``image`` is
provided.
py_version (str): Python version you want to use for executing your
model training code. One of 'py2', 'py3', or 'py37'. Defaults to
``None``. Required unless ``image`` is provided.
container_log_level (int): Log level to use within the container
(default: logging.ERROR). Valid values are defined in the Python
logging module.
Expand All @@ -176,6 +183,10 @@ def __init__(
:class:`~sagemaker.model.FrameworkModel` and
:class:`~sagemaker.model.Model`.
"""
validate_version_or_image_args(framework_version, py_version, image)
self.framework_version = framework_version
self.py_version = py_version

super(TensorFlowModel, self).__init__(
model_data=model_data,
role=role,
Expand All @@ -184,7 +195,6 @@ def __init__(
entry_point=entry_point,
**kwargs
)
self.framework_version = framework_version
self._container_log_level = container_log_level

def deploy(
Expand Down
4 changes: 4 additions & 0 deletions tests/integ/test_data_capture_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from sagemaker.model_monitor import DataCaptureConfig, NetworkConfig
from sagemaker.tensorflow.model import TensorFlowModel
from sagemaker.utils import unique_name_from_base
from tests.integ import PYTHON_VERSION
from tests.integ.retry import retries

ROLE = "SageMakerRole"
Expand Down Expand Up @@ -53,6 +54,7 @@ def test_enabling_data_capture_on_endpoint_shows_correct_data_capture_status(
model_data=model_data,
role=ROLE,
framework_version=tf_serving_version,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
)
predictor = model.deploy(
Expand Down Expand Up @@ -110,6 +112,7 @@ def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status(
model_data=model_data,
role=ROLE,
framework_version=tf_serving_version,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
)
destination_s3_uri = os.path.join(
Expand Down Expand Up @@ -196,6 +199,7 @@ def test_updating_data_capture_on_endpoint_shows_correct_data_capture_status(
model_data=model_data,
role=ROLE,
framework_version=tf_serving_version,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
)
destination_s3_uri = os.path.join(
Expand Down
3 changes: 2 additions & 1 deletion tests/integ/test_model_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from sagemaker.s3 import S3Uploader
from datetime import datetime, timedelta

from tests.integ import DATA_DIR
from tests.integ import DATA_DIR, PYTHON_VERSION
from sagemaker.model_monitor import DatasetFormat
from sagemaker.model_monitor import NetworkConfig, Statistics, Constraints
from sagemaker.model_monitor import ModelMonitor
Expand Down Expand Up @@ -101,6 +101,7 @@ def predictor(sagemaker_session, tf_serving_version):
model_data=model_data,
role=ROLE,
framework_version=tf_serving_version,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
)
predictor = model.deploy(
Expand Down
31 changes: 15 additions & 16 deletions tests/integ/test_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@
import pytest

from sagemaker.tensorflow import TensorFlow
from sagemaker.tensorflow.defaults import LATEST_SERVING_VERSION
from sagemaker.tensorflow.defaults import LATEST_VERSION, LATEST_SERVING_VERSION
from sagemaker.utils import unique_name_from_base, sagemaker_timestamp

import tests.integ
from tests.integ import timeout
from tests.integ import kms_utils
from tests.integ import kms_utils, timeout, PYTHON_VERSION
from tests.integ.retry import retries
from tests.integ.s3_utils import assert_s3_files_exist

Expand All @@ -39,10 +38,12 @@
MPI_DISTRIBUTION = {"mpi": {"enabled": True}}
TAGS = [{"Key": "some-key", "Value": "some-value"}]

PY37_SUPPORTED_FRAMEWORK_VERSION = [TensorFlow._LATEST_1X_VERSION, LATEST_VERSION]


@pytest.fixture(scope="module")
def py_version(tf_full_version, tf_serving_version):
return "py37" if tf_full_version == tf_serving_version else tests.integ.PYTHON_VERSION
def py_version(tf_full_version):
return "py37" if tf_full_version in PY37_SUPPORTED_FRAMEWORK_VERSION else PYTHON_VERSION


def test_mnist_with_checkpoint_config(
Expand All @@ -59,7 +60,7 @@ def test_mnist_with_checkpoint_config(
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
framework_version=tf_full_version,
py_version="py37",
py_version=py_version,
metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
checkpoint_s3_uri=checkpoint_s3_uri,
checkpoint_local_path=checkpoint_local_path,
Expand Down Expand Up @@ -89,7 +90,7 @@ def test_mnist_with_checkpoint_config(
assert actual_training_checkpoint_config == expected_training_checkpoint_config


def test_server_side_encryption(sagemaker_session, tf_serving_version, py_version):
def test_server_side_encryption(sagemaker_session, tf_serving_version):
with kms_utils.bucket_with_encryption(sagemaker_session, ROLE) as (bucket_with_kms, kms_key):
output_path = os.path.join(
bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M")
Expand All @@ -103,7 +104,7 @@ def test_server_side_encryption(sagemaker_session, tf_serving_version, py_versio
train_instance_type="ml.c5.xlarge",
sagemaker_session=sagemaker_session,
framework_version=tf_serving_version,
py_version=py_version,
py_version=PYTHON_VERSION,
code_location=output_path,
output_path=output_path,
model_dir="/opt/ml/model",
Expand Down Expand Up @@ -137,8 +138,8 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, py
train_instance_count=2,
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
py_version="py37",
framework_version=tf_full_version,
py_version=py_version,
distributions=PARAMETER_SERVER_DISTRIBUTION,
)
inputs = estimator.sagemaker_session.upload_data(
Expand All @@ -154,13 +155,13 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, py
)


def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, py_version):
def test_mnist_async(sagemaker_session, cpu_instance_type):
estimator = TensorFlow(
entry_point=SCRIPT,
role=ROLE,
train_instance_count=1,
train_instance_type="ml.c5.4xlarge",
py_version=tests.integ.PYTHON_VERSION,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
# testing py-sdk functionality, no need to run against all TF versions
framework_version=LATEST_SERVING_VERSION,
Expand Down Expand Up @@ -195,18 +196,16 @@ def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, py_v
_assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)


def test_deploy_with_input_handlers(
sagemaker_session, instance_type, tf_serving_version, py_version
):
def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_serving_version):
estimator = TensorFlow(
entry_point="training.py",
source_dir=TFS_RESOURCE_PATH,
role=ROLE,
train_instance_count=1,
train_instance_type=instance_type,
py_version=py_version,
sagemaker_session=sagemaker_session,
framework_version=tf_serving_version,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
tags=TAGS,
)

Expand Down
6 changes: 3 additions & 3 deletions tests/integ/test_tf_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
FSX_DIR_PATH = "/fsx/tensorflow"
MAX_JOBS = 2
MAX_PARALLEL_JOBS = 2
PY_VERSION = "py3"
PY_VERSION = "py37"


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -139,8 +139,8 @@ def test_tuning_tf_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
train_instance_count=1,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
framework_version=TensorFlow.LATEST_VERSION,
py_version=PY_VERSION,
subnets=subnets,
security_group_ids=security_group_ids,
)
Expand Down Expand Up @@ -186,8 +186,8 @@ def test_tuning_tf_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
train_instance_count=1,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
framework_version=TensorFlow.LATEST_VERSION,
py_version=PY_VERSION,
subnets=subnets,
security_group_ids=security_group_ids,
)
Expand Down
Loading