Skip to content

Commit 592f94a

Browse files
authored
feature: add support for TF 1.14 serving with Elastic Inference accelerators. (#1045)
1 parent dcccbee commit 592f94a

File tree

7 files changed

+111
-27
lines changed

7 files changed

+111
-27
lines changed

README.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ By using TensorFlow SageMaker Estimators, you can train and host TensorFlow mode
192192

193193
Supported versions of TensorFlow: ``1.4.1``, ``1.5.0``, ``1.6.0``, ``1.7.0``, ``1.8.0``, ``1.9.0``, ``1.10.0``, ``1.11.0``, ``1.12.0``, ``1.13.1``, ``1.14``.
194194

195-
Supported versions of TensorFlow for Elastic Inference: ``1.11.0``, ``1.12.0``, ``1.13.1``
195+
Supported versions of TensorFlow for Elastic Inference: ``1.11.0``, ``1.12.0``, ``1.13.1``, ``1.14``.
196196

197197
We recommend that you use the latest supported version, because that's where we focus most of our development efforts.
198198

src/sagemaker/fw_utils.py

+43-19
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,16 @@
6363
"tensorflow-scriptmode": "tensorflow-training",
6464
"mxnet": "mxnet-training",
6565
"tensorflow-serving": "tensorflow-inference",
66-
"mxnet-serving": "mxnet-inference",
66+
"tensorflow-serving-eia": "tensorflow-inference-eia",
67+
"mxnet-serving-eia": "mxnet-inference-eia",
6768
}
6869

6970
MERGED_FRAMEWORKS_LOWEST_VERSIONS = {
7071
"tensorflow-scriptmode": [1, 13, 1],
7172
"mxnet": [1, 4, 1],
7273
"tensorflow-serving": [1, 13, 0],
73-
"mxnet-serving": [1, 4, 1],
74+
"tensorflow-serving-eia": [1, 14, 0],
75+
"mxnet-serving-eia": [1, 4, 1],
7476
}
7577

7678

@@ -101,7 +103,7 @@ def _is_merged_versions(framework, framework_version):
101103
return False
102104

103105

104-
def _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
106+
def _using_merged_images(region, framework, py_version, framework_version):
105107
"""
106108
Args:
107109
region:
@@ -116,8 +118,11 @@ def _using_merged_images(region, framework, py_version, accelerator_type, framew
116118
return (
117119
(not is_gov_region)
118120
and is_merged_versions
119-
and (is_py3 or _is_tf_14_or_later(framework, framework_version))
120-
and accelerator_type is None
121+
and (
122+
is_py3
123+
or _is_tf_14_or_later(framework, framework_version)
124+
or _is_mxnet_serving_141_or_later(framework, framework_version)
125+
)
121126
)
122127

123128

@@ -135,7 +140,25 @@ def _is_tf_14_or_later(framework, framework_version):
135140
)
136141

137142

138-
def _registry_id(region, framework, py_version, account, accelerator_type, framework_version):
143+
def _is_mxnet_serving_141_or_later(framework, framework_version):
144+
"""
145+
Args:
146+
framework:
147+
framework_version:
148+
"""
149+
asimov_lowest_mxnet = [1, 4, 1]
150+
151+
version = [int(s) for s in framework_version.split(".")]
152+
153+
if len(version) == 2:
154+
version.append(0)
155+
156+
return (
157+
framework.startswith("mxnet-serving") and version >= asimov_lowest_mxnet[0 : len(version)]
158+
)
159+
160+
161+
def _registry_id(region, framework, py_version, account, framework_version):
139162
"""
140163
Args:
141164
region:
@@ -145,7 +168,7 @@ def _registry_id(region, framework, py_version, account, accelerator_type, frame
145168
accelerator_type:
146169
framework_version:
147170
"""
148-
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
171+
if _using_merged_images(region, framework, py_version, framework_version):
149172
if region in ASIMOV_OPT_IN_ACCOUNTS_BY_REGION:
150173
return ASIMOV_OPT_IN_ACCOUNTS_BY_REGION.get(region)
151174
return "763104351884"
@@ -187,13 +210,19 @@ def create_image_uri(
187210
if py_version and py_version not in VALID_PY_VERSIONS:
188211
raise ValueError("invalid py_version argument: {}".format(py_version))
189212

213+
if _accelerator_type_valid_for_framework(
214+
framework=framework,
215+
accelerator_type=accelerator_type,
216+
optimized_families=optimized_families,
217+
):
218+
framework += "-eia"
219+
190220
# Handle Account Number for Gov Cloud and frameworks with DLC merged images
191221
account = _registry_id(
192222
region=region,
193223
framework=framework,
194224
py_version=py_version,
195225
account=account,
196-
accelerator_type=accelerator_type,
197226
framework_version=framework_version,
198227
)
199228

@@ -218,19 +247,14 @@ def create_image_uri(
218247
else:
219248
device_type = "cpu"
220249

221-
if py_version:
222-
tag = "{}-{}-{}".format(framework_version, device_type, py_version)
223-
else:
224-
tag = "{}-{}".format(framework_version, device_type)
250+
using_merged_images = _using_merged_images(region, framework, py_version, framework_version)
225251

226-
if _accelerator_type_valid_for_framework(
227-
framework=framework,
228-
accelerator_type=accelerator_type,
229-
optimized_families=optimized_families,
230-
):
231-
framework += "-eia"
252+
if not py_version or (using_merged_images and framework == "tensorflow-serving-eia"):
253+
tag = "{}-{}".format(framework_version, device_type)
254+
else:
255+
tag = "{}-{}-{}".format(framework_version, device_type, py_version)
232256

233-
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
257+
if using_merged_images:
234258
return "{}/{}:{}".format(
235259
get_ecr_image_uri_prefix(account, region), MERGED_FRAMEWORKS_REPO_MAP[framework], tag
236260
)

src/sagemaker/tensorflow/serving.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ class Model(sagemaker.model.FrameworkModel):
131131
logging.ERROR: "error",
132132
logging.CRITICAL: "crit",
133133
}
134-
LATEST_EIA_VERSION = [1, 13]
134+
LATEST_EIA_VERSION = [1, 14]
135135

136136
def __init__(
137137
self,

tests/integ/test_tfs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version, cpu_insta
121121
model = Model(
122122
model_data=model_data,
123123
role="SageMakerRole",
124-
framework_version="1.13",
124+
framework_version="1.14",
125125
sagemaker_session=sagemaker_session,
126126
)
127127
predictor = model.deploy(

tests/unit/test_fw_utils.py

+62-3
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,49 @@ def test_create_image_uri_hkg():
146146
}
147147

148148

149+
def test_tf_eia_images():
150+
image_uri = fw_utils.create_image_uri(
151+
"us-west-2",
152+
"tensorflow-serving",
153+
"ml.p3.2xlarge",
154+
"1.14.0",
155+
"py3",
156+
accelerator_type="ml.eia1.medium",
157+
)
158+
assert (
159+
image_uri
160+
== "763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference-eia:1.14.0-gpu"
161+
)
162+
163+
164+
def test_mxnet_eia_images():
165+
image_uri = fw_utils.create_image_uri(
166+
"us-west-2",
167+
"mxnet-serving",
168+
"ml.p3.2xlarge",
169+
"1.4.1",
170+
"py2",
171+
accelerator_type="ml.eia1.medium",
172+
)
173+
assert (
174+
image_uri
175+
== "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-inference-eia:1.4.1-gpu-py2"
176+
)
177+
178+
image_uri = fw_utils.create_image_uri(
179+
"us-east-1",
180+
"mxnet-serving",
181+
"ml.c4.2xlarge",
182+
"1.4.1",
183+
"py3",
184+
accelerator_type="ml.eia1.large",
185+
)
186+
assert (
187+
image_uri
188+
== "763104351884.dkr.ecr.us-east-1.amazonaws.com/mxnet-inference-eia:1.4.1-cpu-py3"
189+
)
190+
191+
149192
def test_create_image_uri_merged():
150193
image_uri = fw_utils.create_image_uri(
151194
"us-west-2", "tensorflow-scriptmode", "ml.p3.2xlarge", "1.14", "py3"
@@ -175,7 +218,23 @@ def test_create_image_uri_merged():
175218
image_uri = fw_utils.create_image_uri(
176219
"us-west-2", "mxnet-serving", "ml.c4.2xlarge", "1.4.1", "py3"
177220
)
178-
assert image_uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-inference:1.4.1-cpu-py3"
221+
assert (
222+
image_uri
223+
== "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-serving:1.4.1-cpu-py3"
224+
)
225+
226+
image_uri = fw_utils.create_image_uri(
227+
"us-west-2",
228+
"mxnet-serving",
229+
"ml.c4.2xlarge",
230+
"1.4.1",
231+
"py3",
232+
accelerator_type="ml.eia1.medium",
233+
)
234+
assert (
235+
image_uri
236+
== "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-inference-eia:1.4.1-cpu-py3"
237+
)
179238

180239

181240
def test_create_image_uri_merged_py2():
@@ -198,11 +257,11 @@ def test_create_image_uri_merged_py2():
198257
assert image_uri == "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.4.1-gpu-py2"
199258

200259
image_uri = fw_utils.create_image_uri(
201-
"us-west-2", "mxnet-serving", "ml.c4.2xlarge", "1.4.1", "py2"
260+
"us-west-2", "mxnet-serving", "ml.c4.2xlarge", "1.3.1", "py2"
202261
)
203262
assert (
204263
image_uri
205-
== "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-serving:1.4.1-cpu-py2"
264+
== "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-serving:1.3.1-cpu-py2"
206265
)
207266

208267

tests/unit/test_mxnet.py

+1
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ def test_mxnet_mms_version(
337337
model = mx.create_model()
338338

339339
expected_image_base = _get_full_image_uri(mxnet_version, IMAGE_REPO_SERVING_NAME, "gpu")
340+
340341
environment = {
341342
"Environment": {
342343
"SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-mxnet-2017-11-06-14:14:15.672/model.tar.gz",

tests/unit/test_tfs.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def test_tfs_model_image_accelerator_not_supported(sagemaker_session):
113113
model = Model(
114114
"s3://some/data.tar.gz",
115115
role=ROLE,
116-
framework_version="1.14",
116+
framework_version="1.15",
117117
sagemaker_session=sagemaker_session,
118118
)
119119

@@ -128,7 +128,7 @@ def test_tfs_model_image_accelerator_not_supported(sagemaker_session):
128128
initial_instance_count=1,
129129
)
130130

131-
assert str(e.value) == "The TensorFlow version 1.14 doesn't support EIA."
131+
assert str(e.value) == "The TensorFlow version 1.15 doesn't support EIA."
132132

133133

134134
def test_tfs_model_with_log_level(sagemaker_session, tf_version):

0 commit comments

Comments
 (0)