This repository was archived by the owner on May 23, 2024. It is now read-only.

Commit e8bd2be

Fix tfs start failure due to null version number (#218)
* fix tfs start failure
* switch to docker pull from public ecr
* Update .gitignore

Co-authored-by: Sai Parthasarathy Miduthuri <[email protected]>
1 parent 3b853bd commit e8bd2be

File tree: 9 files changed, +251 −167 lines


.gitignore

Lines changed: 3 additions & 0 deletions

@@ -2,3 +2,6 @@ __pycache__
 .tox/
 log.txt
 .idea/
+node_modules/
+package.json
+package-lock.json

docker/1.15/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM public.ecr.aws/ubuntu/ubuntu:18.04
 
 LABEL maintainer="Amazon AI"
 # Specify LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
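Both CPU Dockerfiles make the same one-line change: the Ubuntu base image is now pulled from Amazon's public ECR gallery rather than Docker Hub. The commit message doesn't state the motivation, but a likely reason is avoiding Docker Hub's anonymous pull rate limits during image builds; public.ecr.aws hosts the same official Ubuntu images.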

docker/2.1/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM public.ecr.aws/ubuntu/ubuntu:18.04
 
 LABEL maintainer="Amazon AI"
 LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

docker/build_artifacts/deep_learning_container.py

Lines changed: 25 additions & 9 deletions

@@ -49,19 +49,33 @@ def _retrieve_instance_region():
     Retrieve instance region from instance metadata service
     """
     region = None
-    valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2',
-                     'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1',
-                     'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1',
-                     'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
+    valid_regions = [
+        "ap-northeast-1",
+        "ap-northeast-2",
+        "ap-southeast-1",
+        "ap-southeast-2",
+        "ap-south-1",
+        "ca-central-1",
+        "eu-central-1",
+        "eu-north-1",
+        "eu-west-1",
+        "eu-west-2",
+        "eu-west-3",
+        "sa-east-1",
+        "us-east-1",
+        "us-east-2",
+        "us-west-1",
+        "us-west-2",
+    ]
 
     url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
     response = requests_helper(url, timeout=0.1)
 
     if response is not None:
         response_json = json.loads(response.text)
 
-        if response_json['region'] in valid_regions:
-            region = response_json['region']
+        if response_json["region"] in valid_regions:
+            region = response_json["region"]
 
     return region
 
@@ -75,8 +89,10 @@ def query_bucket():
     region = _retrieve_instance_region()
 
     if instance_id is not None and region is not None:
-        url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
-               "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id))
+        url = (
+            "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
+            "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id)
+        )
         response = requests_helper(url, timeout=0.2)
 
         logging.debug("Query bucket finished: {}".format(response))
@@ -105,5 +121,5 @@ def main():
     query_bucket()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
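Most of this file's diff is Black-style reformatting; the logic is unchanged. For context, the region lookup amounts to a single query against the EC2 instance-identity document. A minimal standalone sketch (assuming the requests package; get_instance_region is an illustrative name, not the module's _retrieve_instance_region):

import requests

def get_instance_region(timeout=0.1):
    # The instance-identity document is served by the EC2 instance
    # metadata service, so this only works from inside an EC2 instance.
    url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
    try:
        response = requests.get(url, timeout=timeout)
        return response.json().get("region")
    except requests.exceptions.RequestException:
        return None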

docker/build_artifacts/dockerd-entrypoint.py

Lines changed: 1 addition & 1 deletion

@@ -19,4 +19,4 @@
 if not os.path.exists("/opt/ml/input/config"):
     subprocess.call(["python", "/usr/local/bin/deep_learning_container.py", "&>/dev/null", "&"])
 
-subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))
+subprocess.check_call(shlex.split(" ".join(sys.argv[1:])))
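This is a quote-style change only, with identical behavior: the entrypoint joins the container's CMD arguments back into one string and re-tokenizes it with shell rules. A quick illustration of what shlex.split does with such a string (made-up argument values):

import shlex

# shlex.split applies POSIX shell tokenization, so quoted arguments
# survive as single tokens:
print(shlex.split("serve --port 8080"))    # ['serve', '--port', '8080']
print(shlex.split('run "my model dir"'))   # ['run', 'my model dir']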

docker/build_artifacts/sagemaker/python_service.py

Lines changed: 67 additions & 68 deletions

@@ -58,7 +58,6 @@ def default_handler(data, context):
 
 
 class PythonServiceResource:
-
     def __init__(self):
         if SAGEMAKER_MULTI_MODEL_ENABLED:
             self._model_tfs_rest_port = {}
@@ -81,9 +80,9 @@ def __init__(self):
         if os.path.exists(INFERENCE_SCRIPT_PATH):
             # Single-Model Mode & Multi-Model Mode both use one inference.py
             self._handler, self._input_handler, self._output_handler = self._import_handlers()
-            self._handlers = self._make_handler(self._handler,
-                                                self._input_handler,
-                                                self._output_handler)
+            self._handlers = self._make_handler(
+                self._handler, self._input_handler, self._output_handler
+            )
         else:
             self._handlers = default_handler
 
@@ -105,7 +104,7 @@ def _pick_port(self, ports):
         return random.choice(ports)
 
     def _parse_sagemaker_port_range_mme(self, port_range):
-        lower, upper = port_range.split('-')
+        lower, upper = port_range.split("-")
         lower = int(lower)
         upper = lower + int((int(upper) - lower) * 0.9)  # only utilizing 90% of the ports
         rest_port = lower
@@ -129,16 +128,14 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
         # model is already loaded
         if model_name in self._model_tfs_pid:
             res.status = falcon.HTTP_409
-            res.body = json.dumps({
-                "error": "Model {} is already loaded.".format(model_name)
-            })
+            res.body = json.dumps({"error": "Model {} is already loaded.".format(model_name)})
 
         # check if there are available ports
         if not self._ports_available():
             res.status = falcon.HTTP_507
-            res.body = json.dumps({
-                "error": "Memory exhausted: no available ports to load the model."
-            })
+            res.body = json.dumps(
+                {"error": "Memory exhausted: no available ports to load the model."}
+            )
         with lock():
             self._model_tfs_rest_port[model_name] = self._tfs_ports["rest_port"].pop()
             self._model_tfs_grpc_port[model_name] = self._tfs_ports["grpc_port"].pop()
@@ -154,7 +151,8 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
                     f.write(tfs_config)
 
                 batching_config_file = "/sagemaker/batching/{}/batching-config.cfg".format(
-                    model_name)
+                    model_name
+                )
                 if self._tfs_enable_batching:
                     tfs_utils.create_batching_config(batching_config_file)
 
@@ -167,22 +165,26 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
                 )
                 p = subprocess.Popen(cmd.split())
 
-                tfs_utils.wait_for_model(self._model_tfs_rest_port[model_name], model_name,
-                                         self._tfs_wait_time_seconds)
+                tfs_utils.wait_for_model(
+                    self._model_tfs_rest_port[model_name], model_name, self._tfs_wait_time_seconds
+                )
 
                 log.info("started tensorflow serving (pid: %d)", p.pid)
                 # update model name <-> tfs pid map
                 self._model_tfs_pid[model_name] = p
 
                 res.status = falcon.HTTP_200
-                res.body = json.dumps({
-                    "success":
-                        "Successfully loaded model {}, "
+                res.body = json.dumps(
+                    {
+                        "success": "Successfully loaded model {}, "
                         "listening on rest port {} "
-                        "and grpc port {}.".format(model_name,
-                                                   self._model_tfs_rest_port,
-                                                   self._model_tfs_grpc_port,)
-                })
+                        "and grpc port {}.".format(
+                            model_name,
+                            self._model_tfs_rest_port,
+                            self._model_tfs_grpc_port,
+                        )
+                    }
+                )
             except MultiModelException as multi_model_exception:
                 self._cleanup_config_file(tfs_config_file)
                 self._cleanup_config_file(batching_config_file)
@@ -196,25 +198,28 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
                 raise MultiModelException(falcon.HTTP_500, multi_model_exception.msg)
             except FileExistsError as e:
                 res.status = falcon.HTTP_409
-                res.body = json.dumps({
-                    "error": "Model {} is already loaded. {}".format(model_name, str(e))
-                })
+                res.body = json.dumps(
+                    {"error": "Model {} is already loaded. {}".format(model_name, str(e))}
+                )
             except OSError as os_error:
                 self._cleanup_config_file(tfs_config_file)
                 self._cleanup_config_file(batching_config_file)
                 if os_error.errno == 12:
-                    raise MultiModelException(falcon.HTTP_507,
-                                              "Memory exhausted: "
-                                              "not enough memory to start TFS instance")
+                    raise MultiModelException(
+                        falcon.HTTP_507,
+                        "Memory exhausted: " "not enough memory to start TFS instance",
+                    )
                 else:
                     raise MultiModelException(falcon.HTTP_500, os_error.strerror)
         else:
            res.status = falcon.HTTP_404
-            res.body = json.dumps({
-                "error":
-                    "Could not find valid base path {} for servable {}".format(base_path,
-                                                                               model_name)
-            })
+            res.body = json.dumps(
+                {
+                    "error": "Could not find valid base path {} for servable {}".format(
+                        base_path, model_name
+                    )
+                }
+            )
 
     def _cleanup_config_file(self, config_file):
         if os.path.exists(config_file):
@@ -225,31 +230,37 @@ def _handle_invocation_post(self, req, res, model_name=None):
             if model_name:
                 if model_name not in self._model_tfs_rest_port:
                     res.status = falcon.HTTP_404
-                    res.body = json.dumps({
-                        "error": "Model {} is not loaded yet.".format(model_name)
-                    })
+                    res.body = json.dumps(
+                        {"error": "Model {} is not loaded yet.".format(model_name)}
+                    )
                     return
                 else:
                     log.info("model name: {}".format(model_name))
                     rest_port = self._model_tfs_rest_port[model_name]
                     log.info("rest port: {}".format(str(self._model_tfs_rest_port[model_name])))
                     grpc_port = self._model_tfs_grpc_port[model_name]
                     log.info("grpc port: {}".format(str(self._model_tfs_grpc_port[model_name])))
-                    data, context = tfs_utils.parse_request(req, rest_port, grpc_port,
-                                                            self._tfs_default_model_name,
-                                                            model_name=model_name)
+                    data, context = tfs_utils.parse_request(
+                        req,
+                        rest_port,
+                        grpc_port,
+                        self._tfs_default_model_name,
+                        model_name=model_name,
+                    )
             else:
                 res.status = falcon.HTTP_400
-                res.body = json.dumps({
-                    "error": "Invocation request does not contain model name."
-                })
+                res.body = json.dumps({"error": "Invocation request does not contain model name."})
         else:
             # Randomly pick port used for routing incoming request.
             grpc_port = self._pick_port(self._tfs_grpc_ports)
            rest_port = self._pick_port(self._tfs_rest_ports)
-            data, context = tfs_utils.parse_request(req, rest_port, grpc_port,
-                                                    self._tfs_default_model_name,
-                                                    channel=self._channels[grpc_port])
+            data, context = tfs_utils.parse_request(
+                req,
+                rest_port,
+                grpc_port,
+                self._tfs_default_model_name,
+                channel=self._channels[grpc_port],
+            )
 
         try:
             res.status = falcon.HTTP_200
@@ -258,9 +269,7 @@ def _handle_invocation_post(self, req, res, model_name=None):
         except Exception as e:  # pylint: disable=broad-except
             log.exception("exception handling request: {}".format(e))
             res.status = falcon.HTTP_500
-            res.body = json.dumps({
-                "error": str(e)
-            }).encode("utf-8")  # pylint: disable=E1101
+            res.body = json.dumps({"error": str(e)}).encode("utf-8")  # pylint: disable=E1101
 
     def _setup_channel(self, grpc_port):
         if grpc_port not in self._channels:
@@ -306,39 +315,31 @@ def on_get(self, req, res, model_name=None):  # pylint: disable=W0613
             except ValueError as e:
                 log.exception("exception handling request: {}".format(e))
                 res.status = falcon.HTTP_500
-                res.body = json.dumps({
-                    "error": str(e)
-                }).encode("utf-8")
+                res.body = json.dumps({"error": str(e)}).encode("utf-8")
             res.status = falcon.HTTP_200
             res.body = json.dumps(models_info)
         else:
             if model_name not in self._model_tfs_rest_port:
                 res.status = falcon.HTTP_404
-                res.body = json.dumps({
-                    "error": "Model {} is loaded yet.".format(model_name)
-                }).encode("utf-8")
+                res.body = json.dumps(
+                    {"error": "Model {} is loaded yet.".format(model_name)}
+                ).encode("utf-8")
             else:
                 port = self._model_tfs_rest_port[model_name]
                 uri = "http://localhost:{}/v1/models/{}".format(port, model_name)
                 try:
                     info = requests.get(uri)
                     res.status = falcon.HTTP_200
-                    res.body = json.dumps({
-                        "model": info
-                    }).encode("utf-8")
+                    res.body = json.dumps({"model": info}).encode("utf-8")
                 except ValueError as e:
                     log.exception("exception handling GET models request.")
                     res.status = falcon.HTTP_500
-                    res.body = json.dumps({
-                        "error": str(e)
-                    }).encode("utf-8")
+                    res.body = json.dumps({"error": str(e)}).encode("utf-8")
 
     def on_delete(self, req, res, model_name):  # pylint: disable=W0613
         if model_name not in self._model_tfs_pid:
             res.status = falcon.HTTP_404
-            res.body = json.dumps({
-                "error": "Model {} is not loaded yet".format(model_name)
-            })
+            res.body = json.dumps({"error": "Model {} is not loaded yet".format(model_name)})
         else:
             try:
                 self._model_tfs_pid[model_name].kill()
@@ -353,14 +354,12 @@ def on_delete(self, req, res, model_name):  # pylint: disable=W0613
                 del self._model_tfs_grpc_port[model_name]
                 del self._model_tfs_pid[model_name]
                 res.status = falcon.HTTP_200
-                res.body = json.dumps({
-                    "success": "Successfully unloaded model {}.".format(model_name)
-                })
+                res.body = json.dumps(
+                    {"success": "Successfully unloaded model {}.".format(model_name)}
+                )
             except OSError as error:
                 res.status = falcon.HTTP_500
-                res.body = json.dumps({
-                    "error": str(error)
-                }).encode("utf-8")
+                res.body = json.dumps({"error": str(error)}).encode("utf-8")
 
     def validate_model_dir(self, model_path):
         # model base path doesn't exits
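Together these handlers back the container's multi-model endpoints: POST /models loads a model, GET /models (or /models/{name}) describes loaded models, and DELETE /models/{name} unloads one. A usage sketch, assuming the requests package, the service reachable on localhost:8080, and made-up model name/path values:

import requests

BASE = "http://localhost:8080"

# Hypothetical model name and container-local path, for illustration only.
payload = {"model_name": "demo_model", "url": "/opt/ml/models/demo_model"}

resp = requests.post("{}/models".format(BASE), json=payload)
print(resp.status_code, resp.text)  # 200 on success, 409 if already loaded

# List loaded models, then unload.
print(requests.get("{}/models".format(BASE)).text)
requests.delete("{}/models/demo_model".format(BASE))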
