@@ -15,7 +15,6 @@
 import logging
 from typing import Type
 from abc import ABC, abstractmethod
-from pathlib import Path
 from datetime import datetime, timedelta

 from sagemaker.model import Model
@@ -31,12 +30,12 @@
     _more_performant,
     _pretty_print_results,
 )
+from sagemaker.serve.utils.hf_utils import _get_model_config_properties_from_hf
 from sagemaker.serve.model_server.djl_serving.utils import (
-    _auto_detect_engine,
-    _set_serve_properties,
     _get_admissible_tensor_parallel_degrees,
     _get_admissible_dtypes,
     _get_default_tensor_parallel_degree,
+    _get_default_djl_configurations,
 )
 from sagemaker.serve.utils.local_hardware import (
     _get_nb_instance,
@@ -45,24 +44,18 @@
     _get_gpu_info_fallback,
 )
 from sagemaker.serve.model_server.djl_serving.prepare import (
-    prepare_for_djl_serving,
     _create_dir_structure,
 )
 from sagemaker.serve.utils.predictors import DjlLocalModePredictor
-from sagemaker.serve.utils.types import ModelServer, _DjlEngine
+from sagemaker.serve.utils.types import ModelServer
 from sagemaker.serve.mode.function_pointers import Mode
 from sagemaker.serve.utils.telemetry_logger import _capture_telemetry
-from sagemaker.djl_inference.model import (
-    DeepSpeedModel,
-    FasterTransformerModel,
-    HuggingFaceAccelerateModel,
-)
+from sagemaker.djl_inference.model import DJLModel
 from sagemaker.base_predictor import PredictorBase

 logger = logging.getLogger(__name__)

 # Match JumpStart DJL entrypoint format
-_DJL_MODEL_BUILDER_ENTRY_POINT = "inference.py"
 _CODE_FOLDER = "code"
 _INVALID_SAMPLE_DATA_EX = (
     'For djl-serving, sample input must be of {"inputs": str, "parameters": dict}, '
@@ -88,14 +81,11 @@ def __init__(self):
         self.vpc_config = None
         self._original_deploy = None
         self.secret_key = None
-        self.engine = None
         self.hf_model_config = None
         self._default_tensor_parallel_degree = None
         self._default_data_type = None
         self._default_max_tokens = None
-        self._default_max_new_tokens = None
         self.pysdk_model = None
-        self.overwrite_props_from_file = None
         self.schema_builder = None
         self.env_vars = None
         self.nb_instance_type = None
@@ -117,6 +107,7 @@ def _validate_djl_serving_sample_data(self):
         """Placeholder docstring"""
         sample_input = self.schema_builder.sample_input
         sample_output = self.schema_builder.sample_output
+        logger.info(f"sample input is {sample_input}, sample output is {sample_output}")

         if (  # pylint: disable=R0916
             not isinstance(sample_input, dict)
@@ -130,37 +121,15 @@ def _validate_djl_serving_sample_data(self):

     def _create_djl_model(self) -> Type[Model]:
         """Placeholder docstring"""
-        code_dir = str(Path(self.model_path).joinpath(_CODE_FOLDER))
-
-        kwargs = {
-            "model_id": self.model,
-            "role": self.serve_settings.role_arn,
-            "entry_point": _DJL_MODEL_BUILDER_ENTRY_POINT,
-            "dtype": self._default_data_type,
-            "sagemaker_session": self.sagemaker_session,
-            "source_dir": code_dir,
-            "env": self.env_vars,
-            "hf_hub_token": self.env_vars.get("HUGGING_FACE_HUB_TOKEN"),
-            "image_config": self.image_config,
-            "vpc_config": self.vpc_config,
-        }
-
-        if self.engine == _DjlEngine.DEEPSPEED:
-            pysdk_model = DeepSpeedModel(
-                tensor_parallel_degree=self._default_tensor_parallel_degree,
-                max_tokens=self._default_max_tokens,
-                **kwargs,
-            )
-        elif self.engine == _DjlEngine.FASTER_TRANSFORMER:
-            pysdk_model = FasterTransformerModel(
-                tensor_parallel_degree=self._default_tensor_parallel_degree,
-                **kwargs,
-            )
-        else:
-            pysdk_model = HuggingFaceAccelerateModel(
-                number_of_partitions=self._default_tensor_parallel_degree,
-                **kwargs,
-            )
+        pysdk_model = DJLModel(
+            model_id=self.model,
+            role=self.serve_settings.role_arn,
+            sagemaker_session=self.sagemaker_session,
+            env=self.env_vars,
+            huggingface_hub_token=self.env_vars.get("HF_TOKEN"),
+            image_config=self.image_config,
+            vpc_config=self.vpc_config,
+        )

         if not self.image_uri:
             self.image_uri = pysdk_model.serving_image_uri(self.sagemaker_session.boto_region_name)
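Note: with this hunk, engine selection (DeepSpeed / FasterTransformer / HuggingFace Accelerate) and serving.properties generation disappear; the builder always constructs a single DJLModel and passes configuration through environment variables. Below is a minimal usage sketch of that pattern, mirroring the keyword arguments added above; the model id, role ARN, and env values are placeholders, not part of the diff.

    # Hypothetical sketch (placeholder ids/ARNs): env-driven DJLModel construction,
    # mirroring the kwargs used in _create_djl_model after this refactor.
    from sagemaker.djl_inference.model import DJLModel

    env = {
        "HF_MODEL_ID": "my-model-id",       # set by _build_for_hf_djl in this PR
        "TENSOR_PARALLEL_DEGREE": "4",      # replaces option.tensor_parallel_degree
        "OPTION_DTYPE": "fp16",             # replaces the dtype constructor argument
    }

    model = DJLModel(
        model_id="my-model-id",
        role="arn:aws:iam::111122223333:role/placeholder-role",
        env=env,
        huggingface_hub_token=env.get("HF_TOKEN"),  # HF_TOKEN replaces HUGGING_FACE_HUB_TOKEN
    )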
@@ -196,7 +165,6 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBase]:
         else:
             raise ValueError("Mode %s is not supported!" % overwrite_mode)

-        manual_set_props = None
         if self.mode == Mode.SAGEMAKER_ENDPOINT:
             if self.nb_instance_type and "instance_type" not in kwargs:
                 kwargs.update({"instance_type": self.nb_instance_type})
@@ -212,17 +180,9 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBase]:
             default_tensor_parallel_degree = _get_default_tensor_parallel_degree(
                 self.hf_model_config, tot_gpus
             )
-            manual_set_props = {
-                "option.tensor_parallel_degree": str(default_tensor_parallel_degree) + "\n"
-            }
-
-        prepare_for_djl_serving(
-            model_path=self.model_path,
-            model=self.pysdk_model,
-            dependencies=self.dependencies,
-            overwrite_props_from_file=self.overwrite_props_from_file,
-            manual_set_props=manual_set_props,
-        )
+            self.pysdk_model.env.update(
+                {"TENSOR_PARALLEL_DEGREE": str(default_tensor_parallel_degree)}
+            )

         serializer = self.schema_builder.input_serializer
         deserializer = self.schema_builder._output_deserializer
@@ -239,7 +199,7 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBase]:
                 timeout if timeout else 1800,
                 self.secret_key,
                 predictor,
-                self.env_vars,
+                self.pysdk_model.env,
             )
             ram_usage_after = _get_ram_usage_mb()

@@ -281,25 +241,22 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBase]:

     def _build_for_hf_djl(self):
         """Placeholder docstring"""
-        self.overwrite_props_from_file = True
         self.nb_instance_type = _get_nb_instance()

         _create_dir_structure(self.model_path)
-        self.engine, self.hf_model_config = _auto_detect_engine(
-            self.model, self.env_vars.get("HUGGING_FACE_HUB_TOKEN")
-        )
-
         if not hasattr(self, "pysdk_model"):
-            (
-                self._default_tensor_parallel_degree,
-                self._default_data_type,
-                _,
-                self._default_max_tokens,
-                self._default_max_new_tokens,
-            ) = _set_serve_properties(self.hf_model_config, self.schema_builder)
+            self.env_vars.update({"HF_MODEL_ID": self.model})
+            self.hf_model_config = _get_model_config_properties_from_hf(
+                self.model, self.env_vars.get("HF_TOKEN")
+            )
+            default_djl_configurations, _default_max_new_tokens = _get_default_djl_configurations(
+                self.model, self.hf_model_config, self.schema_builder
+            )
+            self.env_vars.update(default_djl_configurations)
             self.schema_builder.sample_input["parameters"][
                 "max_new_tokens"
-            ] = self._default_max_new_tokens
+            ] = _default_max_new_tokens
+            logger.info(f"env vars are {self.env_vars}")
         self.pysdk_model = self._create_djl_model()

         if self.mode == Mode.LOCAL_CONTAINER:
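In the hunk above, _build_for_hf_djl now resolves the model purely by id: it records HF_MODEL_ID, pulls the Hugging Face model config, and merges the computed defaults into env_vars instead of writing serving.properties. A standalone sketch of that env-merge flow follows; the defaults dict is a hypothetical stand-in for what _get_default_djl_configurations returns.

    # Standalone sketch of the defaulting flow (hypothetical values; the real defaults
    # are derived from the Hugging Face model config and the schema builder).
    def build_default_env(model_id: str, default_djl_configurations: dict) -> dict:
        env_vars = {"HF_MODEL_ID": model_id}         # model resolved by id, no local code dir
        env_vars.update(default_djl_configurations)  # e.g. TENSOR_PARALLEL_DEGREE / OPTION_DTYPE
        return env_vars

    defaults = {"TENSOR_PARALLEL_DEGREE": "2", "OPTION_DTYPE": "bf16"}  # placeholder values
    print(build_default_env("my-model-id", defaults))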
@@ -316,8 +273,6 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
             )
             return self.pysdk_model

-        self.overwrite_props_from_file = False
-
         admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees(
             self.hf_model_config
         )
@@ -337,8 +292,9 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
                     "Trying tensor parallel degree: %s, dtype: %s...", tensor_parallel_degree, dtype
                 )

-                self._default_tensor_parallel_degree = tensor_parallel_degree
-                self._default_data_type = dtype
+                self.env_vars.update(
+                    {"TENSOR_PARALLEL_DEGREE": str(tensor_parallel_degree), "OPTION_DTYPE": dtype}
+                )
                 self.pysdk_model = self._create_djl_model()

                 try:
@@ -353,15 +309,15 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
                         predictor, self.schema_builder.sample_input
                     )

-                    serving_properties = self.pysdk_model.generate_serving_properties()
+                    tested_env = self.pysdk_model.env.copy()
                     logger.info(
                         "Average latency: %s, throughput/s: %s for configuration: %s",
                         avg_latency,
                         throughput_per_second,
-                        serving_properties,
+                        tested_env,
                     )
                     benchmark_results[avg_latency] = [
-                        serving_properties,
+                        tested_env,
                         p90,
                         avg_tokens_per_second,
                         throughput_per_second,
@@ -449,48 +405,43 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
         if best_tuned_combination:
             self._default_tensor_parallel_degree = best_tuned_combination[1]
             self._default_data_type = best_tuned_combination[2]
+            self.env_vars.update(
+                {
+                    "TENSOR_PARALLEL_DEGREE": str(self._default_tensor_parallel_degree),
+                    "OPTION_DTYPE": self._default_data_type,
+                }
+            )
             self.pysdk_model = self._create_djl_model()

             _pretty_print_results(benchmark_results)
             logger.info(
                 "Model Configuration: %s was most performant with avg latency: %s, "
                 "p90 latency: %s, average tokens per second: %s, throughput/s: %s, "
                 "standard deviation of request %s",
-                self.pysdk_model.generate_serving_properties(),
+                self.pysdk_model.env,
                 best_tuned_combination[0],
                 best_tuned_combination[3],
                 best_tuned_combination[4],
                 best_tuned_combination[5],
                 best_tuned_combination[6],
             )
         else:
-            (
-                self._default_tensor_parallel_degree,
-                self._default_data_type,
-                _,
-                self._default_max_tokens,
-                self._default_max_new_tokens,
-            ) = _set_serve_properties(self.hf_model_config, self.schema_builder)
+            default_djl_configurations, _default_max_new_tokens = _get_default_djl_configurations(
+                self.model, self.hf_model_config, self.schema_builder
+            )
+            self.env_vars.update(default_djl_configurations)
             self.schema_builder.sample_input["parameters"][
                 "max_new_tokens"
-            ] = self._default_max_new_tokens
+            ] = _default_max_new_tokens
             self.pysdk_model = self._create_djl_model()

             logger.debug(
                 "Failed to gather any tuning results. "
                 "Please inspect the stack trace emitted from live logging for more details. "
                 "Falling back to default serving.properties: %s",
-                self.pysdk_model.generate_serving_properties(),
+                self.pysdk_model.env,
             )

-        prepare_for_djl_serving(
-            model_path=self.model_path,
-            model=self.pysdk_model,
-            dependencies=self.dependencies,
-            overwrite_props_from_file=self.overwrite_props_from_file,
-        )
-        self.overwrite_props_from_file = True
-
         return self.pysdk_model

     def _build_for_djl(self):
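For readers following the tuning hunks above: after this change, _tune_for_hf_djl sweeps tensor-parallel-degree/dtype combinations by overriding environment variables, benchmarks each resulting model, and keeps the lowest-latency configuration (falling back to the defaults when nothing succeeds). A simplified, self-contained sketch of that sweep, with a stand-in benchmark in place of the real local-container deployment:

    # Simplified sketch of the tuning sweep; benchmark() is a stand-in for deploying
    # the model locally and measuring average latency, as the real code does.
    import random

    def benchmark(env: dict) -> float:
        """Stand-in for deploy-and-measure; returns a fake average latency."""
        return random.uniform(0.1, 1.0)

    def tune(admissible_tensor_parallel_degrees, admissible_dtypes, base_env):
        benchmark_results = {}  # keyed by average latency, like the real benchmark_results
        for tensor_parallel_degree in admissible_tensor_parallel_degrees:
            for dtype in admissible_dtypes:
                env = dict(base_env)
                env.update(
                    {"TENSOR_PARALLEL_DEGREE": str(tensor_parallel_degree), "OPTION_DTYPE": dtype}
                )
                benchmark_results[benchmark(env)] = env
        best_latency = min(benchmark_results)
        return benchmark_results[best_latency], best_latency

    best_env, latency = tune([1, 2, 4], ["fp16", "bf16"], {"HF_MODEL_ID": "my-model-id"})
    print(best_env, latency)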