Skip to content

Commit 8802bff

Browse files
committed
Merge remote-tracking branch 'origin' into feat/jumpstart-model-estimator-classes
2 parents 9c8fc36 + 4844aa1 commit 8802bff

File tree

15 files changed

+219
-37
lines changed

15 files changed

+219
-37
lines changed

doc/frameworks/djl/sagemaker.djl_inference.rst

+8
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ HuggingFaceAccelerateModel
2626
:undoc-members:
2727
:show-inheritance:
2828

29+
FasterTransformerModel
30+
---------------------------
31+
32+
.. autoclass:: sagemaker.djl_inference.model.FasterTransformerModel
33+
:members:
34+
:undoc-members:
35+
:show-inheritance:
36+
2937
DJLPredictor
3038
---------------------------
3139

doc/frameworks/djl/using_djl.rst

+13-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ With the SageMaker Python SDK, you can use DJL Serving to host models that have
2323
These can either be models you have trained/fine-tuned yourself, or models available publicly from the HuggingFace Hub.
2424
DJL Serving in the SageMaker Python SDK supports hosting models for the popular HuggingFace NLP tasks, as well as Stable Diffusion.
2525

26-
You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.
26+
You can either deploy your model using DeepSpeed, FasterTransformer, or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.
2727

2828
.. code:: python
2929
@@ -63,11 +63,23 @@ If you want to use a specific backend, then you can create an instance of the co
6363
number_of_partitions=2, # number of gpus to partition the model across
6464
)
6565
66+
# Create a model using the FasterTransformer backend
67+
68+
fastertransformer_model = FasterTransformerModel(
69+
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
70+
"my_sagemaker_role",
71+
data_type="fp16",
72+
task="text-generation",
73+
tensor_parallel_degree=2, # number of gpus to partition the model across
74+
)
75+
6676
# Deploy the model to an Amazon SageMaker Endpoint and get a Predictor
6777
deepspeed_predictor = deepspeed_model.deploy("ml.g5.12xlarge",
6878
initial_instance_count=1)
6979
hf_accelerate_predictor = hf_accelerate_model.deploy("ml.g5.12xlarge",
7080
initial_instance_count=1)
81+
fastertransformer_predictor = fastertransformer_model.deploy("ml.g5.12xlarge",
82+
initial_instance_count=1)
7183
7284
Regardless of which way you choose to create your model, a ``Predictor`` object is returned. You can use this ``Predictor``
7385
to do inference on the endpoint hosting your DJLModel.

src/sagemaker/automl/automl.py

+19
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(
4949
channel_type=None,
5050
content_type=None,
5151
s3_data_type=None,
52+
sample_weight_attribute_name=None,
5253
):
5354
"""Convert an S3 Uri or a list of S3 Uri to an AutoMLInput object.
5455
@@ -67,13 +68,16 @@ def __init__(
6768
The content type of the data from the input source.
6869
s3_data_type (str, PipelineVariable): The data type for S3 data source.
6970
Valid values: ManifestFile or S3Prefix.
71+
sample_weight_attribute_name (str, PipelineVariable):
72+
The name of the dataset column representing sample weights.
7073
"""
7174
self.inputs = inputs
7275
self.target_attribute_name = target_attribute_name
7376
self.compression = compression
7477
self.channel_type = channel_type
7578
self.content_type = content_type
7679
self.s3_data_type = s3_data_type
80+
self.sample_weight_attribute_name = sample_weight_attribute_name
7781

7882
def to_request_dict(self):
7983
"""Generates a request dictionary using the parameters provided to the class."""
@@ -96,6 +100,8 @@ def to_request_dict(self):
96100
input_entry["ContentType"] = self.content_type
97101
if self.s3_data_type is not None:
98102
input_entry["DataSource"]["S3DataSource"]["S3DataType"] = self.s3_data_type
103+
if self.sample_weight_attribute_name is not None:
104+
input_entry["SampleWeightAttributeName"] = self.sample_weight_attribute_name
99105
auto_ml_input.append(input_entry)
100106
return auto_ml_input
101107

@@ -129,6 +135,7 @@ def __init__(
129135
mode: Optional[str] = None,
130136
auto_generate_endpoint_name: Optional[bool] = None,
131137
endpoint_name: Optional[str] = None,
138+
sample_weight_attribute_name: str = None,
132139
):
133140
"""Initialize an AutoML object.
134141
@@ -179,6 +186,8 @@ def __init__(
179186
model deployment if the endpoint name is not generated automatically.
180187
Specify the endpoint_name if and only if
181188
auto_generate_endpoint_name is set to False
189+
sample_weight_attribute_name (str): The name of the dataset column representing
190+
sample weights.
182191
183192
Returns:
184193
AutoML object.
@@ -234,6 +243,7 @@ def __init__(
234243
)
235244

236245
self._check_problem_type_and_job_objective(self.problem_type, self.job_objective)
246+
self.sample_weight_attribute_name = sample_weight_attribute_name
237247

238248
@runnable_by_pipeline
239249
def fit(self, inputs=None, wait=True, logs=True, job_name=None):
@@ -342,6 +352,9 @@ def attach(cls, auto_ml_job_name, sagemaker_session=None):
342352
"AutoGenerateEndpointName", False
343353
),
344354
endpoint_name=auto_ml_job_desc.get("ModelDeployConfig", {}).get("EndpointName"),
355+
sample_weight_attribute_name=auto_ml_job_desc["InputDataConfig"][0].get(
356+
"SampleWeightAttributeName", None
357+
),
345358
)
346359
amlj.current_job_name = auto_ml_job_name
347360
amlj.latest_auto_ml_job = auto_ml_job_name # pylint: disable=W0201
@@ -867,6 +880,7 @@ def _load_config(cls, inputs, auto_ml, expand_role=True, validate_uri=True):
867880
auto_ml.target_attribute_name,
868881
auto_ml.content_type,
869882
auto_ml.s3_data_type,
883+
auto_ml.sample_weight_attribute_name,
870884
)
871885
output_config = _Job._prepare_output_config(auto_ml.output_path, auto_ml.output_kms_key)
872886

@@ -932,6 +946,7 @@ def _format_inputs_to_input_config(
932946
target_attribute_name=None,
933947
content_type=None,
934948
s3_data_type=None,
949+
sample_weight_attribute_name=None,
935950
):
936951
"""Convert inputs to AutoML InputDataConfig.
937952
@@ -961,6 +976,8 @@ def _format_inputs_to_input_config(
961976
channel["ContentType"] = content_type
962977
if s3_data_type is not None:
963978
channel["DataSource"]["S3DataSource"]["S3DataType"] = s3_data_type
979+
if sample_weight_attribute_name is not None:
980+
channel["SampleWeightAttributeName"] = sample_weight_attribute_name
964981
channels.append(channel)
965982
elif isinstance(inputs, list):
966983
for input_entry in inputs:
@@ -974,6 +991,8 @@ def _format_inputs_to_input_config(
974991
channel["ContentType"] = content_type
975992
if s3_data_type is not None:
976993
channel["DataSource"]["S3DataSource"]["S3DataType"] = s3_data_type
994+
if sample_weight_attribute_name is not None:
995+
channel["SampleWeightAttributeName"] = sample_weight_attribute_name
977996
channels.append(channel)
978997
else:
979998
msg = (

src/sagemaker/djl_inference/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@
1717
from sagemaker.djl_inference.model import DJLModel # noqa: F401
1818
from sagemaker.djl_inference.model import DeepSpeedModel # noqa: F401
1919
from sagemaker.djl_inference.model import HuggingFaceAccelerateModel # noqa: F401
20+
from sagemaker.djl_inference.model import FasterTransformerModel # noqa: F401

src/sagemaker/djl_inference/defaults.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,19 @@
3030
STABLE_DIFFUSION_MODEL_TYPE,
3131
}
3232

33-
DEEPSPEED_SUPPORTED_ARCHITECTURES = {
33+
FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES = {
34+
"t5",
35+
}
36+
37+
FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES = {
38+
"bert",
39+
"gpt2",
3440
"bloom",
3541
"opt",
36-
"gpt_neox",
3742
"gptj",
43+
"gpt_neox",
3844
"gpt_neo",
39-
"gpt2",
40-
"xlm-roberta",
41-
"roberta",
42-
"bert",
43-
STABLE_DIFFUSION_MODEL_TYPE,
45+
"t5",
4446
}
4547

4648
ALLOWED_INSTANCE_FAMILIES = {

src/sagemaker/djl_inference/model.py

+75-7
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class DJLServingEngineEntryPointDefaults(Enum):
5252
DEEPSPEED = ("DeepSpeed", "djl_python.deepspeed")
5353
HUGGINGFACE_ACCELERATE = ("Python", "djl_python.huggingface")
5454
STABLE_DIFFUSION = ("DeepSpeed", "djl_python.stable-diffusion")
55+
FASTER_TRANSFORMER = ("FasterTransformer", "djl_python.fastertransformer")
5556

5657

5758
class DJLPredictor(Predictor):
@@ -93,30 +94,34 @@ def __init__(
9394
def _determine_engine_for_model(model_type: str, num_partitions: int, num_heads: int):
9495
"""Return the recommended DJL model class for a model type, partition count, and head count."""
9596

96-
# Tensor Parallelism with DeepSpeed is only possible if attention heads can be split evenly
97+
# Tensor Parallelism is only possible if attention heads can be split evenly
9798
# across devices
9899
if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
99100
return HuggingFaceAccelerateModel
100101
if model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
101102
return DeepSpeedModel
103+
if model_type in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES:
104+
return FasterTransformerModel
102105
return HuggingFaceAccelerateModel
103106

104107

105108
def _validate_engine_for_model_type(cls, model_type: str, num_partitions: int, num_heads: int):
106109
"""Raise ValueError if engine class ``cls`` cannot serve the model; otherwise return ``cls``."""
107110

108111
if cls == DeepSpeedModel:
109-
if model_type not in defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES:
110-
raise ValueError(
111-
f"{model_type} is not supported by DeepSpeed. "
112-
f"Supported model_types are {defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES}"
113-
)
114112
if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
115113
raise ValueError(
116114
"The number of attention heads is not evenly divisible by the number of partitions."
117115
"Please set the number of partitions such that the number of attention heads can be"
118116
"evenly split across the partitions."
119117
)
118+
if cls == FasterTransformerModel:
119+
if model_type not in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
120+
raise ValueError(
121+
f"The model architecture {model_type} is currently not supported by "
122+
f"FasterTransformer. Please use a different engine, or use the DJLModel"
123+
f"to let SageMaker pick a recommended engine for this model."
124+
)
120125
return cls
121126

122127

@@ -223,6 +228,8 @@ def __new__(
223228
instance.engine = DJLServingEngineEntryPointDefaults.STABLE_DIFFUSION
224229
elif isinstance(instance, DeepSpeedModel):
225230
instance.engine = DJLServingEngineEntryPointDefaults.DEEPSPEED
231+
elif isinstance(instance, FasterTransformerModel):
232+
instance.engine = DJLServingEngineEntryPointDefaults.FASTER_TRANSFORMER
226233
else:
227234
instance.engine = DJLServingEngineEntryPointDefaults.HUGGINGFACE_ACCELERATE
228235
return instance
@@ -606,7 +613,7 @@ def serving_image_uri(self, region_name):
606613
str: The appropriate image URI based on the given parameters.
607614
"""
608615
if not self.djl_version:
609-
self.djl_version = "0.21.0"
616+
self.djl_version = "0.22.1"
610617

611618
return image_uris.retrieve(
612619
self._framework(),
@@ -856,3 +863,64 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
856863
serving_properties["option.dtype"] = "auto"
857864
serving_properties.pop("option.load_in_8bit", None)
858865
return serving_properties
866+
867+
868+
class FasterTransformerModel(DJLModel):
869+
"""A DJL FasterTransformer SageMaker ``Model``
870+
871+
This can be deployed to a SageMaker ``Endpoint``.
872+
"""
873+
874+
_framework_name = "djl-fastertransformer"
875+
876+
def __init__(
877+
self,
878+
model_id: str,
879+
role: str,
880+
tensor_parallel_degree: Optional[int] = None,
881+
**kwargs,
882+
):
883+
"""Initialize a FasterTransformerModel.
884+
885+
Args:
886+
model_id (str): This is either the HuggingFace Hub model_id, or the Amazon S3 location
887+
containing the uncompressed model artifacts (i.e. not a tar.gz file).
888+
The model artifacts are expected to be in HuggingFace pre-trained model
889+
format (i.e. model should be loadable from the huggingface transformers
890+
from_pretrained api, and should also include tokenizer configs if applicable).
891+
role (str): An AWS IAM role specified with either the name or full ARN. The Amazon
892+
SageMaker training jobs and APIs that create Amazon SageMaker
893+
endpoints use this role to access model artifacts. After the endpoint is created,
894+
the inference code
895+
might use the IAM role, if it needs to access an AWS resource.
896+
tensor_parallel_degree (int): The number of gpus to shard a single instance of the
897+
model across via tensor_parallelism. This should be set to greater than 1 if the
898+
size of the model is larger than the memory available on a single GPU on the
899+
instance. Defaults to None. If not set, no tensor parallel sharding is done.
900+
**kwargs: Keyword arguments passed to the superclasses
901+
:class:`~sagemaker.djl_inference.DJLModel`,
902+
:class:`~sagemaker.model.FrameworkModel`, and
903+
:class:`~sagemaker.model.Model`
904+
905+
.. tip::
906+
907+
You can find additional parameters for initializing this class at
908+
:class:`~sagemaker.djl_inference.DJLModel`,
909+
:class:`~sagemaker.model.FrameworkModel`, and
910+
:class:`~sagemaker.model.Model`.
911+
"""
912+
913+
super(FasterTransformerModel, self).__init__(
914+
model_id,
915+
role,
916+
**kwargs,
917+
)
918+
if self.number_of_partitions and tensor_parallel_degree:
919+
logger.warning(
920+
"Both number_of_partitions and tensor_parallel_degree have been set for "
921+
"FasterTransformerModel."
922+
"These mean the same thing for FasterTransformerModel. Please only set "
923+
"tensor_parallel_degree."
924+
"number_of_partitions will be ignored"
925+
)
926+
self.number_of_partitions = tensor_parallel_degree or self.number_of_partitions

src/sagemaker/image_uri_config/djl-deepspeed.json

+30
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
{
22
"scope": ["inference"],
33
"versions": {
4+
"0.22.1": {
5+
"registries": {
6+
"af-south-1": "626614931356",
7+
"ap-east-1": "871362719292",
8+
"ap-northeast-1": "763104351884",
9+
"ap-northeast-2": "763104351884",
10+
"ap-northeast-3": "364406365360",
11+
"ap-south-1": "763104351884",
12+
"ap-southeast-1": "763104351884",
13+
"ap-southeast-2": "763104351884",
14+
"ap-southeast-3": "907027046896",
15+
"ca-central-1": "763104351884",
16+
"cn-north-1": "727897471807",
17+
"cn-northwest-1": "727897471807",
18+
"eu-central-1": "763104351884",
19+
"eu-north-1": "763104351884",
20+
"eu-west-1": "763104351884",
21+
"eu-west-2": "763104351884",
22+
"eu-west-3": "763104351884",
23+
"eu-south-1": "692866216735",
24+
"me-south-1": "217643126080",
25+
"sa-east-1": "763104351884",
26+
"us-east-1": "763104351884",
27+
"us-east-2": "763104351884",
28+
"us-west-1": "763104351884",
29+
"us-west-2": "763104351884"
30+
},
31+
"repository": "djl-inference",
32+
"tag_prefix": "0.22.1-deepspeed0.8.3-cu118"
33+
},
434
"0.21.0": {
535
"registries": {
636
"af-south-1": "626614931356",

src/sagemaker/image_uri_config/djl-fastertransformer.json

+30
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
{
22
"scope": ["inference"],
33
"versions": {
4+
"0.22.1": {
5+
"registries": {
6+
"af-south-1": "626614931356",
7+
"ap-east-1": "871362719292",
8+
"ap-northeast-1": "763104351884",
9+
"ap-northeast-2": "763104351884",
10+
"ap-northeast-3": "364406365360",
11+
"ap-south-1": "763104351884",
12+
"ap-southeast-1": "763104351884",
13+
"ap-southeast-2": "763104351884",
14+
"ap-southeast-3": "907027046896",
15+
"ca-central-1": "763104351884",
16+
"cn-north-1": "727897471807",
17+
"cn-northwest-1": "727897471807",
18+
"eu-central-1": "763104351884",
19+
"eu-north-1": "763104351884",
20+
"eu-west-1": "763104351884",
21+
"eu-west-2": "763104351884",
22+
"eu-west-3": "763104351884",
23+
"eu-south-1": "692866216735",
24+
"me-south-1": "217643126080",
25+
"sa-east-1": "763104351884",
26+
"us-east-1": "763104351884",
27+
"us-east-2": "763104351884",
28+
"us-west-1": "763104351884",
29+
"us-west-2": "763104351884"
30+
},
31+
"repository": "djl-inference",
32+
"tag_prefix": "0.22.1-fastertransformer5.3.0-cu118"
33+
},
434
"0.21.0": {
535
"registries": {
636
"af-south-1": "626614931356",

0 commit comments

Comments
 (0)