[feat] Add FasterTransformer DJL support #3823

Merged 1 commit on May 5, 2023

8 changes: 8 additions & 0 deletions doc/frameworks/djl/sagemaker.djl_inference.rst
@@ -26,6 +26,14 @@ HuggingFaceAccelerateModel
    :undoc-members:
    :show-inheritance:

FasterTransformerModel
---------------------------

.. autoclass:: sagemaker.djl_inference.model.FasterTransformerModel
    :members:
    :undoc-members:
    :show-inheritance:

DJLPredictor
---------------------------

14 changes: 13 additions & 1 deletion doc/frameworks/djl/using_djl.rst
@@ -23,7 +23,7 @@ With the SageMaker Python SDK, you can use DJL Serving to host models that have
These can either be models you have trained/fine-tuned yourself, or models available publicly from the HuggingFace Hub.
DJL Serving in the SageMaker Python SDK supports hosting models for the popular HuggingFace NLP tasks, as well as Stable Diffusion.

You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.
You can either deploy your model using DeepSpeed, FasterTransformer, or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.

.. code:: python

@@ -63,11 +63,23 @@ If you want to use a specific backend, then you can create an instance of the co
    number_of_partitions=2,  # number of gpus to partition the model across
)

# Create a model using the FasterTransformer backend

fastertransformer_model = FasterTransformerModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # This can also be a HuggingFace Hub model id
    "my_sagemaker_role",
    data_type="fp16",
    task="text-generation",
    tensor_parallel_degree=2,  # number of gpus to partition the model across
)

# Deploy the model to an Amazon SageMaker Endpoint and get a Predictor
deepspeed_predictor = deepspeed_model.deploy("ml.g5.12xlarge",
                                             initial_instance_count=1)
hf_accelerate_predictor = hf_accelerate_model.deploy("ml.g5.12xlarge",
                                                     initial_instance_count=1)
fastertransformer_predictor = fastertransformer_model.deploy("ml.g5.12xlarge",
                                                             initial_instance_count=1)

Regardless of which way you choose to create your model, a ``Predictor`` object is returned. You can use this ``Predictor``
to do inference on the endpoint hosting your DJLModel.
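
For example (a minimal sketch, assuming the default JSON serialization and a text-generation model), invoking the endpoint looks like this:

.. code:: python

    # Hypothetical payload; the exact request schema depends on the task
    # the model was deployed for.
    outputs = fastertransformer_predictor.predict({
        "inputs": "Large model inference is",
        "parameters": {"max_new_tokens": 25},
    })
    print(outputs)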
1 change: 1 addition & 0 deletions src/sagemaker/djl_inference/__init__.py
@@ -17,3 +17,4 @@
from sagemaker.djl_inference.model import DJLModel # noqa: F401
from sagemaker.djl_inference.model import DeepSpeedModel # noqa: F401
from sagemaker.djl_inference.model import HuggingFaceAccelerateModel # noqa: F401
from sagemaker.djl_inference.model import FasterTransformerModel # noqa: F401
16 changes: 9 additions & 7 deletions src/sagemaker/djl_inference/defaults.py
@@ -30,17 +30,19 @@
    STABLE_DIFFUSION_MODEL_TYPE,
}

DEEPSPEED_SUPPORTED_ARCHITECTURES = {
FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES = {
    "t5",
}

FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES = {
    "bert",
    "gpt2",
    "bloom",
    "opt",
    "gpt_neox",
    "gptj",
    "gpt_neox",
    "gpt_neo",
    "gpt2",
    "xlm-roberta",
    "roberta",
    "bert",
    STABLE_DIFFUSION_MODEL_TYPE,
    "t5",
}
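
# The *_RECOMMENDED_ARCHITECTURES sets drive automatic engine selection
# (_determine_engine_for_model in model.py), while the *_SUPPORTED_ARCHITECTURES
# sets are used for validation (_validate_engine_for_model_type).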

ALLOWED_INSTANCE_FAMILIES = {
80 changes: 74 additions & 6 deletions src/sagemaker/djl_inference/model.py
@@ -52,6 +52,7 @@ class DJLServingEngineEntryPointDefaults(Enum):
    DEEPSPEED = ("DeepSpeed", "djl_python.deepspeed")
    HUGGINGFACE_ACCELERATE = ("Python", "djl_python.huggingface")
    STABLE_DIFFUSION = ("DeepSpeed", "djl_python.stable-diffusion")
    FASTER_TRANSFORMER = ("FasterTransformer", "djl_python.fastertransformer")


class DJLPredictor(Predictor):
@@ -93,30 +94,34 @@ def __init__(
def _determine_engine_for_model(model_type: str, num_partitions: int, num_heads: int):
    """Determine the recommended DJL engine class for the given model architecture."""

    # Tensor Parallelism with DeepSpeed is only possible if attention heads can be split evenly
    # Tensor Parallelism is only possible if attention heads can be split evenly
    # across devices
    if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
        return HuggingFaceAccelerateModel
    if model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
        return DeepSpeedModel
    if model_type in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES:
        return FasterTransformerModel
    return HuggingFaceAccelerateModel
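
# For example, a "t5" model with num_heads=12 and num_partitions=4 splits its
# attention heads evenly, and "t5" is in FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES,
# so FasterTransformerModel is returned; with num_partitions=5 the heads no
# longer divide evenly and HuggingFaceAccelerateModel is returned as the fallback.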


def _validate_engine_for_model_type(cls, model_type: str, num_partitions: int, num_heads: int):
    """Validate that the requested engine class supports the given model architecture."""

    if cls == DeepSpeedModel:
        if model_type not in defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES:
            raise ValueError(
                f"{model_type} is not supported by DeepSpeed. "
                f"Supported model_types are {defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES}"
            )
        if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
            raise ValueError(
                "The number of attention heads is not evenly divisible by the number of "
                "partitions. Please set the number of partitions such that the number of "
                "attention heads can be evenly split across the partitions."
            )
    if cls == FasterTransformerModel:
        if model_type not in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
            raise ValueError(
                f"The model architecture {model_type} is currently not supported by "
                f"FasterTransformer. Please use a different engine, or use the DJLModel "
                f"to let SageMaker pick a recommended engine for this model."
            )
    return cls


@@ -223,6 +228,8 @@ def __new__(
            instance.engine = DJLServingEngineEntryPointDefaults.STABLE_DIFFUSION
        elif isinstance(instance, DeepSpeedModel):
            instance.engine = DJLServingEngineEntryPointDefaults.DEEPSPEED
        elif isinstance(instance, FasterTransformerModel):
            instance.engine = DJLServingEngineEntryPointDefaults.FASTER_TRANSFORMER
        else:
            instance.engine = DJLServingEngineEntryPointDefaults.HUGGINGFACE_ACCELERATE
        return instance
@@ -849,3 +856,64 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
serving_properties["option.dtype"] = "auto"
serving_properties.pop("option.load_in_8bit", None)
return serving_properties


class FasterTransformerModel(DJLModel):
    """A DJL FasterTransformer SageMaker ``Model``

    This can be deployed to a SageMaker ``Endpoint``.
    """

    _framework_name = "djl-fastertransformer"

    def __init__(
        self,
        model_id: str,
        role: str,
        tensor_parallel_degree: Optional[int] = None,
        **kwargs,
    ):
        """Initialize a FasterTransformerModel.

        Args:
            model_id (str): This is either the HuggingFace Hub model_id, or the Amazon S3
                location containing the uncompressed model artifacts (i.e. not a tar.gz
                file). The model artifacts are expected to be in HuggingFace pre-trained
                model format (i.e. the model should be loadable from the HuggingFace
                transformers from_pretrained api, and should also include tokenizer
                configs if applicable).
            role (str): An AWS IAM role specified with either the name or full ARN. The
                Amazon SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access model artifacts. After the endpoint is
                created, the inference code might use the IAM role, if it needs to access
                an AWS resource.
            tensor_parallel_degree (int): The number of gpus to shard a single instance
                of the model across via tensor parallelism. This should be set to greater
                than 1 if the size of the model is larger than the memory available on a
                single GPU on the instance. Defaults to None. If not set, no tensor
                parallel sharding is done.
            **kwargs: Keyword arguments passed to the superclasses
                :class:`~sagemaker.djl_inference.DJLModel`,
                :class:`~sagemaker.model.FrameworkModel`, and
                :class:`~sagemaker.model.Model`

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.djl_inference.DJLModel`,
            :class:`~sagemaker.model.FrameworkModel`, and
            :class:`~sagemaker.model.Model`.
        """

        super(FasterTransformerModel, self).__init__(
            model_id,
            role,
            **kwargs,
        )
        if self.number_of_partitions and tensor_parallel_degree:
            logger.warning(
                "Both number_of_partitions and tensor_parallel_degree have been set for "
                "FasterTransformerModel. These mean the same thing for "
                "FasterTransformerModel. Please only set tensor_parallel_degree; "
                "number_of_partitions will be ignored."
            )
        self.number_of_partitions = tensor_parallel_degree or self.number_of_partitions
16 changes: 1 addition & 15 deletions tests/unit/test_djl_inference.py
@@ -147,7 +147,7 @@ def test_create_model_automatic_engine_selection(mock_s3_list, mock_read_file, s
        sagemaker_session=sagemaker_session,
        number_of_partitions=4,
    )
    assert hf_model.engine == DJLServingEngineEntryPointDefaults.HUGGINGFACE_ACCELERATE
    assert hf_model.engine == DJLServingEngineEntryPointDefaults.FASTER_TRANSFORMER

    hf_model_config = {
        "model_type": "gpt2",
@@ -200,20 +200,6 @@ def test_create_deepspeed_model(mock_s3_list, mock_read_file, sagemaker_session)
    )
    assert ds_model.engine == DJLServingEngineEntryPointDefaults.DEEPSPEED

    ds_model_config = {
        "model_type": "t5",
        "n_head": 12,
    }
    mock_read_file.return_value = json.dumps(ds_model_config)
    with pytest.raises(ValueError) as invalid_model_type:
        _ = DeepSpeedModel(
            VALID_UNCOMPRESSED_MODEL_DATA,
            ROLE,
            sagemaker_session=sagemaker_session,
            tensor_parallel_degree=1,
        )
    assert str(invalid_model_type.value).startswith("t5 is not supported by DeepSpeed")

    ds_model_config = {
        "model_type": "opt",
        "n_head": 25,