@@ -52,6 +52,7 @@ class DJLServingEngineEntryPointDefaults(Enum):
52
52
DEEPSPEED = ("DeepSpeed" , "djl_python.deepspeed" )
53
53
HUGGINGFACE_ACCELERATE = ("Python" , "djl_python.huggingface" )
54
54
STABLE_DIFFUSION = ("DeepSpeed" , "djl_python.stable-diffusion" )
55
+ FASTER_TRANSFORMER = ("FasterTransformer" , "djl_python.fastertransformer" )
55
56
56
57
57
58
class DJLPredictor (Predictor ):
@@ -93,30 +94,34 @@ def __init__(
93
94
def _determine_engine_for_model(model_type: str, num_partitions: int, num_heads: int):
    """Select the DJL engine ``Model`` class best suited to a model architecture.

    Args:
        model_type (str): The HuggingFace model architecture identifier
            (e.g. ``config.model_type``).
        num_partitions (int): Requested number of model partitions (tensor
            parallel degree). May be ``None`` when not specified.
        num_heads (int): Number of attention heads in the model. May be
            ``None`` when it cannot be determined from the model config.

    Returns:
        The ``DJLModel`` subclass to use: ``DeepSpeedModel`` or
        ``FasterTransformerModel`` when the architecture is recommended for
        those engines, otherwise ``HuggingFaceAccelerateModel``.
    """
    # Tensor Parallelism is only possible if attention heads can be split evenly
    # across devices, so fall back to HuggingFace Accelerate when they cannot.
    if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
        return HuggingFaceAccelerateModel
    if model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
        return DeepSpeedModel
    if model_type in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES:
        return FasterTransformerModel
    return HuggingFaceAccelerateModel
103
106
104
107
105
108
def _validate_engine_for_model_type(cls, model_type: str, num_partitions: int, num_heads: int):
    """Validate that the chosen engine ``Model`` class can serve the given model.

    Args:
        cls: The ``DJLModel`` subclass selected for serving.
        model_type (str): The HuggingFace model architecture identifier.
        num_partitions (int): Requested number of model partitions (tensor
            parallel degree). May be ``None``.
        num_heads (int): Number of attention heads in the model. May be ``None``.

    Returns:
        ``cls`` unchanged, when the configuration is valid.

    Raises:
        ValueError: If DeepSpeed partitioning cannot split the attention heads
            evenly, or the architecture is unsupported by FasterTransformer.
    """
    if cls == DeepSpeedModel:
        # DeepSpeed tensor parallelism requires the attention heads to be
        # evenly divisible across the requested partitions.
        if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
            raise ValueError(
                "The number of attention heads is not evenly divisible by the number of "
                "partitions. Please set the number of partitions such that the number of "
                "attention heads can be evenly split across the partitions."
            )
    if cls == FasterTransformerModel:
        if model_type not in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
            raise ValueError(
                f"The model architecture {model_type} is currently not supported by "
                "FasterTransformer. Please use a different engine, or use the DJLModel "
                "to let SageMaker pick a recommended engine for this model."
            )
    return cls
121
126
122
127
@@ -223,6 +228,8 @@ def __new__(
223
228
instance .engine = DJLServingEngineEntryPointDefaults .STABLE_DIFFUSION
224
229
elif isinstance (instance , DeepSpeedModel ):
225
230
instance .engine = DJLServingEngineEntryPointDefaults .DEEPSPEED
231
+ elif isinstance (instance , FasterTransformerModel ):
232
+ instance .engine = DJLServingEngineEntryPointDefaults .FASTER_TRANSFORMER
226
233
else :
227
234
instance .engine = DJLServingEngineEntryPointDefaults .HUGGINGFACE_ACCELERATE
228
235
return instance
@@ -849,3 +856,63 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
849
856
serving_properties ["option.dtype" ] = "auto"
850
857
serving_properties .pop ("option.load_in_8bit" , None )
851
858
return serving_properties
859
+
860
+
861
class FasterTransformerModel(DJLModel):
    """A DJL FasterTransformer SageMaker ``Model`` that can be deployed to a
    SageMaker ``Endpoint``.
    """

    _framework_name = "djl-fastertransformer"

    def __init__(
        self,
        model_id: str,
        role: str,
        tensor_parallel_degree: Optional[int] = None,
        **kwargs,
    ):
        """Initialize a FasterTransformerModel.

        Args:
            model_id (str): This is either the HuggingFace Hub model_id, or the Amazon S3 location
                containing the uncompressed model artifacts (i.e. not a tar.gz file).
                The model artifacts are expected to be in HuggingFace pre-trained model
                format (i.e. model should be loadable from the huggingface transformers
                from_pretrained api, and should also include tokenizer configs if applicable).
            role (str): An AWS IAM role specified with either the name or full ARN. The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access model artifacts. After the endpoint is created,
                the inference code
                might use the IAM role, if it needs to access an AWS resource.
            tensor_parallel_degree (int): The number of gpus to shard a single instance of the
                model across via tensor_parallelism. This should be set to greater than 1 if the
                size of the model is larger than the memory available on a single GPU on the
                instance. Defaults to None. If not set, no tensor parallel sharding is done.
            **kwargs: Keyword arguments passed to the superclasses
                :class:`~sagemaker.djl_inference.DJLModel`,
                :class:`~sagemaker.model.FrameworkModel`, and
                :class:`~sagemaker.model.Model`

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.djl_inference.DJLModel`,
            :class:`~sagemaker.model.FrameworkModel`, and
            :class:`~sagemaker.model.Model`.
        """

        super(FasterTransformerModel, self).__init__(
            model_id,
            role,
            **kwargs,
        )
        if self.number_of_partitions and tensor_parallel_degree:
            logger.warning(
                "Both number_of_partitions and tensor_parallel_degree have been set for "
                "FasterTransformerModel. "
                "These mean the same thing for FasterTransformerModel. Please only set "
                "tensor_parallel_degree. "
                "number_of_partitions will be ignored"
            )
        # tensor_parallel_degree takes precedence when both are provided.
        self.number_of_partitions = tensor_parallel_degree or self.number_of_partitions
0 commit comments