aws
diff --git a/‎doc/frameworks/djl/sagemaker.djl_inference.rst
+8 b/‎doc/frameworks/djl/sagemaker.djl_inference.rst
+8
diff --git a/‎doc/frameworks/djl/using_djl.rst
+13-1 b/‎doc/frameworks/djl/using_djl.rst
+13-1
diff --git a/‎src/sagemaker/automl/automl.py
+19 b/‎src/sagemaker/automl/automl.py
+19
diff --git a/‎src/sagemaker/djl_inference/__init__.py
+1 b/‎src/sagemaker/djl_inference/__init__.py
+1
diff --git a/‎src/sagemaker/djl_inference/defaults.py
+9-7 b/‎src/sagemaker/djl_inference/defaults.py
+9-7
diff --git a/‎src/sagemaker/djl_inference/model.py
+75-7 b/‎src/sagemaker/djl_inference/model.py
+75-7
diff --git a/‎src/sagemaker/image_uri_config/djl-deepspeed.json
+30 b/‎src/sagemaker/image_uri_config/djl-deepspeed.json
+30
diff --git a/‎src/sagemaker/image_uri_config/djl-fastertransformer.json
+30 b/‎src/sagemaker/image_uri_config/djl-fastertransformer.json
+30
@@ -26,6 +26,14 @@ HuggingFaceAccelerateModel
     :undoc-members:
     :show-inheritance:
 
+FasterTransformerModel
+---------------------------
+
+.. autoclass:: sagemaker.djl_inference.model.FasterTransformerModel
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 DJLPredictor
 ---------------------------
 
 
@@ -23,7 +23,7 @@ With the SageMaker Python SDK, you can use DJL Serving to host models that have
 These can either be models you have trained/fine-tuned yourself, or models available publicly from the HuggingFace Hub.
 DJL Serving in the SageMaker Python SDK supports hosting models for the popular HuggingFace NLP tasks, as well as Stable Diffusion.
 
-You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.
+You can either deploy your model using DeepSpeed, FasterTransformer, or HuggingFace Accelerate, or let DJL Serving determine the best backend based on your model architecture and configuration.
 
 .. code:: python
 
@@ -63,11 +63,23 @@ If you want to use a specific backend, then you can create an instance of the co
         number_of_partitions=2, # number of gpus to partition the model across
     )
 
+    # Create a model using the FasterTransformer backend
+
+    fastertransformer_model = FasterTransformerModel(
+        "s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
+        "my_sagemaker_role",
+        data_type="fp16",
+        task="text-generation",
+        tensor_parallel_degree=2, # number of gpus to partition the model across
+    )
+
     # Deploy the model to an Amazon SageMaker Endpoint and get a Predictor
     deepspeed_predictor = deepspeed_model.deploy("ml.g5.12xlarge",
                                                  initial_instance_count=1)
     hf_accelerate_predictor = hf_accelerate_model.deploy("ml.g5.12xlarge",
                                                          initial_instance_count=1)
+    fastertransformer_predictor = fastertransformer_model.deploy("ml.g5.12xlarge",
+                                                                 initial_instance_count=1)
 
 Regardless of which way you choose to create your model, a ``Predictor`` object is returned. You can use this ``Predictor``
 to do inference on the endpoint hosting your DJLModel.
 
@@ -49,6 +49,7 @@ def __init__(
         channel_type=None,
         content_type=None,
         s3_data_type=None,
+        sample_weight_attribute_name=None,
     ):
         """Convert an S3 Uri or a list of S3 Uri to an AutoMLInput object.
 
@@ -67,13 +68,16 @@ def __init__(
                 The content type of the data from the input source.
             s3_data_type (str, PipelineVariable): The data type for S3 data source.
                 Valid values: ManifestFile or S3Prefix.
+            sample_weight_attribute_name (str, PipelineVariable):
+                The name of the dataset column representing sample weights.
         """
         self.inputs = inputs
         self.target_attribute_name = target_attribute_name
         self.compression = compression
         self.channel_type = channel_type
         self.content_type = content_type
         self.s3_data_type = s3_data_type
+        self.sample_weight_attribute_name = sample_weight_attribute_name
 
     def to_request_dict(self):
         """Generates a request dictionary using the parameters provided to the class."""
@@ -96,6 +100,8 @@ def to_request_dict(self):
                 input_entry["ContentType"] = self.content_type
             if self.s3_data_type is not None:
                 input_entry["DataSource"]["S3DataSource"]["S3DataType"] = self.s3_data_type
+            if self.sample_weight_attribute_name is not None:
+                input_entry["SampleWeightAttributeName"] = self.sample_weight_attribute_name
             auto_ml_input.append(input_entry)
         return auto_ml_input
 
@@ -129,6 +135,7 @@ def __init__(
         mode: Optional[str] = None,
         auto_generate_endpoint_name: Optional[bool] = None,
         endpoint_name: Optional[str] = None,
+        sample_weight_attribute_name: str = None,
     ):
         """Initialize the an AutoML object.
 
@@ -179,6 +186,8 @@ def __init__(
                 model deployment if the endpoint name is not generated automatically.
                 Specify the endpoint_name if and only if
                 auto_generate_endpoint_name is set to False
+            sample_weight_attribute_name (str): The name of the dataset column representing
+                sample weights.
 
         Returns:
             AutoML object.
@@ -234,6 +243,7 @@ def __init__(
         )
 
         self._check_problem_type_and_job_objective(self.problem_type, self.job_objective)
+        self.sample_weight_attribute_name = sample_weight_attribute_name
 
     @runnable_by_pipeline
     def fit(self, inputs=None, wait=True, logs=True, job_name=None):
@@ -342,6 +352,9 @@ def attach(cls, auto_ml_job_name, sagemaker_session=None):
                 "AutoGenerateEndpointName", False
             ),
             endpoint_name=auto_ml_job_desc.get("ModelDeployConfig", {}).get("EndpointName"),
+            sample_weight_attribute_name=auto_ml_job_desc["InputDataConfig"][0].get(
+                "SampleWeightAttributeName", None
+            ),
         )
         amlj.current_job_name = auto_ml_job_name
         amlj.latest_auto_ml_job = auto_ml_job_name  # pylint: disable=W0201
@@ -867,6 +880,7 @@ def _load_config(cls, inputs, auto_ml, expand_role=True, validate_uri=True):
                 auto_ml.target_attribute_name,
                 auto_ml.content_type,
                 auto_ml.s3_data_type,
+                auto_ml.sample_weight_attribute_name,
             )
         output_config = _Job._prepare_output_config(auto_ml.output_path, auto_ml.output_kms_key)
 
@@ -932,6 +946,7 @@ def _format_inputs_to_input_config(
         target_attribute_name=None,
         content_type=None,
         s3_data_type=None,
+        sample_weight_attribute_name=None,
     ):
         """Convert inputs to AutoML InputDataConfig.
 
@@ -961,6 +976,8 @@ def _format_inputs_to_input_config(
                 channel["ContentType"] = content_type
             if s3_data_type is not None:
                 channel["DataSource"]["S3DataSource"]["S3DataType"] = s3_data_type
+            if sample_weight_attribute_name is not None:
+                channel["SampleWeightAttributeName"] = sample_weight_attribute_name
             channels.append(channel)
         elif isinstance(inputs, list):
             for input_entry in inputs:
@@ -974,6 +991,8 @@ def _format_inputs_to_input_config(
                     channel["ContentType"] = content_type
                 if s3_data_type is not None:
                     channel["DataSource"]["S3DataSource"]["S3DataType"] = s3_data_type
+                if sample_weight_attribute_name is not None:
+                    channel["SampleWeightAttributeName"] = sample_weight_attribute_name
                 channels.append(channel)
         else:
             msg = (
 
@@ -17,3 +17,4 @@
 from sagemaker.djl_inference.model import DJLModel  # noqa: F401
 from sagemaker.djl_inference.model import DeepSpeedModel  # noqa: F401
 from sagemaker.djl_inference.model import HuggingFaceAccelerateModel  # noqa: F401
+from sagemaker.djl_inference.model import FasterTransformerModel  # noqa: F401
@@ -30,17 +30,19 @@
     STABLE_DIFFUSION_MODEL_TYPE,
 }
 
-DEEPSPEED_SUPPORTED_ARCHITECTURES = {
+FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES = {
+    "t5",
+}
+
+FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES = {
+    "bert",
+    "gpt2",
     "bloom",
     "opt",
-    "gpt_neox",
     "gptj",
+    "gpt_neox",
     "gpt_neo",
-    "gpt2",
-    "xlm-roberta",
-    "roberta",
-    "bert",
-    STABLE_DIFFUSION_MODEL_TYPE,
+    "t5",
 }
 
 ALLOWED_INSTANCE_FAMILIES = {
 
@@ -52,6 +52,7 @@ class DJLServingEngineEntryPointDefaults(Enum):
     DEEPSPEED = ("DeepSpeed", "djl_python.deepspeed")
     HUGGINGFACE_ACCELERATE = ("Python", "djl_python.huggingface")
     STABLE_DIFFUSION = ("DeepSpeed", "djl_python.stable-diffusion")
+    FASTER_TRANSFORMER = ("FasterTransformer", "djl_python.fastertransformer")
 
 
 class DJLPredictor(Predictor):
@@ -93,30 +94,34 @@ def __init__(
 def _determine_engine_for_model(model_type: str, num_partitions: int, num_heads: int):
     """Placeholder docstring"""
 
-    # Tensor Parallelism with DeepSpeed is only possible if attention heads can be split evenly
+    # Tensor Parallelism is only possible if attention heads can be split evenly
     # across devices
     if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
         return HuggingFaceAccelerateModel
     if model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
         return DeepSpeedModel
+    if model_type in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES:
+        return FasterTransformerModel
     return HuggingFaceAccelerateModel
 
 
 def _validate_engine_for_model_type(cls, model_type: str, num_partitions: int, num_heads: int):
     """Placeholder docstring"""
 
     if cls == DeepSpeedModel:
-        if model_type not in defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES:
-            raise ValueError(
-                f"{model_type} is not supported by DeepSpeed. "
-                f"Supported model_types are {defaults.DEEPSPEED_SUPPORTED_ARCHITECTURES}"
-            )
         if num_heads is not None and num_partitions is not None and num_heads % num_partitions:
             raise ValueError(
                 "The number of attention heads is not evenly divisible by the number of partitions."
                 "Please set the number of partitions such that the number of attention heads can be"
                 "evenly split across the partitions."
             )
+    if cls == FasterTransformerModel:
+        if model_type not in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
+            raise ValueError(
+                f"The model architecture {model_type} is currently not supported by "
+                f"FasterTransformer. Please use a different engine, or use the DJLModel "
+                f"to let SageMaker pick a recommended engine for this model."
+            )
     return cls
 
 
@@ -223,6 +228,8 @@ def __new__(
             instance.engine = DJLServingEngineEntryPointDefaults.STABLE_DIFFUSION
         elif isinstance(instance, DeepSpeedModel):
             instance.engine = DJLServingEngineEntryPointDefaults.DEEPSPEED
+        elif isinstance(instance, FasterTransformerModel):
+            instance.engine = DJLServingEngineEntryPointDefaults.FASTER_TRANSFORMER
         else:
             instance.engine = DJLServingEngineEntryPointDefaults.HUGGINGFACE_ACCELERATE
         return instance
@@ -606,7 +613,7 @@ def serving_image_uri(self, region_name):
             str: The appropriate image URI based on the given parameters.
         """
         if not self.djl_version:
-            self.djl_version = "0.21.0"
+            self.djl_version = "0.22.1"
 
         return image_uris.retrieve(
             self._framework(),
@@ -856,3 +863,64 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
             serving_properties["option.dtype"] = "auto"
             serving_properties.pop("option.load_in_8bit", None)
         return serving_properties
+
+
+class FasterTransformerModel(DJLModel):
+    """A DJL FasterTransformer SageMaker ``Model``
+
+    This can be deployed to a SageMaker ``Endpoint``.
+    """
+
+    _framework_name = "djl-fastertransformer"
+
+    def __init__(
+        self,
+        model_id: str,
+        role: str,
+        tensor_parallel_degree: Optional[int] = None,
+        **kwargs,
+    ):
+        """Initialize a FasterTransformerModel.
+
+        Args:
+            model_id (str): This is either the HuggingFace Hub model_id, or the Amazon S3 location
+                containing the uncompressed model artifacts (i.e. not a tar.gz file).
+                The model artifacts are expected to be in HuggingFace pre-trained model
+                format (i.e. model should be loadable from the huggingface transformers
+                from_pretrained api, and should also include tokenizer configs if applicable).
+            role (str): An AWS IAM role specified with either the name or full ARN. The Amazon
+                SageMaker training jobs and APIs that create Amazon SageMaker
+                endpoints use this role to access model artifacts. After the endpoint is created,
+                the inference code
+                might use the IAM role, if it needs to access an AWS resource.
+            tensor_parallel_degree (int): The number of gpus to shard a single instance of the
+                model across via tensor_parallelism. This should be set to greater than 1 if the
+                size of the model is larger than the memory available on a single GPU on the
+                instance. Defaults to None. If not set, no tensor parallel sharding is done.
+            **kwargs: Keyword arguments passed to the superclasses
+                :class:`~sagemaker.djl_inference.DJLModel`,
+                :class:`~sagemaker.model.FrameworkModel`, and
+                :class:`~sagemaker.model.Model`
+
+        .. tip::
+
+            You can find additional parameters for initializing this class at
+            :class:`~sagemaker.djl_inference.DJLModel`,
+            :class:`~sagemaker.model.FrameworkModel`, and
+            :class:`~sagemaker.model.Model`.
+        """
+
+        super(FasterTransformerModel, self).__init__(
+            model_id,
+            role,
+            **kwargs,
+        )
+        if self.number_of_partitions and tensor_parallel_degree:
+            logger.warning(
+                "Both number_of_partitions and tensor_parallel_degree have been set for "
+                "FasterTransformerModel. "
+                "These mean the same thing for FasterTransformerModel. Please only set "
+                "tensor_parallel_degree. "
+                "number_of_partitions will be ignored"
+            )
+        self.number_of_partitions = tensor_parallel_degree or self.number_of_partitions
@@ -1,6 +1,36 @@
 {
     "scope": ["inference"],
     "versions": {
+        "0.22.1": {
+            "registries": {
+                "af-south-1": "626614931356",
+                "ap-east-1": "871362719292",
+                "ap-northeast-1": "763104351884",
+                "ap-northeast-2": "763104351884",
+                "ap-northeast-3": "364406365360",
+                "ap-south-1": "763104351884",
+                "ap-southeast-1": "763104351884",
+                "ap-southeast-2": "763104351884",
+                "ap-southeast-3": "907027046896",
+                "ca-central-1": "763104351884",
+                "cn-north-1": "727897471807",
+                "cn-northwest-1": "727897471807",
+                "eu-central-1": "763104351884",
+                "eu-north-1": "763104351884",
+                "eu-west-1": "763104351884",
+                "eu-west-2": "763104351884",
+                "eu-west-3": "763104351884",
+                "eu-south-1": "692866216735",
+                "me-south-1": "217643126080",
+                "sa-east-1": "763104351884",
+                "us-east-1": "763104351884",
+                "us-east-2": "763104351884",
+                "us-west-1": "763104351884",
+                "us-west-2": "763104351884"
+            },
+            "repository": "djl-inference",
+            "tag_prefix": "0.22.1-deepspeed0.8.3-cu118"
+        },
         "0.21.0": {
             "registries": {
                 "af-south-1": "626614931356",
 
@@ -1,6 +1,36 @@
 {
     "scope": ["inference"],
     "versions": {
+        "0.22.1": {
+            "registries": {
+                "af-south-1": "626614931356",
+                "ap-east-1": "871362719292",
+                "ap-northeast-1": "763104351884",
+                "ap-northeast-2": "763104351884",
+                "ap-northeast-3": "364406365360",
+                "ap-south-1": "763104351884",
+                "ap-southeast-1": "763104351884",
+                "ap-southeast-2": "763104351884",
+                "ap-southeast-3": "907027046896",
+                "ca-central-1": "763104351884",
+                "cn-north-1": "727897471807",
+                "cn-northwest-1": "727897471807",
+                "eu-central-1": "763104351884",
+                "eu-north-1": "763104351884",
+                "eu-west-1": "763104351884",
+                "eu-west-2": "763104351884",
+                "eu-west-3": "763104351884",
+                "eu-south-1": "692866216735",
+                "me-south-1": "217643126080",
+                "sa-east-1": "763104351884",
+                "us-east-1": "763104351884",
+                "us-east-2": "763104351884",
+                "us-west-1": "763104351884",
+                "us-west-2": "763104351884"
+            },
+            "repository": "djl-inference",
+            "tag_prefix": "0.22.1-fastertransformer5.3.0-cu118"
+        },
         "0.21.0": {
             "registries": {
                 "af-south-1": "626614931356",