Commit 902fa80
Merge branch 'aws:master' into master
2 parents: 2508683 + 219ad24

File tree: 10 files changed, +330 −11 lines changed

doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst

Lines changed: 96 additions & 0 deletions
@@ -494,6 +494,102 @@ smdistributed.modelparallel.torch.DistributedOptimizer
 ``state_dict`` contains elements corresponding to only the current
 partition, or to the entire model.
 
+smdistributed.modelparallel.torch.nn.FlashAttentionLayer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. function:: smdistributed.modelparallel.torch.nn.FlashAttentionLayer(attention_dropout_prob=0.1, attention_head_size=None, scale_attention_scores=True, scale_attn_by_layer_idx=False, layer_idx=None, scale=None, triton_flash_attention=False, use_alibi=False)
+
+   This class supports
+   `FlashAttention <https://github.com/HazyResearch/flash-attention>`_
+   for PyTorch 2.0.
+   It takes the ``qkv`` matrix as an argument through its ``forward`` class method,
+   computes attention scores and probabilities,
+   and then performs the matrix multiplication with the value layer.
+
+   Through this class, the smp library supports
+   custom attention masks such as Attention with
+   Linear Biases (ALiBi), which you can activate by setting
+   ``triton_flash_attention`` and ``use_alibi`` to ``True``.
+
+   Note that the Triton flash attention does not support dropout
+   on the attention probabilities. It uses the standard lower triangular
+   causal mask when causal mode is enabled. It also runs only
+   on P4d and P4de instances, with fp16 or bf16.
+
+   This class computes the scale factor to apply when computing attention.
+   By default, ``scale`` is set to ``None`` and is calculated automatically.
+   When ``scale_attention_scores`` is ``True`` (the default), you must pass a value
+   to ``attention_head_size``. When ``scale_attn_by_layer_idx`` is ``True``,
+   you must pass a value to ``layer_idx``. If both factors are used, they are
+   multiplied as follows: ``(1/(sqrt(attention_head_size) * (layer_idx+1)))``.
+   This scale calculation can be bypassed by specifying a custom scaling
+   factor through ``scale``. In other words, if you specify a value for ``scale``,
+   the set of parameters (``scale_attention_scores``, ``attention_head_size``,
+   ``scale_attn_by_layer_idx``, ``layer_idx``) is overridden and ignored.
+
+   **Parameters**
+
+   * ``attention_dropout_prob`` (float): (default: 0.1) specifies the dropout probability
+     to apply to attention.
+   * ``attention_head_size`` (int): Required when ``scale_attention_scores`` is ``True``.
+     When ``scale_attention_scores`` is passed, this contributes
+     ``1/sqrt(attention_head_size)`` to the scale factor.
+   * ``scale_attention_scores`` (boolean): (default: True) determines whether
+     to multiply the scale factor by ``1/sqrt(attention_head_size)``.
+   * ``layer_idx`` (int): Required when ``scale_attn_by_layer_idx`` is ``True``.
+     The layer id to use for scaling attention by layer id.
+     It contributes ``1/(layer_idx + 1)`` to the scale factor.
+   * ``scale_attn_by_layer_idx`` (boolean): (default: False) determines whether
+     to multiply the scale factor by ``1/(layer_idx + 1)``.
+   * ``scale`` (float): (default: None) If passed, this scale factor is
+     applied, bypassing all of the previous arguments.
+   * ``triton_flash_attention`` (bool): (default: False) If passed, the Triton
+     implementation of flash attention is used. This is necessary to support
+     Attention with Linear Biases (ALiBi) (see the next argument). Note that this
+     version of the kernel doesn’t support dropout.
+   * ``use_alibi`` (bool): (default: False) If passed, enables Attention with
+     Linear Biases (ALiBi) using the mask provided.
+
+   .. method:: forward(self, qkv, attn_mask=None, causal=False)
+
+      Returns a single ``torch.Tensor`` of shape ``(batch_size x num_heads x seq_len x head_size)``,
+      which represents the output of the attention computation.
+
+      **Parameters**
+
+      * ``qkv``: ``torch.Tensor`` in the form of ``(batch_size x seqlen x 3 x num_heads x head_size)``.
+      * ``attn_mask``: ``torch.Tensor`` in the form of ``(batch_size x 1 x 1 x seqlen)``.
+        By default it is ``None``, and using this mask requires ``triton_flash_attention``
+        and ``use_alibi`` to be set. See how to generate the mask in the following code snippet.
+      * ``causal``: When passed, it uses the standard lower triangular mask. The default is ``False``.
+
+      When using ALiBi, the layer needs an attention mask prepared like the following.
+
+      .. code:: python
+
+         def generate_alibi_attn_mask(attention_mask, batch_size, seq_length,
+                                      num_attention_heads, alibi_bias_max=8):
+
+             device, dtype = attention_mask.device, attention_mask.dtype
+             alibi_attention_mask = torch.zeros(
+                 1, num_attention_heads, 1, seq_length, dtype=dtype, device=device
+             )
+
+             alibi_bias = torch.arange(1 - seq_length, 1, dtype=dtype, device=device).view(
+                 1, 1, 1, seq_length
+             )
+             m = torch.arange(1, num_attention_heads + 1, dtype=dtype, device=device)
+             m.mul_(alibi_bias_max / num_attention_heads)
+             alibi_bias = alibi_bias * (1.0 / (2 ** m.view(1, num_attention_heads, 1, 1)))
+
+             alibi_attention_mask.add_(alibi_bias)
+             alibi_attention_mask = alibi_attention_mask[..., :seq_length, :seq_length]
+             if attention_mask is not None and attention_mask.bool().any():
+                 # masked_fill is out-of-place; keep the result so the padded
+                 # positions are actually set to -inf.
+                 alibi_attention_mask = alibi_attention_mask.masked_fill(
+                     attention_mask.bool().view(batch_size, 1, 1, seq_length), float("-inf")
+                 )
+
+             return alibi_attention_mask
 
 smdistributed.modelparallel.torch Context Managers and Util Functions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
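
For orientation, here is a minimal usage sketch of the layer documented above. It assumes a P4d/P4de instance with the smp library installed and reuses the ``generate_alibi_attn_mask`` helper from the snippet; the shapes, dtype, and argument values are illustrative, not taken from this commit.

.. code:: python

    import torch
    import smdistributed.modelparallel.torch as smp

    # Illustrative sizes; any values matching the documented qkv layout work.
    batch_size, seq_length, num_heads, head_size = 2, 128, 16, 64

    layer = smp.nn.FlashAttentionLayer(
        attention_dropout_prob=0.0,     # the Triton kernel does not support dropout
        attention_head_size=head_size,  # required because scale_attention_scores=True
        scale_attention_scores=True,
        triton_flash_attention=True,    # required for custom masks such as ALiBi
        use_alibi=True,
    )

    # qkv packed as (batch_size x seqlen x 3 x num_heads x head_size), fp16/bf16 on GPU.
    qkv = torch.randn(
        batch_size, seq_length, 3, num_heads, head_size,
        dtype=torch.bfloat16, device="cuda",
    )

    # No padded positions in this example, so the padding mask is all zeros.
    padding_mask = torch.zeros(
        batch_size, seq_length, dtype=torch.bfloat16, device="cuda"
    )
    alibi_mask = generate_alibi_attn_mask(
        padding_mask, batch_size, seq_length, num_heads
    )

    # Output shape: (batch_size x num_heads x seq_len x head_size).
    out = layer(qkv, attn_mask=alibi_mask, causal=True)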
Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 doc8==0.10.1
-Pygments==2.11.2
+Pygments==2.15.0

src/sagemaker/jumpstart/types.py

Lines changed: 4 additions & 0 deletions
@@ -334,6 +334,8 @@ class JumpStartModelSpecs(JumpStartDataHolderType):
         "training_dependencies",
         "training_vulnerabilities",
         "deprecated",
+        "deprecated_message",
+        "deprecate_warn_message",
         "default_inference_instance_type",
         "supported_inference_instance_types",
         "default_training_instance_type",
@@ -389,6 +391,8 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:
         self.training_dependencies: List[str] = json_obj["training_dependencies"]
         self.training_vulnerabilities: List[str] = json_obj["training_vulnerabilities"]
         self.deprecated: bool = bool(json_obj["deprecated"])
+        self.deprecated_message: Optional[str] = json_obj.get("deprecated_message")
+        self.deprecate_warn_message: Optional[str] = json_obj.get("deprecate_warn_message")
         self.default_inference_instance_type: Optional[str] = json_obj.get(
             "default_inference_instance_type"
         )
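
The two new fields are read with ``dict.get``, so spec documents that predate them still deserialize. A minimal sketch of that pattern, using stand-in spec fragments rather than real JumpStart specs:

.. code:: python

    from typing import Any, Dict, Optional

    # Stand-in spec fragments; real spec documents carry many more fields.
    old_spec: Dict[str, Any] = {"deprecated": False}
    new_spec: Dict[str, Any] = {
        "deprecated": True,
        "deprecated_message": "This model is deprecated; use its successor instead.",
    }

    # Mirrors the from_json change: .get() yields None when a key is absent.
    msg: Optional[str] = old_spec.get("deprecated_message")       # None
    warn: Optional[str] = new_spec.get("deprecate_warn_message")  # None
    print(new_spec.get("deprecated_message"))                     # deprecation text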

src/sagemaker/jumpstart/utils.py

Lines changed: 6 additions & 1 deletion
@@ -415,9 +415,14 @@ def verify_model_region_and_return_specs(
 
     if model_specs.deprecated:
         if not tolerate_deprecated_model:
-            raise DeprecatedJumpStartModelError(model_id=model_id, version=version)
+            raise DeprecatedJumpStartModelError(
+                model_id=model_id, version=version, message=model_specs.deprecated_message
+            )
         LOGGER.warning("Using deprecated JumpStart model '%s' and version '%s'.", model_id, version)
 
+    if model_specs.deprecate_warn_message:
+        LOGGER.warning(model_specs.deprecate_warn_message)
+
     if scope == constants.JumpStartScriptScope.INFERENCE.value and model_specs.inference_vulnerable:
         if not tolerate_vulnerable_model:
             raise VulnerableJumpStartModelError(
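
A hedged sketch of the resulting caller-side behavior; the model id is a placeholder, and the import of ``DeprecatedJumpStartModelError`` from ``sagemaker.jumpstart.exceptions`` is assumed:

.. code:: python

    from sagemaker.jumpstart.enums import JumpStartScriptScope
    from sagemaker.jumpstart.exceptions import DeprecatedJumpStartModelError
    from sagemaker.jumpstart.utils import verify_model_region_and_return_specs

    try:
        specs = verify_model_region_and_return_specs(
            model_id="some-deprecated-model-id",  # placeholder id
            version="*",
            scope=JumpStartScriptScope.INFERENCE,
            region="us-west-2",
        )
    except DeprecatedJumpStartModelError as error:
        # With this change, the error can carry the spec's deprecated_message.
        # Passing tolerate_deprecated_model=True instead logs a warning, and a
        # non-empty deprecate_warn_message is now always logged as a warning.
        print(error)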

src/sagemaker/jumpstart/validators.py

Lines changed: 12 additions & 4 deletions
@@ -15,10 +15,15 @@
 from typing import Any, Dict, List, Optional
 from sagemaker.jumpstart.constants import JUMPSTART_DEFAULT_REGION_NAME
 
-from sagemaker.jumpstart.enums import HyperparameterValidationMode, VariableScope, VariableTypes
-from sagemaker.jumpstart import accessors as jumpstart_accessors
+from sagemaker.jumpstart.enums import (
+    HyperparameterValidationMode,
+    JumpStartScriptScope,
+    VariableScope,
+    VariableTypes,
+)
 from sagemaker.jumpstart.exceptions import JumpStartHyperparametersError
 from sagemaker.jumpstart.types import JumpStartHyperparameter
+from sagemaker.jumpstart.utils import verify_model_region_and_return_specs
 
 
 def _validate_hyperparameter(
@@ -190,8 +195,11 @@ def validate_hyperparameters(
     if region is None:
         region = JUMPSTART_DEFAULT_REGION_NAME
 
-    model_specs = jumpstart_accessors.JumpStartModelsAccessor.get_model_specs(
-        region=region, model_id=model_id, version=model_version
+    model_specs = verify_model_region_and_return_specs(
+        model_id=model_id,
+        version=model_version,
+        region=region,
+        scope=JumpStartScriptScope.TRAINING,
     )
     hyperparameters_specs = model_specs.hyperparameters

tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py

Lines changed: 35 additions & 0 deletions
@@ -14,6 +14,10 @@
 import os
 import time
 
+import pytest
+
+import tests.integ
+
 from sagemaker.jumpstart.model import JumpStartModel
 from tests.integ.sagemaker.jumpstart.constants import (
     ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID,
@@ -29,6 +33,8 @@
 
 MAX_INIT_TIME_SECONDS = 5
 
+MODEL_PACKAGE_ARN_SUPPORTED_REGIONS = {"us-west-2", "us-east-1"}
+
 
 def test_non_prepacked_jumpstart_model(setup):
 
@@ -73,6 +79,35 @@ def test_prepacked_jumpstart_model(setup):
     assert response is not None
 
 
+@pytest.mark.skipif(
+    tests.integ.test_region() not in MODEL_PACKAGE_ARN_SUPPORTED_REGIONS,
+    reason=f"JumpStart Model Package models unavailable in {tests.integ.test_region()}.",
+)
+def test_model_package_arn_jumpstart_model(setup):
+
+    model_id = "meta-textgeneration-llama-2-7b"
+
+    model = JumpStartModel(
+        model_id=model_id,
+        role=get_sm_session().get_caller_identity_arn(),
+        sagemaker_session=get_sm_session(),
+    )
+
+    # uses ml.g5.2xlarge instance
+    predictor = model.deploy(
+        tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
+    )
+
+    payload = {
+        "inputs": "some-payload",
+        "parameters": {"max_new_tokens": 256, "top_p": 0.9, "temperature": 0.6},
+    }
+
+    response = predictor.predict(payload, custom_attributes="accept_eula=true")
+
+    assert response is not None
+
+
 def test_instatiating_model_not_too_slow(setup):
 
     model_id = "catboost-regression-model"

tests/scripts/run-notebook-test.sh

Lines changed: 0 additions & 5 deletions
@@ -126,12 +126,7 @@ echo "set SAGEMAKER_ROLE_ARN=$SAGEMAKER_ROLE_ARN"
 ./amazon-sagemaker-examples/advanced_functionality/kmeans_bring_your_own_model/kmeans_bring_your_own_model.ipynb \
 ./amazon-sagemaker-examples/advanced_functionality/tensorflow_iris_byom/tensorflow_BYOM_iris.ipynb \
 ./amazon-sagemaker-examples/sagemaker-python-sdk/1P_kmeans_highlevel/kmeans_mnist.ipynb \
-./amazon-sagemaker-examples/sagemaker-python-sdk/1P_kmeans_lowlevel/kmeans_mnist_lowlevel.ipynb \
-./amazon-sagemaker-examples/sagemaker-python-sdk/mxnet_gluon_sentiment/mxnet_sentiment_analysis_with_gluon.ipynb \
-./amazon-sagemaker-examples/sagemaker-python-sdk/mxnet_onnx_export/mxnet_onnx_export.ipynb \
 ./amazon-sagemaker-examples/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb \
 ./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_moving_from_framework_mode_to_script_mode/tensorflow_moving_from_framework_mode_to_script_mode.ipynb \
-./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_script_mode_pipe_mode/tensorflow_script_mode_pipe_mode.ipynb \
-./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_serving_using_elastic_inference_with_your_own_model/tensorflow_serving_pretrained_model_elastic_inference.ipynb \
 
 (DeleteLifeCycleConfig "$LIFECYCLE_CONFIG_NAME")

tests/unit/sagemaker/jumpstart/constants.py

Lines changed: 59 additions & 0 deletions
@@ -14,6 +14,63 @@
 
 
 SPECIAL_MODEL_SPECS_DICT = {
+    "js-model-package-arn": {
+        "model_id": "meta-textgeneration-llama-2-7b-f",
+        "url": "https://ai.meta.com/resources/models-and-libraries/llama-downloads/",
+        "version": "1.0.0",
+        "min_sdk_version": "2.173.0",
+        "training_supported": False,
+        "incremental_training_supported": False,
+        "hosting_ecr_specs": {
+            "framework": "pytorch",
+            "framework_version": "1.12.0",
+            "py_version": "py38",
+        },
+        "hosting_artifact_key": "meta-infer/infer-meta-textgeneration-llama-2-7b-f.tar.gz",
+        "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.0.0/sourcedir.tar.gz",
+        "hosting_eula_key": "fmhMetadata/eula/llamaEula.txt",
+        "hosting_model_package_arns": {
+            "us-west-2": "arn:aws:sagemaker:us-west-2:594846645681:model-package/"
+            "llama2-7b-f-e46eb8a833643ed58aaccd81498972c3",
+            "us-east-1": "arn:aws:sagemaker:us-east-1:865070037744:model-package/"
+            "llama2-7b-f-e46eb8a833643ed58aaccd81498972c3",
+        },
+        "inference_vulnerable": False,
+        "inference_dependencies": [],
+        "inference_vulnerabilities": [],
+        "training_vulnerable": False,
+        "training_dependencies": [],
+        "training_vulnerabilities": [],
+        "deprecated": False,
+        "inference_environment_variables": [],
+        "metrics": [],
+        "default_inference_instance_type": "ml.g5.2xlarge",
+        "supported_inference_instance_types": [
+            "ml.g5.2xlarge",
+            "ml.g5.4xlarge",
+            "ml.g5.8xlarge",
+            "ml.g5.12xlarge",
+            "ml.g5.24xlarge",
+            "ml.g5.48xlarge",
+            "ml.p4d.24xlarge",
+        ],
+        "model_kwargs": {},
+        "deploy_kwargs": {
+            "model_data_download_timeout": 3600,
+            "container_startup_health_check_timeout": 3600,
+        },
+        "predictor_specs": {
+            "supported_content_types": ["application/json"],
+            "supported_accept_types": ["application/json"],
+            "default_content_type": "application/json",
+            "default_accept_type": "application/json",
+        },
+        "inference_volume_size": 256,
+        "inference_enable_network_isolation": True,
+        "validation_supported": False,
+        "fine_tuning_supported": False,
+        "resource_name_base": "meta-textgeneration-llama-2-7b-f",
+    },
     "js-trainable-model-prepacked": {
         "model_id": "huggingface-text2text-flan-t5-base",
         "url": "https://huggingface.co/google/flan-t5-base",
@@ -2299,6 +2356,8 @@
         "training_script_key": "source-directory-tarballs/pytorch/transfer_learning/ic/v1.0.0/sourcedir.tar.gz",
         "training_prepacked_script_key": None,
         "hosting_prepacked_artifact_key": None,
+        "deprecate_warn_message": None,
+        "deprecated_message": None,
         "hosting_model_package_arns": None,
         "hosting_eula_key": None,
         "hyperparameters": [
