diff --git a/README.md b/README.md
index d6a2757..d8ce012 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,14 @@ The custom module can override the following methods:
 * `output_fn(prediction, accept)`: overrides the default method for postprocessing, the return value `result` will be the respond of your request(e.g.`JSON`). The inputs are `predictions`, the result of the `predict()` method and `accept` the return accept type from the HTTP Request, e.g. `application/json`

+## 🏎️ Deploy Models on AWS Inferentia2
+The SageMaker Hugging Face Inference Toolkit supports deploying Hugging Face models on AWS Inferentia2. To deploy a model on Inferentia2 you have three options:
+* Provide `HF_MODEL_ID`, the repo id of a model on huggingface.co that contains the compiled model in the `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`.
+* Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`.
+* Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file of the model archive, e.g. `"neuron": {"static_batch_size": 1, "static_sequence_length": 128}`.
+
+The currently supported tasks are listed [here](https://huggingface.co/docs/optimum-neuron/en/package_reference/supported_models). If you plan to deploy an LLM, we recommend taking a look at [Neuronx TGI](https://huggingface.co/blog/text-generation-inference-on-inferentia2), which is purpose-built for LLMs.

 ---

 ## 🤝 Contributing
@@ -201,4 +208,35 @@ curl --request POST \
 	--header 'Content-Type: application/json' \
 	--data '"{\"inputs\": \"Camera\"}" \
 	--output image.png
+```
+
+
+## Run an Inferentia2 Model Locally
+
+_Note: You need to run this on an Inferentia2 instance._
+
+1. Manually download the `MMS_CONFIG_FILE`:
+```
+wget -O sagemaker-mms.properties https://raw.githubusercontent.com/aws/deep-learning-containers/master/huggingface/build_artifacts/inference/config.properties
+```
+
+2. Adjust `handler_service.py` and comment out `if content_type in content_types.UTF8_TYPES:`; this check is needed on SageMaker but cannot be used locally.
+
+3. Run the model server:
+
+- transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="distilbert/distilbert-base-uncased-finetuned-sst-2-english" HF_TASK="text-classification" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+- sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2" HF_TASK="feature-extraction" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+
+4. Send a request:
+```
+curl --request POST \
+  --url http://localhost:8080/invocations \
+  --header 'Content-Type: application/json' \
+  --data "{\"inputs\": \"I like you.\"}"
+```
\ No newline at end of file
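For context on the README addition above, a minimal sketch of what an Inferentia2 deployment with a pre-compiled repository (option 1) can look like with the SageMaker Python SDK is shown below. The IAM role, container versions, and instance type are illustrative placeholders and not part of this PR; depending on your SDK version you may need to pin the Neuronx DLC explicitly via `image_uri`.

```python
from sagemaker.huggingface import HuggingFaceModel

# Sketch only: role, versions, and instance type are placeholders.
huggingface_model = HuggingFaceModel(
    env={
        "HF_MODEL_ID": "optimum/bge-base-en-v1.5-neuronx",  # repo with compiled .neuron artifacts
        "HF_TASK": "feature-extraction",
    },
    role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",  # placeholder role
    transformers_version="4.34",  # illustrative; use a Neuronx-compatible DLC or pass image_uri
    pytorch_version="1.13",
    py_version="py310",
)

predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf2.xlarge",  # Inferentia2 instance
)

print(predictor.predict({"inputs": "I like you."}))
```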
diff --git a/makefile b/makefile
index 391abfe..7d0420d 100644
--- a/makefile
+++ b/makefile
@@ -5,7 +5,7 @@ check_dirs := src tests

 # run tests
 unit-test:
-	python -m pytest -n auto --dist loadfile -s -v ./tests/unit/
+	python -m pytest -v -s ./tests/unit/

 integ-test:
 	python -m pytest -n 2 -s -v ./tests/integ/
diff --git a/setup.py b/setup.py
index 5e4f72a..7f2515b 100644
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,7 @@

 extras["test"] = [
-    "pytest",
+    "pytest<8",
     "pytest-xdist",
     "parameterized",
     "psutil",
diff --git a/src/sagemaker_huggingface_inference_toolkit/mms_model_server.py b/src/sagemaker_huggingface_inference_toolkit/mms_model_server.py
index e2bc592..8b6b89c 100644
--- a/src/sagemaker_huggingface_inference_toolkit/mms_model_server.py
+++ b/src/sagemaker_huggingface_inference_toolkit/mms_model_server.py
@@ -33,11 +33,11 @@
 )

 from sagemaker_huggingface_inference_toolkit import handler_service
+from sagemaker_huggingface_inference_toolkit.optimum_utils import is_optimum_neuron_available
 from sagemaker_huggingface_inference_toolkit.transformers_utils import (
     HF_API_TOKEN,
     HF_MODEL_REVISION,
     _load_model_from_hub,
-    is_aws_neuron_available,
 )


@@ -73,11 +73,6 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):
     elif use_hf_hub:
         # Use different model store directory
         model_store = DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY
-        if is_aws_neuron_available():
-            raise ValueError(
-                "Hugging Face Hub deployments are currently not supported with AWS Neuron and Inferentia."
-                "You need to create a `inference.py` script to run your model using AWS Neuron"
-            )
         storage_dir = _load_model_from_hub(
             model_id=os.environ["HF_MODEL_ID"],
             model_dir=DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY,
@@ -90,6 +85,15 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):

     env = environment.Environment()

+    # Set the number of workers to the number of available Neuron cores if optimum neuron is available and not already set
+    if is_optimum_neuron_available() and os.environ.get("SAGEMAKER_MODEL_SERVER_WORKERS", None) is None:
+        from optimum.neuron.utils.cache_utils import get_num_neuron_cores
+
+        try:
+            env._model_server_workers = str(get_num_neuron_cores())
+        except Exception:
+            env._model_server_workers = "1"
+
     # Note: multi-model default config already sets default_service_handler
     handler_service_for_config = None if ENABLE_MULTI_MODEL else handler_service
     _create_model_server_config_file(env, handler_service_for_config)
diff --git a/src/sagemaker_huggingface_inference_toolkit/optimum_utils.py b/src/sagemaker_huggingface_inference_toolkit/optimum_utils.py
index 22a5202..63151b9 100644
--- a/src/sagemaker_huggingface_inference_toolkit/optimum_utils.py
+++ b/src/sagemaker_huggingface_inference_toolkit/optimum_utils.py
@@ -38,21 +38,25 @@ def get_input_shapes(model_dir):
     # try to get input shapes from config file
     try:
         config = AutoConfig.from_pretrained(model_dir)
-        if hasattr(config, "neuron_batch_size") and hasattr(config, "neuron_sequence_length"):
-            input_shapes["batch_size"] = config.neuron_batch_size
-            input_shapes["sequence_length"] = config.neuron_sequence_length
-            input_shapes_available = True
-            logger.info(
-                f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
-            )
-            if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
-                logger.warning(
-                    "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
-                )
-            if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
-                logger.warning(
-                    "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+        if hasattr(config, "neuron"):
+            # check if static batch size and sequence length are available
+            if config.neuron.get("static_batch_size", None) and config.neuron.get("static_sequence_length", None):
+                input_shapes["batch_size"] = config.neuron["static_batch_size"]
+                input_shapes["sequence_length"] = config.neuron["static_sequence_length"]
+                input_shapes_available = True
+                logger.info(
+                    f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
                 )
+            else:
+                # Add warning if environment variables are set but will be ignored
+                if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
+                if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
     except Exception:
         input_shapes_available = False
@@ -62,6 +66,11 @@
     # extract input shapes from environment variables
     sequence_length = os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None)
+    if sequence_length is None:
+        raise ValueError(
+            "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is not set. Please set HF_OPTIMUM_SEQUENCE_LENGTH to a positive integer."
+        )
+
     if not int(sequence_length) > 0:
         raise ValueError(
             f"HF_OPTIMUM_SEQUENCE_LENGTH must be set to a positive integer. Current value is {sequence_length}"
@@ -73,10 +82,9 @@
     return {"batch_size": int(batch_size), "sequence_length": int(sequence_length)}


-# TODO: not used yet, need to sync on how to determine if we are running on inf2 instance
 def get_optimum_neuron_pipeline(task, model_dir):
     """Method to get optimum neuron pipeline for a given task. Method checks if task is supported by optimum neuron and if required environment variables are set, in case model is not converted. If all checks pass, optimum neuron pipeline is returned. If checks fail, an error is raised."""
-    from optimum.neuron.pipelines import NEURONX_SUPPORTED_TASKS, pipeline
+    from optimum.neuron.pipelines.transformers.base import NEURONX_SUPPORTED_TASKS, pipeline
     from optimum.neuron.utils import NEURON_FILE_NAME

     # check task support
@@ -94,6 +102,8 @@
     # get static input shapes to run inference
     input_shapes = get_input_shapes(model_dir)
+    # set NEURON_RT_NUM_CORES to 1 to avoid conflicts with multiple HTTP workers
+    os.environ["NEURON_RT_NUM_CORES"] = "1"
     # get optimum neuron pipeline
     neuron_pipe = pipeline(task, model=model_dir, export=export, input_shapes=input_shapes)
diff --git a/src/sagemaker_huggingface_inference_toolkit/transformers_utils.py b/src/sagemaker_huggingface_inference_toolkit/transformers_utils.py
index ba8141a..80fbc79 100644
--- a/src/sagemaker_huggingface_inference_toolkit/transformers_utils.py
+++ b/src/sagemaker_huggingface_inference_toolkit/transformers_utils.py
@@ -24,6 +24,10 @@
 from transformers.pipelines import Conversation, Pipeline

 from sagemaker_huggingface_inference_toolkit.diffusers_utils import get_diffusers_pipeline, is_diffusers_available
+from sagemaker_huggingface_inference_toolkit.optimum_utils import (
+    get_optimum_neuron_pipeline,
+    is_optimum_neuron_available,
+)


 if is_tf_available():
@@ -71,6 +75,7 @@ def strtobool(val):
     "savedmodel": "*tar.gz",
     "openvino": "*openvino*",
     "ckpt": "*ckpt",
+    "neuronx": "*neuron",
 }


@@ -202,7 +207,9 @@ def _load_model_from_hub(
     # check if safetensors weights are available
     if framework == "pytorch":
         files = HfApi().model_info(model_id).siblings
-        if any(f.rfilename.endswith("safetensors") for f in files):
+        if is_optimum_neuron_available() and any(f.rfilename.endswith("neuron") for f in files):
+            framework = "neuronx"
+        elif any(f.rfilename.endswith("safetensors") for f in files):
             framework = "safetensors"

     # create regex to only include the framework specific weights
@@ -282,8 +289,10 @@ def get_pipeline(task: str, device: int, model_dir: Path, **kwargs) -> Pipeline:
         kwargs["feature_extractor"] = model_dir
     else:
         kwargs["tokenizer"] = model_dir
-
-    if TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
+    # check if optimum neuron is available and try to load the optimum neuron pipeline
+    if is_optimum_neuron_available():
+        hf_pipeline = get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
+    elif TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
         tokenizer = AutoTokenizer.from_pretrained(os.environ["HF_MODEL_ID"])

         hf_pipeline = pipeline(
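Taken together, the `optimum_utils.py` and `transformers_utils.py` changes above resolve the static input shapes either from a `neuron` dictionary in the model's `config.json` or from the `HF_OPTIMUM_*` environment variables. The sketch below illustrates both paths; the model directory paths are placeholders and the snippet assumes the toolkit and its dependencies are installed.

```python
import os

from sagemaker_huggingface_inference_toolkit.optimum_utils import get_input_shapes

# Path 1: a pre-compiled model whose config.json carries the shapes, e.g.
#   "neuron": {"static_batch_size": 1, "static_sequence_length": 128}
# Shapes from config.json take precedence over any HF_OPTIMUM_* environment variables.
shapes = get_input_shapes(model_dir="/opt/ml/model")  # placeholder path
print(shapes)  # e.g. {"batch_size": 1, "sequence_length": 128}

# Path 2: an un-compiled model, where the shapes must come from the environment
# so the pipeline can be exported on the fly.
os.environ["HF_OPTIMUM_BATCH_SIZE"] = "1"
os.environ["HF_OPTIMUM_SEQUENCE_LENGTH"] = "128"
shapes = get_input_shapes(model_dir="/path/to/uncompiled/model")  # placeholder path
```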
diff --git a/tests/unit/test_handler_service_without_context.py b/tests/unit/test_handler_service_without_context.py
index cda8360..37c37e7 100644
--- a/tests/unit/test_handler_service_without_context.py
+++ b/tests/unit/test_handler_service_without_context.py
@@ -77,9 +77,7 @@ def test_handle(inference_handler):
     inference_handler.initialize(CONTEXT)
     json_data = json.dumps(INPUT)
     prediction = inference_handler.handle([{"body": json_data.encode()}], CONTEXT)
-    loaded_response = json.loads(prediction[0])
-    assert "entity" in loaded_response[0]
-    assert "score" in loaded_response[0]
+    assert "output" in prediction[0]


 @require_torch
@@ -90,13 +88,15 @@ def test_load(inference_handler):
         model_dir=tmpdirname,
     )
     # test with automatic infer
+    if "HF_TASK" in os.environ:
+        del os.environ["HF_TASK"]
     hf_pipeline_without_task = inference_handler.load(storage_folder)
     assert hf_pipeline_without_task.task == "token-classification"

     # test with automatic infer
-    os.environ["HF_TASK"] = TASK
+    os.environ["HF_TASK"] = "text-classification"
     hf_pipeline_with_task = inference_handler.load(storage_folder)
-    assert hf_pipeline_with_task.task == TASK
+    assert hf_pipeline_with_task.task == "text-classification"


 def test_preprocess(inference_handler):
@@ -139,10 +139,7 @@ def test_validate_and_initialize_user_module(inference_handler):
     prediction = inference_handler.handle([{"body": b""}], CONTEXT)
     assert "output" in prediction[0]

-    assert inference_handler.load({}) == "model"
-    assert inference_handler.preprocess({}, "") == "data"
-    assert inference_handler.predict({}, "model") == "output"
-    assert inference_handler.postprocess("output", "") == "output"
+    assert inference_handler.load({}) == "Loading inference_tranform_fn.py"


 def test_validate_and_initialize_user_module_transform_fn():
diff --git a/tests/unit/test_mms_model_server.py b/tests/unit/test_mms_model_server.py
index 07693af..a38d5d8 100644
--- a/tests/unit/test_mms_model_server.py
+++ b/tests/unit/test_mms_model_server.py
@@ -13,7 +13,6 @@
 # limitations under the License.import os

 import os
-import pytest
 from sagemaker_inference.environment import model_dir

 from mock import patch
@@ -186,35 +185,3 @@ def test_start_mms_with_model_from_hub(
     subprocess_popen.assert_called_once_with(multi_model_server_cmd)
     sigterm.assert_called_once_with(retrieve.return_value)
     os.remove(mms_model_server.DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY)
-
-
-@patch("sagemaker_huggingface_inference_toolkit.transformers_utils._aws_neuron_available", return_value=True)
-@patch("subprocess.call")
-@patch("subprocess.Popen")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._retry_retrieve_mms_server_process")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._load_model_from_hub")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._add_sigterm_handler")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._install_requirements")
-@patch("os.makedirs", return_value=True)
-@patch("os.remove", return_value=True)
-@patch("os.path.exists", return_value=True)
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._create_model_server_config_file")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._adapt_to_mms_format")
-def test_start_mms_neuron_and_model_from_hub(
-    adapt,
-    create_config,
-    exists,
-    remove,
-    dir,
-    install_requirements,
-    sigterm,
-    load_model_from_hub,
-    retrieve,
-    subprocess_popen,
-    subprocess_call,
-    _aws_neuron_available,
-):
-    with pytest.raises(ValueError):
-        os.environ["HF_MODEL_ID"] = "lysandre/tiny-bert-random"
-
-        mms_model_server.start_model_server()
diff --git a/tests/unit/test_optimum_utils.py b/tests/unit/test_optimum_utils.py
index 5a22072..7910ef0 100644
--- a/tests/unit/test_optimum_utils.py
+++ b/tests/unit/test_optimum_utils.py
@@ -54,7 +54,7 @@ def test_get_input_shapes_from_file():
     )
     input_shapes = get_input_shapes(model_dir=storage_folder)
     assert input_shapes["batch_size"] == 1
-    assert input_shapes["sequence_length"] == 16
+    assert input_shapes["sequence_length"] == 32


 @require_torch
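The updated expectation above (sequence length 32) exercises the config-file path. A complementary environment-variable test could look like the sketch below; it is illustrative only and not part of this PR, and it assumes `pytest` and `transformers` are installed and that an empty temporary directory makes `get_input_shapes` fall back to the `HF_OPTIMUM_*` variables.

```python
import tempfile

from sagemaker_huggingface_inference_toolkit.optimum_utils import get_input_shapes


def test_get_input_shapes_from_env(monkeypatch):
    # No config.json in the temporary directory, so the shapes must come
    # from the HF_OPTIMUM_* environment variables.
    monkeypatch.setenv("HF_OPTIMUM_BATCH_SIZE", "4")
    monkeypatch.setenv("HF_OPTIMUM_SEQUENCE_LENGTH", "32")
    with tempfile.TemporaryDirectory() as tmpdirname:
        input_shapes = get_input_shapes(model_dir=tmpdirname)
    assert input_shapes["batch_size"] == 4
    assert input_shapes["sequence_length"] == 32
```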