
Commit 698422d

Merge pull request #58 from aws/add-audio
Add audio
2 parents 1e57715 + 2ecd873 commit 698422d

File tree

16 files changed, +250 -10 lines changed


Dockerfile.cpu

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
FROM ubuntu:20.04

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# Specify accept-bind-to-port LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
# https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipeline-real-time.html
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
# Specify multi-models LABEL to indicate container is capable of loading and serving multiple models concurrently
# https://docs.aws.amazon.com/sagemaker/latest/dg/build-multi-model-build-container.html
LABEL com.amazonaws.sagemaker.capabilities.multi-models=true

ARG MMS_VERSION=1.1.8
ARG PYTHON=python3
ARG PYTHON_VERSION=3.8.10
ARG OPEN_MPI_VERSION=4.0.1
# HF ARGS
ARG PT_INFERENCE_URL=https://pytorch-ei-binaries.s3.us-west-2.amazonaws.com/r1.10.0_inference/20211027-061940/36dea191ed0df524207de5acc4e6fb4322306d1a/cpu/torch-1.10.0%2Bcpu-cp38-cp38-manylinux1_x86_64.whl
ARG TRANSFORMERS_VERSION

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LD_LIBRARY_PATH="/opt/conda/lib/:${LD_LIBRARY_PATH}:/usr/local/lib" \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TEMP=/home/model-server/tmp \
    DEBIAN_FRONTEND=noninteractive

ENV PATH /opt/conda/bin:$PATH

RUN apt-get update \
 # TODO: Remove upgrade statements once packages are updated in base image
 && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
 && apt-get install -y --no-install-recommends \
    ca-certificates \
    build-essential \
    openssl \
    openjdk-8-jdk-headless \
    vim \
    wget \
    curl \
    emacs \
    unzip \
    git \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && chmod +x ~/miniconda.sh \
 && ~/miniconda.sh -b -p /opt/conda \
 && rm ~/miniconda.sh \
 && /opt/conda/bin/conda update conda \
 && /opt/conda/bin/conda install -c conda-forge \
    python=$PYTHON_VERSION \
 && /opt/conda/bin/conda install -y \
    # conda 4.10.0 requires ruamel_yaml to be installed. Currently pinned at latest.
    ruamel_yaml==0.15.100 \
    cython \
    "mkl-include==2021.4.0" \
    "mkl==2021.4.0" \
    botocore \
 && /opt/conda/bin/conda clean -ya

RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
 && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
 && pip install packaging==20.4 \
    enum-compat==0.0.3 \
    "cryptography>3.2"

RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
 && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
 && cd openmpi-$OPEN_MPI_VERSION \
 && ./configure --prefix=/home/.openmpi \
 && make all install \
 && cd .. \
 && rm openmpi-$OPEN_MPI_VERSION.tar.gz \
 && rm -rf openmpi-$OPEN_MPI_VERSION

# The ENV variables declared below are changed in the previous section.
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers.
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

WORKDIR /

RUN pip install --no-cache-dir \
    multi-model-server==$MMS_VERSION \
    sagemaker-inference

RUN useradd -m model-server \
 && mkdir -p /home/model-server/tmp \
 && chown -R model-server /home/model-server

COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY config.properties /etc/sagemaker-mms.properties

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/deep_learning_container.py

#################################
# Hugging Face specific section #
#################################

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.10/license.txt -o /license.txt

# Uninstall and re-install torch and torchvision from the PyTorch website
RUN pip uninstall -y torch \
 && pip install --no-cache-dir -U $PT_INFERENCE_URL

# Install Hugging Face libraries and their dependencies
RUN pip install --no-cache-dir \
    transformers[sentencepiece]==${TRANSFORMERS_VERSION} \
    protobuf==3.12.0

RUN apt-get update \
 && apt-get install -y libsndfile1 ffmpeg \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
RUN pip install torchaudio

# TEMP: Copying package
COPY src /tmp/inference/src
COPY ./README.md /tmp/inference/README.md
COPY ./setup.py /tmp/inference/setup.py

# Install the SageMaker Inference Toolkit to set up MMS
RUN pip3 --no-cache-dir install "/tmp/inference"
# torch==1.6.0

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

EXPOSE 8080 8081
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["serve"]

README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 </div>
 
 
-* [x] ImageSegmentationPipeline
+* [X] ImageSegmentationPipeline
 * [ ] ObjectDetectionPipeline we need `timm` installed
 * [ ] ImageClassificationPipeline we need `timm` installed
 * [ ] AutomaticSpeechRecognitionPipeline

config.properties

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
vmargs=-XX:-UseContainerSupport -XX:InitialRAMPercentage=8.0 -XX:MaxRAMPercentage=10.0 -XX:-UseLargePages -XX:+UseG1GC -XX:+ExitOnOutOfMemoryError
model_store=/opt/ml/model
load_models=ALL
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081

makefile

Lines changed: 12 additions & 6 deletions
@@ -29,13 +29,19 @@ style:
 
 run:
 	docker run -t -i \
-		--env HF_TASK="automatic-speech-recognition" \
-		--env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
-		-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu
+		--env HF_TASK="image-classification" \
+		--env HF_MODEL_ID="google/vit-base-patch16-224" \
+		-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu
+
+	# docker run -t -i \
+	# 	--env HF_TASK="automatic-speech-recognition" \
+	# 	--env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
+	# 	-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu
+
 
 build:
-	docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu \
-	--build-arg TRANSFORMERS_VERSION=4.9.2 \
-	--file ./docker/Dockerfile.cpu \
+	docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu \
+	--build-arg TRANSFORMERS_VERSION=4.16.2 \
+	--file ./Dockerfile.cpu \
 	.
 start: build run
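For a quick end-to-end check of the new audio path, here is a minimal Python sketch (assumptions: the container was started with the commented-out automatic-speech-recognition variant of the run target above, the server exposes the standard SageMaker /invocations route on port 8080, and the sample file path comes from the test resources added in this PR):

import requests

# Raw FLAC bytes from the sample added under tests/resources/audio in this PR.
with open("tests/resources/audio/sample1.flac", "rb") as f:
    audio = f.read()

# audio/x-flac is one of the new audio MIME types, so the toolkit routes the body to decode_audio.
response = requests.post(
    "http://localhost:8080/invocations",
    data=audio,
    headers={"Content-Type": "audio/x-flac", "Accept": "application/json"},
)
print(response.json())  # expected shape: {"text": "..."}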

setup.py

Lines changed: 11 additions & 1 deletion
@@ -32,13 +32,23 @@
 
 VERSION = "1.3.1"
 
+
+# Ubuntu packages
+# libsndfile1-dev: torchaudio requires the development version of the libsndfile package, which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev
+# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
+# libavcodec-extra: libavcodec-extra includes additional codecs for ffmpeg
+
 install_requires = [
     "sagemaker-inference>=1.5.11",
     "huggingface_hub>=0.0.8",
     "retrying",
     "numpy",
     # vision
     "Pillow",
+    # speech + torchaudio
+    "librosa",
+    "pyctcdecode>=0.3.0",
+    "phonemizer",
 ]
 
 extras = {}
@@ -47,7 +57,7 @@
 extras["transformers"] = ["transformers[sklearn,sentencepiece]>=4.5.1"]
 
 # framework specific dependencies
-extras["torch"] = ["torch>=1.8.0"]
+extras["torch"] = ["torch>=1.8.0", "torchaudio"]
 extras["tensorflow"] = ["tensorflow>=2.4.0"]
 
 # MMS Server dependencies
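The comments above cover the system packages; on the Python side, the new install_requires entries and the extended torch extra provide the speech stack. A minimal sanity-check sketch (assuming the package is installed with the torch extra, e.g. pip install "sagemaker-huggingface-inference-toolkit[torch]", so torchaudio is present as well):

# Verify the audio dependencies declared in setup.py are importable.
import librosa       # audio loading/resampling used by the transformers speech pipelines
import phonemizer    # needed by phoneme-based speech models
import pyctcdecode   # CTC beam-search decoding support
import torchaudio    # pulled in via the "torch" extra

print(librosa.__version__, torchaudio.__version__)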

src/sagemaker_huggingface_inference_toolkit/content_types.py

Lines changed: 7 additions & 0 deletions
@@ -28,3 +28,10 @@
 WEBP = "image/webp"
 X_IMAGE = "image/x-image"
 VISION_TYPES = [JPEG, PNG, TIFF, BMP, GIF, WEBP,X_IMAGE]
+# Speech Mime-Types
+FLAC = "audio/x-flac"
+MP3 = "audio/mpeg"
+WAV = "audio/wave"
+OGG = "audio/ogg"
+X_AUDIO = "audio/x-audio"
+AUDIO_TYPES = [FLAC, MP3, WAV, OGG, X_AUDIO]

src/sagemaker_huggingface_inference_toolkit/decoder_encoder.py

Lines changed: 17 additions & 0 deletions
@@ -64,6 +64,17 @@ def decode_image(bpayload: bytearray):
     return {"inputs": image}
 
 
+def decode_audio(bpayload: bytearray):
+    """Convert a .wav / .flac / .mp3 object to a proper inputs dict.
+    Args:
+        bpayload (bytes): byte stream.
+    Returns:
+        (dict): dictionary for input
+    """
+
+    return {"inputs": bytes(bpayload)}
+
+
 # https://github.com/automl/SMAC3/issues/453
 class _JSONEncoder(json.JSONEncoder):
     """
@@ -133,6 +144,12 @@ def encode_csv(content):  # type: (str) -> np.array
     content_types.GIF: decode_image,
     content_types.WEBP: decode_image,
     content_types.X_IMAGE: decode_image,
+    # audio mime-types
+    content_types.FLAC: decode_audio,
+    content_types.MP3: decode_audio,
+    content_types.WAV: decode_audio,
+    content_types.OGG: decode_audio,
+    content_types.X_AUDIO: decode_audio,
 }
 
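decode_audio deliberately keeps the payload as raw bytes: the transformers audio pipelines accept byte input and do the actual decoding themselves, using the ffmpeg/libsndfile/torchaudio pieces installed by the Dockerfile and setup.py changes above. A minimal local sketch of that downstream behaviour, assuming transformers and ffmpeg are available and using the sample file and checkpoint referenced elsewhere in this PR:

from transformers import pipeline

# Same checkpoint the integration tests use for automatic speech recognition.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h")

# decode_audio would hand these exact bytes to the pipeline as {"inputs": <bytes>}.
with open("tests/resources/audio/sample1.flac", "rb") as f:
    audio_bytes = f.read()

print(asr(audio_bytes))  # expected shape: {"text": "GOING ALONG SLUSHY COUNTRY ROADS ..."}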

src/sagemaker_huggingface_inference_toolkit/handler_service.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 from transformers.pipelines import SUPPORTED_TASKS
 
 from mms.service import PredictionException
-from sagemaker_huggingface_inference_toolkit import decoder_encoder, content_types
+from sagemaker_huggingface_inference_toolkit import content_types, decoder_encoder
 from sagemaker_huggingface_inference_toolkit.transformers_utils import (
     _is_gpu_available,
     get_pipeline,
@@ -228,7 +228,7 @@ def handle(self, data, context):
             accept = content_types.JSON
 
         if content_type in content_types.UTF8_TYPES:
-            input_data = input_data.decode("utf-8")
+            input_data = input_data.decode("utf-8")
 
         predict_start = time.time()
         response = self.transform_fn(self.model, input_data, content_type, accept)

tests/integ/config.py

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,7 @@
 import os
 
 from integ.utils import (
+    validate_automatic_speech_recognition,
     validate_classification,
     validate_feature_extraction,
     validate_fill_mask,
@@ -59,6 +60,10 @@
         "pytorch": "google/vit-base-patch16-224",
         "tensorflow": "google/vit-base-patch16-224",
     },
+    "automatic-speech-recognition": {
+        "pytorch": "facebook/wav2vec2-base-100h",
+        "tensorflow": "facebook/wav2vec2-base-960h",
+    },
 }
 
 task2input = {
@@ -85,6 +90,7 @@
     },
     "text-generation": {"inputs": "My name is philipp and I am"},
     "image-classification": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "automatic-speech-recognition": open(os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb").read(),
 }
 
 task2output = {
@@ -112,6 +118,9 @@
         {"score": 0.0004262699221726507, "label": "dhole, Cuon alpinus"},
         {"score": 0.00030842673731967807, "label": "lion, king of beasts, Panthera leo"},
     ],
+    "automatic-speech-recognition": {
+        "text": "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP OAUDIENCES IN DROFTY SCHOOL ROOMS DAY AFTER DAY FOR A FORT NIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"
+    },
 }
 
 task2performance = {
@@ -203,6 +212,14 @@
             "average_request_time": 1,
         },
     },
+    "automatic-speech-recognition": {
+        "cpu": {
+            "average_request_time": 6,
+        },
+        "gpu": {
+            "average_request_time": 6,
+        },
+    },
 }
 
 task2validation = {
@@ -217,4 +234,5 @@
     "text2text-generation": validate_text2text_generation,
     "text-generation": validate_text_generation,
     "image-classification": validate_classification,
+    "automatic-speech-recognition": validate_automatic_speech_recognition,
 }

tests/integ/test_models_from_hub.py

Lines changed: 14 additions & 0 deletions
@@ -112,6 +112,13 @@ def test_deployment_from_hub(task, device, framework):
                 ContentType="image/jpeg",
                 Accept="application/json",
             )
+        elif task == "automatic-speech-recognition":
+            response = client.invoke_endpoint(
+                EndpointName=name,
+                Body=task2input[task],
+                ContentType="audio/x-flac",
+                Accept="application/json",
+            )
         else:
             response = client.invoke_endpoint(
                 EndpointName=name,
@@ -134,6 +141,13 @@ def test_deployment_from_hub(task, device, framework):
                 ContentType="image/jpeg",
                 Accept="application/json",
             )
+        elif task == "automatic-speech-recognition":
+            response = client.invoke_endpoint(
+                EndpointName=name,
+                Body=task2input[task],
+                ContentType="audio/x-flac",
+                Accept="application/json",
+            )
         else:
             response = client.invoke_endpoint(
                 EndpointName=name,

tests/integ/utils.py

Lines changed: 6 additions & 0 deletions
@@ -145,3 +145,9 @@ def validate_feature_extraction(result=None, snapshot=None):
 def validate_fill_mask(result=None, snapshot=None):
     assert result is not None
     return True
+
+
+def validate_automatic_speech_recognition(result=None, snapshot=None):
+    assert result is not None
+    assert "text" in result
+    return True

tests/resources/audio/sample1.flac

276 KB (binary file not shown)

tests/resources/audio/sample1.mp3

40.6 KB (binary file not shown)

tests/resources/audio/sample1.ogg

69.1 KB (binary file not shown)

tests/resources/audio/sample1.wav

428 KB (binary file not shown)

tests/unit/test_decoder_encoder.py

Lines changed: 10 additions & 0 deletions
@@ -62,6 +62,16 @@ def test_decode_image():
     assert isinstance(decoded_data["inputs"], Image.Image)
 
 
+def test_decode_audio():
+    audio_files_path = os.path.join(os.getcwd(), "tests/resources/audio")
+
+    for audio_file in os.listdir(audio_files_path):
+        audio_bytes = open(os.path.join(audio_files_path, audio_file), "rb").read()
+        decoded_data = decoder_encoder.decode_audio(bytearray(audio_bytes))
+
+        assert {"inputs": audio_bytes} == decoded_data
+
+
 def test_decode_csv_without_header():
     with pytest.raises(PredictionException):
         decoder_encoder.decode_csv(
