Skip to content

Add audio #58

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
FROM ubuntu:20.04

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# Specify accept-bind-to-port LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
# https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipeline-real-time.html
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
# Specify multi-models LABEL to indicate container is capable of loading and serving multiple models concurrently
# https://docs.aws.amazon.com/sagemaker/latest/dg/build-multi-model-build-container.html
LABEL com.amazonaws.sagemaker.capabilities.multi-models=true

# Multi Model Server version installed further below
ARG MMS_VERSION=1.1.8
ARG PYTHON=python3
ARG PYTHON_VERSION=3.8.10
ARG OPEN_MPI_VERSION=4.0.1
# HF ARGS
# Pre-built CPU-only torch 1.10.0 wheel hosted by AWS; it replaces the torch
# pulled in transitively by pip (see the "Hugging Face specific section").
ARG PT_INFERENCE_URL=https://pytorch-ei-binaries.s3.us-west-2.amazonaws.com/r1.10.0_inference/20211027-061940/36dea191ed0df524207de5acc4e6fb4322306d1a/cpu/torch-1.10.0%2Bcpu-cp38-cp38-manylinux1_x86_64.whl
# No default on purpose: must be supplied via --build-arg TRANSFORMERS_VERSION=...
ARG TRANSFORMERS_VERSION

# Runtime environment; these values persist into the running container.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LD_LIBRARY_PATH="/opt/conda/lib/:${LD_LIBRARY_PATH}:/usr/local/lib" \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TEMP=/home/model-server/tmp \
    DEBIAN_FRONTEND=noninteractive
# NOTE(review): DEBIAN_FRONTEND=noninteractive is only needed at build time;
# consider scoping it to the apt-get RUN lines instead of baking it into ENV.

# key=value form — the legacy space-separated "ENV key value" syntax is deprecated.
ENV PATH="/opt/conda/bin:${PATH}"

# Base OS dependencies. Packages are sorted alphabetically for easy diffing;
# apt list caches are removed in the same layer to keep the image small.
RUN apt-get update \
    # TODO: Remove upgrade statements once packages are updated in base image
    && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        emacs \
        git \
        openjdk-8-jdk-headless \
        openssl \
        unzip \
        vim \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Miniconda into /opt/conda, then pin the Python version.
# repo.anaconda.com is the current official installer host (repo.continuum.io
# is the deprecated legacy endpoint).
# NOTE(review): the "latest" installer and unpinned `conda update` make this
# layer non-reproducible; pin an installer version + sha256 to fix.
RUN curl -L -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && chmod +x ~/miniconda.sh \
    && ~/miniconda.sh -b -p /opt/conda \
    && rm ~/miniconda.sh \
    # -y keeps the build non-interactive (consistent with the install below)
    && /opt/conda/bin/conda update -y conda \
    && /opt/conda/bin/conda install -y -c conda-forge \
        python=$PYTHON_VERSION \
    && /opt/conda/bin/conda install -y \
        # conda 4.10.0 requires ruamel_yaml to be installed. Currently pinned at latest.
        ruamel_yaml==0.15.100 \
        cython \
        "mkl-include==2021.4.0" \
        "mkl==2021.4.0" \
        botocore \
    && /opt/conda/bin/conda clean -ya

# Upgrade pip and install base helpers.
# --no-cache-dir keeps pip's download cache out of the image layer (DL3042).
RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
    && pip install --no-cache-dir \
        packaging==20.4 \
        enum-compat==0.0.3 \
        "cryptography>3.2"

# Build Open MPI from source into /home/.openmpi. The tarball and the build
# tree are removed in the same layer so they never persist in the image.
RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
&& gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
&& cd openmpi-$OPEN_MPI_VERSION \
&& ./configure --prefix=/home/.openmpi \
&& make all install \
&& cd .. \
&& rm openmpi-$OPEN_MPI_VERSION.tar.gz \
&& rm -rf openmpi-$OPEN_MPI_VERSION

# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
# Sanity check: fails the build if ompi_info cannot run. grep prints whether
# this MPI build has CUDA support (expected to report false on CPU images).
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

WORKDIR /

# Multi Model Server plus the generic SageMaker inference toolkit.
# NOTE(review): sagemaker-inference is unpinned — confirm whether it should be
# pinned like the other Python dependencies in this file.
RUN pip install --no-cache-dir \
multi-model-server==$MMS_VERSION \
sagemaker-inference

# Unprivileged service user and its scratch directory
# (ENV TEMP above points at /home/model-server/tmp).
RUN useradd -m model-server \
&& mkdir -p /home/model-server/tmp \
&& chown -R model-server /home/model-server

# Entrypoint script and MMS configuration (ports/model store defined there).
COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY config.properties /etc/sagemaker-mms.properties

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

# Fetch with curl instead of a bare `ADD <url>`: remote ADD is discouraged
# (not verifiable, poor caching). -f fails the build on an HTTP error instead
# of writing the error page to disk; chmod happens in the same layer.
RUN curl -fsSL https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py \
        -o /usr/local/bin/deep_learning_container.py \
    && chmod +x /usr/local/bin/deep_learning_container.py

#################################
# Hugging Face specific section #
#################################

# -f makes curl fail the build on an HTTP error instead of silently saving
# the error page as /license.txt.
RUN curl -f https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.10/license.txt -o /license.txt

# Uninstall torch and re-install the AWS-built CPU wheel (PT_INFERENCE_URL).
# Only torch is swapped out here — torchvision is not installed in this image.
RUN pip uninstall -y torch \
    && pip install --no-cache-dir -U $PT_INFERENCE_URL

# Install Hugging Face libraries and their dependencies
RUN pip install --no-cache-dir \
    transformers[sentencepiece]==${TRANSFORMERS_VERSION} \
    protobuf==3.12.0

# Audio system dependencies: libsndfile1 for soundfile/librosa, ffmpeg for
# decoding compressed formats (mp3/ogg/flac) in the ASR pipelines.
RUN apt-get update \
    && apt-get install -y libsndfile1 ffmpeg \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# --no-cache-dir keeps pip's download cache out of the layer (DL3042).
# NOTE(review): torchaudio is unpinned — pin the release matching torch 1.10.0
# (torchaudio==0.10.*) so the resolver cannot pick an incompatible build.
RUN pip install --no-cache-dir torchaudio

# TEMP: Copying package
# The toolkit is copied in and pip-installed from a local path so the image
# always ships the in-repo version of the package.
COPY src /tmp/inference/src
COPY ./README.md /tmp/inference/README.md
COPY ./setup.py /tmp/inference/setup.py

# Install the SageMaker Hugging Face Inference Toolkit to set up MMS
RUN pip3 --no-cache-dir install "/tmp/inference"

# Run the AWS OSS-compliance tooling to generate license/attribution
# artifacts, then delete the tooling in the same layer.
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance*

# 8080 = inference, 8081 = management (must match /etc/sagemaker-mms.properties).
# EXPOSE is documentation only; ports are published by the runtime.
EXPOSE 8080 8081
# NOTE(review): there is no USER directive, so the server runs as root even
# though a model-server user is created above — confirm whether MMS needs root here.
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["serve"]
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
</div>


* [x] ImageSegmentationPipeline
* [X] ImageSegmentationPipeline
* [ ] ObjectDetectionPipeline we need `timm` installed
* [ ] ImageClassificationPipeline we need `timm` installed
* [ ] AutomaticSpeechRecognitionPipeline
Expand Down
5 changes: 5 additions & 0 deletions config.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# JVM args for the MMS frontend: heap capped via RAM percentages, G1 GC,
# and exit-on-OOM so the container is restarted instead of hanging.
vmargs=-XX:-UseContainerSupport -XX:InitialRAMPercentage=8.0 -XX:MaxRAMPercentage=10.0 -XX:-UseLargePages -XX:+UseG1GC -XX:+ExitOnOutOfMemoryError
# SageMaker mounts model artifacts under /opt/ml/model
model_store=/opt/ml/model
load_models=ALL
# Must match the EXPOSEd ports in the Dockerfile (8080 inference, 8081 management)
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
18 changes: 12 additions & 6 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,19 @@ style:

run:
docker run -t -i \
--env HF_TASK="automatic-speech-recognition" \
--env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu
--env HF_TASK="image-classification" \
--env HF_MODEL_ID="google/vit-base-patch16-224" \
-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu

# docker run -t -i \
# --env HF_TASK="automatic-speech-recognition" \
# --env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
# -p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu


build:
docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu \
--build-arg TRANSFORMERS_VERSION=4.9.2 \
--file ./docker/Dockerfile.cpu \
docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu \
--build-arg TRANSFORMERS_VERSION=4.16.2 \
--file ./Dockerfile.cpu \
.
start: build run
12 changes: 11 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,23 @@

VERSION = "1.3.1"


# Ubuntu packages
# libsndfile1-dev: torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev
# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
# libavcodec-extra: libavcodec-extra includes additional codecs for ffmpeg

# Core runtime dependencies of the toolkit.
install_requires = [
    "sagemaker-inference>=1.5.11",
    "huggingface_hub>=0.0.8",
    "retrying",
    "numpy",
    # vision
    "Pillow",
    # speech + torchaudio
    # librosa (audio loading), pyctcdecode (CTC beam-search decoding),
    # phonemizer (phoneme-based models); torchaudio itself ships via extras["torch"]
    "librosa",
    "pyctcdecode>=0.3.0",
    "phonemizer",
]

extras = {}
Expand All @@ -47,7 +57,7 @@
extras["transformers"] = ["transformers[sklearn,sentencepiece]>=4.5.1"]

# framework specific dependencies
extras["torch"] = ["torch>=1.8.0"]
extras["torch"] = ["torch>=1.8.0", "torchaudio"]
extras["tensorflow"] = ["tensorflow>=2.4.0"]

# MMS Server dependencies
Expand Down
7 changes: 7 additions & 0 deletions src/sagemaker_huggingface_inference_toolkit/content_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,10 @@
WEBP = "image/webp"
X_IMAGE = "image/x-image"
VISION_TYPES = [JPEG, PNG, TIFF, BMP, GIF, WEBP,X_IMAGE]
# Speech Mime-Types
# Audio content types accepted by the handler; payloads with one of these
# types are passed through as raw bytes (see decode_audio in decoder_encoder).
FLAC = "audio/x-flac"
MP3 = "audio/mpeg"
# NOTE(review): only "audio/wave" is accepted — confirm whether the common
# "audio/wav" / "audio/x-wav" variants should be added too.
WAV = "audio/wave"
OGG = "audio/ogg"
X_AUDIO = "audio/x-audio"
AUDIO_TYPES = [FLAC, MP3, WAV, OGG, X_AUDIO]
17 changes: 17 additions & 0 deletions src/sagemaker_huggingface_inference_toolkit/decoder_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ def decode_image(bpayload: bytearray):
return {"inputs": image}


def decode_audio(bpayload: bytearray):
    """Wrap a raw audio payload (.wav / .flac / .mp3 / .ogg) in the inputs
    dict consumed by the transformers audio pipelines.

    Args:
        bpayload (bytearray): raw byte stream of the request body.

    Returns:
        dict: ``{"inputs": <bytes>}`` — the payload as an immutable ``bytes``
        object; no audio decoding or resampling happens here.
    """
    return dict(inputs=bytes(bpayload))


# https://github.com/automl/SMAC3/issues/453
class _JSONEncoder(json.JSONEncoder):
"""
Expand Down Expand Up @@ -133,6 +144,12 @@ def encode_csv(content): # type: (str) -> np.array
content_types.GIF: decode_image,
content_types.WEBP: decode_image,
content_types.X_IMAGE: decode_image,
# audio mime-types
content_types.FLAC: decode_audio,
content_types.MP3: decode_audio,
content_types.WAV: decode_audio,
content_types.OGG: decode_audio,
content_types.X_AUDIO: decode_audio,
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from transformers.pipelines import SUPPORTED_TASKS

from mms.service import PredictionException
from sagemaker_huggingface_inference_toolkit import decoder_encoder, content_types
from sagemaker_huggingface_inference_toolkit import content_types, decoder_encoder
from sagemaker_huggingface_inference_toolkit.transformers_utils import (
_is_gpu_available,
get_pipeline,
Expand Down Expand Up @@ -228,7 +228,7 @@ def handle(self, data, context):
accept = content_types.JSON

if content_type in content_types.UTF8_TYPES:
input_data = input_data.decode("utf-8")
input_data = input_data.decode("utf-8")

predict_start = time.time()
response = self.transform_fn(self.model, input_data, content_type, accept)
Expand Down
18 changes: 18 additions & 0 deletions tests/integ/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os

from integ.utils import (
validate_automatic_speech_recognition,
validate_classification,
validate_feature_extraction,
validate_fill_mask,
Expand Down Expand Up @@ -59,6 +60,10 @@
"pytorch": "google/vit-base-patch16-224",
"tensorflow": "google/vit-base-patch16-224",
},
"automatic-speech-recognition": {
"pytorch": "facebook/wav2vec2-base-100h",
"tensorflow": "facebook/wav2vec2-base-960h",
},
}

task2input = {
Expand All @@ -85,6 +90,7 @@
},
"text-generation": {"inputs": "My name is philipp and I am"},
"image-classification": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
"automatic-speech-recognition": open(os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb").read(),
}

task2output = {
Expand Down Expand Up @@ -112,6 +118,9 @@
{"score": 0.0004262699221726507, "label": "dhole, Cuon alpinus"},
{"score": 0.00030842673731967807, "label": "lion, king of beasts, Panthera leo"},
],
"automatic-speech-recognition": {
"text": "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP OAUDIENCES IN DROFTY SCHOOL ROOMS DAY AFTER DAY FOR A FORT NIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"
},
}

task2performance = {
Expand Down Expand Up @@ -203,6 +212,14 @@
"average_request_time": 1,
},
},
"automatic-speech-recognition": {
"cpu": {
"average_request_time": 6,
},
"gpu": {
"average_request_time": 6,
},
},
}

task2validation = {
Expand All @@ -217,4 +234,5 @@
"text2text-generation": validate_text2text_generation,
"text-generation": validate_text_generation,
"image-classification": validate_classification,
"automatic-speech-recognition": validate_automatic_speech_recognition,
}
14 changes: 14 additions & 0 deletions tests/integ/test_models_from_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ def test_deployment_from_hub(task, device, framework):
ContentType="image/jpeg",
Accept="application/json",
)
elif task == "automatic-speech-recognition":
response = client.invoke_endpoint(
EndpointName=name,
Body=task2input[task],
ContentType="audio/x-flac",
Accept="application/json",
)
else:
response = client.invoke_endpoint(
EndpointName=name,
Expand All @@ -134,6 +141,13 @@ def test_deployment_from_hub(task, device, framework):
ContentType="image/jpeg",
Accept="application/json",
)
elif task == "automatic-speech-recognition":
response = client.invoke_endpoint(
EndpointName=name,
Body=task2input[task],
ContentType="audio/x-flac",
Accept="application/json",
)
else:
response = client.invoke_endpoint(
EndpointName=name,
Expand Down
6 changes: 6 additions & 0 deletions tests/integ/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,9 @@ def validate_feature_extraction(result=None, snapshot=None):
def validate_fill_mask(result=None, snapshot=None):
assert result is not None
return True


def validate_automatic_speech_recognition(result=None, snapshot=None):
    """Validate an automatic-speech-recognition response.

    The transcript is not compared against ``snapshot``; the response only
    has to be present and expose a "text" field.
    """
    assert result is not None and "text" in result
    return True
Binary file added tests/resources/audio/sample1.flac
Binary file not shown.
Binary file added tests/resources/audio/sample1.mp3
Binary file not shown.
Binary file added tests/resources/audio/sample1.ogg
Binary file not shown.
Binary file added tests/resources/audio/sample1.wav
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/unit/test_decoder_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ def test_decode_image():
assert isinstance(decoded_data["inputs"], Image.Image)


def test_decode_audio():
    """decode_audio must round-trip every audio fixture's raw bytes unchanged."""
    audio_dir = os.path.join(os.getcwd(), "tests/resources/audio")

    for file_name in os.listdir(audio_dir):
        with open(os.path.join(audio_dir, file_name), "rb") as audio_file:
            raw_bytes = audio_file.read()

        assert decoder_encoder.decode_audio(bytearray(raw_bytes)) == {"inputs": raw_bytes}


def test_decode_csv_without_header():
with pytest.raises(PredictionException):
decoder_encoder.decode_csv(
Expand Down