
Commit 698422d

Merge pull request #58 from aws/add-audio
Add audio
2 parents 1e57715 + 2ecd873 commit 698422d

File tree

16 files changed, +250 -10 lines changed


Dockerfile.cpu

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
FROM ubuntu:20.04

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# Specify accept-bind-to-port LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
# https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipeline-real-time.html
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
# Specify multi-models LABEL to indicate container is capable of loading and serving multiple models concurrently
# https://docs.aws.amazon.com/sagemaker/latest/dg/build-multi-model-build-container.html
LABEL com.amazonaws.sagemaker.capabilities.multi-models=true

ARG MMS_VERSION=1.1.8
ARG PYTHON=python3
ARG PYTHON_VERSION=3.8.10
ARG OPEN_MPI_VERSION=4.0.1
# HF ARGS
ARG PT_INFERENCE_URL=https://pytorch-ei-binaries.s3.us-west-2.amazonaws.com/r1.10.0_inference/20211027-061940/36dea191ed0df524207de5acc4e6fb4322306d1a/cpu/torch-1.10.0%2Bcpu-cp38-cp38-manylinux1_x86_64.whl
ARG TRANSFORMERS_VERSION

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LD_LIBRARY_PATH="/opt/conda/lib/:${LD_LIBRARY_PATH}:/usr/local/lib" \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TEMP=/home/model-server/tmp \
    DEBIAN_FRONTEND=noninteractive

ENV PATH /opt/conda/bin:$PATH

RUN apt-get update \
 # TODO: Remove upgrade statements once packages are updated in base image
 && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
 && apt-get install -y --no-install-recommends \
    ca-certificates \
    build-essential \
    openssl \
    openjdk-8-jdk-headless \
    vim \
    wget \
    curl \
    emacs \
    unzip \
    git \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && chmod +x ~/miniconda.sh \
 && ~/miniconda.sh -b -p /opt/conda \
 && rm ~/miniconda.sh \
 && /opt/conda/bin/conda update conda \
 && /opt/conda/bin/conda install -c conda-forge \
    python=$PYTHON_VERSION \
 && /opt/conda/bin/conda install -y \
    # conda 4.10.0 requires ruamel_yaml to be installed. Currently pinned at latest.
    ruamel_yaml==0.15.100 \
    cython \
    "mkl-include==2021.4.0" \
    "mkl==2021.4.0" \
    botocore \
 && /opt/conda/bin/conda clean -ya

RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
 && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
 && pip install packaging==20.4 \
    enum-compat==0.0.3 \
    "cryptography>3.2"

RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
 && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
 && cd openmpi-$OPEN_MPI_VERSION \
 && ./configure --prefix=/home/.openmpi \
 && make all install \
 && cd .. \
 && rm openmpi-$OPEN_MPI_VERSION.tar.gz \
 && rm -rf openmpi-$OPEN_MPI_VERSION

# The ENV variables declared below are changed in the previous section.
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers.
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

WORKDIR /

RUN pip install --no-cache-dir \
    multi-model-server==$MMS_VERSION \
    sagemaker-inference

RUN useradd -m model-server \
 && mkdir -p /home/model-server/tmp \
 && chown -R model-server /home/model-server

COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY config.properties /etc/sagemaker-mms.properties

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/deep_learning_container.py

#################################
# Hugging Face specific section #
#################################

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.10/license.txt -o /license.txt

# Uninstall and re-install torch and torchvision from the PyTorch website
RUN pip uninstall -y torch \
 && pip install --no-cache-dir -U $PT_INFERENCE_URL

# Install Hugging Face libraries and their dependencies
RUN pip install --no-cache-dir \
    transformers[sentencepiece]==${TRANSFORMERS_VERSION} \
    protobuf==3.12.0

RUN apt-get update \
 && apt-get install -y libsndfile1 ffmpeg \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
RUN pip install torchaudio

# TEMP: Copying package
COPY src /tmp/inference/src
COPY ./README.md /tmp/inference/README.md
COPY ./setup.py /tmp/inference/setup.py

# Install the SageMaker Inference Toolkit to set up MMS
RUN pip3 --no-cache-dir install "/tmp/inference"
# torch==1.6.0

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

EXPOSE 8080 8081
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["serve"]

README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 </div>
 
 
-* [x] ImageSegmentationPipeline
+* [X] ImageSegmentationPipeline
 * [ ] ObjectDetectionPipeline we need `timm` installed
 * [ ] ImageClassificationPipeline we need `timm` installed
 * [ ] AutomaticSpeechRecognitionPipeline

config.properties

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
vmargs=-XX:-UseContainerSupport -XX:InitialRAMPercentage=8.0 -XX:MaxRAMPercentage=10.0 -XX:-UseLargePages -XX:+UseG1GC -XX:+ExitOnOutOfMemoryError
model_store=/opt/ml/model
load_models=ALL
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081

makefile

Lines changed: 12 additions & 6 deletions
@@ -29,13 +29,19 @@ style:
 
 run:
 	docker run -t -i \
-		--env HF_TASK="automatic-speech-recognition" \
-		--env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
-		-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu
+		--env HF_TASK="image-classification" \
+		--env HF_MODEL_ID="google/vit-base-patch16-224" \
+		-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu
+
+	# docker run -t -i \
+	# 	--env HF_TASK="automatic-speech-recognition" \
+	# 	--env HF_MODEL_ID="facebook/wav2vec2-base-100h" \
+	# 	-p 8080:8080 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu
+
 
 build:
-	docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.8.1-cpu \
-	--build-arg TRANSFORMERS_VERSION=4.9.2 \
-	--file ./docker/Dockerfile.cpu \
+	docker build --tag 558105141721.dkr.ecr.us-east-1.amazonaws.com/huggingface-inference-pytorch:1.10.2-cpu \
+	--build-arg TRANSFORMERS_VERSION=4.16.2 \
+	--file ./Dockerfile.cpu \
 	.
 start: build run
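For a quick end-to-end check of the new audio path, here is a minimal Python sketch (assumptions: the container was started with the commented-out automatic-speech-recognition variant of the run target above, the server exposes the standard SageMaker /invocations route on port 8080, and the sample file path comes from the test resources added in this PR):

import requests

# Raw FLAC bytes from the sample added under tests/resources/audio in this PR.
with open("tests/resources/audio/sample1.flac", "rb") as f:
    audio = f.read()

# audio/x-flac is one of the new audio MIME types, so the toolkit routes the body to decode_audio.
response = requests.post(
    "http://localhost:8080/invocations",
    data=audio,
    headers={"Content-Type": "audio/x-flac", "Accept": "application/json"},
)
print(response.json())  # expected shape: {"text": "..."}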

setup.py

Lines changed: 11 additions & 1 deletion
@@ -32,13 +32,23 @@
 
 VERSION = "1.3.1"
 
+
+# Ubuntu packages
+# libsndfile1-dev: torchaudio requires the development version of the libsndfile package, which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev
+# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
+# libavcodec-extra: libavcodec-extra includes additional codecs for ffmpeg
+
 install_requires = [
     "sagemaker-inference>=1.5.11",
     "huggingface_hub>=0.0.8",
     "retrying",
     "numpy",
     # vision
     "Pillow",
+    # speech + torchaudio
+    "librosa",
+    "pyctcdecode>=0.3.0",
+    "phonemizer",
 ]
 
 extras = {}
@@ -47,7 +57,7 @@
 extras["transformers"] = ["transformers[sklearn,sentencepiece]>=4.5.1"]
 
 # framework specific dependencies
-extras["torch"] = ["torch>=1.8.0"]
+extras["torch"] = ["torch>=1.8.0", "torchaudio"]
 extras["tensorflow"] = ["tensorflow>=2.4.0"]
 
 # MMS Server dependencies
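The comments above cover the system packages; on the Python side, the new install_requires entries and the extended torch extra provide the speech stack. A minimal sanity-check sketch (assuming the package is installed with the torch extra, e.g. pip install "sagemaker-huggingface-inference-toolkit[torch]", so torchaudio is present as well):

# Verify the audio dependencies declared in setup.py are importable.
import librosa       # audio loading/resampling used by the transformers speech pipelines
import phonemizer    # needed by phoneme-based speech models
import pyctcdecode   # CTC beam-search decoding support
import torchaudio    # pulled in via the "torch" extra

print(librosa.__version__, torchaudio.__version__)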

src/sagemaker_huggingface_inference_toolkit/content_types.py

Lines changed: 7 additions & 0 deletions
@@ -28,3 +28,10 @@
 WEBP = "image/webp"
 X_IMAGE = "image/x-image"
 VISION_TYPES = [JPEG, PNG, TIFF, BMP, GIF, WEBP,X_IMAGE]
+# Speech Mime-Types
+FLAC = "audio/x-flac"
+MP3 = "audio/mpeg"
+WAV = "audio/wave"
+OGG = "audio/ogg"
+X_AUDIO = "audio/x-audio"
+AUDIO_TYPES = [FLAC, MP3, WAV, OGG, X_AUDIO]

src/sagemaker_huggingface_inference_toolkit/decoder_encoder.py

Lines changed: 17 additions & 0 deletions
@@ -64,6 +64,17 @@ def decode_image(bpayload: bytearray):
     return {"inputs": image}
 
 
+def decode_audio(bpayload: bytearray):
+    """Convert a .wav / .flac / .mp3 object to a proper inputs dict.
+    Args:
+        bpayload (bytes): byte stream.
+    Returns:
+        (dict): dictionary for input
+    """
+
+    return {"inputs": bytes(bpayload)}
+
+
 # https://github.com/automl/SMAC3/issues/453
 class _JSONEncoder(json.JSONEncoder):
     """
@@ -133,6 +144,12 @@ def encode_csv(content):  # type: (str) -> np.array
     content_types.GIF: decode_image,
     content_types.WEBP: decode_image,
     content_types.X_IMAGE: decode_image,
+    # audio mime-types
+    content_types.FLAC: decode_audio,
+    content_types.MP3: decode_audio,
+    content_types.WAV: decode_audio,
+    content_types.OGG: decode_audio,
+    content_types.X_AUDIO: decode_audio,
 }
 
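decode_audio deliberately keeps the payload as raw bytes: the transformers audio pipelines accept byte input and do the actual decoding themselves, using the ffmpeg/libsndfile/torchaudio pieces installed by the Dockerfile and setup.py changes above. A minimal local sketch of that downstream behaviour, assuming transformers and ffmpeg are available and using the sample file and checkpoint referenced elsewhere in this PR:

from transformers import pipeline

# Same checkpoint the integration tests use for automatic speech recognition.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h")

# decode_audio would hand these exact bytes to the pipeline as {"inputs": <bytes>}.
with open("tests/resources/audio/sample1.flac", "rb") as f:
    audio_bytes = f.read()

print(asr(audio_bytes))  # expected shape: {"text": "GOING ALONG SLUSHY COUNTRY ROADS ..."}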

src/sagemaker_huggingface_inference_toolkit/handler_service.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 from transformers.pipelines import SUPPORTED_TASKS
 
 from mms.service import PredictionException
-from sagemaker_huggingface_inference_toolkit import decoder_encoder, content_types
+from sagemaker_huggingface_inference_toolkit import content_types, decoder_encoder
 from sagemaker_huggingface_inference_toolkit.transformers_utils import (
     _is_gpu_available,
     get_pipeline,
@@ -228,7 +228,7 @@ def handle(self, data, context):
             accept = content_types.JSON
 
         if content_type in content_types.UTF8_TYPES:
-            input_data = input_data.decode("utf-8")
+            input_data = input_data.decode("utf-8")
 
         predict_start = time.time()
         response = self.transform_fn(self.model, input_data, content_type, accept)

tests/integ/config.py

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,7 @@
 import os
 
 from integ.utils import (
+    validate_automatic_speech_recognition,
     validate_classification,
     validate_feature_extraction,
     validate_fill_mask,
@@ -59,6 +60,10 @@
         "pytorch": "google/vit-base-patch16-224",
         "tensorflow": "google/vit-base-patch16-224",
     },
+    "automatic-speech-recognition": {
+        "pytorch": "facebook/wav2vec2-base-100h",
+        "tensorflow": "facebook/wav2vec2-base-960h",
+    },
 }
 
 task2input = {
@@ -85,6 +90,7 @@
     },
     "text-generation": {"inputs": "My name is philipp and I am"},
     "image-classification": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "automatic-speech-recognition": open(os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb").read(),
 }
 
 task2output = {
@@ -112,6 +118,9 @@
         {"score": 0.0004262699221726507, "label": "dhole, Cuon alpinus"},
         {"score": 0.00030842673731967807, "label": "lion, king of beasts, Panthera leo"},
     ],
+    "automatic-speech-recognition": {
+        "text": "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP OAUDIENCES IN DROFTY SCHOOL ROOMS DAY AFTER DAY FOR A FORT NIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"
+    },
 }
 
 task2performance = {
@@ -203,6 +212,14 @@
             "average_request_time": 1,
         },
     },
+    "automatic-speech-recognition": {
+        "cpu": {
+            "average_request_time": 6,
+        },
+        "gpu": {
+            "average_request_time": 6,
+        },
+    },
 }
 
 task2validation = {
@@ -217,4 +234,5 @@
     "text2text-generation": validate_text2text_generation,
     "text-generation": validate_text_generation,
     "image-classification": validate_classification,
+    "automatic-speech-recognition": validate_automatic_speech_recognition,
 }

tests/integ/test_models_from_hub.py

Lines changed: 14 additions & 0 deletions
@@ -112,6 +112,13 @@ def test_deployment_from_hub(task, device, framework):
                 ContentType="image/jpeg",
                 Accept="application/json",
             )
+        elif task == "automatic-speech-recognition":
+            response = client.invoke_endpoint(
+                EndpointName=name,
+                Body=task2input[task],
+                ContentType="audio/x-flac",
+                Accept="application/json",
+            )
         else:
             response = client.invoke_endpoint(
                 EndpointName=name,
@@ -134,6 +141,13 @@ def test_deployment_from_hub(task, device, framework):
                 ContentType="image/jpeg",
                 Accept="application/json",
             )
+        elif task == "automatic-speech-recognition":
+            response = client.invoke_endpoint(
+                EndpointName=name,
+                Body=task2input[task],
+                ContentType="audio/x-flac",
+                Accept="application/json",
+            )
         else:
             response = client.invoke_endpoint(
                 EndpointName=name,

tests/integ/utils.py

Lines changed: 6 additions & 0 deletions
@@ -145,3 +145,9 @@ def validate_feature_extraction(result=None, snapshot=None):
 def validate_fill_mask(result=None, snapshot=None):
     assert result is not None
     return True
+
+
+def validate_automatic_speech_recognition(result=None, snapshot=None):
+    assert result is not None
+    assert "text" in result
+    return True

tests/resources/audio/sample1.flac

276 KB (binary file not shown)

tests/resources/audio/sample1.mp3

40.6 KB (binary file not shown)

tests/resources/audio/sample1.ogg

69.1 KB (binary file not shown)

tests/resources/audio/sample1.wav

428 KB (binary file not shown)

tests/unit/test_decoder_encoder.py

Lines changed: 10 additions & 0 deletions
@@ -62,6 +62,16 @@ def test_decode_image():
     assert isinstance(decoded_data["inputs"], Image.Image)
 
 
+def test_decode_audio():
+    audio_files_path = os.path.join(os.getcwd(), "tests/resources/audio")
+
+    for audio_file in os.listdir(audio_files_path):
+        audio_bytes = open(os.path.join(audio_files_path, audio_file), "rb").read()
+        decoded_data = decoder_encoder.decode_audio(bytearray(audio_bytes))
+
+        assert {"inputs": audio_bytes} == decoded_data
+
+
 def test_decode_csv_without_header():
     with pytest.raises(PredictionException):
         decoder_encoder.decode_csv(
