Skip to content

Commit 876bfd9

Browse files
committed
add TFLite micro_speech example
1 parent 9a85c11 commit 876bfd9

18 files changed

+3192
-0
lines changed
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Micro Speech Example
2+
3+
This example shows how to run a 20 kB model that can recognize 2 keywords,
4+
"yes" and "no", from speech data.
5+
6+
The application listens to its surroundings with a microphone and indicates
7+
when it has detected a word by displaying data on a screen.
8+
9+
## Deploy to ESP32
10+
11+
The sample has been tested on ESP-IDF version `release/v4.2` and `release/v4.4` with the following devices:
12+
- [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html)
13+
- [ESP32-S3-DevKitC](https://docs.espressif.com/projects/esp-idf/en/latest/esp32s3/hw-reference/esp32s3/user-guide-devkitc-1.html)
14+
- [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md)
15+
16+
### Sample output
17+
18+
* When a keyword is detected, you will see output like the following sample on the log screen:
19+
20+
```
21+
Heard yes (<score>) @<time>ms
22+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "audio_provider.h"
17+
18+
#include <cstdlib>
19+
#include <cstring>
20+
21+
// FreeRTOS.h must be included before some of the following dependencies.
22+
// Solves b/150260343.
23+
// clang-format off
24+
#include "freertos/FreeRTOS.h"
25+
// clang-format on
26+
27+
#include "driver/i2s.h"
28+
#include "esp_log.h"
29+
#include "esp_spi_flash.h"
30+
#include "esp_system.h"
31+
#include "esp_timer.h"
32+
#include "freertos/task.h"
33+
#include "ringbuf.h"
34+
#include "micro_model_settings.h"
35+
36+
using namespace std;
37+
38+
// Non-zero when the build target cannot use the I2S driver in this example:
// always on ESP32-C2, and on ESP32-C3 when building against ESP-IDF < 4.4.0.
// The whole expansion is parenthesized so it behaves as a single operand
// wherever it is used (e.g. `!NO_I2S_SUPPORT`).
#define NO_I2S_SUPPORT                                   \
  (CONFIG_IDF_TARGET_ESP32C2 ||                          \
   (CONFIG_IDF_TARGET_ESP32C3 &&                         \
    (ESP_IDF_VERSION < ESP_IDF_VERSION_VAL(4, 4, 0))))
41+
42+
static const char* TAG = "TF_LITE_AUDIO_PROVIDER";
43+
/* ringbuffer to hold the incoming audio data */
44+
ringbuf_t* g_audio_capture_buffer;
45+
volatile int32_t g_latest_audio_timestamp = 0;
46+
/* model requires 20ms new data from g_audio_capture_buffer and 10ms old data
47+
* each time, storing old data in the history buffer, {
48+
* history_samples_to_keep = 10 * 16 } */
49+
constexpr int32_t history_samples_to_keep =
50+
((kFeatureSliceDurationMs - kFeatureSliceStrideMs) *
51+
(kAudioSampleFrequency / 1000));
52+
/* new samples to get each time from ringbuffer, { new_samples_to_get = 20 * 16
53+
* } */
54+
constexpr int32_t new_samples_to_get =
55+
(kFeatureSliceStrideMs * (kAudioSampleFrequency / 1000));
56+
57+
namespace {
58+
int16_t g_audio_output_buffer[kMaxAudioSampleSize];
59+
bool g_is_audio_initialized = false;
60+
int16_t g_history_buffer[history_samples_to_keep];
61+
} // namespace
62+
63+
const int32_t kAudioCaptureBufferSize = 80000;
64+
const int32_t i2s_bytes_to_read = 3200;
65+
66+
#if NO_I2S_SUPPORT
67+
// nothing to be done here
68+
#else
69+
static void i2s_init(void) {
70+
// Start listening for audio: MONO @ 16KHz
71+
i2s_config_t i2s_config = {
72+
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_TX),
73+
.sample_rate = 16000,
74+
.bits_per_sample = (i2s_bits_per_sample_t)16,
75+
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
76+
.communication_format = I2S_COMM_FORMAT_I2S,
77+
.intr_alloc_flags = 0,
78+
.dma_buf_count = 3,
79+
.dma_buf_len = 300,
80+
.use_apll = false,
81+
.tx_desc_auto_clear = false,
82+
.fixed_mclk = -1,
83+
};
84+
i2s_pin_config_t pin_config = {
85+
.bck_io_num = 26, // IIS_SCLK
86+
.ws_io_num = 32, // IIS_LCLK
87+
.data_out_num = -1, // IIS_DSIN
88+
.data_in_num = 33, // IIS_DOUT
89+
};
90+
esp_err_t ret = 0;
91+
ret = i2s_driver_install((i2s_port_t)1, &i2s_config, 0, NULL);
92+
if (ret != ESP_OK) {
93+
ESP_LOGE(TAG, "Error in i2s_driver_install");
94+
}
95+
ret = i2s_set_pin((i2s_port_t)1, &pin_config);
96+
if (ret != ESP_OK) {
97+
ESP_LOGE(TAG, "Error in i2s_set_pin");
98+
}
99+
100+
ret = i2s_zero_dma_buffer((i2s_port_t)1);
101+
if (ret != ESP_OK) {
102+
ESP_LOGE(TAG, "Error in initializing dma buffer with 0");
103+
}
104+
}
105+
#endif
106+
107+
static void CaptureSamples(void* arg) {
108+
#if NO_I2S_SUPPORT
109+
ESP_LOGE(TAG, "i2s support not available on C3 chip for IDF < 4.4.0");
110+
return;
111+
#else
112+
size_t bytes_read = i2s_bytes_to_read;
113+
uint8_t i2s_read_buffer[i2s_bytes_to_read] = {};
114+
i2s_init();
115+
while (1) {
116+
/* read 100ms data at once from i2s */
117+
i2s_read((i2s_port_t)1, (void*)i2s_read_buffer, i2s_bytes_to_read,
118+
&bytes_read, pdMS_TO_TICKS(100));
119+
if (bytes_read <= 0) {
120+
ESP_LOGE(TAG, "Error in I2S read : %d", bytes_read);
121+
} else {
122+
if (bytes_read < i2s_bytes_to_read) {
123+
ESP_LOGW(TAG, "Partial I2S read");
124+
}
125+
/* write bytes read by i2s into ring buffer */
126+
int bytes_written = rb_write(g_audio_capture_buffer,
127+
(uint8_t*)i2s_read_buffer, bytes_read, pdMS_TO_TICKS(100));
128+
/* update the timestamp (in ms) to let the model know that new data has
129+
* arrived */
130+
g_latest_audio_timestamp = g_latest_audio_timestamp +
131+
((1000 * (bytes_written / 2)) / kAudioSampleFrequency);
132+
if (bytes_written <= 0) {
133+
ESP_LOGE(TAG, "Could Not Write in Ring Buffer: %d ", bytes_written);
134+
} else if (bytes_written < bytes_read) {
135+
ESP_LOGW(TAG, "Partial Write");
136+
}
137+
}
138+
}
139+
#endif
140+
vTaskDelete(NULL);
141+
}
142+
143+
// Creates the capture ring buffer, starts the CaptureSamples task and blocks
// until that task has stored its first chunk of audio (signalled by
// g_latest_audio_timestamp becoming non-zero).
//
// Returns kTfLiteOk once audio is flowing. Returns kTfLiteError when the
// target has no I2S support, when the ring buffer cannot be created, or when
// the capture task cannot be started.
TfLiteStatus InitAudioRecording() {
#if NO_I2S_SUPPORT
  // CaptureSamples never stores audio on these targets, so the wait loop below
  // would never finish; report the failure instead of hanging.
  ESP_LOGE(TAG, "i2s support not available on C3 chip for IDF < 4.4.0");
  return kTfLiteError;
#endif
  g_audio_capture_buffer = rb_init("tf_ringbuffer", kAudioCaptureBufferSize);
  if (!g_audio_capture_buffer) {
    ESP_LOGE(TAG, "Error creating ring buffer");
    return kTfLiteError;
  }
  /* create CaptureSamples Task which will get the i2s_data from mic and fill it
   * in the ring buffer */
  if (xTaskCreate(CaptureSamples, "CaptureSamples", 1024 * 32, NULL, 10,
                  NULL) != pdPASS) {
    // Without the capture task the timestamp never advances, so waiting for it
    // would spin forever.
    // NOTE(review): the ring buffer stays allocated on this path; release it
    // here if ringbuf.h provides a matching cleanup call.
    ESP_LOGE(TAG, "Error creating CaptureSamples task");
    return kTfLiteError;
  }
  while (!g_latest_audio_timestamp) {
    vTaskDelay(1);  // one tick delay to avoid watchdog
  }
  ESP_LOGI(TAG, "Audio Recording started");
  return kTfLiteOk;
}
158+
159+
// Hands the caller one window of 16-bit samples: the samples retained from the
// previous call followed by samples freshly drained from the capture ring
// buffer. The first call also starts audio recording.
//
// start_ms and duration_ms are part of the interface but are not consulted
// here. On return *audio_samples points at this module's shared output buffer
// (rewritten by the next call) and *audio_samples_size is kMaxAudioSampleSize.
// Returns kTfLiteOk, or the status from InitAudioRecording() if recording
// could not be started.
TfLiteStatus GetAudioSamples(int start_ms, int duration_ms,
                             int* audio_samples_size, int16_t** audio_samples) {
  // Bring up the capture pipeline lazily on first use.
  if (!g_is_audio_initialized) {
    const TfLiteStatus started = InitAudioRecording();
    if (started != kTfLiteOk) {
      return started;
    }
    g_is_audio_initialized = true;
  }

  const size_t history_bytes = history_samples_to_keep * sizeof(int16_t);
  const size_t fresh_bytes = new_samples_to_get * sizeof(int16_t);
  int16_t* const fresh_region = g_audio_output_buffer + history_samples_to_keep;

  /* the window starts with the samples kept from the previous call */
  memcpy(g_audio_output_buffer, g_history_buffer, history_bytes);

  /* new samples from the ring buffer land right after the history samples */
  const int got = rb_read(g_audio_capture_buffer, (uint8_t*)fresh_region,
                          fresh_bytes, pdMS_TO_TICKS(100));
  if (got < 0) {
    ESP_LOGE(TAG, " Model Could not read data from Ring Buffer");
  } else if ((size_t)got < fresh_bytes) {
    ESP_LOGD(TAG, "RB FILLED RIGHT NOW IS %d",
             rb_filled(g_audio_capture_buffer));
    ESP_LOGD(TAG, " Partial Read of Data by Model ");
    ESP_LOGV(TAG, " Could only read %d bytes when required %d bytes ", got,
             (int)fresh_bytes);
  }

  /* keep the tail of this window as the history for the next call */
  memcpy(g_history_buffer, g_audio_output_buffer + new_samples_to_get,
         history_bytes);

  *audio_samples = g_audio_output_buffer;
  *audio_samples_size = kMaxAudioSampleSize;
  return kTfLiteOk;
}
197+
198+
// Returns the current value of g_latest_audio_timestamp, the millisecond
// counter that the capture task advances as audio is stored.
int32_t LatestAudioTimestamp() {
  const int32_t snapshot = g_latest_audio_timestamp;
  return snapshot;
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#ifndef TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
17+
#define TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
18+
19+
#include "tensorflow/lite/c/common.h"
20+
21+
// This is an abstraction around an audio source like a microphone, and is
22+
// expected to return 16-bit PCM sample data for a given point in time. The
23+
// sample data itself should be used as quickly as possible by the caller, since
24+
// to allow memory optimizations there are no guarantees that the samples won't
25+
// be overwritten by new data in the future. In practice, implementations should
26+
// ensure that there's a reasonable time allowed for clients to access the data
27+
// before any reuse.
28+
// The reference implementation can have no platform-specific dependencies, so
29+
// it just returns an array filled with zeros. For real applications, you should
30+
// ensure there's a specialized implementation that accesses hardware APIs.
31+
TfLiteStatus GetAudioSamples(int start_ms, int duration_ms,
32+
int* audio_samples_size, int16_t** audio_samples);
33+
34+
// Returns the time that audio data was last captured in milliseconds. There's
35+
// no contract about what time zero represents, the accuracy, or the granularity
36+
// of the result. Subsequent calls will generally not return a lower value, but
37+
// even that's not guaranteed if there's an overflow wraparound.
38+
// The reference implementation of this function just returns a constantly
39+
// incrementing value for each call, since it would need a non-portable platform
40+
// call to access time information. For real applications, you'll need to write
41+
// your own platform-specific implementation.
42+
int32_t LatestAudioTimestamp();
43+
44+
#endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "command_responder.h"
17+
#include "tensorflow/lite/micro/micro_log.h"
18+
19+
// The default implementation writes out the name of the recognized command
20+
// to the error console. Real applications will want to take some custom
21+
// action instead, and should implement their own versions of this function.
22+
void RespondToCommand(int32_t current_time, const char* found_command,
23+
uint8_t score, bool is_new_command) {
24+
if (is_new_command) {
25+
MicroPrintf("Heard %s (%d) @%dms", found_command, score, current_time);
26+
}
27+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
// Provides an interface to take an action based on an audio command.
17+
18+
#ifndef TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_
19+
#define TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_
20+
21+
#include "tensorflow/lite/c/common.h"
22+
23+
// Called every time the results of an audio recognition run are available. The
24+
// human-readable name of any recognized command is in the `found_command`
25+
// argument, `score` has the numerical confidence, and `is_new_command` is set
26+
// if the previous command was different to this one.
27+
void RespondToCommand(int32_t current_time, const char* found_command,
28+
uint8_t score, bool is_new_command);
29+
30+
#endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_

0 commit comments

Comments
 (0)