
Commit 264bc13

Add MFCC feature extraction to machine learning
1 parent 1488cde commit 264bc13

1 file changed: +396 -0 lines changed

Diff for: machine_learning/mfcc.py

@@ -0,0 +1,396 @@
"""
MFCC (Mel Frequency Cepstral Coefficients) Calculation

MFCC is a feature widely used in audio and speech processing to represent the
short-term power spectrum of a sound signal in a compact and discriminative
way. It is particularly popular in tasks such as speech recognition and
speaker identification.

How MFCC is Calculated:
1. Preprocessing:
   - Load an audio signal and normalize it to ensure that the values fall
     within a specific range (e.g., between -1 and 1).
   - Frame the audio signal into overlapping, fixed-length segments, applying
     a window function to each frame to reduce spectral leakage.

2. Fourier Transform:
   - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it
     from the time domain to the frequency domain. This results in a
     representation of the audio frame as a sequence of frequency components.

3. Power Spectrum:
   - Calculate the power spectrum by taking the squared magnitude of each
     frequency component obtained from the FFT. This step measures the energy
     distribution across different frequency bands.

4. Mel Filterbank:
   - Apply a set of triangular filterbanks spaced on the mel frequency scale
     to the power spectrum. These filters mimic the human auditory system's
     frequency response. Each filterbank sums the power spectrum values within
     its band.

5. Logarithmic Compression:
   - Take the logarithm (typically base 10) of the filterbank values to
     compress the dynamic range. This step mimics the logarithmic response of
     the human ear to sound intensity.

6. Discrete Cosine Transform (DCT):
   - Apply the Discrete Cosine Transform to the log filterbank energies to
     obtain the MFCC coefficients. This transformation decorrelates the
     filterbank energies and concentrates the most important information about
     the audio signal in the leading coefficients.

7. Feature Extraction:
   - Select a subset of the DCT coefficients to form the feature vector.
     Often, the first few coefficients (e.g., 12-13) are used for most
     applications.

References:
- Mel-Frequency Cepstral Coefficients (MFCCs):
  https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
- Speech and Language Processing by Daniel Jurafsky & James H. Martin:
  https://web.stanford.edu/~jurafsky/slp3/
- Mel Frequency Cepstral Coefficient (MFCC) tutorial:
  http://practicalcryptography.com/miscellaneous/machine-learning
  /guide-mel-frequency-cepstral-coefficients-mfccs/

Author: Amir Lavasani
"""


import logging

import numpy as np
import scipy.fftpack as fft
from scipy.io import wavfile
from scipy.signal import get_window

logging.basicConfig(level=logging.WARNING)


def mfcc(
    audio: np.ndarray,
    sample_rate: int,
    fft_size: int = 1024,
    hop_length: int = 20,
    mel_filter_num: int = 10,
    dct_filter_num: int = 40,
) -> np.ndarray:
    """
    Calculate Mel Frequency Cepstral Coefficients (MFCCs) for an audio signal.

    Args:
        audio (np.ndarray): The input audio signal.
        sample_rate (int): The sample rate of the audio signal in Hz.
        fft_size (int): The size of the FFT window (default is 1024).
        hop_length (int): The hop length in milliseconds (default is 20 ms).
        mel_filter_num (int): The number of mel filters (default is 10).
        dct_filter_num (int): The number of DCT filters (default is 40).

    Returns:
        np.ndarray: A matrix of MFCCs, one column per frame.
    """
    logging.info(f"Sample rate: {sample_rate}Hz")
    logging.info(f"Audio duration: {len(audio) / sample_rate}s")
    logging.info(f"Audio min: {np.min(audio)}")
    logging.info(f"Audio max: {np.max(audio)}")

    # normalize audio
    audio_normalized = normalize(audio)

    logging.info(f"Normalized audio min: {np.min(audio_normalized)}")
    logging.info(f"Normalized audio max: {np.max(audio_normalized)}")

    # frame audio into overlapping, fixed-length segments
    audio_framed = frame(
        audio_normalized, sample_rate, fft_size=fft_size, hop_length=hop_length
    )

    logging.info(f"Framed audio shape: {audio_framed.shape}")
    logging.info(f"First frame: {audio_framed[0]}")

    # convert to frequency domain
    # For simplicity we will choose the Hann window.
    window = get_window("hann", fft_size, fftbins=True)
    audio_windowed = audio_framed * window

    logging.info(f"Windowed audio shape: {audio_windowed.shape}")
    logging.info(f"First frame: {audio_windowed[0]}")

    audio_fft = calculate_fft(audio_windowed, fft_size)
    logging.info(f"fft audio shape: {audio_fft.shape}")
    logging.info(f"First frame: {audio_fft[0]}")

    audio_power = calculate_signal_power(audio_fft)
    logging.info(f"power audio shape: {audio_power.shape}")
    logging.info(f"First frame: {audio_power[0]}")

    filters = mel_spaced_filterbank(sample_rate, mel_filter_num, fft_size)
    logging.info(f"filters shape: {filters.shape}")

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)
    logging.info(f"audio_log shape: {audio_log.shape}")

    dct_filters = dct(dct_filter_num, mel_filter_num)
    cepstral_coefficients = np.dot(dct_filters, audio_log)

    logging.info(f"cepstral_coefficients shape: {cepstral_coefficients.shape}")
    return cepstral_coefficients


def normalize(audio: np.ndarray) -> np.ndarray:
    """
    Normalize an audio signal by scaling it to have values between -1 and 1.

    Args:
        audio (np.ndarray): The input audio signal.

    Returns:
        np.ndarray: The normalized audio signal.
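
    Example (exact on this input, since the maximum absolute value is 4.0):
    >>> normalize(np.array([2.0, -4.0])).tolist()
    [0.5, -1.0]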
    """
    # Find the maximum absolute value in the audio signal
    max_abs_value = np.max(np.abs(audio))

    # Divide the entire audio signal by the maximum absolute value
    normalized_audio = audio / max_abs_value

    return normalized_audio


def frame(
    audio: np.ndarray,
    sample_rate: int,
    hop_length: int = 20,
    fft_size: int = 1024,
) -> np.ndarray:
    """
    Split an audio signal into overlapping frames.

    Args:
        audio (np.ndarray): The input audio signal.
        sample_rate (int): The sample rate of the audio signal in Hz.
        hop_length (int): The hop length in milliseconds (default is 20 ms).
        fft_size (int): The size of the FFT window (default is 1024).

    Returns:
        np.ndarray: An array of overlapping frames.
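
    Example (shape check: 1 s of audio at 16 kHz, 20 ms hop, 1024-sample frames):
    >>> frame(np.arange(16000.0), 16000).shape
    (51, 1024)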
    """

    hop_size = np.round(sample_rate * hop_length / 1000).astype(int)

    # Pad the audio signal to handle edge cases
    audio = np.pad(audio, fft_size // 2, mode="reflect")

    # Calculate the number of frames
    frame_num = int((len(audio) - fft_size) / hop_size) + 1

    # Initialize an array to store the frames
    frames = np.zeros((frame_num, fft_size))

    # Split the audio signal into frames
    for n in range(frame_num):
        frames[n] = audio[n * hop_size : n * hop_size + fft_size]

    return frames


def calculate_fft(audio_windowed: np.ndarray, fft_size: int = 1024) -> np.ndarray:
    """
    Calculate the Fast Fourier Transform (FFT) of windowed audio data.

    Args:
        audio_windowed (np.ndarray): The windowed audio signal, one frame per row.
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        np.ndarray: The FFT of the audio data.
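
    Example (shape check: only the 1 + fft_size // 2 non-redundant bins are kept):
    >>> calculate_fft(np.ones((2, 1024))).shape
    (2, 513)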
    """
    # Transpose the audio data so that each frame is a column
    audio_transposed = np.transpose(audio_windowed)

    # Initialize an array to store the FFT results
    audio_fft = np.empty(
        (1 + fft_size // 2, audio_transposed.shape[1]),
        dtype=np.complex64,
        order="F",
    )

    # Compute the FFT frame by frame, keeping only the non-redundant half
    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]]

    # Transpose the FFT results back to one frame per row
    audio_fft = np.transpose(audio_fft)

    return audio_fft


def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
    """
    Calculate the power of the audio signal from its FFT.

    Args:
        audio_fft (np.ndarray): The FFT of the audio signal.

    Returns:
        np.ndarray: The power of the audio signal.
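
    Example (|3 + 4j| ** 2 == 25):
    >>> calculate_signal_power(np.array([3 + 4j])).tolist()
    [25.0]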
    """
    # Calculate the power by squaring the absolute values of the FFT coefficients
    audio_power = np.square(np.abs(audio_fft))

    return audio_power


def freq_to_mel(freq: float) -> float:
    """
    Convert a frequency in Hertz to the mel scale.

    Args:
        freq (float): The frequency in Hertz.

    Returns:
        float: The frequency in mel scale.
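
    Example (6300 Hz maps to exactly 2595 mel, since log10(10) == 1):
    >>> float(freq_to_mel(6300))
    2595.0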
    """
    # Use the formula to convert frequency to the mel scale
    return 2595.0 * np.log10(1.0 + freq / 700.0)


def mel_to_freq(mels: float) -> float:
    """
    Convert a frequency in the mel scale to Hertz.

    Args:
        mels (float): The frequency in mel scale.

    Returns:
        float: The frequency in Hertz.
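
    Example (the inverse of the mapping above):
    >>> float(mel_to_freq(2595.0))
    6300.0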
    """
    # Use the formula to convert mel scale to frequency
    return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)


def mel_spaced_filterbank(
    sample_rate: int, mel_filter_num: int = 10, fft_size: int = 1024
) -> np.ndarray:
    """
    Create a mel-spaced filter bank for audio processing.

    Args:
        sample_rate (int): The sample rate of the audio in Hz.
        mel_filter_num (int): The number of mel filters (default is 10).
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        np.ndarray: Mel-spaced filter bank.
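
    Example (shape check: one row per filter, one column per FFT bin):
    >>> mel_spaced_filterbank(16000).shape
    (10, 513)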
    """
    freq_min = 0
    freq_high = sample_rate // 2

    logging.info(f"Minimum frequency: {freq_min}")
    logging.info(f"Maximum frequency: {freq_high}")

    # Calculate filter points and mel frequencies
    filter_points, mel_freqs = get_filter_points(
        sample_rate,
        freq_min,
        freq_high,
        mel_filter_num,
        fft_size,
    )

    filters = get_filters(filter_points, fft_size)

    # Normalize the filters so each has roughly unit area
    # (taken from the librosa library)
    enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    return filters


def get_filters(filter_points: np.ndarray, fft_size: int) -> np.ndarray:
    """
    Generate triangular filters for audio processing.

    Args:
        filter_points (np.ndarray): An array of filter start/mid/end points.
        fft_size (int): The size of the FFT.

    Returns:
        np.ndarray: A matrix of filters.
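
    Example (two triangular filters over 51 bins):
    >>> get_filters(np.array([0, 10, 25, 50]), 100).shape
    (2, 51)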
    """
    num_filters = len(filter_points) - 2
    filters = np.zeros((num_filters, fft_size // 2 + 1))

    for n in range(num_filters):
        start = filter_points[n]
        mid = filter_points[n + 1]
        end = filter_points[n + 2]

        # Linearly increase values from 0 to 1
        filters[n, start:mid] = np.linspace(0, 1, mid - start)

        # Linearly decrease values from 1 to 0
        filters[n, mid:end] = np.linspace(1, 0, end - mid)

    return filters


def get_filter_points(
    sample_rate: int,
    freq_min: int,
    freq_high: int,
    mel_filter_num: int = 10,
    fft_size: int = 1024,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculate the filter points and frequencies for mel frequency filters.

    Args:
        sample_rate (int): The sample rate of the audio in Hz.
        freq_min (int): The minimum frequency in Hertz.
        freq_high (int): The maximum frequency in Hertz.
        mel_filter_num (int): The number of mel filters (default is 10).
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies.
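
    Example (mel_filter_num filters need mel_filter_num + 2 boundary points):
    >>> filter_points, freqs = get_filter_points(16000, 0, 8000)
    >>> filter_points.shape
    (12,)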
    """

    # Convert minimum and maximum frequencies to mel scale
    fmin_mel = freq_to_mel(freq_min)
    fmax_mel = freq_to_mel(freq_high)

    logging.info(f"MEL min: {fmin_mel}")
    logging.info(f"MEL max: {fmax_mel}")

    # Generate equally spaced mel frequencies
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)

    # Convert mel frequencies back to Hertz
    freqs = mel_to_freq(mels)

    # Map the frequencies to the nearest FFT bin indices
    filter_points = np.floor((fft_size + 1) / sample_rate * freqs).astype(int)

    return filter_points, freqs


def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
    """
    Compute the Discrete Cosine Transform (DCT) basis matrix.

    Args:
        dct_filter_num (int): The number of DCT filters to generate.
        filter_num (int): The number of mel filterbank filters.

    Returns:
        np.ndarray: The DCT basis matrix.
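
    Example (shape check: one row per DCT filter, one column per mel filter):
    >>> dct(4, 10).shape
    (4, 10)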
    """
    basis = np.empty((dct_filter_num, filter_num))
    basis[0, :] = 1.0 / np.sqrt(filter_num)

    samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num)

    return basis


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    TRAIN_PATH = "./signal_processing/"
    sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav")

    print(mfcc(audio, sample_rate))
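
For a quick end-to-end check without the sample wav file, the pipeline can also be driven with synthetic input. The sketch below is illustrative only (the random signal, the seed, and the import path are assumptions, not part of the commit); with the defaults above, one second of 16 kHz audio yields 51 frames of 40 coefficients each.

import numpy as np

from machine_learning.mfcc import mfcc

# One second of reproducible synthetic noise in place of sample-speech.wav
rng = np.random.default_rng(0)
audio = rng.standard_normal(16000)

coefficients = mfcc(audio, sample_rate=16000)
print(coefficients.shape)  # (40, 51)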
