|
| 1 | +""" |
| 2 | +Mel Frequency Cepstral Coefficients (MFCC) Calculation |
| 3 | +
|
| 4 | +MFCC is an algorithm widely used in audio and speech processing to represent the |
| 5 | +short-term power spectrum of a sound signal in a more compact and |
| 6 | +discriminative way. It is particularly popular in speech and audio processing |
| 7 | +tasks such as speech recognition and speaker identification. |
| 8 | +
|
| 9 | +How Mel Frequency Cepstral Coefficients are Calculated: |
| 10 | +1. Preprocessing: |
| 11 | + - Load an audio signal and normalize it to ensure that the values fall |
| 12 | + within a specific range (e.g., between -1 and 1). |
| 13 | + - Frame the audio signal into overlapping, fixed-length segments, typically |
| 14 | + using a technique like windowing to reduce spectral leakage. |
| 15 | +
|
| 16 | +2. Fourier Transform: |
| 17 | + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it |
| 18 | + from the time domain to the frequency domain. This results in a |
| 19 | + representation of the audio frame as a sequence of frequency components. |
| 20 | +
|
| 21 | +3. Power Spectrum: |
| 22 | + - Calculate the power spectrum by taking the squared magnitude of each |
| 23 | + frequency component obtained from the FFT. This step measures the energy |
| 24 | + distribution across different frequency bands. |
| 25 | +
|
| 26 | +4. Mel Filterbank: |
| 27 | + - Apply a set of triangular filterbanks spaced in the Mel frequency scale |
| 28 | + to the power spectrum. These filters mimic the human auditory system's |
| 29 | + frequency response. Each filterbank sums the power spectrum values within |
| 30 | + its band. |
| 31 | +
|
| 32 | +5. Logarithmic Compression: |
| 33 | + - Take the logarithm (typically base 10) of the filterbank values to |
| 34 | + compress the dynamic range. This step mimics the logarithmic response of |
| 35 | + the human ear to sound intensity. |
| 36 | +
|
| 37 | +6. Discrete Cosine Transform (DCT): |
| 38 | + - Apply the Discrete Cosine Transform to the log filterbank energies to |
| 39 | + obtain the MFCC coefficients. This transformation helps decorrelate the |
| 40 | + filterbank energies and captures the most important features of the audio |
| 41 | + signal. |
| 42 | +
|
| 43 | +7. Feature Extraction: |
| 44 | + - Select a subset of the DCT coefficients to form the feature vector. |
| 45 | + Often, the first few coefficients (e.g., 12-13) are used for most |
| 46 | + applications. |
| 47 | +
|
| 48 | +References: |
| 49 | +- Mel-Frequency Cepstral Coefficients (MFCCs): |
| 50 | + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum |
| 51 | +- Speech and Language Processing by Daniel Jurafsky & James H. Martin: |
| 52 | + https://web.stanford.edu/~jurafsky/slp3/ |
| 53 | +- Mel Frequency Cepstral Coefficient (MFCC) tutorial |
| 54 | + http://practicalcryptography.com/miscellaneous/machine-learning |
| 55 | + /guide-mel-frequency-cepstral-coefficients-mfccs/ |
| 56 | +
|
| 57 | +Author: Amir Lavasani |
| 58 | +""" |
| 59 | + |
| 60 | + |
| 61 | +import logging |
| 62 | + |
| 63 | +import numpy as np |
| 64 | +import scipy.fftpack as fft |
| 65 | +from scipy.signal import get_window |
| 66 | + |
| 67 | +logging.basicConfig(filename=f"{__file__}.log", level=logging.INFO) |
| 68 | + |
| 69 | + |
| 70 | +def mfcc( |
| 71 | + audio: np.ndarray, |
| 72 | + sample_rate: int, |
| 73 | + ftt_size: int = 1024, |
| 74 | + hop_length: int = 20, |
| 75 | + mel_filter_num: int = 10, |
| 76 | + dct_filter_num: int = 40, |
| 77 | +) -> np.ndarray: |
| 78 | + """ |
| 79 | + Calculate Mel Frequency Cepstral Coefficients (MFCCs) from an audio signal. |
| 80 | +
|
| 81 | + Args: |
| 82 | + audio: The input audio signal. |
| 83 | + sample_rate: The sample rate of the audio signal (in Hz). |
| 84 | + ftt_size: The size of the FFT window (default is 1024). |
| 85 | + hop_length: The hop length for frame creation (default is 20ms). |
| 86 | + mel_filter_num: The number of Mel filters (default is 10). |
| 87 | + dct_filter_num: The number of DCT filters (default is 40). |
| 88 | +
|
| 89 | + Returns: |
| 90 | + A matrix of MFCCs for the input audio. |
| 91 | +
|
| 92 | + Raises: |
| 93 | + ValueError: If the input audio is empty. |
| 94 | +
|
| 95 | + Example: |
| 96 | + >>> sample_rate = 44100 # Sample rate of 44.1 kHz |
| 97 | + >>> duration = 2.0 # Duration of 1 second |
| 98 | + >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False) |
| 99 | + >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t) # Generate a 440 Hz sine wave |
| 100 | + >>> mfccs = mfcc(audio, sample_rate) |
| 101 | + >>> mfccs.shape |
| 102 | + (40, 101) |
| 103 | + """ |
| 104 | + logging.info(f"Sample rate: {sample_rate}Hz") |
| 105 | + logging.info(f"Audio duration: {len(audio) / sample_rate}s") |
| 106 | + logging.info(f"Audio min: {np.min(audio)}") |
| 107 | + logging.info(f"Audio max: {np.max(audio)}") |
| 108 | + |
| 109 | + # normalize audio |
| 110 | + audio_normalized = normalize(audio) |
| 111 | + |
| 112 | + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") |
| 113 | + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") |
| 114 | + |
| 115 | + # frame audio into |
| 116 | + audio_framed = audio_frames( |
| 117 | + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length |
| 118 | + ) |
| 119 | + |
| 120 | + logging.info(f"Framed audio shape: {audio_framed.shape}") |
| 121 | + logging.info(f"First frame: {audio_framed[0]}") |
| 122 | + |
| 123 | + # convert to frequency domain |
| 124 | + # For simplicity we will choose the Hanning window. |
| 125 | + window = get_window("hann", ftt_size, fftbins=True) |
| 126 | + audio_windowed = audio_framed * window |
| 127 | + |
| 128 | + logging.info(f"Windowed audio shape: {audio_windowed.shape}") |
| 129 | + logging.info(f"First frame: {audio_windowed[0]}") |
| 130 | + |
| 131 | + audio_fft = calculate_fft(audio_windowed, ftt_size) |
| 132 | + logging.info(f"fft audio shape: {audio_fft.shape}") |
| 133 | + logging.info(f"First frame: {audio_fft[0]}") |
| 134 | + |
| 135 | + audio_power = calculate_signal_power(audio_fft) |
| 136 | + logging.info(f"power audio shape: {audio_power.shape}") |
| 137 | + logging.info(f"First frame: {audio_power[0]}") |
| 138 | + |
| 139 | + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) |
| 140 | + logging.info(f"filters shape: {filters.shape}") |
| 141 | + |
| 142 | + audio_filtered = np.dot(filters, np.transpose(audio_power)) |
| 143 | + audio_log = 10.0 * np.log10(audio_filtered) |
| 144 | + logging.info(f"audio_log shape: {audio_log.shape}") |
| 145 | + |
| 146 | + dct_filters = discrete_cosine_transform(dct_filter_num, mel_filter_num) |
| 147 | + cepstral_coefficents = np.dot(dct_filters, audio_log) |
| 148 | + |
| 149 | + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") |
| 150 | + return cepstral_coefficents |
| 151 | + |
| 152 | + |
| 153 | +def normalize(audio: np.ndarray) -> np.ndarray: |
| 154 | + """ |
| 155 | + Normalize an audio signal by scaling it to have values between -1 and 1. |
| 156 | +
|
| 157 | + Args: |
| 158 | + audio: The input audio signal. |
| 159 | +
|
| 160 | + Returns: |
| 161 | + The normalized audio signal. |
| 162 | +
|
| 163 | + Examples: |
| 164 | + >>> audio = np.array([1, 2, 3, 4, 5]) |
| 165 | + >>> normalized_audio = normalize(audio) |
| 166 | + >>> np.max(normalized_audio) |
| 167 | + 1.0 |
| 168 | + >>> np.min(normalized_audio) |
| 169 | + 0.2 |
| 170 | + """ |
| 171 | + # Divide the entire audio signal by the maximum absolute value |
| 172 | + return audio / np.max(np.abs(audio)) |
| 173 | + |
| 174 | + |
| 175 | +def audio_frames( |
| 176 | + audio: np.ndarray, |
| 177 | + sample_rate: int, |
| 178 | + hop_length: int = 20, |
| 179 | + ftt_size: int = 1024, |
| 180 | +) -> np.ndarray: |
| 181 | + """ |
| 182 | + Split an audio signal into overlapping frames. |
| 183 | +
|
| 184 | + Args: |
| 185 | + audio: The input audio signal. |
| 186 | + sample_rate: The sample rate of the audio signal. |
| 187 | + hop_length: The length of the hopping (default is 20ms). |
| 188 | + ftt_size: The size of the FFT window (default is 1024). |
| 189 | +
|
| 190 | + Returns: |
| 191 | + An array of overlapping frames. |
| 192 | +
|
| 193 | + Examples: |
| 194 | + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) |
| 195 | + >>> sample_rate = 8000 |
| 196 | + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) |
| 197 | + >>> frames.shape |
| 198 | + (126, 512) |
| 199 | + """ |
| 200 | + |
| 201 | + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) |
| 202 | + |
| 203 | + # Pad the audio signal to handle edge cases |
| 204 | + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") |
| 205 | + |
| 206 | + # Calculate the number of frames |
| 207 | + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 |
| 208 | + |
| 209 | + # Initialize an array to store the frames |
| 210 | + frames = np.zeros((frame_count, ftt_size)) |
| 211 | + |
| 212 | + # Split the audio signal into frames |
| 213 | + for n in range(frame_count): |
| 214 | + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] |
| 215 | + |
| 216 | + return frames |
| 217 | + |
| 218 | + |
| 219 | +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: |
| 220 | + """ |
| 221 | + Calculate the Fast Fourier Transform (FFT) of windowed audio data. |
| 222 | +
|
| 223 | + Args: |
| 224 | + audio_windowed: The windowed audio signal. |
| 225 | + ftt_size: The size of the FFT (default is 1024). |
| 226 | +
|
| 227 | + Returns: |
| 228 | + The FFT of the audio data. |
| 229 | +
|
| 230 | + Examples: |
| 231 | + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) |
| 232 | + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) |
| 233 | + >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])) |
| 234 | + True |
| 235 | + """ |
| 236 | + # Transpose the audio data to have time in rows and channels in columns |
| 237 | + audio_transposed = np.transpose(audio_windowed) |
| 238 | + |
| 239 | + # Initialize an array to store the FFT results |
| 240 | + audio_fft = np.empty( |
| 241 | + (int(1 + ftt_size // 2), audio_transposed.shape[1]), |
| 242 | + dtype=np.complex64, |
| 243 | + order="F", |
| 244 | + ) |
| 245 | + |
| 246 | + # Compute FFT for each channel |
| 247 | + for n in range(audio_fft.shape[1]): |
| 248 | + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] |
| 249 | + |
| 250 | + # Transpose the FFT results back to the original shape |
| 251 | + return np.transpose(audio_fft) |
| 252 | + |
| 253 | + |
| 254 | +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: |
| 255 | + """ |
| 256 | + Calculate the power of the audio signal from its FFT. |
| 257 | +
|
| 258 | + Args: |
| 259 | + audio_fft: The FFT of the audio signal. |
| 260 | +
|
| 261 | + Returns: |
| 262 | + The power of the audio signal. |
| 263 | +
|
| 264 | + Examples: |
| 265 | + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) |
| 266 | + >>> power = calculate_signal_power(audio_fft) |
| 267 | + >>> np.allclose(power, np.array([5, 13, 25, 41])) |
| 268 | + True |
| 269 | + """ |
| 270 | + # Calculate the power by squaring the absolute values of the FFT coefficients |
| 271 | + return np.square(np.abs(audio_fft)) |
| 272 | + |
| 273 | + |
| 274 | +def freq_to_mel(freq: float) -> float: |
| 275 | + """ |
| 276 | + Convert a frequency in Hertz to the mel scale. |
| 277 | +
|
| 278 | + Args: |
| 279 | + freq: The frequency in Hertz. |
| 280 | +
|
| 281 | + Returns: |
| 282 | + The frequency in mel scale. |
| 283 | +
|
| 284 | + Examples: |
| 285 | + >>> round(freq_to_mel(1000), 2) |
| 286 | + 999.99 |
| 287 | + """ |
| 288 | + # Use the formula to convert frequency to the mel scale |
| 289 | + return 2595.0 * np.log10(1.0 + freq / 700.0) |
| 290 | + |
| 291 | + |
| 292 | +def mel_to_freq(mels: float) -> float: |
| 293 | + """ |
| 294 | + Convert a frequency in the mel scale to Hertz. |
| 295 | +
|
| 296 | + Args: |
| 297 | + mels: The frequency in mel scale. |
| 298 | +
|
| 299 | + Returns: |
| 300 | + The frequency in Hertz. |
| 301 | +
|
| 302 | + Examples: |
| 303 | + >>> round(mel_to_freq(999.99), 2) |
| 304 | + 1000.01 |
| 305 | + """ |
| 306 | + # Use the formula to convert mel scale to frequency |
| 307 | + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) |
| 308 | + |
| 309 | + |
| 310 | +def mel_spaced_filterbank( |
| 311 | + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 |
| 312 | +) -> np.ndarray: |
| 313 | + """ |
| 314 | + Create a Mel-spaced filter bank for audio processing. |
| 315 | +
|
| 316 | + Args: |
| 317 | + sample_rate: The sample rate of the audio. |
| 318 | + mel_filter_num: The number of mel filters (default is 10). |
| 319 | + ftt_size: The size of the FFT (default is 1024). |
| 320 | +
|
| 321 | + Returns: |
| 322 | + Mel-spaced filter bank. |
| 323 | +
|
| 324 | + Examples: |
| 325 | + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) |
| 326 | + 0.0004603981 |
| 327 | + """ |
| 328 | + freq_min = 0 |
| 329 | + freq_high = sample_rate // 2 |
| 330 | + |
| 331 | + logging.info(f"Minimum frequency: {freq_min}") |
| 332 | + logging.info(f"Maximum frequency: {freq_high}") |
| 333 | + |
| 334 | + # Calculate filter points and mel frequencies |
| 335 | + filter_points, mel_freqs = get_filter_points( |
| 336 | + sample_rate, |
| 337 | + freq_min, |
| 338 | + freq_high, |
| 339 | + mel_filter_num, |
| 340 | + ftt_size, |
| 341 | + ) |
| 342 | + |
| 343 | + filters = get_filters(filter_points, ftt_size) |
| 344 | + |
| 345 | + # normalize filters |
| 346 | + # taken from the librosa library |
| 347 | + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) |
| 348 | + return filters * enorm[:, np.newaxis] |
| 349 | + |
| 350 | + |
| 351 | +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: |
| 352 | + """ |
| 353 | + Generate filters for audio processing. |
| 354 | +
|
| 355 | + Args: |
| 356 | + filter_points: A list of filter points. |
| 357 | + ftt_size: The size of the FFT. |
| 358 | +
|
| 359 | + Returns: |
| 360 | + A matrix of filters. |
| 361 | +
|
| 362 | + Examples: |
| 363 | + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape |
| 364 | + (4, 257) |
| 365 | + """ |
| 366 | + num_filters = len(filter_points) - 2 |
| 367 | + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) |
| 368 | + |
| 369 | + for n in range(num_filters): |
| 370 | + start = filter_points[n] |
| 371 | + mid = filter_points[n + 1] |
| 372 | + end = filter_points[n + 2] |
| 373 | + |
| 374 | + # Linearly increase values from 0 to 1 |
| 375 | + filters[n, start:mid] = np.linspace(0, 1, mid - start) |
| 376 | + |
| 377 | + # Linearly decrease values from 1 to 0 |
| 378 | + filters[n, mid:end] = np.linspace(1, 0, end - mid) |
| 379 | + |
| 380 | + return filters |
| 381 | + |
| 382 | + |
| 383 | +def get_filter_points( |
| 384 | + sample_rate: int, |
| 385 | + freq_min: int, |
| 386 | + freq_high: int, |
| 387 | + mel_filter_num: int = 10, |
| 388 | + ftt_size: int = 1024, |
| 389 | +) -> tuple[np.ndarray, np.ndarray]: |
| 390 | + """ |
| 391 | + Calculate the filter points and frequencies for mel frequency filters. |
| 392 | +
|
| 393 | + Args: |
| 394 | + sample_rate: The sample rate of the audio. |
| 395 | + freq_min: The minimum frequency in Hertz. |
| 396 | + freq_high: The maximum frequency in Hertz. |
| 397 | + mel_filter_num: The number of mel filters (default is 10). |
| 398 | + ftt_size: The size of the FFT (default is 1024). |
| 399 | +
|
| 400 | + Returns: |
| 401 | + Filter points and corresponding frequencies. |
| 402 | +
|
| 403 | + Examples: |
| 404 | + >>> filter_points = get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512) |
| 405 | + >>> filter_points[0] |
| 406 | + array([ 0, 20, 51, 95, 161, 256]) |
| 407 | + >>> filter_points[1] |
| 408 | + array([ 0. , 324.46707094, 799.33254207, 1494.30973963, |
| 409 | + 2511.42581671, 4000. ]) |
| 410 | + """ |
| 411 | + # Convert minimum and maximum frequencies to mel scale |
| 412 | + fmin_mel = freq_to_mel(freq_min) |
| 413 | + fmax_mel = freq_to_mel(freq_high) |
| 414 | + |
| 415 | + logging.info(f"MEL min: {fmin_mel}") |
| 416 | + logging.info(f"MEL max: {fmax_mel}") |
| 417 | + |
| 418 | + # Generate equally spaced mel frequencies |
| 419 | + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) |
| 420 | + |
| 421 | + # Convert mel frequencies back to Hertz |
| 422 | + freqs = mel_to_freq(mels) |
| 423 | + |
| 424 | + # Calculate filter points as integer values |
| 425 | + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) |
| 426 | + |
| 427 | + return filter_points, freqs |
| 428 | + |
| 429 | + |
| 430 | +def discrete_cosine_transform(dct_filter_num: int, filter_num: int) -> np.ndarray: |
| 431 | + """ |
| 432 | + Compute the Discrete Cosine Transform (DCT) basis matrix. |
| 433 | +
|
| 434 | + Args: |
| 435 | + dct_filter_num: The number of DCT filters to generate. |
| 436 | + filter_num: The number of the fbank filters. |
| 437 | +
|
| 438 | + Returns: |
| 439 | + The DCT basis matrix. |
| 440 | +
|
| 441 | + Examples: |
| 442 | + >>> round(discrete_cosine_transform(3, 5)[0][0], 5) |
| 443 | + 0.44721 |
| 444 | + """ |
| 445 | + basis = np.empty((dct_filter_num, filter_num)) |
| 446 | + basis[0, :] = 1.0 / np.sqrt(filter_num) |
| 447 | + |
| 448 | + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) |
| 449 | + |
| 450 | + for i in range(1, dct_filter_num): |
| 451 | + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) |
| 452 | + |
| 453 | + return basis |
| 454 | + |
| 455 | + |
| 456 | +def example(wav_file_path: str = "./path-to-file/sample.wav") -> np.ndarray: |
| 457 | + """ |
| 458 | + Example function to calculate Mel Frequency Cepstral Coefficients |
| 459 | + (MFCCs) from an audio file. |
| 460 | +
|
| 461 | + Args: |
| 462 | + wav_file_path: The path to the WAV audio file. |
| 463 | +
|
| 464 | + Returns: |
| 465 | + np.ndarray: The computed MFCCs for the audio. |
| 466 | + """ |
| 467 | + from scipy.io import wavfile |
| 468 | + |
| 469 | + # Load the audio from the WAV file |
| 470 | + sample_rate, audio = wavfile.read(wav_file_path) |
| 471 | + |
| 472 | + # Calculate MFCCs |
| 473 | + return mfcc(audio, sample_rate) |
| 474 | + |
| 475 | + |
| 476 | +if __name__ == "__main__": |
| 477 | + import doctest |
| 478 | + |
| 479 | + doctest.testmod() |
0 commit comments