"""
MFCC (Mel Frequency Cepstral Coefficients) Calculation

MFCC is a feature representation widely used in audio and speech processing.
It summarizes the short-term power spectrum of a sound signal in a compact,
discriminative form, and is particularly popular in tasks such as speech
recognition and speaker identification.

How MFCC is Calculated:
1. Preprocessing:
   - Load an audio signal and normalize it to ensure that the values fall
     within a specific range (e.g., between -1 and 1).
   - Frame the audio signal into overlapping, fixed-length segments, applying
     a window function to each frame to reduce spectral leakage.

2. Fourier Transform:
   - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it
     from the time domain to the frequency domain. This results in a
     representation of the audio frame as a sequence of frequency components.

3. Power Spectrum:
   - Calculate the power spectrum by taking the squared magnitude of each
     frequency component obtained from the FFT. This step measures the energy
     distribution across different frequency bands.

4. Mel Filterbank:
   - Apply a set of triangular filterbanks spaced on the mel frequency scale
     to the power spectrum. These filters mimic the human auditory system's
     frequency response. Each filterbank sums the power spectrum values within
     its band.

5. Logarithmic Compression:
   - Take the logarithm (typically base 10) of the filterbank values to
     compress the dynamic range. This step mimics the logarithmic response of
     the human ear to sound intensity.

6. Discrete Cosine Transform (DCT):
   - Apply the Discrete Cosine Transform to the log filterbank energies to
     obtain the MFCC coefficients. This transformation decorrelates the
     filterbank energies and captures the most important features of the
     audio signal.

7. Feature Extraction:
   - Select a subset of the DCT coefficients to form the feature vector.
     Often, the first few coefficients (e.g., 12-13) are used for most
     applications.

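Usage sketch (illustrative only; "speech.wav" is a placeholder for any mono
WAV file):

    from scipy.io import wavfile

    sample_rate, audio = wavfile.read("speech.wav")
    coefficients = mfcc(audio, sample_rate)
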
References:
- Mel-Frequency Cepstral Coefficients (MFCCs):
  https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
- Speech and Language Processing by Daniel Jurafsky & James H. Martin:
  https://web.stanford.edu/~jurafsky/slp3/
- Mel Frequency Cepstral Coefficient (MFCC) tutorial:
  http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/

Author: Amir Lavasani
"""


import logging

import numpy as np
import scipy.fftpack as fft
from scipy.io import wavfile
from scipy.signal import get_window

logging.basicConfig(level=logging.WARNING)


def mfcc(
    audio: np.ndarray,
    sample_rate: int,
    fft_size: int = 1024,
    hop_length: int = 20,
    mel_filter_num: int = 10,
    dct_filter_num: int = 40,
) -> np.ndarray:
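    """
    Compute the Mel Frequency Cepstral Coefficients (MFCCs) of an audio signal.

    Args:
        audio (np.ndarray): The input audio signal.
        sample_rate (int): The sample rate of the audio signal.
        fft_size (int): The size of the FFT window (default is 1024).
        hop_length (int): The hop length in milliseconds (default is 20).
        mel_filter_num (int): The number of mel filters (default is 10).
        dct_filter_num (int): The number of DCT coefficients (default is 40).

    Returns:
        np.ndarray: The cepstral coefficients, one column per frame.

    Example (a synthetic 440 Hz sine wave stands in for real speech):
        >>> sample_rate = 44100
        >>> t = np.arange(sample_rate) / sample_rate
        >>> audio = np.sin(2 * np.pi * 440.0 * t)
        >>> mfcc(audio, sample_rate).shape
        (40, 51)
    """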
    logging.info(f"Sample rate: {sample_rate}Hz")
    logging.info(f"Audio duration: {len(audio) / sample_rate}s")
    logging.info(f"Audio min: {np.min(audio)}")
    logging.info(f"Audio max: {np.max(audio)}")

    # normalize audio
    audio_normalized = normalize(audio)

    logging.info(f"Normalized audio min: {np.min(audio_normalized)}")
    logging.info(f"Normalized audio max: {np.max(audio_normalized)}")

    # frame audio into overlapping, fixed-length segments
    audio_framed = frame(
        audio_normalized, sample_rate, fft_size=fft_size, hop_length=hop_length
    )

    logging.info(f"Framed audio shape: {audio_framed.shape}")
    logging.info(f"First frame: {audio_framed[0]}")

    # convert to frequency domain
    # For simplicity we will choose the Hann window.
    window = get_window("hann", fft_size, fftbins=True)
    audio_windowed = audio_framed * window

    logging.info(f"Windowed audio shape: {audio_windowed.shape}")
    logging.info(f"First frame: {audio_windowed[0]}")

    audio_fft = calculate_fft(audio_windowed, fft_size)
    logging.info(f"fft audio shape: {audio_fft.shape}")
    logging.info(f"First frame: {audio_fft[0]}")

    audio_power = calculate_signal_power(audio_fft)
    logging.info(f"power audio shape: {audio_power.shape}")
    logging.info(f"First frame: {audio_power[0]}")

    filters = mel_spaced_filterbank(sample_rate, mel_filter_num, fft_size)
    logging.info(f"filters shape: {filters.shape}")

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)
    logging.info(f"audio_log shape: {audio_log.shape}")

    dct_filters = dct(dct_filter_num, mel_filter_num)
    cepstral_coefficients = np.dot(dct_filters, audio_log)

    logging.info(f"cepstral_coefficients shape: {cepstral_coefficients.shape}")
    return cepstral_coefficients


def normalize(audio: np.ndarray) -> np.ndarray:
    """
    Normalize an audio signal by scaling it to have values between -1 and 1.

    Args:
        audio (np.ndarray): The input audio signal.

    Returns:
        np.ndarray: The normalized audio signal.
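
    Examples:
        >>> np.allclose(normalize(np.array([1.0, -2.0, 4.0])), [0.25, -0.5, 1.0])
        True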
    """
    # Find the maximum absolute value in the audio signal
    max_abs_value = np.max(np.abs(audio))

    # Divide the entire audio signal by the maximum absolute value
    normalized_audio = audio / max_abs_value

    return normalized_audio


def frame(
    audio: np.ndarray,
    sample_rate: int,
    hop_length: int = 20,
    fft_size: int = 1024,
) -> np.ndarray:
    """
    Split an audio signal into overlapping frames.

    Args:
        audio (np.ndarray): The input audio signal.
        sample_rate (int): The sample rate of the audio signal.
        hop_length (int): The hop length in milliseconds (default is 20).
        fft_size (int): The size of the FFT window (default is 1024).

    Returns:
        np.ndarray: An array of overlapping frames.
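
    Examples:
        >>> frame(np.arange(1000), 8000, hop_length=10, fft_size=100).shape
        (13, 100)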
    """

    hop_size = np.round(sample_rate * hop_length / 1000).astype(int)

    # Pad the audio signal to handle edge cases
    audio = np.pad(audio, int(fft_size / 2), mode="reflect")

    # Calculate the number of frames
    frame_num = int((len(audio) - fft_size) / hop_size) + 1

    # Initialize an array to store the frames
    frames = np.zeros((frame_num, fft_size))

    # Split the audio signal into frames
    for n in range(frame_num):
        frames[n] = audio[n * hop_size : n * hop_size + fft_size]

    return frames


def calculate_fft(audio_windowed: np.ndarray, fft_size: int = 1024) -> np.ndarray:
    """
    Calculate the Fast Fourier Transform (FFT) of windowed audio data.

    Args:
        audio_windowed (np.ndarray): The windowed audio signal.
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        np.ndarray: The FFT of the audio data.
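
    Examples:
        >>> calculate_fft(np.zeros((2, 8)), fft_size=8).shape
        (2, 5)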
    """
    # Transpose the audio data so that each frame becomes a column
    audio_transposed = np.transpose(audio_windowed)

    # Initialize an array to store the FFT results
    audio_fft = np.empty(
        (int(1 + fft_size // 2), audio_transposed.shape[1]),
        dtype=np.complex64,
        order="F",
    )

    # Compute the FFT for each frame, keeping only the non-negative frequencies
    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]]

    # Transpose the FFT results back to the original shape
    audio_fft = np.transpose(audio_fft)

    return audio_fft


def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
    """
    Calculate the power of the audio signal from its FFT.

    Args:
        audio_fft (np.ndarray): The FFT of the audio signal.

    Returns:
        np.ndarray: The power of the audio signal.
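
    Examples:
        >>> power = calculate_signal_power(np.array([[1 + 1j, 2 + 2j]]))
        >>> np.allclose(power, [[2.0, 8.0]])
        True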
    """
    # Calculate the power by squaring the absolute values of the FFT coefficients
    audio_power = np.square(np.abs(audio_fft))

    return audio_power


def freq_to_mel(freq: float) -> float:
    """
    Convert a frequency in Hertz to the mel scale.

    Args:
        freq (float): The frequency in Hertz.

    Returns:
        float: The frequency on the mel scale.
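
    Examples:
        >>> float(round(freq_to_mel(1000), 2))
        999.99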
    """
    # Use the formula to convert frequency to the mel scale
    return 2595.0 * np.log10(1.0 + freq / 700.0)


def mel_to_freq(mels: float) -> float:
    """
    Convert a frequency on the mel scale to Hertz.

    Args:
        mels (float): The frequency on the mel scale.

    Returns:
        float: The frequency in Hertz.
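
    Examples:
        >>> float(round(mel_to_freq(1000), 2))
        1000.02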
    """
    # Use the formula to convert the mel scale to frequency
    return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)


def mel_spaced_filterbank(
    sample_rate: int, mel_filter_num: int = 10, fft_size: int = 1024
) -> np.ndarray:
    """
    Create a mel-spaced filter bank for audio processing.

    Args:
        sample_rate (int): The sample rate of the audio.
        mel_filter_num (int): The number of mel filters (default is 10).
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        np.ndarray: The mel-spaced filter bank.
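
    Examples:
        >>> mel_spaced_filterbank(8000, 10, 1024).shape
        (10, 513)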
    """
    freq_min = 0
    freq_high = sample_rate // 2

    logging.info(f"Minimum frequency: {freq_min}")
    logging.info(f"Maximum frequency: {freq_high}")

    # Calculate filter points and mel frequencies
    filter_points, mel_freqs = get_filter_points(
        sample_rate,
        freq_min,
        freq_high,
        mel_filter_num,
        fft_size,
    )

    filters = get_filters(filter_points, fft_size)

    # Normalize each filter by the width (in Hz) of the band it covers
    # (Slaney-style normalization, as used in the librosa library)
    enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    return filters


def get_filters(filter_points: np.ndarray, fft_size: int) -> np.ndarray:
    """
    Generate triangular filters for audio processing.

    Args:
        filter_points (np.ndarray): An array of filter edge indices.
        fft_size (int): The size of the FFT.

    Returns:
        np.ndarray: A matrix of triangular filters.
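
    Examples:
        >>> get_filters(np.array([0, 2, 4]), 8).shape
        (1, 5)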
    """
    num_filters = len(filter_points) - 2
    filters = np.zeros((num_filters, fft_size // 2 + 1))

    for n in range(num_filters):
        start = filter_points[n]
        mid = filter_points[n + 1]
        end = filter_points[n + 2]

        # Linearly increase values from 0 to 1
        filters[n, start:mid] = np.linspace(0, 1, mid - start)

        # Linearly decrease values from 1 to 0
        filters[n, mid:end] = np.linspace(1, 0, end - mid)

    return filters


def get_filter_points(
    sample_rate: int,
    freq_min: int,
    freq_high: int,
    mel_filter_num: int = 10,
    fft_size: int = 1024,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculate the filter points and frequencies for mel frequency filters.

    Args:
        sample_rate (int): The sample rate of the audio.
        freq_min (int): The minimum frequency in Hertz.
        freq_high (int): The maximum frequency in Hertz.
        mel_filter_num (int): The number of mel filters (default is 10).
        fft_size (int): The size of the FFT (default is 1024).

    Returns:
        tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies.
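
    Examples:
        >>> points, freqs = get_filter_points(8000, 0, 4000, mel_filter_num=4)
        >>> points.tolist()
        [0, 41, 102, 191, 321, 512]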
    """

    # Convert minimum and maximum frequencies to mel scale
    fmin_mel = freq_to_mel(freq_min)
    fmax_mel = freq_to_mel(freq_high)

    logging.info(f"MEL min: {fmin_mel}")
    logging.info(f"MEL max: {fmax_mel}")

    # Generate equally spaced mel frequencies
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)

    # Convert mel frequencies back to Hertz
    freqs = mel_to_freq(mels)

    # Calculate filter points as integer FFT bin indices
    filter_points = np.floor((fft_size + 1) / sample_rate * freqs).astype(int)

    return filter_points, freqs


def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
    """
    Compute the Discrete Cosine Transform (DCT) basis matrix.

    Args:
        dct_filter_num (int): The number of DCT filters to generate.
        filter_num (int): The number of mel filterbank filters.

    Returns:
        np.ndarray: The DCT basis matrix.
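
    Examples:
        >>> dct(3, 5).shape
        (3, 5)
        >>> np.allclose(dct(3, 5)[0], 1 / np.sqrt(5))
        True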
    """
    basis = np.empty((dct_filter_num, filter_num))

    # The first basis vector is constant, scaled so the rows are orthonormal
    basis[0, :] = 1.0 / np.sqrt(filter_num)

    # Evaluation points (2k + 1) * pi / (2 * filter_num) for k = 0..filter_num - 1
    samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num)

    return basis


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Demo on a sample speech file (the file must exist at this path)
    TRAIN_PATH = "./signal_processing/"
    sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav")

    print(mfcc(audio, sample_rate))