Add MFCC Feature Extraction Algorithm #9057

Merged
Changes from 2 commits
395 changes: 395 additions & 0 deletions machine_learning/mfcc.py
@@ -0,0 +1,395 @@
"""
MFCC (Mel Frequency Cepstral Coefficients) Calculation

MFCC is a feature widely used in audio and speech processing to represent the
short-term power spectrum of a sound signal in a compact and discriminative
way. It is particularly popular in tasks such as speech recognition and
speaker identification.

How MFCC is Calculated:
1. Preprocessing:
- Load an audio signal and normalize it to ensure that the values fall
within a specific range (e.g., between -1 and 1).
- Frame the audio signal into overlapping, fixed-length segments, typically
using a technique like windowing to reduce spectral leakage.

2. Fourier Transform:
- Apply a Fast Fourier Transform (FFT) to each audio frame to convert it
from the time domain to the frequency domain. This results in a
representation of the audio frame as a sequence of frequency components.

3. Power Spectrum:
- Calculate the power spectrum by taking the squared magnitude of each
frequency component obtained from the FFT. This step measures the energy
distribution across different frequency bands.

4. Mel Filterbank:
- Apply a set of triangular filterbanks spaced in the Mel frequency scale
to the power spectrum. These filters mimic the human auditory system's
frequency response. Each filterbank sums the power spectrum values within
its band.

5. Logarithmic Compression:
- Take the logarithm (typically base 10) of the filterbank values to
compress the dynamic range. This step mimics the logarithmic response of
the human ear to sound intensity.

6. Discrete Cosine Transform (DCT):
- Apply the Discrete Cosine Transform to the log filterbank energies to
obtain the MFCC coefficients. This transformation helps decorrelate the
filterbank energies and captures the most important features of the audio
signal.

7. Feature Extraction:
- Select a subset of the DCT coefficients to form the feature vector.
Often, the first few coefficients (e.g., 12-13) are used for most
applications.
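
Example usage (a sketch of how the steps map onto the functions below; the
wav file path is hypothetical):

    from scipy.io import wavfile

    sample_rate, audio = wavfile.read("path/to/sample.wav")  # step 1: load
    mfccs = mfcc(audio, sample_rate)  # steps 1-7, one column per frame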

References:
- Mel-Frequency Cepstral Coefficients (MFCCs):
https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
- Speech and Language Processing by Daniel Jurafsky & James H. Martin:
https://web.stanford.edu/~jurafsky/slp3/
- Mel Frequency Cepstral Coefficient (MFCC) tutorial:
  http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/

Author: Amir Lavasani
"""


import logging

import numpy as np
import scipy.fftpack as fft
from scipy.signal import get_window

logging.basicConfig(level=logging.WARNING)


def mfcc(
audio: np.ndarray,
sample_rate: int,
ftt_size: int = 1024,
hop_length: int = 20,
mel_filter_num: int = 10,
dct_filter_num: int = 40,
) -> np.ndarray:
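"""
Calculate Mel Frequency Cepstral Coefficients (MFCCs) for an audio signal.

Args:
audio (np.ndarray): The input audio signal.
sample_rate (int): The sample rate of the audio signal (in Hz).
ftt_size (int, optional): The size of the FFT window (default is 1024).
hop_length (int, optional): The hop length in milliseconds (default is 20).
mel_filter_num (int, optional): The number of mel filters (default is 10).
dct_filter_num (int, optional): The number of DCT filters (default is 40).

Returns:
np.ndarray: The MFCCs, with one column per audio frame.

Example (a 440 Hz sine tone sampled at 16 kHz; only the output shape,
(dct_filter_num, frame_count), is checked):
>>> sample_rate = 16000
>>> audio = np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate)
>>> mfcc(audio, sample_rate).shape
(40, 51)
"""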
logging.info(f"Sample rate: {sample_rate}Hz")
logging.info(f"Audio duration: {len(audio) / sample_rate}s")
logging.info(f"Audio min: {np.min(audio)}")
logging.info(f"Audio max: {np.max(audio)}")

# normalize audio
audio_normalized = normalize(audio)

logging.info(f"Normalized audio min: {np.min(audio_normalized)}")
logging.info(f"Normalized audio max: {np.max(audio_normalized)}")

# frame the audio into overlapping segments
audio_framed = frame(
audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length
)

logging.info(f"Framed audio shape: {audio_framed.shape}")
logging.info(f"First frame: {audio_framed[0]}")

# convert to frequency domain
# For simplicity we will choose the Hann window.
window = get_window("hann", ftt_size, fftbins=True)
audio_windowed = audio_framed * window

logging.info(f"Windowed audio shape: {audio_windowed.shape}")
logging.info(f"First frame: {audio_windowed[0]}")

audio_fft = calculate_fft(audio_windowed, ftt_size)
logging.info(f"fft audio shape: {audio_fft.shape}")
logging.info(f"First frame: {audio_fft[0]}")

audio_power = calculate_signal_power(audio_fft)
logging.info(f"power audio shape: {audio_power.shape}")
logging.info(f"First frame: {audio_power[0]}")

filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size)
logging.info(f"filters shape: {filters.shape}")

audio_filtered = np.dot(filters, np.transpose(audio_power))
audio_log = 10.0 * np.log10(audio_filtered)
logging.info(f"audio_log shape: {audio_log.shape}")

dct_filters = dct(dct_filter_num, mel_filter_num)
cepstral_coefficients = np.dot(dct_filters, audio_log)

logging.info(f"cepstral_coefficients shape: {cepstral_coefficients.shape}")
return cepstral_coefficients


def normalize(audio: np.ndarray) -> np.ndarray:
"""
Normalize an audio signal by scaling it to have values between -1 and 1.

Args:
audio (np.ndarray): The input audio signal.

Returns:
np.ndarray: The normalized audio signal.
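
Examples:
>>> float(np.max(np.abs(normalize(np.array([-2.0, 1.0])))))
1.0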
"""
# Find the maximum absolute value in the audio signal
max_abs_value = np.max(np.abs(audio))

# Divide the entire audio signal by the maximum absolute value
normalized_audio = audio / max_abs_value

return normalized_audio


def frame(
audio: np.ndarray,
sample_rate: int,
hop_length: int = 20,
ftt_size: int = 1024,
) -> np.ndarray:
"""
Split an audio signal into overlapping frames.

Args:
audio (np.ndarray): The input audio signal.
sample_rate (int): The sample rate of the audio signal.
hop_length (int, optional): The hop length in milliseconds (default is 20).
ftt_size (int, optional): The size of the FFT window (default is 1024).

Returns:
np.ndarray: An array of overlapping frames.
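
Examples (a 10000-sample signal at 8 kHz; only the shape is checked):
>>> frame(np.arange(10000, dtype=np.float64), 8000).shape
(63, 1024)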
"""

hop_size = np.round(sample_rate * hop_length / 1000).astype(int)

# Pad the audio signal to handle edge cases
audio = np.pad(audio, int(ftt_size / 2), mode="reflect")

# Calculate the number of frames
frame_num = int((len(audio) - ftt_size) / hop_size) + 1

# Initialize an array to store the frames
frames = np.zeros((frame_num, ftt_size))

# Split the audio signal into frames
for n in range(frame_num):
frames[n] = audio[n * hop_size : n * hop_size + ftt_size]

return frames


def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray:
"""
Calculate the Fast Fourier Transform (FFT) of windowed audio data.

Args:
audio_windowed (np.ndarray): The windowed audio signal.
ftt_size (int, optional): The size of the FFT (default is 1024).

Returns:
np.ndarray: The FFT of the audio data.
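
Examples (only the non-negative frequency bins, ftt_size // 2 + 1, are kept):
>>> calculate_fft(np.zeros((3, 1024))).shape
(3, 513)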
"""
# Transpose so that each column holds the samples of one frame
audio_transposed = np.transpose(audio_windowed)

# Initialize an array to store the FFT results
audio_fft = np.empty(
(int(1 + ftt_size // 2), audio_transposed.shape[1]),
dtype=np.complex64,
order="F",
)

# Compute the FFT of each frame, keeping only the non-negative frequency bins
for n in range(audio_fft.shape[1]):
audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]]

# Transpose the FFT results back to the original shape
audio_fft = np.transpose(audio_fft)

return audio_fft


def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
"""
Calculate the power of the audio signal from its FFT.

Args:
audio_fft (np.ndarray): The FFT of the audio signal.

Returns:
np.ndarray: The power of the audio signal.
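
Examples (|3 + 4j| = 5, so the power is 25):
>>> float(calculate_signal_power(np.array([3.0 + 4.0j]))[0])
25.0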
"""
# Calculate the power by squaring the absolute values of the FFT coefficients
audio_power = np.square(np.abs(audio_fft))

return audio_power


As there is no test file in this pull request nor any test function or class in the file machine_learning/mfcc.py, please provide doctest for the function freq_to_mel

Please provide return type hint for the function: freq_to_mel. If the function does not return a value, please provide the type hint as: def function() -> None:

Please provide type hint for the parameter: freq

def freq_to_mel(freq: float) -> float:
"""
Convert a frequency in Hertz to the mel scale.

Args:
freq (float): The frequency in Hertz.

Returns:
float: The frequency in mel scale.
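
Examples:
>>> float(round(freq_to_mel(1000), 2))
999.99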
"""
# Use the formula to convert frequency to the mel scale
return 2595.0 * np.log10(1.0 + freq / 700.0)


def mel_to_freq(mels: float) -> float:
"""
Convert a frequency in the mel scale to Hertz.

Args:
mels (float): The frequency in mel scale.

Returns:
float: The frequency in Hertz.
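
Examples:
>>> float(round(mel_to_freq(1000), 2))
1000.02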
"""
# Use the formula to convert mel scale to frequency
return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)


def mel_spaced_filterbank(
sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024
) -> np.ndarray:
"""
Create a Mel-spaced filter bank for audio processing.

Args:
sample_rate (int): The sample rate of the audio.
mel_filter_num (int, optional): The number of mel filters (default is 10).
ftt_size (int, optional): The size of the FFT (default is 1024).

Returns:
np.ndarray: Mel-spaced filter bank.
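
Examples (one row per mel filter, one column per FFT bin):
>>> mel_spaced_filterbank(8000, mel_filter_num=10, ftt_size=1024).shape
(10, 513)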
"""
freq_min = 0
freq_high = sample_rate // 2

logging.info(f"Minimum frequency: {freq_min}")
logging.info(f"Maximum frequency: {freq_high}")

# Calculate filter points and mel frequencies
filter_points, mel_freqs = get_filter_points(
sample_rate,
freq_min,
freq_high,
mel_filter_num,
ftt_size,
)

filters = get_filters(filter_points, ftt_size)

# normalize filters
# taken from the librosa library
enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num])
filters *= enorm[:, np.newaxis]

return filters


def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray:
"""
Generate filters for audio processing.

Args:
filter_points (np.ndarray): Array of filter point indices.
ftt_size (int): The size of the FFT.

Returns:
np.ndarray: A matrix of filters.
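
Examples (three points define a single triangular filter):
>>> get_filters(np.array([0, 20, 51]), 1024).shape
(1, 513)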
"""
num_filters = len(filter_points) - 2
filters = np.zeros((num_filters, int(ftt_size / 2) + 1))

for n in range(num_filters):
start = filter_points[n]
mid = filter_points[n + 1]
end = filter_points[n + 2]

# Linearly increase values from 0 to 1
filters[n, start:mid] = np.linspace(0, 1, mid - start)

# Linearly decrease values from 1 to 0
filters[n, mid:end] = np.linspace(1, 0, end - mid)

return filters


Please provide return type hint for the function: get_filter_points. If the function does not return a value, please provide the type hint as: def function() -> None:

def get_filter_points(
sample_rate: int,
freq_min: int,
freq_high: int,
mel_filter_num: int = 10,
ftt_size: int = 1024,
) -> tuple[np.ndarray, np.ndarray]:
"""
Calculate the filter points and frequencies for mel frequency filters.

Args:
sample_rate (int): The sample rate of the audio.
freq_min (int): The minimum frequency in Hertz.
freq_high (int): The maximum frequency in Hertz.
mel_filter_num (int, optional): The number of mel filters (default is 10).
ftt_size (int, optional): The size of the FFT (default is 1024).

Returns:
Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies.
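
Examples (mel_filter_num filters need mel_filter_num + 2 edge points):
>>> points, freqs = get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)
>>> points.shape, freqs.shape
((6,), (6,))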
"""

# Convert minimum and maximum frequencies to mel scale
fmin_mel = freq_to_mel(freq_min)
fmax_mel = freq_to_mel(freq_high)

logging.info(f"MEL min: {fmin_mel}")
logging.info(f"MEL max: {fmax_mel}")

# Generate equally spaced mel frequencies
mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)

# Convert mel frequencies back to Hertz
freqs = mel_to_freq(mels)

# Calculate filter points as integer values
filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int)

return filter_points, freqs


def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
"""
Compute the Discrete Cosine Transform (DCT) basis matrix.

Args:
dct_filter_num (int): The number of DCT filters to generate.
filter_num (int): The number of mel filterbank filters.

Returns:
np.ndarray: The DCT basis matrix.
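
Examples:
>>> dct(dct_filter_num=4, filter_num=10).shape
(4, 10)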
"""
basis = np.empty((dct_filter_num, filter_num))
basis[0, :] = 1.0 / np.sqrt(filter_num)

samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num)

for i in range(1, dct_filter_num):
basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num)

return basis


if __name__ == "__main__":
# from scipy.io import wavfile
# wav_file_path = "./path-to-file/sample.wav"
# sample_rate, audio = wavfile.read(wav_file_path)
# mfccs = mfcc(audio, sample_rate)

import doctest

doctest.testmod()