From 264bc13fee42202f2298d59707edc7403ed486d3 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:39:01 +0330 Subject: [PATCH 01/18] Add MFCC feature extraction to machine learning --- machine_learning/mfcc.py | 396 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 machine_learning/mfcc.py diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py new file mode 100644 index 000000000000..9144846bbb5d --- /dev/null +++ b/machine_learning/mfcc.py @@ -0,0 +1,396 @@ +""" +MFCC (Mel Frequency Cepstral Coefficients) Calculation + +MFCC is a feature widely used in audio and speech processing to represent the +short-term power spectrum of a sound signal in a more compact and +discriminative way. It is particularly popular in speech and audio processing +tasks such as speech recognition and speaker identification. + +How MFCC is Calculated: +1. Preprocessing: + - Load an audio signal and normalize it to ensure that the values fall + within a specific range (e.g., between -1 and 1). + - Frame the audio signal into overlapping, fixed-length segments, typically + using a technique like windowing to reduce spectral leakage. + +2. Fourier Transform: + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it + from the time domain to the frequency domain. This results in a + representation of the audio frame as a sequence of frequency components. + +3. Power Spectrum: + - Calculate the power spectrum by taking the squared magnitude of each + frequency component obtained from the FFT. This step measures the energy + distribution across different frequency bands. + +4. Mel Filterbank: + - Apply a set of triangular filterbanks spaced in the Mel frequency scale + to the power spectrum. These filters mimic the human auditory system's + frequency response. Each filterbank sums the power spectrum values within + its band. + +5. Logarithmic Compression: + - Take the logarithm (typically base 10) of the filterbank values to + compress the dynamic range. This step mimics the logarithmic response of + the human ear to sound intensity. + +6. Discrete Cosine Transform (DCT): + - Apply the Discrete Cosine Transform to the log filterbank energies to + obtain the MFCC coefficients. This transformation helps decorrelate the + filterbank energies and captures the most important features of the audio + signal. + +7. Feature Extraction: + - Select a subset of the DCT coefficients to form the feature vector. + Often, the first few coefficients (e.g., 12-13) are used for most + applications. + +References: +- Mel-Frequency Cepstral Coefficients (MFCCs): + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +- Speech and Language Processing by Daniel Jurafsky & James H. 
Martin: + https://web.stanford.edu/~jurafsky/slp3/ +- Mel Frequency Cepstral Coefficient (MFCC) tutorial + http://practicalcryptography.com/miscellaneous/machine-learning + /guide-mel-frequency-cepstral-coefficients-mfccs/ + +Author: Amir Lavasani +""" + + +import logging + +import numpy as np +import scipy.fftpack as fft +from scipy.io import wavfile +from scipy.signal import get_window + +logging.basicConfig(level=logging.WARNING) + + +def mfcc( + audio: np.ndarray, + sample_rate: int, + ftt_size: int = 1024, + hop_length: int = 20, + mel_filter_num: int = 10, + dct_filter_num: int = 40, +) -> np.ndarray: + logging.info(f"Sample rate: {sample_rate}Hz") + logging.info(f"Audio duration: {len(audio) / sample_rate}s") + logging.info(f"Audio min: {np.min(audio)}") + logging.info(f"Audio max: {np.max(audio)}") + + # normalize audio + audio_normalized = normalize(audio) + + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") + + # frame audio into + audio_framed = frame( + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length + ) + + logging.info(f"Framed audio shape: {audio_framed.shape}") + logging.info(f"First frame: {audio_framed[0]}") + + # convert to frequency domain + # For simplicity we will choose the Hanning window. + window = get_window("hann", ftt_size, fftbins=True) + audio_windowed = audio_framed * window + + logging.info(f"Windowed audio shape: {audio_windowed.shape}") + logging.info(f"First frame: {audio_windowed[0]}") + + audio_fft = calculate_fft(audio_windowed, ftt_size) + logging.info(f"fft audio shape: {audio_fft.shape}") + logging.info(f"First frame: {audio_fft[0]}") + + audio_power = calculate_signal_power(audio_fft) + logging.info(f"power audio shape: {audio_power.shape}") + logging.info(f"First frame: {audio_power[0]}") + + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) + logging.info(f"filters shape: {filters.shape}") + + audio_filtered = np.dot(filters, np.transpose(audio_power)) + audio_log = 10.0 * np.log10(audio_filtered) + logging.info(f"audio_log shape: {audio_log.shape}") + + dct_filters = dct(dct_filter_num, mel_filter_num) + cepstral_coefficents = np.dot(dct_filters, audio_log) + + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") + return cepstral_coefficents + + +def normalize(audio: np.ndarray) -> np.ndarray: + """ + Normalize an audio signal by scaling it to have values between -1 and 1. + + Args: + audio (np.ndarray): The input audio signal. + + Returns: + np.ndarray: The normalized audio signal. + """ + # Find the maximum absolute value in the audio signal + max_abs_value = np.max(np.abs(audio)) + + # Divide the entire audio signal by the maximum absolute value + normalized_audio = audio / max_abs_value + + return normalized_audio + + +def frame( + audio: np.ndarray, + sample_rate: int, + hop_length: int = 20, + ftt_size: int = 1024, +) -> np.ndarray: + """ + Split an audio signal into overlapping frames. + + Args: + audio (np.ndarray): The input audio signal. + sample_rate (int): The sample rate of the audio signal. + hop_length (Optional[int]): The length of the hopping (default is 20ms). + ftt_size (Optional[int]): The size of the FFT window (default is 1024). + + Returns: + np.ndarray: An array of overlapping frames. 
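+
+    Example (illustrative, assuming the default 20ms hop and 1024-sample
+    window; a 2-second signal at 8000Hz then yields 101 frames):
+        >>> frame(np.arange(16000, dtype=float), 8000).shape
+        (101, 1024)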
+ """ + + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) + + # Pad the audio signal to handle edge cases + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") + + # Calculate the number of frames + frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + + # Initialize an array to store the frames + frames = np.zeros((frame_num, ftt_size)) + + # Split the audio signal into frames + for n in range(frame_num): + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] + + return frames + + +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: + """ + Calculate the Fast Fourier Transform (FFT) of windowed audio data. + + Args: + audio_windowed (np.ndarray): The windowed audio signal. + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: The FFT of the audio data. + """ + # Transpose the audio data to have time in rows and channels in columns + audio_transposed = np.transpose(audio_windowed) + + # Initialize an array to store the FFT results + audio_fft = np.empty( + (int(1 + ftt_size // 2), audio_transposed.shape[1]), + dtype=np.complex64, + order="F", + ) + + # Compute FFT for each channel + for n in range(audio_fft.shape[1]): + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] + + # Transpose the FFT results back to the original shape + audio_fft = np.transpose(audio_fft) + + return audio_fft + + +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: + """ + Calculate the power of the audio signal from its FFT. + + Args: + audio_fft (np.ndarray): The FFT of the audio signal. + + Returns: + np.ndarray: The power of the audio signal. + """ + # Calculate the power by squaring the absolute values of the FFT coefficients + audio_power = np.square(np.abs(audio_fft)) + + return audio_power + + +def freq_to_mel(freq): + """ + Convert a frequency in Hertz to the mel scale. + + Args: + freq (float): The frequency in Hertz. + + Returns: + float: The frequency in mel scale. + """ + # Use the formula to convert frequency to the mel scale + return 2595.0 * np.log10(1.0 + freq / 700.0) + + +def mel_to_freq(mels): + """ + Convert a frequency in the mel scale to Hertz. + + Args: + mels (float): The frequency in mel scale. + + Returns: + float: The frequency in Hertz. + """ + # Use the formula to convert mel scale to frequency + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + +def mel_spaced_filterbank( + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 +) -> np.ndarray: + """ + Create a Mel-spaced filter bank for audio processing. + + Args: + sample_rate (int): The sample rate of the audio. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: Mel-spaced filter bank. 
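+
+    Example (illustrative, shape only: one row per mel filter, one column
+    per FFT bin, i.e. ftt_size // 2 + 1 columns):
+        >>> mel_spaced_filterbank(8000, 10, 1024).shape
+        (10, 513)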
+ """ + freq_min = 0 + freq_high = sample_rate // 2 + + logging.info(f"Minimum frequency: {freq_min}") + logging.info(f"Maximum frequency: {freq_high}") + + # Calculate filter points and mel frequencies + filter_points, mel_freqs = get_filter_points( + sample_rate, + freq_min, + freq_high, + mel_filter_num, + ftt_size, + ) + + filters = get_filters(filter_points, ftt_size) + + # normalize filters + # taken from the librosa library + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) + filters *= enorm[:, np.newaxis] + + return filters + + +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: + """ + Generate filters for audio processing. + + Args: + filter_points (list): A list of filter points. + ftt_size (int): The size of the FFT. + + Returns: + np.ndarray: A matrix of filters. + """ + num_filters = len(filter_points) - 2 + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) + + for n in range(num_filters): + start = filter_points[n] + mid = filter_points[n + 1] + end = filter_points[n + 2] + + # Linearly increase values from 0 to 1 + filters[n, start:mid] = np.linspace(0, 1, mid - start) + + # Linearly decrease values from 1 to 0 + filters[n, mid:end] = np.linspace(1, 0, end - mid) + + return filters + + +def get_filter_points( + sample_rate: int, + freq_min: int, + freq_high: int, + mel_filter_num: int = 10, + ftt_size: int = 1024, +): + """ + Calculate the filter points and frequencies for mel frequency filters. + + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + """ + + # Convert minimum and maximum frequencies to mel scale + fmin_mel = freq_to_mel(freq_min) + fmax_mel = freq_to_mel(freq_high) + + logging.info(f"MEL min: {fmin_mel}") + logging.info(f"MEL max: {fmax_mel}") + + # Generate equally spaced mel frequencies + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) + + # Convert mel frequencies back to Hertz + freqs = mel_to_freq(mels) + + # Calculate filter points as integer values + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) + + return filter_points, freqs + + +def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: + """ + Compute the Discrete Cosine Transform (DCT) basis matrix. + + Args: + dct_filter_num (int): The number of DCT filters to generate. + filter_num (int): The number of the fbank filters. + + Returns: + np.ndarray: The DCT basis matrix. 
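+
+    Example (illustrative, shape only: dct_filter_num rows by filter_num
+    columns, with row 0 the constant vector 1 / sqrt(filter_num)):
+        >>> dct(4, 10).shape
+        (4, 10)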
+ """ + basis = np.empty((dct_filter_num, filter_num)) + basis[0, :] = 1.0 / np.sqrt(filter_num) + + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) + + for i in range(1, dct_filter_num): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) + + return basis + + +if __name__ == "__main__": + TRAIN_PATH = "./signal_processing/" + sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") + + print(mfcc(audio, sample_rate)) + + import doctest + + doctest.testmod() From a1cb36c5556f3ea1eeeea64a3141e42c0b0bc230 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:46:00 +0330 Subject: [PATCH 02/18] Add standalone usage in comments --- machine_learning/mfcc.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9144846bbb5d..9fea66ee4cca 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -62,7 +62,6 @@ import numpy as np import scipy.fftpack as fft -from scipy.io import wavfile from scipy.signal import get_window logging.basicConfig(level=logging.WARNING) @@ -386,10 +385,10 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: if __name__ == "__main__": - TRAIN_PATH = "./signal_processing/" - sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") - - print(mfcc(audio, sample_rate)) + # from scipy.io import wavfile + # wav_file_path = "./path-to-file/sample.wav" + # sample_rate, audio = wavfile.read(wav_file_path) + # mfccs = mfcc(audio, sample_rate) import doctest From 68d2e11e8b566447ba4ccd499192007703a36bff Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:01:17 +0330 Subject: [PATCH 03/18] Apply suggestions from code review Co-authored-by: Christian Clauss --- machine_learning/mfcc.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9fea66ee4cca..ee7af7c8a2bd 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -1,5 +1,5 @@ """ -MFCC (Mel Frequency Cepstral Coefficients) Calculation +Mel Frequency Cepstral Coefficients (MFCC) Calculation MFCC is a feature widely used in audio and speech processing to represent the short-term power spectrum of a sound signal in a more compact and @@ -138,12 +138,10 @@ def normalize(audio: np.ndarray) -> np.ndarray: max_abs_value = np.max(np.abs(audio)) # Divide the entire audio signal by the maximum absolute value - normalized_audio = audio / max_abs_value + return audio / max_abs_value - return normalized_audio - -def frame( +def audio_frames( audio: np.ndarray, sample_rate: int, hop_length: int = 20, @@ -168,7 +166,7 @@ def frame( audio = np.pad(audio, int(ftt_size / 2), mode="reflect") # Calculate the number of frames - frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames frames = np.zeros((frame_num, ftt_size)) @@ -206,9 +204,7 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] # Transpose the FFT results back to the original shape - audio_fft = np.transpose(audio_fft) - - return audio_fft + return np.transpose(audio_fft) def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: @@ -289,9 +285,7 @@ def mel_spaced_filterbank( # normalize filters # taken from the librosa library enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 
2] - mel_freqs[:mel_filter_num]) - filters *= enorm[:, np.newaxis] - - return filters + return filters * enorm[:, np.newaxis] def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: From 92d974814dc1b6a65e9cc26f4931ba8c3abeb96a Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Sat, 16 Sep 2023 18:12:31 -0400 Subject: [PATCH 04/18] Delete empty junk file (#9062) * updating DIRECTORY.md * updating DIRECTORY.md * Delete empty junk file * updating DIRECTORY.md * Fix ruff errors * Fix more ruff errors --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> --- DIRECTORY.md | 1 - arithmetic_analysis/junk.py | 0 computer_vision/haralick_descriptors.py | 8 +++++--- conversions/convert_number_to_words.py | 6 +++--- graphs/tarjans_scc.py | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) delete mode 100644 arithmetic_analysis/junk.py diff --git a/DIRECTORY.md b/DIRECTORY.md index 1b802564f939..d81e4ec1ee83 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -5,7 +5,6 @@ * [In Static Equilibrium](arithmetic_analysis/in_static_equilibrium.py) * [Intersection](arithmetic_analysis/intersection.py) * [Jacobi Iteration Method](arithmetic_analysis/jacobi_iteration_method.py) - * [Junk](arithmetic_analysis/junk.py) * [Lu Decomposition](arithmetic_analysis/lu_decomposition.py) * [Newton Forward Interpolation](arithmetic_analysis/newton_forward_interpolation.py) * [Newton Method](arithmetic_analysis/newton_method.py) diff --git a/arithmetic_analysis/junk.py b/arithmetic_analysis/junk.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/computer_vision/haralick_descriptors.py b/computer_vision/haralick_descriptors.py index 1a86d84ea14b..413cea304f6c 100644 --- a/computer_vision/haralick_descriptors.py +++ b/computer_vision/haralick_descriptors.py @@ -100,7 +100,9 @@ def binarize(image: np.ndarray, threshold: float = 127.0) -> np.ndarray: return np.where(image > threshold, 1, 0) -def transform(image: np.ndarray, kind: str, kernel: np.ndarray = None) -> np.ndarray: +def transform( + image: np.ndarray, kind: str, kernel: np.ndarray | None = None +) -> np.ndarray: """ Simple image transformation using one of two available filter functions: Erosion and Dilation. @@ -154,7 +156,7 @@ def transform(image: np.ndarray, kind: str, kernel: np.ndarray = None) -> np.nda return transformed -def opening_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: +def opening_filter(image: np.ndarray, kernel: np.ndarray | None = None) -> np.ndarray: """ Opening filter, defined as the sequence of erosion and then a dilation filter on the same image. @@ -172,7 +174,7 @@ def opening_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: return transform(transform(image, "dilation", kernel), "erosion", kernel) -def closing_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: +def closing_filter(image: np.ndarray, kernel: np.ndarray | None = None) -> np.ndarray: """ Opening filter, defined as the sequence of dilation and then erosion filter on the same image. 
diff --git a/conversions/convert_number_to_words.py b/conversions/convert_number_to_words.py index 0e4405319f1f..0c428928b31d 100644 --- a/conversions/convert_number_to_words.py +++ b/conversions/convert_number_to_words.py @@ -54,7 +54,7 @@ def max_value(cls, system: str) -> int: class NumberWords(Enum): - ONES: ClassVar = { + ONES: ClassVar[dict[int, str]] = { 0: "", 1: "one", 2: "two", @@ -67,7 +67,7 @@ class NumberWords(Enum): 9: "nine", } - TEENS: ClassVar = { + TEENS: ClassVar[dict[int, str]] = { 0: "ten", 1: "eleven", 2: "twelve", @@ -80,7 +80,7 @@ class NumberWords(Enum): 9: "nineteen", } - TENS: ClassVar = { + TENS: ClassVar[dict[int, str]] = { 2: "twenty", 3: "thirty", 4: "forty", diff --git a/graphs/tarjans_scc.py b/graphs/tarjans_scc.py index 30f8ca8a204f..dfd2e52704d5 100644 --- a/graphs/tarjans_scc.py +++ b/graphs/tarjans_scc.py @@ -77,7 +77,7 @@ def create_graph(n, edges): n_vertices = 7 source = [0, 0, 1, 2, 3, 3, 4, 4, 6] target = [1, 3, 2, 0, 1, 4, 5, 6, 5] - edges = [(u, v) for u, v in zip(source, target)] + edges = list(zip(source, target)) g = create_graph(n_vertices, edges) assert [[5], [6], [4], [3, 2, 1, 0]] == tarjan(g) From 6c4f90bcad94da12d0a033d79a4c11c4219d2327 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:37:33 +0330 Subject: [PATCH 05/18] [main] Fix typo due to auto review change --- machine_learning/mfcc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index ee7af7c8a2bd..5f792fe242be 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -87,7 +87,7 @@ def mfcc( logging.info(f"Normalized audio max: {np.max(audio_normalized)}") # frame audio into - audio_framed = frame( + audio_framed = audio_frames( audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length ) @@ -169,10 +169,10 @@ def audio_frames( frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames - frames = np.zeros((frame_num, ftt_size)) + frames = np.zeros((frame_count, ftt_size)) # Split the audio signal into frames - for n in range(frame_num): + for n in range(frame_count): frames[n] = audio[n * hop_size : n * hop_size + ftt_size] return frames From 1d843b5bdae0d62e2ecbf3434539a3cc392062a2 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:37:00 +0330 Subject: [PATCH 06/18] Add doctests for all functions --- machine_learning/mfcc.py | 84 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 5f792fe242be..fc4efaf3a7d4 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -133,6 +133,14 @@ def normalize(audio: np.ndarray) -> np.ndarray: Returns: np.ndarray: The normalized audio signal. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5]) + >>> normalized_audio = normalize(audio) + >>> np.max(normalized_audio) + 1.0 + >>> np.min(normalized_audio) + 0.2 """ # Find the maximum absolute value in the audio signal max_abs_value = np.max(np.abs(audio)) @@ -158,6 +166,14 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. 
+ + Examples: + >>> import numpy as np + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) + >>> sample_rate = 8000 + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) + >>> frames.shape + (126, 512) """ hop_size = np.round(sample_rate * hop_length / 1000).astype(int) @@ -188,6 +204,13 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra Returns: np.ndarray: The FFT of the audio data. + + Examples: + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -216,6 +239,13 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Returns: np.ndarray: The power of the audio signal. + + Examples: + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -232,6 +262,10 @@ def freq_to_mel(freq): Returns: float: The frequency in mel scale. + + Examples: + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) @@ -246,6 +280,10 @@ def mel_to_freq(mels): Returns: float: The frequency in Hertz. + + Examples: + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -264,6 +302,10 @@ def mel_spaced_filterbank( Returns: np.ndarray: Mel-spaced filter bank. + + Examples: + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -298,6 +340,10 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Returns: np.ndarray: A matrix of filters. + + Examples: + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape + (4, 257) """ num_filters = len(filter_points) - 2 filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) @@ -335,8 +381,11 @@ def get_filter_points( Returns: Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. - """ +Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) + """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) fmax_mel = freq_to_mel(freq_high) @@ -366,6 +415,9 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: Returns: np.ndarray: The DCT basis matrix. + Examples: + >>> round(dct(3, 5)[0][0], 5) + 0.44721 """ basis = np.empty((dct_filter_num, filter_num)) basis[0, :] = 1.0 / np.sqrt(filter_num) @@ -378,12 +430,30 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: return basis -if __name__ == "__main__": - # from scipy.io import wavfile - # wav_file_path = "./path-to-file/sample.wav" - # sample_rate, audio = wavfile.read(wav_file_path) - # mfccs = mfcc(audio, sample_rate) +def example(wav_file_path="./path-to-file/sample.wav"): + """ + Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. 
- import doctest + Args: + wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + + Returns: + np.ndarray: The computed MFCCs for the audio. + """ + from scipy.io import wavfile + + try: + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) + + return mfccs + + except Exception as e: + logging.error(f"Error processing audio: {str(e)}") + return None +if __name__ == "__main__": + import doctest doctest.testmod() From d14cc0949d8265d4501dff58c9c9a1999935a481 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:39:01 +0330 Subject: [PATCH 07/18] Add MFCC feature extraction to machine learning --- machine_learning/mfcc.py | 396 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 machine_learning/mfcc.py diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py new file mode 100644 index 000000000000..9144846bbb5d --- /dev/null +++ b/machine_learning/mfcc.py @@ -0,0 +1,396 @@ +""" +MFCC (Mel Frequency Cepstral Coefficients) Calculation + +MFCC is a feature widely used in audio and speech processing to represent the +short-term power spectrum of a sound signal in a more compact and +discriminative way. It is particularly popular in speech and audio processing +tasks such as speech recognition and speaker identification. + +How MFCC is Calculated: +1. Preprocessing: + - Load an audio signal and normalize it to ensure that the values fall + within a specific range (e.g., between -1 and 1). + - Frame the audio signal into overlapping, fixed-length segments, typically + using a technique like windowing to reduce spectral leakage. + +2. Fourier Transform: + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it + from the time domain to the frequency domain. This results in a + representation of the audio frame as a sequence of frequency components. + +3. Power Spectrum: + - Calculate the power spectrum by taking the squared magnitude of each + frequency component obtained from the FFT. This step measures the energy + distribution across different frequency bands. + +4. Mel Filterbank: + - Apply a set of triangular filterbanks spaced in the Mel frequency scale + to the power spectrum. These filters mimic the human auditory system's + frequency response. Each filterbank sums the power spectrum values within + its band. + +5. Logarithmic Compression: + - Take the logarithm (typically base 10) of the filterbank values to + compress the dynamic range. This step mimics the logarithmic response of + the human ear to sound intensity. + +6. Discrete Cosine Transform (DCT): + - Apply the Discrete Cosine Transform to the log filterbank energies to + obtain the MFCC coefficients. This transformation helps decorrelate the + filterbank energies and captures the most important features of the audio + signal. + +7. Feature Extraction: + - Select a subset of the DCT coefficients to form the feature vector. + Often, the first few coefficients (e.g., 12-13) are used for most + applications. + +References: +- Mel-Frequency Cepstral Coefficients (MFCCs): + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +- Speech and Language Processing by Daniel Jurafsky & James H. 
Martin: + https://web.stanford.edu/~jurafsky/slp3/ +- Mel Frequency Cepstral Coefficient (MFCC) tutorial + http://practicalcryptography.com/miscellaneous/machine-learning + /guide-mel-frequency-cepstral-coefficients-mfccs/ + +Author: Amir Lavasani +""" + + +import logging + +import numpy as np +import scipy.fftpack as fft +from scipy.io import wavfile +from scipy.signal import get_window + +logging.basicConfig(level=logging.WARNING) + + +def mfcc( + audio: np.ndarray, + sample_rate: int, + ftt_size: int = 1024, + hop_length: int = 20, + mel_filter_num: int = 10, + dct_filter_num: int = 40, +) -> np.ndarray: + logging.info(f"Sample rate: {sample_rate}Hz") + logging.info(f"Audio duration: {len(audio) / sample_rate}s") + logging.info(f"Audio min: {np.min(audio)}") + logging.info(f"Audio max: {np.max(audio)}") + + # normalize audio + audio_normalized = normalize(audio) + + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") + + # frame audio into + audio_framed = frame( + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length + ) + + logging.info(f"Framed audio shape: {audio_framed.shape}") + logging.info(f"First frame: {audio_framed[0]}") + + # convert to frequency domain + # For simplicity we will choose the Hanning window. + window = get_window("hann", ftt_size, fftbins=True) + audio_windowed = audio_framed * window + + logging.info(f"Windowed audio shape: {audio_windowed.shape}") + logging.info(f"First frame: {audio_windowed[0]}") + + audio_fft = calculate_fft(audio_windowed, ftt_size) + logging.info(f"fft audio shape: {audio_fft.shape}") + logging.info(f"First frame: {audio_fft[0]}") + + audio_power = calculate_signal_power(audio_fft) + logging.info(f"power audio shape: {audio_power.shape}") + logging.info(f"First frame: {audio_power[0]}") + + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) + logging.info(f"filters shape: {filters.shape}") + + audio_filtered = np.dot(filters, np.transpose(audio_power)) + audio_log = 10.0 * np.log10(audio_filtered) + logging.info(f"audio_log shape: {audio_log.shape}") + + dct_filters = dct(dct_filter_num, mel_filter_num) + cepstral_coefficents = np.dot(dct_filters, audio_log) + + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") + return cepstral_coefficents + + +def normalize(audio: np.ndarray) -> np.ndarray: + """ + Normalize an audio signal by scaling it to have values between -1 and 1. + + Args: + audio (np.ndarray): The input audio signal. + + Returns: + np.ndarray: The normalized audio signal. + """ + # Find the maximum absolute value in the audio signal + max_abs_value = np.max(np.abs(audio)) + + # Divide the entire audio signal by the maximum absolute value + normalized_audio = audio / max_abs_value + + return normalized_audio + + +def frame( + audio: np.ndarray, + sample_rate: int, + hop_length: int = 20, + ftt_size: int = 1024, +) -> np.ndarray: + """ + Split an audio signal into overlapping frames. + + Args: + audio (np.ndarray): The input audio signal. + sample_rate (int): The sample rate of the audio signal. + hop_length (Optional[int]): The length of the hopping (default is 20ms). + ftt_size (Optional[int]): The size of the FFT window (default is 1024). + + Returns: + np.ndarray: An array of overlapping frames. 
+ """ + + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) + + # Pad the audio signal to handle edge cases + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") + + # Calculate the number of frames + frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + + # Initialize an array to store the frames + frames = np.zeros((frame_num, ftt_size)) + + # Split the audio signal into frames + for n in range(frame_num): + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] + + return frames + + +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: + """ + Calculate the Fast Fourier Transform (FFT) of windowed audio data. + + Args: + audio_windowed (np.ndarray): The windowed audio signal. + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: The FFT of the audio data. + """ + # Transpose the audio data to have time in rows and channels in columns + audio_transposed = np.transpose(audio_windowed) + + # Initialize an array to store the FFT results + audio_fft = np.empty( + (int(1 + ftt_size // 2), audio_transposed.shape[1]), + dtype=np.complex64, + order="F", + ) + + # Compute FFT for each channel + for n in range(audio_fft.shape[1]): + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] + + # Transpose the FFT results back to the original shape + audio_fft = np.transpose(audio_fft) + + return audio_fft + + +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: + """ + Calculate the power of the audio signal from its FFT. + + Args: + audio_fft (np.ndarray): The FFT of the audio signal. + + Returns: + np.ndarray: The power of the audio signal. + """ + # Calculate the power by squaring the absolute values of the FFT coefficients + audio_power = np.square(np.abs(audio_fft)) + + return audio_power + + +def freq_to_mel(freq): + """ + Convert a frequency in Hertz to the mel scale. + + Args: + freq (float): The frequency in Hertz. + + Returns: + float: The frequency in mel scale. + """ + # Use the formula to convert frequency to the mel scale + return 2595.0 * np.log10(1.0 + freq / 700.0) + + +def mel_to_freq(mels): + """ + Convert a frequency in the mel scale to Hertz. + + Args: + mels (float): The frequency in mel scale. + + Returns: + float: The frequency in Hertz. + """ + # Use the formula to convert mel scale to frequency + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + +def mel_spaced_filterbank( + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 +) -> np.ndarray: + """ + Create a Mel-spaced filter bank for audio processing. + + Args: + sample_rate (int): The sample rate of the audio. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: Mel-spaced filter bank. 
+ """ + freq_min = 0 + freq_high = sample_rate // 2 + + logging.info(f"Minimum frequency: {freq_min}") + logging.info(f"Maximum frequency: {freq_high}") + + # Calculate filter points and mel frequencies + filter_points, mel_freqs = get_filter_points( + sample_rate, + freq_min, + freq_high, + mel_filter_num, + ftt_size, + ) + + filters = get_filters(filter_points, ftt_size) + + # normalize filters + # taken from the librosa library + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) + filters *= enorm[:, np.newaxis] + + return filters + + +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: + """ + Generate filters for audio processing. + + Args: + filter_points (list): A list of filter points. + ftt_size (int): The size of the FFT. + + Returns: + np.ndarray: A matrix of filters. + """ + num_filters = len(filter_points) - 2 + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) + + for n in range(num_filters): + start = filter_points[n] + mid = filter_points[n + 1] + end = filter_points[n + 2] + + # Linearly increase values from 0 to 1 + filters[n, start:mid] = np.linspace(0, 1, mid - start) + + # Linearly decrease values from 1 to 0 + filters[n, mid:end] = np.linspace(1, 0, end - mid) + + return filters + + +def get_filter_points( + sample_rate: int, + freq_min: int, + freq_high: int, + mel_filter_num: int = 10, + ftt_size: int = 1024, +): + """ + Calculate the filter points and frequencies for mel frequency filters. + + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + """ + + # Convert minimum and maximum frequencies to mel scale + fmin_mel = freq_to_mel(freq_min) + fmax_mel = freq_to_mel(freq_high) + + logging.info(f"MEL min: {fmin_mel}") + logging.info(f"MEL max: {fmax_mel}") + + # Generate equally spaced mel frequencies + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) + + # Convert mel frequencies back to Hertz + freqs = mel_to_freq(mels) + + # Calculate filter points as integer values + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) + + return filter_points, freqs + + +def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: + """ + Compute the Discrete Cosine Transform (DCT) basis matrix. + + Args: + dct_filter_num (int): The number of DCT filters to generate. + filter_num (int): The number of the fbank filters. + + Returns: + np.ndarray: The DCT basis matrix. 
+ """ + basis = np.empty((dct_filter_num, filter_num)) + basis[0, :] = 1.0 / np.sqrt(filter_num) + + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) + + for i in range(1, dct_filter_num): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) + + return basis + + +if __name__ == "__main__": + TRAIN_PATH = "./signal_processing/" + sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") + + print(mfcc(audio, sample_rate)) + + import doctest + + doctest.testmod() From 088eaee190683c370a5307676bba26bed8074338 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:46:00 +0330 Subject: [PATCH 08/18] Add standalone usage in comments --- machine_learning/mfcc.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9144846bbb5d..9fea66ee4cca 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -62,7 +62,6 @@ import numpy as np import scipy.fftpack as fft -from scipy.io import wavfile from scipy.signal import get_window logging.basicConfig(level=logging.WARNING) @@ -386,10 +385,10 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: if __name__ == "__main__": - TRAIN_PATH = "./signal_processing/" - sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") - - print(mfcc(audio, sample_rate)) + # from scipy.io import wavfile + # wav_file_path = "./path-to-file/sample.wav" + # sample_rate, audio = wavfile.read(wav_file_path) + # mfccs = mfcc(audio, sample_rate) import doctest From 554d52b35b9cf7892428c206836e808727d66446 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:01:17 +0330 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Christian Clauss --- machine_learning/mfcc.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9fea66ee4cca..ee7af7c8a2bd 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -1,5 +1,5 @@ """ -MFCC (Mel Frequency Cepstral Coefficients) Calculation +Mel Frequency Cepstral Coefficients (MFCC) Calculation MFCC is a feature widely used in audio and speech processing to represent the short-term power spectrum of a sound signal in a more compact and @@ -138,12 +138,10 @@ def normalize(audio: np.ndarray) -> np.ndarray: max_abs_value = np.max(np.abs(audio)) # Divide the entire audio signal by the maximum absolute value - normalized_audio = audio / max_abs_value + return audio / max_abs_value - return normalized_audio - -def frame( +def audio_frames( audio: np.ndarray, sample_rate: int, hop_length: int = 20, @@ -168,7 +166,7 @@ def frame( audio = np.pad(audio, int(ftt_size / 2), mode="reflect") # Calculate the number of frames - frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames frames = np.zeros((frame_num, ftt_size)) @@ -206,9 +204,7 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] # Transpose the FFT results back to the original shape - audio_fft = np.transpose(audio_fft) - - return audio_fft + return np.transpose(audio_fft) def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: @@ -289,9 +285,7 @@ def mel_spaced_filterbank( # normalize filters # taken from the librosa library enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 
2] - mel_freqs[:mel_filter_num]) - filters *= enorm[:, np.newaxis] - - return filters + return filters * enorm[:, np.newaxis] def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: From f78fd1b34557d36c63ed8e1666f216b168ed373f Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:37:33 +0330 Subject: [PATCH 10/18] [main] Fix typo due to auto review change --- machine_learning/mfcc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index ee7af7c8a2bd..5f792fe242be 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -87,7 +87,7 @@ def mfcc( logging.info(f"Normalized audio max: {np.max(audio_normalized)}") # frame audio into - audio_framed = frame( + audio_framed = audio_frames( audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length ) @@ -169,10 +169,10 @@ def audio_frames( frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames - frames = np.zeros((frame_num, ftt_size)) + frames = np.zeros((frame_count, ftt_size)) # Split the audio signal into frames - for n in range(frame_num): + for n in range(frame_count): frames[n] = audio[n * hop_size : n * hop_size + ftt_size] return frames From cb9d9df1e3e4bef78f844ecb18680a7b2ce34d7f Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:37:00 +0330 Subject: [PATCH 11/18] Add doctests for all functions --- machine_learning/mfcc.py | 84 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 5f792fe242be..fc4efaf3a7d4 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -133,6 +133,14 @@ def normalize(audio: np.ndarray) -> np.ndarray: Returns: np.ndarray: The normalized audio signal. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5]) + >>> normalized_audio = normalize(audio) + >>> np.max(normalized_audio) + 1.0 + >>> np.min(normalized_audio) + 0.2 """ # Find the maximum absolute value in the audio signal max_abs_value = np.max(np.abs(audio)) @@ -158,6 +166,14 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. + + Examples: + >>> import numpy as np + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) + >>> sample_rate = 8000 + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) + >>> frames.shape + (126, 512) """ hop_size = np.round(sample_rate * hop_length / 1000).astype(int) @@ -188,6 +204,13 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra Returns: np.ndarray: The FFT of the audio data. + + Examples: + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -216,6 +239,13 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Returns: np.ndarray: The power of the audio signal. 
+ + Examples: + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -232,6 +262,10 @@ def freq_to_mel(freq): Returns: float: The frequency in mel scale. + + Examples: + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) @@ -246,6 +280,10 @@ def mel_to_freq(mels): Returns: float: The frequency in Hertz. + + Examples: + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -264,6 +302,10 @@ def mel_spaced_filterbank( Returns: np.ndarray: Mel-spaced filter bank. + + Examples: + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -298,6 +340,10 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Returns: np.ndarray: A matrix of filters. + + Examples: + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape + (4, 257) """ num_filters = len(filter_points) - 2 filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) @@ -335,8 +381,11 @@ def get_filter_points( Returns: Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. - """ +Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) + """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) fmax_mel = freq_to_mel(freq_high) @@ -366,6 +415,9 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: Returns: np.ndarray: The DCT basis matrix. + Examples: + >>> round(dct(3, 5)[0][0], 5) + 0.44721 """ basis = np.empty((dct_filter_num, filter_num)) basis[0, :] = 1.0 / np.sqrt(filter_num) @@ -378,12 +430,30 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: return basis -if __name__ == "__main__": - # from scipy.io import wavfile - # wav_file_path = "./path-to-file/sample.wav" - # sample_rate, audio = wavfile.read(wav_file_path) - # mfccs = mfcc(audio, sample_rate) +def example(wav_file_path="./path-to-file/sample.wav"): + """ + Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. - import doctest + Args: + wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + + Returns: + np.ndarray: The computed MFCCs for the audio. 
+ """ + from scipy.io import wavfile + + try: + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) + + return mfccs + + except Exception as e: + logging.error(f"Error processing audio: {str(e)}") + return None +if __name__ == "__main__": + import doctest doctest.testmod() From d6bdf63e35f7043439785d7879d78fea40ab6e1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 17 Sep 2023 09:08:06 +0000 Subject: [PATCH 12/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/mfcc.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index fc4efaf3a7d4..da402e8da6f0 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -166,7 +166,7 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. - + Examples: >>> import numpy as np >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) @@ -370,21 +370,21 @@ def get_filter_points( ftt_size: int = 1024, ): """ - Calculate the filter points and frequencies for mel frequency filters. + Calculate the filter points and frequencies for mel frequency filters. - Args: - sample_rate (int): The sample rate of the audio. - freq_min (int): The minimum frequency in Hertz. - freq_high (int): The maximum frequency in Hertz. - mel_filter_num (Optional[int]): The number of mel filters (default is 10). - ftt_size (Optional[int]): The size of the FFT (default is 1024). + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). - Returns: - Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. 
-Examples: - >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] - array([ 0, 20, 51, 95, 161, 256]) + Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) @@ -449,11 +449,13 @@ def example(wav_file_path="./path-to-file/sample.wav"): mfccs = mfcc(audio, sample_rate) return mfccs - + except Exception as e: logging.error(f"Error processing audio: {str(e)}") return None + if __name__ == "__main__": import doctest + doctest.testmod() From bda7ede7e5ca9515cbbf720b0e36209699a28fee Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:59:54 +0330 Subject: [PATCH 13/18] Fix some pre-commit issues --- machine_learning/mfcc.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 39d72584438e..0079363196f8 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -209,7 +209,9 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra >>> import numpy as np >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) - >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + >>> np.allclose(\ + audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])\ + ) True """ # Transpose the audio data to have time in rows and channels in columns @@ -433,26 +435,22 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: def example(wav_file_path="./path-to-file/sample.wav"): """ - Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. + Example function to calculate MFCCs from an audio file. Args: - wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + wav_file_path (str): The path to the WAV audio file. Returns: np.ndarray: The computed MFCCs for the audio. """ from scipy.io import wavfile - try: - # Load the audio from the WAV file - sample_rate, audio = wavfile.read(wav_file_path) - # Calculate MFCCs - mfccs = mfcc(audio, sample_rate) + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) - return mfccs - except Exception as e: - logging.error(f"Error processing audio: {str(e)}") - return None + return mfccs if __name__ == "__main__": From 2fde552cb2561114acf792dd1a8cbb979f4396a2 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 14:19:35 +0330 Subject: [PATCH 14/18] Update review issues * Remove types from docstring * Rename dct * Add mfcc docstring * Add typing to several functions --- machine_learning/mfcc.py | 181 ++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 78 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 0079363196f8..c3d85b526112 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -75,6 +75,33 @@ def mfcc( mel_filter_num: int = 10, dct_filter_num: int = 40, ) -> np.ndarray: + """ + Calculate Mel Frequency Cepstral Coefficients (MFCCs) from an audio signal. + + Args: + audio: The input audio signal. + sample_rate: The sample rate of the audio signal (in Hz). + ftt_size: The size of the FFT window (default is 1024). 
+        hop_length: The hop length for frame creation (default is 20ms).
+        mel_filter_num: The number of Mel filters (default is 10).
+        dct_filter_num: The number of DCT filters (default is 40).
+
+    Returns:
+        A matrix of MFCCs for the input audio.
+
+    Raises:
+        ValueError: If the input audio is empty.
+
+    Example:
+    >>> import numpy as np
+    >>> sample_rate = 44100  # Sample rate of 44.1 kHz
+    >>> duration = 2.0  # Duration of 2 seconds
+    >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+    >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # Generate a 440 Hz sine wave
+    >>> mfccs = mfcc(audio, sample_rate)
+    >>> mfccs.shape
+    (40, 101)
+    """
     logging.info(f"Sample rate: {sample_rate}Hz")
     logging.info(f"Audio duration: {len(audio) / sample_rate}s")
     logging.info(f"Audio min: {np.min(audio)}")
@@ -117,7 +144,7 @@ def mfcc(
     audio_log = 10.0 * np.log10(audio_filtered)
     logging.info(f"audio_log shape: {audio_log.shape}")
 
-    dct_filters = dct(dct_filter_num, mel_filter_num)
+    dct_filters = discrete_cosine_transform(dct_filter_num, mel_filter_num)
     cepstral_coefficents = np.dot(dct_filters, audio_log)
 
     logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}")
@@ -129,18 +156,18 @@ def normalize(audio: np.ndarray) -> np.ndarray:
     Normalize an audio signal by scaling it to have values between -1 and 1.
 
     Args:
-        audio (np.ndarray): The input audio signal.
+        audio: The input audio signal.
 
     Returns:
-        np.ndarray: The normalized audio signal.
+        The normalized audio signal.
 
     Examples:
-        >>> audio = np.array([1, 2, 3, 4, 5])
-        >>> normalized_audio = normalize(audio)
-        >>> np.max(normalized_audio)
-        1.0
-        >>> np.min(normalized_audio)
-        0.2
+    >>> audio = np.array([1, 2, 3, 4, 5])
+    >>> normalized_audio = normalize(audio)
+    >>> np.max(normalized_audio)
+    1.0
+    >>> np.min(normalized_audio)
+    0.2
     """
     # Find the maximum absolute value in the audio signal
     max_abs_value = np.max(np.abs(audio))
@@ -159,21 +186,21 @@ def audio_frames(
     Split an audio signal into overlapping frames.
 
     Args:
-        audio (np.ndarray): The input audio signal.
-        sample_rate (int): The sample rate of the audio signal.
-        hop_length (Optional[int]): The length of the hopping (default is 20ms).
-        ftt_size (Optional[int]): The size of the FFT window (default is 1024).
+        audio: The input audio signal.
+        sample_rate: The sample rate of the audio signal.
+        hop_length: The length of the hopping (default is 20ms).
+        ftt_size: The size of the FFT window (default is 1024).
 
     Returns:
-        np.ndarray: An array of overlapping frames.
+        An array of overlapping frames.
 
     Examples:
-        >>> import numpy as np
-        >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
-        >>> sample_rate = 8000
-        >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
-        >>> frames.shape
-        (126, 512)
+    >>> import numpy as np
+    >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
+    >>> sample_rate = 8000
+    >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
+    >>> frames.shape
+    (126, 512)
     """
 
     hop_size = np.round(sample_rate * hop_length / 1000).astype(int)
@@ -199,20 +226,18 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra
     Calculate the Fast Fourier Transform (FFT) of windowed audio data.
 
     Args:
-        audio_windowed (np.ndarray): The windowed audio signal.
-        ftt_size (Optional[int]): The size of the FFT (default is 1024).
+        audio_windowed: The windowed audio signal.
+        ftt_size: The size of the FFT (default is 1024).
 
     Returns:
-        np.ndarray: The FFT of the audio data.
+ The FFT of the audio data. Examples: - >>> import numpy as np - >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) - >>> np.allclose(\ - audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])\ - ) - True + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -237,17 +262,17 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Calculate the power of the audio signal from its FFT. Args: - audio_fft (np.ndarray): The FFT of the audio signal. + audio_fft: The FFT of the audio signal. Returns: - np.ndarray: The power of the audio signal. + The power of the audio signal. Examples: - >>> import numpy as np - >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) - >>> power = calculate_signal_power(audio_fft) - >>> np.allclose(power, np.array([5, 13, 25, 41])) - True + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -255,37 +280,37 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: return audio_power -def freq_to_mel(freq): +def freq_to_mel(freq: float) -> float: """ Convert a frequency in Hertz to the mel scale. Args: - freq (float): The frequency in Hertz. + freq: The frequency in Hertz. Returns: - float: The frequency in mel scale. + The frequency in mel scale. Examples: - >>> round(freq_to_mel(1000), 2) - 999.99 + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) -def mel_to_freq(mels): +def mel_to_freq(mels: float) -> float: """ Convert a frequency in the mel scale to Hertz. Args: - mels (float): The frequency in mel scale. + mels: The frequency in mel scale. Returns: - float: The frequency in Hertz. + The frequency in Hertz. Examples: - >>> round(mel_to_freq(999.99), 2) - 1000.01 + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -298,16 +323,16 @@ def mel_spaced_filterbank( Create a Mel-spaced filter bank for audio processing. Args: - sample_rate (int): The sample rate of the audio. - mel_filter_num (Optional[int]): The number of mel filters (default is 10). - ftt_size (Optional[int]): The size of the FFT (default is 1024). + sample_rate: The sample rate of the audio. + mel_filter_num: The number of mel filters (default is 10). + ftt_size: The size of the FFT (default is 1024). Returns: - np.ndarray: Mel-spaced filter bank. + Mel-spaced filter bank. Examples: - >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) - 0.0004603981 + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -337,15 +362,15 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Generate filters for audio processing. Args: - filter_points (list): A list of filter points. - ftt_size (int): The size of the FFT. 
+        filter_points: An array of filter points.
+        ftt_size: The size of the FFT.

     Returns:
-        np.ndarray: A matrix of filters.
+        A matrix of filters.

     Examples:
-        >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape
-        (4, 257)
+    >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape
+    (4, 257)
     """
     num_filters = len(filter_points) - 2
     filters = np.zeros((num_filters, int(ftt_size / 2) + 1))
@@ -375,18 +400,18 @@ def get_filter_points(
     Calculate the filter points and frequencies for mel frequency filters.

     Args:
-        sample_rate (int): The sample rate of the audio.
-        freq_min (int): The minimum frequency in Hertz.
-        freq_high (int): The maximum frequency in Hertz.
-        mel_filter_num (Optional[int]): The number of mel filters (default is 10).
-        ftt_size (Optional[int]): The size of the FFT (default is 1024).
+        sample_rate: The sample rate of the audio.
+        freq_min: The minimum frequency in Hertz.
+        freq_high: The maximum frequency in Hertz.
+        mel_filter_num: The number of mel filters (default is 10).
+        ftt_size: The size of the FFT (default is 1024).

     Returns:
-        Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies.
+        Filter points and corresponding frequencies.

     Examples:
-        >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
-        array([  0,  20,  51,  95, 161, 256])
+    >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
+    array([  0,  20,  51,  95, 161, 256])
     """
     # Convert minimum and maximum frequencies to mel scale
     fmin_mel = freq_to_mel(freq_min)
@@ -407,20 +432,20 @@ def get_filter_points(
     return filter_points, freqs


-def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
+def discrete_cosine_transform(dct_filter_num: int, filter_num: int) -> np.ndarray:
     """
     Compute the Discrete Cosine Transform (DCT) basis matrix.

     Args:
-        dct_filter_num (int): The number of DCT filters to generate.
-        filter_num (int): The number of the fbank filters.
+        dct_filter_num: The number of DCT filters to generate.
+        filter_num: The number of filterbank filters.

     Returns:
-        np.ndarray: The DCT basis matrix.
+        The DCT basis matrix.

     Examples:
-        >>> round(dct(3, 5)[0][0], 5)
-        0.44721
+    >>> round(discrete_cosine_transform(3, 5)[0][0], 5)
+    0.44721
     """
     basis = np.empty((dct_filter_num, filter_num))
     basis[0, :] = 1.0 / np.sqrt(filter_num)
@@ -433,12 +458,13 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
     return basis


-def example(wav_file_path="./path-to-file/sample.wav"):
+def example(wav_file_path: str = "./path-to-file/sample.wav") -> np.ndarray:
     """
-    Example function to calculate MFCCs from an audio file.
+    Example function to calculate Mel Frequency Cepstral Coefficients
+    (MFCCs) from an audio file.

     Args:
-        wav_file_path (str): The path to the WAV audio file.
+        wav_file_path: The path to the WAV audio file.

     Returns:
         np.ndarray: The computed MFCCs for the audio.
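The doctest values above can be reproduced outside the module with a few lines of NumPy. This is a quick sanity check, not part of the patch itself: the mel-scale formulas are the ones quoted in freq_to_mel() and mel_to_freq(), but the final frequency-to-FFT-bin mapping floor((ftt_size + 1) * freq / sample_rate) is an assumption about the elided body of get_filter_points(); it does reproduce the doctest output.

import numpy as np

# Recreate get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)
fmin_mel = 2595.0 * np.log10(1.0 + 0 / 700.0)      # freq_to_mel(0) -> 0.0
fmax_mel = 2595.0 * np.log10(1.0 + 4000 / 700.0)   # freq_to_mel(4000)
mels = np.linspace(fmin_mel, fmax_mel, num=4 + 2)  # mel_filter_num + 2 points
freqs = 700.0 * (10.0 ** (mels / 2595.0) - 1.0)    # back to Hertz, elementwise
filter_points = np.floor((512 + 1) / 8000 * freqs).astype(int)
print(filter_points)  # [  0  20  51  95 161 256]

The mel_filter_num + 2 points are needed because each triangular filter spans three consecutive points: its left edge, its peak, and its right edge.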
@@ -447,10 +473,9 @@ def example(wav_file_path="./path-to-file/sample.wav"):
     """
     # Load the audio from the WAV file
     sample_rate, audio = wavfile.read(wav_file_path)

-    # Calculate MFCCs
-    mfccs = mfcc(audio, sample_rate)
-    return mfccs
+    # Calculate MFCCs
+    return mfcc(audio, sample_rate)


 if __name__ == "__main__":

From ad0eeefd6df76d26ce537e5393b383ea2b550ac4 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 15:27:28 +0200
Subject: [PATCH 15/18] Apply suggestions from code review

---
 machine_learning/mfcc.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index c3d85b526112..3edddca77774 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -93,7 +93,6 @@ def mfcc(
         ValueError: If the input audio is empty.

     Example:
-    >>> import numpy as np
     >>> sample_rate = 44100  # Sample rate of 44.1 kHz
     >>> duration = 2.0  # Duration of 2 seconds
     >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
     >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # Generate a 440 Hz sine wave
@@ -195,7 +194,6 @@ def audio_frames(
         An array of overlapping frames.

     Examples:
-    >>> import numpy as np
     >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
     >>> sample_rate = 8000
     >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
     >>> frames.shape
     (126, 512)
@@ -233,7 +231,6 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra
         The FFT of the audio data.

     Examples:
-    >>> import numpy as np
     >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4)
     >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]))
@@ -268,7 +265,6 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
         The power of the audio signal.

     Examples:
-    >>> import numpy as np
     >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j])
     >>> power = calculate_signal_power(audio_fft)
     >>> np.allclose(power, np.array([5, 13, 25, 41]))

From 7e38bc3d8189855ad8ef3fa0913f47de235a7aec Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 22:58:29 +0200
Subject: [PATCH 16/18] Update mfcc.py

---
 machine_learning/mfcc.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 3edddca77774..13ed1b96d8c8 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -1,12 +1,12 @@
 """
 Mel Frequency Cepstral Coefficients (MFCC) Calculation

-MFCC is a feature widely used in audio and speech processing to represent the
+MFCC is an algorythm widely used in audio and speech processing to represent the
 short-term power spectrum of a sound signal in a more compact and
 discriminative way. It is particularly popular in speech and audio processing
 tasks such as speech recognition and speaker identification.

-How MFCC is Calculated:
+How Mel Frequency Cepstral Coefficients are Calculated:
 1. Preprocessing:
     - Load an audio signal and normalize it to ensure that the values fall
       within a specific range (e.g., between -1 and 1).
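Step 1 above, peak normalization into the -1 to 1 range, is worth seeing on concrete numbers. A minimal sketch, not part of the patch, using the same one-liner that normalize() is reduced to in the next hunk:

import numpy as np

# Dividing by the largest absolute sample maps the loudest sample to +/-1
# while preserving the relative shape of the waveform.
audio = np.array([0.1, -0.4, 0.2])
normalized = audio / np.max(np.abs(audio))
print(normalized)  # [ 0.25 -1.    0.5 ]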
@@ -64,7 +64,7 @@
 import scipy.fftpack as fft
 from scipy.signal import get_window

-logging.basicConfig(level=logging.WARNING)
+logging.basicConfig(filename=f"{__file__}.log", level=logging.INFO)


 def mfcc(
@@ -168,11 +168,8 @@ def normalize(audio: np.ndarray) -> np.ndarray:
     >>> np.min(normalized_audio)
     0.2
     """
-    # Find the maximum absolute value in the audio signal
-    max_abs_value = np.max(np.abs(audio))
-
     # Divide the entire audio signal by the maximum absolute value
-    return audio / max_abs_value
+    return audio / np.max(np.abs(audio))


 def audio_frames(
@@ -271,9 +268,7 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
     True
     """
     # Calculate the power by squaring the absolute values of the FFT coefficients
-    audio_power = np.square(np.abs(audio_fft))
-
-    return audio_power
+    return np.square(np.abs(audio_fft))


 def freq_to_mel(freq: float) -> float:
@@ -406,8 +401,12 @@ def get_filter_points(
         Filter points and corresponding frequencies.

     Examples:
-    >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
+    >>> filter_points = get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)
+    >>> filter_points[0]
     array([  0,  20,  51,  95, 161, 256])
+    >>> filter_points[1]
+    array([   0.        ,  324.46707094,  799.33254207, 1494.30973963,
+           2511.42581671, 4000.        ])
     """
     # Convert minimum and maximum frequencies to mel scale
     fmin_mel = freq_to_mel(freq_min)

From f410bf239f06acd45df478f5df9866b7977965cc Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 23:03:31 +0200
Subject: [PATCH 17/18] get_filter_points() -> tuple[np.ndarray, np.ndarray]:

---
 machine_learning/mfcc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 13ed1b96d8c8..3c34d9e8149e 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -386,7 +386,7 @@ def get_filter_points(
     freq_high: int,
     mel_filter_num: int = 10,
     ftt_size: int = 1024,
-):
+) -> tuple[np.ndarray, np.ndarray]:
     """
     Calculate the filter points and frequencies for mel frequency filters.

From e143e5b12f88742e3739fd76f5c4e1bd06ec79dd Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 23:04:57 +0200
Subject: [PATCH 18/18] algorithm

---
 machine_learning/mfcc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 3c34d9e8149e..7ce8ceb50ff2 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -1,7 +1,7 @@
 """
 Mel Frequency Cepstral Coefficients (MFCC) Calculation

-MFCC is an algorythm widely used in audio and speech processing to represent the
+MFCC is an algorithm widely used in audio and speech processing to represent the
 short-term power spectrum of a sound signal in a more compact and
 discriminative way. It is particularly popular in speech and audio processing
 tasks such as speech recognition and speaker identification.
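With all eighteen patches applied, the module can be smoke-tested end to end using the 440 Hz sine wave from its own doctest. A minimal sketch, assuming the repository root is on sys.path so that machine_learning/mfcc.py is importable as a package module:

import numpy as np

from machine_learning.mfcc import mfcc

sample_rate = 44100
duration = 2.0  # seconds
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # 440 Hz sine wave

mfccs = mfcc(audio, sample_rate)
print(mfccs.shape)  # (40, 101), matching the mfcc() doctest

The shape follows from the defaults: dct_filter_num = 40 rows, and 101 frames from two seconds of audio at a 20 ms hop with reflect-padding of half the FFT size on each side.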