From 264bc13fee42202f2298d59707edc7403ed486d3 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:39:01 +0330 Subject: [PATCH 01/18] Add MFCC feature extraction to machine learning --- machine_learning/mfcc.py | 396 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 machine_learning/mfcc.py diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py new file mode 100644 index 000000000000..9144846bbb5d --- /dev/null +++ b/machine_learning/mfcc.py @@ -0,0 +1,396 @@ +""" +MFCC (Mel Frequency Cepstral Coefficients) Calculation + +MFCC is a feature widely used in audio and speech processing to represent the +short-term power spectrum of a sound signal in a more compact and +discriminative way. It is particularly popular in speech and audio processing +tasks such as speech recognition and speaker identification. + +How MFCC is Calculated: +1. Preprocessing: + - Load an audio signal and normalize it to ensure that the values fall + within a specific range (e.g., between -1 and 1). + - Frame the audio signal into overlapping, fixed-length segments, typically + using a technique like windowing to reduce spectral leakage. + +2. Fourier Transform: + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it + from the time domain to the frequency domain. This results in a + representation of the audio frame as a sequence of frequency components. + +3. Power Spectrum: + - Calculate the power spectrum by taking the squared magnitude of each + frequency component obtained from the FFT. This step measures the energy + distribution across different frequency bands. + +4. Mel Filterbank: + - Apply a set of triangular filterbanks spaced in the Mel frequency scale + to the power spectrum. These filters mimic the human auditory system's + frequency response. Each filterbank sums the power spectrum values within + its band. + +5. Logarithmic Compression: + - Take the logarithm (typically base 10) of the filterbank values to + compress the dynamic range. This step mimics the logarithmic response of + the human ear to sound intensity. + +6. Discrete Cosine Transform (DCT): + - Apply the Discrete Cosine Transform to the log filterbank energies to + obtain the MFCC coefficients. This transformation helps decorrelate the + filterbank energies and captures the most important features of the audio + signal. + +7. Feature Extraction: + - Select a subset of the DCT coefficients to form the feature vector. + Often, the first few coefficients (e.g., 12-13) are used for most + applications. + +References: +- Mel-Frequency Cepstral Coefficients (MFCCs): + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +- Speech and Language Processing by Daniel Jurafsky & James H. 
Martin: + https://web.stanford.edu/~jurafsky/slp3/ +- Mel Frequency Cepstral Coefficient (MFCC) tutorial + http://practicalcryptography.com/miscellaneous/machine-learning + /guide-mel-frequency-cepstral-coefficients-mfccs/ + +Author: Amir Lavasani +""" + + +import logging + +import numpy as np +import scipy.fftpack as fft +from scipy.io import wavfile +from scipy.signal import get_window + +logging.basicConfig(level=logging.WARNING) + + +def mfcc( + audio: np.ndarray, + sample_rate: int, + ftt_size: int = 1024, + hop_length: int = 20, + mel_filter_num: int = 10, + dct_filter_num: int = 40, +) -> np.ndarray: + logging.info(f"Sample rate: {sample_rate}Hz") + logging.info(f"Audio duration: {len(audio) / sample_rate}s") + logging.info(f"Audio min: {np.min(audio)}") + logging.info(f"Audio max: {np.max(audio)}") + + # normalize audio + audio_normalized = normalize(audio) + + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") + + # frame audio into + audio_framed = frame( + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length + ) + + logging.info(f"Framed audio shape: {audio_framed.shape}") + logging.info(f"First frame: {audio_framed[0]}") + + # convert to frequency domain + # For simplicity we will choose the Hanning window. + window = get_window("hann", ftt_size, fftbins=True) + audio_windowed = audio_framed * window + + logging.info(f"Windowed audio shape: {audio_windowed.shape}") + logging.info(f"First frame: {audio_windowed[0]}") + + audio_fft = calculate_fft(audio_windowed, ftt_size) + logging.info(f"fft audio shape: {audio_fft.shape}") + logging.info(f"First frame: {audio_fft[0]}") + + audio_power = calculate_signal_power(audio_fft) + logging.info(f"power audio shape: {audio_power.shape}") + logging.info(f"First frame: {audio_power[0]}") + + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) + logging.info(f"filters shape: {filters.shape}") + + audio_filtered = np.dot(filters, np.transpose(audio_power)) + audio_log = 10.0 * np.log10(audio_filtered) + logging.info(f"audio_log shape: {audio_log.shape}") + + dct_filters = dct(dct_filter_num, mel_filter_num) + cepstral_coefficents = np.dot(dct_filters, audio_log) + + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") + return cepstral_coefficents + + +def normalize(audio: np.ndarray) -> np.ndarray: + """ + Normalize an audio signal by scaling it to have values between -1 and 1. + + Args: + audio (np.ndarray): The input audio signal. + + Returns: + np.ndarray: The normalized audio signal. + """ + # Find the maximum absolute value in the audio signal + max_abs_value = np.max(np.abs(audio)) + + # Divide the entire audio signal by the maximum absolute value + normalized_audio = audio / max_abs_value + + return normalized_audio + + +def frame( + audio: np.ndarray, + sample_rate: int, + hop_length: int = 20, + ftt_size: int = 1024, +) -> np.ndarray: + """ + Split an audio signal into overlapping frames. + + Args: + audio (np.ndarray): The input audio signal. + sample_rate (int): The sample rate of the audio signal. + hop_length (Optional[int]): The length of the hopping (default is 20ms). + ftt_size (Optional[int]): The size of the FFT window (default is 1024). + + Returns: + np.ndarray: An array of overlapping frames. 
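+
+    Example (illustrative, assuming the default 20ms hop and 1024-sample
+    window; a 2-second signal at 8000Hz then yields 101 frames):
+        >>> frame(np.arange(16000, dtype=float), 8000).shape
+        (101, 1024)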
+ """ + + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) + + # Pad the audio signal to handle edge cases + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") + + # Calculate the number of frames + frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + + # Initialize an array to store the frames + frames = np.zeros((frame_num, ftt_size)) + + # Split the audio signal into frames + for n in range(frame_num): + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] + + return frames + + +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: + """ + Calculate the Fast Fourier Transform (FFT) of windowed audio data. + + Args: + audio_windowed (np.ndarray): The windowed audio signal. + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: The FFT of the audio data. + """ + # Transpose the audio data to have time in rows and channels in columns + audio_transposed = np.transpose(audio_windowed) + + # Initialize an array to store the FFT results + audio_fft = np.empty( + (int(1 + ftt_size // 2), audio_transposed.shape[1]), + dtype=np.complex64, + order="F", + ) + + # Compute FFT for each channel + for n in range(audio_fft.shape[1]): + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] + + # Transpose the FFT results back to the original shape + audio_fft = np.transpose(audio_fft) + + return audio_fft + + +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: + """ + Calculate the power of the audio signal from its FFT. + + Args: + audio_fft (np.ndarray): The FFT of the audio signal. + + Returns: + np.ndarray: The power of the audio signal. + """ + # Calculate the power by squaring the absolute values of the FFT coefficients + audio_power = np.square(np.abs(audio_fft)) + + return audio_power + + +def freq_to_mel(freq): + """ + Convert a frequency in Hertz to the mel scale. + + Args: + freq (float): The frequency in Hertz. + + Returns: + float: The frequency in mel scale. + """ + # Use the formula to convert frequency to the mel scale + return 2595.0 * np.log10(1.0 + freq / 700.0) + + +def mel_to_freq(mels): + """ + Convert a frequency in the mel scale to Hertz. + + Args: + mels (float): The frequency in mel scale. + + Returns: + float: The frequency in Hertz. + """ + # Use the formula to convert mel scale to frequency + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + +def mel_spaced_filterbank( + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 +) -> np.ndarray: + """ + Create a Mel-spaced filter bank for audio processing. + + Args: + sample_rate (int): The sample rate of the audio. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: Mel-spaced filter bank. 
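+
+    Example (illustrative, shape only: one row per mel filter, one column
+    per FFT bin, i.e. ftt_size // 2 + 1 columns):
+        >>> mel_spaced_filterbank(8000, 10, 1024).shape
+        (10, 513)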
+ """ + freq_min = 0 + freq_high = sample_rate // 2 + + logging.info(f"Minimum frequency: {freq_min}") + logging.info(f"Maximum frequency: {freq_high}") + + # Calculate filter points and mel frequencies + filter_points, mel_freqs = get_filter_points( + sample_rate, + freq_min, + freq_high, + mel_filter_num, + ftt_size, + ) + + filters = get_filters(filter_points, ftt_size) + + # normalize filters + # taken from the librosa library + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) + filters *= enorm[:, np.newaxis] + + return filters + + +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: + """ + Generate filters for audio processing. + + Args: + filter_points (list): A list of filter points. + ftt_size (int): The size of the FFT. + + Returns: + np.ndarray: A matrix of filters. + """ + num_filters = len(filter_points) - 2 + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) + + for n in range(num_filters): + start = filter_points[n] + mid = filter_points[n + 1] + end = filter_points[n + 2] + + # Linearly increase values from 0 to 1 + filters[n, start:mid] = np.linspace(0, 1, mid - start) + + # Linearly decrease values from 1 to 0 + filters[n, mid:end] = np.linspace(1, 0, end - mid) + + return filters + + +def get_filter_points( + sample_rate: int, + freq_min: int, + freq_high: int, + mel_filter_num: int = 10, + ftt_size: int = 1024, +): + """ + Calculate the filter points and frequencies for mel frequency filters. + + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + """ + + # Convert minimum and maximum frequencies to mel scale + fmin_mel = freq_to_mel(freq_min) + fmax_mel = freq_to_mel(freq_high) + + logging.info(f"MEL min: {fmin_mel}") + logging.info(f"MEL max: {fmax_mel}") + + # Generate equally spaced mel frequencies + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) + + # Convert mel frequencies back to Hertz + freqs = mel_to_freq(mels) + + # Calculate filter points as integer values + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) + + return filter_points, freqs + + +def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: + """ + Compute the Discrete Cosine Transform (DCT) basis matrix. + + Args: + dct_filter_num (int): The number of DCT filters to generate. + filter_num (int): The number of the fbank filters. + + Returns: + np.ndarray: The DCT basis matrix. 
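+
+    Example (illustrative, shape only: dct_filter_num rows by filter_num
+    columns, with row 0 the constant vector 1 / sqrt(filter_num)):
+        >>> dct(4, 10).shape
+        (4, 10)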
+ """ + basis = np.empty((dct_filter_num, filter_num)) + basis[0, :] = 1.0 / np.sqrt(filter_num) + + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) + + for i in range(1, dct_filter_num): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) + + return basis + + +if __name__ == "__main__": + TRAIN_PATH = "./signal_processing/" + sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") + + print(mfcc(audio, sample_rate)) + + import doctest + + doctest.testmod() From a1cb36c5556f3ea1eeeea64a3141e42c0b0bc230 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:46:00 +0330 Subject: [PATCH 02/18] Add standalone usage in comments --- machine_learning/mfcc.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9144846bbb5d..9fea66ee4cca 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -62,7 +62,6 @@ import numpy as np import scipy.fftpack as fft -from scipy.io import wavfile from scipy.signal import get_window logging.basicConfig(level=logging.WARNING) @@ -386,10 +385,10 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: if __name__ == "__main__": - TRAIN_PATH = "./signal_processing/" - sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") - - print(mfcc(audio, sample_rate)) + # from scipy.io import wavfile + # wav_file_path = "./path-to-file/sample.wav" + # sample_rate, audio = wavfile.read(wav_file_path) + # mfccs = mfcc(audio, sample_rate) import doctest From 68d2e11e8b566447ba4ccd499192007703a36bff Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:01:17 +0330 Subject: [PATCH 03/18] Apply suggestions from code review Co-authored-by: Christian Clauss --- machine_learning/mfcc.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9fea66ee4cca..ee7af7c8a2bd 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -1,5 +1,5 @@ """ -MFCC (Mel Frequency Cepstral Coefficients) Calculation +Mel Frequency Cepstral Coefficients (MFCC) Calculation MFCC is a feature widely used in audio and speech processing to represent the short-term power spectrum of a sound signal in a more compact and @@ -138,12 +138,10 @@ def normalize(audio: np.ndarray) -> np.ndarray: max_abs_value = np.max(np.abs(audio)) # Divide the entire audio signal by the maximum absolute value - normalized_audio = audio / max_abs_value + return audio / max_abs_value - return normalized_audio - -def frame( +def audio_frames( audio: np.ndarray, sample_rate: int, hop_length: int = 20, @@ -168,7 +166,7 @@ def frame( audio = np.pad(audio, int(ftt_size / 2), mode="reflect") # Calculate the number of frames - frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames frames = np.zeros((frame_num, ftt_size)) @@ -206,9 +204,7 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] # Transpose the FFT results back to the original shape - audio_fft = np.transpose(audio_fft) - - return audio_fft + return np.transpose(audio_fft) def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: @@ -289,9 +285,7 @@ def mel_spaced_filterbank( # normalize filters # taken from the librosa library enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 
2] - mel_freqs[:mel_filter_num]) - filters *= enorm[:, np.newaxis] - - return filters + return filters * enorm[:, np.newaxis] def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: From 92d974814dc1b6a65e9cc26f4931ba8c3abeb96a Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Sat, 16 Sep 2023 18:12:31 -0400 Subject: [PATCH 04/18] Delete empty junk file (#9062) * updating DIRECTORY.md * updating DIRECTORY.md * Delete empty junk file * updating DIRECTORY.md * Fix ruff errors * Fix more ruff errors --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> --- DIRECTORY.md | 1 - arithmetic_analysis/junk.py | 0 computer_vision/haralick_descriptors.py | 8 +++++--- conversions/convert_number_to_words.py | 6 +++--- graphs/tarjans_scc.py | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) delete mode 100644 arithmetic_analysis/junk.py diff --git a/DIRECTORY.md b/DIRECTORY.md index 1b802564f939..d81e4ec1ee83 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -5,7 +5,6 @@ * [In Static Equilibrium](arithmetic_analysis/in_static_equilibrium.py) * [Intersection](arithmetic_analysis/intersection.py) * [Jacobi Iteration Method](arithmetic_analysis/jacobi_iteration_method.py) - * [Junk](arithmetic_analysis/junk.py) * [Lu Decomposition](arithmetic_analysis/lu_decomposition.py) * [Newton Forward Interpolation](arithmetic_analysis/newton_forward_interpolation.py) * [Newton Method](arithmetic_analysis/newton_method.py) diff --git a/arithmetic_analysis/junk.py b/arithmetic_analysis/junk.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/computer_vision/haralick_descriptors.py b/computer_vision/haralick_descriptors.py index 1a86d84ea14b..413cea304f6c 100644 --- a/computer_vision/haralick_descriptors.py +++ b/computer_vision/haralick_descriptors.py @@ -100,7 +100,9 @@ def binarize(image: np.ndarray, threshold: float = 127.0) -> np.ndarray: return np.where(image > threshold, 1, 0) -def transform(image: np.ndarray, kind: str, kernel: np.ndarray = None) -> np.ndarray: +def transform( + image: np.ndarray, kind: str, kernel: np.ndarray | None = None +) -> np.ndarray: """ Simple image transformation using one of two available filter functions: Erosion and Dilation. @@ -154,7 +156,7 @@ def transform(image: np.ndarray, kind: str, kernel: np.ndarray = None) -> np.nda return transformed -def opening_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: +def opening_filter(image: np.ndarray, kernel: np.ndarray | None = None) -> np.ndarray: """ Opening filter, defined as the sequence of erosion and then a dilation filter on the same image. @@ -172,7 +174,7 @@ def opening_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: return transform(transform(image, "dilation", kernel), "erosion", kernel) -def closing_filter(image: np.ndarray, kernel: np.ndarray = None) -> np.ndarray: +def closing_filter(image: np.ndarray, kernel: np.ndarray | None = None) -> np.ndarray: """ Opening filter, defined as the sequence of dilation and then erosion filter on the same image. 
diff --git a/conversions/convert_number_to_words.py b/conversions/convert_number_to_words.py index 0e4405319f1f..0c428928b31d 100644 --- a/conversions/convert_number_to_words.py +++ b/conversions/convert_number_to_words.py @@ -54,7 +54,7 @@ def max_value(cls, system: str) -> int: class NumberWords(Enum): - ONES: ClassVar = { + ONES: ClassVar[dict[int, str]] = { 0: "", 1: "one", 2: "two", @@ -67,7 +67,7 @@ class NumberWords(Enum): 9: "nine", } - TEENS: ClassVar = { + TEENS: ClassVar[dict[int, str]] = { 0: "ten", 1: "eleven", 2: "twelve", @@ -80,7 +80,7 @@ class NumberWords(Enum): 9: "nineteen", } - TENS: ClassVar = { + TENS: ClassVar[dict[int, str]] = { 2: "twenty", 3: "thirty", 4: "forty", diff --git a/graphs/tarjans_scc.py b/graphs/tarjans_scc.py index 30f8ca8a204f..dfd2e52704d5 100644 --- a/graphs/tarjans_scc.py +++ b/graphs/tarjans_scc.py @@ -77,7 +77,7 @@ def create_graph(n, edges): n_vertices = 7 source = [0, 0, 1, 2, 3, 3, 4, 4, 6] target = [1, 3, 2, 0, 1, 4, 5, 6, 5] - edges = [(u, v) for u, v in zip(source, target)] + edges = list(zip(source, target)) g = create_graph(n_vertices, edges) assert [[5], [6], [4], [3, 2, 1, 0]] == tarjan(g) From 6c4f90bcad94da12d0a033d79a4c11c4219d2327 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:37:33 +0330 Subject: [PATCH 05/18] [main] Fix typo due to auto review change --- machine_learning/mfcc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index ee7af7c8a2bd..5f792fe242be 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -87,7 +87,7 @@ def mfcc( logging.info(f"Normalized audio max: {np.max(audio_normalized)}") # frame audio into - audio_framed = frame( + audio_framed = audio_frames( audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length ) @@ -169,10 +169,10 @@ def audio_frames( frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames - frames = np.zeros((frame_num, ftt_size)) + frames = np.zeros((frame_count, ftt_size)) # Split the audio signal into frames - for n in range(frame_num): + for n in range(frame_count): frames[n] = audio[n * hop_size : n * hop_size + ftt_size] return frames From 1d843b5bdae0d62e2ecbf3434539a3cc392062a2 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:37:00 +0330 Subject: [PATCH 06/18] Add doctests for all functions --- machine_learning/mfcc.py | 84 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 5f792fe242be..fc4efaf3a7d4 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -133,6 +133,14 @@ def normalize(audio: np.ndarray) -> np.ndarray: Returns: np.ndarray: The normalized audio signal. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5]) + >>> normalized_audio = normalize(audio) + >>> np.max(normalized_audio) + 1.0 + >>> np.min(normalized_audio) + 0.2 """ # Find the maximum absolute value in the audio signal max_abs_value = np.max(np.abs(audio)) @@ -158,6 +166,14 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. 
+ + Examples: + >>> import numpy as np + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) + >>> sample_rate = 8000 + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) + >>> frames.shape + (126, 512) """ hop_size = np.round(sample_rate * hop_length / 1000).astype(int) @@ -188,6 +204,13 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra Returns: np.ndarray: The FFT of the audio data. + + Examples: + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -216,6 +239,13 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Returns: np.ndarray: The power of the audio signal. + + Examples: + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -232,6 +262,10 @@ def freq_to_mel(freq): Returns: float: The frequency in mel scale. + + Examples: + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) @@ -246,6 +280,10 @@ def mel_to_freq(mels): Returns: float: The frequency in Hertz. + + Examples: + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -264,6 +302,10 @@ def mel_spaced_filterbank( Returns: np.ndarray: Mel-spaced filter bank. + + Examples: + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -298,6 +340,10 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Returns: np.ndarray: A matrix of filters. + + Examples: + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape + (4, 257) """ num_filters = len(filter_points) - 2 filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) @@ -335,8 +381,11 @@ def get_filter_points( Returns: Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. - """ +Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) + """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) fmax_mel = freq_to_mel(freq_high) @@ -366,6 +415,9 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: Returns: np.ndarray: The DCT basis matrix. + Examples: + >>> round(dct(3, 5)[0][0], 5) + 0.44721 """ basis = np.empty((dct_filter_num, filter_num)) basis[0, :] = 1.0 / np.sqrt(filter_num) @@ -378,12 +430,30 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: return basis -if __name__ == "__main__": - # from scipy.io import wavfile - # wav_file_path = "./path-to-file/sample.wav" - # sample_rate, audio = wavfile.read(wav_file_path) - # mfccs = mfcc(audio, sample_rate) +def example(wav_file_path="./path-to-file/sample.wav"): + """ + Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. 
- import doctest + Args: + wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + + Returns: + np.ndarray: The computed MFCCs for the audio. + """ + from scipy.io import wavfile + + try: + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) + + return mfccs + + except Exception as e: + logging.error(f"Error processing audio: {str(e)}") + return None +if __name__ == "__main__": + import doctest doctest.testmod() From d14cc0949d8265d4501dff58c9c9a1999935a481 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:39:01 +0330 Subject: [PATCH 07/18] Add MFCC feature extraction to machine learning --- machine_learning/mfcc.py | 396 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 machine_learning/mfcc.py diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py new file mode 100644 index 000000000000..9144846bbb5d --- /dev/null +++ b/machine_learning/mfcc.py @@ -0,0 +1,396 @@ +""" +MFCC (Mel Frequency Cepstral Coefficients) Calculation + +MFCC is a feature widely used in audio and speech processing to represent the +short-term power spectrum of a sound signal in a more compact and +discriminative way. It is particularly popular in speech and audio processing +tasks such as speech recognition and speaker identification. + +How MFCC is Calculated: +1. Preprocessing: + - Load an audio signal and normalize it to ensure that the values fall + within a specific range (e.g., between -1 and 1). + - Frame the audio signal into overlapping, fixed-length segments, typically + using a technique like windowing to reduce spectral leakage. + +2. Fourier Transform: + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it + from the time domain to the frequency domain. This results in a + representation of the audio frame as a sequence of frequency components. + +3. Power Spectrum: + - Calculate the power spectrum by taking the squared magnitude of each + frequency component obtained from the FFT. This step measures the energy + distribution across different frequency bands. + +4. Mel Filterbank: + - Apply a set of triangular filterbanks spaced in the Mel frequency scale + to the power spectrum. These filters mimic the human auditory system's + frequency response. Each filterbank sums the power spectrum values within + its band. + +5. Logarithmic Compression: + - Take the logarithm (typically base 10) of the filterbank values to + compress the dynamic range. This step mimics the logarithmic response of + the human ear to sound intensity. + +6. Discrete Cosine Transform (DCT): + - Apply the Discrete Cosine Transform to the log filterbank energies to + obtain the MFCC coefficients. This transformation helps decorrelate the + filterbank energies and captures the most important features of the audio + signal. + +7. Feature Extraction: + - Select a subset of the DCT coefficients to form the feature vector. + Often, the first few coefficients (e.g., 12-13) are used for most + applications. + +References: +- Mel-Frequency Cepstral Coefficients (MFCCs): + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +- Speech and Language Processing by Daniel Jurafsky & James H. 
Martin: + https://web.stanford.edu/~jurafsky/slp3/ +- Mel Frequency Cepstral Coefficient (MFCC) tutorial + http://practicalcryptography.com/miscellaneous/machine-learning + /guide-mel-frequency-cepstral-coefficients-mfccs/ + +Author: Amir Lavasani +""" + + +import logging + +import numpy as np +import scipy.fftpack as fft +from scipy.io import wavfile +from scipy.signal import get_window + +logging.basicConfig(level=logging.WARNING) + + +def mfcc( + audio: np.ndarray, + sample_rate: int, + ftt_size: int = 1024, + hop_length: int = 20, + mel_filter_num: int = 10, + dct_filter_num: int = 40, +) -> np.ndarray: + logging.info(f"Sample rate: {sample_rate}Hz") + logging.info(f"Audio duration: {len(audio) / sample_rate}s") + logging.info(f"Audio min: {np.min(audio)}") + logging.info(f"Audio max: {np.max(audio)}") + + # normalize audio + audio_normalized = normalize(audio) + + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") + + # frame audio into + audio_framed = frame( + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length + ) + + logging.info(f"Framed audio shape: {audio_framed.shape}") + logging.info(f"First frame: {audio_framed[0]}") + + # convert to frequency domain + # For simplicity we will choose the Hanning window. + window = get_window("hann", ftt_size, fftbins=True) + audio_windowed = audio_framed * window + + logging.info(f"Windowed audio shape: {audio_windowed.shape}") + logging.info(f"First frame: {audio_windowed[0]}") + + audio_fft = calculate_fft(audio_windowed, ftt_size) + logging.info(f"fft audio shape: {audio_fft.shape}") + logging.info(f"First frame: {audio_fft[0]}") + + audio_power = calculate_signal_power(audio_fft) + logging.info(f"power audio shape: {audio_power.shape}") + logging.info(f"First frame: {audio_power[0]}") + + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) + logging.info(f"filters shape: {filters.shape}") + + audio_filtered = np.dot(filters, np.transpose(audio_power)) + audio_log = 10.0 * np.log10(audio_filtered) + logging.info(f"audio_log shape: {audio_log.shape}") + + dct_filters = dct(dct_filter_num, mel_filter_num) + cepstral_coefficents = np.dot(dct_filters, audio_log) + + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") + return cepstral_coefficents + + +def normalize(audio: np.ndarray) -> np.ndarray: + """ + Normalize an audio signal by scaling it to have values between -1 and 1. + + Args: + audio (np.ndarray): The input audio signal. + + Returns: + np.ndarray: The normalized audio signal. + """ + # Find the maximum absolute value in the audio signal + max_abs_value = np.max(np.abs(audio)) + + # Divide the entire audio signal by the maximum absolute value + normalized_audio = audio / max_abs_value + + return normalized_audio + + +def frame( + audio: np.ndarray, + sample_rate: int, + hop_length: int = 20, + ftt_size: int = 1024, +) -> np.ndarray: + """ + Split an audio signal into overlapping frames. + + Args: + audio (np.ndarray): The input audio signal. + sample_rate (int): The sample rate of the audio signal. + hop_length (Optional[int]): The length of the hopping (default is 20ms). + ftt_size (Optional[int]): The size of the FFT window (default is 1024). + + Returns: + np.ndarray: An array of overlapping frames. 
+ """ + + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) + + # Pad the audio signal to handle edge cases + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") + + # Calculate the number of frames + frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + + # Initialize an array to store the frames + frames = np.zeros((frame_num, ftt_size)) + + # Split the audio signal into frames + for n in range(frame_num): + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] + + return frames + + +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: + """ + Calculate the Fast Fourier Transform (FFT) of windowed audio data. + + Args: + audio_windowed (np.ndarray): The windowed audio signal. + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: The FFT of the audio data. + """ + # Transpose the audio data to have time in rows and channels in columns + audio_transposed = np.transpose(audio_windowed) + + # Initialize an array to store the FFT results + audio_fft = np.empty( + (int(1 + ftt_size // 2), audio_transposed.shape[1]), + dtype=np.complex64, + order="F", + ) + + # Compute FFT for each channel + for n in range(audio_fft.shape[1]): + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] + + # Transpose the FFT results back to the original shape + audio_fft = np.transpose(audio_fft) + + return audio_fft + + +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: + """ + Calculate the power of the audio signal from its FFT. + + Args: + audio_fft (np.ndarray): The FFT of the audio signal. + + Returns: + np.ndarray: The power of the audio signal. + """ + # Calculate the power by squaring the absolute values of the FFT coefficients + audio_power = np.square(np.abs(audio_fft)) + + return audio_power + + +def freq_to_mel(freq): + """ + Convert a frequency in Hertz to the mel scale. + + Args: + freq (float): The frequency in Hertz. + + Returns: + float: The frequency in mel scale. + """ + # Use the formula to convert frequency to the mel scale + return 2595.0 * np.log10(1.0 + freq / 700.0) + + +def mel_to_freq(mels): + """ + Convert a frequency in the mel scale to Hertz. + + Args: + mels (float): The frequency in mel scale. + + Returns: + float: The frequency in Hertz. + """ + # Use the formula to convert mel scale to frequency + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + +def mel_spaced_filterbank( + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 +) -> np.ndarray: + """ + Create a Mel-spaced filter bank for audio processing. + + Args: + sample_rate (int): The sample rate of the audio. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + np.ndarray: Mel-spaced filter bank. 
+ """ + freq_min = 0 + freq_high = sample_rate // 2 + + logging.info(f"Minimum frequency: {freq_min}") + logging.info(f"Maximum frequency: {freq_high}") + + # Calculate filter points and mel frequencies + filter_points, mel_freqs = get_filter_points( + sample_rate, + freq_min, + freq_high, + mel_filter_num, + ftt_size, + ) + + filters = get_filters(filter_points, ftt_size) + + # normalize filters + # taken from the librosa library + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) + filters *= enorm[:, np.newaxis] + + return filters + + +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: + """ + Generate filters for audio processing. + + Args: + filter_points (list): A list of filter points. + ftt_size (int): The size of the FFT. + + Returns: + np.ndarray: A matrix of filters. + """ + num_filters = len(filter_points) - 2 + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) + + for n in range(num_filters): + start = filter_points[n] + mid = filter_points[n + 1] + end = filter_points[n + 2] + + # Linearly increase values from 0 to 1 + filters[n, start:mid] = np.linspace(0, 1, mid - start) + + # Linearly decrease values from 1 to 0 + filters[n, mid:end] = np.linspace(1, 0, end - mid) + + return filters + + +def get_filter_points( + sample_rate: int, + freq_min: int, + freq_high: int, + mel_filter_num: int = 10, + ftt_size: int = 1024, +): + """ + Calculate the filter points and frequencies for mel frequency filters. + + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). + + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + """ + + # Convert minimum and maximum frequencies to mel scale + fmin_mel = freq_to_mel(freq_min) + fmax_mel = freq_to_mel(freq_high) + + logging.info(f"MEL min: {fmin_mel}") + logging.info(f"MEL max: {fmax_mel}") + + # Generate equally spaced mel frequencies + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) + + # Convert mel frequencies back to Hertz + freqs = mel_to_freq(mels) + + # Calculate filter points as integer values + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) + + return filter_points, freqs + + +def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: + """ + Compute the Discrete Cosine Transform (DCT) basis matrix. + + Args: + dct_filter_num (int): The number of DCT filters to generate. + filter_num (int): The number of the fbank filters. + + Returns: + np.ndarray: The DCT basis matrix. 
+ """ + basis = np.empty((dct_filter_num, filter_num)) + basis[0, :] = 1.0 / np.sqrt(filter_num) + + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) + + for i in range(1, dct_filter_num): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) + + return basis + + +if __name__ == "__main__": + TRAIN_PATH = "./signal_processing/" + sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") + + print(mfcc(audio, sample_rate)) + + import doctest + + doctest.testmod() From 088eaee190683c370a5307676bba26bed8074338 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Tue, 12 Sep 2023 11:46:00 +0330 Subject: [PATCH 08/18] Add standalone usage in comments --- machine_learning/mfcc.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9144846bbb5d..9fea66ee4cca 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -62,7 +62,6 @@ import numpy as np import scipy.fftpack as fft -from scipy.io import wavfile from scipy.signal import get_window logging.basicConfig(level=logging.WARNING) @@ -386,10 +385,10 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: if __name__ == "__main__": - TRAIN_PATH = "./signal_processing/" - sample_rate, audio = wavfile.read(TRAIN_PATH + "sample-speech.wav") - - print(mfcc(audio, sample_rate)) + # from scipy.io import wavfile + # wav_file_path = "./path-to-file/sample.wav" + # sample_rate, audio = wavfile.read(wav_file_path) + # mfccs = mfcc(audio, sample_rate) import doctest From 554d52b35b9cf7892428c206836e808727d66446 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:01:17 +0330 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Christian Clauss --- machine_learning/mfcc.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 9fea66ee4cca..ee7af7c8a2bd 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -1,5 +1,5 @@ """ -MFCC (Mel Frequency Cepstral Coefficients) Calculation +Mel Frequency Cepstral Coefficients (MFCC) Calculation MFCC is a feature widely used in audio and speech processing to represent the short-term power spectrum of a sound signal in a more compact and @@ -138,12 +138,10 @@ def normalize(audio: np.ndarray) -> np.ndarray: max_abs_value = np.max(np.abs(audio)) # Divide the entire audio signal by the maximum absolute value - normalized_audio = audio / max_abs_value + return audio / max_abs_value - return normalized_audio - -def frame( +def audio_frames( audio: np.ndarray, sample_rate: int, hop_length: int = 20, @@ -168,7 +166,7 @@ def frame( audio = np.pad(audio, int(ftt_size / 2), mode="reflect") # Calculate the number of frames - frame_num = int((len(audio) - ftt_size) / hop_size) + 1 + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames frames = np.zeros((frame_num, ftt_size)) @@ -206,9 +204,7 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] # Transpose the FFT results back to the original shape - audio_fft = np.transpose(audio_fft) - - return audio_fft + return np.transpose(audio_fft) def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: @@ -289,9 +285,7 @@ def mel_spaced_filterbank( # normalize filters # taken from the librosa library enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 
2] - mel_freqs[:mel_filter_num]) - filters *= enorm[:, np.newaxis] - - return filters + return filters * enorm[:, np.newaxis] def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: From f78fd1b34557d36c63ed8e1666f216b168ed373f Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 11:37:33 +0330 Subject: [PATCH 10/18] [main] Fix typo due to auto review change --- machine_learning/mfcc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index ee7af7c8a2bd..5f792fe242be 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -87,7 +87,7 @@ def mfcc( logging.info(f"Normalized audio max: {np.max(audio_normalized)}") # frame audio into - audio_framed = frame( + audio_framed = audio_frames( audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length ) @@ -169,10 +169,10 @@ def audio_frames( frame_count = int((len(audio) - ftt_size) / hop_size) + 1 # Initialize an array to store the frames - frames = np.zeros((frame_num, ftt_size)) + frames = np.zeros((frame_count, ftt_size)) # Split the audio signal into frames - for n in range(frame_num): + for n in range(frame_count): frames[n] = audio[n * hop_size : n * hop_size + ftt_size] return frames From cb9d9df1e3e4bef78f844ecb18680a7b2ce34d7f Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:37:00 +0330 Subject: [PATCH 11/18] Add doctests for all functions --- machine_learning/mfcc.py | 84 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 5f792fe242be..fc4efaf3a7d4 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -133,6 +133,14 @@ def normalize(audio: np.ndarray) -> np.ndarray: Returns: np.ndarray: The normalized audio signal. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5]) + >>> normalized_audio = normalize(audio) + >>> np.max(normalized_audio) + 1.0 + >>> np.min(normalized_audio) + 0.2 """ # Find the maximum absolute value in the audio signal max_abs_value = np.max(np.abs(audio)) @@ -158,6 +166,14 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. + + Examples: + >>> import numpy as np + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) + >>> sample_rate = 8000 + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) + >>> frames.shape + (126, 512) """ hop_size = np.round(sample_rate * hop_length / 1000).astype(int) @@ -188,6 +204,13 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra Returns: np.ndarray: The FFT of the audio data. + + Examples: + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -216,6 +239,13 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Returns: np.ndarray: The power of the audio signal. 
+ + Examples: + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -232,6 +262,10 @@ def freq_to_mel(freq): Returns: float: The frequency in mel scale. + + Examples: + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) @@ -246,6 +280,10 @@ def mel_to_freq(mels): Returns: float: The frequency in Hertz. + + Examples: + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -264,6 +302,10 @@ def mel_spaced_filterbank( Returns: np.ndarray: Mel-spaced filter bank. + + Examples: + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -298,6 +340,10 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Returns: np.ndarray: A matrix of filters. + + Examples: + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape + (4, 257) """ num_filters = len(filter_points) - 2 filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) @@ -335,8 +381,11 @@ def get_filter_points( Returns: Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. - """ +Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) + """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) fmax_mel = freq_to_mel(freq_high) @@ -366,6 +415,9 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: Returns: np.ndarray: The DCT basis matrix. + Examples: + >>> round(dct(3, 5)[0][0], 5) + 0.44721 """ basis = np.empty((dct_filter_num, filter_num)) basis[0, :] = 1.0 / np.sqrt(filter_num) @@ -378,12 +430,30 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: return basis -if __name__ == "__main__": - # from scipy.io import wavfile - # wav_file_path = "./path-to-file/sample.wav" - # sample_rate, audio = wavfile.read(wav_file_path) - # mfccs = mfcc(audio, sample_rate) +def example(wav_file_path="./path-to-file/sample.wav"): + """ + Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. - import doctest + Args: + wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + + Returns: + np.ndarray: The computed MFCCs for the audio. 
+ """ + from scipy.io import wavfile + + try: + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) + + return mfccs + + except Exception as e: + logging.error(f"Error processing audio: {str(e)}") + return None +if __name__ == "__main__": + import doctest doctest.testmod() From d6bdf63e35f7043439785d7879d78fea40ab6e1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 17 Sep 2023 09:08:06 +0000 Subject: [PATCH 12/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/mfcc.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index fc4efaf3a7d4..da402e8da6f0 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -166,7 +166,7 @@ def audio_frames( Returns: np.ndarray: An array of overlapping frames. - + Examples: >>> import numpy as np >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) @@ -370,21 +370,21 @@ def get_filter_points( ftt_size: int = 1024, ): """ - Calculate the filter points and frequencies for mel frequency filters. + Calculate the filter points and frequencies for mel frequency filters. - Args: - sample_rate (int): The sample rate of the audio. - freq_min (int): The minimum frequency in Hertz. - freq_high (int): The maximum frequency in Hertz. - mel_filter_num (Optional[int]): The number of mel filters (default is 10). - ftt_size (Optional[int]): The size of the FFT (default is 1024). + Args: + sample_rate (int): The sample rate of the audio. + freq_min (int): The minimum frequency in Hertz. + freq_high (int): The maximum frequency in Hertz. + mel_filter_num (Optional[int]): The number of mel filters (default is 10). + ftt_size (Optional[int]): The size of the FFT (default is 1024). - Returns: - Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. + Returns: + Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies. 
-Examples: - >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] - array([ 0, 20, 51, 95, 161, 256]) + Examples: + >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0] + array([ 0, 20, 51, 95, 161, 256]) """ # Convert minimum and maximum frequencies to mel scale fmin_mel = freq_to_mel(freq_min) @@ -449,11 +449,13 @@ def example(wav_file_path="./path-to-file/sample.wav"): mfccs = mfcc(audio, sample_rate) return mfccs - + except Exception as e: logging.error(f"Error processing audio: {str(e)}") return None + if __name__ == "__main__": import doctest + doctest.testmod() From bda7ede7e5ca9515cbbf720b0e36209699a28fee Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 12:59:54 +0330 Subject: [PATCH 13/18] Fix some pre-commit issues --- machine_learning/mfcc.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 39d72584438e..0079363196f8 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -209,7 +209,9 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra >>> import numpy as np >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) - >>> np.allclose(audio_fft, np.array([[6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j], [15.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]])) + >>> np.allclose(\ + audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])\ + ) True """ # Transpose the audio data to have time in rows and channels in columns @@ -433,26 +435,22 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray: def example(wav_file_path="./path-to-file/sample.wav"): """ - Example function to calculate MFCCs (Mel Frequency Cepstral Coefficients) from an audio file. + Example function to calculate MFCCs from an audio file. Args: - wav_file_path (str): The path to the WAV audio file (default is "./path-to-file/sample.wav"). + wav_file_path (str): The path to the WAV audio file. Returns: np.ndarray: The computed MFCCs for the audio. """ from scipy.io import wavfile - try: - # Load the audio from the WAV file - sample_rate, audio = wavfile.read(wav_file_path) - # Calculate MFCCs - mfccs = mfcc(audio, sample_rate) + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + # Calculate MFCCs + mfccs = mfcc(audio, sample_rate) - return mfccs - except Exception as e: - logging.error(f"Error processing audio: {str(e)}") - return None + return mfccs if __name__ == "__main__": From 2fde552cb2561114acf792dd1a8cbb979f4396a2 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Sun, 17 Sep 2023 14:19:35 +0330 Subject: [PATCH 14/18] Update review issues * Remove types from docstring * Rename dct * Add mfcc docstring * Add typing to several functions --- machine_learning/mfcc.py | 181 ++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 78 deletions(-) diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py index 0079363196f8..c3d85b526112 100644 --- a/machine_learning/mfcc.py +++ b/machine_learning/mfcc.py @@ -75,6 +75,33 @@ def mfcc( mel_filter_num: int = 10, dct_filter_num: int = 40, ) -> np.ndarray: + """ + Calculate Mel Frequency Cepstral Coefficients (MFCCs) from an audio signal. + + Args: + audio: The input audio signal. + sample_rate: The sample rate of the audio signal (in Hz). + ftt_size: The size of the FFT window (default is 1024). 
+        hop_length: The hop length for frame creation (default is 20ms).
+        mel_filter_num: The number of Mel filters (default is 10).
+        dct_filter_num: The number of DCT filters (default is 40).
+
+    Returns:
+        A matrix of MFCCs for the input audio.
+
+    Raises:
+        ValueError: If the input audio is empty.
+
+    Example:
+    >>> import numpy as np
+    >>> sample_rate = 44100  # Sample rate of 44.1 kHz
+    >>> duration = 2.0  # Duration of 2 seconds
+    >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+    >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # Generate a 440 Hz sine wave
+    >>> mfccs = mfcc(audio, sample_rate)
+    >>> mfccs.shape
+    (40, 101)
+    """
     logging.info(f"Sample rate: {sample_rate}Hz")
     logging.info(f"Audio duration: {len(audio) / sample_rate}s")
     logging.info(f"Audio min: {np.min(audio)}")
@@ -117,7 +144,7 @@ def mfcc(
     audio_log = 10.0 * np.log10(audio_filtered)
     logging.info(f"audio_log shape: {audio_log.shape}")
 
-    dct_filters = dct(dct_filter_num, mel_filter_num)
+    dct_filters = discrete_cosine_transform(dct_filter_num, mel_filter_num)
     cepstral_coefficents = np.dot(dct_filters, audio_log)
 
     logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}")
@@ -129,18 +156,18 @@ def normalize(audio: np.ndarray) -> np.ndarray:
     Normalize an audio signal by scaling it to have values between -1 and 1.
 
     Args:
-        audio (np.ndarray): The input audio signal.
+        audio: The input audio signal.
 
     Returns:
-        np.ndarray: The normalized audio signal.
+        The normalized audio signal.
 
     Examples:
-        >>> audio = np.array([1, 2, 3, 4, 5])
-        >>> normalized_audio = normalize(audio)
-        >>> np.max(normalized_audio)
-        1.0
-        >>> np.min(normalized_audio)
-        0.2
+    >>> audio = np.array([1, 2, 3, 4, 5])
+    >>> normalized_audio = normalize(audio)
+    >>> np.max(normalized_audio)
+    1.0
+    >>> np.min(normalized_audio)
+    0.2
     """
     # Find the maximum absolute value in the audio signal
     max_abs_value = np.max(np.abs(audio))
@@ -159,21 +186,21 @@ def audio_frames(
     Split an audio signal into overlapping frames.
 
     Args:
-        audio (np.ndarray): The input audio signal.
-        sample_rate (int): The sample rate of the audio signal.
-        hop_length (Optional[int]): The length of the hopping (default is 20ms).
-        ftt_size (Optional[int]): The size of the FFT window (default is 1024).
+        audio: The input audio signal.
+        sample_rate: The sample rate of the audio signal.
+        hop_length: The length of the hopping (default is 20ms).
+        ftt_size: The size of the FFT window (default is 1024).
 
     Returns:
-        np.ndarray: An array of overlapping frames.
+        An array of overlapping frames.
 
     Examples:
-        >>> import numpy as np
-        >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
-        >>> sample_rate = 8000
-        >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
-        >>> frames.shape
-        (126, 512)
+    >>> import numpy as np
+    >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
+    >>> sample_rate = 8000
+    >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
+    >>> frames.shape
+    (126, 512)
     """
 
     hop_size = np.round(sample_rate * hop_length / 1000).astype(int)
@@ -199,20 +226,18 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra
     Calculate the Fast Fourier Transform (FFT) of windowed audio data.
 
     Args:
-        audio_windowed (np.ndarray): The windowed audio signal.
-        ftt_size (Optional[int]): The size of the FFT (default is 1024).
+        audio_windowed: The windowed audio signal.
+        ftt_size: The size of the FFT (default is 1024).
 
     Returns:
-        np.ndarray: The FFT of the audio data.
+ The FFT of the audio data. Examples: - >>> import numpy as np - >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) - >>> np.allclose(\ - audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])\ - ) - True + >>> import numpy as np + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])) + True """ # Transpose the audio data to have time in rows and channels in columns audio_transposed = np.transpose(audio_windowed) @@ -237,17 +262,17 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: Calculate the power of the audio signal from its FFT. Args: - audio_fft (np.ndarray): The FFT of the audio signal. + audio_fft: The FFT of the audio signal. Returns: - np.ndarray: The power of the audio signal. + The power of the audio signal. Examples: - >>> import numpy as np - >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) - >>> power = calculate_signal_power(audio_fft) - >>> np.allclose(power, np.array([5, 13, 25, 41])) - True + >>> import numpy as np + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True """ # Calculate the power by squaring the absolute values of the FFT coefficients audio_power = np.square(np.abs(audio_fft)) @@ -255,37 +280,37 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: return audio_power -def freq_to_mel(freq): +def freq_to_mel(freq: float) -> float: """ Convert a frequency in Hertz to the mel scale. Args: - freq (float): The frequency in Hertz. + freq: The frequency in Hertz. Returns: - float: The frequency in mel scale. + The frequency in mel scale. Examples: - >>> round(freq_to_mel(1000), 2) - 999.99 + >>> round(freq_to_mel(1000), 2) + 999.99 """ # Use the formula to convert frequency to the mel scale return 2595.0 * np.log10(1.0 + freq / 700.0) -def mel_to_freq(mels): +def mel_to_freq(mels: float) -> float: """ Convert a frequency in the mel scale to Hertz. Args: - mels (float): The frequency in mel scale. + mels: The frequency in mel scale. Returns: - float: The frequency in Hertz. + The frequency in Hertz. Examples: - >>> round(mel_to_freq(999.99), 2) - 1000.01 + >>> round(mel_to_freq(999.99), 2) + 1000.01 """ # Use the formula to convert mel scale to frequency return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) @@ -298,16 +323,16 @@ def mel_spaced_filterbank( Create a Mel-spaced filter bank for audio processing. Args: - sample_rate (int): The sample rate of the audio. - mel_filter_num (Optional[int]): The number of mel filters (default is 10). - ftt_size (Optional[int]): The size of the FFT (default is 1024). + sample_rate: The sample rate of the audio. + mel_filter_num: The number of mel filters (default is 10). + ftt_size: The size of the FFT (default is 1024). Returns: - np.ndarray: Mel-spaced filter bank. + Mel-spaced filter bank. Examples: - >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) - 0.0004603981 + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 """ freq_min = 0 freq_high = sample_rate // 2 @@ -337,15 +362,15 @@ def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: Generate filters for audio processing. Args: - filter_points (list): A list of filter points. - ftt_size (int): The size of the FFT. 
+        filter_points: An array of filter points.
+        ftt_size: The size of the FFT.

     Returns:
-        np.ndarray: A matrix of filters.
+        A matrix of filters.

     Examples:
-        >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape
-        (4, 257)
+    >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape
+    (4, 257)
     """
     num_filters = len(filter_points) - 2
     filters = np.zeros((num_filters, int(ftt_size / 2) + 1))
@@ -375,18 +400,18 @@ def get_filter_points(
     Calculate the filter points and frequencies for mel frequency filters.

     Args:
-        sample_rate (int): The sample rate of the audio.
-        freq_min (int): The minimum frequency in Hertz.
-        freq_high (int): The maximum frequency in Hertz.
-        mel_filter_num (Optional[int]): The number of mel filters (default is 10).
-        ftt_size (Optional[int]): The size of the FFT (default is 1024).
+        sample_rate: The sample rate of the audio.
+        freq_min: The minimum frequency in Hertz.
+        freq_high: The maximum frequency in Hertz.
+        mel_filter_num: The number of mel filters (default is 10).
+        ftt_size: The size of the FFT (default is 1024).

     Returns:
-        Tuple[np.ndarray, np.ndarray]: Filter points and corresponding frequencies.
+        Filter points and corresponding frequencies.

     Examples:
-        >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
-        array([  0,  20,  51,  95, 161, 256])
+    >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
+    array([  0,  20,  51,  95, 161, 256])
     """
     # Convert minimum and maximum frequencies to mel scale
     fmin_mel = freq_to_mel(freq_min)
@@ -407,20 +432,20 @@ def get_filter_points(
     return filter_points, freqs


-def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
+def discrete_cosine_transform(dct_filter_num: int, filter_num: int) -> np.ndarray:
     """
     Compute the Discrete Cosine Transform (DCT) basis matrix.

     Args:
-        dct_filter_num (int): The number of DCT filters to generate.
-        filter_num (int): The number of the fbank filters.
+        dct_filter_num: The number of DCT filters to generate.
+        filter_num: The number of filterbank filters.

     Returns:
-        np.ndarray: The DCT basis matrix.
+        The DCT basis matrix.

     Examples:
-        >>> round(dct(3, 5)[0][0], 5)
-        0.44721
+    >>> round(discrete_cosine_transform(3, 5)[0][0], 5)
+    0.44721
     """
     basis = np.empty((dct_filter_num, filter_num))
     basis[0, :] = 1.0 / np.sqrt(filter_num)
@@ -433,12 +458,13 @@ def dct(dct_filter_num: int, filter_num: int) -> np.ndarray:
     return basis


-def example(wav_file_path="./path-to-file/sample.wav"):
+def example(wav_file_path: str = "./path-to-file/sample.wav") -> np.ndarray:
     """
-    Example function to calculate MFCCs from an audio file.
+    Example function to calculate Mel Frequency Cepstral Coefficients
+    (MFCCs) from an audio file.

     Args:
-        wav_file_path (str): The path to the WAV audio file.
+        wav_file_path: The path to the WAV audio file.

     Returns:
         np.ndarray: The computed MFCCs for the audio.
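The doctest values above can be reproduced outside the module with a few lines of NumPy. This is a quick sanity check, not part of the patch itself: the mel-scale formulas are the ones quoted in freq_to_mel() and mel_to_freq(), but the final frequency-to-FFT-bin mapping floor((ftt_size + 1) * freq / sample_rate) is an assumption about the elided body of get_filter_points(); it does reproduce the doctest output.

import numpy as np

# Recreate get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)
fmin_mel = 2595.0 * np.log10(1.0 + 0 / 700.0)      # freq_to_mel(0) -> 0.0
fmax_mel = 2595.0 * np.log10(1.0 + 4000 / 700.0)   # freq_to_mel(4000)
mels = np.linspace(fmin_mel, fmax_mel, num=4 + 2)  # mel_filter_num + 2 points
freqs = 700.0 * (10.0 ** (mels / 2595.0) - 1.0)    # back to Hertz, elementwise
filter_points = np.floor((512 + 1) / 8000 * freqs).astype(int)
print(filter_points)  # [  0  20  51  95 161 256]

The mel_filter_num + 2 points are needed because each triangular filter spans three consecutive points: its left edge, its peak, and its right edge.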
@@ -447,10 +473,9 @@ def example(wav_file_path="./path-to-file/sample.wav"):
     """
     # Load the audio from the WAV file
     sample_rate, audio = wavfile.read(wav_file_path)

-    # Calculate MFCCs
-    mfccs = mfcc(audio, sample_rate)
-    return mfccs
+    # Calculate MFCCs
+    return mfcc(audio, sample_rate)


 if __name__ == "__main__":

From ad0eeefd6df76d26ce537e5393b383ea2b550ac4 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 15:27:28 +0200
Subject: [PATCH 15/18] Apply suggestions from code review

---
 machine_learning/mfcc.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index c3d85b526112..3edddca77774 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -93,7 +93,6 @@ def mfcc(
         ValueError: If the input audio is empty.

     Example:
-    >>> import numpy as np
     >>> sample_rate = 44100  # Sample rate of 44.1 kHz
     >>> duration = 2.0  # Duration of 2 seconds
     >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
     >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # Generate a 440 Hz sine wave
@@ -195,7 +194,6 @@ def audio_frames(
         An array of overlapping frames.

     Examples:
-    >>> import numpy as np
     >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000)
     >>> sample_rate = 8000
     >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512)
     >>> frames.shape
     (126, 512)
@@ -233,7 +231,6 @@ def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarra
         The FFT of the audio data.

     Examples:
-    >>> import numpy as np
     >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4)
     >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j]))
@@ -268,7 +265,6 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
         The power of the audio signal.

     Examples:
-    >>> import numpy as np
     >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j])
     >>> power = calculate_signal_power(audio_fft)
     >>> np.allclose(power, np.array([5, 13, 25, 41]))

From 7e38bc3d8189855ad8ef3fa0913f47de235a7aec Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 22:58:29 +0200
Subject: [PATCH 16/18] Update mfcc.py

---
 machine_learning/mfcc.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 3edddca77774..13ed1b96d8c8 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -1,12 +1,12 @@
 """
 Mel Frequency Cepstral Coefficients (MFCC) Calculation

-MFCC is a feature widely used in audio and speech processing to represent the
+MFCC is an algorythm widely used in audio and speech processing to represent the
 short-term power spectrum of a sound signal in a more compact and
 discriminative way. It is particularly popular in speech and audio processing
 tasks such as speech recognition and speaker identification.

-How MFCC is Calculated:
+How Mel Frequency Cepstral Coefficients are Calculated:
 1. Preprocessing:
     - Load an audio signal and normalize it to ensure that the values fall
       within a specific range (e.g., between -1 and 1).
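Step 1 above, peak normalization into the -1 to 1 range, is worth seeing on concrete numbers. A minimal sketch, not part of the patch, using the same one-liner that normalize() is reduced to in the next hunk:

import numpy as np

# Dividing by the largest absolute sample maps the loudest sample to +/-1
# while preserving the relative shape of the waveform.
audio = np.array([0.1, -0.4, 0.2])
normalized = audio / np.max(np.abs(audio))
print(normalized)  # [ 0.25 -1.    0.5 ]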
@@ -64,7 +64,7 @@
 import scipy.fftpack as fft
 from scipy.signal import get_window

-logging.basicConfig(level=logging.WARNING)
+logging.basicConfig(filename=f"{__file__}.log", level=logging.INFO)


 def mfcc(
@@ -168,11 +168,8 @@ def normalize(audio: np.ndarray) -> np.ndarray:
     >>> np.min(normalized_audio)
     0.2
     """
-    # Find the maximum absolute value in the audio signal
-    max_abs_value = np.max(np.abs(audio))
-
     # Divide the entire audio signal by the maximum absolute value
-    return audio / max_abs_value
+    return audio / np.max(np.abs(audio))


 def audio_frames(
@@ -271,9 +268,7 @@ def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray:
     True
     """
     # Calculate the power by squaring the absolute values of the FFT coefficients
-    audio_power = np.square(np.abs(audio_fft))
-
-    return audio_power
+    return np.square(np.abs(audio_fft))


 def freq_to_mel(freq: float) -> float:
@@ -406,8 +401,12 @@ def get_filter_points(
         Filter points and corresponding frequencies.

     Examples:
-    >>> get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)[0]
+    >>> filter_points = get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512)
+    >>> filter_points[0]
     array([  0,  20,  51,  95, 161, 256])
+    >>> filter_points[1]
+    array([   0.        ,  324.46707094,  799.33254207, 1494.30973963,
+           2511.42581671, 4000.        ])
     """
     # Convert minimum and maximum frequencies to mel scale
     fmin_mel = freq_to_mel(freq_min)

From f410bf239f06acd45df478f5df9866b7977965cc Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 23:03:31 +0200
Subject: [PATCH 17/18] get_filter_points() -> tuple[np.ndarray, np.ndarray]:

---
 machine_learning/mfcc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 13ed1b96d8c8..3c34d9e8149e 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -386,7 +386,7 @@ def get_filter_points(
     freq_high: int,
     mel_filter_num: int = 10,
     ftt_size: int = 1024,
-):
+) -> tuple[np.ndarray, np.ndarray]:
     """
     Calculate the filter points and frequencies for mel frequency filters.

From e143e5b12f88742e3739fd76f5c4e1bd06ec79dd Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 24 Sep 2023 23:04:57 +0200
Subject: [PATCH 18/18] algorithm

---
 machine_learning/mfcc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py
index 3c34d9e8149e..7ce8ceb50ff2 100644
--- a/machine_learning/mfcc.py
+++ b/machine_learning/mfcc.py
@@ -1,7 +1,7 @@
 """
 Mel Frequency Cepstral Coefficients (MFCC) Calculation

-MFCC is an algorythm widely used in audio and speech processing to represent the
+MFCC is an algorithm widely used in audio and speech processing to represent the
 short-term power spectrum of a sound signal in a more compact and
 discriminative way. It is particularly popular in speech and audio processing
 tasks such as speech recognition and speaker identification.
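With all eighteen patches applied, the module can be smoke-tested end to end using the 440 Hz sine wave from its own doctest. A minimal sketch, assuming the repository root is on sys.path so that machine_learning/mfcc.py is importable as a package module:

import numpy as np

from machine_learning.mfcc import mfcc

sample_rate = 44100
duration = 2.0  # seconds
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # 440 Hz sine wave

mfccs = mfcc(audio, sample_rate)
print(mfccs.shape)  # (40, 101), matching the mfcc() doctest

The shape follows from the defaults: dct_filter_num = 40 rows, and 101 frames from two seconds of audio at a 20 ms hop with reflect-padding of half the FFT size on each side.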