From 211247ef82fd54540e4cb832fbbb612ca5845700 Mon Sep 17 00:00:00 2001 From: Amir Lavasani Date: Mon, 25 Sep 2023 00:38:51 +0330 Subject: [PATCH 1/9] Add MFCC Feature Extraction Algorithm (#9057) * Add MFCC feature extraction to machine learning * Add standalone usage in comments * Apply suggestions from code review Co-authored-by: Christian Clauss * Delete empty junk file (#9062) * updating DIRECTORY.md * updating DIRECTORY.md * Delete empty junk file * updating DIRECTORY.md * Fix ruff errors * Fix more ruff errors --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> * [main] Fix typo due to auto review change * Add doctests for all functions * Add MFCC feature extraction to machine learning * Add standalone usage in comments * Apply suggestions from code review Co-authored-by: Christian Clauss * [main] Fix typo due to auto review change * Add doctests for all functions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix some pre-commit issues * Update review issues * Remove types from docstring * Rename dct * Add mfcc docstring * Add typing to several functions * Apply suggestions from code review * Update mfcc.py * get_filter_points() -> tuple[np.ndarray, np.ndarray]: * algorithm --------- Co-authored-by: Christian Clauss Co-authored-by: Tianyi Zheng Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- machine_learning/mfcc.py | 479 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 machine_learning/mfcc.py diff --git a/machine_learning/mfcc.py b/machine_learning/mfcc.py new file mode 100644 index 000000000000..7ce8ceb50ff2 --- /dev/null +++ b/machine_learning/mfcc.py @@ -0,0 +1,479 @@ +""" +Mel Frequency Cepstral Coefficients (MFCC) Calculation + +MFCC is an algorithm widely used in audio and speech processing to 
represent the +short-term power spectrum of a sound signal in a more compact and +discriminative way. It is particularly popular in speech and audio processing +tasks such as speech recognition and speaker identification. + +How Mel Frequency Cepstral Coefficients are Calculated: +1. Preprocessing: + - Load an audio signal and normalize it to ensure that the values fall + within a specific range (e.g., between -1 and 1). + - Frame the audio signal into overlapping, fixed-length segments, typically + using a technique like windowing to reduce spectral leakage. + +2. Fourier Transform: + - Apply a Fast Fourier Transform (FFT) to each audio frame to convert it + from the time domain to the frequency domain. This results in a + representation of the audio frame as a sequence of frequency components. + +3. Power Spectrum: + - Calculate the power spectrum by taking the squared magnitude of each + frequency component obtained from the FFT. This step measures the energy + distribution across different frequency bands. + +4. Mel Filterbank: + - Apply a set of triangular filterbanks spaced in the Mel frequency scale + to the power spectrum. These filters mimic the human auditory system's + frequency response. Each filterbank sums the power spectrum values within + its band. + +5. Logarithmic Compression: + - Take the logarithm (typically base 10) of the filterbank values to + compress the dynamic range. This step mimics the logarithmic response of + the human ear to sound intensity. + +6. Discrete Cosine Transform (DCT): + - Apply the Discrete Cosine Transform to the log filterbank energies to + obtain the MFCC coefficients. This transformation helps decorrelate the + filterbank energies and captures the most important features of the audio + signal. + +7. Feature Extraction: + - Select a subset of the DCT coefficients to form the feature vector. + Often, the first few coefficients (e.g., 12-13) are used for most + applications. 
+ +References: +- Mel-Frequency Cepstral Coefficients (MFCCs): + https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +- Speech and Language Processing by Daniel Jurafsky & James H. Martin: + https://web.stanford.edu/~jurafsky/slp3/ +- Mel Frequency Cepstral Coefficient (MFCC) tutorial + http://practicalcryptography.com/miscellaneous/machine-learning + /guide-mel-frequency-cepstral-coefficients-mfccs/ + +Author: Amir Lavasani +""" + + +import logging + +import numpy as np +import scipy.fftpack as fft +from scipy.signal import get_window + +logging.basicConfig(filename=f"{__file__}.log", level=logging.INFO) + + +def mfcc( + audio: np.ndarray, + sample_rate: int, + ftt_size: int = 1024, + hop_length: int = 20, + mel_filter_num: int = 10, + dct_filter_num: int = 40, +) -> np.ndarray: + """ + Calculate Mel Frequency Cepstral Coefficients (MFCCs) from an audio signal. + + Args: + audio: The input audio signal. + sample_rate: The sample rate of the audio signal (in Hz). + ftt_size: The size of the FFT window (default is 1024). + hop_length: The hop length for frame creation (default is 20ms). + mel_filter_num: The number of Mel filters (default is 10). + dct_filter_num: The number of DCT filters (default is 40). + + Returns: + A matrix of MFCCs for the input audio. + + Raises: + ValueError: If the input audio is empty. 
+ + Example: + >>> sample_rate = 44100 # Sample rate of 44.1 kHz + >>> duration = 2.0 # Duration of 2 seconds + >>> t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False) + >>> audio = 0.5 * np.sin(2 * np.pi * 440.0 * t) # Generate a 440 Hz sine wave + >>> mfccs = mfcc(audio, sample_rate) + >>> mfccs.shape + (40, 101) + """ + logging.info(f"Sample rate: {sample_rate}Hz") + logging.info(f"Audio duration: {len(audio) / sample_rate}s") + logging.info(f"Audio min: {np.min(audio)}") + logging.info(f"Audio max: {np.max(audio)}") + + # normalize audio + audio_normalized = normalize(audio) + + logging.info(f"Normalized audio min: {np.min(audio_normalized)}") + logging.info(f"Normalized audio max: {np.max(audio_normalized)}") + + # frame audio into overlapping frames + audio_framed = audio_frames( + audio_normalized, sample_rate, ftt_size=ftt_size, hop_length=hop_length + ) + + logging.info(f"Framed audio shape: {audio_framed.shape}") + logging.info(f"First frame: {audio_framed[0]}") + + # convert to frequency domain + # For simplicity we will choose the Hanning window. 
+ window = get_window("hann", ftt_size, fftbins=True) + audio_windowed = audio_framed * window + + logging.info(f"Windowed audio shape: {audio_windowed.shape}") + logging.info(f"First frame: {audio_windowed[0]}") + + audio_fft = calculate_fft(audio_windowed, ftt_size) + logging.info(f"fft audio shape: {audio_fft.shape}") + logging.info(f"First frame: {audio_fft[0]}") + + audio_power = calculate_signal_power(audio_fft) + logging.info(f"power audio shape: {audio_power.shape}") + logging.info(f"First frame: {audio_power[0]}") + + filters = mel_spaced_filterbank(sample_rate, mel_filter_num, ftt_size) + logging.info(f"filters shape: {filters.shape}") + + audio_filtered = np.dot(filters, np.transpose(audio_power)) + audio_log = 10.0 * np.log10(audio_filtered) + logging.info(f"audio_log shape: {audio_log.shape}") + + dct_filters = discrete_cosine_transform(dct_filter_num, mel_filter_num) + cepstral_coefficents = np.dot(dct_filters, audio_log) + + logging.info(f"cepstral_coefficents shape: {cepstral_coefficents.shape}") + return cepstral_coefficents + + +def normalize(audio: np.ndarray) -> np.ndarray: + """ + Normalize an audio signal by scaling it to have values between -1 and 1. + + Args: + audio: The input audio signal. + + Returns: + The normalized audio signal. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5]) + >>> normalized_audio = normalize(audio) + >>> np.max(normalized_audio) + 1.0 + >>> np.min(normalized_audio) + 0.2 + """ + # Divide the entire audio signal by the maximum absolute value + return audio / np.max(np.abs(audio)) + + +def audio_frames( + audio: np.ndarray, + sample_rate: int, + hop_length: int = 20, + ftt_size: int = 1024, +) -> np.ndarray: + """ + Split an audio signal into overlapping frames. + + Args: + audio: The input audio signal. + sample_rate: The sample rate of the audio signal. + hop_length: The length of the hopping (default is 20ms). + ftt_size: The size of the FFT window (default is 1024). 
+ + Returns: + An array of overlapping frames. + + Examples: + >>> audio = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]*1000) + >>> sample_rate = 8000 + >>> frames = audio_frames(audio, sample_rate, hop_length=10, ftt_size=512) + >>> frames.shape + (126, 512) + """ + + hop_size = np.round(sample_rate * hop_length / 1000).astype(int) + + # Pad the audio signal to handle edge cases + audio = np.pad(audio, int(ftt_size / 2), mode="reflect") + + # Calculate the number of frames + frame_count = int((len(audio) - ftt_size) / hop_size) + 1 + + # Initialize an array to store the frames + frames = np.zeros((frame_count, ftt_size)) + + # Split the audio signal into frames + for n in range(frame_count): + frames[n] = audio[n * hop_size : n * hop_size + ftt_size] + + return frames + + +def calculate_fft(audio_windowed: np.ndarray, ftt_size: int = 1024) -> np.ndarray: + """ + Calculate the Fast Fourier Transform (FFT) of windowed audio data. + + Args: + audio_windowed: The windowed audio signal. + ftt_size: The size of the FFT (default is 1024). + + Returns: + The FFT of the audio data. 
+ + Examples: + >>> audio_windowed = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> audio_fft = calculate_fft(audio_windowed, ftt_size=4) + >>> np.allclose(audio_fft[0], np.array([6.0+0.j, -1.5+0.8660254j, -1.5-0.8660254j])) + True + """ + # Transpose the audio data to have time in rows and channels in columns + audio_transposed = np.transpose(audio_windowed) + + # Initialize an array to store the FFT results + audio_fft = np.empty( + (int(1 + ftt_size // 2), audio_transposed.shape[1]), + dtype=np.complex64, + order="F", + ) + + # Compute FFT for each channel + for n in range(audio_fft.shape[1]): + audio_fft[:, n] = fft.fft(audio_transposed[:, n], axis=0)[: audio_fft.shape[0]] + + # Transpose the FFT results back to the original shape + return np.transpose(audio_fft) + + +def calculate_signal_power(audio_fft: np.ndarray) -> np.ndarray: + """ + Calculate the power of the audio signal from its FFT. + + Args: + audio_fft: The FFT of the audio signal. + + Returns: + The power of the audio signal. + + Examples: + >>> audio_fft = np.array([1+2j, 2+3j, 3+4j, 4+5j]) + >>> power = calculate_signal_power(audio_fft) + >>> np.allclose(power, np.array([5, 13, 25, 41])) + True + """ + # Calculate the power by squaring the absolute values of the FFT coefficients + return np.square(np.abs(audio_fft)) + + +def freq_to_mel(freq: float) -> float: + """ + Convert a frequency in Hertz to the mel scale. + + Args: + freq: The frequency in Hertz. + + Returns: + The frequency in mel scale. + + Examples: + >>> round(freq_to_mel(1000), 2) + 999.99 + """ + # Use the formula to convert frequency to the mel scale + return 2595.0 * np.log10(1.0 + freq / 700.0) + + +def mel_to_freq(mels: float) -> float: + """ + Convert a frequency in the mel scale to Hertz. + + Args: + mels: The frequency in mel scale. + + Returns: + The frequency in Hertz. 
+ + Examples: + >>> round(mel_to_freq(999.99), 2) + 1000.01 + """ + # Use the formula to convert mel scale to frequency + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + +def mel_spaced_filterbank( + sample_rate: int, mel_filter_num: int = 10, ftt_size: int = 1024 +) -> np.ndarray: + """ + Create a Mel-spaced filter bank for audio processing. + + Args: + sample_rate: The sample rate of the audio. + mel_filter_num: The number of mel filters (default is 10). + ftt_size: The size of the FFT (default is 1024). + + Returns: + Mel-spaced filter bank. + + Examples: + >>> round(mel_spaced_filterbank(8000, 10, 1024)[0][1], 10) + 0.0004603981 + """ + freq_min = 0 + freq_high = sample_rate // 2 + + logging.info(f"Minimum frequency: {freq_min}") + logging.info(f"Maximum frequency: {freq_high}") + + # Calculate filter points and mel frequencies + filter_points, mel_freqs = get_filter_points( + sample_rate, + freq_min, + freq_high, + mel_filter_num, + ftt_size, + ) + + filters = get_filters(filter_points, ftt_size) + + # normalize filters + # taken from the librosa library + enorm = 2.0 / (mel_freqs[2 : mel_filter_num + 2] - mel_freqs[:mel_filter_num]) + return filters * enorm[:, np.newaxis] + + +def get_filters(filter_points: np.ndarray, ftt_size: int) -> np.ndarray: + """ + Generate filters for audio processing. + + Args: + filter_points: A list of filter points. + ftt_size: The size of the FFT. + + Returns: + A matrix of filters. 
+ + Examples: + >>> get_filters(np.array([0, 20, 51, 95, 161, 256], dtype=int), 512).shape + (4, 257) + """ + num_filters = len(filter_points) - 2 + filters = np.zeros((num_filters, int(ftt_size / 2) + 1)) + + for n in range(num_filters): + start = filter_points[n] + mid = filter_points[n + 1] + end = filter_points[n + 2] + + # Linearly increase values from 0 to 1 + filters[n, start:mid] = np.linspace(0, 1, mid - start) + + # Linearly decrease values from 1 to 0 + filters[n, mid:end] = np.linspace(1, 0, end - mid) + + return filters + + +def get_filter_points( + sample_rate: int, + freq_min: int, + freq_high: int, + mel_filter_num: int = 10, + ftt_size: int = 1024, +) -> tuple[np.ndarray, np.ndarray]: + """ + Calculate the filter points and frequencies for mel frequency filters. + + Args: + sample_rate: The sample rate of the audio. + freq_min: The minimum frequency in Hertz. + freq_high: The maximum frequency in Hertz. + mel_filter_num: The number of mel filters (default is 10). + ftt_size: The size of the FFT (default is 1024). + + Returns: + Filter points and corresponding frequencies. + + Examples: + >>> filter_points = get_filter_points(8000, 0, 4000, mel_filter_num=4, ftt_size=512) + >>> filter_points[0] + array([ 0, 20, 51, 95, 161, 256]) + >>> filter_points[1] + array([ 0. , 324.46707094, 799.33254207, 1494.30973963, + 2511.42581671, 4000. 
]) + """ + # Convert minimum and maximum frequencies to mel scale + fmin_mel = freq_to_mel(freq_min) + fmax_mel = freq_to_mel(freq_high) + + logging.info(f"MEL min: {fmin_mel}") + logging.info(f"MEL max: {fmax_mel}") + + # Generate equally spaced mel frequencies + mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2) + + # Convert mel frequencies back to Hertz + freqs = mel_to_freq(mels) + + # Calculate filter points as integer values + filter_points = np.floor((ftt_size + 1) / sample_rate * freqs).astype(int) + + return filter_points, freqs + + +def discrete_cosine_transform(dct_filter_num: int, filter_num: int) -> np.ndarray: + """ + Compute the Discrete Cosine Transform (DCT) basis matrix. + + Args: + dct_filter_num: The number of DCT filters to generate. + filter_num: The number of the fbank filters. + + Returns: + The DCT basis matrix. + + Examples: + >>> round(discrete_cosine_transform(3, 5)[0][0], 5) + 0.44721 + """ + basis = np.empty((dct_filter_num, filter_num)) + basis[0, :] = 1.0 / np.sqrt(filter_num) + + samples = np.arange(1, 2 * filter_num, 2) * np.pi / (2.0 * filter_num) + + for i in range(1, dct_filter_num): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_num) + + return basis + + +def example(wav_file_path: str = "./path-to-file/sample.wav") -> np.ndarray: + """ + Example function to calculate Mel Frequency Cepstral Coefficients + (MFCCs) from an audio file. + + Args: + wav_file_path: The path to the WAV audio file. + + Returns: + np.ndarray: The computed MFCCs for the audio. 
+ """ + from scipy.io import wavfile + + # Load the audio from the WAV file + sample_rate, audio = wavfile.read(wav_file_path) + + # Calculate MFCCs + return mfcc(audio, sample_rate) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From eace4cea32b831a1683b4c431379f0cd7b9061db Mon Sep 17 00:00:00 2001 From: gudlu1925 <120262240+gudlu1925@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:14:06 +0530 Subject: [PATCH 2/9] Added Coulomb_Law (#8714) * Create coulomb_law.py * Update coulomb_law.py * Update coulomb_law.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update and rename coulomb_law.py to coulombs_law.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update coulombs_law.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update coulombs_law.py * Update coulombs_law.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update coulombs_law.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update coulombs_law.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tianyi Zheng --- physics/coulombs_law.py | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 physics/coulombs_law.py diff --git a/physics/coulombs_law.py b/physics/coulombs_law.py new file mode 100644 index 000000000000..252e8ec0f74e --- /dev/null +++ b/physics/coulombs_law.py @@ -0,0 +1,42 @@ +""" +Coulomb's law states that the magnitude of the electrostatic force of attraction +or repulsion between two point charges is directly proportional to the product +of the magnitudes of charges and inversely proportional to the square of the +distance between them. 
+ +F = k * q1 * q2 / r^2 + +k is Coulomb's constant and equals 1/(4π*ε0) +q1 is charge of first body (C) +q2 is charge of second body (C) +r is distance between two charged bodies (m) + +Reference: https://en.wikipedia.org/wiki/Coulomb%27s_law +""" + + +def coulombs_law(q1: float, q2: float, radius: float) -> float: + """ + Calculate the electrostatic force of attraction or repulsion + between two point charges + + >>> coulombs_law(15.5, 20, 15) + 12382849136.06 + >>> coulombs_law(1, 15, 5) + 5392531075.38 + >>> coulombs_law(20, -50, 15) + -39944674632.44 + >>> coulombs_law(-5, -8, 10) + 3595020716.92 + >>> coulombs_law(50, 100, 50) + 17975103584.6 + """ + if radius <= 0: + raise ValueError("The radius is always a positive non zero integer") + return round(((8.9875517923 * 10**9) * q1 * q2) / (radius**2), 2) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From b2e186f4b769ae98d04f7f2408d3ac86da44c06f Mon Sep 17 00:00:00 2001 From: Okza Pradhana Date: Wed, 27 Sep 2023 13:06:19 +0700 Subject: [PATCH 3/9] feat(maths): add function to perform calculation (#6602) * feat(maths): add function to perform calculation - Add single function to calculate sum of two positive numbers using bitwise operator * docs: add wikipedia url as explanation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Caeden Perelli-Harris * Update sum_of_two_positive_numbers_bitwise.py * Update sum_of_two_positive_numbers_bitwise.py * Update sum_of_two_positive_numbers_bitwise.py --------- Co-authored-by: Okza Pradhana Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tianyi Zheng Co-authored-by: Caeden Perelli-Harris --- maths/sum_of_two_positive_numbers_bitwise.py | 55 ++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 maths/sum_of_two_positive_numbers_bitwise.py diff --git 
a/maths/sum_of_two_positive_numbers_bitwise.py b/maths/sum_of_two_positive_numbers_bitwise.py new file mode 100644 index 000000000000..70eaf6887b64 --- /dev/null +++ b/maths/sum_of_two_positive_numbers_bitwise.py @@ -0,0 +1,55 @@ +""" +Calculates the sum of two non-negative integers using bitwise operators +Wikipedia explanation: https://en.wikipedia.org/wiki/Binary_number +""" + + +def bitwise_addition_recursive(number: int, other_number: int) -> int: + """ + >>> bitwise_addition_recursive(4, 5) + 9 + >>> bitwise_addition_recursive(8, 9) + 17 + >>> bitwise_addition_recursive(0, 4) + 4 + >>> bitwise_addition_recursive(4.5, 9) + Traceback (most recent call last): + ... + TypeError: Both arguments MUST be integers! + >>> bitwise_addition_recursive('4', 9) + Traceback (most recent call last): + ... + TypeError: Both arguments MUST be integers! + >>> bitwise_addition_recursive('4.5', 9) + Traceback (most recent call last): + ... + TypeError: Both arguments MUST be integers! + >>> bitwise_addition_recursive(-1, 9) + Traceback (most recent call last): + ... + ValueError: Both arguments MUST be non-negative! + >>> bitwise_addition_recursive(1, -9) + Traceback (most recent call last): + ... + ValueError: Both arguments MUST be non-negative! 
+ """ + + if not isinstance(number, int) or not isinstance(other_number, int): + raise TypeError("Both arguments MUST be integers!") + + if number < 0 or other_number < 0: + raise ValueError("Both arguments MUST be non-negative!") + + bitwise_sum = number ^ other_number + carry = number & other_number + + if carry == 0: + return bitwise_sum + + return bitwise_addition_recursive(bitwise_sum, carry << 1) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From 84ec9414e45380a5e946d4f73b921b274ecd4be7 Mon Sep 17 00:00:00 2001 From: thor-harsh <105957576+thor-harsh@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:01:42 +0530 Subject: [PATCH 4/9] Update k_means_clust.py (#8996) * Update k_means_clust.py * Apply suggestions from code review --------- Co-authored-by: Tianyi Zheng --- machine_learning/k_means_clust.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 7c8142aab878..d93c5addf2ee 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -11,10 +11,10 @@ - initial_centroids , initial centroid values generated by utility function(mentioned in usage). - maxiter , maximum number of iterations to process. - - heterogeneity , empty list that will be filled with hetrogeneity values if passed + - heterogeneity , empty list that will be filled with heterogeneity values if passed to kmeans func. Usage: - 1. define 'k' value, 'X' features array and 'hetrogeneity' empty list + 1. define 'k' value, 'X' features array and 'heterogeneity' empty list 2. create initial_centroids, initial_centroids = get_initial_centroids( X, @@ -31,8 +31,8 @@ record_heterogeneity=heterogeneity, verbose=True # whether to print logs in console or not.(default=False) ) - 4. Plot the loss function, hetrogeneity values for every iteration saved in - hetrogeneity list. + 4. 
Plot the loss function and heterogeneity values for every iteration saved in + heterogeneity list. plot_heterogeneity( heterogeneity, k @@ -198,13 +198,10 @@ def report_generator( df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None ) -> pd.DataFrame: """ - Function generates easy-erading clustering report. It takes 2 arguments as an input: - DataFrame - dataframe with predicted cluester column; - FillMissingReport - dictionary of rules how we are going to fill missing - values of for final report generate (not included in modeling); - in order to run the function following libraries must be imported: - import pandas as pd - import numpy as np + Generates a clustering report. This function takes 2 arguments as input: + df - dataframe with predicted cluster column + fill_missing_report - dictionary of rules on how we are going to fill in missing + values for final generated report (not included in modelling); >>> data = pd.DataFrame() >>> data['numbers'] = [1, 2, 3] >>> data['col1'] = [0.5, 2.5, 4.5] @@ -306,10 +303,10 @@ def report_generator( a.columns = report.columns # rename columns to match report report = report.drop( report[report.Type == "count"].index - ) # drop count values except cluster size + ) # drop count values except for cluster size report = pd.concat( [report, a, clustersize, clusterproportion], axis=0 - ) # concat report with clustert size and nan values + ) # concat report with cluster size and nan values report["Mark"] = report["Features"].isin(clustering_variables) cols = report.columns.tolist() cols = cols[0:2] + cols[-1:] + cols[2:-1] From 5830b29e7ecf5437ce46bcdefda88eedea693043 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Wed, 27 Sep 2023 08:00:34 -0400 Subject: [PATCH 5/9] Fix `mypy` errors in `erosion_operation.py` (#8603) * updating DIRECTORY.md * Fix mypy errors in erosion_operation.py * Rename functions to use snake case * updating DIRECTORY.md * updating DIRECTORY.md * Replace raw file string with pathlib 
Path * Fix function name in erosion_operation.py doctest --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> --- .../erosion_operation.py | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/digital_image_processing/morphological_operations/erosion_operation.py b/digital_image_processing/morphological_operations/erosion_operation.py index c0e1ef847237..53001da83468 100644 --- a/digital_image_processing/morphological_operations/erosion_operation.py +++ b/digital_image_processing/morphological_operations/erosion_operation.py @@ -1,34 +1,37 @@ +from pathlib import Path + import numpy as np from PIL import Image -def rgb2gray(rgb: np.array) -> np.array: +def rgb_to_gray(rgb: np.ndarray) -> np.ndarray: """ Return gray image from rgb image - >>> rgb2gray(np.array([[[127, 255, 0]]])) + + >>> rgb_to_gray(np.array([[[127, 255, 0]]])) array([[187.6453]]) - >>> rgb2gray(np.array([[[0, 0, 0]]])) + >>> rgb_to_gray(np.array([[[0, 0, 0]]])) array([[0.]]) - >>> rgb2gray(np.array([[[2, 4, 1]]])) + >>> rgb_to_gray(np.array([[[2, 4, 1]]])) array([[3.0598]]) - >>> rgb2gray(np.array([[[26, 255, 14], [5, 147, 20], [1, 200, 0]]])) + >>> rgb_to_gray(np.array([[[26, 255, 14], [5, 147, 20], [1, 200, 0]]])) array([[159.0524, 90.0635, 117.6989]]) """ r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2] return 0.2989 * r + 0.5870 * g + 0.1140 * b -def gray2binary(gray: np.array) -> np.array: +def gray_to_binary(gray: np.ndarray) -> np.ndarray: """ Return binary image from gray image - >>> gray2binary(np.array([[127, 255, 0]])) + >>> gray_to_binary(np.array([[127, 255, 0]])) array([[False, True, False]]) - >>> gray2binary(np.array([[0]])) + >>> gray_to_binary(np.array([[0]])) array([[False]]) - >>> gray2binary(np.array([[26.2409, 4.9315, 1.4729]])) + >>> gray_to_binary(np.array([[26.2409, 4.9315, 1.4729]])) array([[False, False, False]]) - >>> gray2binary(np.array([[26, 255, 14], [5, 147, 20], [1, 200, 0]])) + >>> 
gray_to_binary(np.array([[26, 255, 14], [5, 147, 20], [1, 200, 0]])) array([[False, True, False], [False, True, False], [False, True, False]]) @@ -36,9 +39,10 @@ def gray2binary(gray: np.array) -> np.array: return (gray > 127) & (gray <= 255) -def erosion(image: np.array, kernel: np.array) -> np.array: +def erosion(image: np.ndarray, kernel: np.ndarray) -> np.ndarray: """ Return eroded image + >>> erosion(np.array([[True, True, False]]), np.array([[0, 1, 0]])) array([[False, False, False]]) >>> erosion(np.array([[True, False, False]]), np.array([[1, 1, 0]])) @@ -62,14 +66,17 @@ def erosion(image: np.array, kernel: np.array) -> np.array: return output -# kernel to be applied -structuring_element = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]) - if __name__ == "__main__": # read original image - image = np.array(Image.open(r"..\image_data\lena.jpg")) + lena_path = Path(__file__).resolve().parent / "image_data" / "lena.jpg" + lena = np.array(Image.open(lena_path)) + + # kernel to be applied + structuring_element = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]) + # Apply erosion operation to a binary image - output = erosion(gray2binary(rgb2gray(image)), structuring_element) + output = erosion(gray_to_binary(rgb_to_gray(lena)), structuring_element) + # Save the output image pil_img = Image.fromarray(output).convert("RGB") pil_img.save("result_erosion.png") From 76767d2f09d15aeff0a54cfc44652207eda2314e Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Wed, 27 Sep 2023 08:01:18 -0400 Subject: [PATCH 6/9] Consolidate the two existing kNN implementations (#8903) * Add type hints to k_nearest_neighbours.py * Refactor k_nearest_neighbours.py into class * Add documentation to k_nearest_neighbours.py * Use heap-based priority queue for k_nearest_neighbours.py * Delete knn_sklearn.py * updating DIRECTORY.md * Use optional args in k_nearest_neighbours.py for demo purposes * Fix wrong function arg in k_nearest_neighbours.py --------- Co-authored-by: github-actions 
<${GITHUB_ACTOR}@users.noreply.github.com> --- DIRECTORY.md | 1 - machine_learning/k_nearest_neighbours.py | 128 ++++++++++++++--------- machine_learning/knn_sklearn.py | 31 ------ 3 files changed, 79 insertions(+), 81 deletions(-) delete mode 100644 machine_learning/knn_sklearn.py diff --git a/DIRECTORY.md b/DIRECTORY.md index d81e4ec1ee83..902999460fe5 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -507,7 +507,6 @@ * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) - * [Knn Sklearn](machine_learning/knn_sklearn.py) * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py) * [Linear Regression](machine_learning/linear_regression.py) * Local Weighted Learning diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 2a90cfe5987a..a43757c5c20e 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -1,58 +1,88 @@ +""" +k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning +algorithm used for classification. Given some labelled training data, a given +point is classified using its k nearest neighbours according to some distance +metric. The most commonly occurring label among the neighbours becomes the label +of the given point. In effect, the label of the given point is decided by a +majority vote. + +This implementation uses the commonly used Euclidean distance metric, but other +distance metrics can also be used. 
+ +Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm +""" + from collections import Counter +from heapq import nsmallest import numpy as np from sklearn import datasets from sklearn.model_selection import train_test_split -data = datasets.load_iris() - -X = np.array(data["data"]) -y = np.array(data["target"]) -classes = data["target_names"] - -X_train, X_test, y_train, y_test = train_test_split(X, y) - - -def euclidean_distance(a, b): - """ - Gives the euclidean distance between two points - >>> euclidean_distance([0, 0], [3, 4]) - 5.0 - >>> euclidean_distance([1, 2, 3], [1, 8, 11]) - 10.0 - """ - return np.linalg.norm(np.array(a) - np.array(b)) - - -def classifier(train_data, train_target, classes, point, k=5): - """ - Classifies the point using the KNN algorithm - k closest points are found (ranked in ascending order of euclidean distance) - Params: - :train_data: Set of points that are classified into two or more classes - :train_target: List of classes in the order of train_data points - :classes: Labels of the classes - :point: The data point that needs to be classified - - >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] - >>> y_train = [0, 0, 0, 0, 1, 1, 1] - >>> classes = ['A','B']; point = [1.2,1.2] - >>> classifier(X_train, y_train, classes,point) - 'A' - """ - data = zip(train_data, train_target) - # List of distances of all points from the point to be classified - distances = [] - for data_point in data: - distance = euclidean_distance(data_point[0], point) - distances.append((distance, data_point[1])) - # Choosing 'k' points with the least distances. 
- votes = [i[1] for i in sorted(distances)[:k]] - # Most commonly occurring class among them - # is the class into which the point is classified - result = Counter(votes).most_common(1)[0][0] - return classes[result] + +class KNN: + def __init__( + self, + train_data: np.ndarray[float], + train_target: np.ndarray[int], + class_labels: list[str], + ) -> None: + """ + Create a kNN classifier using the given training data and class labels + """ + self.data = zip(train_data, train_target) + self.labels = class_labels + + @staticmethod + def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: + """ + Calculate the Euclidean distance between two points + >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4])) + 5.0 + >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) + 10.0 + """ + return np.linalg.norm(a - b) + + def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: + """ + Classify a given point using the kNN algorithm + >>> train_X = np.array( + ... [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] + ... 
) + >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1]) + >>> classes = ['A', 'B'] + >>> knn = KNN(train_X, train_y, classes) + >>> point = np.array([1.2, 1.2]) + >>> knn.classify(point) + 'A' + """ + # Distances of all points from the point to be classified + distances = ( + (self._euclidean_distance(data_point[0], pred_point), data_point[1]) + for data_point in self.data + ) + + # Choosing k points with the shortest distances + votes = (i[1] for i in nsmallest(k, distances)) + + # Most commonly occurring class is the one into which the point is classified + result = Counter(votes).most_common(1)[0][0] + return self.labels[result] if __name__ == "__main__": - print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4])) + import doctest + + doctest.testmod() + + iris = datasets.load_iris() + + X = np.array(iris["data"]) + y = np.array(iris["target"]) + iris_classes = iris["target_names"] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + iris_point = np.array([4.4, 3.1, 1.3, 1.4]) + classifier = KNN(X_train, y_train, iris_classes) + print(classifier.classify(iris_point, k=3)) diff --git a/machine_learning/knn_sklearn.py b/machine_learning/knn_sklearn.py deleted file mode 100644 index 4a621a4244b6..000000000000 --- a/machine_learning/knn_sklearn.py +++ /dev/null @@ -1,31 +0,0 @@ -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier - -# Load iris file -iris = load_iris() -iris.keys() - - -print(f"Target names: \n {iris.target_names} ") -print(f"\n Features: \n {iris.feature_names}") - -# Train set e Test set -X_train, X_test, y_train, y_test = train_test_split( - iris["data"], iris["target"], random_state=4 -) - -# KNN - -knn = KNeighborsClassifier(n_neighbors=1) -knn.fit(X_train, y_train) - -# new array to test -X_new = [[1, 2, 1, 4], [2, 3, 4, 5]] - -prediction = knn.predict(X_new) - -print( - f"\nNew array: \n {X_new}\n\nTarget Names 
Prediction: \n" - f" {iris['target_names'][prediction]}" -) From f9b8759ba82cd7ca4e4a99b9bc9b661ace5a93cc Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Wed, 27 Sep 2023 09:54:40 -0400 Subject: [PATCH 7/9] Move bitwise add (#9097) * updating DIRECTORY.md * updating DIRECTORY.md * updating DIRECTORY.md * Move and rename maths/sum_of_two_positive_numbers_bitwise.py * updating DIRECTORY.md --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> --- DIRECTORY.md | 3 +++ .../bitwise_addition_recursive.py | 0 2 files changed, 3 insertions(+) rename maths/sum_of_two_positive_numbers_bitwise.py => bit_manipulation/bitwise_addition_recursive.py (100%) diff --git a/DIRECTORY.md b/DIRECTORY.md index 902999460fe5..e596d96e5e83 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -43,6 +43,7 @@ * [Binary Shifts](bit_manipulation/binary_shifts.py) * [Binary Twos Complement](bit_manipulation/binary_twos_complement.py) * [Binary Xor Operator](bit_manipulation/binary_xor_operator.py) + * [Bitwise Addition Recursive](bit_manipulation/bitwise_addition_recursive.py) * [Count 1S Brian Kernighan Method](bit_manipulation/count_1s_brian_kernighan_method.py) * [Count Number Of One Bits](bit_manipulation/count_number_of_one_bits.py) * [Gray Code Sequence](bit_manipulation/gray_code_sequence.py) @@ -514,6 +515,7 @@ * [Logistic Regression](machine_learning/logistic_regression.py) * Lstm * [Lstm Prediction](machine_learning/lstm/lstm_prediction.py) + * [Mfcc](machine_learning/mfcc.py) * [Multilayer Perceptron Classifier](machine_learning/multilayer_perceptron_classifier.py) * [Polynomial Regression](machine_learning/polynomial_regression.py) * [Scoring Functions](machine_learning/scoring_functions.py) @@ -752,6 +754,7 @@ * [Basic Orbital Capture](physics/basic_orbital_capture.py) * [Casimir Effect](physics/casimir_effect.py) * [Centripetal Force](physics/centripetal_force.py) + * [Coulombs Law](physics/coulombs_law.py) * [Grahams Law](physics/grahams_law.py) * 
[Horizontal Projectile Motion](physics/horizontal_projectile_motion.py) * [Hubble Parameter](physics/hubble_parameter.py) diff --git a/maths/sum_of_two_positive_numbers_bitwise.py b/bit_manipulation/bitwise_addition_recursive.py similarity index 100% rename from maths/sum_of_two_positive_numbers_bitwise.py rename to bit_manipulation/bitwise_addition_recursive.py From 38c2b839819549d1ab8566675fab09db449875cc Mon Sep 17 00:00:00 2001 From: aryan1165 <111041731+aryan1165@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:26:01 +0530 Subject: [PATCH 8/9] Deleted euclidean_gcd.py. Fixes#8063 (#9108) --- maths/euclidean_gcd.py | 47 ------------------------------------------ 1 file changed, 47 deletions(-) delete mode 100644 maths/euclidean_gcd.py diff --git a/maths/euclidean_gcd.py b/maths/euclidean_gcd.py deleted file mode 100644 index de4b250243db..000000000000 --- a/maths/euclidean_gcd.py +++ /dev/null @@ -1,47 +0,0 @@ -""" https://en.wikipedia.org/wiki/Euclidean_algorithm """ - - -def euclidean_gcd(a: int, b: int) -> int: - """ - Examples: - >>> euclidean_gcd(3, 5) - 1 - - >>> euclidean_gcd(6, 3) - 3 - """ - while b: - a, b = b, a % b - return a - - -def euclidean_gcd_recursive(a: int, b: int) -> int: - """ - Recursive method for euclicedan gcd algorithm - - Examples: - >>> euclidean_gcd_recursive(3, 5) - 1 - - >>> euclidean_gcd_recursive(6, 3) - 3 - """ - return a if b == 0 else euclidean_gcd_recursive(b, a % b) - - -def main(): - print(f"euclidean_gcd(3, 5) = {euclidean_gcd(3, 5)}") - print(f"euclidean_gcd(5, 3) = {euclidean_gcd(5, 3)}") - print(f"euclidean_gcd(1, 3) = {euclidean_gcd(1, 3)}") - print(f"euclidean_gcd(3, 6) = {euclidean_gcd(3, 6)}") - print(f"euclidean_gcd(6, 3) = {euclidean_gcd(6, 3)}") - - print(f"euclidean_gcd_recursive(3, 5) = {euclidean_gcd_recursive(3, 5)}") - print(f"euclidean_gcd_recursive(5, 3) = {euclidean_gcd_recursive(5, 3)}") - print(f"euclidean_gcd_recursive(1, 3) = {euclidean_gcd_recursive(1, 3)}") - print(f"euclidean_gcd_recursive(3, 
6) = {euclidean_gcd_recursive(3, 6)}") - print(f"euclidean_gcd_recursive(6, 3) = {euclidean_gcd_recursive(6, 3)}") - - -if __name__ == "__main__": - main() From 35dd529c85fc433e0780cdaff586c684208aa1b7 Mon Sep 17 00:00:00 2001 From: Hetarth Jain Date: Thu, 28 Sep 2023 23:54:46 +0530 Subject: [PATCH 9/9] Returning Index instead of boolean in knuth_morris_pratt (kmp) function, making it compatible with str.find(). (#9083) * Update knuth_morris_pratt.py - changed Boolean to Index * Update knuth_morris_pratt.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update knuth_morris_pratt.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update knuth_morris_pratt.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update back_propagation_neural_network.py * Update back_propagation_neural_network.py * Update strings/knuth_morris_pratt.py * Update knuth_morris_pratt.py * Update knuth_morris_pratt.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss --- strings/knuth_morris_pratt.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/strings/knuth_morris_pratt.py b/strings/knuth_morris_pratt.py index a488c171a93b..8a04eb2532c0 100644 --- a/strings/knuth_morris_pratt.py +++ b/strings/knuth_morris_pratt.py @@ -1,7 +1,7 @@ from __future__ import annotations -def kmp(pattern: str, text: str) -> bool: +def knuth_morris_pratt(text: str, pattern: str) -> int: """ The Knuth-Morris-Pratt Algorithm for finding a pattern within a piece of text with complexity O(n + m) @@ -14,6 +14,12 @@ def kmp(pattern: str, text: str) -> bool: 2) Step through the text one character at a time and compare it to a character in the pattern updating our location within the pattern if necessary + >>> kmp = 
"knuth_morris_pratt" + >>> all( + ... knuth_morris_pratt(kmp, s) == kmp.find(s) + ... for s in ("kn", "h_m", "rr", "tt", "not there") + ... ) + True """ # 1) Construct the failure array @@ -24,7 +30,7 @@ def kmp(pattern: str, text: str) -> bool: while i < len(text): if pattern[j] == text[i]: if j == (len(pattern) - 1): - return True + return i - j j += 1 # if this is a prefix in our pattern @@ -33,7 +39,7 @@ def kmp(pattern: str, text: str) -> bool: j = failure[j - 1] continue i += 1 - return False + return -1 def get_failure_array(pattern: str) -> list[int]: @@ -57,27 +63,38 @@ def get_failure_array(pattern: str) -> list[int]: if __name__ == "__main__": + import doctest + + doctest.testmod() + # Test 1) pattern = "abc1abc12" text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc" text2 = "alskfjaldsk23adsfabcabc" - assert kmp(pattern, text1) and not kmp(pattern, text2) + assert knuth_morris_pratt(text1, pattern) and knuth_morris_pratt(text2, pattern) # Test 2) pattern = "ABABX" text = "ABABZABABYABABX" - assert kmp(pattern, text) + assert knuth_morris_pratt(text, pattern) # Test 3) pattern = "AAAB" text = "ABAAAAAB" - assert kmp(pattern, text) + assert knuth_morris_pratt(text, pattern) # Test 4) pattern = "abcdabcy" text = "abcxabcdabxabcdabcdabcy" - assert kmp(pattern, text) + assert knuth_morris_pratt(text, pattern) + + # Test 5) -> Doctests + kmp = "knuth_morris_pratt" + assert all( + knuth_morris_pratt(kmp, s) == kmp.find(s) + for s in ("kn", "h_m", "rr", "tt", "not there") + ) - # Test 5) + # Test 6) pattern = "aabaabaaa" assert get_failure_array(pattern) == [0, 1, 0, 1, 2, 3, 4, 5, 2]