diff --git a/DIRECTORY.md b/DIRECTORY.md
index f0a34a553946..0dc79258d428 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -602,6 +602,7 @@
   * [Mfcc](machine_learning/mfcc.py)
   * [Multilayer Perceptron Classifier](machine_learning/multilayer_perceptron_classifier.py)
   * [Polynomial Regression](machine_learning/polynomial_regression.py)
+  * [Principal Component Analysis](machine_learning/principal_component_analysis.py)
   * [Scoring Functions](machine_learning/scoring_functions.py)
   * [Self Organizing Map](machine_learning/self_organizing_map.py)
   * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
diff --git a/machine_learning/principal_component_analysis.py b/machine_learning/principal_component_analysis.py
new file mode 100644
index 000000000000..bd5bbf2d872a
--- /dev/null
+++ b/machine_learning/principal_component_analysis.py
@@ -0,0 +1,162 @@
+"""
+Principal Component Analysis (PCA) is a linear dimensionality reduction technique
+commonly used as a data preprocessing step in unsupervised and supervised machine
+learning pipelines. PCA reduces the number of variables in a dataset while
+preserving as much information as possible.
+
+The principal components represent the directions of the data that explain a
+maximal amount of variance. The data is projected onto a new coordinate system
+such that the directions capturing the largest variation can be easily identified.
+
+This implementation of PCA consists of the following steps:
+
+1. Data Standardization (Z-score Normalization): Center the data by subtracting
+the mean and dividing by the standard deviation so that each feature has zero
+mean and unit variance.
+
+2. Covariance Matrix Calculation: Compute the covariance matrix of the
+standardized data. The covariance matrix measures how different features vary
+together, capturing the relationships between them.
+
+3. Singular Value Decomposition (SVD): Decompose the covariance matrix into its
+singular vectors and singular values, which identify the principal components.
+
+4. Selection of Principal Components: Choose the top k principal components that
+explain the most variance in the data.
+
+5. Projection of Data: Transform the original standardized data into the new
+lower-dimensional space spanned by the selected principal components.
+
+REFERENCE: en.wikipedia.org/wiki/Principal_component_analysis
+"""
+
+import numpy as np
+
+
+def svd(matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Perform Singular Value Decomposition (SVD) on the given matrix.
+
+    Args:
+        matrix (np.ndarray): The input matrix.
+
+    Returns:
+        tuple: The U, S, and VT matrices from SVD.
+
+    >>> matrix = np.array([[1, 2], [3, 4]])
+    >>> u, s, vt = svd(matrix)
+    >>> np.allclose(np.dot(u, np.dot(s, vt)), matrix)
+    True
+    """
+    m, n = matrix.shape
+    u = np.zeros((m, m))
+    s = np.zeros((m, n))
+
+    # The right singular vectors are the eigenvectors of matrix^T @ matrix,
+    # sorted by descending eigenvalue so the leading components come first.
+    eigvals, eigvecs = np.linalg.eig(np.dot(matrix.T, matrix))
+    order = np.argsort(eigvals)[::-1]
+    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
+    vt = eigvecs.T
+
+    # Clip tiny negative eigenvalues caused by round-off before taking roots.
+    singular_values = np.sqrt(np.clip(eigvals, 0.0, None))
+    s[:n, :n] = np.diag(singular_values)
+
+    # Each left singular vector is the image of the corresponding right
+    # singular vector, rescaled by its singular value; skip (near-)zero
+    # singular values to avoid division by zero.
+    for i in range(n):
+        if singular_values[i] > 1e-12:
+            u[:, i] = np.dot(matrix, vt[i, :]) / singular_values[i]
+
+    return u, s, vt
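+
+# A quick sanity check for the decomposition above (a sketch, not part of the
+# module's tests): `np.linalg.svd` returns the singular values as a 1-D array,
+# so they can be compared against the diagonal of the S matrix built here.
+#
+#     a = np.array([[2.0, 0.0], [0.0, 1.0]])
+#     _, s_check, _ = svd(a)
+#     assert np.allclose(np.diag(s_check), np.linalg.svd(a)[1])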
+
+
+def main(data: np.ndarray, k: int) -> np.ndarray:
+    """
+    Perform Principal Component Analysis (PCA) on the given data.
+
+    Args:
+        data (np.ndarray): The input data.
+        k (int): The number of principal components to retain.
+
+    Returns:
+        np.ndarray: The transformed data with reduced dimensionality.
+
+    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
+    >>> main(data, 1)
+    array([[-1.73205081],
+           [ 0.        ],
+           [ 1.73205081]])
+    """
+    # Standardize each feature to zero mean and unit variance.
+    z_score = (data - data.mean(axis=0)) / data.std(axis=0)
+    cov_matrix = np.cov(z_score, ddof=1, rowvar=False)
+
+    # The rows of VT are the principal directions, ordered by explained variance.
+    _, _, vt = svd(cov_matrix)
+    principal_components = vt[:k]
+
+    # Project the standardized data onto the top-k principal directions.
+    return np.dot(z_score, principal_components.T)
+
+
+if __name__ == "__main__":
+    import matplotlib.pyplot as plt
+    from sklearn.datasets import make_blobs
+
+    data, _ = make_blobs(n_samples=100, n_features=4, centers=5, random_state=42)
+    k = 2
+
+    transformed_data = main(data, k)
+    print("Transformed Data:")
+    print(transformed_data)
+
+    assert transformed_data.shape == (
+        data.shape[0],
+        k,
+    ), "The transformed data does not have the expected shape."
+
+    # Visualize the original data and the transformed data
+    plt.figure(figsize=(12, 6))
+
+    plt.subplot(1, 2, 1)
+    plt.scatter(data[:, 0], data[:, 1], c="blue", edgecolor="k", s=50)
+    plt.title("Original Data")
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+
+    plt.subplot(1, 2, 2)
+    plt.scatter(
+        transformed_data[:, 0],
+        transformed_data[:, 1],
+        c="red",
+        edgecolor="k",
+        s=50,
+    )
+    plt.title("Transformed Data")
+    plt.xlabel("Principal Component 1")
+    plt.ylabel("Principal Component 2")
+
+    plt.tight_layout()
+    plt.show()
+
+    print("All tests passed.")
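+
+    # A rough follow-up sketch (not part of the original demo): estimate how
+    # much of the total variance the k retained components explain. For a
+    # covariance matrix, the singular values equal its eigenvalues, i.e. the
+    # variances along the principal directions.
+    z = (data - data.mean(axis=0)) / data.std(axis=0)
+    _, s_cov, _ = svd(np.cov(z, ddof=1, rowvar=False))
+    variances = np.diag(s_cov)
+    print(f"Explained variance ratio: {variances[:k].sum() / variances.sum():.3f}")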