Dimensionality reduction #8590

Merged: 22 commits, merged on Apr 16, 2023

Changes from 7 commits

Commits (22 total)
babd745
First commit for dimensionality_reduction.py
Diegomangasco Mar 28, 2023
5476b7d
Some bug fixies
Diegomangasco Mar 28, 2023
3d8c1be
Added a TODO list
Diegomangasco Mar 28, 2023
24a68e9
Finish code for dimensionality_reduction.py
Diegomangasco Mar 29, 2023
eb50e28
PCA and LDA finished and tested
Diegomangasco Mar 31, 2023
0eb4e10
Add Copyright
Diegomangasco Mar 31, 2023
5509e7d
Add links to Wikipedia
Diegomangasco Mar 31, 2023
041aa1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
7e1fc35
Apply suggestions from code review
cclauss Mar 31, 2023
43e1f53
Reformat file
Diegomangasco Mar 31, 2023
19727cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
6000941
Added tests
Diegomangasco Mar 31, 2023
d2e4833
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
56a2131
Added None return to test function
Diegomangasco Mar 31, 2023
8f6323b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
5328195
Remove the word "here"
Diegomangasco Apr 1, 2023
38a4019
Remove doctest from linear_discriminant_analysis
Diegomangasco Apr 1, 2023
f2d3293
Fixed doctest for PCA
Diegomangasco Apr 1, 2023
7ed93e2
Fixed doctest for PCA pt.2
Diegomangasco Apr 1, 2023
85f1730
Add test for principal_component_analysis
Diegomangasco Apr 2, 2023
7f524e1
Updated tests
Diegomangasco Apr 15, 2023
6521ef1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 15, 2023
111 changes: 111 additions & 0 deletions machine_learning/dimensionality_reduction.py
@@ -0,0 +1,111 @@
# Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub

import logging

import numpy as np
import scipy.linalg

logging.basicConfig(level=logging.INFO, format="%(message)s")


def column_reshape(input_array: np.ndarray) -> np.ndarray:
    """Reshape a row Numpy array into a column Numpy array."""

    return input_array.reshape((input_array.size, 1))


def covariance_within_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the within-class covariance matrix of the dataset."""

    covariance_sum = np.zeros((features.shape[0], features.shape[0]))
    for i in range(classes):
        data = features[:, labels == i]
        data_mean = data.mean(1)
        # Center the data of class i around its mean
        centered_data = data - column_reshape(data_mean)
        # Accumulate the scatter matrix of class i
        covariance_sum += np.dot(centered_data, centered_data.T)

    return covariance_sum / features.shape[1]


def covariance_between_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the between-class covariance matrix of the dataset."""

    general_data_mean = features.mean(1)
    covariance_sum = np.zeros((features.shape[0], features.shape[0]))
    for i in range(classes):
        data = features[:, labels == i]
        # Number of samples belonging to class i
        device_data = data.shape[1]
        data_mean = data.mean(1)
        mean_difference = column_reshape(data_mean) - column_reshape(general_data_mean)
        # Weight the mean-difference scatter by the size of class i
        covariance_sum += device_data * np.dot(mean_difference, mean_difference.T)

    return covariance_sum / features.shape[1]


def PCA(features: np.ndarray, dimensions: int) -> np.ndarray:
    """Principal Component Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis

    Parameters:
    * features: the features extracted from the dataset
    * dimensions: the number of dimensions to project the data onto
    """

    # Check if the features have been loaded
    if features.any():
        data_mean = features.mean(1)
        # Center the dataset
        centered_data = features - column_reshape(data_mean)
        covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
        _, eigenvectors = np.linalg.eigh(covariance_matrix)
        # Reverse the column order so the eigenvectors are sorted by decreasing
        # eigenvalue, then keep only the first `dimensions` columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
        # Project the dataset onto the new space
        projected_data = np.dot(filtered_eigenvectors.T, features)
        logging.info("Principal Component Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError


def LDA(
    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
    """Linear Discriminant Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis

    Parameters:
    * features: the features extracted from the dataset
    * labels: the class labels of the features
    * classes: the number of classes present in the dataset
    * dimensions: the number of dimensions to project the data onto
    """

    # Check that the desired dimension is less than the number of classes
    assert classes > dimensions

    # Check if the features have been loaded
    if features.any():
        # Solve the generalized eigenvalue problem
        # between-class covariance * v = eigenvalue * within-class covariance * v
        _, eigenvectors = scipy.linalg.eigh(
            covariance_between_classes(features, labels, classes),
            covariance_within_classes(features, labels, classes),
        )
        # Reverse the column order so the eigenvectors are sorted by decreasing
        # eigenvalue, then keep only the first `dimensions` columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
        # Orthonormalize the projection directions via SVD
        svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
        filtered_svd_matrix = svd_matrix[:, 0:dimensions]
        projected_data = np.dot(filtered_svd_matrix.T, features)
        logging.info("Linear Discriminant Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError
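
For context, a minimal usage sketch of how PCA and LDA might be exercised, in the spirit of the tests added later in this PR. It is not part of the diff: the synthetic dataset, the seed, and the shapes are illustrative assumptions, with features stored as rows and samples as columns as the functions above expect.

# Hypothetical usage sketch (not part of the PR): two Gaussian classes with
# shifted means, 4 features (rows) x 20 samples (columns).
import numpy as np

rng = np.random.default_rng(42)
class_0 = rng.normal(loc=0.0, scale=1.0, size=(4, 10))
class_1 = rng.normal(loc=5.0, scale=1.0, size=(4, 10))
features = np.hstack((class_0, class_1))  # shape (4, 20)
labels = np.array([0] * 10 + [1] * 10)

projected_pca = PCA(features, dimensions=2)
print(projected_pca.shape)  # (2, 20)

# With 2 classes, LDA can project onto at most classes - 1 = 1 dimension
projected_lda = LDA(features, labels, classes=2, dimensions=1)
print(projected_lda.shape)  # (1, 20)

The `classes > dimensions` assertion in LDA reflects a property of the method itself: the between-class covariance matrix has rank at most classes - 1, so at most that many discriminant directions carry information.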