Dimensionality reduction (#8590)

Diegomangasco · web-flow · commit 54dedf844a30 · 2023-04-17T10:34:22.000+12:00
diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py
@@ -0,0 +1,198 @@
+#  Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub
+
+"""
+Requirements:
+  - numpy version 1.21
+  - scipy version 1.3.3
+Notes:
+  - Each column of the features matrix corresponds to a class item
+"""
+
+import logging
+
+import numpy as np
+import pytest
+from scipy.linalg import eigh
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+
+def column_reshape(input_array: np.ndarray) -> np.ndarray:
+    """Function to reshape a row Numpy array into a column Numpy array
+    >>> input_array = np.array([1, 2, 3])
+    >>> column_reshape(input_array)
+    array([[1],
+           [2],
+           [3]])
+    """
+
+    return input_array.reshape((input_array.size, 1))
+
+
+def covariance_within_classes(
+    features: np.ndarray, labels: np.ndarray, classes: int
+) -> np.ndarray:
+    """Function to compute the covariance matrix inside each class.
+    >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    >>> labels = np.array([0, 1, 0])
+    >>> covariance_within_classes(features, labels, 2)
+    array([[0.66666667, 0.66666667, 0.66666667],
+           [0.66666667, 0.66666667, 0.66666667],
+           [0.66666667, 0.66666667, 0.66666667]])
+    """
+
+    covariance_sum = np.nan
+    for i in range(classes):
+        data = features[:, labels == i]
+        data_mean = data.mean(1)
+        # Centralize the data of class i
+        centered_data = data - column_reshape(data_mean)
+        if i > 0:
+            # If covariance_sum is not None
+            covariance_sum += np.dot(centered_data, centered_data.T)
+        else:
+            # If covariance_sum is np.nan (i.e. first loop)
+            covariance_sum = np.dot(centered_data, centered_data.T)
+
+    return covariance_sum / features.shape[1]
+
+
+def covariance_between_classes(
+    features: np.ndarray, labels: np.ndarray, classes: int
+) -> np.ndarray:
+    """Function to compute the covariance matrix between multiple classes
+    >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
+    >>> labels = np.array([0, 1, 0])
+    >>> covariance_between_classes(features, labels, 2)
+    array([[ 3.55555556,  1.77777778, -2.66666667],
+           [ 1.77777778,  0.88888889, -1.33333333],
+           [-2.66666667, -1.33333333,  2.        ]])
+    """
+
+    general_data_mean = features.mean(1)
+    covariance_sum = np.nan
+    for i in range(classes):
+        data = features[:, labels == i]
+        device_data = data.shape[1]
+        data_mean = data.mean(1)
+        if i > 0:
+            # If covariance_sum is not None
+            covariance_sum += device_data * np.dot(
+                column_reshape(data_mean) - column_reshape(general_data_mean),
+                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
+            )
+        else:
+            # If covariance_sum is np.nan (i.e. first loop)
+            covariance_sum = device_data * np.dot(
+                column_reshape(data_mean) - column_reshape(general_data_mean),
+                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
+            )
+
+    return covariance_sum / features.shape[1]
+
+
+def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
+    """
+    Principal Component Analysis.
+
+    For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis.
+    Parameters:
+        * features: the features extracted from the dataset
+        * dimensions: to filter the projected data for the desired dimension
+
+    >>> test_principal_component_analysis()
+    """
+
+    # Check if the features have been loaded
+    if features.any():
+        data_mean = features.mean(1)
+        # Center the dataset
+        centered_data = features - np.reshape(data_mean, (data_mean.size, 1))
+        covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
+        _, eigenvectors = np.linalg.eigh(covariance_matrix)
+        # Take all the columns in the reverse order (-1), and then takes only the first
+        filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
+        # Project the database on the new space
+        projected_data = np.dot(filtered_eigenvectors.T, features)
+        logging.info("Principal Component Analysis computed")
+
+        return projected_data
+    else:
+        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
+        logging.error("Dataset empty")
+        raise AssertionError
+
+
+def linear_discriminant_analysis(
+    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
+) -> np.ndarray:
+    """
+    Linear Discriminant Analysis.
+
+    For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
+    Parameters:
+        * features: the features extracted from the dataset
+        * labels: the class labels of the features
+        * classes: the number of classes present in the dataset
+        * dimensions: to filter the projected data for the desired dimension
+
+    >>> test_linear_discriminant_analysis()
+    """
+
+    # Check if the dimension desired is less than the number of classes
+    assert classes > dimensions
+
+    # Check if features have been already loaded
+    if features.any:
+        _, eigenvectors = eigh(
+            covariance_between_classes(features, labels, classes),
+            covariance_within_classes(features, labels, classes),
+        )
+        filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
+        svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
+        filtered_svd_matrix = svd_matrix[:, 0:dimensions]
+        projected_data = np.dot(filtered_svd_matrix.T, features)
+        logging.info("Linear Discriminant Analysis computed")
+
+        return projected_data
+    else:
+        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
+        logging.error("Dataset empty")
+        raise AssertionError
+
+
+def test_linear_discriminant_analysis() -> None:
+    # Create dummy dataset with 2 classes and 3 features
+    features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]])
+    labels = np.array([0, 0, 0, 1, 1])
+    classes = 2
+    dimensions = 2
+
+    # Assert that the function raises an AssertionError if dimensions > classes
+    with pytest.raises(AssertionError) as error_info:
+        projected_data = linear_discriminant_analysis(
+            features, labels, classes, dimensions
+        )
+        if isinstance(projected_data, np.ndarray):
+            raise AssertionError(
+                "Did not raise AssertionError for dimensions > classes"
+            )
+        assert error_info.type is AssertionError
+
+
+def test_principal_component_analysis() -> None:
+    features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    dimensions = 2
+    expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]])
+
+    with pytest.raises(AssertionError) as error_info:
+        output = principal_component_analysis(features, dimensions)
+        if not np.allclose(expected_output, output):
+            raise AssertionError
+        assert error_info.type is AssertionError
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()