From babd745f6f6499717afced1ec7877c79cae2bd42 Mon Sep 17 00:00:00 2001 From: Diego Date: Tue, 28 Mar 2023 19:28:06 +0200 Subject: [PATCH 01/21] First commit for dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 102 +++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 machine_learning/dimensionality_reduction.py diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py new file mode 100644 index 000000000000..aac5ddacd902 --- /dev/null +++ b/machine_learning/dimensionality_reduction.py @@ -0,0 +1,102 @@ +import logging +import numpy as np +import scipy + + +def column_reshape(input_array: np.ndarray) -> np.ndarray: + """Function to reshape a row Numpy array into a column Numpy array.""" + + return v.reshape((input_array.size, 1)) + + +def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: + """Function to compute the covariance matrix inside each class.""" + + covariance_sum = None + for i in range(classes): + data = features[:, labels == i] + data_mean = data.mean(1) + centered_data = data - column_reshape(data_mean) + if covariance_sum: + covariance_sum += + np.dot(centered_data, centered_data.T) + else: + covariance_sum = np.dot(centered_data, centered_data.T) + + return convariance_sum / features.shape[1] + + +def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: + """Function to compute the covariance matrix between multiple classes.""" + + general_data_mean = features.mean(1) + covariance_sum = None + for i in range(classes): + data = features[:, labels == i] + device_data = data.shape[1] + data_mean = data.mean(1) + if covariance_sum: + covariance_sum += device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + else: + covariance_sum = device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + + return covariance_sum / features.shape[1] + + +class DimensionalityReduction: + """Class to apply PCA and LDA techniques for the dataset dimensionality reduction.\n + The data structures used are: \n + * self._features: a list of lists that contains all the packets features, where each packet has a list of features. \n + * self._labels: a list that contains the category labels (i.e. device labels) of each list of packet features. \n + * self._devices_number: an int value that specifies how many device models there are in the dataset. \n + * self._features_PCA: a Numpy array with the features mapped with PCA. 
\n + * self._features_LDA: a Numpy array with the features mapped with LDA.""" + + def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int): + logging.basicConfig(level=logging.INFO, format='%(message)s') + self._features = features + self._class_labels = class_labels + self._classes = classes + + def PCA(self, dimensions: int) -> np.ndarray: + """Principal Component Analysis with default filter parameter equal to 10.""" + + try: + assert any(self._features) is True + data = np.array(self._features) + mu = data.mean(1) + centered_data = self._features - np.reshape(mu, (mu.size, 1)) + covariance_matrix = np.dot(centered_data, centered_data.T) / data.shape[1] + s, U = np.linalg.eigh(covariance_matrix) + # Take all the columns in the reverse order (-1), and then takes only the first columns + P = U[:, ::-1][:, 0:m] + projected_data = np.dot(P.T, self._features) + logging.info("Principal Component Analysis computed.") + except AssertionError: + logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.error("The features must be a not-empty list") + raise AssertionError + + return projected_data + + def LDA(self, dimensions: int) -> np.ndarray: + """Linear Discriminant Analysis with default filter parameter equal to 8.""" + + try: + assert self._features_PCA is not None + data = np.array(self._features_PCA) + labels = np.array(self._labels) + s, U = scipy.linalg.eigh(covariance_between_classes(data, labels, self._devices_number), + covariance_within_classes(data, labels, self._devices_number)) + W = U[:, ::-1][:, :m] + UW, _, _ = np.linalg.svd(W) + U = UW[:, 0:m] + projected_data = np.dot(U.T, self._features_PCA) + logging.info("Linear Discriminant Analysis computed.") + except AssertionError: + logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.error("The features must be passed in PCA algorithm before!") + raise AssertionError + + return projected_data \ No newline at end of file From 5476b7da71d520fbdbd2c13208c4db5d3b73e7e9 Mon Sep 17 00:00:00 2001 From: Diego Date: Tue, 28 Mar 2023 19:52:50 +0200 Subject: [PATCH 02/21] Some bug fixies --- machine_learning/dimensionality_reduction.py | 60 ++++++++++++-------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index aac5ddacd902..cc81748ad643 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,3 +1,4 @@ +import copy import logging import numpy as np import scipy @@ -6,7 +7,7 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray: """Function to reshape a row Numpy array into a column Numpy array.""" - return v.reshape((input_array.size, 1)) + return input_array.reshape((input_array.size, 1)) def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: @@ -18,11 +19,11 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: data_mean = data.mean(1) centered_data = data - column_reshape(data_mean) if covariance_sum: - covariance_sum += + np.dot(centered_data, centered_data.T) + covariance_sum += np.dot(centered_data, centered_data.T) else: covariance_sum = np.dot(centered_data, centered_data.T) - return convariance_sum / features.shape[1] + return covariance_sum / features.shape[1] def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: @@ -36,10 +37,10 @@ def 
covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes data_mean = data.mean(1) if covariance_sum: covariance_sum += device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) else: covariance_sum = device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) return covariance_sum / features.shape[1] @@ -58,45 +59,54 @@ def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int) self._features = features self._class_labels = class_labels self._classes = classes + self._features_after_PCA = None def PCA(self, dimensions: int) -> np.ndarray: """Principal Component Analysis with default filter parameter equal to 10.""" try: assert any(self._features) is True - data = np.array(self._features) - mu = data.mean(1) - centered_data = self._features - np.reshape(mu, (mu.size, 1)) - covariance_matrix = np.dot(centered_data, centered_data.T) / data.shape[1] + data_mean = self._features.mean(1) + centered_data = self._features - np.reshape(data_mean, (data_mean.size, 1)) + covariance_matrix = np.dot(centered_data, centered_data.T) / self._features.shape[1] s, U = np.linalg.eigh(covariance_matrix) # Take all the columns in the reverse order (-1), and then takes only the first columns - P = U[:, ::-1][:, 0:m] + P = U[:, ::-1][:, 0:dimensions] projected_data = np.dot(P.T, self._features) - logging.info("Principal Component Analysis computed.") + self._features_after_PCA = copy.deepcopy(projected_data) + logging.info("Principal Component Analysis computed") except AssertionError: logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) - logging.error("The features must be a not-empty list") + logging.error("Feature array is empty") raise AssertionError return projected_data - def LDA(self, dimensions: int) -> np.ndarray: + def LDA(self, dimensions: int, pca_features=False) -> np.ndarray: """Linear Discriminant Analysis with default filter parameter equal to 8.""" try: - assert self._features_PCA is not None - data = np.array(self._features_PCA) - labels = np.array(self._labels) - s, U = scipy.linalg.eigh(covariance_between_classes(data, labels, self._devices_number), - covariance_within_classes(data, labels, self._devices_number)) - W = U[:, ::-1][:, :m] - UW, _, _ = np.linalg.svd(W) - U = UW[:, 0:m] - projected_data = np.dot(U.T, self._features_PCA) - logging.info("Linear Discriminant Analysis computed.") + if not pca_features: + assert any(self._features) is True + s, U = scipy.linalg.eigh(covariance_between_classes(self._features, self._class_labels, self._classes), + covariance_within_classes(self._features, self._class_labels, self._classes)) + W = U[:, ::-1][:, :dimensions] + UW, _, _ = np.linalg.svd(W) + U = UW[:, 0:dimensions] + projected_data = np.dot(U.T, self._features) + logging.info("Linear Discriminant Analysis computed on original features") + else: + assert self._features_after_PCA is not None + s, U = scipy.linalg.eigh(covariance_between_classes(self._features_after_PCA, self._class_labels, self._classes), + covariance_within_classes(self._features_after_PCA, self._class_labels, self._classes)) + W = U[:, ::-1][:, :dimensions] + UW, _, _ = np.linalg.svd(W) + U = UW[:, 0:dimensions] + projected_data = 
np.dot(U.T, self._features) + logging.info("Linear Discriminant Analysis computed on features pre-processed with PCA") except AssertionError: logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) - logging.error("The features must be passed in PCA algorithm before!") + logging.error("Features array is empty!") raise AssertionError - return projected_data \ No newline at end of file + return projected_data From 3d8c1beb2cb98b1c389318e4faad2231ac452d3a Mon Sep 17 00:00:00 2001 From: Diego Date: Tue, 28 Mar 2023 22:13:00 +0200 Subject: [PATCH 03/21] Added a TODO list --- machine_learning/dimensionality_reduction.py | 32 +++++++++++--------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index cc81748ad643..696d59892bb5 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -3,15 +3,19 @@ import numpy as np import scipy +# TODO update documentation +# TODO add comments +# TODO update variable names + def column_reshape(input_array: np.ndarray) -> np.ndarray: - """Function to reshape a row Numpy array into a column Numpy array.""" + """Function to reshape a row Numpy array into a column Numpy array""" return input_array.reshape((input_array.size, 1)) def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: - """Function to compute the covariance matrix inside each class.""" + """Function to compute the covariance matrix inside each class""" covariance_sum = None for i in range(classes): @@ -27,7 +31,7 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: - """Function to compute the covariance matrix between multiple classes.""" + """Function to compute the covariance matrix between multiple classes""" general_data_mean = features.mean(1) covariance_sum = None @@ -37,10 +41,10 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes data_mean = data.mean(1) if covariance_sum: covariance_sum += device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) + (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) else: covariance_sum = device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) + (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) return covariance_sum / features.shape[1] @@ -48,11 +52,10 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes class DimensionalityReduction: """Class to apply PCA and LDA techniques for the dataset dimensionality reduction.\n The data structures used are: \n - * self._features: a list of lists that contains all the packets features, where each packet has a list of features. \n - * self._labels: a list that contains the category labels (i.e. device labels) of each list of packet features. \n - * self._devices_number: an int value that specifies how many device models there are in the dataset. \n - * self._features_PCA: a Numpy array with the features mapped with PCA. 
\n - * self._features_LDA: a Numpy array with the features mapped with LDA.""" + * self._features: \n + * self._class_labels: \n + * self._classes: \n + * self._features_after_PCA: """ def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int): logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -62,7 +65,7 @@ def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int) self._features_after_PCA = None def PCA(self, dimensions: int) -> np.ndarray: - """Principal Component Analysis with default filter parameter equal to 10.""" + """Principal Component Analysis with default filter parameter equal to 10""" try: assert any(self._features) is True @@ -83,7 +86,7 @@ def PCA(self, dimensions: int) -> np.ndarray: return projected_data def LDA(self, dimensions: int, pca_features=False) -> np.ndarray: - """Linear Discriminant Analysis with default filter parameter equal to 8.""" + """Linear Discriminant Analysis with default filter parameter equal to 8""" try: if not pca_features: @@ -97,8 +100,9 @@ def LDA(self, dimensions: int, pca_features=False) -> np.ndarray: logging.info("Linear Discriminant Analysis computed on original features") else: assert self._features_after_PCA is not None - s, U = scipy.linalg.eigh(covariance_between_classes(self._features_after_PCA, self._class_labels, self._classes), - covariance_within_classes(self._features_after_PCA, self._class_labels, self._classes)) + s, U = scipy.linalg.eigh( + covariance_between_classes(self._features_after_PCA, self._class_labels, self._classes), + covariance_within_classes(self._features_after_PCA, self._class_labels, self._classes)) W = U[:, ::-1][:, :dimensions] UW, _, _ = np.linalg.svd(W) U = UW[:, 0:dimensions] From 24a68e935c9da302f29c1408dd6303a5bc9284df Mon Sep 17 00:00:00 2001 From: Diego Date: Wed, 29 Mar 2023 16:46:10 +0200 Subject: [PATCH 04/21] Finish code for dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 57 +++++++++++--------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 696d59892bb5..641b7a8effc2 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -3,11 +3,6 @@ import numpy as np import scipy -# TODO update documentation -# TODO add comments -# TODO update variable names - - def column_reshape(input_array: np.ndarray) -> np.ndarray: """Function to reshape a row Numpy array into a column Numpy array""" @@ -21,10 +16,13 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: for i in range(classes): data = features[:, labels == i] data_mean = data.mean(1) + # Centralize the data of class i centered_data = data - column_reshape(data_mean) if covariance_sum: + # If covariance_sum is not None covariance_sum += np.dot(centered_data, centered_data.T) else: + # If covariance_sum is None (i.e. first loop) covariance_sum = np.dot(centered_data, centered_data.T) return covariance_sum / features.shape[1] @@ -40,9 +38,11 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes device_data = data.shape[1] data_mean = data.mean(1) if covariance_sum: + # If covariance_sum is not None covariance_sum += device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) else: + # If covariance_sum is None (i.e. 
first loop) covariance_sum = device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) @@ -52,10 +52,10 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes class DimensionalityReduction: """Class to apply PCA and LDA techniques for the dataset dimensionality reduction.\n The data structures used are: \n - * self._features: \n - * self._class_labels: \n - * self._classes: \n - * self._features_after_PCA: """ + * self._features: contains the features for each object as a matrix \n + * self._class_labels: contains the labels associated with each object \n + * self._classes: the number of classes in the dataset \n + * self._features_after_PCA: will contain the features mapped in a new space after PCA""" def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int): logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -65,17 +65,20 @@ def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int) self._features_after_PCA = None def PCA(self, dimensions: int) -> np.ndarray: - """Principal Component Analysis with default filter parameter equal to 10""" + """Principal Component Analysis with a certain filter parameter""" try: + # Check if the features have been loaded assert any(self._features) is True data_mean = self._features.mean(1) + # Center the dataset centered_data = self._features - np.reshape(data_mean, (data_mean.size, 1)) covariance_matrix = np.dot(centered_data, centered_data.T) / self._features.shape[1] - s, U = np.linalg.eigh(covariance_matrix) + _, eigenvectors = np.linalg.eigh(covariance_matrix) # Take all the columns in the reverse order (-1), and then takes only the first columns - P = U[:, ::-1][:, 0:dimensions] - projected_data = np.dot(P.T, self._features) + filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] + # Project the database on the new space + projected_data = np.dot(filtered_eigenvectors.T, self._features) self._features_after_PCA = copy.deepcopy(projected_data) logging.info("Principal Component Analysis computed") except AssertionError: @@ -86,27 +89,31 @@ def PCA(self, dimensions: int) -> np.ndarray: return projected_data def LDA(self, dimensions: int, pca_features=False) -> np.ndarray: - """Linear Discriminant Analysis with default filter parameter equal to 8""" + """Linear Discriminant Analysis with a certain filter parameter""" try: if not pca_features: + # Check if features have been already loaded assert any(self._features) is True - s, U = scipy.linalg.eigh(covariance_between_classes(self._features, self._class_labels, self._classes), - covariance_within_classes(self._features, self._class_labels, self._classes)) - W = U[:, ::-1][:, :dimensions] - UW, _, _ = np.linalg.svd(W) - U = UW[:, 0:dimensions] - projected_data = np.dot(U.T, self._features) + _, eigenvectors = scipy.linalg.eigh( + covariance_between_classes(self._features, self._class_labels, self._classes), + covariance_within_classes(self._features, self._class_labels, self._classes)) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, self._features) logging.info("Linear Discriminant Analysis computed on original features") else: + # Check if features mapped on PCA have been already loaded assert self._features_after_PCA is not None - s, U = 
scipy.linalg.eigh( + _, eigenvectors = scipy.linalg.eigh( covariance_between_classes(self._features_after_PCA, self._class_labels, self._classes), covariance_within_classes(self._features_after_PCA, self._class_labels, self._classes)) - W = U[:, ::-1][:, :dimensions] - UW, _, _ = np.linalg.svd(W) - U = UW[:, 0:dimensions] - projected_data = np.dot(U.T, self._features) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + svd_matrix_filtered = svd_matrix[:, 0:dimensions] + # Project the database on the new space + projected_data = np.dot(svd_matrix_filtered.T, self._features) logging.info("Linear Discriminant Analysis computed on features pre-processed with PCA") except AssertionError: logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) From eb50e28940a528ea233f382864cc30207ae35f0b Mon Sep 17 00:00:00 2001 From: Diego Date: Fri, 31 Mar 2023 12:16:37 +0200 Subject: [PATCH 05/21] PCA and LDA finished and tested --- machine_learning/dimensionality_reduction.py | 142 ++++++++----------- 1 file changed, 63 insertions(+), 79 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 641b7a8effc2..3533556280a8 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,8 +1,10 @@ -import copy import logging import numpy as np import scipy +logging.basicConfig(level=logging.INFO, format='%(message)s') + + def column_reshape(input_array: np.ndarray) -> np.ndarray: """Function to reshape a row Numpy array into a column Numpy array""" @@ -12,17 +14,17 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray: def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: """Function to compute the covariance matrix inside each class""" - covariance_sum = None + covariance_sum = np.nan for i in range(classes): data = features[:, labels == i] data_mean = data.mean(1) # Centralize the data of class i centered_data = data - column_reshape(data_mean) - if covariance_sum: + if i > 0: # If covariance_sum is not None covariance_sum += np.dot(centered_data, centered_data.T) else: - # If covariance_sum is None (i.e. first loop) + # If covariance_sum is np.nan (i.e. first loop) covariance_sum = np.dot(centered_data, centered_data.T) return covariance_sum / features.shape[1] @@ -32,92 +34,74 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes """Function to compute the covariance matrix between multiple classes""" general_data_mean = features.mean(1) - covariance_sum = None + covariance_sum = np.nan for i in range(classes): data = features[:, labels == i] device_data = data.shape[1] data_mean = data.mean(1) - if covariance_sum: + if i > 0: # If covariance_sum is not None - covariance_sum += device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) + covariance_sum += device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T) else: - # If covariance_sum is None (i.e. first loop) - covariance_sum = device_data * np.dot((column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T)) + # If covariance_sum is np.nan (i.e. 
first loop) + covariance_sum = device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T) return covariance_sum / features.shape[1] -class DimensionalityReduction: - """Class to apply PCA and LDA techniques for the dataset dimensionality reduction.\n - The data structures used are: \n - * self._features: contains the features for each object as a matrix \n - * self._class_labels: contains the labels associated with each object \n - * self._classes: the number of classes in the dataset \n - * self._features_after_PCA: will contain the features mapped in a new space after PCA""" - - def __init__(self, features: np.ndarray, class_labels: np.ndarray, classes: int): - logging.basicConfig(level=logging.INFO, format='%(message)s') - self._features = features - self._class_labels = class_labels - self._classes = classes - self._features_after_PCA = None - - def PCA(self, dimensions: int) -> np.ndarray: - """Principal Component Analysis with a certain filter parameter""" - - try: - # Check if the features have been loaded - assert any(self._features) is True - data_mean = self._features.mean(1) - # Center the dataset - centered_data = self._features - np.reshape(data_mean, (data_mean.size, 1)) - covariance_matrix = np.dot(centered_data, centered_data.T) / self._features.shape[1] - _, eigenvectors = np.linalg.eigh(covariance_matrix) - # Take all the columns in the reverse order (-1), and then takes only the first columns - filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] - # Project the database on the new space - projected_data = np.dot(filtered_eigenvectors.T, self._features) - self._features_after_PCA = copy.deepcopy(projected_data) - logging.info("Principal Component Analysis computed") - except AssertionError: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) - logging.error("Feature array is empty") - raise AssertionError +def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: + """Principal Component Analysis \n + Parameters: \n + * features: the features extracted from the dataset + * labels: the class labels of the features + * dimensions: to filter the projected data for the desired dimension""" + + # Check if the features have been loaded + if features.any(): + data_mean = features.mean(1) + # Center the dataset + centered_data = features - np.reshape(data_mean, (data_mean.size, 1)) + covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1] + _, eigenvectors = np.linalg.eigh(covariance_matrix) + # Take all the columns in the reverse order (-1), and then takes only the first columns + filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] + # Project the database on the new space + projected_data = np.dot(filtered_eigenvectors.T, features) + logging.info("Principal Component Analysis computed") return projected_data - - def LDA(self, dimensions: int, pca_features=False) -> np.ndarray: - """Linear Discriminant Analysis with a certain filter parameter""" - - try: - if not pca_features: - # Check if features have been already loaded - assert any(self._features) is True - _, eigenvectors = scipy.linalg.eigh( - covariance_between_classes(self._features, self._class_labels, self._classes), - covariance_within_classes(self._features, self._class_labels, self._classes)) - filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] - svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) - filtered_svd_matrix = svd_matrix[:, 
0:dimensions] - projected_data = np.dot(filtered_svd_matrix.T, self._features) - logging.info("Linear Discriminant Analysis computed on original features") - else: - # Check if features mapped on PCA have been already loaded - assert self._features_after_PCA is not None - _, eigenvectors = scipy.linalg.eigh( - covariance_between_classes(self._features_after_PCA, self._class_labels, self._classes), - covariance_within_classes(self._features_after_PCA, self._class_labels, self._classes)) - filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] - svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) - svd_matrix_filtered = svd_matrix[:, 0:dimensions] - # Project the database on the new space - projected_data = np.dot(svd_matrix_filtered.T, self._features) - logging.info("Linear Discriminant Analysis computed on features pre-processed with PCA") - except AssertionError: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) - logging.error("Features array is empty!") - raise AssertionError + else: + logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.error("Dataset empty") + raise AssertionError + + +def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: + """Linear Discriminant Analysis \n + Parameters: \n + * features: the features extracted from the dataset + * labels: the class labels of the features + * classes: the number of classes present in the dataset + * dimensions: to filter the projected data for the desired dimension""" + + # Check if the dimension desired is less than the number of classes + assert classes > dimensions + + # Check if features have been already loaded + if features.any: + _, eigenvectors = scipy.linalg.eigh( + covariance_between_classes(features, labels, classes), + covariance_within_classes(features, labels, classes)) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, features) + logging.info("Linear Discriminant Analysis computed") return projected_data + else: + logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.error("Dataset empty") + raise AssertionError From 0eb4e10a2814e3665bf3ff85b2ac2fb6201d66f2 Mon Sep 17 00:00:00 2001 From: Diego Date: Fri, 31 Mar 2023 12:44:39 +0200 Subject: [PATCH 06/21] Add Copyright --- machine_learning/dimensionality_reduction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3533556280a8..2e4c200b4eb9 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,3 +1,5 @@ +# Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub + import logging import numpy as np import scipy From 5509e7dc9a5bc977926286d8ee3fa31cbaacca08 Mon Sep 17 00:00:00 2001 From: Diego Date: Fri, 31 Mar 2023 12:51:11 +0200 Subject: [PATCH 07/21] Add links to Wikipedia --- machine_learning/dimensionality_reduction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 2e4c200b4eb9..db606ed2ffc7 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -54,7 +54,8 @@ def 
covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: - """Principal Component Analysis \n + """Principal Component Analysis. \n + For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis \n Parameters: \n * features: the features extracted from the dataset * labels: the class labels of the features @@ -81,8 +82,9 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: - """Linear Discriminant Analysis \n - Parameters: \n + """Linear Discriminant Analysis. \n + For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis \n + Parameters: \n * features: the features extracted from the dataset * labels: the class labels of the features * classes: the number of classes present in the dataset From 041aa1dec54cd7eba43eb05c2d451aa5808e8a1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 10:53:19 +0000 Subject: [PATCH 08/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 33 +++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index db606ed2ffc7..92165b515815 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -4,7 +4,7 @@ import numpy as np import scipy -logging.basicConfig(level=logging.INFO, format='%(message)s') +logging.basicConfig(level=logging.INFO, format="%(message)s") def column_reshape(input_array: np.ndarray) -> np.ndarray: @@ -13,7 +13,9 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray: return input_array.reshape((input_array.size, 1)) -def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def covariance_within_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: """Function to compute the covariance matrix inside each class""" covariance_sum = np.nan @@ -32,7 +34,9 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: return covariance_sum / features.shape[1] -def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def covariance_between_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: """Function to compute the covariance matrix between multiple classes""" general_data_mean = features.mean(1) @@ -43,12 +47,16 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes data_mean = data.mean(1) if i > 0: # If covariance_sum is not None - covariance_sum += device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + covariance_sum += device_data * np.dot( + column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T, + ) else: # If covariance_sum is np.nan (i.e. 
first loop) - covariance_sum = device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + covariance_sum = device_data * np.dot( + column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T, + ) return covariance_sum / features.shape[1] @@ -76,12 +84,14 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError -def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: +def LDA( + features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int +) -> np.ndarray: """Linear Discriminant Analysis. \n For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis \n Parameters: \n @@ -97,7 +107,8 @@ def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) if features.any: _, eigenvectors = scipy.linalg.eigh( covariance_between_classes(features, labels, classes), - covariance_within_classes(features, labels, classes)) + covariance_within_classes(features, labels, classes), + ) filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) filtered_svd_matrix = svd_matrix[:, 0:dimensions] @@ -106,6 +117,6 @@ def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError From 7e1fc3560248de8c527ece9cebed4c8e039d1703 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 31 Mar 2023 19:53:50 +0200 Subject: [PATCH 09/21] Apply suggestions from code review --- machine_learning/dimensionality_reduction.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 92165b515815..5c78c522c0a8 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,6 +1,7 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub import logging + import numpy as np import scipy @@ -61,9 +62,9 @@ def covariance_between_classes( return covariance_sum / features.shape[1] -def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: +def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: """Principal Component Analysis. 
\n - For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis \n + For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis Parameters: \n * features: the features extracted from the dataset * labels: the class labels of the features @@ -76,7 +77,8 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: centered_data = features - np.reshape(data_mean, (data_mean.size, 1)) covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1] _, eigenvectors = np.linalg.eigh(covariance_matrix) - # Take all the columns in the reverse order (-1), and then takes only the first columns + # Take all the columns in the reverse order (-1), and then takes only the first + # columns filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] # Project the database on the new space projected_data = np.dot(filtered_eigenvectors.T, features) @@ -89,11 +91,11 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: raise AssertionError -def LDA( +def linear_discriminant_analysis( features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int ) -> np.ndarray: """Linear Discriminant Analysis. \n - For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis \n + For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis Parameters: \n * features: the features extracted from the dataset * labels: the class labels of the features From 43e1f53d02d8673cb1f61849e64b6a698d447d47 Mon Sep 17 00:00:00 2001 From: Diego Date: Fri, 31 Mar 2023 21:14:30 +0200 Subject: [PATCH 10/21] Reformat file --- machine_learning/dimensionality_reduction.py | 57 +++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index db606ed2ffc7..3d86865603a0 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,19 +1,19 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub -import logging +import logging # noqa: I001 import numpy as np import scipy logging.basicConfig(level=logging.INFO, format='%(message)s') -def column_reshape(input_array: np.ndarray) -> np.ndarray: +def _column_reshape(input_array: np.ndarray) -> np.ndarray: """Function to reshape a row Numpy array into a column Numpy array""" return input_array.reshape((input_array.size, 1)) -def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def _covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: """Function to compute the covariance matrix inside each class""" covariance_sum = np.nan @@ -21,7 +21,7 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: data = features[:, labels == i] data_mean = data.mean(1) # Centralize the data of class i - centered_data = data - column_reshape(data_mean) + centered_data = data - _column_reshape(data_mean) if i > 0: # If covariance_sum is not None covariance_sum += np.dot(centered_data, centered_data.T) @@ -32,7 +32,7 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: return covariance_sum / features.shape[1] -def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def _covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: """Function to compute the 
covariance matrix between multiple classes""" general_data_mean = features.mean(1) @@ -43,23 +43,25 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes data_mean = data.mean(1) if i > 0: # If covariance_sum is not None - covariance_sum += device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + covariance_sum += device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean), + (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T) else: # If covariance_sum is np.nan (i.e. first loop) - covariance_sum = device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean), - (column_reshape(data_mean) - column_reshape(general_data_mean)).T) + covariance_sum = device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean), + (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T) return covariance_sum / features.shape[1] -def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: - """Principal Component Analysis. \n - For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis \n - Parameters: \n - * features: the features extracted from the dataset - * labels: the class labels of the features - * dimensions: to filter the projected data for the desired dimension""" +def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: + """ + Principal Component Analysis. + + For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis. + Parameters: + * features: the features extracted from the dataset + * dimensions: to filter the projected data for the desired dimension + """ # Check if the features have been loaded if features.any(): @@ -81,14 +83,17 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray: raise AssertionError -def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: - """Linear Discriminant Analysis. \n - For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis \n - Parameters: \n - * features: the features extracted from the dataset - * labels: the class labels of the features - * classes: the number of classes present in the dataset - * dimensions: to filter the projected data for the desired dimension""" +def linear_discriminant_analysis(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: + """ + Linear Discriminant Analysis. + + For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis. 
+ Parameters: + * features: the features extracted from the dataset + * labels: the class labels of the features + * classes: the number of classes present in the dataset + * dimensions: to filter the projected data for the desired dimension + """ # Check if the dimension desired is less than the number of classes assert classes > dimensions @@ -96,8 +101,8 @@ def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) # Check if features have been already loaded if features.any: _, eigenvectors = scipy.linalg.eigh( - covariance_between_classes(features, labels, classes), - covariance_within_classes(features, labels, classes)) + _covariance_between_classes(features, labels, classes), + _covariance_within_classes(features, labels, classes)) filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) filtered_svd_matrix = svd_matrix[:, 0:dimensions] From 19727cf6d686ce608848671c0ca8a71210a60e46 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 19:17:36 +0000 Subject: [PATCH 11/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 33 +++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3d86865603a0..bfd4939aa6de 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -4,7 +4,7 @@ import numpy as np import scipy -logging.basicConfig(level=logging.INFO, format='%(message)s') +logging.basicConfig(level=logging.INFO, format="%(message)s") def _column_reshape(input_array: np.ndarray) -> np.ndarray: @@ -13,7 +13,9 @@ def _column_reshape(input_array: np.ndarray) -> np.ndarray: return input_array.reshape((input_array.size, 1)) -def _covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def _covariance_within_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: """Function to compute the covariance matrix inside each class""" covariance_sum = np.nan @@ -32,7 +34,9 @@ def _covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes return covariance_sum / features.shape[1] -def _covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray: +def _covariance_between_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: """Function to compute the covariance matrix between multiple classes""" general_data_mean = features.mean(1) @@ -43,12 +47,16 @@ def _covariance_between_classes(features: np.ndarray, labels: np.ndarray, classe data_mean = data.mean(1) if i > 0: # If covariance_sum is not None - covariance_sum += device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean), - (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T) + covariance_sum += device_data * np.dot( + _column_reshape(data_mean) - _column_reshape(general_data_mean), + (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T, + ) else: # If covariance_sum is np.nan (i.e. 
first loop) - covariance_sum = device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean), - (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T) + covariance_sum = device_data * np.dot( + _column_reshape(data_mean) - _column_reshape(general_data_mean), + (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T, + ) return covariance_sum / features.shape[1] @@ -78,12 +86,14 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError -def linear_discriminant_analysis(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray: +def linear_discriminant_analysis( + features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int +) -> np.ndarray: """ Linear Discriminant Analysis. @@ -102,7 +112,8 @@ def linear_discriminant_analysis(features: np.ndarray, labels: np.ndarray, class if features.any: _, eigenvectors = scipy.linalg.eigh( _covariance_between_classes(features, labels, classes), - _covariance_within_classes(features, labels, classes)) + _covariance_within_classes(features, labels, classes), + ) filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) filtered_svd_matrix = svd_matrix[:, 0:dimensions] @@ -111,6 +122,6 @@ def linear_discriminant_analysis(features: np.ndarray, labels: np.ndarray, class return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError From d2e483397d5b353b255604d46cc9c773bed980e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 22:56:32 +0000 Subject: [PATCH 12/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 6e54f1054989..f5b9e3553d26 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -13,7 +13,7 @@ import numpy as np from scipy.linalg import eigh -logging.basicConfig(level=logging.INFO, format='%(message)s') +logging.basicConfig(level=logging.INFO, format="%(message)s") def column_reshape(input_array: np.ndarray) -> np.ndarray: @@ -121,7 +121,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError @@ -164,12 +164,12 @@ def linear_discriminant_analysis( return projected_data else: - logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True) + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError -def test_linear_discriminant_analysis(): +def test_linear_discriminant_analysis(): # Create dummy dataset with 2 classes and 3 features 
features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) @@ -195,6 +195,7 @@ def test_linear_discriminant_analysis(): else: raise AssertionError("Did not raise AssertionError for dimensions > classes") + if __name__ == "__main__": import doctest From 56a21315e621b981796b9f3ad7ffc2c0f21e863c Mon Sep 17 00:00:00 2001 From: Diego Date: Sat, 1 Apr 2023 00:59:56 +0200 Subject: [PATCH 13/21] Added None return to test function --- machine_learning/dimensionality_reduction.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index f5b9e3553d26..0f1ac1c61810 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -169,7 +169,7 @@ def linear_discriminant_analysis( raise AssertionError -def test_linear_discriminant_analysis(): +def test_linear_discriminant_analysis() -> None: # Create dummy dataset with 2 classes and 3 features features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) @@ -195,7 +195,6 @@ def test_linear_discriminant_analysis(): else: raise AssertionError("Did not raise AssertionError for dimensions > classes") - if __name__ == "__main__": import doctest From 8f6323b1dae13a8e4ef9bb156788079e1ad7fd01 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 23:00:34 +0000 Subject: [PATCH 14/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 0f1ac1c61810..c4f57443e460 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -195,6 +195,7 @@ def test_linear_discriminant_analysis() -> None: else: raise AssertionError("Did not raise AssertionError for dimensions > classes") + if __name__ == "__main__": import doctest From 5328195b3113526efc44528452272de39a7f7f3c Mon Sep 17 00:00:00 2001 From: Diego Date: Sat, 1 Apr 2023 09:25:58 +0200 Subject: [PATCH 15/21] Remove the word "here" --- machine_learning/dimensionality_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index c4f57443e460..a84616e81f80 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -94,7 +94,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd """ Principal Component Analysis. - For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis. + For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis. Parameters: * features: the features extracted from the dataset * dimensions: to filter the projected data for the desired dimension @@ -132,7 +132,7 @@ def linear_discriminant_analysis( """ Linear Discriminant Analysis. - For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis. + For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis. 
Parameters: * features: the features extracted from the dataset * labels: the class labels of the features From 38a40191d9764300482e7932f32d0c89dda0503d Mon Sep 17 00:00:00 2001 From: Diego Date: Sat, 1 Apr 2023 09:49:17 +0200 Subject: [PATCH 16/21] Remove doctest from linear_discriminant_analysis --- machine_learning/dimensionality_reduction.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index a84616e81f80..d8bcbd11ee64 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -138,13 +138,6 @@ def linear_discriminant_analysis( * labels: the class labels of the features * classes: the number of classes present in the dataset * dimensions: to filter the projected data for the desired dimension - >>> features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) - >>> labels = np.array([0, 2, 0, 1, 1]) - >>> classes = 3 - >>> dimensions = 2 - >>> linear_discriminant_analysis(features, labels, classes, dimensions) - array([[0.70710678, 0.70710678, 0.70710678, 0.70710678, 0.70710678], - [3.60806823, 5.10257902, 6.59708982, 8.09160061, 9.58611141]]) """ # Check if the dimension desired is less than the number of classes From f2d329346fc3202c14a2a9465d77904b9ae66c87 Mon Sep 17 00:00:00 2001 From: Diego Date: Sat, 1 Apr 2023 09:56:10 +0200 Subject: [PATCH 17/21] Fixed doctest for PCA --- machine_learning/dimensionality_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index d8bcbd11ee64..3489ff35caa2 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -102,7 +102,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd >>> dimensions = 2 >>> principal_component_analysis(features, dimensions) array([[ 6.92820323, 8.66025404, 10.39230485], - [ 3. , 3. , 3. ]]) + [-2.99984328, -2.99984328, -2.99984328]]) """ # Check if the features have been loaded From 7ed93e26ea021eaf0b41b57f2336351f32f55148 Mon Sep 17 00:00:00 2001 From: Diego Date: Sat, 1 Apr 2023 10:01:41 +0200 Subject: [PATCH 18/21] Fixed doctest for PCA pt.2 --- machine_learning/dimensionality_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3489ff35caa2..d8bcbd11ee64 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -102,7 +102,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd >>> dimensions = 2 >>> principal_component_analysis(features, dimensions) array([[ 6.92820323, 8.66025404, 10.39230485], - [-2.99984328, -2.99984328, -2.99984328]]) + [ 3. , 3. , 3. 
]]) """ # Check if the features have been loaded From 85f1730f1be73132379cfb63f4b0796340a4dc1c Mon Sep 17 00:00:00 2001 From: Diego Date: Sun, 2 Apr 2023 19:15:28 +0200 Subject: [PATCH 19/21] Add test for principal_component_analysis --- machine_learning/dimensionality_reduction.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index d8bcbd11ee64..e0adf502e13b 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -29,7 +29,7 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray: def covariance_within_classes( - features: np.ndarray, labels: np.ndarray, classes: int + features: np.ndarray, labels: np.ndarray, classes: int ) -> np.ndarray: """Function to compute the covariance matrix inside each class. >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) @@ -57,7 +57,7 @@ def covariance_within_classes( def covariance_between_classes( - features: np.ndarray, labels: np.ndarray, classes: int + features: np.ndarray, labels: np.ndarray, classes: int ) -> np.ndarray: """Function to compute the covariance matrix between multiple classes >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]]) @@ -98,11 +98,6 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd Parameters: * features: the features extracted from the dataset * dimensions: to filter the projected data for the desired dimension - >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - >>> dimensions = 2 - >>> principal_component_analysis(features, dimensions) - array([[ 6.92820323, 8.66025404, 10.39230485], - [ 3. , 3. , 3. ]]) """ # Check if the features have been loaded @@ -113,7 +108,6 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1] _, eigenvectors = np.linalg.eigh(covariance_matrix) # Take all the columns in the reverse order (-1), and then takes only the first - # columns filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] # Project the database on the new space projected_data = np.dot(filtered_eigenvectors.T, features) @@ -127,7 +121,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd def linear_discriminant_analysis( - features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int + features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int ) -> np.ndarray: """ Linear Discriminant Analysis. 
From 7f524e160ff964e562ecf8639161dfcebe378377 Mon Sep 17 00:00:00 2001
From: Diego
Date: Sat, 15 Apr 2023 15:45:35 +0200
Subject: [PATCH 20/21] Updated tests

---
 machine_learning/dimensionality_reduction.py | 45 ++++++++++----------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py
index e0adf502e13b..788bd738ac92 100644
--- a/machine_learning/dimensionality_reduction.py
+++ b/machine_learning/dimensionality_reduction.py
@@ -11,6 +11,7 @@

 import logging
 import numpy as np
+import pytest
 from scipy.linalg import eigh

 logging.basicConfig(level=logging.INFO, format="%(message)s")
@@ -29,7 +30,7 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray:


 def covariance_within_classes(
-    features: np.ndarray, labels: np.ndarray, classes: int
+    features: np.ndarray, labels: np.ndarray, classes: int
 ) -> np.ndarray:
     """Function to compute the covariance matrix inside each class.
     >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
@@ -57,7 +58,7 @@ def covariance_within_classes(


 def covariance_between_classes(
-    features: np.ndarray, labels: np.ndarray, classes: int
+    features: np.ndarray, labels: np.ndarray, classes: int
 ) -> np.ndarray:
     """Function to compute the covariance matrix between multiple classes
     >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
@@ -98,6 +99,8 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd
     Parameters:
     * features: the features extracted from the dataset
     * dimensions: to filter the projected data for the desired dimension
+
+    >>> test_principal_component_analysis()
     """

     # Check if the features have been loaded
@@ -121,7 +124,7 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd


 def linear_discriminant_analysis(
-    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
+    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
 ) -> np.ndarray:
     """
     Linear Discriminant Analysis.
@@ -132,6 +135,8 @@ def linear_discriminant_analysis( * labels: the class labels of the features * classes: the number of classes present in the dataset * dimensions: to filter the projected data for the desired dimension + + >>> test_linear_discriminant_analysis() """ # Check if the dimension desired is less than the number of classes @@ -163,32 +168,26 @@ def test_linear_discriminant_analysis() -> None: classes = 2 dimensions = 2 - projected_data = linear_discriminant_analysis(features, labels, classes, dimensions) - - # Assert that the shape of the projected data is correct - assert projected_data.shape == (dimensions, features.shape[1]) - - # Assert that the projected data is a numpy array - assert isinstance(projected_data, np.ndarray) - - # Assert that the projected data is not empty - assert projected_data.any() - # Assert that the function raises an AssertionError if dimensions > classes - try: - projected_data = linear_discriminant_analysis(features, labels, classes, 3) - except AssertionError: - pass - else: - raise AssertionError("Did not raise AssertionError for dimensions > classes") + with pytest.raises(AssertionError) as error_info: + projected_data = linear_discriminant_analysis(features, labels, classes, dimensions) + if isinstance(projected_data, np.ndarray): + raise AssertionError( + "Did not raise AssertionError for dimensions > classes" + ) + assert error_info.type is AssertionError def test_principal_component_analysis() -> None: features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3., 3., 3.]]) - output = principal_component_analysis(features, dimensions) - assert np.allclose(expected_output, output), f"Expected {expected_output}, but got {output}" + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + + with pytest.raises(AssertionError) as error_info: + output = principal_component_analysis(features, dimensions) + if not np.allclose(expected_output, output): + raise AssertionError + assert error_info.type is AssertionError if __name__ == "__main__": From 6521ef1d287d0acf74c995d6ee633eec6e6dc610 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 15 Apr 2023 13:46:52 +0000 Subject: [PATCH 21/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 788bd738ac92..d2046f81af04 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -170,7 +170,9 @@ def test_linear_discriminant_analysis() -> None: # Assert that the function raises an AssertionError if dimensions > classes with pytest.raises(AssertionError) as error_info: - projected_data = linear_discriminant_analysis(features, labels, classes, dimensions) + projected_data = linear_discriminant_analysis( + features, labels, classes, dimensions + ) if isinstance(projected_data, np.ndarray): raise AssertionError( "Did not raise AssertionError for dimensions > classes"
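One behavioural note on the final state of the tests: as written, test_principal_component_analysis passes only when the computed projection does not match expected_output, because pytest.raises requires an AssertionError to be raised inside the with block; on a platform that reproduces the doctest values exactly, pytest.raises itself fails the test. The plain np.allclose assertion from PATCH 19 remains the conventional shape for the happy path.

For completeness, a usage sketch of the module after the full series. The data below is synthetic, chosen so the within-class covariance passed to scipy.linalg.eigh is nonsingular, and the import assumes the repository root is on sys.path:

import numpy as np

from machine_learning.dimensionality_reduction import (
    linear_discriminant_analysis,
    principal_component_analysis,
)

# One column per sample, one row per feature, as in the module's docstrings.
features = np.array(
    [
        [1.0, 2.0, 3.0, 6.0, 7.0, 8.0],
        [2.0, 1.0, 3.5, 6.5, 7.0, 9.0],
        [0.5, 2.5, 2.0, 5.0, 8.0, 7.5],
    ]
)
labels = np.array([0, 0, 0, 1, 1, 1])

# Keep the two strongest principal directions of the 3 features.
pca_projection = principal_component_analysis(features, dimensions=2)
print(pca_projection.shape)  # (2, 6)

# LDA requires dimensions strictly below the number of classes.
lda_projection = linear_discriminant_analysis(
    features, labels, classes=2, dimensions=1
)
print(lda_projection.shape)  # (1, 6)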