Dimensionality reduction #8590

Merged: 22 commits, Apr 16, 2023

Commits
babd745
First commit for dimensionality_reduction.py
Diegomangasco Mar 28, 2023
5476b7d
Some bug fixies
Diegomangasco Mar 28, 2023
3d8c1be
Added a TODO list
Diegomangasco Mar 28, 2023
24a68e9
Finish code for dimensionality_reduction.py
Diegomangasco Mar 29, 2023
eb50e28
PCA and LDA finished and tested
Diegomangasco Mar 31, 2023
0eb4e10
Add Copyright
Diegomangasco Mar 31, 2023
5509e7d
Add links to Wikipedia
Diegomangasco Mar 31, 2023
041aa1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
7e1fc35
Apply suggestions from code review
cclauss Mar 31, 2023
43e1f53
Reformat file
Diegomangasco Mar 31, 2023
19727cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
6000941
Added tests
Diegomangasco Mar 31, 2023
d2e4833
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
56a2131
Added None return to test function
Diegomangasco Mar 31, 2023
8f6323b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
5328195
Remove the word "here"
Diegomangasco Apr 1, 2023
38a4019
Remove doctest from linear_discriminant_analysis
Diegomangasco Apr 1, 2023
f2d3293
Fixed doctest for PCA
Diegomangasco Apr 1, 2023
7ed93e2
Fixed doctest for PCA pt.2
Diegomangasco Apr 1, 2023
85f1730
Add test for principal_component_analysis
Diegomangasco Apr 2, 2023
7f524e1
Updated tests
Diegomangasco Apr 15, 2023
6521ef1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 15, 2023
machine_learning/dimensionality_reduction.py: 57 changes (31 additions, 26 deletions)
@@ -1,27 +1,27 @@
 # Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub

-import logging
+import logging  # noqa: I001
 import numpy as np
 import scipy

 logging.basicConfig(level=logging.INFO, format='%(message)s')


-def column_reshape(input_array: np.ndarray) -> np.ndarray:
+def _column_reshape(input_array: np.ndarray) -> np.ndarray:
     """Function to reshape a row Numpy array into a column Numpy array"""

     return input_array.reshape((input_array.size, 1))


-def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray:
+def _covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray:
     """Function to compute the covariance matrix inside each class"""

     covariance_sum = np.nan
     for i in range(classes):
         data = features[:, labels == i]
         data_mean = data.mean(1)
         # Centralize the data of class i
-        centered_data = data - column_reshape(data_mean)
+        centered_data = data - _column_reshape(data_mean)
         if i > 0:
             # If covariance_sum is not None
             covariance_sum += np.dot(centered_data, centered_data.T)
@@ -32,7 +32,7 @@ def covariance_within_classes(features: np.ndarray, labels: np.ndarray, classes:
     return covariance_sum / features.shape[1]


-def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray:
+def _covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes: int) -> np.ndarray:
     """Function to compute the covariance matrix between multiple classes"""

     general_data_mean = features.mean(1)
@@ -43,23 +43,25 @@ def covariance_between_classes(features: np.ndarray, labels: np.ndarray, classes:
         data_mean = data.mean(1)
         if i > 0:
             # If covariance_sum is not None
-            covariance_sum += device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean),
-                                                   (column_reshape(data_mean) - column_reshape(general_data_mean)).T)
+            covariance_sum += device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean),
+                                                   (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T)
         else:
             # If covariance_sum is np.nan (i.e. first loop)
-            covariance_sum = device_data * np.dot(column_reshape(data_mean) - column_reshape(general_data_mean),
-                                                  (column_reshape(data_mean) - column_reshape(general_data_mean)).T)
+            covariance_sum = device_data * np.dot(_column_reshape(data_mean) - _column_reshape(general_data_mean),
+                                                  (_column_reshape(data_mean) - _column_reshape(general_data_mean)).T)

     return covariance_sum / features.shape[1]


-def PCA(features: np.ndarray, dimensions: int) -> np.ndarray:
-    """Principal Component Analysis. \n
-    For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis \n
-    Parameters: \n
-    * features: the features extracted from the dataset
-    * labels: the class labels of the features
-    * dimensions: to filter the projected data for the desired dimension"""
+def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
+    """
+    Principal Component Analysis.
+
+    For more details, see here: https://en.wikipedia.org/wiki/Principal_component_analysis.
+    Parameters:
+    * features: the features extracted from the dataset
+    * dimensions: to filter the projected data for the desired dimension
+    """

     # Check if the features have been loaded
     if features.any():
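
As a usage illustration for the renamed function, a minimal sketch (not part of the diff): the (features, samples) orientation and the expected output shape are inferred from the code's use of mean(1) and features.shape[1], and the import path assumes the script runs from the repository root.

import numpy as np

from machine_learning.dimensionality_reduction import principal_component_analysis

# Hypothetical toy data: 5 features (rows), 20 samples (columns).
rng = np.random.default_rng(0)
features = rng.normal(size=(5, 20))

# Project the samples onto the top 2 principal directions.
projected = principal_component_analysis(features, dimensions=2)
print(projected.shape)  # should be (2, 20): 2 components, 20 samples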
@@ -81,23 +83,26 @@ def PCA(features: np.ndarray, dimensions: int) -> np.ndarray:
         raise AssertionError


-def LDA(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray:
-    """Linear Discriminant Analysis. \n
-    For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis \n
-    Parameters: \n
-    * features: the features extracted from the dataset
-    * labels: the class labels of the features
-    * classes: the number of classes present in the dataset
-    * dimensions: to filter the projected data for the desired dimension"""
+def linear_discriminant_analysis(features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int) -> np.ndarray:
+    """
+    Linear Discriminant Analysis.
+
+    For more details, see here: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
+    Parameters:
+    * features: the features extracted from the dataset
+    * labels: the class labels of the features
+    * classes: the number of classes present in the dataset
+    * dimensions: to filter the projected data for the desired dimension
+    """

     # Check if the dimension desired is less than the number of classes
     assert classes > dimensions

     # Check if features have been already loaded
     if features.any:
         _, eigenvectors = scipy.linalg.eigh(
-            covariance_between_classes(features, labels, classes),
-            covariance_within_classes(features, labels, classes))
+            _covariance_between_classes(features, labels, classes),
+            _covariance_within_classes(features, labels, classes))
         filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
         svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
         filtered_svd_matrix = svd_matrix[:, 0:dimensions]
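
A matching minimal sketch for linear_discriminant_analysis, with the same caveats as above: it assumes integer class labels 0..classes-1 (the helpers select classes with labels == i) and honors the assert classes > dimensions guard. Under the hood, scipy.linalg.eigh(a, b) solves the generalized eigenvalue problem between the between-class and within-class covariance matrices.

import numpy as np

from machine_learning.dimensionality_reduction import linear_discriminant_analysis

rng = np.random.default_rng(1)

# Hypothetical toy data: 4 features, 30 samples in 3 classes (labels 0, 1, 2).
features = rng.normal(size=(4, 30))
labels = np.repeat(np.arange(3), 10)

# dimensions must be strictly less than classes, per the assert in the function.
projected = linear_discriminant_analysis(features, labels, classes=3, dimensions=2)
print(projected.shape)  # should be (2, 30): 2 discriminant axes, 30 samples

With enough samples per class, the within-class covariance is positive definite, which scipy.linalg.eigh requires for its second argument; degenerate inputs would make the generalized eigenproblem fail.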