Dimensionality reduction #8590

Merged: 22 commits, merged on Apr 16, 2023

Changes from 7 commits

Commits (22 total)
babd745
First commit for dimensionality_reduction.py
Diegomangasco Mar 28, 2023
5476b7d
Some bug fixies
Diegomangasco Mar 28, 2023
3d8c1be
Added a TODO list
Diegomangasco Mar 28, 2023
24a68e9
Finish code for dimensionality_reduction.py
Diegomangasco Mar 29, 2023
eb50e28
PCA and LDA finished and tested
Diegomangasco Mar 31, 2023
0eb4e10
Add Copyright
Diegomangasco Mar 31, 2023
5509e7d
Add links to Wikipedia
Diegomangasco Mar 31, 2023
041aa1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
7e1fc35
Apply suggestions from code review
cclauss Mar 31, 2023
43e1f53
Reformat file
Diegomangasco Mar 31, 2023
19727cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
6000941
Added tests
Diegomangasco Mar 31, 2023
d2e4833
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
56a2131
Added None return to test function
Diegomangasco Mar 31, 2023
8f6323b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
5328195
Remove the word "here"
Diegomangasco Apr 1, 2023
38a4019
Remove doctest from linear_discriminant_analysis
Diegomangasco Apr 1, 2023
f2d3293
Fixed doctest for PCA
Diegomangasco Apr 1, 2023
7ed93e2
Fixed doctest for PCA pt.2
Diegomangasco Apr 1, 2023
85f1730
Add test for principal_component_analysis
Diegomangasco Apr 2, 2023
7f524e1
Updated tests
Diegomangasco Apr 15, 2023
6521ef1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 15, 2023
111 changes: 111 additions & 0 deletions machine_learning/dimensionality_reduction.py
@@ -0,0 +1,111 @@
# Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub

import logging

import numpy as np
import scipy.linalg

logging.basicConfig(level=logging.INFO, format="%(message)s")


def column_reshape(input_array: np.ndarray) -> np.ndarray:
    """Reshape a row Numpy array into a column Numpy array."""

    return input_array.reshape((input_array.size, 1))


def covariance_within_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the within-class covariance matrix of the dataset."""

    covariance_sum = np.zeros((features.shape[0], features.shape[0]))
    for i in range(classes):
        data = features[:, labels == i]
        data_mean = data.mean(1)
        # Center the data of class i around its mean
        centered_data = data - column_reshape(data_mean)
        # Accumulate the scatter matrix of class i
        covariance_sum += np.dot(centered_data, centered_data.T)

    return covariance_sum / features.shape[1]


def covariance_between_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the between-class covariance matrix of the dataset."""

    general_data_mean = features.mean(1)
    covariance_sum = np.zeros((features.shape[0], features.shape[0]))
    for i in range(classes):
        data = features[:, labels == i]
        # Number of samples belonging to class i
        device_data = data.shape[1]
        data_mean = data.mean(1)
        mean_difference = column_reshape(data_mean) - column_reshape(general_data_mean)
        # Weight the mean-difference scatter by the size of class i
        covariance_sum += device_data * np.dot(mean_difference, mean_difference.T)

    return covariance_sum / features.shape[1]


def PCA(features: np.ndarray, dimensions: int) -> np.ndarray:
    """Principal Component Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis

    Parameters:
    * features: the features extracted from the dataset
    * dimensions: the number of dimensions to project the data onto
    """

    # Check if the features have been loaded
    if features.any():
        data_mean = features.mean(1)
        # Center the dataset
        centered_data = features - column_reshape(data_mean)
        covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
        _, eigenvectors = np.linalg.eigh(covariance_matrix)
        # Reverse the column order so the eigenvectors are sorted by decreasing
        # eigenvalue, then keep only the first `dimensions` columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
        # Project the dataset onto the new space
        projected_data = np.dot(filtered_eigenvectors.T, features)
        logging.info("Principal Component Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError


def LDA(
    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
    """Linear Discriminant Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis

    Parameters:
    * features: the features extracted from the dataset
    * labels: the class labels of the features
    * classes: the number of classes present in the dataset
    * dimensions: the number of dimensions to project the data onto
    """

    # Check that the desired dimension is less than the number of classes
    assert classes > dimensions

    # Check if the features have been loaded
    if features.any():
        # Solve the generalized eigenvalue problem
        # between-class covariance * v = eigenvalue * within-class covariance * v
        _, eigenvectors = scipy.linalg.eigh(
            covariance_between_classes(features, labels, classes),
            covariance_within_classes(features, labels, classes),
        )
        # Reverse the column order so the eigenvectors are sorted by decreasing
        # eigenvalue, then keep only the first `dimensions` columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
        # Orthonormalize the projection directions via SVD
        svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
        filtered_svd_matrix = svd_matrix[:, 0:dimensions]
        projected_data = np.dot(filtered_svd_matrix.T, features)
        logging.info("Linear Discriminant Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError
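
For context, a minimal usage sketch of how PCA and LDA might be exercised, in the spirit of the tests added later in this PR. It is not part of the diff: the synthetic dataset, the seed, and the shapes are illustrative assumptions, with features stored as rows and samples as columns as the functions above expect.

# Hypothetical usage sketch (not part of the PR): two Gaussian classes with
# shifted means, 4 features (rows) x 20 samples (columns).
import numpy as np

rng = np.random.default_rng(42)
class_0 = rng.normal(loc=0.0, scale=1.0, size=(4, 10))
class_1 = rng.normal(loc=5.0, scale=1.0, size=(4, 10))
features = np.hstack((class_0, class_1))  # shape (4, 20)
labels = np.array([0] * 10 + [1] * 10)

projected_pca = PCA(features, dimensions=2)
print(projected_pca.shape)  # (2, 20)

# With 2 classes, LDA can project onto at most classes - 1 = 1 dimension
projected_lda = LDA(features, labels, classes=2, dimensions=1)
print(projected_lda.shape)  # (1, 20)

The `classes > dimensions` assertion in LDA reflects a property of the method itself: the between-class covariance matrix has rank at most classes - 1, so at most that many discriminant directions carry information.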