forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdimensionality_reduction.py
122 lines (100 loc) · 4.7 KB
/
dimensionality_reduction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub
import logging
import numpy as np
import scipy
# Configure the root logger at import time: INFO level, bare message text only.
logging.basicConfig(level=logging.INFO, format="%(message)s")
def column_reshape(input_array: np.ndarray) -> np.ndarray:
    """Return the elements of a Numpy array arranged as a single column.

    The result always has shape (input_array.size, 1), whatever the shape
    of the input.
    """
    column_shape = (input_array.size, 1)
    return input_array.reshape(column_shape)
def covariance_within_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the within-class covariance matrix.

    Parameters:
    * features: 2-D array whose columns are the samples (one row per feature)
    * labels: 1-D array holding the integer class label of each sample column
    * classes: number of classes; the labels 0 .. classes - 1 are visited

    Returns:
    A (n_features, n_features) matrix: the scatter of every class around its
    own mean, summed over the classes and divided by the total number of
    samples (features.shape[1]).
    """
    n_features = features.shape[0]
    # Accumulate into a zero matrix so that no "first iteration" special case
    # is needed and the result is a matrix even when no class is visited.
    covariance_sum = np.zeros((n_features, n_features))
    for i in range(classes):
        data = features[:, labels == i]
        if data.shape[1] == 0:
            # A class without samples has no scatter; skipping it also avoids
            # NumPy's "mean of empty slice" warning.
            continue
        # Centralize the data of class i around its own mean
        centered_data = data - data.mean(axis=1, keepdims=True)
        covariance_sum += centered_data @ centered_data.T
    return covariance_sum / features.shape[1]
def covariance_between_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Compute the between-class covariance matrix.

    Parameters:
    * features: 2-D array whose columns are the samples (one row per feature)
    * labels: 1-D array holding the integer class label of each sample column
    * classes: number of classes; the labels 0 .. classes - 1 are visited

    Returns:
    A (n_features, n_features) matrix: for every class, the outer product of
    (class mean - overall mean) with itself, weighted by the number of samples
    in the class, summed over the classes and divided by the total number of
    samples (features.shape[1]).
    """
    n_features = features.shape[0]
    general_data_mean = features.mean(axis=1, keepdims=True)
    # Accumulate into a zero matrix so that no "first iteration" special case
    # is needed and the result is a matrix even when no class is visited.
    covariance_sum = np.zeros((n_features, n_features))
    for i in range(classes):
        data = features[:, labels == i]
        class_size = data.shape[1]
        if class_size == 0:
            # The mean of an empty class is NaN and 0 * NaN is still NaN,
            # which would poison the whole matrix; an empty class simply
            # contributes nothing.
            continue
        mean_offset = data.mean(axis=1, keepdims=True) - general_data_mean
        covariance_sum += class_size * (mean_offset @ mean_offset.T)
    return covariance_sum / features.shape[1]
def PCA(features: np.ndarray, dimensions: int) -> np.ndarray:
    """Principal Component Analysis.

    For more details, see here:
    https://en.wikipedia.org/wiki/Principal_component_analysis

    Parameters:
    * features: the features extracted from the dataset, as a 2-D array whose
      columns are the samples (one row per feature)
    * dimensions: number of principal directions to keep

    Returns:
    The dataset projected on the `dimensions` eigenvectors of its covariance
    matrix that have the largest eigenvalues; one row per kept direction, one
    column per sample.

    Raises:
    AssertionError: if `features` contains no elements.
    """
    # Check if the features have been loaded. Test the element count rather
    # than the values: an all-zero dataset is not an empty dataset.
    if features.size == 0:
        logging.error("Dataset empty")
        raise AssertionError("Dataset empty")

    # Center the dataset
    centered_data = features - features.mean(axis=1, keepdims=True)
    covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
    # eigh returns the eigenvalues in ascending order, so the columns are
    # reversed before keeping the first `dimensions` of them.
    _, eigenvectors = np.linalg.eigh(covariance_matrix)
    filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
    # Project the dataset on the new space.
    # NOTE(review): the un-centered `features` are projected, not
    # `centered_data`; kept as is so existing callers get the same values.
    projected_data = np.dot(filtered_eigenvectors.T, features)
    logging.info("Principal Component Analysis computed")
    return projected_data
def LDA(
    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
    """Linear Discriminant Analysis.

    For more details, see here:
    https://en.wikipedia.org/wiki/Linear_discriminant_analysis

    Parameters:
    * features: the features extracted from the dataset, as a 2-D array whose
      columns are the samples (one row per feature)
    * labels: the class labels of the features
    * classes: the number of classes present in the dataset
    * dimensions: to filter the projected data for the desired dimension

    Returns:
    The dataset projected on `dimensions` discriminant directions; one row per
    kept direction, one column per sample.

    Raises:
    AssertionError: if `classes` is not greater than `dimensions`, or if
    `features` contains no elements.
    """
    # Check if the dimension desired is less than the number of classes.
    # Raised explicitly (not with `assert`) so the check survives `python -O`.
    if not classes > dimensions:
        raise AssertionError(
            "dimensions must be smaller than the number of classes"
        )
    # Check if features have been already loaded
    if features.size == 0:
        logging.error("Dataset empty")
        raise AssertionError("Dataset empty")

    # Generalized symmetric eigenproblem: between-class covariance against
    # within-class covariance.
    _, eigenvectors = scipy.linalg.eigh(
        covariance_between_classes(features, labels, classes),
        covariance_within_classes(features, labels, classes),
    )
    # Eigenvalues come back in ascending order: reverse the columns, then keep
    # the first `dimensions` of them.
    filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
    # Keep the first `dimensions` left singular vectors of the selected
    # eigenvectors and project the dataset on them.
    svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
    filtered_svd_matrix = svd_matrix[:, 0:dimensions]
    projected_data = np.dot(filtered_svd_matrix.T, features)
    logging.info("Linear Discriminant Analysis computed")
    return projected_data