
Commit 56cc2a8

Diegomangasco authored and sedatguzelsemme committed
Dimensionality reduction (TheAlgorithms#8590)
1 parent 2b86d4f commit 56cc2a8

1 file changed: +198 −0 lines changed


Diff for: machine_learning/dimensionality_reduction.py

@@ -0,0 +1,198 @@
# Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub

"""
Requirements:
  - numpy version 1.21
  - scipy version 1.3.3
Notes:
  - Each column of the features matrix corresponds to a class item
"""

import logging

import numpy as np
import pytest
from scipy.linalg import eigh

logging.basicConfig(level=logging.INFO, format="%(message)s")

def column_reshape(input_array: np.ndarray) -> np.ndarray:
    """Function to reshape a row Numpy array into a column Numpy array
    >>> input_array = np.array([1, 2, 3])
    >>> column_reshape(input_array)
    array([[1],
           [2],
           [3]])
    """

    return input_array.reshape((input_array.size, 1))

def covariance_within_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Function to compute the covariance matrix inside each class.
    >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> labels = np.array([0, 1, 0])
    >>> covariance_within_classes(features, labels, 2)
    array([[0.66666667, 0.66666667, 0.66666667],
           [0.66666667, 0.66666667, 0.66666667],
           [0.66666667, 0.66666667, 0.66666667]])
    """

    covariance_sum = np.nan
    for i in range(classes):
        data = features[:, labels == i]
        data_mean = data.mean(1)
        # Centralize the data of class i
        centered_data = data - column_reshape(data_mean)
        if i > 0:
            # covariance_sum was initialized in the first iteration, so accumulate
            covariance_sum += np.dot(centered_data, centered_data.T)
        else:
            # First iteration: covariance_sum is still np.nan, so assign instead
            covariance_sum = np.dot(centered_data, centered_data.T)

    return covariance_sum / features.shape[1]

def covariance_between_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Function to compute the covariance matrix between multiple classes
    >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
    >>> labels = np.array([0, 1, 0])
    >>> covariance_between_classes(features, labels, 2)
    array([[ 3.55555556,  1.77777778, -2.66666667],
           [ 1.77777778,  0.88888889, -1.33333333],
           [-2.66666667, -1.33333333,  2.        ]])
    """

    general_data_mean = features.mean(1)
    covariance_sum = np.nan
    for i in range(classes):
        data = features[:, labels == i]
        device_data = data.shape[1]  # number of items belonging to class i
        data_mean = data.mean(1)
        if i > 0:
            # covariance_sum was initialized in the first iteration, so accumulate
            covariance_sum += device_data * np.dot(
                column_reshape(data_mean) - column_reshape(general_data_mean),
                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
            )
        else:
            # First iteration: covariance_sum is still np.nan, so assign instead
            covariance_sum = device_data * np.dot(
                column_reshape(data_mean) - column_reshape(general_data_mean),
                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
            )

    return covariance_sum / features.shape[1]

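# For reference, the two estimators above implement the standard LDA scatter
# definitions (N = total number of items, N_i = items in class i,
# mu_i = mean of class i, mu = overall mean), with items stored column-wise
# as noted in the module docstring:
#
#   S_W = (1 / N) * sum_i sum_{x in class i} (x - mu_i) (x - mu_i)^T
#   S_B = (1 / N) * sum_i N_i * (mu_i - mu) (mu_i - mu)^T
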
def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
    """
    Principal Component Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis.
    Parameters:
    * features: the features extracted from the dataset
    * dimensions: to filter the projected data for the desired dimension

    >>> test_principal_component_analysis()
    """

    # Check if the features have been loaded
    if features.any():
        data_mean = features.mean(1)
        # Center the dataset
        centered_data = features - np.reshape(data_mean, (data_mean.size, 1))
        covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
        _, eigenvectors = np.linalg.eigh(covariance_matrix)
        # Take all the columns in reverse order (-1), then keep only the first
        # `dimensions` columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
        # Project the dataset on the new space
        projected_data = np.dot(filtered_eigenvectors.T, features)
        logging.info("Principal Component Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError

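# A minimal sanity check for principal_component_analysis (an illustrative
# sketch, not part of the original tests; projected values depend on
# eigenvector sign conventions, so only the output shape is checked):
#
#   pca_demo = principal_component_analysis(np.array([[1, 2, 3], [4, 5, 6]]), 1)
#   assert pca_demo.shape == (1, 3)  # `dimensions` rows, one column per item
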
def linear_discriminant_analysis(
    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
    """
    Linear Discriminant Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
    Parameters:
    * features: the features extracted from the dataset
    * labels: the class labels of the features
    * classes: the number of classes present in the dataset
    * dimensions: to filter the projected data for the desired dimension

    >>> test_linear_discriminant_analysis()
    """

    # Check if the dimension desired is less than the number of classes
    assert classes > dimensions

    # Check if features have already been loaded
    if features.any():
        _, eigenvectors = eigh(
            covariance_between_classes(features, labels, classes),
            covariance_within_classes(features, labels, classes),
        )
        filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
        svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
        filtered_svd_matrix = svd_matrix[:, 0:dimensions]
        projected_data = np.dot(filtered_svd_matrix.T, features)
        logging.info("Linear Discriminant Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
        logging.error("Dataset empty")
        raise AssertionError

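# Likewise for linear_discriminant_analysis (an illustrative sketch with
# made-up data; it needs classes > dimensions and a non-singular within-class
# covariance for the generalized eigensolver to succeed):
#
#   lda_features = np.array([[1, 2, 3, 7, 8, 9], [4, 6, 5, 1, 3, 2]])
#   lda_labels = np.array([0, 0, 0, 1, 1, 1])
#   lda_demo = linear_discriminant_analysis(lda_features, lda_labels, 2, 1)
#   assert lda_demo.shape == (1, 6)  # one discriminant direction, six items
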
def test_linear_discriminant_analysis() -> None:
    # Create dummy dataset with 2 classes and 3 features
    features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]])
    labels = np.array([0, 0, 0, 1, 1])
    classes = 2
    dimensions = 2

    # Assert that the function raises an AssertionError when dimensions is not
    # strictly less than classes
    with pytest.raises(AssertionError) as error_info:
        projected_data = linear_discriminant_analysis(
            features, labels, classes, dimensions
        )
        if isinstance(projected_data, np.ndarray):
            raise AssertionError(
                "Did not raise AssertionError for dimensions > classes"
            )
        assert error_info.type is AssertionError

def test_principal_component_analysis() -> None:
    features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    dimensions = 2
    expected_output = np.array(
        [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]
    )

    with pytest.raises(AssertionError) as error_info:
        output = principal_component_analysis(features, dimensions)
        if not np.allclose(expected_output, output):
            raise AssertionError
        assert error_info.type is AssertionError

if __name__ == "__main__":
    import doctest

    doctest.testmod()
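As a closing usage sketch (illustrative only, not part of the commit): assuming a checkout of TheAlgorithms/Python in which the new file is importable as machine_learning.dimensionality_reduction, the two transforms can be applied to a made-up dataset as follows.

import numpy as np

from machine_learning.dimensionality_reduction import (
    linear_discriminant_analysis,
    principal_component_analysis,
)

# Two features, six items (one column per item, as the module docstring notes)
features = np.array(
    [[1.0, 2.0, 3.0, 7.0, 8.0, 9.0], [4.0, 6.0, 5.0, 1.0, 3.0, 2.0]]
)
labels = np.array([0, 0, 0, 1, 1, 1])

# Unsupervised: project onto the top principal component -> shape (1, 6)
print(principal_component_analysis(features, dimensions=1))

# Supervised: project onto one discriminant direction (classes > dimensions)
print(linear_discriminant_analysis(features, labels, classes=2, dimensions=1))

The committed test functions are also collectable by pytest directly, e.g. via python -m pytest machine_learning/dimensionality_reduction.py.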
