
Commit 36af827

feat: add pca implementation
1 parent 0abeeab commit 36af827

1 file changed: +120 -0 lines changed

@@ -0,0 +1,120 @@
'''
Principal Component Analysis (PCA) is a linear dimensionality reduction technique,
commonly used as a data preprocessing step in unsupervised and supervised machine
learning pipelines. The principle behind PCA is to reduce the number of variables
in a dataset while preserving as much information as possible.

The principal components represent the directions of the data that explain a
maximal amount of variance. The data is projected onto a new coordinate system
such that the direction capturing the largest variation in the data can be
easily identified.

This implementation of PCA consists of the following steps (a worked numeric
sketch of steps 1 and 2 follows this docstring):

1. Data Standardization (Z-score Normalization): Center the data by subtracting
   the column means and divide by the column standard deviations so that each
   feature has zero mean and unit variance.

2. Covariance Matrix Calculation: Compute the covariance matrix of the
   standardized data. The covariance matrix measures how different features
   vary together, capturing the relationships between them.

3. Singular Value Decomposition: Use Singular Value Decomposition (SVD) to
   decompose the covariance matrix into its singular vectors and singular
   values, which identify the principal components.

4. Selection of Principal Components: Choose the top k principal components,
   i.e. those that explain the most variance in the data.

5. Projection of Data: Transform the original standardized data into the new
   lower-dimensional space defined by the selected principal components.

REFERENCE: en.wikipedia.org/wiki/Principal_component_analysis
'''
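
# Worked numeric sketch of steps 1 and 2 above (illustration only; nothing
# below depends on it). For x = [[1., 2.], [3., 4.], [5., 6.]] the column means
# are [3., 4.] and the population standard deviations are both sqrt(8/3) ~ 1.633,
# so the standardized matrix is [[-1.2247, -1.2247], [0., 0.], [1.2247, 1.2247]].
# np.cov(z, rowvar=False, ddof=1) then gives [[1.5, 1.5], [1.5, 1.5]]; the
# off-diagonal entries equal the diagonal ones because the two standardized
# columns move in perfect lockstep (the diagonal is 1.5 rather than 1.0 because
# standardization divides by the population std while np.cov here uses ddof=1).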

import numpy as np


def svd(matrix):
    """
    Perform Singular Value Decomposition (SVD) on the given matrix.

    Args:
        matrix (np.ndarray): The input matrix.

    Returns:
        tuple: The U, S, and VT matrices from SVD, with the singular values
        sorted in descending order.

    >>> matrix = np.array([[1, 2], [3, 4]])
    >>> u, s, vt = svd(matrix)
    >>> np.allclose(np.dot(u, np.dot(s, vt)), matrix)
    True
    """
    m, n = matrix.shape
    u = np.zeros((m, m))
    s = np.zeros((m, n))

    # The eigendecomposition of matrix^T @ matrix yields the right singular
    # vectors (rows of VT) and the squared singular values.
    eigvals, eigvecs = np.linalg.eig(np.dot(matrix.T, matrix))

    # Sort by descending eigenvalue so that the leading rows of VT correspond
    # to the largest singular values (needed when selecting the top-k
    # principal components later).
    order = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    vt = eigvecs.T

    # Clamp tiny negative eigenvalues caused by floating-point round-off.
    singular_values = np.sqrt(np.maximum(eigvals, 0))
    s[:n, :n] = np.diag(singular_values)

    # Recover the left singular vectors via u_i = (matrix @ v_i) / sigma_i;
    # columns belonging to (near-)zero singular values are left as zeros.
    for i in range(n):
        if singular_values[i] > 1e-12:
            u[:, i] = np.dot(matrix, vt[i, :]) / singular_values[i]

    return u, s, vt
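
# A minimal cross-check sketch against np.linalg.svd (an illustration, not part
# of the implementation). The individual factors can differ in sign and in how
# zero singular values are handled, so compare reconstructions instead:
#     a = np.array([[2.0, 0.0], [1.0, 3.0]])
#     u, s, vt = svd(a)
#     ref_u, ref_s, ref_vt = np.linalg.svd(a)
#     assert np.allclose(np.dot(u, np.dot(s, vt)), ref_u @ np.diag(ref_s) @ ref_vt)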


def main(data: np.ndarray, k: int) -> np.ndarray:
    """
    Perform Principal Component Analysis (PCA) on the given data.

    Args:
        data (np.ndarray): The input data.
        k (int): The number of principal components to retain.

    Returns:
        np.ndarray: The transformed data with reduced dimensionality.

    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
    >>> main(data, 1)
    array([[-1.73205081],
           [ 0.        ],
           [ 1.73205081]])
    """
    # Step 1: standardize each feature to zero mean and unit variance.
    z_score = (data - data.mean(axis=0)) / data.std(axis=0)

    # Step 2: covariance matrix of the standardized data.
    cov_matrix = np.cov(z_score, ddof=1, rowvar=False)

    # Steps 3 and 4: decompose the covariance matrix and keep the top-k
    # right singular vectors as the principal components.
    u, s, vt = svd(cov_matrix)
    principal_components = vt[:k]

    # Step 5: project the standardized data onto the selected components.
    transformed_data = np.dot(z_score, principal_components.T)
    return transformed_data
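
# Sketch of a possible extension (not implemented here): since svd() is applied
# to the covariance matrix, the diagonal of s holds that matrix's eigenvalues,
# i.e. the variances along the principal axes, so the fraction of variance
# explained by the top-k components could be computed as
#     explained_ratio = np.diag(s)[:k].sum() / np.diag(s).sum()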


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_blobs

    data, _ = make_blobs(n_samples=100, n_features=4, centers=5, random_state=42)
    k = 2

    transformed_data = main(data, k)
    print("Transformed Data:")
    print(transformed_data)

    assert transformed_data.shape == (
        data.shape[0],
        k,
    ), "The transformed data does not have the expected shape."

    # Visualize the first two features of the original data next to the
    # two retained principal components.
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.scatter(data[:, 0], data[:, 1], c='blue', edgecolor='k', s=50)
    plt.title("Original Data")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")

    plt.subplot(1, 2, 2)
    plt.scatter(transformed_data[:, 0], transformed_data[:, 1], c='red',
                edgecolor='k', s=50)
    plt.title("Transformed Data")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")

    plt.tight_layout()
    plt.show()

    print("All tests passed.")
