'''
Principal Component Analysis (PCA) is a linear dimensionality reduction technique,
commonly used as a data preprocessing step in unsupervised and supervised machine
learning pipelines. The principle behind PCA is to reduce the number of variables
in a dataset while preserving as much information as possible.

The principal components are the directions along which the data varies the most,
i.e. the directions that explain a maximal amount of variance. The data is
projected onto a new coordinate system such that the directions capturing the
largest variation in the data can be easily identified.

This implementation of PCA consists of the following steps:

1. Data Standardization (Z-score Normalization): Center the data by subtracting
   the mean of each feature and divide by the standard deviation so that each
   feature has zero mean and unit variance.

2. Covariance Matrix Calculation: Compute the covariance matrix of the
   standardized data. The covariance matrix measures how different features vary
   together, capturing the relationships between them.

3. Singular Value Decomposition: Use Singular Value Decomposition (SVD) to
   decompose the covariance matrix into its singular vectors and singular values,
   which identify the principal components.

4. Selection of Principal Components: Choose the top k principal components,
   i.e. those that explain the most variance in the data.

5. Projection of Data: Transform the original standardized data into the new
   lower-dimensional space defined by the selected principal components.

These steps are summarized in matrix notation in the comment block below.

REFERENCE: en.wikipedia.org/wiki/Principal_component_analysis
'''

import numpy as np


def svd(matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Perform Singular Value Decomposition (SVD) on the given matrix.

    Args:
        matrix (np.ndarray): The input matrix.

    Returns:
        tuple: The U, S, and VT matrices from SVD.

    >>> matrix = np.array([[1, 2], [3, 4]])
    >>> u, s, vt = svd(matrix)
    >>> np.allclose(np.dot(u, np.dot(s, vt)), matrix)
    True
    """
    m, n = matrix.shape
    u = np.zeros((m, m))
    s = np.zeros((m, n))

    # The eigenvectors of A^T A are the right singular vectors of A, and its
    # eigenvalues are the squared singular values. eigh is used because
    # A^T A is symmetric.
    eigvals, eigvecs = np.linalg.eigh(np.dot(matrix.T, matrix))

    # Sort the eigenpairs in descending order so that the leading singular
    # vectors (and hence the leading principal components) come first.
    order = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    vt = eigvecs.T

    # Clip tiny negative eigenvalues caused by floating-point error before
    # taking the square root.
    singular_values = np.sqrt(np.clip(eigvals, 0.0, None))
    s[:n, :n] = np.diag(singular_values)

    # Recover the left singular vectors via u_i = A v_i / sigma_i, skipping
    # (near-)zero singular values to avoid division by zero.
    for i in range(n):
        if singular_values[i] > 1e-12:
            u[:, i] = np.dot(matrix, vt[i, :]) / singular_values[i]

    return u, s, vt
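

# A minimal sanity check (a sketch, not part of the original module): the
# singular values from svd() should match those computed by np.linalg.svd,
# which also returns them in descending order. The singular vectors are only
# determined up to sign, so only the values are compared here.
def _check_against_numpy_svd(matrix: np.ndarray) -> bool:
    """
    Compare the singular values of svd() with np.linalg.svd.

    >>> _check_against_numpy_svd(np.array([[1.0, 2.0], [3.0, 4.0]]))
    True
    """
    _, s, _ = svd(matrix)
    reference = np.linalg.svd(matrix, compute_uv=False)  # 1-D, descending
    return bool(np.allclose(np.diag(s)[: len(reference)], reference))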


def main(data: np.ndarray, k: int) -> np.ndarray:
    """
    Perform Principal Component Analysis (PCA) on the given data.

    Args:
        data (np.ndarray): The input data of shape (n_samples, n_features).
        k (int): The number of principal components to retain.

    Returns:
        np.ndarray: The transformed data with reduced dimensionality.

    The sign of each principal component is arbitrary, so the test below
    compares absolute values.

    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
    >>> np.allclose(np.abs(main(data, 1).ravel()), [1.73205081, 0.0, 1.73205081])
    True
    """
    # Step 1: standardize each feature to zero mean and unit variance.
    z_score = (data - data.mean(axis=0)) / data.std(axis=0)

    # Step 2: covariance matrix of the standardized data.
    cov_matrix = np.cov(z_score, ddof=1, rowvar=False)

    # Step 3: decompose the covariance matrix.
    u, s, vt = svd(cov_matrix)

    # Step 4: the top k rows of vt are the leading principal components.
    principal_components = vt[:k]

    # Step 5: project the standardized data onto the selected components.
    transformed_data = np.dot(z_score, principal_components.T)
    return transformed_data
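

# An illustrative helper (a sketch building on svd() above, not part of the
# original module): for a covariance matrix, the singular values coincide with
# its eigenvalues, i.e. the variances along the principal components, so
# normalizing them gives the fraction of variance each component explains.
def explained_variance_ratio(data: np.ndarray) -> np.ndarray:
    """
    Fraction of the total variance captured by each principal component.

    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
    >>> [round(float(r), 6) for r in explained_variance_ratio(data)]
    [1.0, 0.0]
    """
    z_score = (data - data.mean(axis=0)) / data.std(axis=0)
    cov_matrix = np.cov(z_score, ddof=1, rowvar=False)
    _, s, _ = svd(cov_matrix)
    eigenvalues = np.diag(s)  # variances along the principal components
    return eigenvalues / eigenvalues.sum()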


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_blobs

    data, _ = make_blobs(n_samples=100, n_features=4, centers=5, random_state=42)
    k = 2

    transformed_data = main(data, k)
    print("Transformed Data:")
    print(transformed_data)

    assert transformed_data.shape == (data.shape[0], k), (
        "The transformed data does not have the expected shape."
    )
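
    # Optional cross-check, a sketch assuming scikit-learn's PCA and
    # StandardScaler (scikit-learn is already a dependency of this demo).
    # sklearn's PCA centers but does not standardize, so the data is
    # standardized first; the projections should then agree with main() up to
    # the arbitrary sign of each component, provided the eigenvalues of the
    # covariance matrix are distinct.
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    sk_projection = PCA(n_components=k).fit_transform(
        StandardScaler().fit_transform(data)
    )
    assert np.allclose(np.abs(transformed_data), np.abs(sk_projection), atol=1e-6), (
        "Projection does not match scikit-learn's PCA (up to sign)."
    )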

    # Visualize the first two features of the original data next to the data
    # projected onto the first two principal components.
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.scatter(data[:, 0], data[:, 1], c='blue', edgecolor='k', s=50)
    plt.title("Original Data")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")

    plt.subplot(1, 2, 2)
    plt.scatter(transformed_data[:, 0], transformed_data[:, 1], c='red', edgecolor='k', s=50)
    plt.title("Transformed Data")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")

    plt.tight_layout()
    plt.show()

    print("All tests passed.")