1
+ '''README, Author - Anurag Kumar(mailto:[email protected] )
2
+
3
+ Requirements:
4
+ - sklearn
5
+ - numpy
6
+ - matplotlib
7
+
8
+ Python:
9
+ - 3.5
10
+
11
+ Inputs:
12
+ - X , a 2D numpy array of features.
13
+ - k , number of clusters to create.
14
+ - initial_centroids , initial centroid values generated by utility function(mentioned in usage).
15
+ - maxiter , maximum number of iterations to process.
16
+ - heterogeneity , empty list that will be filled with heterogeneity values if passed to the kmeans function as record_heterogeneity.
17
+
18
+ Usage:
19
+ 1. define 'k' value, 'X' features array and 'heterogeneity' empty list
20
+
21
+ 2. create initial_centroids,
22
+ initial_centroids = get_initial_centroids(
23
+ X,
24
+ k,
25
+ seed=0 # seed value for initial centroid generation, None for randomness(default=None)
26
+ )
27
+
28
+ 3. find centroids and clusters using kmeans function.
29
+
30
+ centroids, cluster_assignment = kmeans(
31
+ X,
32
+ k,
33
+ initial_centroids,
34
+ maxiter=400,
35
+ record_heterogeneity=heterogeneity,
36
+ verbose=True # whether to print logs in console or not.(default=False)
37
+ )
38
+
39
+
40
+ 4. Plot the loss function: the heterogeneity values for every iteration, as saved in the heterogeneity list.
41
+ plot_heterogeneity(
42
+ heterogeneity,
43
+ k
44
+ )
45
+
46
+ 5. Have fun..
47
+
48
+ '''
49
+ from sklearn .metrics import pairwise_distances
50
+ import numpy as np
51
+
52
+ TAG = 'K-MEANS-CLUST/ '
53
+
54
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k distinct data points as initial centroids.

    data: 2D numpy array with one data point per row.
    k: number of centroids to pick.
    seed: optional value handed to np.random.seed so the pick is
        reproducible; None (default) leaves the generator state as it is.

    Returns the k selected rows of data.
    Raises ValueError if k is larger than the number of rows in data.
    '''
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    if k > n:
        raise ValueError(
            'cannot pick {0} distinct centroids from {1} data points'.format(k, n)
        )

    # Pick k indices from range [0, n) WITHOUT replacement. Drawing the same
    # row twice would produce duplicate centroids, and one of each duplicate
    # pair can never be the nearest centroid of any point (empty cluster).
    rand_indices = np.random.choice(n, k, replace=False)

    # Index rows explicitly so the result is a new array of the chosen points.
    centroids = data[rand_indices, :]

    return centroids
69
+
70
def centroid_pairwise_dist(X, centroids):
    '''Return the Euclidean distance from every row of X to every centroid.

    The result comes straight from sklearn's pairwise_distances, called with
    X as the first argument and centroids as the second.
    '''
    distance_matrix = pairwise_distances(X, centroids, metric='euclidean')
    return distance_matrix
72
+
73
def assign_clusters(data, centroids):
    '''Label every data point with the index of its nearest centroid.

    data: data points, one per row.
    centroids: current centroids, one per row.

    Returns an array with one centroid index per data point.
    '''
    # Row r holds the distances from data point r to each centroid, so the
    # column index of the smallest entry in a row is that point's cluster.
    return centroid_pairwise_dist(data, centroids).argmin(axis=1)
84
+
85
def revise_centroids(data, k, cluster_assignment):
    '''Recompute each of the k centroids from the current assignment.

    data: 2D numpy array with one data point per row.
    k: number of clusters.
    cluster_assignment: numpy array holding, for each data point, the index
        of the cluster it belongs to.

    A non-empty cluster gets the mean of its member points. A cluster with no
    members cannot be averaged (the mean of zero rows is NaN), so it is
    re-seeded with one of the data points that lie farthest from the centroid
    of their own cluster.

    Returns a numpy array with one centroid per row.
    '''
    new_centroids = [None] * k
    empty_clusters = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i]
        if member_data_points.shape[0] > 0:
            new_centroids[i] = member_data_points.mean(axis=0)
        else:
            empty_clusters.append(i)

    n = data.shape[0]
    if empty_clusters and n > 0:
        # Squared distance of every point to the centroid of its own cluster.
        sq_dist = np.zeros(n)
        for i, centroid in enumerate(new_centroids):
            if centroid is not None:
                members = cluster_assignment == i
                sq_dist[members] = ((data[members] - centroid) ** 2).sum(axis=1)

        # Hand the worst-fitting points to the empty clusters, farthest first.
        # The modulo only matters when there are more empty clusters than
        # data points; it keeps the index valid in that degenerate case.
        farthest_first = np.argsort(-sq_dist, kind='stable')
        for rank, i in enumerate(empty_clusters):
            new_centroids[i] = data[farthest_first[rank % n]]
    elif empty_clusters:
        # No data points at all: nothing to re-seed from, so the centroids
        # stay undefined (NaN), exactly as averaging zero rows would give.
        for i in empty_clusters:
            new_centroids[i] = np.full(data.shape[1:], np.nan)

    return np.array(new_centroids)
96
+
97
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    '''Sum the squared Euclidean distances of all points to their centroid.

    data: data points, one per row.
    k: number of clusters.
    centroids: one centroid per cluster, indexable by cluster number.
    cluster_assignment: numpy array giving the cluster index of each point.

    Returns the accumulated sum over clusters 0 .. k-1; clusters without any
    member contribute nothing.
    '''
    total = 0.0
    for label in range(k):
        in_cluster = cluster_assignment == label
        points = data[in_cluster, :]

        # An empty cluster has no distances to add.
        if points.shape[0] <= 0:
            continue

        # Distances from this cluster's centroid to each of its members.
        to_centroid = pairwise_distances(points, [centroids[label]], metric='euclidean')
        total += np.sum(to_centroid ** 2)

    return total
112
+
113
+ from matplotlib import pyplot as plt
114
def plot_heterogeneity(heterogeneity, k):
    '''Plot the recorded heterogeneity values against the iteration index.

    heterogeneity: sequence of heterogeneity values, one per iteration.
    k: number of clusters; only used in the plot title.

    Opens a matplotlib window via plt.show().
    '''
    # The font size has to be active while the figure's text is created and
    # drawn, so it is set up front. rc_context restores the previous settings
    # afterwards instead of changing the global rcParams for good.
    with plt.rc_context({'font.size': 16}):
        plt.figure(figsize=(7, 4))
        plt.plot(heterogeneity, linewidth=4)
        plt.xlabel('# Iterations')
        plt.ylabel('Heterogeneity')
        plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
        plt.show()
122
+
123
def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
    '''Run k-means on the given data, starting from initial_centroids.

    data: data points, one per row.
    k: number of clusters.
    initial_centroids: starting centroids, one per row.
    maxiter: maximum number of iterations to run (default=500).
    record_heterogeneity: (optional) a list that receives the heterogeneity
        after each iteration; if None, the history is not stored.
    verbose: if True, print the iteration number and how many data points
        changed their cluster label in each iteration.

    Returns (centroids, cluster_assignment). The loop stops early as soon as
    an iteration leaves every assignment unchanged.
    '''
    centroids = initial_centroids[:]
    cluster_assignment = None
    prev_cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end='')

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all
        #    data points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)

        if prev_cluster_assignment is None:
            # First iteration: nothing to compare with yet. Still end the
            # line started by the iteration number above.
            if verbose:
                print()
        else:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)

            # Check for convergence: if none of the assignments changed, stop
            if num_changed == 0:
                if verbose:
                    print()
                break

            # Print number of new assignments
            if verbose:
                print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment[:]

    if cluster_assignment is None:
        # maxiter <= 0: the loop never ran, so report the assignment implied
        # by the initial centroids instead of failing on an unbound variable.
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment
162
+
163
# Mock test below: clusters the iris data set into 3 groups and plots the
# recorded heterogeneity. Disabled by default.
if False:  # change to true to run this test case.
    import sklearn.datasets as ds

    dataset = ds.load_iris()
    k = 3
    heterogeneity = []  # filled by kmeans, one value per iteration
    initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
    centroids, cluster_assignment = kmeans(
        dataset['data'],
        k,
        initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)
0 commit comments