Skip to content

Commit 6bc30c7

Browse files
authored
Merge pull request #139 from anuragkumarak95/master
added k means clustering algorithm, usage doc inside.
2 parents 6882321 + 07fb7d6 commit 6bc30c7

File tree

1 file changed

+172
-0
lines changed

1 file changed

+172
-0
lines changed

Diff for: machine_learning/k_means_clust.py

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
'''README, Author - Anurag Kumar(mailto:[email protected])
2+
3+
Requirements:
4+
- sklearn
5+
- numpy
6+
- matplotlib
7+
8+
Python:
9+
- 3.5
10+
11+
Inputs:
12+
- X , a 2D numpy array of features.
13+
- k , number of clusters to create.
14+
- initial_centroids , initial centroid values generated by utility function(mentioned in usage).
15+
- maxiter , maximum number of iterations to process.
16+
- heterogeneity , empty list that will be filled with heterogeneity values if passed to the kmeans function (as record_heterogeneity).
17+
18+
Usage:
19+
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
20+
21+
2. create initial_centroids,
22+
initial_centroids = get_initial_centroids(
23+
X,
24+
k,
25+
seed=0 # seed value for initial centroid generation, None for randomness(default=None)
26+
)
27+
28+
3. find centroids and clusters using kmeans function.
29+
30+
centroids, cluster_assignment = kmeans(
31+
X,
32+
k,
33+
initial_centroids,
34+
maxiter=400,
35+
record_heterogeneity=heterogeneity,
36+
verbose=True # whether to print logs in console or not.(default=False)
37+
)
38+
39+
40+
4. Plot the loss function: the heterogeneity value of every iteration, as saved in the heterogeneity list.
41+
plot_heterogeneity(
42+
heterogeneity,
43+
k
44+
)
45+
46+
5. Have fun..
47+
48+
'''
49+
from sklearn.metrics import pairwise_distances
50+
import numpy as np
51+
52+
# Tag string identifying this module (presumably a log prefix -- TODO confirm);
# nothing in the visible code references it.
TAG = 'K-MEANS-CLUST/ '
53+
54+
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k distinct data points as initial centroids.

    data: 2D numpy array of features, one row per data point.
    k: number of centroids to pick; must not exceed the number of rows.
    seed: optional seed for numpy's global random generator, so that the
        same centroids are picked on every run. (default=None)

    Returns the k selected rows of data.
    Raises ValueError if k is larger than the number of data points.
    '''
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    if k > n:
        raise ValueError(
            'cannot pick {0} distinct centroids from {1} data points'.format(k, n))

    # Pick k *distinct* indices from range [0, n). Sampling with replacement
    # could select the same point twice, which yields duplicate centroids and
    # therefore clusters that can never receive any member.
    rand_indices = np.random.choice(n, k, replace=False)

    return data[rand_indices, :]
69+
70+
def centroid_pairwise_dist(X, centroids):
    '''Return the Euclidean distances between the rows of X and the centroids.

    Thin wrapper around sklearn's pairwise_distances with the metric fixed
    to 'euclidean'.
    '''
    distance_matrix = pairwise_distances(X, centroids, metric='euclidean')
    return distance_matrix
72+
73+
def assign_clusters(data, centroids):
    '''Assign every data point to its nearest centroid.

    Returns an array holding, for each row of data, the index of the
    centroid with the smallest Euclidean distance to it.
    '''
    # The arg-min is taken along axis 1 of the distance matrix, i.e. across
    # the centroids for each data point.
    return np.argmin(centroid_pairwise_dist(data, centroids), axis=1)
84+
85+
def revise_centroids(data, k, cluster_assignment):
    '''Recompute each centroid as the mean of the points assigned to it.

    data: 2D numpy array of features, one row per data point.
    k: number of clusters.
    cluster_assignment: numpy array with the cluster index of every row of
        data.

    A cluster without members has no mean (numpy yields NaN for the mean of
    an empty selection). Such clusters are re-seeded with the data points
    lying farthest from the overall data mean, one distinct point per empty
    cluster as long as enough points exist.

    Returns an array with one centroid per row, k rows in total.
    Raises ValueError if a cluster is empty and data has no rows to re-seed
    it from.
    '''
    new_centroids = [None] * k
    empty_clusters = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i]
        if member_data_points.shape[0] > 0:
            new_centroids[i] = member_data_points.mean(axis=0)
        else:
            empty_clusters.append(i)

    if empty_clusters:
        if data.shape[0] == 0:
            raise ValueError('cannot compute centroids from an empty data set')
        # Rank the points by their distance to the overall mean, farthest
        # first, and hand them out to the empty clusters in that order.
        spread = np.linalg.norm(data - data.mean(axis=0), axis=1)
        farthest_first = np.argsort(spread)[::-1]
        for rank, i in enumerate(empty_clusters):
            # Wrap around only when there are more empty clusters than points.
            new_centroids[i] = data[farthest_first[rank % len(farthest_first)]]

    return np.array(new_centroids)
96+
97+
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    '''Return the sum of squared Euclidean distances between every data
    point and the centroid of the cluster it is assigned to.

    data: 2D numpy array of features, one row per data point.
    k: number of clusters; only labels 0 .. k-1 are taken into account.
    centroids: one centroid per row, indexed by cluster label.
    cluster_assignment: numpy array with the cluster index of every row of
        data.

    Empty clusters contribute nothing to the total.
    '''
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Square the coordinate differences directly instead of taking
            # a square root first and squaring it again afterwards.
            offsets = member_data_points - centroids[i]
            heterogeneity += np.sum(offsets * offsets)

    return heterogeneity
112+
113+
from matplotlib import pyplot as plt
114+
def plot_heterogeneity(heterogeneity, k):
    '''Plot the recorded heterogeneity values against the iteration index.

    heterogeneity: sequence of heterogeneity values, one per iteration.
    k: number of clusters, shown in the plot title.
    '''
    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)

    # Axis labels and title.
    plt.xlabel('# Iterations')
    plt.ylabel('Heterogeneity')
    title_text = 'Heterogeneity of clustering over time, K={0:d}'.format(k)
    plt.title(title_text)

    plt.rcParams['font.size'] = 16
    plt.show()
122+
123+
def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
    '''Run k-means on the given data, starting from an initial set of centroids.

    data: 2D numpy array of features, one row per data point.
    k: number of clusters.
    initial_centroids: starting centroids, one per row.
    maxiter: maximum number of iterations to run. (default=500)
    record_heterogeneity: (optional) a list, to store the history of
        heterogeneity as function of iterations; if None, do not store the
        history.
    verbose: if True, print how many data points changed their cluster
        labels in each iteration.

    Returns (centroids, cluster_assignment). When maxiter allows no
    iteration at all, a copy of the initial centroids is returned together
    with the assignment of every point to its nearest initial centroid.
    '''
    # Real copy: slicing a numpy array with [:] would only create a view.
    centroids = np.array(initial_centroids)
    cluster_assignment = None
    prev_cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end='')

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging
        #    all data points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)

        if prev_cluster_assignment is None:
            if verbose:
                print()  # nothing to compare against yet; just end the line
        else:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)

            # Check for convergence: if none of the assignments changed, stop
            if num_changed == 0:
                if verbose:
                    print()  # end the line started by the iteration number
                break

            # Print number of new assignments
            if verbose:
                print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        # assign_clusters builds a fresh array on every call, so keeping a
        # reference is enough; no copy is needed.
        prev_cluster_assignment = cluster_assignment

    if cluster_assignment is None:
        # The loop body never ran (maxiter <= 0): still return an assignment
        # that matches the returned centroids instead of failing on an
        # unbound local variable.
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment
162+
163+
# Demo: cluster the iris data set into three groups and plot the
# heterogeneity recorded at every iteration. Runs only when this file is
# executed as a script, never on import.
if __name__ == '__main__':
    import sklearn.datasets as ds
    dataset = ds.load_iris()
    k = 3
    heterogeneity = []
    initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
    centroids, cluster_assignment = kmeans(dataset['data'], k, initial_centroids, maxiter=400,
                                           record_heterogeneity=heterogeneity, verbose=True)
    plot_heterogeneity(heterogeneity, k)

0 commit comments

Comments
 (0)