K Means
Prompt: Implement K Means. Assume X_train contains a set of N d-dimensional data points.
class KMeans:
def __init__(self, k):
self.k = k
def init_centroids(self, dim):
"""
Args:
dim: dimension of data
Returns:
np array of shape (self.k, d) of initialized centroids
"""
return np.random.normal(size=(self.k, dim))
def fit(self, X_train, num_iters):
"""
Args:
X_train: np array of shape (N, d)
num_iters: number of algorithm iterations
Returns:
centroids: np array of shape (k, d) the centroids after algorithm converges
clusters: dict[int, list] with mapping cluster_idx -> list of indices of X_train belonging to that cluster
"""
N, d = X_train.shape
centroids = self.init_centroids(dim=d)
a2 = np.square(X_train).sum(axis=1) # (N, )
for i in range(num_iters):
# initialize empty clusters
clusters = {i:[] for i in range(self.k)}
b2 = np.square(centroids).sum(axis=1) # (k, )
ab = X_train @ centroids.T # (N, k)
dists = a2[:, None] + b2[None, :] - 2*ab # (N, k)
top1 = np.argmin(dists, axis=1) # (N, )
for member_idx, cluster_idx in enumerate(top1):
clusters[cluster_idx].append(member_idx)
# use clusters to update centroids
for cluster_idx, members in clusters.items():
centroids[cluster_idx] = np.mean(X_train[members], axis=0)
return centroids, clusters