K Means

Prompt: Implement K Means. Assume X_train contains a set of N d-dimensional data points.


class KMeans:
    def __init__(self, k):
        self.k = k

    def init_centroids(self, dim):
        """
        Args:
            dim: dimension of data

        Returns:
            np array of shape (self.k, d) of initialized centroids
        """

        return np.random.normal(size=(self.k, dim))



    def fit(self, X_train, num_iters):
        """
        Args:
            X_train: np array of shape (N, d)
            num_iters: number of algorithm iterations

        Returns:
            centroids: np array of shape (k, d) the centroids after algorithm converges
            clusters: dict[int, list] with mapping cluster_idx -> list of indices of X_train belonging to that cluster
        """

        N, d = X_train.shape
        centroids = self.init_centroids(dim=d)


        a2 = np.square(X_train).sum(axis=1) # (N, )

        for i in range(num_iters):

            # initialize empty clusters
            clusters = {i:[] for i in range(self.k)}

            b2 = np.square(centroids).sum(axis=1) # (k, )
            ab = X_train @ centroids.T # (N, k)

            dists = a2[:, None] + b2[None, :] - 2*ab # (N, k)

            top1 = np.argmin(dists, axis=1) # (N, )

            for member_idx, cluster_idx in enumerate(top1):
                clusters[cluster_idx].append(member_idx)

            # use clusters to update centroids
            for cluster_idx, members in clusters.items():
                centroids[cluster_idx] = np.mean(X_train[members], axis=0)

        
        return centroids, clusters