K-Means from Scratch

Goal: Implement K-Means clustering from scratch — random init, assignment, update, convergence. Plus: elbow method and failure cases.

Prerequisites: K-Means Clustering, Dot Product, Vectors and Matrices


The Algorithm

  1. Pick random points as initial centroids
  2. Assign each point to its nearest centroid
  3. Update each centroid to the mean of its assigned points
  4. Repeat until centroids stop moving
import numpy as np
import matplotlib.pyplot as plt
 
def kmeans(X, k, max_iter=100, seed=42):
    """K-Means clustering from scratch.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data points to cluster.
    k : int
        Number of clusters.
    max_iter : int
        Maximum number of assign/update iterations.
    seed : int
        Seed for the random centroid initialization.

    Returns
    -------
    centroids : ndarray of shape (k, n_features)
    labels : ndarray of shape (n_samples,)
        Index of the nearest centroid for each point.
    """
    rng = np.random.RandomState(seed)
    n = len(X)

    # Step 1: pick k distinct data points as initial centroids.
    idx = rng.choice(n, k, replace=False)
    centroids = X[idx].copy()

    for iteration in range(max_iter):
        # Step 2: assign each point to its nearest centroid.
        # distances[i, j] = ||X[i] - centroids[j]||^2 via broadcasting.
        distances = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
        labels = np.argmin(distances, axis=1)

        # Step 3: move each centroid to the mean of its assigned points.
        # Fix: a cluster can end up with no points; the mean of an empty
        # selection is NaN, so keep the old centroid in that case.
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])

        # Converged once the centroids stop moving.
        if np.allclose(centroids, new_centroids):
            print(f"Converged at iteration {iteration}")
            break
        centroids = new_centroids

    return centroids, labels

Test on Synthetic Data

from sklearn.datasets import make_blobs

# Four well-separated Gaussian blobs — an easy case where K-Means shines.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.8, random_state=42)

centroids, labels = kmeans(X, k=4)

# Points colored by assigned cluster; centroids overlaid as big X markers.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap="tab10", s=15, alpha=0.7)
ax.scatter(centroids[:, 0], centroids[:, 1], c="black", marker="X", s=200, edgecolors="white")
ax.set_title("K-Means clustering (k=4)")
plt.show()

Watch It Converge

Step-by-step visualization of the algorithm:

def kmeans_animated(X, k, max_iter=10, seed=42):
    """Plot up to six consecutive K-Means iterations side by side."""
    rng = np.random.RandomState(seed)
    start = rng.choice(len(X), k, replace=False)
    centroids = X[start].copy()

    fig, axes = plt.subplots(2, 3, figsize=(14, 9))
    panels = axes.ravel()

    for step, ax in zip(range(min(max_iter, 6)), panels):
        # Assignment step: nearest centroid for every point.
        sq_dist = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
        labels = np.argmin(sq_dist, axis=1)

        # Draw the current assignment and centroid positions.
        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap="tab10", s=15, alpha=0.7)
        ax.scatter(centroids[:, 0], centroids[:, 1], c="black", marker="X", s=200, edgecolors="white")
        ax.set_title(f"Iteration {step}")

        # Update step; stop early once the centroids settle.
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids

    plt.tight_layout()
    plt.show()

kmeans_animated(X, k=4)

The Elbow Method

How to choose $k$? Plot inertia (sum of squared distances to nearest centroid) vs $k$:

def inertia(X, centroids, labels):
    """Total within-cluster sum of squared distances to assigned centroids."""
    # Gather each point's own centroid via fancy indexing, then sum squares.
    centroids = np.asarray(centroids)
    residuals = X - centroids[labels]
    return np.sum(residuals ** 2)
 
# Fit K-Means for k = 1..9 and record the within-cluster sum of squares.
K_range = range(1, 10)
inertias = []
for num_clusters in K_range:
    fitted_centroids, fitted_labels = kmeans(X, num_clusters)
    inertias.append(inertia(X, fitted_centroids, fitted_labels))

plt.plot(K_range, inertias, "o-")
plt.xlabel("k"); plt.ylabel("Inertia")
plt.title("Elbow method — look for the bend")
plt.show()

The “elbow” is where adding more clusters gives diminishing returns. Here it’s at $k = 4$.


K-Means++ Initialization

Random init can give bad results. K-Means++ picks initial centroids that are spread out:

def kmeans_pp_init(X, k, seed=42):
    """K-Means++ seeding: choose initial centroids that are spread out."""
    rng = np.random.RandomState(seed)
    n_points = len(X)

    # First centroid is a uniformly random data point.
    chosen = [X[rng.randint(n_points)]]

    while len(chosen) < k:
        # Squared distance from every point to its closest chosen centroid.
        nearest_sq = np.min([np.sum((X - c) ** 2, axis=1) for c in chosen], axis=0)
        # Sample the next centroid proportionally to that squared distance,
        # so far-away points are favored (already-chosen points have weight 0).
        weights = nearest_sq / nearest_sq.sum()
        chosen.append(X[rng.choice(n_points, p=weights)])

    return np.array(chosen)
 
# Side-by-side comparison: plain random seeding vs K-Means++ seeding.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (axes[0], "random", "Random init"),
    (axes[1], "kmeans++", "K-Means++ init"),
]
for ax, init_method, title in panels:
    if init_method == "random":
        c, l = kmeans(X, k=4, seed=0)  # bad seed
    else:
        # Lloyd iterations starting from the K-Means++ seeds.
        centroids = kmeans_pp_init(X, k=4, seed=0).copy()
        for _ in range(100):
            sq_dist = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
            labels = np.argmin(sq_dist, axis=1)
            updated = np.array([X[labels == j].mean(axis=0) for j in range(4)])
            if np.allclose(centroids, updated):
                break
            centroids = updated
        c, l = centroids, labels

    ax.scatter(X[:, 0], X[:, 1], c=l, cmap="tab10", s=15, alpha=0.7)
    ax.scatter(c[:, 0], c[:, 1], c="black", marker="X", s=200, edgecolors="white")
    ax.set_title(title)
plt.tight_layout()
plt.show()

When K-Means Fails

K-Means assumes spherical, equally-sized clusters. It breaks on:

from sklearn.datasets import make_moons, make_circles

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Two failure modes for K-Means: non-convex shapes and concentric rings.
failure_datasets = [
    (make_moons(n_samples=300, noise=0.05, random_state=42)[0],
     "K-Means on moons (fails)"),
    (make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)[0],
     "K-Means on circles (fails)"),
]
for ax, (X_hard, title) in zip(axes, failure_datasets):
    c, l = kmeans(X_hard, k=2)
    ax.scatter(X_hard[:, 0], X_hard[:, 1], c=l, cmap="tab10", s=15)
    ax.set_title(title)

plt.tight_layout()
plt.show()

For these shapes, use DBSCAN or spectral clustering.


Exercises

  1. Silhouette score: Implement the silhouette coefficient $s = \frac{b - a}{\max(a, b)}$, where $a$ is the mean intra-cluster distance and $b$ is the mean nearest-cluster distance. Plot silhouette vs $k$.

  2. Image compression: Load an image as a (H*W, 3) array of RGB pixels. Run K-Means with $k \in \{4, 16, 64\}$ to reduce colors. Display the compressed images.

  3. Mini-batch K-Means: Instead of using all points, randomly sample a batch of 50 points each iteration. Compare convergence speed vs accuracy.

  4. Soft assignment: Instead of hard labels, compute a probability for each cluster using $p_{ij} \propto \exp\!\left(-\|x_i - \mu_j\|^2 / 2\sigma^2\right)$. This is the E-step of Gaussian Mixture Models.


Next: 05 - Neural Network from Scratch — combine linear layers and nonlinearities to learn any function.