K-Means from Scratch
Goal: Implement K-Means clustering from scratch — random init, assignment, update, convergence. Plus: elbow method and failure cases.
Prerequisites: K-Means Clustering, Dot Product, Vectors and Matrices
The Algorithm
- Pick random points as initial centroids
- Assign each point to its nearest centroid
- Update each centroid to the mean of its assigned points
- Repeat until centroids stop moving
import numpy as np
import matplotlib.pyplot as plt
def kmeans(X, k, max_iter=100, seed=42):
    """K-Means clustering via Lloyd's algorithm.

    Parameters
    ----------
    X : (n, d) array of data points.
    k : number of clusters (1 <= k <= n).
    max_iter : maximum number of assign/update iterations.
    seed : seed for the random centroid initialization.

    Returns
    -------
    centroids : (k, d) array of cluster centers.
    labels : (n,) array mapping each point to its nearest centroid.
    """
    rng = np.random.RandomState(seed)
    n = len(X)
    # Step 1: Random initialization — k distinct data points become centroids.
    idx = rng.choice(n, k, replace=False)
    centroids = X[idx].copy()
    for iteration in range(max_iter):
        # Step 2: Assign each point to nearest centroid.
        # distances[i, j] = ||X[i] - centroids[j]||^2 — squared distance;
        # argmin is unchanged, so the sqrt is skipped.
        distances = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
        labels = np.argmin(distances, axis=1)
        # Step 3: Update each centroid to the mean of its assigned points.
        # Bug fix: an empty cluster keeps its previous centroid instead of
        # becoming NaN (mean of an empty slice).
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        # Converged when no centroid moved.
        if np.allclose(centroids, new_centroids):
            print(f"Converged at iteration {iteration}")
            break
        centroids = new_centroids
    # Bug fix: recompute the assignment so the returned labels always match
    # the returned centroids (when the loop exhausts max_iter right after an
    # update, labels would otherwise be one step stale; this also defines
    # labels when max_iter == 0).
    distances = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
    labels = np.argmin(distances, axis=1)
    return centroids, labels

# Test on Synthetic Data
from sklearn.datasets import make_blobs

# Four well-separated Gaussian blobs — an easy case for K-Means.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.8, random_state=42)
centroids, labels = kmeans(X, k=4)

# Points coloured by assigned cluster; centroids drawn as large X markers.
ax = plt.gca()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap="tab10", s=15, alpha=0.7)
ax.scatter(centroids[:, 0], centroids[:, 1], c="black", marker="X", s=200, edgecolors="white")
ax.set_title("K-Means clustering (k=4)")
plt.show()

Watch It Converge
Step-by-step visualization of the algorithm:
def kmeans_animated(X, k, max_iter=10, seed=42):
    """Plot up to six K-Means iterations side by side to show convergence.

    Each panel shows the assignment produced by the current centroids
    (black X markers) before those centroids are moved.

    Parameters
    ----------
    X : (n, d) array of data points (only the first two dims are plotted).
    k : number of clusters.
    max_iter : iterations to run (at most 6 panels are drawn).
    seed : seed for the random centroid initialization.
    """
    rng = np.random.RandomState(seed)
    idx = rng.choice(len(X), k, replace=False)
    centroids = X[idx].copy()
    fig, axes = plt.subplots(2, 3, figsize=(14, 9))
    axes = axes.ravel()  # flatten the 2x3 grid for simple 1-D indexing
    for i in range(min(max_iter, 6)):
        # Assignment step: nearest centroid by squared Euclidean distance.
        distances = np.sum((X[:, None] - centroids[None]) ** 2, axis=2)
        labels = np.argmin(distances, axis=1)
        ax = axes[i]
        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap="tab10", s=15, alpha=0.7)
        ax.scatter(centroids[:, 0], centroids[:, 1], c="black", marker="X", s=200, edgecolors="white")
        ax.set_title(f"Iteration {i}")
        # Update step. Bug fix: an empty cluster keeps its old centroid
        # instead of collapsing to a NaN mean (same fix as in kmeans).
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    plt.tight_layout()
    plt.show()
kmeans_animated(X, k=4)

The Elbow Method
How to choose k? Plot inertia (sum of squared distances to nearest centroid) vs k:
def inertia(X, centroids, labels):
    """Total within-cluster sum of squared distances (the K-Means objective)."""
    per_cluster = [np.sum((X[labels == j] - center) ** 2)
                   for j, center in enumerate(centroids)]
    return sum(per_cluster)
# Sweep candidate k values and record the final objective for each fit.
inertias = []
K_range = range(1, 10)
for k in K_range:
    c, l = kmeans(X, k)
    inertias.append(inertia(X, c, l))

# Inertia always decreases with k; the "bend" marks diminishing returns.
plt.plot(K_range, inertias, "o-")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow method — look for the bend")
plt.show()

The “elbow” is where adding more clusters gives diminishing returns. Here it’s at k = 4.
K-Means++ Initialization
Random init can give bad results. K-Means++ picks initial centroids that are spread out:
def kmeans_pp_init(X, k, seed=42):
    """K-Means++ initialization: pick spread-out initial centroids."""
    rng = np.random.RandomState(seed)
    n = len(X)
    # First centroid: a uniformly random data point.
    chosen = [X[rng.randint(n)]]
    while len(chosen) < k:
        # Squared distance from every point to its closest chosen centroid.
        nearest_sq = np.min([np.sum((X - c) ** 2, axis=1) for c in chosen], axis=0)
        # D^2 weighting: sample the next centroid with probability
        # proportional to that squared distance, so far-away points win.
        weights = nearest_sq / nearest_sq.sum()
        chosen.append(X[rng.choice(n, p=weights)])
    return np.array(chosen)
# Compare random vs K-Means++ init
def _run_lloyd(start_centroids, n_clusters=4, iters=100):
    """Plain Lloyd iterations from a given set of starting centroids."""
    cur = start_centroids.copy()
    for _ in range(iters):
        sq_dists = np.sum((X[:, None] - cur[None]) ** 2, axis=2)
        assign = np.argmin(sq_dists, axis=1)
        updated = np.array([X[assign == j].mean(axis=0) for j in range(n_clusters)])
        if np.allclose(cur, updated):
            break
        cur = updated
    return cur, assign

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (axes[0], "random", "Random init"),
    (axes[1], "kmeans++", "K-Means++ init"),
]
for ax, init_method, title in panels:
    if init_method == "random":
        c, l = kmeans(X, k=4, seed=0)  # bad seed
    else:
        # Run K-Means from the spread-out K-Means++ starting centroids.
        c, l = _run_lloyd(kmeans_pp_init(X, k=4, seed=0))
    ax.scatter(X[:, 0], X[:, 1], c=l, cmap="tab10", s=15, alpha=0.7)
    ax.scatter(c[:, 0], c[:, 1], c="black", marker="X", s=200, edgecolors="white")
    ax.set_title(title)
plt.tight_layout()
plt.show()

When K-Means Fails
K-Means assumes spherical, equally-sized clusters. It breaks on:
from sklearn.datasets import make_moons, make_circles

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Non-convex clusters: two interleaved half-moons.
X_moons, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
moon_cents, moon_labels = kmeans(X_moons, k=2)
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=moon_labels, cmap="tab10", s=15)
axes[0].set_title("K-Means on moons (fails)")

# Concentric clusters: one ring inside another.
X_circles, _ = make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)
ring_cents, ring_labels = kmeans(X_circles, k=2)
axes[1].scatter(X_circles[:, 0], X_circles[:, 1], c=ring_labels, cmap="tab10", s=15)
axes[1].set_title("K-Means on circles (fails)")

plt.tight_layout()
plt.show()

For these shapes, use DBSCAN or spectral clustering.
Exercises
-
Silhouette score: Implement the silhouette coefficient s = (b - a) / max(a, b), where a is the mean intra-cluster distance and b is the mean nearest-cluster distance. Plot silhouette vs k.
-
Image compression: Load an image as a (H*W, 3) array of RGB pixels. Run K-Means with a small k (e.g. k = 16) to reduce colors. Display the compressed images.
-
Mini-batch K-Means: Instead of using all points, randomly sample a batch of 50 points each iteration. Compare convergence speed vs accuracy.
-
Soft assignment: Instead of hard labels, compute a probability for each cluster using p_ij ∝ exp(-||x_i - μ_j||²) (a softmax over negative squared distances). This is the E-step of Gaussian Mixture Models.
Next: 05 - Neural Network from Scratch — combine linear layers and nonlinearities to learn any function.