Build a CNN from Scratch

Goal: Implement 2D convolution, max pooling, and a full CNN forward+backward pass in NumPy. Then verify with PyTorch. Inspired by Karpathy’s CS231n.

Prerequisites: Convolutional Neural Networks, Backpropagation, 05 - Neural Network from Scratch


What a Convolution Actually Does

A filter (small matrix) slides across an image, computing a dot product at each position. Each filter detects one pattern — edges, corners, textures.

import numpy as np
import matplotlib.pyplot as plt
 
def conv2d(input, kernel, stride=1, padding=0):
    """2D convolution of a single channel."""
    if padding > 0:
        input = np.pad(input, padding, mode='constant')
 
    H, W = input.shape
    kH, kW = kernel.shape
    out_H = (H - kH) // stride + 1
    out_W = (W - kW) // stride + 1
 
    output = np.zeros((out_H, out_W))
    for i in range(out_H):
        for j in range(out_W):
            patch = input[i*stride:i*stride+kH, j*stride:j*stride+kW]
            output[i, j] = np.sum(patch * kernel)
    return output

See it in action

# Synthetic 8x8 test image: dark left half (0), bright right half (1),
# producing exactly one sharp vertical edge down the middle.
img = np.zeros((8, 8))
img[:, 4:] = 1.0

# Hand-crafted Prewitt-style edge detectors: each responds strongly where
# intensity changes along one axis and is zero on flat regions.
vertical_edge = np.array([[-1, 0, 1]] * 3)

horizontal_edge = np.array([[-1, -1, -1],
                            [0, 0, 0],
                            [1, 1, 1]])

# Show the input next to each filter response.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    (img, 'gray', "Input"),
    (conv2d(img, vertical_edge), 'RdBu', "Vertical edge filter"),
    (conv2d(img, horizontal_edge), 'RdBu', "Horizontal edge filter"),
]
for ax, (data, cmap, title) in zip(axes, panels):
    ax.imshow(data, cmap=cmap)
    ax.set_title(title)
plt.tight_layout()
plt.show()

Multi-Channel Convolution

Real images have 3 channels (RGB). Filters span all channels:

def conv2d_multi(input, kernels, bias=None, stride=1, padding=0):
    """Multi-channel 2D convolution: every filter spans all input channels.

    input:   (C_in, H, W)
    kernels: (C_out, C_in, kH, kW)
    bias:    (C_out,) or None
    output:  (C_out, H_out, W_out)
    """
    C_out, C_in, kH, kW = kernels.shape
    C, H, W = input.shape
    assert C == C_in

    if padding > 0:
        pad_spec = ((0, 0), (padding, padding), (padding, padding))
        input = np.pad(input, pad_spec, mode='constant')
        _, H, W = input.shape

    out_H = (H - kH) // stride + 1
    out_W = (W - kW) // stride + 1
    output = np.zeros((C_out, out_H, out_W))

    # Each output cell is independent, so the loop order is free to change:
    # extract the patch once per spatial position, then apply every filter.
    for i in range(out_H):
        top = i * stride
        for j in range(out_W):
            left = j * stride
            patch = input[:, top:top + kH, left:left + kW]
            for f in range(C_out):
                output[f, i, j] = np.sum(patch * kernels[f])

    if bias is not None:
        # Broadcast one scalar bias per output channel.
        output += bias[:, None, None]

    return output
 
# Test: 3-channel input, 2 filters
# NOTE: the seed fixes the randn draw order, and x/w/b/`out` are reused by
# later cells (the max-pool demo pools `out`) — keep these statements in order.
np.random.seed(42)
x = np.random.randn(3, 8, 8)         # (C_in=3, H=8, W=8)
w = np.random.randn(2, 3, 3, 3) * 0.5  # (C_out=2, C_in=3, kH=3, kW=3)
b = np.zeros(2)

out = conv2d_multi(x, w, b, padding=1)
print(f"Input:  {x.shape}")
print(f"Output: {out.shape}")  # (2, 8, 8) — same spatial size with padding=1

Max Pooling

Downsample by taking the maximum in each window:

def maxpool2d(input, pool_size=2, stride=2):
    """Max pooling over the spatial dimensions of a (C, H, W) array.

    Returns (output, max_indices) where max_indices[c, i, j] holds the
    (row, col) of the winning element in the ORIGINAL input coordinates —
    the backward pass routes gradients to exactly those positions.
    """
    C, H, W = input.shape
    out_H = (H - pool_size) // stride + 1
    out_W = (W - pool_size) // stride + 1
    output = np.zeros((C, out_H, out_W))
    max_indices = np.zeros((C, out_H, out_W, 2), dtype=int)

    for c in range(C):
        for i in range(out_H):
            r0 = i * stride
            for j in range(out_W):
                c0 = j * stride
                window = input[c, r0:r0 + pool_size, c0:c0 + pool_size]
                # One argmax gives both the pooled value and its location.
                di, dj = np.unravel_index(window.argmax(), window.shape)
                output[c, i, j] = window[di, dj]
                max_indices[c, i, j] = [r0 + di, c0 + dj]

    return output, max_indices
 
# Pool the conv output from the previous cell: (2, 8, 8) -> (2, 4, 4).
out_pool, pool_idx = maxpool2d(out)
print(f"After pool: {out_pool.shape}")  # (2, 4, 4)

Full CNN: Forward Pass

class SimpleCNN:
    """Two conv/ReLU/pool stages followed by a linear softmax classifier.

    Sized for a single (1, 28, 28) grayscale image (MNIST-shaped input).
    forward() caches every intermediate as an attribute (z1, a1, p1, ...)
    so a backward pass — and the visualization cells — can read them.
    """

    def __init__(self):
        np.random.seed(42)
        # He initialization: std = sqrt(2 / fan_in), suited to ReLU layers.
        # Conv1: 1 input channel -> 4 filters, 3x3 (fan_in = 1*3*3 = 9)
        self.W1 = np.random.randn(4, 1, 3, 3) * np.sqrt(2.0 / 9)
        self.b1 = np.zeros(4)
        # Conv2: 4 -> 8 filters, 3x3 (fan_in = 4*3*3 = 36)
        self.W2 = np.random.randn(8, 4, 3, 3) * np.sqrt(2.0 / 36)
        self.b2 = np.zeros(8)
        # Fully connected: 8*7*7 flattened features -> 10 classes
        self.W3 = np.random.randn(8 * 7 * 7, 10) * np.sqrt(2.0 / (8 * 7 * 7))
        self.b3 = np.zeros(10)

    def forward(self, x):
        """x: (1, 28, 28) single grayscale image. Returns (10,) class probs."""
        self.x = x

        # Stage 1: conv -> ReLU -> 2x2 max pool
        self.z1 = conv2d_multi(x, self.W1, self.b1, padding=1)        # (4, 28, 28)
        self.a1 = np.maximum(self.z1, 0)                              # ReLU
        self.p1, self.p1_idx = maxpool2d(self.a1)                     # (4, 14, 14)

        # Stage 2: conv -> ReLU -> 2x2 max pool
        self.z2 = conv2d_multi(self.p1, self.W2, self.b2, padding=1)  # (8, 14, 14)
        self.a2 = np.maximum(self.z2, 0)
        self.p2, self.p2_idx = maxpool2d(self.a2)                     # (8, 7, 7)

        # Classifier head: flatten, affine, numerically stable softmax
        self.flat = self.p2.reshape(-1)                               # (392,)
        self.logits = self.flat @ self.W3 + self.b3                   # (10,)
        shifted = np.exp(self.logits - self.logits.max())             # avoid overflow
        self.probs = shifted / shifted.sum()

        return self.probs

    def loss(self, probs, label):
        """Cross-entropy of the true class; 1e-15 guards against log(0)."""
        return -np.log(probs[label] + 1e-15)
 
# Test forward pass
from sklearn.datasets import load_digits
digits = load_digits()
# Resize 8x8 to 28x28 for realistic CNN
from scipy.ndimage import zoom
img = zoom(digits.images[0], 3.5, order=1)  # 8→28 (8 * 3.5 = 28), bilinear
img = (img - img.mean()) / (img.std() + 1e-5)  # normalize to ~zero mean, unit std
x = img[np.newaxis, :, :]  # (1, 28, 28) — add the channel axis conv2d_multi expects

cnn = SimpleCNN()
probs = cnn.forward(x)  # weights are untrained, so the prediction is essentially random
print(f"Input shape: {x.shape}")
print(f"Predictions: {probs.round(3)}")
print(f"Predicted class: {probs.argmax()}, True: {digits.target[0]}")

Backward Pass: Convolution Gradient

The hardest part — both gradients are themselves convolutions: the gradient with respect to the input is a full convolution of the upstream gradient with the flipped kernel, and the gradient with respect to the weights is a convolution of the input with the upstream gradient. The loop implementation below computes both by accumulating over every output position:

def conv2d_backward(d_output, input, kernel, stride=1, padding=0):
    """Backward pass through conv2d_multi.

    d_output: (C_out, out_H, out_W) upstream gradient
    input:    (C_in, H, W) forward-pass input (unpadded)
    kernel:   (C_out, C_in, kH, kW) forward-pass weights
    Returns (d_input, d_kernel, d_bias) with the same shapes as
    (input, kernel, bias) from the forward pass.
    """
    C_out, C_in, kH, kW = kernel.shape
    _, out_H, out_W = d_output.shape

    if padding > 0:
        pad_spec = ((0, 0), (padding, padding), (padding, padding))
        input_padded = np.pad(input, pad_spec, mode='constant')
    else:
        input_padded = input

    d_kernel = np.zeros_like(kernel)
    d_input_padded = np.zeros_like(input_padded)
    # Each filter's bias is added at every spatial position, so its
    # gradient is just the sum of the upstream gradients.
    d_bias = d_output.sum(axis=(1, 2))

    for f in range(C_out):
        for i in range(out_H):
            top = i * stride
            for j in range(out_W):
                left = j * stride
                g = d_output[f, i, j]
                window = input_padded[:, top:top + kH, left:left + kW]
                # Weight gradient: the input patch, scaled by upstream grad.
                d_kernel[f] += g * window
                # Input gradient: the filter scattered back onto the patch.
                d_input_padded[:, top:top + kH, left:left + kW] += g * kernel[f]

    # Gradients that landed on padding rows/cols are discarded.
    if padding > 0:
        return d_input_padded[:, padding:-padding, padding:-padding], d_kernel, d_bias
    return d_input_padded, d_kernel, d_bias
 
def maxpool_backward(d_output, max_indices, input_shape, pool_size=2, stride=2):
    """Backward pass through maxpool2d.

    Max pooling is a selection, so each pooled gradient flows entirely to
    the single input position that produced the max (recorded in
    max_indices by the forward pass); all other positions get zero.
    """
    C, H, W = input_shape
    d_input = np.zeros((C, H, W))
    C_out, out_H, out_W = d_output.shape

    for c in range(C_out):
        for i in range(out_H):
            for j in range(out_W):
                src_r, src_c = max_indices[c, i, j]
                d_input[c, src_r, src_c] += d_output[c, i, j]

    return d_input

Verify with PyTorch

import torch
import torch.nn as nn

# Set up matching PyTorch model
x_t = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # (1, 1, 28, 28) — add batch dim

conv1 = nn.Conv2d(1, 4, 3, padding=1, bias=True)
# Copy the NumPy weights in so both implementations compute the same conv.
conv1.weight.data = torch.tensor(cnn.W1, dtype=torch.float32)
conv1.bias.data = torch.tensor(cnn.b1, dtype=torch.float32)

# Forward
z1 = conv1(x_t)
a1 = torch.relu(z1)
p1 = nn.functional.max_pool2d(a1, 2)

# Compare against cnn.z1 cached by the earlier cnn.forward(x) call.
# atol=1e-5 tolerates float32 (PyTorch) vs float64 (NumPy) rounding.
print(f"NumPy conv1 output mean: {cnn.z1.mean():.6f}")
print(f"PyTorch conv1 output mean: {z1.detach().numpy().mean():.6f}")
print(f"Match: {np.allclose(cnn.z1, z1.detach().numpy()[0], atol=1e-5)}")

What Each Layer Learns

# Visualize the 4 learned filters (here just random — would need training)
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
for i in range(4):
    axes[i].imshow(cnn.W1[i, 0], cmap='RdBu')  # [i, 0]: filter i, input channel 0
    axes[i].set_title(f"Filter {i}")
    axes[i].axis('off')
plt.suptitle("Conv1 filters (3×3)")
plt.show()

# Visualize activations (cnn.a1 was cached by the earlier cnn.forward(x) call)
fig, axes = plt.subplots(1, 5, figsize=(15, 3))
axes[0].imshow(x[0], cmap='gray'); axes[0].set_title("Input")
for i in range(4):
    axes[i+1].imshow(cnn.a1[i], cmap='viridis'); axes[i+1].set_title(f"Feature map {i}")
plt.suptitle("Conv1 activations after ReLU")
plt.tight_layout()
plt.show()

Exercises

  1. Train on MNIST: Implement a full training loop — forward, loss, backward, update. Train on sklearn’s 8×8 digits (simpler) or full MNIST. Target: >95% accuracy.

  2. Stride experiment: Implement stride=2 convolution instead of max pooling. Compare: stride halves spatial size like pooling but learns the downsampling. Modern architectures prefer strided convolutions.

  3. im2col trick: The nested loop convolution is slow. Implement im2col — unfold patches into columns so the convolution becomes a single matrix multiplication. This is how frameworks actually do it.

  4. Depthwise separable convolution: Instead of (C_out, C_in, kH, kW) filters, use (C_in, 1, kH, kW) depthwise + (C_out, C_in, 1, 1) pointwise. Count the parameter savings. This is MobileNet’s trick.


Next: 23 - RNN Text Generator — process sequences one step at a time.