Build a CNN from Scratch
Goal: Implement 2D convolution, max pooling, and a full CNN forward+backward pass in NumPy. Then verify with PyTorch. Inspired by Karpathy’s CS231n.
Prerequisites: Convolutional Neural Networks, Backpropagation, 05 - Neural Network from Scratch
What a Convolution Actually Does
A filter (small matrix) slides across an image, computing a dot product at each position. Each filter detects one pattern — edges, corners, textures.
import numpy as np
import matplotlib.pyplot as plt
def conv2d(input, kernel, stride=1, padding=0):
    """2D convolution (cross-correlation, as used in deep learning) of a single channel.

    input: (H, W) array; kernel: (kH, kW) filter.
    Returns an (out_H, out_W) feature map where
    out_H = (H + 2*padding - kH) // stride + 1, and likewise for out_W.
    """
    if padding > 0:
        # Zero-pad equally on all four sides.
        input = np.pad(input, padding, mode='constant')
    H, W = input.shape
    kH, kW = kernel.shape
    out_H = (H - kH) // stride + 1
    out_W = (W - kW) // stride + 1
    output = np.zeros((out_H, out_W))
    for i in range(out_H):
        for j in range(out_W):
            # Dot product of the filter with the patch under it.
            patch = input[i*stride:i*stride+kH, j*stride:j*stride+kW]
            output[i, j] = np.sum(patch * kernel)
    return output

# See it in action
# Create a simple 8x8 image with a vertical edge
img = np.zeros((8, 8))
img[:, 4:] = 1.0

# Edge-detection kernels
vertical_edge = np.array([[-1, 0, 1],
                          [-1, 0, 1],
                          [-1, 0, 1]])
horizontal_edge = np.array([[-1, -1, -1],
                            [ 0,  0,  0],
                            [ 1,  1,  1]])

# Show the input next to each filter's response
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    (img, 'gray', "Input"),
    (conv2d(img, vertical_edge), 'RdBu', "Vertical edge filter"),
    (conv2d(img, horizontal_edge), 'RdBu', "Horizontal edge filter"),
]
for ax, (data, cmap, title) in zip(axes, panels):
    ax.imshow(data, cmap=cmap)
    ax.set_title(title)
plt.tight_layout()
plt.show()

# Multi-Channel Convolution
Real images have 3 channels (RGB). Filters span all channels:
def conv2d_multi(input, kernels, bias=None, stride=1, padding=0):
    """
    Multi-channel 2D convolution.

    input: (C_in, H, W)
    kernels: (C_out, C_in, kH, kW)
    bias: (C_out,) or None
    output: (C_out, H_out, W_out)
    """
    n_filters, n_in, k_h, k_w = kernels.shape
    # Input channel count must match the filters' channel depth.
    assert input.shape[0] == n_in
    if padding > 0:
        pad_spec = ((0, 0), (padding, padding), (padding, padding))
        input = np.pad(input, pad_spec, mode='constant')
    _, height, width = input.shape
    rows = (height - k_h) // stride + 1
    cols = (width - k_w) // stride + 1
    output = np.zeros((n_filters, rows, cols))
    for r in range(rows):
        top = r * stride
        for c in range(cols):
            left = c * stride
            # One (C_in, kH, kW) window, dotted against every filter.
            window = input[:, top:top + k_h, left:left + k_w]
            for f in range(n_filters):
                output[f, r, c] = np.sum(window * kernels[f])
    if bias is not None:
        # Broadcast one scalar bias over each filter's feature map.
        output += bias[:, None, None]
    return output
# Test: 3-channel input, 2 filters
np.random.seed(42)
x = np.random.randn(3, 8, 8) # (C_in=3, H=8, W=8)
w = np.random.randn(2, 3, 3, 3) * 0.5 # (C_out=2, C_in=3, kH=3, kW=3)
b = np.zeros(2)
out = conv2d_multi(x, w, b, padding=1)
print(f"Input: {x.shape}")
print(f"Output: {out.shape}") # (2, 8, 8) — same spatial size with padding=1
# Max Pooling
Downsample by taking the maximum in each window:
def maxpool2d(input, pool_size=2, stride=2):
    """Max pooling over spatial dimensions.

    Returns the pooled map and, per output cell, the (row, col) input
    coordinates of the winning maximum — needed for the backward pass.
    """
    channels, height, width = input.shape
    rows = (height - pool_size) // stride + 1
    cols = (width - pool_size) // stride + 1
    output = np.zeros((channels, rows, cols))
    max_indices = np.zeros((channels, rows, cols, 2), dtype=int)
    for ch in range(channels):
        for r in range(rows):
            top = r * stride
            for c in range(cols):
                left = c * stride
                window = input[ch, top:top + pool_size, left:left + pool_size]
                # Locate the max once; its value is the pooled output.
                wr, wc = np.unravel_index(window.argmax(), window.shape)
                output[ch, r, c] = window[wr, wc]
                max_indices[ch, r, c] = [top + wr, left + wc]
    return output, max_indices
out_pool, pool_idx = maxpool2d(out)
print(f"After pool: {out_pool.shape}") # (2, 4, 4)
# Full CNN: Forward Pass
class SimpleCNN:
    """Two conv+pool stages followed by a fully connected softmax classifier.

    Layout for a (1, 28, 28) grayscale input:
        conv1 (1->4, 3x3, pad 1) -> ReLU -> 2x2 maxpool   -> (4, 14, 14)
        conv2 (4->8, 3x3, pad 1) -> ReLU -> 2x2 maxpool   -> (8, 7, 7)
        flatten (392,) -> fc (392 -> 10) -> softmax

    Intermediate activations are cached on self so a backward pass can
    reuse them. Relies on the module-level conv2d_multi and maxpool2d.
    """
    def __init__(self):
        np.random.seed(42)  # fixed seed so NumPy/PyTorch comparisons are reproducible
        # Conv1: 1 input channel → 4 filters, 3×3 (He init: sqrt(2 / fan_in), fan_in = 1*3*3)
        self.W1 = np.random.randn(4, 1, 3, 3) * np.sqrt(2.0 / 9)
        self.b1 = np.zeros(4)
        # Conv2: 4 → 8 filters, 3×3 (fan_in = 4*3*3 = 36)
        self.W2 = np.random.randn(8, 4, 3, 3) * np.sqrt(2.0 / 36)
        self.b2 = np.zeros(8)
        # Fully connected: 8*7*7 → 10 (for MNIST 28×28)
        self.W3 = np.random.randn(8 * 7 * 7, 10) * np.sqrt(2.0 / (8*7*7))
        self.b3 = np.zeros(10)
    def forward(self, x):
        """x: (1, 28, 28) single grayscale image. Returns class probabilities (10,)."""
        self.x = x
        # Conv1 + ReLU + Pool
        self.z1 = conv2d_multi(x, self.W1, self.b1, padding=1) # (4, 28, 28)
        self.a1 = np.maximum(0, self.z1) # ReLU
        self.p1, self.p1_idx = maxpool2d(self.a1) # (4, 14, 14)
        # Conv2 + ReLU + Pool
        self.z2 = conv2d_multi(self.p1, self.W2, self.b2, padding=1) # (8, 14, 14)
        self.a2 = np.maximum(0, self.z2)
        self.p2, self.p2_idx = maxpool2d(self.a2) # (8, 7, 7)
        # Flatten + FC
        self.flat = self.p2.reshape(-1) # (392,)
        self.logits = self.flat @ self.W3 + self.b3 # (10,)
        # Softmax — subtract the max logit for numerical stability
        exp_logits = np.exp(self.logits - self.logits.max())
        self.probs = exp_logits / exp_logits.sum()
        return self.probs
    def loss(self, probs, label):
        """Cross-entropy loss for one sample; 1e-15 guards against log(0)."""
        return -np.log(probs[label] + 1e-15)
# Test forward pass
from sklearn.datasets import load_digits
from scipy.ndimage import zoom

digits = load_digits()
# Upsample the 8x8 digit to 28x28 so the layer arithmetic matches MNIST
img = zoom(digits.images[0], 3.5, order=1)  # 8 * 3.5 = 28
img = (img - img.mean()) / (img.std() + 1e-5)  # normalize: zero mean, unit std
x = img[np.newaxis, :, :]  # add channel axis -> (1, 28, 28)

cnn = SimpleCNN()
probs = cnn.forward(x)
print(f"Input shape: {x.shape}")
print(f"Predictions: {probs.round(3)}")
print(f"Predicted class: {probs.argmax()}, True: {digits.target[0]}")

# Backward Pass: Convolution Gradient
The hardest part — the gradient through a convolution is itself a convolution:
def conv2d_backward(d_output, input, kernel, stride=1, padding=0):
    """Backward pass through conv2d_multi. Returns d_input, d_kernel, d_bias."""
    C_out, C_in, k_h, k_w = kernel.shape
    _, rows, cols = d_output.shape

    if padding > 0:
        pad_spec = ((0, 0), (padding, padding), (padding, padding))
        padded = np.pad(input, pad_spec, mode='constant')
    else:
        padded = input

    d_kernel = np.zeros_like(kernel)
    d_padded = np.zeros_like(padded)
    # Each filter's bias receives the sum of its output-gradient map.
    d_bias = d_output.sum(axis=(1, 2))

    for f in range(C_out):
        for r in range(rows):
            top = r * stride
            for c in range(cols):
                left = c * stride
                grad = d_output[f, r, c]
                window = padded[:, top:top + k_h, left:left + k_w]
                # Weight gradient: upstream grad times the input patch it saw.
                d_kernel[f] += grad * window
                # Input gradient: upstream grad scattered through the filter.
                d_padded[:, top:top + k_h, left:left + k_w] += grad * kernel[f]

    # Strip padding rows/cols — they correspond to no real input.
    d_input = d_padded[:, padding:-padding, padding:-padding] if padding > 0 else d_padded
    return d_input, d_kernel, d_bias
def maxpool_backward(d_output, max_indices, input_shape, pool_size=2, stride=2):
    """Backward pass through maxpool — gradient goes only to max locations.

    d_output: (C, out_H, out_W) upstream gradient
    max_indices: (C, out_H, out_W, 2) winner coordinates from maxpool2d
    input_shape: shape (C, H, W) of the original pooling input
    Returns d_input with shape input_shape.
    """
    C, H, W = input_shape
    d_input = np.zeros((C, H, W))
    C_out, out_H, out_W = d_output.shape
    for c in range(C_out):
        for i in range(out_H):
            for j in range(out_W):
                mi, mj = max_indices[c, i, j]
                # += (not =): windows can overlap when stride < pool_size
                d_input[c, mi, mj] += d_output[c, i, j]
    return d_input

# Verify with PyTorch
import torch
import torch.nn as nn

# Mirror the NumPy conv1 in PyTorch, copying the exact same weights
x_t = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # add batch dim -> (1, 1, 28, 28)
conv1 = nn.Conv2d(1, 4, 3, padding=1, bias=True)
conv1.weight.data = torch.tensor(cnn.W1, dtype=torch.float32)
conv1.bias.data = torch.tensor(cnn.b1, dtype=torch.float32)

# Forward through conv -> ReLU -> pool
z1 = conv1(x_t)
a1 = torch.relu(z1)
p1 = nn.functional.max_pool2d(a1, 2)

# Compare against the NumPy forward pass (atol covers float32 rounding)
print(f"NumPy conv1 output mean: {cnn.z1.mean():.6f}")
print(f"PyTorch conv1 output mean: {z1.detach().numpy().mean():.6f}")
print(f"Match: {np.allclose(cnn.z1, z1.detach().numpy()[0], atol=1e-5)}")

# What Each Layer Learns
# Visualize the 4 learned filters (here just random — would need training)
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
for f_idx, ax in enumerate(axes):
    ax.imshow(cnn.W1[f_idx, 0], cmap='RdBu')
    ax.set_title(f"Filter {f_idx}")
    ax.axis('off')
plt.suptitle("Conv1 filters (3×3)")
plt.show()

# Visualize activations: input alongside each feature map
fig, axes = plt.subplots(1, 5, figsize=(15, 3))
axes[0].imshow(x[0], cmap='gray')
axes[0].set_title("Input")
for f_idx in range(4):
    axes[f_idx + 1].imshow(cnn.a1[f_idx], cmap='viridis')
    axes[f_idx + 1].set_title(f"Feature map {f_idx}")
plt.suptitle("Conv1 activations after ReLU")
plt.tight_layout()
plt.show()

# Exercises
-
Train on MNIST: Implement a full training loop — forward, loss, backward, update. Train on sklearn’s 8×8 digits (simpler) or full MNIST. Target: >95% accuracy.
-
Stride experiment: Implement stride=2 convolution instead of max pooling. Compare: stride halves spatial size like pooling but learns the downsampling. Modern architectures prefer strided convolutions.
-
im2col trick: The nested loop convolution is slow. Implement im2col — unfold patches into columns so the convolution becomes a single matrix multiplication. This is how frameworks actually do it.
-
Depthwise separable convolution: Instead of (C_out, C_in, kH, kW) filters, use (C_in, 1, kH, kW) depthwise + (C_out, C_in, 1, 1) pointwise. Count the parameter savings. This is MobileNet’s trick.
Next: 23 - RNN Text Generator — process sequences one step at a time.