Neural Network from Scratch
Goal: Build a 2-layer neural network from scratch using only NumPy. Forward pass, backpropagation, training loop. Solve XOR and classify MNIST digits.
Prerequisites: Backpropagation, Neurons and Activation Functions, Chain Rule, Gradient Descent
Architecture
Input (d) → Linear → ReLU → Linear → Softmax → Loss
Two weight matrices, two bias vectors. That’s it.
Building Blocks
import numpy as np
import matplotlib.pyplot as plt
def relu(z):
    """Rectified linear unit: pass positive entries through, zero out the rest."""
    # Multiplying by the boolean mask keeps the input's dtype intact.
    return z * (z > 0)
def relu_derivative(z):
    """Derivative of ReLU: 1.0 where z > 0, otherwise 0.0 (including at z == 0)."""
    return np.where(z > 0, 1.0, 0.0)
def softmax(z):
    """Numerically stable softmax over the last axis of a 2-D array.

    Subtracting the row-wise max before exponentiating prevents overflow
    without changing the result (softmax is shift-invariant).
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    totals = np.sum(exps, axis=1, keepdims=True)
    return exps / totals
def cross_entropy_loss(y_onehot, probs):
    """Mean cross-entropy between one-hot labels and predicted probabilities.

    Args:
        y_onehot: (n, k) one-hot label matrix.
        probs: (n, k) predicted probabilities (rows sum to 1).

    Returns:
        Scalar average loss over the n samples.
    """
    n = len(y_onehot)
    # 1e-15 guards log(0) when a predicted probability is exactly zero.
    return -np.sum(y_onehot * np.log(probs + 1e-15)) / n

# The Network
class NeuralNetwork:
    """Two-layer fully connected network: Linear -> ReLU -> Linear -> Softmax.

    Trained with full-batch gradient descent on the cross-entropy loss.
    Weights W1 (input_dim, hidden_dim) and W2 (hidden_dim, output_dim),
    biases b1 and b2.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.01):
        # He initialization — sqrt(2 / fan_in) keeps activation variance stable
        # through ReLU layers. (The original comment said "Xavier", but Xavier
        # uses sqrt(1 / fan_in); this scale factor is He's.)
        self.W1 = np.random.randn(input_dim, hidden_dim) * np.sqrt(2.0 / input_dim)
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.random.randn(hidden_dim, output_dim) * np.sqrt(2.0 / hidden_dim)
        self.b2 = np.zeros(output_dim)
        self.lr = lr

    def forward(self, X):
        """Forward pass. Saves intermediates for backprop; returns (n, output_dim) probabilities."""
        self.z1 = X @ self.W1 + self.b1        # pre-activation, hidden
        self.a1 = relu(self.z1)                # post-activation, hidden
        self.z2 = self.a1 @ self.W2 + self.b2  # pre-activation, output
        self.a2 = softmax(self.z2)             # probabilities
        return self.a2

    def backward(self, X, y_onehot):
        """Backpropagation: compute gradients from the cached forward pass, apply one GD step."""
        n = len(X)
        # Output layer: for softmax + cross-entropy the gradient simplifies
        # to (probs - labels).
        dz2 = self.a2 - y_onehot              # (n, output_dim)
        dW2 = (1 / n) * self.a1.T @ dz2       # (hidden_dim, output_dim)
        db2 = (1 / n) * dz2.sum(axis=0)       # (output_dim,)
        # Hidden layer: chain rule through W2, then gate by the ReLU derivative.
        da1 = dz2 @ self.W2.T                 # (n, hidden_dim)
        dz1 = da1 * relu_derivative(self.z1)  # element-wise
        dW1 = (1 / n) * X.T @ dz1             # (input_dim, hidden_dim)
        db1 = (1 / n) * dz1.sum(axis=0)       # (hidden_dim,)
        # Gradient-descent update.
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2

    def train(self, X, y, n_epochs=1000, verbose=True):
        """Full-batch training loop.

        Args:
            X: (n, input_dim) inputs.
            y: (n,) integer class labels.
            n_epochs: number of full-batch passes.
            verbose: print loss/accuracy roughly 10 times during training.

        Returns:
            List of per-epoch loss values.
        """
        n_classes = len(np.unique(y))
        y_onehot = np.eye(n_classes)[y]
        losses = []
        # max(1, ...) avoids ZeroDivisionError when n_epochs < 10
        # (the original used `epoch % (n_epochs // 10)` directly).
        log_every = max(1, n_epochs // 10)
        for epoch in range(n_epochs):
            probs = self.forward(X)
            loss = cross_entropy_loss(y_onehot, probs)
            losses.append(loss)
            self.backward(X, y_onehot)
            if verbose and epoch % log_every == 0:
                acc = (probs.argmax(axis=1) == y).mean()
                print(f"Epoch {epoch:4d} | Loss: {loss:.4f} | Acc: {acc:.4f}")
        return losses

    def predict(self, X):
        """Return the argmax class index for each row of X."""
        return self.forward(X).argmax(axis=1)

# Test 1: XOR Problem
XOR is the classic problem a single-layer perceptron can’t solve:
# XOR — the classic problem a single-layer perceptron cannot solve.
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y_xor = np.array([0, 1, 1, 0])

net = NeuralNetwork(input_dim=2, hidden_dim=8, output_dim=2, lr=0.5)
losses = net.train(X_xor, y_xor, n_epochs=2000, verbose=False)

predicted = net.predict(X_xor)
print("Predictions:", predicted)  # should be [0, 1, 1, 0]
print("True labels:", y_xor)

# Plot the training curve.
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("XOR training")
plt.show()

# The hidden layer creates a nonlinear feature space where XOR becomes
# linearly separable.
Test 2: MNIST Digits
# Test 2: load the 8x8 digit images (1797 samples, 64 features each).
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

digits = load_digits()
y = digits.target
# Standardize features to zero mean / unit variance.
X = StandardScaler().fit_transform(digits.data)

# Hold out 20% for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a 64 -> 128 -> 10 network.
net = NeuralNetwork(input_dim=64, hidden_dim=128, output_dim=10, lr=0.1)
losses = net.train(X_train, y_train, n_epochs=500)

# Evaluate on both splits.
train_acc = (net.predict(X_train) == y_train).mean()
test_acc = (net.predict(X_test) == y_test).mean()
print(f"\nTrain accuracy: {train_acc:.4f}")
print(f"Test accuracy: {test_acc:.4f}")

# You should get ~95%+ on test — with a single hidden layer of 128 neurons,
# no frameworks.
Visualize What the Network Learned
# Show the first ten test images with predicted vs. true labels.
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
preds = net.predict(X_test)
for idx, panel in enumerate(axes.ravel()):
    panel.imshow(X_test[idx].reshape(8, 8), cmap="gray")
    # Green title for a correct prediction, red for a miss.
    correct = preds[idx] == y_test[idx]
    panel.set_title(f"pred={preds[idx]}, true={y_test[idx]}",
                    color="green" if correct else "red", fontsize=10)
    panel.axis("off")
plt.tight_layout()
plt.show()

# Mini-Batch Training
Full-batch is slow for large datasets. Use random subsets:
def train_minibatch(net, X, y, n_epochs=100, batch_size=32):
    """Train `net` with shuffled mini-batches of gradient descent.

    Args:
        net: a NeuralNetwork instance (uses its forward/backward methods).
        X: (n, d) inputs.
        y: (n,) integer class labels.
        n_epochs: number of full passes over the data.
        batch_size: samples per gradient step.

    Returns:
        List of per-epoch mean losses.
    """
    n_classes = len(np.unique(y))
    targets = np.eye(n_classes)[y]
    n_samples = len(X)
    history = []
    for _ in range(n_epochs):
        # Fresh random order each epoch.
        order = np.random.permutation(n_samples)
        X_epoch = X[order]
        t_epoch = targets[order]
        running = 0.0
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            xb = X_epoch[start:stop]
            tb = t_epoch[start:stop]
            out = net.forward(xb)
            # Weight each batch's loss by its size so the epoch mean is exact
            # even when the final batch is short.
            running += cross_entropy_loss(tb, out) * len(xb)
            net.backward(xb, tb)
        history.append(running / n_samples)
    return history
# Same architecture, trained with mini-batch SGD instead of full batches.
net2 = NeuralNetwork(64, 128, 10, lr=0.01)
losses = train_minibatch(net2, X_train, y_train, n_epochs=200, batch_size=32)
acc = (net2.predict(X_test) == y_test).mean()
print(f"Mini-batch test accuracy: {acc:.4f}")

# The Gradient Flow
Why this architecture works — trace the gradient through each layer:
Loss → dz2 = (probs - labels) # softmax+CE simplification
→ dW2 = a1.T @ dz2 # how W2 should change
→ da1 = dz2 @ W2.T # error signal to hidden layer
→ dz1 = da1 * relu'(z1) # ReLU gates the gradient
→ dW1 = X.T @ dz1 # how W1 should change
ReLU derivative is 0 or 1 — it either passes the gradient through or kills it. This is why “dying ReLU” (neuron always outputs 0) is a problem.
Exercises
-
Add a third layer: Make it Input → 128 → 64 → 10. Does it improve MNIST accuracy? Watch for vanishing gradients.
-
Implement dropout: During training, randomly zero out hidden activations with probability $p$ (try $p = 0.5$). Scale the remaining activations by $1/(1-p)$ so the expected activation is unchanged. Compare test accuracy.
-
Momentum: Add velocity terms: $v \leftarrow \beta v + \nabla W$, then $W \leftarrow W - \eta v$. Use $\beta = 0.9$. How does convergence change?
-
Visualize hidden representations: After training, run X through the first layer only. Use PCA or t-SNE to plot the 128-dim hidden vectors in 2D, colored by digit class.
-
Learning rate finder: Train for 1 epoch with lr exponentially increasing from 1e-5 to 10. Plot loss vs lr. The best lr is just before the loss starts rising.
Next: 06 - Gradient Descent Variants — SGD, momentum, Adam, and why the optimizer matters.