Bigram Language Model
Goal: Build a character-level bigram language model in two ways: first by counting statistics, then with a neural network. Use it to generate names. Inspired by Karpathy’s makemore Part 1.
Prerequisites: Language Models, Probability Basics, Loss Functions, 02 - Logistic Regression from Scratch
The Idea
A bigram model predicts the next character using only the previous character. Simple — but it’s the foundation for everything from GPT onward.
Given “hello”: P(h|start), P(e|h), P(l|e), P(l|l), P(o|l), P(end|o).
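The probability of a whole word is the product of these per-transition probabilities. As a minimal sketch (the bigrams helper is purely illustrative; the '.' start/end marker matches the one used in the code below), here are the transitions whose probabilities get multiplied together:
def bigrams(word, boundary='.'):
    """List the (previous char, next char) transitions for one word."""
    chs = [boundary] + list(word) + [boundary]
    return list(zip(chs, chs[1:]))

print(bigrams("hello"))
# [('.', 'h'), ('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', '.')]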
Dataset
We’ll use a list of names. Any list of words works:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os

# Use a local names.txt if present, otherwise fall back to a small built-in list
words = open('names.txt', 'r').read().splitlines() if os.path.exists('names.txt') else [
"emma", "olivia", "ava", "sophia", "isabella", "mia", "charlotte", "amelia",
"harper", "evelyn", "abigail", "emily", "elizabeth", "sofia", "avery",
"ella", "scarlett", "grace", "chloe", "victoria", "riley", "aria", "lily",
"aurora", "zoey", "nora", "luna", "hannah", "penelope", "layla",
"eleanor", "stella", "violet", "hazel", "aurora", "savannah", "audrey",
"brooklyn", "bella", "claire", "skylar", "lucy", "paisley", "everly",
"anna", "caroline", "genesis", "emilia", "kennedy", "maya", "willow",
]
print(f"{len(words)} words")
print(f"Examples: {words[:5]}")
# Character vocabulary
chars = sorted(set(''.join(words)))
stoi = {c: i+1 for i, c in enumerate(chars)} # char → int
stoi['.'] = 0 # special start/end token
itos = {i: c for c, i in stoi.items()}
vocab_size = len(stoi)
print(f"Vocab: {vocab_size} chars: {''.join(chars)}")
Method 1: Counting
Build a 27×27 matrix where entry (i,j) counts how often character j follows character i:
import numpy as np
N = np.zeros((vocab_size, vocab_size), dtype=int)
for word in words:
    chs = ['.'] + list(word) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        N[stoi[c1], stoi[c2]] += 1
# Visualize
plt.figure(figsize=(12, 12))
plt.imshow(N, cmap='Blues')
for i in range(vocab_size):
    for j in range(vocab_size):
        plt.text(j, i, itos[i] + itos[j], ha='center', va='bottom', fontsize=6, color='gray')
        plt.text(j, i, str(N[i, j]), ha='center', va='top', fontsize=6)
plt.title("Bigram counts")
plt.show()
Convert to probabilities and sample
# Add smoothing to avoid zero probabilities
P = (N + 1).astype(float) # Laplace smoothing
P = P / P.sum(axis=1, keepdims=True) # normalize rows
def sample_counting(n=10):
    """Generate names by sampling from bigram probabilities."""
    names = []
    for _ in range(n):
        name = []
        idx = 0  # start token '.'
        while True:
            probs = P[idx]
            idx = np.random.choice(vocab_size, p=probs)
            if idx == 0:  # end token
                break
            name.append(itos[idx])
        names.append(''.join(name))
    return names
print("Counting model samples:")
for name in sample_counting(10):
    print(f" {name}")
Evaluate with negative log-likelihood
log_likelihood = 0
count = 0
for word in words:
    chs = ['.'] + list(word) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        prob = P[stoi[c1], stoi[c2]]
        log_likelihood += np.log(prob)
        count += 1
nll = -log_likelihood / count
print(f"Negative log-likelihood: {nll:.4f}")
# Lower is better. Random would be -log(1/27) ≈ 3.30
Method 2: Neural Network
Same problem, but learn the probabilities with gradient descent instead of counting:
# Build training data: pairs of (input_char, target_char)
xs, ys = [], []
for word in words:
    chs = ['.'] + list(word) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        xs.append(stoi[c1])
        ys.append(stoi[c2])
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(f"Training examples: {len(xs)}")
print(f"Example: '{itos[xs[0].item()]}' → '{itos[ys[0].item()]}'")
One-hot encoding → linear layer → softmax
# The model is a single weight matrix: W[27, 27]
# Input: one-hot vector for current char
# Output: logits for next char → softmax → probabilities
W = torch.randn(vocab_size, vocab_size, requires_grad=True)
# Training loop
losses = []
for epoch in range(200):
    # Forward pass
    x_onehot = F.one_hot(xs, num_classes=vocab_size).float()  # (N, 27)
    logits = x_onehot @ W                                      # (N, 27)
    counts = logits.exp()                                      # softmax numerator
    probs = counts / counts.sum(dim=1, keepdim=True)           # softmax
    # Loss: negative log-likelihood
    loss = -probs[torch.arange(len(ys)), ys].log().mean()
    # Add L2 regularization to prevent extreme weights
    loss = loss + 0.01 * (W ** 2).mean()
    losses.append(loss.item())
    # Backward pass
    W.grad = None
    loss.backward()
    # Update
    W.data -= 50 * W.grad
    if epoch % 50 == 0:
        print(f"Epoch {epoch:3d} | Loss: {loss.item():.4f}")
plt.plot(losses)
plt.xlabel("Epoch"); plt.ylabel("NLL")
plt.title("Neural bigram training")
plt.show()
Sample from the neural model
def sample_neural(n=10):
    """Generate names by sampling from the trained neural bigram model."""
    names = []
    for _ in range(n):
        name = []
        idx = 0  # start token '.'
        while True:
            x_oh = F.one_hot(torch.tensor(idx), num_classes=vocab_size).float()
            logits = x_oh @ W
            probs = F.softmax(logits, dim=0)
            idx = torch.multinomial(probs, 1).item()
            if idx == 0:  # end token
                break
            name.append(itos[idx])
        names.append(''.join(name))
    return names
print("\nNeural model samples:")
for name in sample_neural(10):
    print(f" {name}")
Why This Matters
The neural bigram model converges to the same probabilities as counting (up to regularization). But the neural approach:
- Generalizes to more context (condition on 2 previous chars instead of 1 → trigram)
- Shares parameters through embeddings (similar chars can produce similar outputs; see the sketch after the verification plot below)
- Scales to arbitrary architectures (MLPs, RNNs, transformers)
The counting approach is a dead end. The neural approach is the starting point for GPT.
# Verify: the neural model's probabilities ≈ the counting probabilities.
# Raw weights match log-counts only up to a per-row additive constant
# (softmax is shift-invariant), so compare normalized log-probabilities.
W_logprobs = F.log_softmax(W, dim=1).detach().numpy()
P_log = np.log(P + 1e-10)
# Compare row 0 (start token '.')
plt.figure(figsize=(10, 4))
plt.bar(range(vocab_size), P_log[0], alpha=0.5, label="log(counting)")
plt.bar(range(vocab_size), W_logprobs[0], alpha=0.5, label="neural log-probs")
plt.xticks(range(vocab_size), [itos[i] for i in range(vocab_size)])
plt.legend()
plt.title("Neural model converges to the counting model")
plt.show()
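The parameter-sharing bullet above deserves a closer look. A minimal sketch, assuming the trained W and xs from this lesson are still in scope (the emb variable is purely illustrative): indexing a row of an embedding table is exactly the same computation as the one-hot multiply, which is why the next lesson can swap in embeddings without changing anything conceptually.
import torch.nn as nn

# Sketch: an embedding table of the same shape as W; indexing it is a row lookup,
# equivalent to one_hot(x) @ W but cheaper and easier to extend to longer contexts.
emb = nn.Embedding(vocab_size, vocab_size)
with torch.no_grad():
    emb.weight.copy_(W)  # reuse the trained bigram weights
logits_emb = emb(xs)     # (N, 27): simply W[xs]
logits_onehot = F.one_hot(xs, num_classes=vocab_size).float() @ W
print(torch.allclose(logits_emb, logits_onehot))  # True: identical logits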
Exercises
- Trigram counting model: Extend to P(c3 | c1, c2). You’ll need a 27×27×27 tensor. How much does the loss improve?
- Temperature sampling: Divide the logits by a temperature T before the softmax. Try T < 1 (sharper), T = 1 (normal), and T > 1 (flatter); see the sketch after this list. How does generation quality change?
- Different datasets: Try this on city names, Pokémon names, or last names. How does the generated “style” change?
- Loss analysis: Print the highest-loss bigrams. These are the transitions the model finds most surprising. Do they make sense?
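For the temperature exercise, the only change is one division before the softmax. A minimal sketch of the modified sampling distribution (the next_char_probs helper and the example temperatures are illustrative, not part of the lesson):
def next_char_probs(idx, temperature=1.0):
    """Distribution over the next char, with temperature-scaled logits."""
    x_oh = F.one_hot(torch.tensor(idx), num_classes=vocab_size).float()
    logits = x_oh @ W
    return F.softmax(logits / temperature, dim=0)  # T < 1 sharper, T > 1 flatter

# Compare the start-token distribution at a few temperatures
for T in (0.5, 1.0, 2.0):
    print(T, next_char_probs(0, T).max().item())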
Next: 17 - MLP Language Model — give the model more context with embeddings and hidden layers.