Build a BPE Tokenizer
Goal: Build a Byte Pair Encoding tokenizer from scratch. Understand how LLMs turn text into tokens — the most underappreciated piece of the pipeline. Inspired by Karpathy’s minbpe.
Prerequisites: Text Preprocessing, Language Models, Embeddings
Why Tokenization Matters
Character (byte)-level: "hello" → ['h', 'e', 'l', 'l', 'o'] — 5 tokens, and the vocabulary is tiny (just 256 ids if we use raw bytes). But sequences are long, so the model needs a huge context.
Word-level: "hello" → ['hello'] — 1 token, but vocabulary is 100K+ words. Can’t handle typos, new words, code.
BPE: "hello" → ['hel', 'lo'] — middle ground. Start with bytes, iteratively merge the most common pairs. Vocabulary of 50K-100K covers everything.
The Algorithm
- Start with raw bytes (UTF-8)
- Count all adjacent pairs
- Merge the most frequent pair into a new token
- Repeat until desired vocab size
def get_stats(ids):
    """Count frequency of each adjacent pair.

    Returns a dict mapping (id_a, id_b) -> number of times that pair
    occurs at adjacent positions in *ids*. Empty/single-element input
    yields an empty dict.
    """
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts
def merge(ids, pair, new_id):
    """Replace all occurrences of pair with new_id.

    Scans left to right and consumes pairs greedily, so overlapping
    occurrences merge only once: [1, 1, 1] with pair (1, 1) -> [new_id, 1].
    Returns a new list; *ids* is not mutated.
    """
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(new_id)
            i += 2  # skip both members of the merged pair
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

# Training the Tokenizer
# Training text
text = """The quick brown fox jumps over the lazy dog. The dog barked at the fox.
The fox ran away quickly. The quick brown fox returned later that evening.
Natural language processing is a field of artificial intelligence.
Language models learn to predict the next token in a sequence.
Tokenization is the first step in any NLP pipeline."""

# Convert to bytes (UTF-8)
tokens = list(text.encode("utf-8"))
print(f"Original: {len(tokens)} bytes")
print(f"First 20 bytes: {tokens[:20]}")

# BPE training
vocab_size_target = 276  # 256 base bytes + 20 merges
num_merges = vocab_size_target - 256
merges = {}  # (pair) → new_id
for i in range(num_merges):
    stats = get_stats(tokens)
    if not stats:
        break  # sequence collapsed to a single token — nothing left to merge
    top_pair = max(stats, key=stats.get)
    new_id = 256 + i
    tokens = merge(tokens, top_pair, new_id)
    merges[top_pair] = new_id
    print(f"Merge {i:2d}: {top_pair} → {new_id} | "
          f"count={stats[top_pair]:3d} | tokens remaining: {len(tokens)}")

print(f"\nCompression: {len(text.encode('utf-8'))} bytes → {len(tokens)} tokens "
      f"({len(text.encode('utf-8'))/len(tokens):.1f}x)")

# Build the Vocabulary
vocab = {i: bytes([i]) for i in range(256)}  # base: single bytes
# Child tokens are defined in terms of earlier entries, so insertion
# (training) order guarantees both halves already exist.
for (p0, p1), new_id in merges.items():
    vocab[new_id] = vocab[p0] + vocab[p1]

# Show the learned merges
print("Learned tokens:")
# Iterate the merges actually learned — training may stop early if the
# text runs out of pairs, in which case range(256, 256 + num_merges)
# would hit missing vocab ids.
for new_id in sorted(merges.values()):
    token_bytes = vocab[new_id]
    try:
        token_str = token_bytes.decode("utf-8")
    except UnicodeDecodeError:
        token_str = str(token_bytes)  # show raw bytes for partial UTF-8 sequences
    print(f" {new_id}: {repr(token_str)}")

# Common merges are typically: th, the, he, in, er, on — the most frequent subwords in English.
Encode and Decode
def encode_bpe(text, merges):
    """Encode text to BPE token IDs.

    Repeatedly applies the earliest-learned merge (lowest merge id)
    present in the current sequence — the same priority rule used by
    GPT-style tokenizers — until no learned pair remains.
    """
    tokens = list(text.encode("utf-8"))
    while len(tokens) >= 2:
        stats = get_stats(tokens)
        # Find the pair with the lowest merge index (highest priority);
        # pairs never learned rank last via the +inf default.
        pair = min(stats, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break  # no more merges to apply
        tokens = merge(tokens, pair, merges[pair])
    return tokens
def decode_bpe(ids, vocab):
    """Decode token IDs back to text.

    Invalid UTF-8 sequences become U+FFFD instead of raising, since a
    sliced token stream can split a multibyte character.
    """
    tokens = b"".join(vocab[i] for i in ids)
    return tokens.decode("utf-8", errors="replace")
# Test roundtrip: encode then decode should reproduce the input exactly
test = "The quick brown fox"
encoded = encode_bpe(test, merges)
decoded = decode_bpe(encoded, vocab)
print(f"Original: '{test}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{decoded}'")
print(f"Roundtrip: {test == decoded}")

# Show token boundaries
print(f"\nTokens:")
for tok_id in encoded:
    tok_str = decode_bpe([tok_id], vocab)
    print(f" {tok_id:3d} → '{tok_str}'")

# Visualize Compression
import matplotlib.pyplot as plt

# Track compression over merges
text_bytes = list(text.encode("utf-8"))
original_len = len(text_bytes)
lengths = [original_len]
tokens_track = text_bytes[:]
# Replay merges in training order (dict preserves insertion order, Python 3.7+)
for (p0, p1), new_id in merges.items():
    tokens_track = merge(tokens_track, (p0, p1), new_id)
    lengths.append(len(tokens_track))

plt.figure(figsize=(10, 5))
plt.plot(lengths, 'o-')
plt.xlabel("Number of merges")
plt.ylabel("Sequence length (tokens)")
plt.title("BPE compression: each merge reduces sequence length")
plt.axhline(original_len, color='gray', linestyle='--', alpha=0.5, label='Original bytes')
plt.legend()
plt.show()

# Regex Pre-Tokenization (GPT-4 Style)
Real tokenizers don’t merge across word boundaries. They first split text with regex:
import re

# GPT-4 pattern (simplified) — NOTE: uses \p{...} classes and possessive
# quantifiers, so it requires the third-party `regex` module, not stdlib `re`
GPT4_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

# Simpler version that works with the stdlib re module
SIMPLE_PATTERN = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+"""

text_sample = "Hello, world! It's a beautiful day. I've got 123 tokens."
chunks = re.findall(SIMPLE_PATTERN, text_sample)
print("Pre-tokenized chunks:")
for chunk in chunks:
    print(f" '{chunk}'")
# BPE runs independently within each chunk — merges don't cross boundaries

# Full Tokenizer Class
class BasicBPE:
    """Minimal byte-level BPE tokenizer: train on text, then encode/decode."""

    def __init__(self):
        self.merges = {}  # (id, id) -> merged id, in training order
        self.vocab = {}   # id -> bytes the token stands for

    def train(self, text, vocab_size):
        """Learn `vocab_size - 256` merges from *text*; returns self for chaining.

        Raises ValueError if vocab_size is below the 256 base byte ids.
        (A plain `assert` would be stripped under `python -O`.)
        """
        if vocab_size < 256:
            raise ValueError("vocab_size must be at least 256 (one id per byte)")
        tokens = list(text.encode("utf-8"))
        num_merges = vocab_size - 256
        self.merges = {}
        for i in range(num_merges):
            stats = get_stats(tokens)
            if not stats:
                break  # nothing left to merge
            top_pair = max(stats, key=stats.get)
            new_id = 256 + i
            tokens = merge(tokens, top_pair, new_id)
            self.merges[top_pair] = new_id
        # Build vocab: base bytes, then learned tokens in training order
        self.vocab = {i: bytes([i]) for i in range(256)}
        for (p0, p1), new_id in self.merges.items():
            self.vocab[new_id] = self.vocab[p0] + self.vocab[p1]
        return self

    def encode(self, text):
        """Encode *text* to token ids, applying earliest-learned merges first."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = get_stats(tokens)
            # Lowest merge id = highest priority; unknown pairs rank last.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break
            tokens = merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode token ids to text; invalid UTF-8 becomes U+FFFD."""
        return b"".join(self.vocab[i] for i in ids).decode("utf-8", errors="replace")
# Train and test
tokenizer = BasicBPE().train(text, vocab_size=300)
test_text = "The fox quickly jumped over the lazy dog"
ids = tokenizer.encode(test_text)
print(f"'{test_text}'")
print(f"→ {len(ids)} tokens: {ids}")
print(f"→ '{tokenizer.decode(ids)}'")

# Compare with Real Tokenizers
# pip install tiktoken
try:
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")  # GPT-4 tokenizer
    test = "The quick brown fox jumps over the lazy dog"
    our_tokens = tokenizer.encode(test)
    gpt4_tokens = enc.encode(test)
    print(f"Our BPE: {len(our_tokens)} tokens")
    print(f"GPT-4: {len(gpt4_tokens)} tokens")
    print(f"\nGPT-4 token breakdown:")
    for t in gpt4_tokens:
        print(f" {t:5d} → '{enc.decode([t])}'")
except ImportError:
    # tiktoken is optional — the comparison is informative, not required
    print("Install tiktoken to compare: pip install tiktoken")

# Tokenization Gotchas
Things that break because of tokenization:
# 1. Arithmetic: "123 + 456" might tokenize as ["123", " +", " 456"]
#    or ["12", "3", " +", " 4", "56"] — the model doesn't see individual digits
# 2. Trailing whitespace: " hello" and "hello" are different tokens
#    This is why LLMs sometimes struggle with whitespace-sensitive code
# 3. Non-English: CJK characters often get split into multiple tokens
#    "你好" might be 4-6 tokens, "hello" is 1 token — efficiency gap

# Show token counts for different inputs
test_cases = [
    "hello",
    "Hello",
    "HELLO",
    "123456",
    " ",  # spaces
]
for t in test_cases:
    ids = tokenizer.encode(t)
    print(f" '{t}' → {len(ids)} tokens")

# Exercises
-
Larger training corpus: Train on the full Shakespeare text from tutorial 20. How does vocab size affect compression ratio? Plot tokens/char vs vocab size for vocab_size = 256, 300, 500, 1000.
-
Special tokens: Add `<|endoftext|>` and `<|pad|>` as special tokens with fixed IDs. Modify encode/decode to handle them. -
Regex pre-tokenization: Integrate the regex splitter into the BPE class so merges don’t cross word boundaries. Compare the learned merges with and without pre-tokenization.
-
Unicode handling: Test your tokenizer on multilingual text (English + Cyrillic + CJK). How many tokens does each language use per character? This is why multilingual tokenizers need larger vocabularies.
-
Plug into GPT: Replace the character-level tokenization in 20 - Build GPT from Scratch with your BPE tokenizer. Does the model learn faster with a better tokenizer?
Next: 22 - Build a CNN from Scratch — convolutions, pooling, and backprop through spatial operations.