Build a BPE Tokenizer
Goal: Build a Byte Pair Encoding tokenizer from scratch. Understand how LLMs turn text into tokens — the most underappreciated piece of the pipeline. Inspired by Karpathy’s minbpe.
Prerequisites: Text Preprocessing, Language Models, Embeddings
Why Tokenization Matters
Character (byte)-level: "hello" → ['h', 'e', 'l', 'l', 'o'] — 5 tokens, and the vocabulary is tiny (just 256 ids if we use raw bytes). But sequences are long, so the model needs a huge context.
Word-level: "hello" → ['hello'] — 1 token, but vocabulary is 100K+ words. Can’t handle typos, new words, code.
BPE: "hello" → ['hel', 'lo'] — middle ground. Start with bytes, iteratively merge the most common pairs. Vocabulary of 50K-100K covers everything.
The Algorithm
- Start with raw bytes (UTF-8)
- Count all adjacent pairs
- Merge the most frequent pair into a new token
- Repeat until desired vocab size
def get_stats(ids):
    """Count frequency of each adjacent pair.

    Returns a dict mapping (id_a, id_b) -> number of times that pair
    occurs at adjacent positions in *ids*. Empty/single-element input
    yields an empty dict.
    """
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts
def merge(ids, pair, new_id):
    """Replace all occurrences of pair with new_id.

    Scans left to right and consumes pairs greedily, so overlapping
    occurrences merge only once: [1, 1, 1] with pair (1, 1) -> [new_id, 1].
    Returns a new list; *ids* is not mutated.
    """
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(new_id)
            i += 2  # skip both members of the merged pair
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

# Training the Tokenizer
# Training text
text = """The quick brown fox jumps over the lazy dog. The dog barked at the fox.
The fox ran away quickly. The quick brown fox returned later that evening.
Natural language processing is a field of artificial intelligence.
Language models learn to predict the next token in a sequence.
Tokenization is the first step in any NLP pipeline."""

# Convert to bytes (UTF-8)
tokens = list(text.encode("utf-8"))
print(f"Original: {len(tokens)} bytes")
print(f"First 20 bytes: {tokens[:20]}")

# BPE training
vocab_size_target = 276  # 256 base bytes + 20 merges
num_merges = vocab_size_target - 256
merges = {}  # (pair) → new_id
for i in range(num_merges):
    stats = get_stats(tokens)
    if not stats:
        break  # sequence collapsed to a single token — nothing left to merge
    top_pair = max(stats, key=stats.get)
    new_id = 256 + i
    tokens = merge(tokens, top_pair, new_id)
    merges[top_pair] = new_id
    print(f"Merge {i:2d}: {top_pair} → {new_id} | "
          f"count={stats[top_pair]:3d} | tokens remaining: {len(tokens)}")

print(f"\nCompression: {len(text.encode('utf-8'))} bytes → {len(tokens)} tokens "
      f"({len(text.encode('utf-8'))/len(tokens):.1f}x)")

# Build the Vocabulary
vocab = {i: bytes([i]) for i in range(256)}  # base: single bytes
# Child tokens are defined in terms of earlier entries, so insertion
# (training) order guarantees both halves already exist.
for (p0, p1), new_id in merges.items():
    vocab[new_id] = vocab[p0] + vocab[p1]

# Show the learned merges
print("Learned tokens:")
# Iterate the merges actually learned — training may stop early if the
# text runs out of pairs, in which case range(256, 256 + num_merges)
# would hit missing vocab ids.
for new_id in sorted(merges.values()):
    token_bytes = vocab[new_id]
    try:
        token_str = token_bytes.decode("utf-8")
    except UnicodeDecodeError:
        token_str = str(token_bytes)  # show raw bytes for partial UTF-8 sequences
    print(f" {new_id}: {repr(token_str)}")

# Common merges are typically: th, the, he, in, er, on — the most frequent subwords in English.
Encode and Decode
def encode_bpe(text, merges):
    """Encode text to BPE token IDs.

    Repeatedly applies the earliest-learned merge (lowest merge id)
    present in the current sequence — the same priority rule used by
    GPT-style tokenizers — until no learned pair remains.
    """
    tokens = list(text.encode("utf-8"))
    while len(tokens) >= 2:
        stats = get_stats(tokens)
        # Find the pair with the lowest merge index (highest priority);
        # pairs never learned rank last via the +inf default.
        pair = min(stats, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break  # no more merges to apply
        tokens = merge(tokens, pair, merges[pair])
    return tokens
def decode_bpe(ids, vocab):
    """Decode token IDs back to text.

    Invalid UTF-8 sequences become U+FFFD instead of raising, since a
    sliced token stream can split a multibyte character.
    """
    tokens = b"".join(vocab[i] for i in ids)
    return tokens.decode("utf-8", errors="replace")
# Test roundtrip: encode then decode should reproduce the input exactly
test = "The quick brown fox"
encoded = encode_bpe(test, merges)
decoded = decode_bpe(encoded, vocab)
print(f"Original: '{test}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{decoded}'")
print(f"Roundtrip: {test == decoded}")

# Show token boundaries
print(f"\nTokens:")
for tok_id in encoded:
    tok_str = decode_bpe([tok_id], vocab)
    print(f" {tok_id:3d} → '{tok_str}'")

# Visualize Compression
import matplotlib.pyplot as plt

# Track compression over merges
text_bytes = list(text.encode("utf-8"))
original_len = len(text_bytes)
lengths = [original_len]
tokens_track = text_bytes[:]
# Replay merges in training order (dict preserves insertion order, Python 3.7+)
for (p0, p1), new_id in merges.items():
    tokens_track = merge(tokens_track, (p0, p1), new_id)
    lengths.append(len(tokens_track))

plt.figure(figsize=(10, 5))
plt.plot(lengths, 'o-')
plt.xlabel("Number of merges")
plt.ylabel("Sequence length (tokens)")
plt.title("BPE compression: each merge reduces sequence length")
plt.axhline(original_len, color='gray', linestyle='--', alpha=0.5, label='Original bytes')
plt.legend()
plt.show()

# Regex Pre-Tokenization (GPT-4 Style)
Real tokenizers don’t merge across word boundaries. They first split text with regex:
import re

# GPT-4 pattern (simplified) — NOTE: uses \p{...} classes and possessive
# quantifiers, so it requires the third-party `regex` module, not stdlib `re`
GPT4_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

# Simpler version that works with the stdlib re module
SIMPLE_PATTERN = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w]+|\s+"""

text_sample = "Hello, world! It's a beautiful day. I've got 123 tokens."
chunks = re.findall(SIMPLE_PATTERN, text_sample)
print("Pre-tokenized chunks:")
for chunk in chunks:
    print(f" '{chunk}'")
# BPE runs independently within each chunk — merges don't cross boundaries

# Full Tokenizer Class
class BasicBPE:
    """Minimal byte-level BPE tokenizer: train on text, then encode/decode."""

    def __init__(self):
        self.merges = {}  # (id, id) -> merged id, in training order
        self.vocab = {}   # id -> bytes the token stands for

    def train(self, text, vocab_size):
        """Learn `vocab_size - 256` merges from *text*; returns self for chaining.

        Raises ValueError if vocab_size is below the 256 base byte ids.
        (A plain `assert` would be stripped under `python -O`.)
        """
        if vocab_size < 256:
            raise ValueError("vocab_size must be at least 256 (one id per byte)")
        tokens = list(text.encode("utf-8"))
        num_merges = vocab_size - 256
        self.merges = {}
        for i in range(num_merges):
            stats = get_stats(tokens)
            if not stats:
                break  # nothing left to merge
            top_pair = max(stats, key=stats.get)
            new_id = 256 + i
            tokens = merge(tokens, top_pair, new_id)
            self.merges[top_pair] = new_id
        # Build vocab: base bytes, then learned tokens in training order
        self.vocab = {i: bytes([i]) for i in range(256)}
        for (p0, p1), new_id in self.merges.items():
            self.vocab[new_id] = self.vocab[p0] + self.vocab[p1]
        return self

    def encode(self, text):
        """Encode *text* to token ids, applying earliest-learned merges first."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = get_stats(tokens)
            # Lowest merge id = highest priority; unknown pairs rank last.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break
            tokens = merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode token ids to text; invalid UTF-8 becomes U+FFFD."""
        return b"".join(self.vocab[i] for i in ids).decode("utf-8", errors="replace")
# Train and test
tokenizer = BasicBPE().train(text, vocab_size=300)
test_text = "The fox quickly jumped over the lazy dog"
ids = tokenizer.encode(test_text)
print(f"'{test_text}'")
print(f"→ {len(ids)} tokens: {ids}")
print(f"→ '{tokenizer.decode(ids)}'")

# Compare with Real Tokenizers
# pip install tiktoken
try:
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")  # GPT-4 tokenizer
    test = "The quick brown fox jumps over the lazy dog"
    our_tokens = tokenizer.encode(test)
    gpt4_tokens = enc.encode(test)
    print(f"Our BPE: {len(our_tokens)} tokens")
    print(f"GPT-4: {len(gpt4_tokens)} tokens")
    print(f"\nGPT-4 token breakdown:")
    for t in gpt4_tokens:
        print(f" {t:5d} → '{enc.decode([t])}'")
except ImportError:
    # tiktoken is optional — the comparison is informative, not required
    print("Install tiktoken to compare: pip install tiktoken")

# Tokenization Gotchas
Things that break because of tokenization:
# 1. Arithmetic: "123 + 456" might tokenize as ["123", " +", " 456"]
#    or ["12", "3", " +", " 4", "56"] — the model doesn't see individual digits
# 2. Trailing whitespace: " hello" and "hello" are different tokens
#    This is why LLMs sometimes struggle with whitespace-sensitive code
# 3. Non-English: CJK characters often get split into multiple tokens
#    "你好" might be 4-6 tokens, "hello" is 1 token — efficiency gap

# Show token counts for different inputs
test_cases = [
    "hello",
    "Hello",
    "HELLO",
    "123456",
    " ",  # spaces
]
for t in test_cases:
    ids = tokenizer.encode(t)
    print(f" '{t}' → {len(ids)} tokens")

# Exercises
-
Larger training corpus: Train on the full Shakespeare text from tutorial 20. How does vocab size affect compression ratio? Plot tokens/char vs vocab size for vocab_size = 256, 300, 500, 1000.
-
Special tokens: Add `<|endoftext|>` and `<|pad|>` as special tokens with fixed IDs. Modify encode/decode to handle them. -
Regex pre-tokenization: Integrate the regex splitter into the BPE class so merges don’t cross word boundaries. Compare the learned merges with and without pre-tokenization.
-
Unicode handling: Test your tokenizer on multilingual text (English + Cyrillic + CJK). How many tokens does each language use per character? This is why multilingual tokenizers need larger vocabularies.
-
Plug into GPT: Replace the character-level tokenization in 20 - Build GPT from Scratch with your BPE tokenizer. Does the model learn faster with a better tokenizer?
Next: 22 - Build a CNN from Scratch — convolutions, pooling, and backprop through spatial operations.