Build GPT from Scratch
Goal: Build a decoder-only transformer (GPT) from scratch in PyTorch. Train it on tiny Shakespeare. Generate text. Inspired by Karpathy’s nanoGPT and his “Let’s build GPT” lecture.
Prerequisites: Transformers, Attention Mechanism, Batch Normalization, 08 - Attention Mechanism from Scratch, 17 - MLP Language Model
The Architecture
Token Embeddings + Position Embeddings
↓
┌─── Transformer Block ───┐
│ LayerNorm → MultiHead │ × N blocks
│ + residual │
│ LayerNorm → FFN │
│ + residual │
└─────────────────────────┘
↓
LayerNorm → Linear → logits
Every piece you’ve already seen. This tutorial assembles them into a working language model.
Dataset: Tiny Shakespeare
import torch
import torch.nn as nn
import torch.nn.functional as F
# Download tiny shakespeare (skipped when the file already exists)
import urllib.request, os
if not os.path.exists('input.txt'):
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    urllib.request.urlretrieve(url, 'input.txt')
# Context manager + explicit encoding: the original `open(...).read()` leaked
# the file handle and relied on the platform default encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Length: {len(text)} characters")
print(f"First 200 chars:\n{text[:200]}")
# Character-level tokenization: every distinct character in the corpus
# becomes one token id (ids assigned in sorted order).
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = {idx: ch for idx, ch in enumerate(chars)}

def encode(s):
    """String -> list of token ids."""
    return [stoi[ch] for ch in s]

def decode(ids):
    """List of token ids -> string."""
    return ''.join(itos[i] for i in ids)

print(f"Vocab size: {vocab_size}")
# Encode the whole corpus once; hold out the last 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
Hyperparameters
# Small model — trainable on CPU in minutes
batch_size = 32
block_size = 64 # context length
n_embed = 64 # embedding dimension
n_head = 4 # attention heads
n_layer = 4 # transformer blocks
dropout = 0.1
max_steps = 5000
eval_interval = 500
lr = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")Data Loading
def get_batch(split):
    """Sample a random batch of (input, target) sequences from one split.

    Targets are the inputs shifted one position right: y[t] is the token
    that follows x[t] in the corpus.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return x.to(device), y.to(device)
@torch.no_grad()
def estimate_loss(model, eval_iters=100):
model.eval()
out = {}
for split in ['train', 'val']:
losses = []
for _ in range(eval_iters):
x, y = get_batch(split)
_, loss = model(x, y)
losses.append(loss.item())
out[split] = sum(losses) / len(losses)
model.train()
return outBuilding Blocks
Single Attention Head
class Head(nn.Module):
def __init__(self, head_size):
super().__init__()
self.key = nn.Linear(n_embed, head_size, bias=False)
self.query = nn.Linear(n_embed, head_size, bias=False)
self.value = nn.Linear(n_embed, head_size, bias=False)
self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
self.dropout = nn.Dropout(dropout)
def forward(self, x):
B, T, C = x.shape
k = self.key(x) # (B, T, head_size)
q = self.query(x) # (B, T, head_size)
v = self.value(x) # (B, T, head_size)
# Attention scores
wei = q @ k.transpose(-2, -1) * (C ** -0.5) # (B, T, T)
wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # causal mask
wei = F.softmax(wei, dim=-1)
wei = self.dropout(wei)
return wei @ v # (B, T, head_size)Multi-Head Attention
class MultiHeadAttention(nn.Module):
def __init__(self, n_heads, head_size):
super().__init__()
self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
self.proj = nn.Linear(n_embed, n_embed) # projection back to residual stream
self.dropout = nn.Dropout(dropout)
def forward(self, x):
out = torch.cat([h(x) for h in self.heads], dim=-1)
return self.dropout(self.proj(out))Feed-Forward Network
class FeedForward(nn.Module):
def __init__(self):
super().__init__()
self.net = nn.Sequential(
nn.Linear(n_embed, 4 * n_embed),
nn.GELU(),
nn.Linear(4 * n_embed, n_embed),
nn.Dropout(dropout),
)
def forward(self, x):
return self.net(x)Transformer Block
class Block(nn.Module):
def __init__(self):
super().__init__()
head_size = n_embed // n_head
self.sa = MultiHeadAttention(n_head, head_size)
self.ffwd = FeedForward()
self.ln1 = nn.LayerNorm(n_embed)
self.ln2 = nn.LayerNorm(n_embed)
def forward(self, x):
x = x + self.sa(self.ln1(x)) # residual + pre-norm
x = x + self.ffwd(self.ln2(x)) # residual + pre-norm
return xNote: pre-norm (LayerNorm before attention) not post-norm. This is the modern convention — more stable training.
The GPT Model
class GPT(nn.Module):
    """Decoder-only transformer (GPT) for character-level language modeling.

    Token + learned position embeddings feed a stack of pre-norm transformer
    blocks, followed by a final LayerNorm and a linear head over the vocab.
    """
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embed)
        self.pos_emb = nn.Embedding(block_size, n_embed)  # learned positions
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)  # final norm before the head
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss).

        idx: (B, T) token ids with T <= block_size.
        targets: optional (B, T) next-token ids; when given, loss is the
            mean cross-entropy over all B*T positions, otherwise None.
        """
        B, T = idx.shape
        # Fail loudly: T > block_size would index past the position table.
        if T > block_size:
            raise ValueError(f"sequence length {T} exceeds block_size {block_size}")
        tok_emb = self.token_emb(idx)  # (B, T, n_embed)
        # Build positions on idx's device (not the module-level `device`
        # global) so the model keeps working if moved after construction.
        pos_emb = self.pos_emb(torch.arange(T, device=idx.device))  # (T, n_embed)
        x = tok_emb + pos_emb  # position embedding broadcasts over batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids graph buildup
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens, appended to idx."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # crop context to block_size
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # last position's predictions
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx
# Instantiate the model and move it to the selected device.
model = GPT().to(device)
# Report the parameter count (~100K with the tiny config above).
n_params = sum(param.numel() for param in model.parameters())
print(f"Parameters: {n_params:,}")
Training
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
print("Training...")
for step in range(max_steps):
# Evaluate periodically
if step % eval_interval == 0:
losses = estimate_loss(model)
print(f"Step {step:5d} | train loss: {losses['train']:.4f} | val loss: {losses['val']:.4f}")
x, y = get_batch('train')
logits, loss = model(x, y)
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
# Final evaluation
losses = estimate_loss(model)
print(f"\nFinal — train: {losses['train']:.4f}, val: {losses['val']:.4f}")Generate Text
context = torch.zeros((1, 1), dtype=torch.long, device=device) # start with newline
generated = model.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))With the small config above, after 5000 steps you’ll get text that looks vaguely Shakespearean — real words mixed with nonsense, but correct formatting (speaker names, line structure).
What Each Part Does
| Component | Purpose |
|---|---|
| Token embedding | Map character → vector |
| Position embedding | Inject sequence order information |
| Multi-head attention | Each token gathers info from past tokens |
| Feed-forward | Process gathered information (the “thinking”) |
| Residual connections | Let gradients flow through deep networks |
| LayerNorm | Stabilize activations at each layer |
| Causal mask | Prevent looking at future tokens |
Scale It Up
To go from toy to real:
# Tiny (this tutorial): ~100K params, block_size=64, CPU OK
# Small: n_embed=128, n_head=4, n_layer=6, block_size=256 → ~2M params
# Medium: n_embed=384, n_head=6, n_layer=6, block_size=256 → ~10M params (needs GPU)
# GPT-2: n_embed=768, n_head=12, n_layer=12, block_size=1024 → 124M params
# GPT-3: n_embed=12288, n_head=96, n_layer=96 → 175B params
The architecture is identical. Only the numbers change.
Exercises
-
Temperature and top-k: Modify
generate() to accept temperature and top-k parameters. Generate at T=0.5, T=1.0, T=1.5 and compare quality. -
Learning rate schedule: Add linear warmup (100 steps) + cosine decay. Standard for transformer training.
-
Dropout experiment: Train with dropout=0, 0.1, 0.3. Plot train and val loss. Without dropout, the gap widens (overfitting).
-
Attention visualization: After training, extract attention weights from one head. Plot the 64×64 attention matrix for a sample sequence. What patterns emerge?
-
Word-level: Replace character tokenization with BPE (21 - Build a BPE Tokenizer). How does generation quality change?
Next: 21 - Build a BPE Tokenizer — how real LLMs turn text into tokens.