Build GPT from Scratch

Goal: Build a decoder-only transformer (GPT) from scratch in PyTorch. Train it on tiny Shakespeare. Generate text. Inspired by Karpathy’s nanoGPT and his “Let’s build GPT” lecture.

Prerequisites: Transformers, Attention Mechanism, Batch Normalization, 08 - Attention Mechanism from Scratch, 17 - MLP Language Model


The Architecture

Token Embeddings + Position Embeddings
            ↓
    ┌─── Transformer Block ───┐
    │  LayerNorm → MultiHead  │  × N blocks
    │  + residual             │
    │  LayerNorm → FFN        │
    │  + residual             │
    └─────────────────────────┘
            ↓
      LayerNorm → Linear → logits

Every piece you’ve already seen. This tutorial assembles them into a working language model.


Dataset: Tiny Shakespeare

import torch
import torch.nn as nn
import torch.nn.functional as F

# Download tiny shakespeare (~1 MB) once; skip if already on disk
import urllib.request, os
if not os.path.exists('input.txt'):
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    urllib.request.urlretrieve(url, 'input.txt')

# Context manager closes the file; explicit encoding avoids platform-dependent
# decoding (the default codec varies by OS/locale)
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Length: {len(text)} characters")
print(f"First 200 chars:\n{text[:200]}")

# Character-level tokenization: each distinct character is one token
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {c: i for i, c in enumerate(chars)}  # char -> int
itos = {i: c for c, i in stoi.items()}      # int -> char
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

print(f"Vocab size: {vocab_size}")

# 90/10 train/val split of the encoded corpus
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

Hyperparameters

# Small model — trainable on CPU in minutes
batch_size = 32       # sequences per training batch
block_size = 64       # context length
n_embed = 64          # embedding dimension
n_head = 4            # attention heads
n_layer = 4           # transformer blocks
dropout = 0.1         # dropout prob used in attention, projections, and FFN
max_steps = 5000      # total optimizer steps
eval_interval = 500   # evaluate train/val loss every this many steps
lr = 3e-4             # AdamW learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

Data Loading

def get_batch(split):
    """Sample a random batch of (input, target) sequences from `split`.

    Targets are the inputs shifted one position right, so position t predicts
    token t+1. Returns two (batch_size, block_size) long tensors on `device`.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)
 
@torch.no_grad()
def estimate_loss(model, eval_iters=100):
    """Return mean loss over `eval_iters` random batches for each split.

    Switches the model to eval mode (disables dropout) for measurement and
    restores train mode before returning. no_grad avoids building graphs.
    """
    model.eval()
    results = {}
    for split in ('train', 'val'):
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            total += loss.item()
        results[split] = total / eval_iters
    model.train()
    return results

Building Blocks

Single Attention Head

class Head(nn.Module):
    """One head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular mask: a buffer so it follows .to(device) but is not trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape  # C == n_embed
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scaled dot-product scores. The scale is 1/sqrt(d_k) where d_k is the
        # KEY dimension (head_size), per "Attention Is All You Need" — scaling
        # by C ** -0.5 (= n_embed) would over-shrink the logits.
        wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # causal mask
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        return wei @ v  # (B, T, head_size)

Multi-Head Attention

class MultiHeadAttention(nn.Module):
    """Several attention heads in parallel, concatenated then projected."""

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embed, n_embed)  # mix head outputs back into the residual stream
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        combined = torch.cat(per_head, dim=-1)  # (B, T, n_heads * head_size)
        return self.dropout(self.proj(combined))

Feed-Forward Network

class FeedForward(nn.Module):
    """Position-wise MLP: expand 4x, GELU nonlinearity, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embed  # conventional 4x expansion
        self.net = nn.Sequential(
            nn.Linear(n_embed, hidden),
            nn.GELU(),
            nn.Linear(hidden, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

Transformer Block

class Block(nn.Module):
    """Pre-norm transformer block: x + SA(LN(x)), then + FFN(LN(x))."""

    def __init__(self):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embed // n_head)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        # Residual connection around each sub-layer; LayerNorm applied first (pre-norm)
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

Note: pre-norm (LayerNorm before attention) not post-norm. This is the modern convention — more stable training.


The GPT Model

class GPT(nn.Module):
    """Decoder-only transformer language model (character-level)."""

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embed)
        self.pos_emb = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)   # final norm before the LM head
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss). loss is None when `targets` is None.

        idx: (B, T) long tensor of token ids, T <= block_size.
        """
        B, T = idx.shape
        tok_emb = self.token_emb(idx)                              # (B, T, n_embed)
        # Use idx.device (not the global `device`) so the model keeps working
        # if it is moved to a different device after construction
        pos_emb = self.pos_emb(torch.arange(T, device=idx.device)) # (T, n_embed)
        x = tok_emb + pos_emb                                      # (B, T, n_embed) via broadcasting
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)                                   # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # cross_entropy expects (N, C) logits vs (N,) targets
            logits_flat = logits.view(-1, vocab_size)
            targets_flat = targets.view(-1)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; saves memory and time
    def generate(self, idx, max_new_tokens):
        """Autoregressively extend idx (B, T) by max_new_tokens sampled tokens."""
        for _ in range(max_new_tokens):
            # Crop context to the last block_size tokens (position table limit)
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # last position's next-token predictions
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # sample, don't argmax
            idx = torch.cat([idx, idx_next], dim=1)
        return idx
 
# Instantiate and report model size
model = GPT().to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {total_params:,}")

Training

# AdamW: the standard optimizer for transformer training
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

print("Training...")
for step in range(max_steps):
    # Periodic evaluation on both splits (also at step 0, before any update)
    if step % eval_interval == 0:
        losses = estimate_loss(model)
        print(f"Step {step:5d} | train loss: {losses['train']:.4f} | val loss: {losses['val']:.4f}")

    # One optimization step: forward, backward, update
    xb, yb = get_batch('train')
    optimizer.zero_grad(set_to_none=True)  # set_to_none frees grad memory
    _, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# Final evaluation
losses = estimate_loss(model)
print(f"\nFinal — train: {losses['train']:.4f}, val: {losses['val']:.4f}")

Generate Text

# Seed with token id 0 — the first character in the sorted vocab
# (the newline character for this corpus) — then sample 500 tokens
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sample = model.generate(context, max_new_tokens=500)
print(decode(sample[0].tolist()))

With the small config above, after 5000 steps you’ll get text that looks vaguely Shakespearean — real words mixed with nonsense, but correct formatting (speaker names, line structure).


What Each Part Does

| Component | Purpose |
|---|---|
| Token embedding | Map character → vector |
| Position embedding | Inject sequence order information |
| Multi-head attention | Each token gathers info from past tokens |
| Feed-forward | Process gathered information (the “thinking”) |
| Residual connections | Let gradients flow through deep networks |
| LayerNorm | Stabilize activations at each layer |
| Causal mask | Prevent looking at future tokens |

Scale It Up

To go from toy to real:

# Tiny (this tutorial):     ~200K params, block_size=64, CPU OK
# Small:   n_embed=128, n_head=4, n_layer=6,  block_size=256  → ~2M params
# Medium:  n_embed=384, n_head=6, n_layer=6,  block_size=256  → ~10M params (needs GPU)
# GPT-2:   n_embed=768, n_head=12, n_layer=12, block_size=1024 → 124M params
# GPT-3:   n_embed=12288, n_head=96, n_layer=96 → 175B params

The architecture is identical. Only the numbers change.


Exercises

  1. Temperature and top-k: Modify generate() to accept temperature and top-k parameters. Generate at T=0.5, T=1.0, T=1.5 and compare quality.

  2. Learning rate schedule: Add linear warmup (100 steps) + cosine decay. Standard for transformer training.

  3. Dropout experiment: Train with dropout=0, 0.1, 0.3. Plot train and val loss. Without dropout, the gap widens (overfitting).

  4. Attention visualization: After training, extract attention weights from one head. Plot the 64×64 attention matrix for a sample sequence. What patterns emerge?

  5. Word-level: Replace character tokenization with BPE (21 - Build a BPE Tokenizer). How does generation quality change?


Next: 21 - Build a BPE Tokenizer — how real LLMs turn text into tokens.