RNN Text Generator

Goal: Build a character-level RNN from scratch, train it on text, generate new text. Understand hidden states, backpropagation through time, and why RNNs struggle with long sequences. Inspired by Karpathy’s “The Unreasonable Effectiveness of Recurrent Neural Networks.”

Prerequisites: Recurrent Neural Networks, Vanishing and Exploding Gradients, Language Models, 16 - Bigram Language Model


The RNN Cell

An RNN maintains a hidden state that gets updated at each time step:

    h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h),        y_t = W_hy h_t + b_y

The hidden state h_t is the memory — it carries information from all previous tokens.

import numpy as np
import matplotlib.pyplot as plt
 
class RNN:
    """Minimal character-level vanilla RNN with tanh hidden units.

    Trained by next-token prediction: at each step t the network reads
    inputs[t] and is scored on how well it predicts inputs[t+1].
    """

    def __init__(self, vocab_size, hidden_size):
        """Initialize small random weights and zero biases."""
        self.hidden_size = hidden_size
        scale = 0.01  # small init keeps tanh near its linear regime early on

        # Weights
        self.Wxh = np.random.randn(hidden_size, vocab_size) * scale   # input → hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale  # hidden → hidden
        self.Why = np.random.randn(vocab_size, hidden_size) * scale   # hidden → output
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    @staticmethod
    def _softmax(logits):
        """Numerically stable softmax over a (vocab_size, 1) column of logits."""
        # Subtracting the max prevents np.exp from overflowing to inf for
        # large logits (exp(y)/sum(exp(y)) NaNs out once any logit > ~709).
        e = np.exp(logits - np.max(logits))
        return e / np.sum(e)

    def forward(self, inputs, h_prev):
        """
        Forward pass through a sequence.
        inputs: list of integer token IDs
        h_prev: (hidden_size, 1) initial hidden state
        Returns: (loss, cache for backward pass, last hidden state)
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = h_prev.copy()
        loss = 0

        for t in range(len(inputs) - 1):
            # One-hot encode input
            xs[t] = np.zeros((self.Wxh.shape[1], 1))
            xs[t][inputs[t]] = 1

            # Hidden state update
            hs[t] = np.tanh(self.Wxh @ xs[t] + self.Whh @ hs[t-1] + self.bh)

            # Output (logits)
            ys[t] = self.Why @ hs[t] + self.by

            # Softmax (numerically stable)
            ps[t] = self._softmax(ys[t])

            # Cross-entropy loss for next-token prediction
            loss += -np.log(ps[t][inputs[t+1], 0])

        cache = (xs, hs, ys, ps, inputs)
        # hs[len(inputs) - 2] is the last state computed; for a length-1
        # sequence this is hs[-1], i.e. h_prev unchanged.
        return loss, cache, hs[len(inputs) - 2]

    def backward(self, cache, seq_len):
        """Backpropagation through time (BPTT).

        Returns (dWxh, dWhh, dWhy, dbh, dby), each clipped elementwise
        to [-5, 5] to prevent exploding gradients.
        """
        xs, hs, ys, ps, inputs = cache

        # Gradient accumulators
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        # Don't index hs[0] here: it doesn't exist when seq_len < 2.
        dh_next = np.zeros((self.hidden_size, 1))

        for t in reversed(range(seq_len - 1)):
            # Output gradient: softmax - one_hot
            dy = ps[t].copy()
            dy[inputs[t+1]] -= 1

            # Hidden → output weights
            dWhy += dy @ hs[t].T
            dby += dy

            # Backprop into hidden state (from the output and from the future)
            dh = self.Why.T @ dy + dh_next

            # Through tanh: dtanh = (1 - tanh^2) * dh
            dh_raw = (1 - hs[t] ** 2) * dh

            # Hidden weights
            dWxh += dh_raw @ xs[t].T
            dWhh += dh_raw @ hs[t-1].T
            dbh += dh_raw

            # Gradient for next step backward
            dh_next = self.Whh.T @ dh_raw

        # Clip gradients to prevent exploding
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)

        return dWxh, dWhh, dWhy, dbh, dby

    def sample(self, h, seed_ix, n):
        """Generate n characters starting from seed_ix.

        Returns a list of n + 1 token IDs (the seed followed by n samples).
        """
        x = np.zeros((self.Wxh.shape[1], 1))
        x[seed_ix] = 1
        result = [seed_ix]

        for _ in range(n):
            h = np.tanh(self.Wxh @ x + self.Whh @ h + self.bh)
            y = self.Why @ h + self.by
            p = self._softmax(y)  # stable softmax, same fix as forward()
            ix = np.random.choice(range(len(p)), p=p.ravel())

            x = np.zeros_like(x)
            x[ix] = 1
            result.append(ix)

        return result

Train on Text

# Load data
import urllib.request, os
if not os.path.exists('input.txt'):
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    urllib.request.urlretrieve(url, 'input.txt')

# Close the file deterministically and pin the encoding: the original
# open(...).read() leaked the handle and used the platform default encoding,
# which can mis-decode the corpus on non-UTF-8 systems.
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()
chars = sorted(set(data))          # fixed, sorted character vocabulary
vocab_size = len(chars)
stoi = {c: i for i, c in enumerate(chars)}  # char → id
itos = {i: c for c, i in stoi.items()}      # id → char

print(f"Data: {len(data)} chars, {vocab_size} unique")

# Encode the whole corpus as a list of integer ids
data_ids = [stoi[c] for c in data]

Training loop with Adagrad

hidden_size = 128
seq_len = 25  # BPTT sequence length
lr = 0.1

rnn = RNN(vocab_size, hidden_size)

# Adagrad caches: one running sum of squared gradients per parameter,
# giving every weight its own adaptive learning rate.
params = [rnn.Wxh, rnn.Whh, rnn.Why, rnn.bh, rnn.by]
mWxh, mWhh, mWhy, mbh, mby = (np.zeros_like(p) for p in params)
mems = [mWxh, mWhh, mWhy, mbh, mby]

# Start the smoothed loss at the loss of a uniform predictor
smooth_loss = -np.log(1.0 / vocab_size) * seq_len
losses = []
pointer = 0
h_prev = np.zeros((hidden_size, 1))

for step in range(30000):
    # Wrap around at the end of the corpus and reset the hidden state
    if pointer + seq_len + 1 >= len(data_ids):
        pointer = 0
        h_prev = np.zeros((hidden_size, 1))

    # seq_len inputs plus one extra token to serve as the final target
    inputs = data_ids[pointer:pointer + seq_len + 1]

    # Periodically show what the model generates
    if step % 5000 == 0:
        sample_ids = rnn.sample(h_prev, inputs[0], 200)
        sample_text = ''.join(itos[i] for i in sample_ids)
        print(f"\n--- Step {step}, loss: {smooth_loss:.4f} ---")
        print(sample_text)
        print("---")

    # Forward + backward; carry the hidden state across chunks
    loss, cache, h_prev = rnn.forward(inputs, h_prev)
    grads = rnn.backward(cache, len(inputs))

    # Exponential moving average of the loss for a readable curve
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    losses.append(smooth_loss)

    # Adagrad: accumulate squared gradients, scale each step by 1/sqrt(sum)
    for param, dparam, mem in zip(params, grads, mems):
        mem += dparam ** 2
        param -= lr * dparam / (np.sqrt(mem) + 1e-8)

    pointer += seq_len

plt.plot(losses)
plt.xlabel("Step"); plt.ylabel("Smoothed loss")
plt.title("RNN training")
plt.show()

Watch It Learn

Early training: random garbage

After 1000 steps: some word-like patterns

After 10000 steps: recognizable English words, correct formatting

After 30000 steps: plausible Shakespeare with speaker names, line structure

The hidden state is learning to encode which character is likely next — essentially learning English spelling and basic grammar.


Visualize Hidden State

# Feed a sentence one character at a time and record each hidden state
sentence = "To be or not to be"
h = np.zeros((hidden_size, 1))
states = []
for c in sentence:
    # one-hot encode the character, then apply the RNN recurrence by hand
    onehot = np.zeros((vocab_size, 1))
    onehot[stoi[c]] = 1
    h = np.tanh(rnn.Wxh @ onehot + rnn.Whh @ h + rnn.bh)
    states.append(h.ravel().copy())  # copy: h is reused next iteration

states = np.array(states)  # (len(sentence), hidden_size)

plt.figure(figsize=(14, 5))
plt.imshow(states.T[:20], aspect='auto', cmap='RdBu')
plt.xticks(range(len(sentence)), list(sentence), fontsize=12)
plt.ylabel("Hidden unit")
plt.xlabel("Character")
plt.title("Hidden state activations over time (first 20 units)")
plt.colorbar()
plt.show()

Why RNNs Struggle: The Gradient Problem

# Track gradient magnitude through time
def gradient_magnitudes(rnn, inputs, h_prev):
    """Measure how gradient decays through time steps."""
    loss, cache, _ = rnn.forward(inputs, h_prev)
    xs, hs, ys, ps, inp = cache
 
    magnitudes = []
    dh_next = np.zeros_like(hs[0])
 
    for t in reversed(range(len(inputs) - 1)):
        dy = ps[t].copy()
        dy[inputs[t+1]] -= 1
        dh = rnn.Why.T @ dy + dh_next
        dh_raw = (1 - hs[t] ** 2) * dh
        dh_next = rnn.Whh.T @ dh_raw
        magnitudes.append(np.linalg.norm(dh_next))
 
    return list(reversed(magnitudes))
 
# Probe the trained model on the first 50 characters of the corpus
probe = data_ids[:50]
mags = gradient_magnitudes(rnn, probe, np.zeros((hidden_size, 1)))

plt.figure(figsize=(10, 4))
plt.plot(mags)
plt.yscale("log")  # decay is exponential, so plot on a log axis
plt.xlabel("Time step (from past to present)")
plt.ylabel("Gradient magnitude")
plt.title("Gradient vanishes over time — RNN forgets the past")
plt.show()

The gradient decays exponentially. By 20 steps back, the signal is negligible. This is why RNNs can’t learn long-range dependencies.


LSTM: Fixing the Gradient

LSTM adds a cell state that carries information forward without repeated multiplication by the recurrent weight matrix W_hh:

class LSTMCell:
    """Single LSTM cell — forward pass only, for understanding."""

    def __init__(self, input_size, hidden_size):
        # Every gate reads the concatenation [h_prev; x], hence the
        # shared width input_size + hidden_size.
        n = input_size + hidden_size
        self.Wf = np.random.randn(hidden_size, n) * 0.01  # forget gate
        self.Wi = np.random.randn(hidden_size, n) * 0.01  # input gate
        self.Wc = np.random.randn(hidden_size, n) * 0.01  # cell candidate
        self.Wo = np.random.randn(hidden_size, n) * 0.01  # output gate
        self.bf = np.ones((hidden_size, 1))   # bias forget gate to 1 (remember by default!)
        self.bi = np.zeros((hidden_size, 1))
        self.bc = np.zeros((hidden_size, 1))
        self.bo = np.zeros((hidden_size, 1))

    def forward(self, x, h_prev, c_prev):
        """One time step: return (h, c) from input x and previous (h_prev, c_prev)."""
        def sigmoid(z):
            return 1 / (1 + np.exp(-z))

        combined = np.vstack([h_prev, x])

        f = sigmoid(self.Wf @ combined + self.bf)      # forget gate: how much of c_prev to keep
        i = sigmoid(self.Wi @ combined + self.bi)      # input gate: how much new content to admit
        c_hat = np.tanh(self.Wc @ combined + self.bc)  # candidate cell content
        o = sigmoid(self.Wo @ combined + self.bo)      # output gate: how much of c to expose

        # Additive cell update — the gradient highway: forget old + add new
        c = f * c_prev + i * c_hat
        h = o * np.tanh(c)

        return h, c

The key: c = f * c_prev + i * c_hat. The cell state is an additive highway — gradients flow through without vanishing.


Exercises

  1. Temperature sampling: Modify sample() to accept a temperature parameter. Divide logits by T before softmax. Generate at T=0.3, 0.7, 1.0, 1.5.

  2. Multi-layer RNN: Stack 2 RNN layers — output of layer 1 is input to layer 2. Does it generate better text?

  3. LSTM from scratch: Implement the full LSTM (forward + backward) and train on the same text. Compare loss curves — LSTM should converge lower.

  4. Gradient flow comparison: Plot gradient magnitude through time for vanilla RNN vs LSTM (after both are trained). LSTM maintains gradients much further back.

  5. Compare with GPT: Generate 200 chars from both your RNN and the GPT from 20 - Build GPT from Scratch (same training data). Which produces better text? How many parameters does each use?


Next: 24 - Neural Network Training Recipes — the systematic approach to debugging and improving any model.