RNN Text Generator
Goal: Build a character-level RNN from scratch, train it on text, generate new text. Understand hidden states, backpropagation through time, and why RNNs struggle with long sequences. Inspired by Karpathy’s “The Unreasonable Effectiveness of RNNs.”
Prerequisites: Recurrent Neural Networks, Vanishing and Exploding Gradients, Language Models, 16 - Bigram Language Model
The RNN Cell
An RNN maintains a hidden state that gets updated at each time step:

$$h_t = \tanh(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$$

The hidden state is the memory — it carries information from all previous tokens.
import numpy as np
import matplotlib.pyplot as plt
class RNN:
    """Character-level vanilla RNN: one-hot inputs, tanh hidden state, softmax outputs."""

    def __init__(self, vocab_size, hidden_size):
        """Initialize small random weights and zero biases.

        vocab_size: number of distinct tokens (input and output dimension)
        hidden_size: dimension of the hidden state
        """
        self.hidden_size = hidden_size
        scale = 0.01  # small init keeps tanh near its linear regime early in training
        # Weights
        self.Wxh = np.random.randn(hidden_size, vocab_size) * scale   # input → hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale  # hidden → hidden
        self.Why = np.random.randn(vocab_size, hidden_size) * scale   # hidden → output
        self.bh = np.zeros((hidden_size, 1))  # hidden bias
        self.by = np.zeros((vocab_size, 1))   # output bias

    @staticmethod
    def _softmax(logits):
        """Numerically stable softmax over a (vocab_size, 1) column of logits."""
        # Shifting by the max prevents np.exp from overflowing to inf for
        # large logits; the shift cancels in the ratio, so the result is the same.
        e = np.exp(logits - np.max(logits))
        return e / np.sum(e)

    def forward(self, inputs, h_prev):
        """
        Forward pass through a sequence.
        inputs: list of integer token IDs
        h_prev: (hidden_size, 1) initial hidden state
        Returns: loss, cache for backward pass, final hidden state
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = h_prev.copy()
        loss = 0
        # Token t predicts token t+1, so the last input serves only as a target.
        for t in range(len(inputs) - 1):
            # One-hot encode input
            xs[t] = np.zeros((self.Wxh.shape[1], 1))
            xs[t][inputs[t]] = 1
            # Hidden state update
            hs[t] = np.tanh(self.Wxh @ xs[t] + self.Whh @ hs[t-1] + self.bh)
            # Output (logits)
            ys[t] = self.Why @ hs[t] + self.by
            # Softmax (stable: see _softmax)
            ps[t] = self._softmax(ys[t])
            # Cross-entropy loss for next-token prediction
            loss += -np.log(ps[t][inputs[t+1], 0])
        cache = (xs, hs, ys, ps, inputs)
        # hs[len(inputs) - 2] is the last hidden state computed in the loop.
        return loss, cache, hs[len(inputs) - 2]

    def backward(self, cache, seq_len):
        """Backpropagation through time (BPTT). Returns clipped gradients
        (dWxh, dWhh, dWhy, dbh, dby) matching the parameter shapes."""
        xs, hs, ys, ps, inputs = cache
        # Gradient accumulators
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        dh_next = np.zeros_like(hs[0])  # gradient flowing in from the future
        for t in reversed(range(seq_len - 1)):
            # Output gradient: softmax - one_hot
            dy = ps[t].copy()
            dy[inputs[t+1]] -= 1
            # Hidden → output weights
            dWhy += dy @ hs[t].T
            dby += dy
            # Backprop into hidden state (from output AND from the next time step)
            dh = self.Why.T @ dy + dh_next
            # Through tanh: dtanh = (1 - tanh^2) * dh
            dh_raw = (1 - hs[t] ** 2) * dh
            # Hidden weights
            dWxh += dh_raw @ xs[t].T
            dWhh += dh_raw @ hs[t-1].T
            dbh += dh_raw
            # Gradient for next step backward
            dh_next = self.Whh.T @ dh_raw
        # Clip gradients to prevent exploding
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)
        return dWxh, dWhh, dWhy, dbh, dby

    def sample(self, h, seed_ix, n):
        """Generate n characters starting from seed_ix.

        h: (hidden_size, 1) starting hidden state (mutated locally, not in place)
        Returns a list of n+1 token IDs beginning with seed_ix.
        """
        x = np.zeros((self.Wxh.shape[1], 1))
        x[seed_ix] = 1
        result = [seed_ix]
        for _ in range(n):
            h = np.tanh(self.Wxh @ x + self.Whh @ h + self.bh)
            y = self.Why @ h + self.by
            p = self._softmax(y)  # stable softmax → valid probability vector
            ix = np.random.choice(range(len(p)), p=p.ravel())
            x = np.zeros_like(x)
            x[ix] = 1
            result.append(ix)
        return result

# Train on Text
# Load data
import urllib.request, os

# Download Tiny Shakespeare once; skip if already cached on disk.
if not os.path.exists('input.txt'):
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    urllib.request.urlretrieve(url, 'input.txt')

# Use a context manager and explicit encoding so the handle is closed
# and decoding doesn't depend on the platform default.
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

chars = sorted(set(data))  # vocabulary: every distinct character, in stable order
vocab_size = len(chars)
stoi = {c: i for i, c in enumerate(chars)}  # char → integer id
itos = {i: c for c, i in stoi.items()}      # integer id → char
print(f"Data: {len(data)} chars, {vocab_size} unique")

# Encode the whole corpus as integer ids
data_ids = [stoi[c] for c in data]

# Training loop with Adagrad
hidden_size = 128
seq_len = 25  # BPTT sequence length (truncation window)
lr = 0.1

rnn = RNN(vocab_size, hidden_size)

# Adagrad memory (per-parameter adaptive learning rate)
mWxh = np.zeros_like(rnn.Wxh)
mWhh = np.zeros_like(rnn.Whh)
mWhy = np.zeros_like(rnn.Why)
mbh = np.zeros_like(rnn.bh)
mby = np.zeros_like(rnn.by)

# Loss of a uniform predictor over seq_len steps — a sane initial estimate.
smooth_loss = -np.log(1.0 / vocab_size) * seq_len
losses = []
pointer = 0
h_prev = np.zeros((hidden_size, 1))

for step in range(30000):
    # Wrap around at the end of the corpus and reset the hidden state,
    # since the last chunk and the first are not contiguous text.
    if pointer + seq_len + 1 >= len(data_ids):
        pointer = 0
        h_prev = np.zeros((hidden_size, 1))
    # seq_len inputs plus one extra token to serve as the final target
    inputs = data_ids[pointer:pointer + seq_len + 1]

    # Sample every 5000 steps to watch generation quality improve
    if step % 5000 == 0:
        sample_ids = rnn.sample(h_prev, inputs[0], 200)
        sample_text = ''.join(itos[i] for i in sample_ids)
        print(f"\n--- Step {step}, loss: {smooth_loss:.4f} ---")
        print(sample_text)
        print("---")

    # Forward + backward; carrying h_prev across chunks = truncated BPTT
    loss, cache, h_prev = rnn.forward(inputs, h_prev)
    dWxh, dWhh, dWhy, dbh, dby = rnn.backward(cache, len(inputs))
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss  # exponential moving average
    losses.append(smooth_loss)

    # Adagrad update: scale each parameter's step by the inverse RMS of its
    # accumulated squared gradients (1e-8 avoids division by zero).
    for param, dparam, mem in zip(
        [rnn.Wxh, rnn.Whh, rnn.Why, rnn.bh, rnn.by],
        [dWxh, dWhh, dWhy, dbh, dby],
        [mWxh, mWhh, mWhy, mbh, mby],
    ):
        mem += dparam ** 2
        param -= lr * dparam / (np.sqrt(mem) + 1e-8)

    pointer += seq_len

plt.plot(losses)
plt.xlabel("Step"); plt.ylabel("Smoothed loss")
plt.title("RNN training")
plt.show()

# Watch It Learn
Early training: random garbage
After 1000 steps: some word-like patterns
After 10000 steps: recognizable English words, correct formatting
After 30000 steps: plausible Shakespeare with speaker names, line structure
The hidden state is learning to encode which character is likely next — essentially learning English spelling and basic grammar.
Visualize Hidden State
# Run forward on a sentence and track hidden state
sentence = "To be or not to be"
h = np.zeros((hidden_size, 1))
states = []
for c in sentence:
    # One-hot encode the character, then apply a single RNN step by hand
    x = np.zeros((vocab_size, 1))
    x[stoi[c]] = 1
    h = np.tanh(rnn.Wxh @ x + rnn.Whh @ h + rnn.bh)
    states.append(h.ravel().copy())  # copy: h is rebound each iteration
states = np.array(states)  # shape (len(sentence), hidden_size)

plt.figure(figsize=(14, 5))
plt.imshow(states.T[:20], aspect='auto', cmap='RdBu')  # rows = hidden units
plt.xticks(range(len(sentence)), list(sentence), fontsize=12)
plt.ylabel("Hidden unit")
plt.xlabel("Character")
plt.title("Hidden state activations over time (first 20 units)")
plt.colorbar()
plt.show()

# Why RNNs Struggle: The Gradient Problem
# Track gradient magnitude through time
def gradient_magnitudes(rnn, inputs, h_prev):
    """Measure how gradient decays through time steps."""
    _, cache, _ = rnn.forward(inputs, h_prev)
    _, hs, _, ps, _ = cache
    norms = []
    dh_next = np.zeros_like(hs[0])
    # Walk backward through time, mirroring BPTT, and record the norm of
    # the gradient handed to each earlier step.
    steps = len(inputs) - 1
    for t in range(steps - 1, -1, -1):
        dy = ps[t].copy()
        dy[inputs[t + 1]] -= 1                 # softmax-minus-one-hot output gradient
        dh = rnn.Why.T @ dy + dh_next          # gradient reaching hidden state t
        dh_raw = dh * (1 - hs[t] ** 2)         # backprop through tanh
        dh_next = rnn.Whh.T @ dh_raw           # gradient flowing on to step t-1
        norms.append(np.linalg.norm(dh_next))
    norms.reverse()                            # oldest step first
    return norms
# Plot gradient magnitude over a 50-character window of the corpus
inputs = data_ids[:50]
h = np.zeros((hidden_size, 1))
mags = gradient_magnitudes(rnn, inputs, h)

plt.figure(figsize=(10, 4))
plt.plot(mags)
plt.xlabel("Time step (from past to present)")
plt.ylabel("Gradient magnitude")
plt.title("Gradient vanishes over time — RNN forgets the past")
plt.yscale("log")  # decay is exponential, so a log scale shows it as a line
plt.show()

# The gradient decays exponentially. By 20 steps back, the signal is
# negligible. This is why RNNs can't learn long-range dependencies.
LSTM: Fixing the Gradient
LSTM adds a cell state $c_t$ that carries information forward without repeated multiplication by $W_{hh}$:
class LSTMCell:
    """Single LSTM cell — forward pass only, for understanding."""

    def __init__(self, input_size, hidden_size):
        # All gates see the concatenation [h_prev; x], hence width n.
        n = input_size + hidden_size
        self.Wf = np.random.randn(hidden_size, n) * 0.01  # forget gate
        self.Wi = np.random.randn(hidden_size, n) * 0.01  # input gate
        self.Wc = np.random.randn(hidden_size, n) * 0.01  # cell candidate
        self.Wo = np.random.randn(hidden_size, n) * 0.01  # output gate
        self.bf = np.ones((hidden_size, 1))  # bias forget gate to 1 (remember by default!)
        self.bi = np.zeros((hidden_size, 1))
        self.bc = np.zeros((hidden_size, 1))
        self.bo = np.zeros((hidden_size, 1))

    @staticmethod
    def _sigmoid(z):
        """Logistic sigmoid, shared by the forget, input, and output gates."""
        return 1 / (1 + np.exp(-z))

    def forward(self, x, h_prev, c_prev):
        """One LSTM step.

        x: (input_size, 1) input column
        h_prev: (hidden_size, 1) previous hidden state
        c_prev: (hidden_size, 1) previous cell state
        Returns: (h, c) — new hidden state and new cell state
        """
        combined = np.vstack([h_prev, x])
        f = self._sigmoid(self.Wf @ combined + self.bf)  # forget gate
        i = self._sigmoid(self.Wi @ combined + self.bi)  # input gate
        c_hat = np.tanh(self.Wc @ combined + self.bc)    # cell candidate
        o = self._sigmoid(self.Wo @ combined + self.bo)  # output gate
        c = f * c_prev + i * c_hat  # cell state: forget old + add new
        h = o * np.tanh(c)          # hidden state
        return h, c

# The key: c = f * c_prev + i * c_hat. The cell state is an additive highway —
# gradients flow through without vanishing.
Exercises
-
Temperature sampling: Modify
sample() to accept a temperature parameter. Divide logits by T before softmax. Generate at T=0.3, 0.7, 1.0, 1.5. -
Multi-layer RNN: Stack 2 RNN layers — output of layer 1 is input to layer 2. Does it generate better text?
-
LSTM from scratch: Implement the full LSTM (forward + backward) and train on the same text. Compare loss curves — LSTM should converge lower.
-
Gradient flow comparison: Plot gradient magnitude through time for vanilla RNN vs LSTM (after both are trained). LSTM maintains gradients much further back.
-
Compare with GPT: Generate 200 chars from both your RNN and the GPT from 20 - Build GPT from Scratch (same training data). Which produces better text? How many parameters does each use?
Next: 24 - Neural Network Training Recipes — the systematic approach to debugging and improving any model.