Tutorial - Object Tracking Pipeline
Goal
After this tutorial, you can track objects across video frames using detection + association. You will build a tracker from scratch, then compare it with production-grade trackers.
Prerequisites: Object Detection, basic Python and NumPy.
Time: 60-90 minutes.
Step 1: Set up and load video
We need a video with multiple moving objects. You can use any video — a dashcam clip, drone footage, or your webcam.
import cv2
import numpy as np
from ultralytics import YOLO

# Download a sample traffic video (or use your own)
# https://motchallenge.net/data/MOT17/ has standard benchmarks
VIDEO_PATH = "traffic.mp4"  # replace with your video

cap = cv2.VideoCapture(VIDEO_PATH)
# Fail fast with a clear error: for a missing/unreadable file OpenCV does
# not raise — cap.get() just returns 0 and the print below shows "0x0".
if not cap.isOpened():
    raise FileNotFoundError(f"Could not open video: {VIDEO_PATH}")
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Video: {width}x{height}, {fps:.1f} FPS, {total_frames} frames")
cap.release()

Step 2: Run YOLO detection per frame
First, detect objects in every frame independently. No tracking yet — just raw detections.
# Load the detector once, up front; it is reused for every frame below.
model = YOLO("yolo11n.pt") # nano model for speed
def detect_all_frames(video_path, model, conf=0.3, max_frames=None):
    """Run the detector independently on each frame of a video.

    Returns a list with one entry per processed frame; each entry is a
    list of raw detections [x1, y1, x2, y2, conf, class_id]. No IDs yet —
    detections in consecutive frames are not associated with each other.
    """
    capture = cv2.VideoCapture(video_path)
    per_frame = []
    index = 0
    while True:
        ok, image = capture.read()
        # Stop at end of video, or once the optional frame cap is reached.
        if not ok or (max_frames and index >= max_frames):
            break
        result = model(image, conf=conf, verbose=False)[0]
        boxes = []
        for det in result.boxes:
            coords = det.xyxy[0].tolist()
            boxes.append(coords + [float(det.conf[0]), int(det.cls[0])])
        per_frame.append(boxes)
        index += 1
    capture.release()
    return per_frame
# Cap at 300 frames so this step stays fast on CPU-only machines.
detections = detect_all_frames(VIDEO_PATH, model, max_frames=300)
print(f"Processed {len(detections)} frames")
print(f"Frame 0: {len(detections[0])} detections")

What just happened: We ran YOLO independently on each frame. Each detection is a bounding box with confidence and class. But there are no IDs — the same car might be detection #3 in frame 1 and detection #7 in frame 2. We have no way to know they’re the same object.
Step 3: Implement IoU tracker from scratch
The simplest possible tracker: match detections between frames by how much their bounding boxes overlap.
from scipy.optimize import linear_sum_assignment
def iou(box_a, box_b):
    """Intersection-over-union of two axis-aligned [x1, y1, x2, y2] boxes."""
    # Intersection rectangle; width/height clamp to 0 when boxes are disjoint.
    left = max(box_a[0], box_b[0])
    top = max(box_a[1], box_b[1])
    right = min(box_a[2], box_b[2])
    bottom = min(box_a[3], box_b[3])
    overlap = max(0, right - left) * max(0, bottom - top)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    # Union = sum of areas minus the double-counted overlap; the tiny
    # epsilon avoids division by zero for degenerate (zero-area) boxes.
    return overlap / (area_a + area_b - overlap + 1e-6)
class IoUTracker:
    """Multi-object tracker: Hungarian assignment on an IoU similarity matrix."""

    def __init__(self, iou_threshold=0.25, max_age=5):
        self.tracks = {}                  # track_id -> {"box": [...], "age": int}
        self.next_id = 0
        self.iou_threshold = iou_threshold
        self.max_age = max_age            # frames a track may go unmatched before removal

    def update(self, detections):
        """Advance the tracker by one frame.

        detections: list of [x1, y1, x2, y2, conf, class_id].
        Returns {track_id: [x1, y1, x2, y2]} for every live track.
        """
        boxes = [det[:4] for det in detections]
        if not self.tracks:
            # No live tracks (first frame, or all were deleted):
            # every detection seeds a brand-new track.
            for box in boxes:
                self._new_track(box)
            return {tid: rec["box"] for tid, rec in self.tracks.items()}

        ids = list(self.tracks.keys())
        existing = [self.tracks[tid]["box"] for tid in ids]
        similarity = np.zeros((len(existing), len(boxes)))
        for ti, tbox in enumerate(existing):
            for di, dbox in enumerate(boxes):
                similarity[ti, di] = iou(tbox, dbox)

        # linear_sum_assignment minimizes cost, so negate to maximize IoU;
        # keep only pairs that clear the IoU gate.
        paired = []
        if len(existing) and len(boxes):
            rows, cols = linear_sum_assignment(-similarity)
            paired = [(r, c) for r, c in zip(rows, cols)
                      if similarity[r, c] >= self.iou_threshold]

        hit_tracks = {r for r, _ in paired}
        hit_dets = {c for _, c in paired}

        # Matched tracks snap to their detection and reset their age.
        for ti, di in paired:
            record = self.tracks[ids[ti]]
            record["box"] = boxes[di]
            record["age"] = 0
        # Unmatched tracks age by one frame.
        for ti in set(range(len(existing))) - hit_tracks:
            self.tracks[ids[ti]]["age"] += 1
        # Unmatched detections become new tracks.
        for di in set(range(len(boxes))) - hit_dets:
            self._new_track(boxes[di])

        # Drop tracks that have been lost for too long.
        stale = [tid for tid, rec in self.tracks.items() if rec["age"] > self.max_age]
        for tid in stale:
            del self.tracks[tid]
        return {tid: rec["box"] for tid, rec in self.tracks.items()}

    def _new_track(self, box):
        # Every track gets a globally unique, never-reused id.
        self.tracks[self.next_id] = {"box": list(box), "age": 0}
        self.next_id += 1
# Feed the per-frame detections through the tracker, in frame order.
tracker = IoUTracker(iou_threshold=0.25, max_age=5)
# One {track_id: box} dict per frame.
tracking_results = [tracker.update(frame_dets) for frame_dets in detections]
print(f"Tracked {tracker.next_id} unique objects across {len(detections)} frames")

What just happened: For each frame, we computed IoU between every existing track and every new detection, used the Hungarian algorithm to find the optimal 1-to-1 matching, updated matched tracks, created new ones, and deleted stale ones. This is the core of SORT without the Kalman filter.
Step 4: Add Kalman filter for motion prediction
The IoU tracker breaks when an object is temporarily missed (no detection for a frame). With a Kalman filter, we can predict where the object should be even when there’s no detection.
class KalmanBoxTracker:
    """Track a single object with constant-velocity Kalman filter.
    State: [x_center, y_center, area, aspect_ratio, vx, vy, va, vr]
    """
    def __init__(self, bbox):
        # bbox: [x1, y1, x2, y2] corner coordinates of the first observation.
        # State: [cx, cy, area, ratio, vcx, vcy, varea, vratio]
        self.kf = self._init_kalman()
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        # Velocities (kf[4:]) stay zero until the first update() call.
        self.kf[:4] = [cx, cy, w * h, w / (h + 1e-6)]
    def _init_kalman(self):
        # Simple: state is 8-dim, measurement is 4-dim
        # For production, use filterpy or scipy Kalman filter
        # NOTE: this is a plain state vector, not a full Kalman filter —
        # there are no covariance matrices; update() below uses a fixed blend.
        return np.zeros(8)
    def predict(self):
        """Predict next position using constant velocity model."""
        # x_{t+1} = x_t + v_t  (one step of the constant-velocity model;
        # lets a track coast forward through frames with no detection)
        self.kf[:4] += self.kf[4:]
        return self._state_to_bbox()
    def update(self, bbox):
        """Update state with new measurement."""
        # Convert the corner box into the (cx, cy, area, ratio) measurement.
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        measurement = np.array([cx, cy, w * h, w / (h + 1e-6)])
        # Simple exponential smoothing update (alpha blend)
        # alpha = 0.5 weighs measurement and current state equally — a
        # stand-in for the Kalman gain of a real filter.
        alpha = 0.5
        old_state = self.kf[:4].copy()
        self.kf[:4] = alpha * measurement + (1 - alpha) * self.kf[:4]
        self.kf[4:] = self.kf[:4] - old_state # update velocity
    def _state_to_bbox(self):
        # Invert the parameterization: area = w*h and ratio = w/h imply
        # w = sqrt(area * ratio) and h = area / w. The max(..., 1) clamp
        # keeps sqrt well-defined if the predicted area goes non-positive.
        cx, cy, area, ratio = self.kf[:4]
        w = np.sqrt(max(area * ratio, 1))
        h = area / (w + 1e-6)
        return [cx - w/2, cy - h/2, cx + w/2, cy + h/2]

What just happened: The Kalman filter maintains a velocity estimate. Even when a detection is missed, predict() moves the track forward based on estimated velocity. This bridges short gaps in detection (1-3 frames of occlusion).
Step 5: Track lifecycle management
Real trackers distinguish between tentative and confirmed tracks to avoid noise.
New detection (no match) → TENTATIVE track
├─ Matched for 3+ consecutive frames → CONFIRMED track
└─ Not matched within 3 frames → DELETED
CONFIRMED track not matched → LOST (keep predicting)
├─ Re-matched → back to CONFIRMED
└─ Lost for max_age frames → DELETED
This prevents spurious detections (false positives) from creating permanent tracks.
Step 6: Visualize results
import cv2
import numpy as np
def visualize_tracks(video_path, tracking_results, output_path="tracked.mp4"):
    """Render tracking output: labeled boxes plus a short motion trail per ID."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps,
                          (frame_w, frame_h))
    # Fixed seed -> the same track id gets the same color on every run.
    np.random.seed(42)
    colors = {}
    trajectories = {}  # track_id -> list of (cx, cy) center points
    for tracks in tracking_results:
        ok, frame = cap.read()
        if not ok:
            break
        for tid, box in tracks.items():
            # Draw colors lazily, one RNG call per *new* id, preserving
            # the id -> color mapping of the original seed sequence.
            if tid not in colors:
                colors[tid] = tuple(int(x) for x in np.random.randint(50, 255, 3))
            x1, y1, x2, y2 = (int(v) for v in box)
            center = ((x1 + x2) // 2, (y1 + y2) // 2)
            trajectories.setdefault(tid, []).append(center)
            color = colors[tid]
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, f"ID:{tid}", (x1, y1 - 8),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
            # Trail: connect up to the last 30 center points.
            trail = trajectories[tid][-30:]
            for prev_pt, cur_pt in zip(trail, trail[1:]):
                cv2.line(frame, prev_pt, cur_pt, color, 2)
        out.write(frame)
    cap.release()
    out.release()
    print(f"Saved tracked video to {output_path}")
visualize_tracks(VIDEO_PATH, tracking_results, "tracked_iou.mp4")

Step 7: Compare with YOLO built-in tracker
from ultralytics import YOLO

model = YOLO("yolo11n.pt")

# ByteTrack -- the production tracker
stream = model.track(
    source=VIDEO_PATH,
    tracker="bytetrack.yaml",
    conf=0.3,
    save=True,  # saves output video
    stream=True,
    verbose=False,
)
yolo_tracks = []
for result in stream:
    ids = result.boxes.id
    # boxes.id is None on frames where the tracker assigned no IDs.
    if ids is None:
        yolo_tracks.append({})
    else:
        yolo_tracks.append({int(tid): box.tolist()
                            for box, tid in zip(result.boxes.xyxy, ids)})
print(f"YOLO ByteTrack: tracked objects across {len(yolo_tracks)} frames")

What just happened: Ultralytics includes ByteTrack and BoT-SORT as built-in trackers. One line of code. Compare the track IDs and trajectories with your custom IoU tracker — you’ll see ByteTrack handles occlusions and ID switches better.
Step 8: Evaluate with MOT metrics
# pip install motmetrics
import motmetrics as mm
def evaluate_tracker(ground_truth, hypotheses, n_frames):
"""Compare tracking output to ground truth.
ground_truth: list of dicts per frame {gt_id: [x1, y1, x2, y2]}
hypotheses: list of dicts per frame {hyp_id: [x1, y1, x2, y2]}
"""
acc = mm.MOTAccumulator(auto_id=True)
for frame_idx in range(n_frames):
gt = ground_truth[frame_idx]
hyp = hypotheses[frame_idx]
gt_ids = list(gt.keys())
hyp_ids = list(hyp.keys())
# Compute distance matrix (IoU-based)
distances = np.full((len(gt_ids), len(hyp_ids)), np.nan)
for i, gid in enumerate(gt_ids):
for j, hid in enumerate(hyp_ids):
iou_val = iou(gt[gid], hyp[hid])
if iou_val >= 0.5:
distances[i, j] = 1 - iou_val # distance = 1 - IoU
acc.update(gt_ids, hyp_ids, distances)
mh = mm.metrics.create()
summary = mh.compute(acc, metrics=["mota", "motp", "idf1", "num_switches"],
name="tracker")
print(summary)
return summary
# To evaluate, you need ground truth annotations (MOT17 dataset provides these)
# evaluate_tracker(gt_tracks, tracking_results, len(detections))

What you built
You now have a working understanding of the full MOT pipeline:
- Detection → per-frame bounding boxes (YOLO)
- Cost matrix → IoU between tracks and detections
- Assignment → Hungarian algorithm for optimal matching
- Prediction → Kalman filter bridges detection gaps
- Lifecycle → tentative → confirmed → lost → deleted
- Evaluation → MOTA, MOTP, IDF1
Try this next
- Add Re-ID features: Extract appearance embeddings from detected crops (use a pretrained ResNet). Add cosine distance on embeddings as secondary matching when IoU is ambiguous.
- Handle camera motion: For drone footage, estimate the homography between frames and compensate track positions before IoU matching.
- Counting line: Draw a virtual line across a road. Count objects crossing it by checking when track trajectories cross the line. Report counts per class (cars, trucks, pedestrians).
Links
- Multi-Object Tracking — theory and algorithms
- Object Detection — the detection stage
- Optical Flow — alternative motion estimation
- Case Study - CV Pipeline Design — design decisions for tracking systems