Tutorial - Object Tracking Pipeline

Goal

After this tutorial, you can track objects across video frames using detection + association. You will build a tracker from scratch, then compare it with production-grade trackers.

Prerequisites: Object Detection, basic Python and NumPy.

Time: 60-90 minutes.

Step 1: Set up and load video

We need a video with multiple moving objects. You can use any video — a dashcam clip, drone footage, or your webcam.

import cv2
import numpy as np
from ultralytics import YOLO

# Download a sample traffic video (or use your own)
# https://motchallenge.net/data/MOT17/ has standard benchmarks
VIDEO_PATH = "traffic.mp4"  # replace with your video

# Open the video once just to probe its basic properties.
cap = cv2.VideoCapture(VIDEO_PATH)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
width, height = (int(cap.get(prop)) for prop in
                 (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Video: {width}x{height}, {fps:.1f} FPS, {total_frames} frames")
cap.release()

Step 2: Run YOLO detection per frame

First, detect objects in every frame independently. No tracking yet — just raw detections.

model = YOLO("yolo11n.pt")  # nano model for speed

def detect_all_frames(video_path, model, conf=0.3, max_frames=None):
    """Run detection on every frame.

    Args:
        video_path: path to the input video file.
        model: an Ultralytics YOLO model instance.
        conf: confidence threshold passed to the detector.
        max_frames: optional cap on frames to process (falsy = no limit).

    Returns: list of frame detections.
    Each element: list of [x1, y1, x2, y2, conf, class_id].
    """
    cap = cv2.VideoCapture(video_path)
    all_detections = []
    frame_idx = 0

    try:
        while True:
            # Check the frame budget BEFORE decoding: the previous version
            # read (and discarded) one extra frame past max_frames.
            if max_frames and frame_idx >= max_frames:
                break
            ret, frame = cap.read()
            if not ret:
                break

            results = model(frame, conf=conf, verbose=False)[0]
            frame_dets = []
            for box in results.boxes:
                x1, y1, x2, y2 = box.xyxy[0].tolist()
                c = float(box.conf[0])
                cls = int(box.cls[0])
                frame_dets.append([x1, y1, x2, y2, c, cls])

            all_detections.append(frame_dets)
            frame_idx += 1
    finally:
        # Release the capture even if the model raises mid-video.
        cap.release()
    return all_detections

detections = detect_all_frames(VIDEO_PATH, model, max_frames=300)
print(f"Processed {len(detections)} frames")
print(f"Frame 0: {len(detections[0])} detections")

What just happened: We ran YOLO independently on each frame. Each detection is a bounding box with confidence and class. But there are no IDs — the same car might be detection #3 in frame 1 and detection #7 in frame 2. We have no way to know they’re the same object.

Step 3: Implement IoU tracker from scratch

The simplest possible tracker: match detections between frames by how much their bounding boxes overlap.

from scipy.optimize import linear_sum_assignment

def iou(box_a, box_b):
    """Intersection-over-union of two axis-aligned [x1, y1, x2, y2] boxes."""
    # Intersection rectangle; clamped to zero when the boxes don't overlap.
    inter_w = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    intersection = inter_w * inter_h
    union = ((box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
             + (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
             - intersection)
    # Epsilon guards against division by zero for degenerate boxes.
    return intersection / (union + 1e-6)
 
class IoUTracker:
    """Associate detections to tracks via Hungarian assignment on IoU."""

    def __init__(self, iou_threshold=0.25, max_age=5):
        self.tracks = {}       # track_id -> {"box": [...], "age": int}
        self.next_id = 0
        self.iou_threshold = iou_threshold
        self.max_age = max_age  # frames a track may go unmatched before removal

    def update(self, detections):
        """Update tracker with new frame detections.
        detections: list of [x1, y1, x2, y2, conf, class_id].
        Returns: dict of active tracks {track_id: [x1, y1, x2, y2]}.
        """
        det_boxes = [d[:4] for d in detections]

        if not self.tracks:
            # Nothing to match against: every detection seeds a track.
            for b in det_boxes:
                self._new_track(b)
            return {tid: t["box"] for tid, t in self.tracks.items()}

        track_ids = list(self.tracks)
        track_boxes = [self.tracks[tid]["box"] for tid in track_ids]
        n_t, n_d = len(track_boxes), len(det_boxes)

        # Similarity matrix: rows = existing tracks, cols = new detections.
        sim = np.zeros((n_t, n_d))
        for ti, tb in enumerate(track_boxes):
            for di, db in enumerate(det_boxes):
                sim[ti, di] = iou(tb, db)

        assigned_tracks, assigned_dets = set(), set()
        if n_t and n_d:
            # Hungarian matching maximizes total IoU (negate for min-cost).
            rows, cols = linear_sum_assignment(-sim)
            for ti, di in zip(rows, cols):
                if sim[ti, di] < self.iou_threshold:
                    continue  # overlap too weak — treat both as unmatched
                tid = track_ids[ti]
                self.tracks[tid]["box"] = det_boxes[di]
                self.tracks[tid]["age"] = 0
                assigned_tracks.add(ti)
                assigned_dets.add(di)

        # Tracks that found no detection this frame grow older.
        for ti in range(n_t):
            if ti not in assigned_tracks:
                self.tracks[track_ids[ti]]["age"] += 1

        # Every leftover detection starts a brand-new track.
        for di in range(n_d):
            if di not in assigned_dets:
                self._new_track(det_boxes[di])

        # Drop tracks that have been lost for too long.
        stale = [tid for tid, t in self.tracks.items() if t["age"] > self.max_age]
        for tid in stale:
            del self.tracks[tid]

        return {tid: t["box"] for tid, t in self.tracks.items()}

    def _new_track(self, box):
        # Register a fresh track with a unique, never-reused ID.
        self.tracks[self.next_id] = {"box": list(box), "age": 0}
        self.next_id += 1
 
# Run tracker on all detections
tracker = IoUTracker(iou_threshold=0.25, max_age=5)
# One {track_id: box} dict per frame, in frame order.
tracking_results = [tracker.update(frame_dets) for frame_dets in detections]

print(f"Tracked {tracker.next_id} unique objects across {len(detections)} frames")

What just happened: For each frame, we computed IoU between every existing track and every new detection, used the Hungarian algorithm to find the optimal 1-to-1 matching, updated matched tracks, created new ones, and deleted stale ones. This is the core of SORT without the Kalman filter.

Step 4: Add Kalman filter for motion prediction

The IoU tracker breaks when an object is temporarily missed (no detection for a frame). With a Kalman filter, we can predict where the object should be even when there’s no detection.

class KalmanBoxTracker:
    """Track a single object with a constant-velocity state model.

    State layout: [cx, cy, area, aspect_ratio, v_cx, v_cy, v_area, v_ratio].
    NOTE: this is a simplified stand-in for a true Kalman filter — the
    update step is a fixed alpha-blend rather than a computed Kalman gain.
    """

    def __init__(self, bbox):
        # Position components come from the first observation; velocities 0.
        self.kf = self._init_kalman()
        self.kf[:4] = self._bbox_to_measurement(bbox)

    @staticmethod
    def _bbox_to_measurement(bbox):
        """Convert [x1, y1, x2, y2] into [cx, cy, area, aspect_ratio]."""
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        return np.array([cx, cy, w * h, w / (h + 1e-6)])

    def _init_kalman(self):
        # Stand-in state: 8-dim state vector, 4-dim measurement.
        # For production, use filterpy or scipy Kalman filter.
        return np.zeros(8)

    def predict(self):
        """Advance the state one step under constant velocity; return bbox."""
        self.kf[:4] = self.kf[:4] + self.kf[4:]
        return self._state_to_bbox()

    def update(self, bbox):
        """Blend a new measurement into the state and refresh velocity."""
        measurement = self._bbox_to_measurement(bbox)
        alpha = 0.5
        previous = self.kf[:4].copy()
        blended = alpha * measurement + (1 - alpha) * self.kf[:4]
        self.kf[:4] = blended
        self.kf[4:] = blended - previous  # velocity = one-step displacement

    def _state_to_bbox(self):
        # Invert the [cx, cy, area, ratio] parameterization back to corners.
        cx, cy, area, ratio = self.kf[:4]
        w = np.sqrt(max(area * ratio, 1))
        h = area / (w + 1e-6)
        half_w, half_h = w / 2, h / 2
        return [cx - half_w, cy - half_h, cx + half_w, cy + half_h]

What just happened: The tracker maintains a velocity estimate (our implementation is a simplified alpha-blend stand-in for a full Kalman filter, which would also track uncertainty and compute an optimal gain). Even when a detection is missed, predict() moves the track forward based on estimated velocity. This bridges short gaps in detection (1-3 frames of occlusion).

Step 5: Track lifecycle management

Real trackers distinguish between tentative and confirmed tracks to avoid noise.

New detection (no match) → TENTATIVE track
  ├─ Matched for 3+ consecutive frames → CONFIRMED track
  └─ Not matched within 3 frames → DELETED

CONFIRMED track not matched → LOST (keep predicting)
  ├─ Re-matched → back to CONFIRMED
  └─ Lost for max_age frames → DELETED

This prevents spurious detections (false positives) from creating permanent tracks.

Step 6: Visualize results

import cv2
import numpy as np

def visualize_tracks(video_path, tracking_results, output_path="tracked.mp4"):
    """Draw bounding boxes with track IDs and trajectory trails."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                  int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"),
                          fps, frame_size)

    # Fixed seed -> the same track ID always gets the same color.
    np.random.seed(42)
    colors = {}
    trajectories = {}  # track_id -> list of (cx, cy) center points

    for tracks in tracking_results:
        ok, frame = cap.read()
        if not ok:
            break

        for tid, box in tracks.items():
            # Colors are drawn lazily, in first-seen order, to keep the
            # random sequence (and thus the palette) reproducible.
            if tid not in colors:
                colors[tid] = tuple(int(c) for c in np.random.randint(50, 255, 3))
            color = colors[tid]

            x1, y1, x2, y2 = map(int, box)
            center = ((x1 + x2) // 2, (y1 + y2) // 2)
            trajectories.setdefault(tid, []).append(center)

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, f"ID:{tid}", (x1, y1 - 8),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

            # Trail: connect the last 30 recorded centers.
            trail = trajectories[tid][-30:]
            for prev, cur in zip(trail, trail[1:]):
                cv2.line(frame, prev, cur, color, 2)

        out.write(frame)

    cap.release()
    out.release()
    print(f"Saved tracked video to {output_path}")

visualize_tracks(VIDEO_PATH, tracking_results, "tracked_iou.mp4")

Step 7: Compare with YOLO built-in tracker

from ultralytics import YOLO

model = YOLO("yolo11n.pt")

# ByteTrack -- the production tracker
results = model.track(
    source=VIDEO_PATH,
    tracker="bytetrack.yaml",
    conf=0.3,
    save=True,        # saves output video
    stream=True,
    verbose=False,
)

# Collect per-frame {track_id: box} dicts from the streaming results.
# Frames where the tracker assigned no IDs contribute an empty dict.
yolo_tracks = []
for r in results:
    ids = r.boxes.id
    if ids is None:
        yolo_tracks.append({})
    else:
        yolo_tracks.append({int(tid): box.tolist()
                            for box, tid in zip(r.boxes.xyxy, ids)})

print(f"YOLO ByteTrack: tracked objects across {len(yolo_tracks)} frames")

What just happened: Ultralytics includes ByteTrack and BoT-SORT as built-in trackers. One line of code. Compare the track IDs and trajectories with your custom IoU tracker — you’ll see ByteTrack handles occlusions and ID switches better.

Step 8: Evaluate with MOT metrics

# pip install motmetrics
import motmetrics as mm

def evaluate_tracker(ground_truth, hypotheses, n_frames):
    """Compare tracking output to ground truth with CLEAR-MOT accounting.

    ground_truth: list of dicts per frame {gt_id: [x1, y1, x2, y2]}
    hypotheses: list of dicts per frame {hyp_id: [x1, y1, x2, y2]}
    """
    acc = mm.MOTAccumulator(auto_id=True)

    for t in range(n_frames):
        gt, hyp = ground_truth[t], hypotheses[t]
        gt_ids, hyp_ids = list(gt), list(hyp)

        # Distance = 1 - IoU; pairs under 0.5 IoU stay NaN, which
        # motmetrics treats as "match not allowed".
        dist = np.full((len(gt_ids), len(hyp_ids)), np.nan)
        for i, gid in enumerate(gt_ids):
            for j, hid in enumerate(hyp_ids):
                overlap = iou(gt[gid], hyp[hid])
                if overlap >= 0.5:
                    dist[i, j] = 1 - overlap

        acc.update(gt_ids, hyp_ids, dist)

    mh = mm.metrics.create()
    summary = mh.compute(acc, metrics=["mota", "motp", "idf1", "num_switches"],
                         name="tracker")
    print(summary)
    return summary

# To evaluate, you need ground truth annotations (MOT17 dataset provides these)
# evaluate_tracker(gt_tracks, tracking_results, len(detections))

What you built

You now have a working understanding of the full MOT pipeline:

  1. Detection → per-frame bounding boxes (YOLO)
  2. Cost matrix → IoU between tracks and detections
  3. Assignment → Hungarian algorithm for optimal matching
  4. Prediction → Kalman filter bridges detection gaps
  5. Lifecycle → tentative → confirmed → lost → deleted
  6. Evaluation → MOTA, MOTP, IDF1

Try this next

  1. Add Re-ID features: Extract appearance embeddings from detected crops (use a pretrained ResNet). Add cosine distance on embeddings as secondary matching when IoU is ambiguous.
  2. Handle camera motion: For drone footage, estimate the homography between frames and compensate track positions before IoU matching.
  3. Counting line: Draw a virtual line across a road. Count objects crossing it by checking when track trajectories cross the line. Report counts per class (cars, trucks, pedestrians).