Merge pull request #97 from gcordova10/feat/eval-comfort-offroad-baselines

riita10069 · web-flow · commit 637ee16fbfd5 · 2026-06-30T12:31:22.000+09:00
feat(evaluation): comfort + off-road metrics, baselines, val splits (#66)
diff --git a/Model/evaluation/__init__.py b/Model/evaluation/__init__.py
@@ -1,3 +1,27 @@
-from .metrics import compute_open_loop_metrics, gate_check, integrate_trajectory
+from .metrics import (
+    COMFORT_THRESHOLDS,
+    compute_comfort_metrics,
+    compute_open_loop_metrics,
+    gate_check,
+    integrate_trajectory,
+    offroad_rate,
+)
+from .baselines import constant_velocity_baseline, hold_last_action_baseline
+from .splits import episode_range_split, geographic_holdout_split
 
-__all__ = ["compute_open_loop_metrics", "gate_check", "integrate_trajectory"]
+# Public API of the evaluation package, grouped by purpose.
+__all__ = [
+    # Pre-existing: open-loop displacement metrics and the pass/fail gate.
+    "compute_open_loop_metrics",
+    "gate_check",
+    "integrate_trajectory",
+    # Complementary metrics: comfort and the off-road proxy (#66 §2-3).
+    "compute_comfort_metrics",
+    "COMFORT_THRESHOLDS",
+    "offroad_rate",
+    # Training-free baselines (#66 §5).
+    "constant_velocity_baseline",
+    "hold_last_action_baseline",
+    # Validation splits (#66 §4).
+    "episode_range_split",
+    "geographic_holdout_split",
+]
diff --git a/Model/evaluation/baselines.py b/Model/evaluation/baselines.py
@@ -0,0 +1,45 @@
+"""Open-loop evaluation baselines (#66 §5).
+
+Simple, training-free baselines in the model's action space ``(accel, curv)``,
+so they can be scored with the same metrics as AutoE2E. Their purpose is to
+reveal whether the perception pipeline contributes **beyond ego-status
+extrapolation** — if the model barely beats constant-velocity, perception isn't
+helping yet (the ego-status critique of nuScenes planning, Zhai et al. 2023).
+
+The trained "ego-status MLP" baseline from the proposal needs training and is a
+follow-up; these are the zero-training references.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def constant_velocity_baseline(batch_size: int,
+                               num_timesteps: int = 64) -> tuple[np.ndarray, np.ndarray]:
+    """Zero-action baseline: keep the current speed and drive straight.
+
+    Both action channels are identically zero (``accel = 0``, ``curv = 0``).
+
+    Args:
+        batch_size: number of samples ``B``.
+        num_timesteps: horizon length ``T``.
+    Returns:
+        ``(accel, curv)`` — two independent float64 arrays of shape
+        ``(batch_size, num_timesteps)``; writing to one never affects the other.
+    """
+    shape = (batch_size, num_timesteps)
+    accel = np.zeros(shape, dtype=np.float64)
+    curv = np.zeros(shape, dtype=np.float64)
+    return accel, curv
+
+
+def hold_last_action_baseline(last_accel: np.ndarray, last_curv: np.ndarray,
+                              num_timesteps: int = 64) -> tuple[np.ndarray, np.ndarray]:
+    """Extrapolate the last observed action forward (ego-status, no perception).
+
+    Holds the most recent ``(accel, curv)`` constant over the horizon — a
+    stronger ego-only baseline than constant-velocity when the ego is mid-
+    manoeuvre.
+
+    Args:
+        last_accel, last_curv: ``(B,)`` last observed action from egomotion.
+            A bare scalar is accepted and treated as a batch of one.
+        num_timesteps: horizon length ``T``.
+    Returns:
+        ``(accel, curv)`` each float64 of shape ``(B, num_timesteps)``.
+    Raises:
+        ValueError: if the two inputs are not 1-D (after scalar promotion) or
+            do not have the same length — the returned pair must be
+            index-aligned per sample.
+    """
+    # atleast_1d promotes a 0-d scalar to shape (1,); indexing a 0-d array with
+    # [:, None] would otherwise raise IndexError.
+    accel_last = np.atleast_1d(np.asarray(last_accel, dtype=np.float64))
+    curv_last = np.atleast_1d(np.asarray(last_curv, dtype=np.float64))
+    if accel_last.ndim != 1 or accel_last.shape != curv_last.shape:
+        raise ValueError(
+            "last_accel and last_curv must be 1-D with the same length, got "
+            f"shapes {accel_last.shape} and {curv_last.shape}")
+
+    accel = np.repeat(accel_last[:, None], num_timesteps, axis=1)
+    curv = np.repeat(curv_last[:, None], num_timesteps, axis=1)
+    return accel, curv
diff --git a/Model/evaluation/metrics.py b/Model/evaluation/metrics.py
@@ -119,3 +119,187 @@ def gate_check(
         if metrics.get(key, float("inf")) > max_val:
             return False
     return True
+
+
+# ---------------------------------------------------------------------------
+# Complementary metrics (#66 §2-3) — comfort and an off-road proxy.
+# These extend the displacement metrics already provided by
+# ``compute_open_loop_metrics`` above: comfort needs no ground-truth
+# trajectory, and the off-road proxy needs no other-agent labels (which L2D
+# does not provide).
+# ---------------------------------------------------------------------------
+
+# nuPlan comfort bounds (the full set from nuplan-devkit `ego_is_comfortable`).
+COMFORT_THRESHOLDS = {
+    "lon_accel_max": 2.40,    # m/s^2   upper bound on longitudinal accel
+    "lon_accel_min": -4.05,   # m/s^2   lower bound (braking)
+    "lat_accel": 4.89,        # m/s^2   |lateral accel|
+    "yaw_rate": 0.95,         # rad/s   |yaw rate|
+    "yaw_accel": 1.93,        # rad/s^2 |yaw acceleration|
+    "lon_jerk": 4.13,         # m/s^3   |longitudinal jerk|
+    "mag_jerk": 8.37,         # m/s^3   |jerk magnitude| = sqrt(lon_jerk^2 + lat_jerk^2)
+}
+
+
+def compute_comfort_metrics(
+    pred_accel: np.ndarray,
+    pred_curv: np.ndarray,
+    initial_speed: np.ndarray,
+    dt: float = 0.1,
+    thresholds: dict[str, float] = COMFORT_THRESHOLDS,
+) -> dict[str, float]:
+    """Comfort metrics from the ``(a, κ)`` outputs vs the nuPlan bounds (#66 §3).
+
+    Mirrors nuplan-devkit's ``ego_is_comfortable`` set — no ground truth needed.
+    With the per-step speed ``v[t] = v0 + Σ a·dt`` (clamped ≥ 0):
+      * longitudinal acceleration ``a``         — two-sided bound ``[min, max]``
+      * lateral acceleration      ``v² κ``       — ``|·|`` bound
+      * yaw rate                  ``v κ``        — ``|·|`` bound
+      * yaw acceleration          ``Δ(v κ)/dt``  — ``|·|`` bound
+      * longitudinal jerk         ``Δa/dt``      — ``|·|`` bound
+      * jerk magnitude            ``√(lon_jerk² + lat_jerk²)`` — bound (this is the
+        8.37 m/s³ threshold; *not* lateral jerk)
+
+    Reports the batch-mean of each per-sample peak, a per-metric violation rate,
+    and the overall ``comfort_violation_rate`` (fraction of samples exceeding ANY
+    bound).
+
+    The finite-difference quantities (jerks, yaw acceleration) need at least two
+    steps; for a single-step horizon (``T == 1``) their peaks are reported as
+    ``0.0`` and they can never count as violations.
+
+    Args:
+        pred_accel, pred_curv: ``(B, T)`` predicted action signals.
+        initial_speed: ``(B,)`` speed at the prediction start.
+        dt: time step between consecutive predictions, in seconds.
+        thresholds: comfort bounds keyed like ``COMFORT_THRESHOLDS``. May be a
+            partial dict; any missing key falls back to the nuPlan default.
+    Returns:
+        Dict with ``max_lon_accel`` / ``min_lon_accel``, ``max_<name>`` for each
+        magnitude-bounded quantity, ``<name>_violation_rate`` for every bound,
+        and ``comfort_violation_rate``.
+    """
+    # Overlay the caller's bounds on the defaults so a partial override such as
+    # ``{"lon_jerk": 3.0}`` works instead of raising KeyError.
+    limits = {**COMFORT_THRESHOLDS, **thresholds}
+
+    accel = np.asarray(pred_accel, dtype=np.float64)                 # (B, T)
+    curv = np.asarray(pred_curv, dtype=np.float64)                   # (B, T)
+    v0 = np.asarray(initial_speed, dtype=np.float64)[:, None]
+
+    v = np.clip(v0 + np.cumsum(accel, axis=1) * dt, 0.0, None)       # (B, T)
+    lat_accel = v ** 2 * curv                                        # (B, T)
+    yaw_rate = v * curv                                              # (B, T)
+    lon_jerk = np.diff(accel, axis=1) / dt                           # (B, T-1)
+    lat_jerk = np.diff(lat_accel, axis=1) / dt                       # (B, T-1)
+    yaw_accel = np.diff(yaw_rate, axis=1) / dt                       # (B, T-1)
+    mag_jerk = np.hypot(lon_jerk, lat_jerk)                          # (B, T-1)
+
+    out: dict[str, float] = {}
+    violated = np.zeros(accel.shape[0], dtype=bool)
+
+    # Longitudinal acceleration: asymmetric two-sided bound.
+    lon_max, lon_min = accel.max(axis=1), accel.min(axis=1)
+    out["max_lon_accel"] = float(lon_max.mean())
+    out["min_lon_accel"] = float(lon_min.mean())
+    lon_exceed = (lon_max > limits["lon_accel_max"]) | (lon_min < limits["lon_accel_min"])
+    out["lon_accel_violation_rate"] = float(lon_exceed.mean())
+    violated |= lon_exceed
+
+    # Magnitude-bounded quantities. ``initial=0.0`` is a no-op for non-negative
+    # data but makes the reduction well-defined on the (B, 0) arrays that
+    # ``np.diff`` produces when T == 1 (a plain ``.max`` raises there).
+    abs_peaks = {
+        "lat_accel": np.abs(lat_accel).max(axis=1, initial=0.0),
+        "yaw_rate": np.abs(yaw_rate).max(axis=1, initial=0.0),
+        "yaw_accel": np.abs(yaw_accel).max(axis=1, initial=0.0),
+        "lon_jerk": np.abs(lon_jerk).max(axis=1, initial=0.0),
+        "mag_jerk": mag_jerk.max(axis=1, initial=0.0),
+    }
+    for name, peak in abs_peaks.items():
+        out[f"max_{name}"] = float(peak.mean())
+        exceed = peak > limits[name]
+        out[f"{name}_violation_rate"] = float(exceed.mean())
+        violated |= exceed
+
+    out["comfort_violation_rate"] = float(violated.mean())
+    return out
+
+
+def _erode_drivable(mask: np.ndarray, iterations: int) -> np.ndarray:
+    """Erode the drivable region with a 4-neighbour (cross) structuring element.
+
+    One pass keeps a cell only when the cell itself and its up/down/left/right
+    neighbours are all drivable; anything beyond the grid is treated as
+    non-drivable, so the outermost ring of cells is always cleared. Running
+    ``k`` passes therefore strips every cell within Manhattan distance ``k`` of
+    the drivable boundary. Implemented with plain numpy slicing (no scipy);
+    used to demand a safety margin from the road edge.
+
+    Args:
+        mask: ``(H, W)`` drivable grid; converted to bool.
+        iterations: number of erosion passes; values ``<= 0`` mean no erosion.
+    Returns:
+        Boolean ``(H, W)`` eroded mask. The input array is never modified.
+    """
+    current = np.asarray(mask, dtype=bool)
+
+    # (destination, source) slice pairs: AND each cell with one shifted neighbour.
+    neighbour_shifts = (
+        (np.s_[1:, :], np.s_[:-1, :]),     # neighbour above
+        (np.s_[:-1, :], np.s_[1:, :]),     # neighbour below
+        (np.s_[:, 1:], np.s_[:, :-1]),     # neighbour to the left
+        (np.s_[:, :-1], np.s_[:, 1:]),     # neighbour to the right
+    )
+    # Outer ring of the grid: its missing neighbours count as off-road.
+    grid_edges = (np.s_[0, :], np.s_[-1, :], np.s_[:, 0], np.s_[:, -1])
+
+    n_passes = max(0, int(iterations))
+    for _ in range(n_passes):
+        shrunk = current.copy()
+        for dst, src in neighbour_shifts:
+            shrunk[dst] &= current[src]
+        for edge in grid_edges:
+            shrunk[edge] = False
+        current = shrunk
+    return current
+
+
+def offroad_rate(
+    positions: np.ndarray,
+    drivable_mask: np.ndarray,
+    meters_per_pixel: float,
+    center_px: tuple[int, int] | None = None,
+    headings: np.ndarray | None = None,
+    ego_size: tuple[float, float] | None = None,
+    dilation_px: int = 0,
+) -> float:
+    """Off-road proxy for collision rate when agents are unlabelled (#66 §2).
+
+    L2D has no other-agent annotations, so we use the BEV drivable mask: a
+    trajectory is off-road if it leaves the drivable area. By default this checks
+    the trajectory **centre** point (lightweight). For drivable-area *compliance*
+    the ego footprint matters — a corner can leave the road while the centre stays
+    inside — so pass ``ego_size`` to check the four footprint corners, and/or
+    ``dilation_px`` to require a safety margin from the boundary.
+
+    Any query point outside the grid, or with a non-finite coordinate
+    (NaN/inf), counts as off-road.
+
+    Args:
+        positions: ``(B, T, 2)`` integrated ``(x_forward, y_left)`` in metres.
+        drivable_mask: ``(H, W)`` boolean BEV; True = drivable.
+        meters_per_pixel: BEV resolution; must be positive.
+        center_px: ego pixel ``(row, col)``; defaults to the grid centre.
+            Convention (matches the repo's BEV rendering): forward +x → up
+            (decreasing row), left +y → left (decreasing col).
+        headings: optional ``(B, T)`` heading per pose (rad) to orient the
+            footprint. If ``None`` and ``ego_size`` is given, heading is taken
+            from the finite-difference travel direction (the first pose reuses
+            the direction of the first step). Ignored when ``ego_size`` is
+            ``None``.
+        ego_size: optional ``(length, width)`` in metres. When given, the four
+            footprint corners are checked instead of only the centre.
+        dilation_px: erode the drivable mask by this many pixels first (require a
+            margin from the boundary). 0 = off.
+
+    Returns:
+        Fraction of trajectories that leave the drivable area (``0.0`` for an
+        empty batch).
+
+    Raises:
+        ValueError: if ``meters_per_pixel`` is not a positive number.
+    """
+    # ``not (x > 0)`` also rejects NaN, which ``x <= 0`` would let through.
+    if not meters_per_pixel > 0:
+        raise ValueError(f"meters_per_pixel must be positive, got {meters_per_pixel}")
+
+    mask = _erode_drivable(drivable_mask, dilation_px) if dilation_px > 0 \
+        else np.asarray(drivable_mask, dtype=bool)
+    H, W = mask.shape
+    cr, cc = center_px if center_px is not None else (H // 2, W // 2)
+    pos = np.asarray(positions, dtype=np.float64)
+    B, T, _ = pos.shape
+
+    if ego_size is None:
+        query = pos[:, :, None, :]                                   # (B, T, 1, 2)
+    else:
+        length, width = ego_size
+        corners = np.array([                                         # ego frame
+            [length / 2, width / 2], [length / 2, -width / 2],
+            [-length / 2, width / 2], [-length / 2, -width / 2],
+        ])                                                           # (4, 2)
+        if headings is not None:
+            theta = np.asarray(headings, dtype=np.float64)
+        elif T >= 2:
+            d = np.diff(pos, axis=1)
+            d = np.concatenate([d[:, :1, :], d], axis=1)             # (B, T, 2)
+            theta = np.arctan2(d[..., 1], d[..., 0])                 # (B, T)
+        else:
+            theta = np.zeros((B, T))
+        cos, sin = np.cos(theta), np.sin(theta)                      # (B, T)
+        cx, cy = corners[:, 0], corners[:, 1]                        # (4,)
+        # Rotate the ego-frame corners by the heading and translate to the pose.
+        qx = pos[..., 0:1] + cos[..., None] * cx - sin[..., None] * cy   # (B, T, 4)
+        qy = pos[..., 1:2] + sin[..., None] * cx + cos[..., None] * cy   # (B, T, 4)
+        query = np.stack([qx, qy], axis=-1)                          # (B, T, 4, 2)
+
+    # Metres -> fractional pixel indices, kept as floats for the bounds test:
+    # NaN compares False and +/-inf falls outside the grid, so non-finite
+    # values are never cast to int (that cast is platform-dependent).
+    rows = np.round(cr - query[..., 0] / meters_per_pixel)           # (B, T, K)
+    cols = np.round(cc - query[..., 1] / meters_per_pixel)           # (B, T, K)
+    inside = (rows >= 0) & (rows < H) & (cols >= 0) & (cols < W)
+
+    on_road = np.zeros(inside.shape, dtype=bool)                     # OOB = off-road
+    on_road[inside] = mask[rows[inside].astype(int), cols[inside].astype(int)]
+
+    # A trajectory is on-road only if every query point of every pose is.
+    stays_on_road = on_road.all(axis=(1, 2))                         # (B,)
+    offroad = int((~stays_on_road).sum())
+    return offroad / max(B, 1)
diff --git a/Model/evaluation/splits.py b/Model/evaluation/splits.py
@@ -0,0 +1,72 @@
+"""Validation-split helpers for open-loop evaluation (#66 §4).
+
+L2D ships all episodes in a single "train" partition, so we define our own
+val split. The recommended design is a **geographic holdout** (reserve whole
+cities) to avoid the geographic/temporal leakage that inflates nuScenes
+planning numbers (Lilja et al., CVPR 2024); a simple episode-range split is an
+acceptable early-experiment fallback (per the proposal).
+
+Both helpers operate on plain indices / labels supplied by the caller — they do
+**not** read L2D metadata themselves. Pulling per-episode city labels from the
+dataset (for :func:`geographic_holdout_split`) is a separate metadata step and a
+follow-up; these functions just turn that information into train/val indices.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+
+def episode_range_split(num_episodes: int,
+                        val_fraction: float = 0.1) -> tuple[list[int], list[int]]:
+    """Split ``range(num_episodes)`` into a leading train part and a trailing val part.
+
+    The final ``val_fraction`` of the indices (rounded, but never fewer than one
+    and never all of them) becomes the validation set. This is the quick,
+    leakage-prone option (neighbouring frames/locations end up on both sides)
+    meant for early experiments; :func:`geographic_holdout_split` is preferred.
+
+    Args:
+        num_episodes: total number of episodes.
+        val_fraction: share of episodes to reserve for validation, in ``(0, 1)``.
+    Returns:
+        ``(train_indices, val_indices)``, both non-empty.
+    Raises:
+        ValueError: if ``num_episodes < 2`` (two non-empty splits are impossible)
+            or ``val_fraction`` lies outside the open interval ``(0, 1)``.
+    """
+    if num_episodes < 2:
+        raise ValueError(
+            f"need >= 2 episodes to form non-empty train/val splits, got {num_episodes}")
+    # Chained comparison on purpose: it is False for NaN, so NaN is rejected too.
+    fraction_ok = 0.0 < val_fraction < 1.0
+    if not fraction_ok:
+        raise ValueError(f"val_fraction must be in (0,1), got {val_fraction}")
+
+    requested = int(round(num_episodes * val_fraction))
+    n_val = max(1, requested)                 # keep at least one val episode
+    n_val = min(n_val, num_episodes - 1)      # ...and at least one train episode
+    first_val = num_episodes - n_val
+
+    indices = list(range(num_episodes))
+    return indices[:first_val], indices[first_val:]
+
+
+def geographic_holdout_split(
+    episode_cities: Sequence[str],
+    holdout_cities: Sequence[str],
+) -> tuple[list[int], list[int]]:
+    """Hold out whole cities for validation (recommended, leakage-safe).
+
+    This is a **helper that requires episode-level city labels** supplied by the
+    caller (`episode_cities`, index-aligned with the dataset). Extracting those
+    labels from L2D metadata is a separate step (follow-up), not done here.
+
+    Args:
+        episode_cities: city label per episode (index-aligned).
+        holdout_cities: cities to reserve for validation. A single city may be
+            passed as a bare string; it is treated as one label, not as a
+            sequence of characters.
+    Returns:
+        ``(train_indices, val_indices)``, each in ascending episode order.
+    Raises:
+        ValueError: if the holdout leaves the train or val split empty (e.g. none
+            of ``holdout_cities`` appear, or they cover every episode).
+    """
+    # A str is itself a Sequence[str]; set("Berlin") would yield single letters
+    # and silently match nothing, so wrap a bare string as one city.
+    if isinstance(holdout_cities, str):
+        holdout = {holdout_cities}
+    else:
+        holdout = set(holdout_cities)
+
+    train: list[int] = []
+    val: list[int] = []
+    for i, city in enumerate(episode_cities):
+        (val if city in holdout else train).append(i)
+    if not val:
+        raise ValueError(
+            f"holdout_cities {sorted(holdout)} match no episodes — val split is empty")
+    if not train:
+        raise ValueError("holdout_cities cover every episode — train split is empty")
+    return train, val
diff --git a/Model/tests/test_evaluation_complementary.py b/Model/tests/test_evaluation_complementary.py