alibaba · tiankongdeguiji · Jun 11, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/tzrec/main.py b/tzrec/main.py
@@ -515,6 +515,12 @@ def run_eval(step: int, epoch: int) -> None:
             if lr.by_epoch:
                 lr.step()
 
+    # One-shot end-of-loop hook (default no-op; e.g. SidRqkmeans fits its FAISS
+    # codebook here). SID models run with periodic checkpointing disabled
+    # (save_checkpoints_steps/epochs = 0), so the tail final=True save below is
+    # the only checkpoint and persists whatever on_train_end produced.
+    _model.on_train_end()
+
     _log_train(
         i_step,
         losses,

diff --git a/tzrec/metrics/relative_l1.py b/tzrec/metrics/relative_l1.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torchmetrics import Metric
+
+
+class RelativeL1(Metric):
+    """Mean symmetric relative-L1 error ``|t - p| / (max(|t|, |p|) + eps)``.
+
+    A bounded reconstruction-error metric: 0 = exact, < 1 per element when ``t``
+    and ``p`` share a sign, < 2 otherwise. It is a port of OpenOneRec's
+    residual-K-Means ``calc_loss`` and is deliberately **not** torchmetrics'
+    ``MeanAbsolutePercentageError`` (asymmetric ``|t - p| / |t|`` denominator).
+    Aggregation is element-wise (count-weighted): the mean over all elements seen.
+    """
+
+    higher_is_better = False
+    is_differentiable = True
+
+    def __init__(self, epsilon: float = 1e-4, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.epsilon = epsilon
+        # float64 sum / long count: float32 loses integer precision past 2**24
+        # (~32K rows of a 512-dim embedding) under element-wise aggregation.
+        self.add_state(
+            "sum_rel",
+            default=torch.tensor(0.0, dtype=torch.float64),
+            dist_reduce_fx="sum",
+        )
+        self.add_state(
+            "count", default=torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum"
+        )
+
+    def update(self, preds: torch.Tensor, target: torch.Tensor) -> None:
+        """Accumulate the relative-L1 error between ``preds`` and ``target``.
+
+        Args:
+            preds (Tensor): reconstruction, shape (B, D).
+            target (Tensor): ground-truth embedding, shape (B, D).
+        """
+        denom = torch.maximum(torch.abs(target), torch.abs(preds)) + self.epsilon
+        rel = torch.abs(target - preds) / denom
+        # Reduce in float64: a float16 sum can overflow and float32 rounds early.
+        self.sum_rel += rel.sum(dtype=torch.float64)
+        self.count += rel.numel()
+
+    def compute(self) -> torch.Tensor:
+        """Mean relative-L1 over all elements (NaN before any update)."""
+        return self.sum_rel / self.count
diff --git a/tzrec/metrics/relative_l1_test.py b/tzrec/metrics/relative_l1_test.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from tzrec.metrics.relative_l1 import RelativeL1
+
+
+class RelativeL1Test(unittest.TestCase):
+    def test_zero_on_identity(self) -> None:
+        emb = torch.randn(8, 4)
+        metric = RelativeL1()
+        metric.update(emb, emb.clone())
+        self.assertAlmostEqual(metric.compute().item(), 0.0, places=6)
+
+    def test_matches_formula(self) -> None:
+        eps = 1e-4
+        preds, target = torch.tensor([[1.0, 0.0]]), torch.tensor([[0.0, 2.0]])
+        # Per-element |t-p|/(max(|t|,|p|)+eps) is 1/(1+eps) and 2/(2+eps).
+        expected = (1.0 / (1.0 + eps) + 2.0 / (2.0 + eps)) / 2
+        metric = RelativeL1(epsilon=eps)
+        metric.update(preds, target)
+        self.assertAlmostEqual(metric.compute().item(), expected, places=5)
+
+    def test_count_weighted_across_updates(self) -> None:
+        """Averaging is over elements, not a mean of per-batch means."""
+        per_elem = 1.0 / (1.0 + 1e-4)  # rel of a 0-vs-1 element (default epsilon)
+        metric = RelativeL1()
+        metric.update(torch.zeros(1, 4), torch.ones(1, 4))  # 4 elems, each ~1
+        metric.update(torch.ones(3, 4), torch.ones(3, 4))  # 12 elems, each 0
+        # 4 nonzero elements out of 16 -> ~0.25; a mean of batch means gives 0.5.
+        self.assertAlmostEqual(metric.compute().item(), 4 * per_elem / 16, places=6)
+
+    def test_nan_before_update(self) -> None:
+        self.assertTrue(torch.isnan(RelativeL1().compute()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tzrec/models/model.py b/tzrec/models/model.py
@@ -150,6 +150,15 @@ def compute_train_metric(self) -> Dict[str, torch.Tensor]:
             metric_results[metric_name] = metric.compute()
         return metric_results
 
+    def on_train_end(self) -> None:
+        """Run one-shot work once the training loop has exited.
+
+        The base implementation is a no-op. Subclasses override it for
+        end-of-loop work (e.g. :class:`SidRqkmeans` fits its FAISS codebook
+        here). The hook is invoked before the tail ``final=True`` checkpoint
+        is written, so that checkpoint persists whatever the hook produced.
+        """
+
     def sparse_parameters(
         self,
     ) -> Tuple[Iterable[nn.Parameter], Iterable[nn.Parameter]]:

diff --git a/tzrec/models/sid_model.py b/tzrec/models/sid_model.py
@@ -18,6 +18,7 @@
 
 from tzrec.datasets.utils import BASE_DATA_GROUP, Batch
 from tzrec.features.feature import BaseFeature
+from tzrec.metrics.relative_l1 import RelativeL1
 from tzrec.metrics.unique_ratio import UniqueRatio
 from tzrec.models.model import BaseModel
 from tzrec.protos.model_pb2 import ModelConfig
@@ -40,9 +41,9 @@ class BaseSidModel(BaseModel):
 
     Subclasses build their quantizer in ``__init__`` (after calling
     ``super().__init__``) and implement :meth:`predict` and :meth:`loss`.
-    They extend :meth:`init_metric` (via ``super()``) and implement
-    :meth:`update_metric` to populate the registered metrics
-    (:meth:`update_train_metric` defaults to a no-op).
+    :meth:`predict` exposes the reconstruction under ``predictions["x_hat"]``
+    (only when meaningful) so the shared :meth:`update_metric` can score it.
+    (:meth:`update_train_metric` defaults to a no-op.)
 
     Args:
         model_config (ModelConfig): an instance of ModelConfig.
@@ -69,8 +70,17 @@ def __init__(
         self._input_dim = cfg.input_dim
         self._normalize_residuals = cfg.normalize_residuals
 
-        assert cfg.codebook, "codebook must be set, e.g. [256, 256, 256]"
+        if not cfg.codebook:
+            raise ValueError("codebook must be set, e.g. [256, 256, 256]")
         self._n_embed_list = list(cfg.codebook)
+        # Fail fast: a zero codebook entry / input_dim==0 only errors opaquely
+        # deep inside faiss, after the whole training pass.
+        if any(k < 1 for k in self._n_embed_list):
+            raise ValueError(
+                f"every codebook entry must be >= 1, got {self._n_embed_list}"
+            )
+        if self._input_dim < 1:
+            raise ValueError(f"input_dim must be >= 1, got {self._input_dim}")
         self._n_layers = len(self._n_embed_list)
 
     def _extract_feature(
@@ -99,14 +109,48 @@ def init_loss(self) -> None:
     def init_metric(self) -> None:
         """Initialize the eval metrics shared by all SID models.
 
-        ``mse``: reconstruction error (input vs. quantized / decoded).
-        ``unique_sid_ratio``: mean per-batch unique-SID ratio (distinct rows /
-        batch size; a batch-size-sensitive diversity proxy, not global
-        coverage). Subclasses call ``super().init_metric()`` then add extras.
+        - ``mse``: reconstruction error (input vs. quantized / decoded).
+        - ``rel_loss``: symmetric relative-L1 reconstruction error
+          (:class:`~tzrec.metrics.relative_l1.RelativeL1`); meaningful only with
+          ``normalize_residuals=False`` (else the reconstruction and the input
+          live on different scales).
+        - ``unique_sid_ratio``: mean per-batch unique-SID ratio (distinct rows /
+          batch size; a batch-size-sensitive diversity proxy, not global
+          coverage).
+
+        Subclasses that add extras call ``super().init_metric()`` first.
         """
         self._metric_modules["mse"] = torchmetrics.MeanSquaredError()
+        self._metric_modules["rel_loss"] = RelativeL1()
         self._metric_modules["unique_sid_ratio"] = UniqueRatio()
 
+    def update_metric(
+        self,
+        predictions: Dict[str, torch.Tensor],
+        batch: Batch,
+        losses: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> None:
+        """Score the reconstruction against the input re-extracted from ``batch``.
+
+        ``predictions["x_hat"]`` is the model's reconstruction of the input
+        embedding. If the key is absent (subclasses omit it while the
+        reconstruction is not yet meaningful, e.g. before fitting) nothing is
+        updated, ``unique_sid_ratio`` included. Otherwise ``mse`` and
+        ``rel_loss`` receive (reconstruction, input) and ``unique_sid_ratio``
+        receives ``predictions["codes"]``. The input is re-extracted from
+        ``batch`` because it is a model input, not an output.
+
+        Args:
+            predictions (dict): a dict of predicted result.
+            batch (Batch): input batch data.
+            losses (dict, optional): a dict of loss (unused here).
+        """
+        if "x_hat" in predictions:
+            target = self._extract_feature(batch)
+            for name in ("mse", "rel_loss"):
+                self._metric_modules[name].update(predictions["x_hat"], target)
+            self._metric_modules["unique_sid_ratio"].update(predictions["codes"])
+
     def update_train_metric(
         self,
         predictions: Dict[str, torch.Tensor],