feat: Add CharacterErrorRate (CER) metric to ignite.metrics.nlp (#3785)

Shubh3005 · aaishwarymishra · web-flow · commit a9c793e2fe89 · 2026-06-29T05:34:10.000Z
## Summary Implements `CharacterErrorRate` (CER) as a follow-up to #3638 (WER), resolving #3634. CER measures the edit distance at the character level — used in ASR and OCR evaluation, where a single character error (e.g. misreading a financial figure) can be a severe failure. ## Changes - `ignite/metrics/nlp/character_error_rate.py` — CER metric subclassing `Metric`, using a character-level Levenshtein distance (`_edit_distance`) - `tests/ignite/metrics/nlp/test_character_error_rate.py` — 17 test cases covering: identical sequences, single deletion/insertion/substitution, completely wrong prediction, empty inputs, batch accumulation, multi-update accumulation, reset, single string input, whitespace as character, not computable before update, multiline, unicode - `ignite/metrics/nlp/__init__.py` and `ignite/metrics/__init__.py` — export `CharacterErrorRate` - `docs/source/metrics.rst` — adds `CharacterErrorRate` to the complete list of metrics ## Design Follows the same structure as `word_error_rate.py` and `bleu.py`: - Separate file per metric (per maintainer feedback in #3634) - Inherits directly from `Metric` and carries its own `_edit_distance` helper - Difference from WER: the edit distance is computed over characters instead of whitespace-separated words Closes #3634 --------- Co-authored-by: Aaishwarya Mishra <aaishwarymishra@gmail.com>
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
@@ -354,6 +354,7 @@ Complete list of metrics
     Rouge
     RougeL
     RougeN
+    CharacterErrorRate
     InceptionScore
     FID
     CosineSimilarity
diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py
@@ -32,6 +32,7 @@
 from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix
 from ignite.metrics.mutual_information import MutualInformation
 from ignite.metrics.nlp.bleu import Bleu
+from ignite.metrics.nlp.character_error_rate import CharacterErrorRate
 from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN
 from ignite.metrics.precision import Precision
 from ignite.metrics.precision_recall_curve import PrecisionRecallCurve
@@ -90,6 +91,7 @@
     "Frequency",
     "SSIM",
     "Bleu",
+    "CharacterErrorRate",
     "Rouge",
     "RougeN",
     "RougeL",
diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py
@@ -1,8 +1,10 @@
 from ignite.metrics.nlp.bleu import Bleu
+from ignite.metrics.nlp.character_error_rate import CharacterErrorRate
 from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN
 
 __all__ = [
     "Bleu",
+    "CharacterErrorRate",
     "Rouge",
     "RougeN",
     "RougeL",
diff --git a/ignite/metrics/nlp/character_error_rate.py b/ignite/metrics/nlp/character_error_rate.py
@@ -0,0 +1,119 @@
+from typing import Callable, Sequence
+
+import torch
+from torch.types import Number
+
+from ignite.exceptions import NotComputableError
+from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce
+
+__all__ = ["CharacterErrorRate"]
+
+
+def _edit_distance(ref: str, pred: str) -> int:
+    """Return the Levenshtein distance between ``ref`` and ``pred``.
+
+    The distance is the minimum number of single-character insertions, deletions and
+    substitutions that turn one string into the other. Only one row of the dynamic
+    programming table is kept, so memory grows with ``len(pred)`` alone.
+
+    Args:
+        ref: reference string.
+        pred: predicted string.
+
+    Returns:
+        The edit distance; if either string is empty this is the length of the other one.
+    """
+    if not ref:
+        return len(pred)
+    if not pred:
+        return len(ref)
+
+    # row[j] is the distance between the ref prefix processed so far and pred[:j].
+    row = list(range(len(pred) + 1))
+    for i, ref_char in enumerate(ref, start=1):
+        # ``diagonal`` carries the previous row's value at column j - 1.
+        diagonal, row[0] = row[0], i
+        for j, pred_char in enumerate(pred, start=1):
+            above = row[j]
+            if ref_char == pred_char:
+                row[j] = diagonal
+            else:
+                # row[j - 1]: insertion, above: deletion, diagonal: substitution.
+                row[j] = 1 + min(row[j - 1], above, diagonal)
+            diagonal = above
+    return row[-1]
+
+
+class CharacterErrorRate(Metric):
+    r"""Calculates the Character Error Rate (CER).
+
+    CER is defined as the total number of errors (substitutions, deletions, and insertions)
+    at the character level divided by the total number of characters in the reference sequence.
+
+    .. math::
+        \text{CER} = \frac{S + D + I}{N} = \frac{S + D + I}{S + D + C}
+
+    where :math:`S` is the number of substitutions, :math:`D` is the number of deletions,
+    :math:`I` is the number of insertions, :math:`C` is the number of correct characters,
+    and :math:`N` is the total number of characters in the reference (:math:`N = S + D + C`).
+
+    Errors and reference lengths are summed over every pair seen since the last ``reset``, so the
+    result is a corpus-level rate and not a mean of per-pair rates. When the accumulated reference
+    length is zero, ``compute`` returns ``0.0`` if there are no errors and ``1.0`` otherwise.
+
+    - ``update`` must receive input of the form ``(y_pred, y)``.
+    - `y_pred` and `y` both must be either ``str`` or list of ``str``. Passing a plain ``str`` for one
+      and a list for the other raises ``TypeError``.
+    - When both inputs are plain ``str``, they are treated as a single-element batch.
+
+    Args:
+        output_transform: a callable that is used to transform the
+            :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the
+            form expected by the metric.
+        device: specifies which device updates are accumulated on. By default, CPU.
+        skip_unrolling: specifies whether output should be unrolled before being fed to update method.
+
+    Examples:
+        For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`.
+
+        .. testcode::
+
+            from ignite.metrics.nlp import CharacterErrorRate
+
+            cer = CharacterErrorRate()
+
+            y_pred = ["the cat sat on the mat", "hello world"]
+            y = ["the cat sat on mat", "hello world"]
+
+            cer.update((y_pred, y))
+            print(round(cer.compute(), 4))
+
+        .. testoutput::
+
+            0.1379
+
+    .. versionadded:: 0.5.2
+    """
+
+    def __init__(
+        self,
+        output_transform: Callable = lambda x: x,
+        device: str | torch.device = torch.device("cpu"),
+        skip_unrolling: bool = False,
+    ):
+        super().__init__(output_transform=output_transform, device=device, skip_unrolling=skip_unrolling)
+
+    @reinit__is_reduced
+    def reset(self) -> None:
+        """Zero the accumulated error, reference-length and example counters."""
+        # Kept as tensors on ``self._device`` so that ``compute`` can all-reduce them.
+        self._num_errors = torch.tensor(0.0, device=self._device)
+        self._num_refs = torch.tensor(0.0, device=self._device)
+        self._num_examples = torch.tensor(0.0, device=self._device)
+
+    @reinit__is_reduced
+    def update(self, output: Sequence[str | list[str]]) -> None:
+        """Accumulate edit distances and reference lengths for one batch.
+
+        Args:
+            output: a pair ``(y_pred, y)`` where both items are ``str`` or both are ``list`` of ``str``.
+
+        Raises:
+            TypeError: if an item is neither ``str`` nor ``list``, if one item is a ``str`` and the
+                other a ``list``, or if a list contains a non-string element.
+            ValueError: if ``y_pred`` and ``y`` do not contain the same number of strings.
+        """
+        y_pred, y = output[0], output[1]
+        if not isinstance(y_pred, (str, list)) or not isinstance(y, (str, list)):
+            raise TypeError(f"y_pred and y must be str or list[str], got y_pred: {type(y_pred)} and y: {type(y)}")
+        if isinstance(y_pred, str) != isinstance(y, str):
+            # Iterating a lone ``str`` yields 1-character strings, so without this check it would pass the
+            # element validation below and be paired character-by-character with the other list.
+            raise TypeError(
+                f"y_pred and y must both be str or both be list[str], got y_pred: {type(y_pred)} and y: {type(y)}"
+            )
+        if isinstance(y_pred, str):
+            y_pred, y = [y_pred], [y]
+        if not all(isinstance(p, str) for p in y_pred) or not all(isinstance(r, str) for r in y):
+            raise TypeError("All elements of y_pred and y must be strings.")
+        if len(y_pred) != len(y):
+            raise ValueError(
+                f"y_pred and y must have the same length. Got y_pred of length {len(y_pred)} and y of length {len(y)}."
+            )
+        self._num_errors += sum(_edit_distance(r, p) for p, r in zip(y_pred, y))
+        self._num_refs += sum(len(r) for r in y)
+        # Count pairs, not ``update`` calls, so that an empty batch does not make the metric computable.
+        self._num_examples += len(y_pred)
+
+    # ``_num_examples`` is reduced as well so that every process takes the same branch below.
+    @sync_all_reduce("_num_errors", "_num_refs", "_num_examples")
+    def compute(self) -> Number:
+        """Return the accumulated character error rate as a Python number.
+
+        Raises:
+            NotComputableError: if no (prediction, reference) pair has been accumulated.
+        """
+        if self._num_examples == 0:
+            raise NotComputableError("CharacterErrorRate must have at least one example before it can be computed.")
+        if self._num_refs == 0:
+            # No reference characters to divide by: perfect if nothing was predicted, otherwise a full error.
+            return 0.0 if self._num_errors == 0 else 1.0
+        return (self._num_errors / self._num_refs).item()
diff --git a/tests/ignite/metrics/nlp/test_character_error_rate.py b/tests/ignite/metrics/nlp/test_character_error_rate.py
@@ -0,0 +1,111 @@
+import pytest
+from ignite.exceptions import NotComputableError
+from ignite.metrics.nlp import CharacterErrorRate
+
+
+def test_zero_cer_identical():
+    """An identical prediction and reference give a CER of zero."""
+    text = "hello world"
+    metric = CharacterErrorRate()
+    metric.update(([text], [text]))
+    assert metric.compute() == pytest.approx(0.0)
+
+
+def test_cer_single_deletion():
+    """One character missing from the prediction: 1 error over 5 reference characters."""
+    y_pred, y = ["helo"], ["hello"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 5)
+
+
+def test_cer_single_insertion():
+    """One extra character in the prediction: 1 error over 4 reference characters."""
+    y_pred, y = ["hello"], ["helo"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 4)
+
+
+def test_cer_single_substitution():
+    """One substituted character: 1 error over 3 reference characters."""
+    y_pred, y = ["bat"], ["cat"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 3)
+
+
+def test_cer_completely_wrong():
+    """Every character differs, so the rate is 3 errors over 3 reference characters."""
+    y_pred, y = ["xyz"], ["abc"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1.0)
+
+
+def test_cer_empty_prediction():
+    """An empty prediction costs one deletion per reference character."""
+    y_pred, y = [""], ["hello"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1.0)
+
+
+def test_cer_empty_reference():
+    """In a mixed batch, a pair with an empty reference adds errors but no reference characters."""
+    y_pred = ["hello world", "hello"]
+    y = ["hello world", ""]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    # 5 inserted characters against the 11 characters of the only non-empty reference.
+    assert metric.compute() == pytest.approx(5 / 11)
+
+
+def test_cer_empty_ref_nonempty_pred_only():
+    """Errors without any reference characters (errors > 0, refs == 0) yield 1.0."""
+    y_pred, y = ["hello"], [""]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1.0)
+
+
+def test_cer_both_empty_strings():
+    """An empty prediction against an empty reference (no errors, no refs) yields 0.0."""
+    y_pred, y = [""], [""]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(0.0)
+
+
+def test_cer_batch():
+    """Errors and reference lengths are pooled over the batch: 1 error over 5 + 3 characters."""
+    y_pred = ["hello", "cat"]
+    y = ["hello", "bat"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 8)
+
+
+def test_cer_accumulates_across_updates():
+    """Two separate updates give the same rate as one batch holding both pairs."""
+    metric = CharacterErrorRate()
+    for y_pred, y in ((["hello"], ["hello"]), (["cat"], ["bat"])):
+        metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 8)
+
+
+def test_cer_reset_clears_state():
+    """Errors accumulated before ``reset`` do not leak into the next computation."""
+    metric = CharacterErrorRate()
+    metric.update((["cat"], ["bat"]))
+    metric.reset()
+
+    metric.update((["hello"], ["hello"]))
+    assert metric.compute() == pytest.approx(0.0)
+
+
+def test_cer_single_string_input():
+    """Plain strings (not wrapped in lists) are scored as a single pair."""
+    y_pred, y = "helo", "hello"
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 5)
+
+
+def test_cer_whitespace_counts_as_character():
+    """A missing space is one error, and the space counts toward the reference length."""
+    y_pred, y = ["ab"], ["a b"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 3)
+
+
+def test_cer_not_computable_before_update():
+    """Calling ``compute`` on a fresh metric raises ``NotComputableError``."""
+    metric = CharacterErrorRate()
+    with pytest.raises(NotComputableError):
+        metric.compute()
+
+
+def test_cer_multiline():
+    """Identical strings that contain a newline give a CER of zero."""
+    text = "hello\nworld"
+    metric = CharacterErrorRate()
+    metric.update(([text], [text]))
+    assert metric.compute() == pytest.approx(0.0)
+
+
+def test_cer_unicode():
+    """The accented character is expected to count as one substitution out of 4 reference characters."""
+    y_pred, y = ["cafe"], ["café"]
+    metric = CharacterErrorRate()
+    metric.update((y_pred, y))
+    assert metric.compute() == pytest.approx(1 / 4)