Preserve = and (= tokens in grid build and cell text cleanup

"=" signs are used as definition markers in textbooks ("film = 1. Film"). They were incorrectly removed by two filters:

1. grid_build_core.py, Step 5j-pre: _PURE_JUNK_RE matched "=" as artifact noise. It now exempts "=", "(=", ";", ":", "-" and similar meaningful punctuation tokens.
2. cv_ocr_engines.py, _is_noise_tail_token: the "pure non-alpha" check removed trailing "=" tokens. It now exempts meaningful punctuation.

Fixes: "film = 1. Film; 2. filmen" losing the "=" sign, and "(= I won and he lost.)" losing the "(=".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 23:04:27 +02:00
parent 50bfd6e902
commit ba0f659d1e
2 changed files with 9 additions and 1 deletions
@@ -1881,6 +1881,11 @@ def _is_noise_tail_token(token: str) -> bool:
    if t.endswith(']'):
        return False

+    # Keep meaningful punctuation tokens used in textbooks
+    # = (definition marker), (= (definition opener), ; (separator)
+    if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
+        return False
+
    # Pure non-alpha → noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
    if not alpha_chars:
@@ -1575,7 +1575,10 @@ async def _build_grid_core(
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
-                is_artifact = True
+                # Keep meaningful punctuation tokens used in textbooks
+                # = (definition), (= (definition), ; : - are valid content
+                if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
+                    is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
                # Short non-alphabetic text like "a=", not word beginnings like "Zw"
                is_artifact = True