From ba0f659d1ea2daa38c3c9c61f6bf91ef03557342 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 15 Apr 2026 23:04:27 +0200 Subject: [PATCH] Preserve = and (= tokens in grid build and cell text cleanup = signs are used as definition markers in textbooks ("film = 1. Film"). They were incorrectly removed by two filters: 1. grid_build_core.py Step 5j-pre: _PURE_JUNK_RE matched "=" as artifact noise. Now exempts =, (=, ;, :, - and similar meaningful punctuation tokens. 2. cv_ocr_engines.py _is_noise_tail_token: "pure non-alpha" check removed trailing = tokens. Now exempts meaningful punctuation. Fixes: "film = 1. Film; 2. filmen" losing the = sign, "(= I won and he lost.)" losing the (=. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/cv_ocr_engines.py | 5 +++++ klausur-service/backend/grid_build_core.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index ba2d1e1..4bed09b 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1881,6 +1881,11 @@ def _is_noise_tail_token(token: str) -> bool: if t.endswith(']'): return False + # Keep meaningful punctuation tokens used in textbooks + # = (definition marker), (= (definition opener), ; (separator) + if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'): + return False + # Pure non-alpha → noise ("3", ")", "|") alpha_chars = _RE_ALPHA.findall(t) if not alpha_chars: diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py index 037c578..954aef8 100644 --- a/klausur-service/backend/grid_build_core.py +++ b/klausur-service/backend/grid_build_core.py @@ -1575,7 +1575,10 @@ async def _build_grid_core( if not core: is_artifact = True elif _PURE_JUNK_RE.match(core): - is_artifact = True + # Keep meaningful punctuation tokens used in textbooks + # = (definition), (= (definition), ; : - are valid content + if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'): + is_artifact = True elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): # Short non-alphabetic text like "a=", not word beginnings like "Zw" is_artifact = True