From 596864431be31e5e986326bf9f5f2dacfb93df55 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 15 Apr 2026 23:34:21 +0200 Subject: [PATCH] Rule (a2): switch from allow-list to block-list for symbol removal Instead of keeping only specific symbols (_KEEP_SYMBOLS), rule (a2) now removes only explicitly decorative symbols (_REMOVE_SYMBOLS: > < ~ \ ^ etc.). All other punctuation (= ( ) ; : - etc.) is preserved by default. This is more robust: any new symbol used in textbooks will be kept unless it's in the small block-list of known decorative artifacts. Fixes: the "(=" token was still being removed on page 5 despite being in the allow-list (possibly due to Unicode variants or whitespace). Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_build_core.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py index e268244..8440d07 100644 --- a/klausur-service/backend/grid_build_core.py +++ b/klausur-service/backend/grid_build_core.py @@ -1407,13 +1407,14 @@ async def _build_grid_core( # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts) # Small images/icons next to words get OCR'd as ">", "<", "~", etc. # Remove word boxes that contain NO letters or digits. - # Exception: meaningful punctuation used in textbooks (=, ;, :, -, etc.) - _KEEP_SYMBOLS = {'=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', - '•', '·', '(', ')', '&', '*', '→', '←', '↔'} + # Exception: keep standard punctuation and symbols commonly used + # in textbooks (=, ;, :, -, (, ), etc.). Only remove truly + # decorative symbols like >, <, ~, \, ^, `, #. 
+ _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'} for i, wb in enumerate(wbs): t = (wb.get("text") or "").strip() if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: - if t not in _KEEP_SYMBOLS: + if t in _REMOVE_SYMBOLS: to_remove.add(i) # Rule (b) + (c): overlap and duplicate detection