From c8027eb7f94b1ec47628c94e07d9ca7c2ffe2a4e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 15 Apr 2026 23:18:35 +0200 Subject: [PATCH] Fix: preserve = ; : - and other meaningful symbols in word_boxes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rule (a2) in Step 5i removed word_boxes with no letters/digits as "graphic OCR artifacts". This incorrectly removed = signs used as definition markers in textbooks ("film = 1. Film; 2. filmen"). Added exception list _KEEP_SYMBOLS for meaningful punctuation: = (= =) ; : - – — / + • · ( ) & * → ← ↔ The root cause: PaddleOCR returns "film = 1. Film; 2. filmen" as one block, which gets split into word_boxes ["film", "=", "1.", ...]. The "=" word_box had no alphanumeric chars and was removed as artifact. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_build_core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py index 954aef8..e268244 100644 --- a/klausur-service/backend/grid_build_core.py +++ b/klausur-service/backend/grid_build_core.py @@ -1407,10 +1407,14 @@ async def _build_grid_core( # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts) # Small images/icons next to words get OCR'd as ">", "<", "~", etc. # Remove word boxes that contain NO letters or digits. + # Exception: meaningful punctuation used in textbooks (=, ;, :, -, etc.) + _KEEP_SYMBOLS = {'=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', + '•', '·', '(', ')', '&', '*', '→', '←', '↔'} for i, wb in enumerate(wbs): t = (wb.get("text") or "").strip() if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: - to_remove.add(i) + if t not in _KEEP_SYMBOLS: + to_remove.add(i) # Rule (b) + (c): overlap and duplicate detection # Sort by x for pairwise comparison