diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py
index 954aef8..e268244 100644
--- a/klausur-service/backend/grid_build_core.py
+++ b/klausur-service/backend/grid_build_core.py
@@ -1407,10 +1407,14 @@ async def _build_grid_core(
     # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
     # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
     # Remove word boxes that contain NO letters or digits.
+    # Exception: meaningful punctuation used in textbooks (=, ;, :, -, etc.)
+    _KEEP_SYMBOLS = {'=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+',
+                     '•', '·', '(', ')', '&', '*', '→', '←', '↔'}
     for i, wb in enumerate(wbs):
         t = (wb.get("text") or "").strip()
         if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
-            to_remove.add(i)
+            if t not in _KEEP_SYMBOLS:
+                to_remove.add(i)
 
     # Rule (b) + (c): overlap and duplicate detection
     # Sort by x for pairwise comparison