diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py index e268244..8440d07 100644 --- a/klausur-service/backend/grid_build_core.py +++ b/klausur-service/backend/grid_build_core.py @@ -1407,13 +1407,14 @@ async def _build_grid_core( # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts) # Small images/icons next to words get OCR'd as ">", "<", "~", etc. # Remove word boxes that contain NO letters or digits. - # Exception: meaningful punctuation used in textbooks (=, ;, :, -, etc.) - _KEEP_SYMBOLS = {'=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', - '•', '·', '(', ')', '&', '*', '→', '←', '↔'} + # Exception: keep standard punctuation and symbols commonly used + # in textbooks (=, ;, :, -, (, ), etc.). Only remove truly + # decorative symbols like >, <, ~, \, ^, `, #. + _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'} for i, wb in enumerate(wbs): t = (wb.get("text") or "").strip() if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: - if t not in _KEEP_SYMBOLS: + if t in _REMOVE_SYMBOLS: to_remove.add(i) # Rule (b) + (c): overlap and duplicate detection