diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index f2bd0ac..82b6fab 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1678,6 +1678,52 @@ async def _build_grid_core(
             if "|" in t:
                 cell["text"] = t.replace("|", "")
 
+    # --- Split merged words (OCR sometimes glues adjacent words) ---
+    # Uses dictionary lookup to split e.g. "atmyschool" → "at my school".
+    # Best-effort: skipped when cv_review cannot be imported or when
+    # _SPELL_AVAILABLE is falsy. Only the import is guarded, so errors raised
+    # while processing cells are not silently swallowed.
+    try:
+        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
+    except ImportError:
+        logger.debug(
+            "build-grid session %s: cv_review unavailable, skipping merged-word split",
+            session_id,
+        )
+    else:
+        if _SPELL_AVAILABLE:
+            # Punctuation stripped from the end of a token before lookup.
+            trailing_punct = ".,!?;:'\")"
+            split_count = 0  # number of tokens replaced
+            for z in zones_data:
+                for cell in z.get("cells", []):
+                    text = cell.get("text", "")
+                    if not text:
+                        continue
+                    parts = []
+                    changed = False
+                    for token in text.split():
+                        # Only try splitting pure-alpha tokens > 7 chars
+                        clean = token.rstrip(trailing_punct)
+                        suffix = token[len(clean):]
+                        if len(clean) > 7 and clean.isalpha():
+                            split = _try_split_merged_word(clean)
+                            if split:
+                                # Re-attach the punctuation removed above.
+                                parts.append(split + suffix)
+                                changed = True
+                                # Count words, matching the log message below
+                                # (a cell may contain several merged tokens).
+                                split_count += 1
+                                continue
+                        parts.append(token)
+                    if changed:
+                        # Rewrite only touched cells; note that joining on a
+                        # single space also collapses the cell's whitespace runs.
+                        cell["text"] = " ".join(parts)
+            if split_count:
+                logger.info("build-grid session %s: split %d merged words", session_id, split_count)
+
     # Clean up internal flags before returning
     for z in zones_data:
         for cell in z.get("cells", []):