Apply merged-word splitting to grid-editor cells

The spell review only runs on vocab entries, but the OCR pipeline's grid-editor cells also contain merged words (e.g. "atmyschool"). This commit splits merged words directly in the grid-build finalization step, right before the result is returned. It uses the same dictionary-based DP algorithm, _try_split_merged_word().

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 14:52:00 +02:00
parent 53b0d77853
commit 6e494a43ab
1 changed files with 32 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1678,6 +1678,38 @@ async def _build_grid_core(
                if "|" in t:
                    cell["text"] = t.replace("|", "")

+    # --- Split merged words (OCR sometimes glues adjacent words) ---
+    # Uses dictionary lookup to split e.g. "atmyschool" → "at my school"
+    try:
+        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
+        if _SPELL_AVAILABLE:
+            split_count = 0
+            for z in zones_data:
+                for cell in z.get("cells", []):
+                    text = cell.get("text", "")
+                    if not text:
+                        continue
+                    parts = []
+                    changed = False
+                    for token in text.split():
+                        # Only try splitting pure-alpha tokens > 7 chars
+                        clean = token.rstrip(".,!?;:'\")")
+                        suffix = token[len(clean):]
+                        if len(clean) > 7 and clean.isalpha():
+                            split = _try_split_merged_word(clean)
+                            if split:
+                                parts.append(split + suffix)
+                                changed = True
+                                continue
+                        parts.append(token)
+                    if changed:
+                        cell["text"] = " ".join(parts)
+                        split_count += 1
+            if split_count:
+                logger.info("build-grid session %s: split %d merged words", session_id, split_count)
+    except ImportError:
+        pass
+
    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):