From 92a7b85c2d179daf82e8bdfb7cfa6d67c0336640 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Fri, 20 Mar 2026 00:43:51 +0100
Subject: [PATCH] Fix IPA continuation: only process fully-bracketed cells,
 keep phrasal verb particles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes:
1. Step 5d now treats a cell as an IPA continuation only when its text is
   entirely inside brackets (e.g. "[n, nn]"). Cells with a headword outside
   the brackets (e.g. "employee [im'ploi:]") are no longer overwritten.
2. fix_ipa_continuation_cell no longer skips grammar words like "down" —
   they are part of the headword in phrasal verbs like "close sth. down".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py     |  8 +++----
 klausur-service/backend/grid_editor_api.py    |  8 ++++++-
 .../backend/tests/test_grid_editor_api.py     | 21 +++++++++++++++++++
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index c662be1..a244a89 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1266,7 +1266,10 @@ def fix_ipa_continuation_cell(
     if not parts:
         return garbled_text
 
-    # Look up IPA for each headword part
+    # Look up IPA for each headword part.
+    # Do NOT skip grammar words here — they are integral parts of the
+    # headword (e.g. "close down", "the United Kingdom").  Grammar
+    # annotations like "(sth)", "(no pl)" are already stripped above.
     ipa_parts: List[str] = []
     for part in parts:
         # A part may be multi-word like "secondary school"
@@ -1276,9 +1279,6 @@ def fix_ipa_continuation_cell(
             clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
             if not clean_w or len(clean_w) < 2:
                 continue
-            # Skip grammar words like "to" at the start
-            if clean_w.lower() in _GRAMMAR_BRACKET_WORDS:
-                continue
             ipa = _lookup_ipa(clean_w, pronunciation)
             if ipa:
                 word_ipas.append(ipa)
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 67fc02a..75076af 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1634,7 +1634,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                     ct = cell.get("col_type", "")
                     if not ct.startswith("column_"):
                         continue
-                    cell_text = cell.get("text", "")
+                    cell_text = (cell.get("text") or "").strip()
+                    # Only treat as continuation when text is entirely
+                    # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
+                    # Text like "employee [im'ploi:]" has a headword
+                    # OUTSIDE brackets and must NOT be overwritten.
+                    if not (cell_text.startswith('[') and cell_text.endswith(']')):
+                        continue
                     if not _text_has_garbled_ipa(cell_text):
                         continue
                     # Already has proper IPA brackets → already fixed
diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py
index 814f51f..74f0822 100644
--- a/klausur-service/backend/tests/test_grid_editor_api.py
+++ b/klausur-service/backend/tests/test_grid_editor_api.py
@@ -499,3 +499,24 @@ class TestGarbledIpaDetection:
         )
         assert fixed != "[1uedtX,1]"
         assert "ɪkwˈɪpmənt" in fixed  # equipment IPA
+
+    def test_fix_continuation_close_down(self):
+        """IPA continuation for 'close sth. down' → IPA for both words."""
+        fixed = fix_ipa_continuation_cell(
+            "[klaoz 'daun]", "close sth. down", pronunciation="british",
+        )
+        assert fixed != "[klaoz 'daun]"
+        assert "klˈəʊs" in fixed   # close IPA
+        assert "dˈaʊn" in fixed    # down IPA — must NOT be skipped
+
+    def test_headword_with_brackets_not_continuation(self):
+        """'employee [im'ploi:]' has a headword outside brackets → not a continuation.
+
+        _text_has_garbled_ipa returns True (has ':'), but Step 5d should
+        skip this cell because text doesn't start with '['.
+        """
+        # The garbled check still triggers (has IPA-like ':')
+        assert _text_has_garbled_ipa("employee [im'ploi:]") is True
+        # But text does NOT start with '[' — Step 5d bracket guard blocks it
+        text = "employee [im'ploi:]"
+        assert not (text.strip().startswith('[') and text.strip().endswith(']'))