From 4290f70885e6dac91c84c0ab00dee0919c301e2a Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Fri, 20 Mar 2026 08:30:44 +0100
Subject: [PATCH] Fix unbracketed IPA continuations: detect garbled IPA in
 single-cell rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 5d now also processes IPA continuations without brackets (e.g.
"ska:f – ska:vz", "'sekandarr sku:l") when the row has exactly one
content cell (a column_* cell other than column_1) and the text is
garbled IPA that contains no real IPA Unicode symbols.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 45 +++++++++++++++++-----
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 0c20a5a..b05c344 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
         # headword in the previous row's same column.
         # Note: We check ALL columns, not just en_col_type, because
         # the EN headword column may not be the longest-average column.
+        _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
         ipa_cont_fixed = 0
         for z in zones_data:
             rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                     if not ct.startswith("column_"):
                         continue
                     cell_text = (cell.get("text") or "").strip()
-                    # Only treat as continuation when text is entirely
-                    # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
-                    # Text like "employee [im'ploi:]" has a headword
-                    # OUTSIDE brackets and must NOT be overwritten.
-                    if not (cell_text.startswith('[') and cell_text.endswith(']')):
-                        continue
-                    if not _text_has_garbled_ipa(cell_text):
-                        continue
-                    # Already has proper IPA brackets → already fixed
-                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                    if not cell_text:
                         continue
+
+                    is_bracketed = (
+                        cell_text.startswith('[') and cell_text.endswith(']')
+                    )
+
+                    if is_bracketed:
+                        # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
+                        # Text like "employee [im'ploi:]" is NOT fully
+                        # bracketed and won't match here.
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Already has proper IPA brackets → skip
+                        if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                            continue
+                    else:
+                        # Unbracketed continuation: "ska:f – ska:vz",
+                        # "'sekandarr sku:l".  Only treat as IPA
+                        # continuation if this is the ONLY content cell
+                        # in the row (single-cell row) and the text is
+                        # garbled IPA without real IPA Unicode symbols.
+                        content_cells_in_row = [
+                            c for c in row_cells
+                            if c.get("col_type", "").startswith("column_")
+                            and c.get("col_type") != "column_1"
+                        ]
+                        if len(content_cells_in_row) != 1:
+                            continue
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Has real IPA symbols → already fixed or valid
+                        if any(c in _REAL_IPA_CHARS for c in cell_text):
+                            continue
+
                     # Find headword in previous row, same column
                     prev_ri = rows_sorted[idx - 1]["index"]
                     prev_same_col = [