Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows

Step 5d now also processes IPA continuations without brackets (e.g. "ska:f – ska:vz", "'sekandarr sku:l") when the row has exactly one content cell (a column_* cell other than column_1) and the text is garbled IPA that contains no real IPA Unicode symbols (e.g. ˈ, ə, ʃ).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:30:44 +01:00
parent 5c935eec23
commit 4290f70885
1 changed file with 35 additions and 10 deletions
@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        # headword in the previous row's same column.
        # Note: We check ALL columns, not just en_col_type, because
        # the EN headword column may not be the longest-average column.
+        _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
        ipa_cont_fixed = 0
        for z in zones_data:
            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    if not ct.startswith("column_"):
                        continue
                    cell_text = (cell.get("text") or "").strip()
-                    # Only treat as continuation when text is entirely
-                    # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
-                    # Text like "employee [im'ploi:]" has a headword
-                    # OUTSIDE brackets and must NOT be overwritten.
-                    if not (cell_text.startswith('[') and cell_text.endswith(']')):
-                        continue
-                    if not _text_has_garbled_ipa(cell_text):
-                        continue
-                    # Already has proper IPA brackets → already fixed
-                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                    if not cell_text:
                        continue
+
+                    is_bracketed = (
+                        cell_text.startswith('[') and cell_text.endswith(']')
+                    )
+
+                    if is_bracketed:
+                        # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
+                        # Text like "employee [im'ploi:]" is NOT fully
+                        # bracketed and won't match here.
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Already has proper IPA brackets → skip
+                        if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                            continue
+                    else:
+                        # Unbracketed continuation: "ska:f – ska:vz",
+                        # "'sekandarr sku:l".  Only treat as IPA
+                        # continuation if this is the ONLY content cell
+                        # in the row (single-cell row) and the text is
+                        # garbled IPA without real IPA Unicode symbols.
+                        content_cells_in_row = [
+                            c for c in row_cells
+                            if c.get("col_type", "").startswith("column_")
+                            and c.get("col_type") != "column_1"
+                        ]
+                        if len(content_cells_in_row) != 1:
+                            continue
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Has real IPA symbols → already fixed or valid
+                        if any(c in _REAL_IPA_CHARS for c in cell_text):
+                            continue
+
                    # Find headword in previous row, same column
                    prev_ri = rows_sorted[idx - 1]["index"]
                    prev_same_col = [