Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows

Step 5d now also processes IPA continuations without brackets (e.g. "ska:f – ska:vz", "'sekandarr sku:l") when the row has exactly one content cell (column_1 is not counted) and the text is garbled IPA containing no real IPA Unicode symbols.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:30:44 +01:00
parent 5c935eec23
commit 4290f70885
1 changed file with 35 additions and 10 deletions
@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        # headword in the previous row's same column.
        # Note: We check ALL columns, not just en_col_type, because
        # the EN headword column may not be the longest-average column.
        _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
        ipa_cont_fixed = 0
        for z in zones_data:
            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    if not ct.startswith("column_"):
                        continue
                    cell_text = (cell.get("text") or "").strip()
-                    # Only treat as continuation when text is entirely
+                    if not cell_text:
                    # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
                    # Text like "employee [im'ploi:]" has a headword
                    # OUTSIDE brackets and must NOT be overwritten.
                    if not (cell_text.startswith('[') and cell_text.endswith(']')):
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    # Already has proper IPA brackets → already fixed
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                    is_bracketed = (
                        cell_text.startswith('[') and cell_text.endswith(']')
                    )
                    if is_bracketed:
                        # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
                        # Text like "employee [im'ploi:]" is NOT fully
                        # bracketed and won't match here.
                        if not _text_has_garbled_ipa(cell_text):
                            continue
                        # Already has proper IPA brackets → skip
                        if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                            continue
                    else:
                        # Unbracketed continuation: "ska:f – ska:vz",
                        # "'sekandarr sku:l".  Only treat as IPA
                        # continuation if this is the ONLY content cell
                        # in the row (single-cell row) and the text is
                        # garbled IPA without real IPA Unicode symbols.
                        content_cells_in_row = [
                            c for c in row_cells
                            if c.get("col_type", "").startswith("column_")
                            and c.get("col_type") != "column_1"
                        ]
                        if len(content_cells_in_row) != 1:
                            continue
                        if not _text_has_garbled_ipa(cell_text):
                            continue
                        # Has real IPA symbols → already fixed or valid
                        if any(c in _REAL_IPA_CHARS for c in cell_text):
                            continue
                    # Find headword in previous row, same column
                    prev_ri = rows_sorted[idx - 1]["index"]
                    prev_same_col = [