Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]

- Detect bracketed text without real IPA symbols as garbled OCR phonetics.
- Allow the IPA continuation fix even when other columns have content (for rows where the EN cell is clearly garbled bracketed IPA).
- Strip parenthetical grammar annotations like "(no pl)" from the headword before the IPA lookup in fix_ipa_continuation_cell.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 23:28:00 +01:00
parent 7750b2a05f
commit 6bfa9eed86
3 changed files with 77 additions and 12 deletions
@@ -993,6 +993,18 @@ def _text_has_garbled_ipa(text: str) -> bool:
    it must only insert IPA to *replace* garbled phonetics that are already
    in the text — never to ADD phonetics where none existed on the page.
    """
+    # Bracketed text that doesn't contain valid IPA symbols is garbled OCR
+    # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
+    stripped = text.strip()
+    if stripped.startswith('[') and stripped.endswith(']'):
+        inner = stripped[1:-1]
+        # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
+        if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
+            # Not a valid dictionary-style bracket like "(no pl)" — those
+            # use parentheses, not square brackets.  Square brackets with
+            # no IPA chars are garbled phonetics.
+            return True
+
    for w in text.strip().split():
        # Skip delimiters and very short tokens
        if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
@@ -1238,8 +1250,10 @@ def fix_ipa_continuation_cell(
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

-    # Strip existing IPA brackets from headword text
-    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
+    # Strip existing IPA brackets and parenthetical grammar annotations
+    # like "(no pl)", "(sth)", "(sb)" from headword text
+    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
+    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

@@ -1616,9 +1616,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:

        # 5d. Fix IPA continuation rows — rows where the printed
        # phonetic transcription wraps to a line below the headword.
-        # These contain only garbled IPA in the EN column and nothing
-        # in other columns.  Replace garbled text with proper IPA
-        # looked up from the headword in the previous row.
+        # These contain garbled IPA in the EN column.  Replace garbled
+        # text with proper IPA looked up from the headword in the
+        # previous row.
        ipa_cont_fixed = 0
        for z in zones_data:
            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1630,13 +1630,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    c for c in row_cells
                    if c.get("col_type") == en_col_type
                ]
-                # Other cells with ≥3 chars (ignore margin noise)
-                other_cells = [
-                    c for c in row_cells
-                    if c.get("col_type") != en_col_type
-                    and len((c.get("text") or "").strip()) >= 3
-                ]
-                if not en_cells or other_cells:
+                if not en_cells:
                    continue
                en_text = en_cells[0].get("text", "")
                if not _text_has_garbled_ipa(en_text):
@@ -1644,6 +1638,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                # Already has proper IPA brackets → already fixed
                if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
                    continue
+                # Unless the EN cell is obviously garbled (i.e. it is
+                # wrapped in square brackets), require that the other
+                # columns are empty — otherwise it's a normal content row.
+                en_stripped = en_text.strip()
+                is_bracket_garbled = (
+                    en_stripped.startswith('[') and en_stripped.endswith(']')
+                )
+                if not is_bracket_garbled:
+                    other_cells = [
+                        c for c in row_cells
+                        if c.get("col_type") != en_col_type
+                        and len((c.get("text") or "").strip()) >= 3
+                    ]
+                    if other_cells:
+                        continue
                # Find headword in previous row
                if idx == 0:
                    continue
@@ -19,6 +19,7 @@ from grid_editor_api import (
    _detect_header_rows,
    _detect_heading_rows_by_color,
 )
+from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell


 # ---------------------------------------------------------------------------
@@ -452,3 +453,44 @@ class TestDetectHeaderRowsSkipFlag:
        ]
        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in headers
+
+
+# ---------------------------------------------------------------------------
+# _text_has_garbled_ipa + fix_ipa_continuation_cell
+# ---------------------------------------------------------------------------
+
+class TestGarbledIpaDetection:
+    """Test detection and fixing of garbled IPA in bracket notation."""
+
+    def test_bracket_garbled_no_ipa_chars(self):
+        """'[n, nn]' — brackets with no real IPA chars → garbled."""
+        assert _text_has_garbled_ipa("[n, nn]") is True
+
+    def test_bracket_garbled_alphanumeric(self):
+        """'[1uedtX,1]' — brackets with digits/letters → garbled."""
+        assert _text_has_garbled_ipa("[1uedtX,1]") is True
+
+    def test_bracket_valid_ipa_not_garbled(self):
+        """'[ɪkwˈɪpmənt]' — brackets with real IPA → not garbled."""
+        assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is False
+
+    def test_no_brackets_normal_word(self):
+        """'equipment' — normal word → not garbled."""
+        assert _text_has_garbled_ipa("equipment") is False
+
+    def test_fix_continuation_united_kingdom(self):
+        """IPA continuation for 'the United Kingdom' → proper IPA."""
+        fixed = fix_ipa_continuation_cell(
+            "[n, nn]", "the United Kingdom", pronunciation="british",
+        )
+        # Should contain proper IPA, not the garbled text
+        assert fixed != "[n, nn]"
+        assert "kˈɪŋdəm" in fixed  # Kingdom IPA
+
+    def test_fix_continuation_equipment(self):
+        """IPA continuation for 'equipment' → proper IPA."""
+        fixed = fix_ipa_continuation_cell(
+            "[1uedtX,1]", "equipment (no pl)", pronunciation="british",
+        )
+        assert fixed != "[1uedtX,1]"
+        assert "ɪkwˈɪpmənt" in fixed  # equipment IPA