Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled

- Detect bracketed text without real IPA symbols as garbled OCR phonetics
- Allow IPA continuation fix even when other columns have content (for rows
  where EN cell is clearly garbled bracketed IPA)
- Strip parenthetical grammar annotations like (no pl) from headword before
  IPA lookup in fix_ipa_continuation_cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 23:28:00 +01:00
parent 7750b2a05f
commit 6bfa9eed86
3 changed files with 77 additions and 12 deletions

View File

@@ -19,6 +19,7 @@ from grid_editor_api import (
_detect_header_rows,
_detect_heading_rows_by_color,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
@@ -452,3 +453,44 @@ class TestDetectHeaderRowsSkipFlag:
]
headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
assert 0 not in headers
# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
class TestGarbledIpaDetection:
    """Garbled bracket-notation IPA: detection checks and continuation fixes."""

    def test_bracket_garbled_no_ipa_chars(self):
        """Bracketed text without real IPA characters is reported as garbled."""
        verdict = _text_has_garbled_ipa("[n, nn]")
        assert verdict is True

    def test_bracket_garbled_alphanumeric(self):
        """Bracketed digits/letters mix is reported as garbled."""
        verdict = _text_has_garbled_ipa("[1uedtX,1]")
        assert verdict is True

    def test_bracket_valid_ipa_not_garbled(self):
        """Bracketed text made of real IPA symbols is not reported as garbled."""
        verdict = _text_has_garbled_ipa("[ɪkwˈɪpmənt]")
        assert verdict is False

    def test_no_brackets_normal_word(self):
        """A plain word without brackets is not reported as garbled."""
        verdict = _text_has_garbled_ipa("equipment")
        assert verdict is False

    def test_fix_continuation_united_kingdom(self):
        """Garbled cell next to 'the United Kingdom' is replaced with IPA."""
        garbled_cell = "[n, nn]"
        fixed = fix_ipa_continuation_cell(
            garbled_cell,
            "the United Kingdom",
            pronunciation="british",
        )
        # The garbled text must be gone and the IPA for "Kingdom" present.
        assert fixed != garbled_cell
        assert "kˈɪŋdəm" in fixed

    def test_fix_continuation_equipment(self):
        """Garbled cell next to 'equipment (no pl)' is replaced with IPA."""
        garbled_cell = "[1uedtX,1]"
        fixed = fix_ipa_continuation_cell(
            garbled_cell,
            "equipment (no pl)",
            pronunciation="british",
        )
        # The garbled text must be gone and the IPA for "equipment" present.
        assert fixed != garbled_cell
        assert "ɪkwˈɪpmənt" in fixed