Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
- Detect bracketed text without real IPA symbols as garbled OCR phonetics.
- Allow the IPA continuation fix even when other columns have content (for rows where the EN cell is clearly garbled bracketed IPA).
- Strip parenthetical grammar annotations like (no pl) from the headword before the IPA lookup in fix_ipa_continuation_cell.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -993,6 +993,18 @@ def _text_has_garbled_ipa(text: str) -> bool:
|
||||
it must only insert IPA to *replace* garbled phonetics that are already
|
||||
in the text — never to ADD phonetics where none existed on the page.
|
||||
"""
|
||||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||||
stripped = text.strip()
|
||||
if stripped.startswith('[') and stripped.endswith(']'):
|
||||
inner = stripped[1:-1]
|
||||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||||
# use parentheses, not square brackets. Square brackets with
|
||||
# no IPA chars are garbled phonetics.
|
||||
return True
|
||||
|
||||
for w in text.strip().split():
|
||||
# Skip delimiters and very short tokens
|
||||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
@@ -1238,8 +1250,10 @@ def fix_ipa_continuation_cell(
|
||||
if not IPA_AVAILABLE or not garbled_text or not headword_text:
|
||||
return garbled_text
|
||||
|
||||
# Strip existing IPA brackets from headword text
|
||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
|
||||
# Strip existing IPA brackets and parenthetical grammar annotations
|
||||
# like "(no pl)", "(sth)", "(sb)" from headword text
|
||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
|
||||
clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
|
||||
if not clean_hw:
|
||||
return garbled_text
|
||||
|
||||
|
||||
@@ -1616,9 +1616,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
|
||||
# 5d. Fix IPA continuation rows — rows where the printed
|
||||
# phonetic transcription wraps to a line below the headword.
|
||||
# These contain only garbled IPA in the EN column and nothing
|
||||
# in other columns. Replace garbled text with proper IPA
|
||||
# looked up from the headword in the previous row.
|
||||
# These contain garbled IPA in the EN column. Replace garbled
|
||||
# text with proper IPA looked up from the headword in the
|
||||
# previous row.
|
||||
ipa_cont_fixed = 0
|
||||
for z in zones_data:
|
||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
||||
@@ -1630,13 +1630,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
c for c in row_cells
|
||||
if c.get("col_type") == en_col_type
|
||||
]
|
||||
# Other cells with ≥3 chars (ignore margin noise)
|
||||
other_cells = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type") != en_col_type
|
||||
and len((c.get("text") or "").strip()) >= 3
|
||||
]
|
||||
if not en_cells or other_cells:
|
||||
if not en_cells:
|
||||
continue
|
||||
en_text = en_cells[0].get("text", "")
|
||||
if not _text_has_garbled_ipa(en_text):
|
||||
@@ -1644,6 +1638,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
# Already has proper IPA brackets → already fixed
|
||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
|
||||
continue
|
||||
# Unless the EN cell is obviously garbled (fully bracketed
|
||||
# non-IPA text such as "[n, nn]"), require that other columns
|
||||
# are empty — otherwise it's a normal content row.
|
||||
en_stripped = en_text.strip()
|
||||
is_bracket_garbled = (
|
||||
en_stripped.startswith('[') and en_stripped.endswith(']')
|
||||
)
|
||||
if not is_bracket_garbled:
|
||||
other_cells = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type") != en_col_type
|
||||
and len((c.get("text") or "").strip()) >= 3
|
||||
]
|
||||
if other_cells:
|
||||
continue
|
||||
# Find headword in previous row
|
||||
if idx == 0:
|
||||
continue
|
||||
|
||||
@@ -19,6 +19,7 @@ from grid_editor_api import (
|
||||
_detect_header_rows,
|
||||
_detect_heading_rows_by_color,
|
||||
)
|
||||
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -452,3 +453,44 @@ class TestDetectHeaderRowsSkipFlag:
|
||||
]
|
||||
headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
|
||||
assert 0 not in headers
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _text_has_garbled_ipa + fix_ipa_continuation_cell
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGarbledIpaDetection:
    """Garbled bracket-notation IPA: detection and continuation repair."""

    def test_bracket_garbled_no_ipa_chars(self):
        """Brackets holding only plain letters ('[n, nn]') are flagged."""
        verdict = _text_has_garbled_ipa("[n, nn]")
        assert verdict is True

    def test_bracket_garbled_alphanumeric(self):
        """Brackets holding digits and letters ('[1uedtX,1]') are flagged."""
        verdict = _text_has_garbled_ipa("[1uedtX,1]")
        assert verdict is True

    def test_bracket_valid_ipa_not_garbled(self):
        """Brackets holding real IPA ('[ɪkwˈɪpmənt]') are not flagged."""
        verdict = _text_has_garbled_ipa("[ɪkwˈɪpmənt]")
        assert verdict is False

    def test_no_brackets_normal_word(self):
        """An ordinary unbracketed word ('equipment') is not flagged."""
        verdict = _text_has_garbled_ipa("equipment")
        assert verdict is False

    def test_fix_continuation_united_kingdom(self):
        """Continuation row under 'the United Kingdom' gets proper IPA."""
        repaired = fix_ipa_continuation_cell(
            "[n, nn]",
            "the United Kingdom",
            pronunciation="british",
        )
        # The garbled cell must be replaced, and the replacement must
        # carry the transcription of "Kingdom".
        assert repaired != "[n, nn]"
        assert "kˈɪŋdəm" in repaired

    def test_fix_continuation_equipment(self):
        """Continuation row under 'equipment (no pl)' gets proper IPA."""
        repaired = fix_ipa_continuation_cell(
            "[1uedtX,1]",
            "equipment (no pl)",
            pronunciation="british",
        )
        # The headword is passed with its "(no pl)" annotation; the
        # result must still carry the transcription of "equipment".
        assert repaired != "[1uedtX,1]"
        assert "ɪkwˈɪpmənt" in repaired
|
||||
|
||||
Reference in New Issue
Block a user