Fix IPA continuation: skip words with inline IPA, recover emptied cells
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m46s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m46s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Three fixes:
1. fix_ipa_continuation_cell: when the headword already has inline IPA, as in "beat [bˈiːt] , beat, beaten", generate continuation IPA only for the words not yet covered (beaten), not for words whose IPA is already shown (beat). When the bracket is at the end, as in "the Highlands [ˈhaɪləndz]", return the inline IPA directly.
2. Step 5d: recover garbled IPA from word_boxes when Step 5c has emptied the cell text (e.g. "[n, nn]" → "").
3. Added 2 tests for the inline-IPA behavior (35 total).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1250,6 +1250,32 @@ def fix_ipa_continuation_cell(
|
||||
if not IPA_AVAILABLE or not garbled_text or not headword_text:
|
||||
return garbled_text
|
||||
|
||||
# If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
|
||||
# only generate continuation IPA for words NOT already covered.
|
||||
covered_words: set = set()
|
||||
has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
|
||||
if has_inline_ipa:
|
||||
# Words before the first bracket already have their IPA shown
|
||||
first_bracket = headword_text.index('[')
|
||||
pre_bracket = headword_text[:first_bracket].strip()
|
||||
for w in pre_bracket.split():
|
||||
clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
|
||||
if clean and len(clean) >= 2:
|
||||
covered_words.add(clean)
|
||||
|
||||
last_bracket_end = headword_text.rfind(']')
|
||||
tail = headword_text[last_bracket_end + 1:].strip()
|
||||
|
||||
if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
|
||||
# Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
|
||||
# — return the inline IPA directly (continuation duplicates it)
|
||||
last_bracket_start = headword_text.rfind('[')
|
||||
inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
|
||||
return inline_ipa
|
||||
|
||||
# Only the tail words need continuation IPA
|
||||
headword_text = tail
|
||||
|
||||
# Strip existing IPA brackets and parenthetical grammar annotations
|
||||
# like "(no pl)", "(sth)", "(sb)" from headword text
|
||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
|
||||
@@ -1270,6 +1296,7 @@ def fix_ipa_continuation_cell(
|
||||
# Do NOT skip grammar words here — they are integral parts of the
|
||||
# headword (e.g. "close down", "the United Kingdom"). Grammar
|
||||
# annotations like "(sth)", "(no pl)" are already stripped above.
|
||||
# Skip words that already have inline IPA in the headword row.
|
||||
ipa_parts: List[str] = []
|
||||
for part in parts:
|
||||
# A part may be multi-word like "secondary school"
|
||||
@@ -1279,6 +1306,8 @@ def fix_ipa_continuation_cell(
|
||||
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
|
||||
if not clean_w or len(clean_w) < 2:
|
||||
continue
|
||||
if covered_words and clean_w.lower() in covered_words:
|
||||
continue # Already has IPA inline in the headword
|
||||
ipa = _lookup_ipa(clean_w, pronunciation)
|
||||
if ipa:
|
||||
word_ipas.append(ipa)
|
||||
|
||||
@@ -1798,7 +1798,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
continue
|
||||
cell_text = (cell.get("text") or "").strip()
|
||||
if not cell_text:
|
||||
continue
|
||||
# Step 5c may have emptied garbled IPA cells like
|
||||
# "[n, nn]" — recover text from word_boxes.
|
||||
wb_texts = [w.get("text", "")
|
||||
for w in cell.get("word_boxes", [])]
|
||||
cell_text = " ".join(wb_texts).strip()
|
||||
if not cell_text:
|
||||
continue
|
||||
|
||||
is_bracketed = (
|
||||
cell_text.startswith('[') and cell_text.endswith(']')
|
||||
|
||||
@@ -510,6 +510,23 @@ class TestGarbledIpaDetection:
|
||||
assert "klˈəʊs" in fixed # close IPA
|
||||
assert "dˈaʊn" in fixed # down IPA — must NOT be skipped
|
||||
|
||||
def test_continuation_skips_words_with_inline_ipa(self):
|
||||
"""'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'."""
|
||||
fixed = fix_ipa_continuation_cell(
|
||||
"[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
|
||||
)
|
||||
# Should only have IPA for "beaten", NOT for "beat" (already inline)
|
||||
assert "bˈiːtən" in fixed
|
||||
assert fixed.count("bˈiːt") == 0 or fixed == "[bˈiːtən]"
|
||||
|
||||
def test_continuation_bracket_at_end_returns_inline(self):
|
||||
"""'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'."""
|
||||
fixed = fix_ipa_continuation_cell(
|
||||
"'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british",
|
||||
)
|
||||
assert fixed == "[ˈhaɪləndz]"
|
||||
assert "ðə" not in fixed # "the" must NOT get IPA
|
||||
|
||||
def test_headword_with_brackets_not_continuation(self):
|
||||
"""'employee [im'ploi:]' has a headword outside brackets → not garbled.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user