Preserve grammar annotations (pl), (no pl) and skip articles in IPA
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Two fixes:

1. Add pl, sg, no, also, ae, be etc. to _GRAMMAR_BRACKET_WORDS so annotations like (pl) and (no pl) are not replaced with IPA.
2. Skip articles (the, a, an) in fix_ipa_continuation_cell — they never get IPA in vocabulary books.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -888,6 +888,10 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
|
|||||||
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
||||||
# English grammar abbreviations used in vocab tables
|
# English grammar abbreviations used in vocab tables
|
||||||
'sth', 'sb', 'adj', 'adv',
|
'sth', 'sb', 'adj', 'adv',
|
||||||
|
# Number/plural/grammar annotations
|
||||||
|
'pl', 'sg', 'sing', 'no', 'also', 'auch',
|
||||||
|
# Regional English markers
|
||||||
|
'ae', 'be', 'ame', 'bre',
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
@@ -1293,10 +1297,11 @@ def fix_ipa_continuation_cell(
|
|||||||
return garbled_text
|
return garbled_text
|
||||||
|
|
||||||
# Look up IPA for each headword part.
|
# Look up IPA for each headword part.
|
||||||
# Do NOT skip grammar words here — they are integral parts of the
|
# Skip articles (the, a, an) — they never get IPA in vocab books.
|
||||||
# headword (e.g. "close down", "the United Kingdom"). Grammar
|
# Other function words like "down", "up" are kept because they are
|
||||||
# annotations like "(sth)", "(no pl)" are already stripped above.
|
# integral parts of phrasal verbs (e.g. "close down").
|
||||||
# Skip words that already have inline IPA in the headword row.
|
# Skip words that already have inline IPA in the headword row.
|
||||||
|
_ARTICLES = {'the', 'a', 'an'}
|
||||||
ipa_parts: List[str] = []
|
ipa_parts: List[str] = []
|
||||||
for part in parts:
|
for part in parts:
|
||||||
# A part may be multi-word like "secondary school"
|
# A part may be multi-word like "secondary school"
|
||||||
@@ -1308,6 +1313,8 @@ def fix_ipa_continuation_cell(
|
|||||||
continue
|
continue
|
||||||
if covered_words and clean_w.lower() in covered_words:
|
if covered_words and clean_w.lower() in covered_words:
|
||||||
continue # Already has IPA inline in the headword
|
continue # Already has IPA inline in the headword
|
||||||
|
if clean_w.lower() in _ARTICLES:
|
||||||
|
continue # Articles never get IPA in vocab books
|
||||||
ipa = _lookup_ipa(clean_w, pronunciation)
|
ipa = _lookup_ipa(clean_w, pronunciation)
|
||||||
if ipa:
|
if ipa:
|
||||||
word_ipas.append(ipa)
|
word_ipas.append(ipa)
|
||||||
|
|||||||
@@ -485,13 +485,14 @@ class TestGarbledIpaDetection:
|
|||||||
assert _text_has_garbled_ipa("equipment") is False
|
assert _text_has_garbled_ipa("equipment") is False
|
||||||
|
|
||||||
def test_fix_continuation_united_kingdom(self):
|
def test_fix_continuation_united_kingdom(self):
|
||||||
"""IPA continuation for 'the United Kingdom' → proper IPA."""
|
"""IPA continuation for 'the United Kingdom' → IPA without 'the'."""
|
||||||
fixed = fix_ipa_continuation_cell(
|
fixed = fix_ipa_continuation_cell(
|
||||||
"[n, nn]", "the United Kingdom", pronunciation="british",
|
"[n, nn]", "the United Kingdom", pronunciation="british",
|
||||||
)
|
)
|
||||||
# Should contain proper IPA, not the garbled text
|
# Should contain proper IPA, not the garbled text
|
||||||
assert fixed != "[n, nn]"
|
assert fixed != "[n, nn]"
|
||||||
assert "kˈɪŋdəm" in fixed # Kingdom IPA
|
assert "kˈɪŋdəm" in fixed # Kingdom IPA
|
||||||
|
assert "ðə" not in fixed # "the" must NOT get IPA
|
||||||
|
|
||||||
def test_fix_continuation_equipment(self):
|
def test_fix_continuation_equipment(self):
|
||||||
"""IPA continuation for 'equipment' → proper IPA."""
|
"""IPA continuation for 'equipment' → proper IPA."""
|
||||||
|
|||||||
Reference in New Issue
Block a user