fix: improve phonetic bracket replacement logic
- Replace _is_meaningful_bracket_content with _is_grammar_bracket_content
that uses a whitelist of grammar particles (with, about/of, auf, etc.)
- Check IPA dictionary FIRST: if word has IPA, treat brackets as phonetic
- Strip orphan brackets (no word before them) that are garbled IPA
- Preserve correct IPA (contains Unicode IPA chars) and grammar info
- Fix variable name bug (result → text)
Fixes: break [breik] now correctly replaced, cross (with) preserved,
orphan [mais] and {'mani setva] stripped.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4292,32 +4292,58 @@ def _fix_phonetic_brackets(
|
||||
return entries
|
||||
|
||||
|
||||
# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
|
||||
_GERMAN_BRACKET_PREFIXES = frozenset({
|
||||
'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
|
||||
'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
|
||||
'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
|
||||
# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of)
|
||||
# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen)
|
||||
# These should NEVER be replaced with IPA.
|
||||
_GRAMMAR_BRACKET_WORDS = frozenset({
|
||||
# English prepositions/particles commonly in vocab tables
|
||||
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
|
||||
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
||||
# German prepositions/particles
|
||||
'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin',
|
||||
'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück',
|
||||
# German verb prefixes (in parentheses before verb stems)
|
||||
'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer',
|
||||
# Abbreviations
|
||||
'sth', 'sb', 'adj', 'adv',
|
||||
})
|
||||
|
||||
|
||||
def _is_meaningful_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is a meaningful word/prefix, not garbled IPA.
|
||||
def _is_grammar_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is grammar info or a German morpheme.
|
||||
|
||||
Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
|
||||
Garbled IPA: {'tfatno, (cy, 1u], 'daens
|
||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
|
||||
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
|
||||
|
||||
Strategy: check each slash-separated token. If ALL tokens are known
|
||||
grammar words or German affixes, it's grammar info. Otherwise it
|
||||
might be garbled IPA.
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
# Must be pure letters (no digits, punctuation, IPA symbols)
|
||||
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
|
||||
|
||||
# Split on / for patterns like (about/of), (on/with)
|
||||
tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
|
||||
if not tokens:
|
||||
return False
|
||||
# Known German prefix
|
||||
if content.lower() in _GERMAN_BRACKET_PREFIXES:
|
||||
return True
|
||||
# Long enough to be a real word (not 1-2 char garbled IPA like "cy")
|
||||
if len(content) >= 4:
|
||||
return True
|
||||
return False
|
||||
|
||||
for token in tokens:
|
||||
# Each token must be pure letters
|
||||
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
|
||||
return False
|
||||
# Check if it's a known grammar word
|
||||
if token in _GRAMMAR_BRACKET_WORDS:
|
||||
continue
|
||||
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
|
||||
# But NOT things that look like garbled IPA transcriptions.
|
||||
# Heuristic: if it's a common German suffix or a long word, keep it.
|
||||
if len(token) >= 4:
|
||||
continue
|
||||
# Short unknown token — likely garbled IPA
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
@@ -4340,22 +4366,51 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
||||
logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'")
|
||||
return full_match
|
||||
|
||||
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
||||
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
||||
if _is_meaningful_bracket_content(bracket_content):
|
||||
logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'")
|
||||
# Look up IPA for the word before brackets
|
||||
ipa = _lookup_ipa(word, pronunciation)
|
||||
|
||||
if ipa:
|
||||
# Word has IPA → bracket content is phonetic (garbled or correct).
|
||||
# Exception: grammar particles like cross (with) — keep those.
|
||||
if _is_grammar_bracket_content(bracket_content):
|
||||
# Grammar info followed by garbled IPA? E.g. "cross (with) [kros]"
|
||||
# Keep the grammar part, IPA will be handled as orphan bracket.
|
||||
logger.info(f" phonetic replacer: SKIP (grammar info) '{full_match}'")
|
||||
return full_match
|
||||
logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
|
||||
return f"{word} [{ipa}]"
|
||||
|
||||
# No IPA for this word — keep grammar info, strip garbled IPA
|
||||
if _is_grammar_bracket_content(bracket_content):
|
||||
logger.info(f" phonetic replacer: SKIP (grammar, no IPA) '{full_match}'")
|
||||
return full_match
|
||||
|
||||
# Look up in IPA dictionary
|
||||
ipa = _lookup_ipa(word, pronunciation)
|
||||
if not ipa:
|
||||
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
|
||||
return full_match # Keep original
|
||||
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
|
||||
return full_match
|
||||
|
||||
logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
|
||||
return f"{word} [{ipa}]"
|
||||
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
|
||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
# Second pass: strip remaining orphan brackets that are garbled IPA.
|
||||
# These have no word before them (the main regex requires \b word \s* bracket).
|
||||
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
|
||||
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
|
||||
_IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ')
|
||||
|
||||
def _strip_orphan_bracket(m):
|
||||
content = m.group(1).strip()
|
||||
# Keep grammar info: (sich beschweren), (auf), (about/of)
|
||||
if _is_grammar_bracket_content(content):
|
||||
return m.group(0)
|
||||
# Keep correct IPA (contains Unicode IPA characters)
|
||||
if any(ch in _IPA_CHARS for ch in content):
|
||||
return m.group(0)
|
||||
logger.info(f" phonetic: stripping orphan bracket '{m.group(0)}'")
|
||||
return ''
|
||||
|
||||
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def _assign_row_words_to_columns(
|
||||
|
||||
Reference in New Issue
Block a user