fix: improve phonetic bracket replacement logic

- Replace _is_meaningful_bracket_content with _is_grammar_bracket_content
  that uses a whitelist of grammar particles (with, about/of, auf, etc.)
- Check IPA dictionary FIRST: if word has IPA, treat brackets as phonetic
- Strip orphan brackets (no word before them) that are garbled IPA
- Preserve correct IPA (contains Unicode IPA chars) and grammar info
- Fix variable name bug (result → text)

Fixes: break [breik] now correctly replaced, cross (with) preserved,
orphan [mais] and {'mani setva] stripped.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 23:13:34 +01:00
parent 99573a46ef
commit edbdac3203

View File

@@ -4292,32 +4292,58 @@ def _fix_phonetic_brackets(
return entries
# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
_GERMAN_BRACKET_PREFIXES = frozenset({
'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of)
# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen)
# These should NEVER be replaced with IPA.
_GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# German prepositions/particles
'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin',
'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück',
# German verb prefixes (in parentheses before verb stems)
'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer',
# Abbreviations
'sth', 'sb', 'adj', 'adv',
})
def _is_meaningful_bracket_content(content: str) -> bool:
"""Return True if bracket content is a meaningful word/prefix, not garbled IPA.
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info or a German morpheme.
Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
Garbled IPA: {'tfatno, (cy, 1u], 'daens
Grammar info: cross (with), complain (about/of), agree (on/with)
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
Strategy: check each slash-separated token. If ALL tokens are known
grammar words or German affixes, it's grammar info. Otherwise it
might be garbled IPA.
"""
if not content:
return False
# Must be pure letters (no digits, punctuation, IPA symbols)
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
# Split on / for patterns like (about/of), (on/with)
tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
if not tokens:
return False
# Known German prefix
if content.lower() in _GERMAN_BRACKET_PREFIXES:
return True
# Long enough to be a real word (not 1-2 char garbled IPA like "cy")
if len(content) >= 4:
return True
return False
for token in tokens:
# Each token must be pure letters
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
return False
# Check if it's a known grammar word
if token in _GRAMMAR_BRACKET_WORDS:
continue
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
# But NOT things that look like garbled IPA transcriptions.
# Heuristic: if it's a common German suffix or a long word, keep it.
if len(token) >= 4:
continue
# Short unknown token — likely garbled IPA
return False
return True
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
@@ -4340,22 +4366,51 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'")
return full_match
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
if _is_meaningful_bracket_content(bracket_content):
logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'")
# Look up IPA for the word before brackets
ipa = _lookup_ipa(word, pronunciation)
if ipa:
# Word has IPA → bracket content is phonetic (garbled or correct).
# Exception: grammar particles like cross (with) — keep those.
if _is_grammar_bracket_content(bracket_content):
# Grammar info followed by garbled IPA? E.g. "cross (with) [kros]"
# Keep the grammar part, IPA will be handled as orphan bracket.
logger.info(f" phonetic replacer: SKIP (grammar info) '{full_match}'")
return full_match
logger.info(f" phonetic replacer: REPLACE '{full_match}''{word} [{ipa}]'")
return f"{word} [{ipa}]"
# No IPA for this word — keep grammar info, strip garbled IPA
if _is_grammar_bracket_content(bracket_content):
logger.info(f" phonetic replacer: SKIP (grammar, no IPA) '{full_match}'")
return full_match
# Look up in IPA dictionary
ipa = _lookup_ipa(word, pronunciation)
if not ipa:
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
return full_match # Keep original
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
return full_match
logger.info(f" phonetic replacer: REPLACE '{full_match}''{word} [{ipa}]'")
return f"{word} [{ipa}]"
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
return _PHONETIC_BRACKET_RE.sub(replacer, text)
# Second pass: strip remaining orphan brackets that are garbled IPA.
# These have no word before them (the main regex requires \b word \s* bracket).
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
_IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ')
def _strip_orphan_bracket(m):
content = m.group(1).strip()
# Keep grammar info: (sich beschweren), (auf), (about/of)
if _is_grammar_bracket_content(content):
return m.group(0)
# Keep correct IPA (contains Unicode IPA characters)
if any(ch in _IPA_CHARS for ch in content):
return m.group(0)
logger.info(f" phonetic: stripping orphan bracket '{m.group(0)}'")
return ''
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = text.strip()
return text
def _assign_row_words_to_columns(