fix: only replace phonetics in english field, fix grammar detection

- Only process the 'english' field for IPA replacement. The German and
  example fields contain meaningful parenthetical content such as
  (gefrorenes Wasser) or (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: only known grammar particles
  (with, about/of, sth, etc.) are preserved. This removes the
  ">= 4 chars" heuristic, which incorrectly preserved garbled IPA such
  as [breik] and [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 23:19:03 +01:00
parent edbdac3203
commit e6dc3fcdd7

View File

@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
if not IPA_AVAILABLE: if not IPA_AVAILABLE:
return entries return entries
# IPA phonetics only appear in the ENGLISH field of vocab tables.
# German and example fields contain meaningful parenthetical content:
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
# These must NEVER be processed as phonetic transcriptions.
replaced_count = 0 replaced_count = 0
for entry in entries: for entry in entries:
for field in ('english', 'german', 'example'): text = entry.get('english', '') or ''
text = entry.get(field, '') or '' if not any(ch in text for ch in '[{('):
# Check for any bracket type — Tesseract garbles [ into { or ( continue
if not any(ch in text for ch in '[{('): new_text = _replace_phonetics_in_text(text, pronunciation)
continue if new_text != text:
new_text = _replace_phonetics_in_text(text, pronunciation) logger.info(f"_fix_phonetic_brackets: english '{text}''{new_text}'")
if new_text != text: replaced_count += 1
logger.info(f"_fix_phonetic_brackets: {field} '{text}''{new_text}'") entry['english'] = new_text
replaced_count += 1
else:
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
entry[field] = new_text
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries") logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
return entries return entries
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
def _is_grammar_bracket_content(content: str) -> bool: def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info or a German morpheme. """Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with) Grammar info: cross (with), complain (about/of), agree (on/with)
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen) NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
Strategy: check each slash-separated token. If ALL tokens are known Since we only process the English field, we only need to recognize
grammar words or German affixes, it's grammar info. Otherwise it English grammar particles. Everything else is (garbled) IPA.
might be garbled IPA.
""" """
if not content: if not content:
return False return False
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
if not tokens: if not tokens:
return False return False
for token in tokens: # ALL tokens must be known grammar words
# Each token must be pure letters return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
return False
# Check if it's a known grammar word
if token in _GRAMMAR_BRACKET_WORDS:
continue
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
# But NOT things that look like garbled IPA transcriptions.
# Heuristic: if it's a common German suffix or a long word, keep it.
if len(token) >= 4:
continue
# Short unknown token — likely garbled IPA
return False
return True
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: