fix: only replace phonetics in english field, fix grammar detection

- Only process the 'english' field for IPA replacement. The German and
  example fields contain meaningful parenthetical content like
  (gefrorenes Wasser), (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: bracket content is preserved
  only when every token is a known grammar particle (with, about/of,
  sth, etc.). Remove the >= 4 chars heuristic that incorrectly
  preserved garbled IPA like [breik], [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 23:19:03 +01:00
parent edbdac3203
commit e6dc3fcdd7

View File

@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
if not IPA_AVAILABLE:
return entries
# IPA phonetics only appear in the ENGLISH field of vocab tables.
# German and example fields contain meaningful parenthetical content:
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
# These must NEVER be processed as phonetic transcriptions.
replaced_count = 0
for entry in entries:
for field in ('english', 'german', 'example'):
text = entry.get(field, '') or ''
# Check for any bracket type — Tesseract garbles [ into { or (
if not any(ch in text for ch in '[{('):
continue
new_text = _replace_phonetics_in_text(text, pronunciation)
if new_text != text:
logger.info(f"_fix_phonetic_brackets: {field} '{text}''{new_text}'")
replaced_count += 1
else:
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
entry[field] = new_text
text = entry.get('english', '') or ''
if not any(ch in text for ch in '[{('):
continue
new_text = _replace_phonetics_in_text(text, pronunciation)
if new_text != text:
logger.info(f"_fix_phonetic_brackets: english '{text}''{new_text}'")
replaced_count += 1
entry['english'] = new_text
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
return entries
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info or a German morpheme.
"""Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with)
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
Strategy: check each slash-separated token. If ALL tokens are known
grammar words or German affixes, it's grammar info. Otherwise it
might be garbled IPA.
Since we only process the English field, we only need to recognize
English grammar particles. Everything else is (garbled) IPA.
"""
if not content:
return False
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
if not tokens:
return False
for token in tokens:
# Each token must be pure letters
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
return False
# Check if it's a known grammar word
if token in _GRAMMAR_BRACKET_WORDS:
continue
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
# But NOT things that look like garbled IPA transcriptions.
# Heuristic: if it's a common German suffix or a long word, keep it.
if len(token) >= 4:
continue
# Short unknown token — likely garbled IPA
return False
return True
# ALL tokens must be known grammar words
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: