fix: only replace phonetics in english field, fix grammar detection
- Only process the 'english' field for IPA replacement. The German and example fields contain meaningful parenthetical content such as (gefrorenes Wasser) and (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: only known grammar particles (with, about/of, sth, etc.) are preserved. This removes the ">= 4 chars" heuristic, which incorrectly preserved garbled IPA such as [breik] and [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
|
||||
if not IPA_AVAILABLE:
|
||||
return entries
|
||||
|
||||
# IPA phonetics only appear in the ENGLISH field of vocab tables.
|
||||
# German and example fields contain meaningful parenthetical content:
|
||||
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
|
||||
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
|
||||
# These must NEVER be processed as phonetic transcriptions.
|
||||
replaced_count = 0
|
||||
for entry in entries:
|
||||
for field in ('english', 'german', 'example'):
|
||||
text = entry.get(field, '') or ''
|
||||
# Check for any bracket type — Tesseract garbles [ into { or (
|
||||
if not any(ch in text for ch in '[{('):
|
||||
continue
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||
if new_text != text:
|
||||
logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
|
||||
replaced_count += 1
|
||||
else:
|
||||
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
|
||||
entry[field] = new_text
|
||||
text = entry.get('english', '') or ''
|
||||
if not any(ch in text for ch in '[{('):
|
||||
continue
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||
if new_text != text:
|
||||
logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'")
|
||||
replaced_count += 1
|
||||
entry['english'] = new_text
|
||||
|
||||
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
|
||||
return entries
|
||||
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
|
||||
|
||||
|
||||
def _is_grammar_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is grammar info or a German morpheme.
|
||||
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||||
|
||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
|
||||
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
|
||||
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||||
|
||||
Strategy: check each slash-separated token. If ALL tokens are known
|
||||
grammar words or German affixes, it's grammar info. Otherwise it
|
||||
might be garbled IPA.
|
||||
Since we only process the English field, we only need to recognize
|
||||
English grammar particles. Everything else is (garbled) IPA.
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
for token in tokens:
|
||||
# Each token must be pure letters
|
||||
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
|
||||
return False
|
||||
# Check if it's a known grammar word
|
||||
if token in _GRAMMAR_BRACKET_WORDS:
|
||||
continue
|
||||
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
|
||||
# But NOT things that look like garbled IPA transcriptions.
|
||||
# Heuristic: if it's a common German suffix or a long word, keep it.
|
||||
if len(token) >= 4:
|
||||
continue
|
||||
# Short unknown token — likely garbled IPA
|
||||
return False
|
||||
|
||||
return True
|
||||
# ALL tokens must be known grammar words
|
||||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
|
||||
Reference in New Issue
Block a user