fix: only replace phonetics in english field, fix grammar detection
- Only process the 'english' field for IPA replacement. The German and example fields contain meaningful parenthetical content such as (gefrorenes Wasser) and (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: only known grammar particles (with, about/of, sth, etc.) are preserved. This removes the ">= 4 chars" heuristic that incorrectly preserved garbled IPA like [breik] and [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
|
|||||||
if not IPA_AVAILABLE:
|
if not IPA_AVAILABLE:
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
# IPA phonetics only appear in the ENGLISH field of vocab tables.
|
||||||
|
# German and example fields contain meaningful parenthetical content:
|
||||||
|
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
|
||||||
|
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
|
||||||
|
# These must NEVER be processed as phonetic transcriptions.
|
||||||
replaced_count = 0
|
replaced_count = 0
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
for field in ('english', 'german', 'example'):
|
text = entry.get('english', '') or ''
|
||||||
text = entry.get(field, '') or ''
|
if not any(ch in text for ch in '[{('):
|
||||||
# Check for any bracket type — Tesseract garbles [ into { or (
|
continue
|
||||||
if not any(ch in text for ch in '[{('):
|
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||||
continue
|
if new_text != text:
|
||||||
new_text = _replace_phonetics_in_text(text, pronunciation)
|
logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'")
|
||||||
if new_text != text:
|
replaced_count += 1
|
||||||
logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
|
entry['english'] = new_text
|
||||||
replaced_count += 1
|
|
||||||
else:
|
|
||||||
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
|
|
||||||
entry[field] = new_text
|
|
||||||
|
|
||||||
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
|
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
|
||||||
return entries
|
return entries
|
||||||
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
|
|||||||
|
|
||||||
|
|
||||||
def _is_grammar_bracket_content(content: str) -> bool:
|
def _is_grammar_bracket_content(content: str) -> bool:
|
||||||
"""Return True if bracket content is grammar info or a German morpheme.
|
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||||||
|
|
||||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||||
German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
|
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||||||
NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo]
|
|
||||||
|
|
||||||
Strategy: check each slash-separated token. If ALL tokens are known
|
Since we only process the English field, we only need to recognize
|
||||||
grammar words or German affixes, it's grammar info. Otherwise it
|
English grammar particles. Everything else is (garbled) IPA.
|
||||||
might be garbled IPA.
|
|
||||||
"""
|
"""
|
||||||
if not content:
|
if not content:
|
||||||
return False
|
return False
|
||||||
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
|
|||||||
if not tokens:
|
if not tokens:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for token in tokens:
|
# ALL tokens must be known grammar words
|
||||||
# Each token must be pure letters
|
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||||
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
|
|
||||||
return False
|
|
||||||
# Check if it's a known grammar word
|
|
||||||
if token in _GRAMMAR_BRACKET_WORDS:
|
|
||||||
continue
|
|
||||||
# German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
|
|
||||||
# But NOT things that look like garbled IPA transcriptions.
|
|
||||||
# Heuristic: if it's a common German suffix or a long word, keep it.
|
|
||||||
if len(token) >= 4:
|
|
||||||
continue
|
|
||||||
# Short unknown token — likely garbled IPA
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||||
|
|||||||
Reference in New Issue
Block a user