fix: only replace phonetics in english field, fix grammar detection

- Only process the 'english' field for IPA replacement. German and example fields contain meaningful parenthetical content like (gefrorenes Wasser), (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: only known grammar particles (with, about/of, sth, etc.) are preserved. This removes the ">= 4 chars" heuristic that incorrectly preserved garbled IPA like [breik], [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 23:19:03 +01:00
parent edbdac3203
commit e6dc3fcdd7
1 changed file with 19 additions and 34 deletions
@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
    if not IPA_AVAILABLE:
        return entries
    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    #   german:  "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
-        for field in ('english', 'german', 'example'):
+        text = entry.get('english', '') or ''
-            text = entry.get(field, '') or ''
+        if not any(ch in text for ch in '[{('):
-            # Check for any bracket type — Tesseract garbles [ into { or (
+            continue
-            if not any(ch in text for ch in '[{('):
+        new_text = _replace_phonetics_in_text(text, pronunciation)
-                continue
+        if new_text != text:
-            new_text = _replace_phonetics_in_text(text, pronunciation)
+            logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'")
-            if new_text != text:
+            replaced_count += 1
-                logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
+        entry['english'] = new_text
                replaced_count += 1
            else:
                logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
            entry[field] = new_text
    logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
    return entries
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
 def _is_grammar_bracket_content(content: str) -> bool:
-    """Return True if bracket content is grammar info or a German morpheme.
+    """Return True if bracket content is grammar info in the ENGLISH field.
    Grammar info:  cross (with), complain (about/of), agree (on/with)
-    German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
+    NOT grammar:   [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
    NOT grammar:   [breik], {'tfatno], (cy), ['kju:kambo]
-    Strategy: check each slash-separated token. If ALL tokens are known
+    Since we only process the English field, we only need to recognize
-    grammar words or German affixes, it's grammar info. Otherwise it
+    English grammar particles. Everything else is (garbled) IPA.
    might be garbled IPA.
    """
    if not content:
        return False
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
    if not tokens:
        return False
-    for token in tokens:
+    # ALL tokens must be known grammar words
-        # Each token must be pure letters
+    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
        if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
            return False
        # Check if it's a known grammar word
        if token in _GRAMMAR_BRACKET_WORDS:
            continue
        # German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
        # But NOT things that look like garbled IPA transcriptions.
        # Heuristic: if it's a common German suffix or a long word, keep it.
        if len(token) >= 4:
            continue
        # Short unknown token — likely garbled IPA
        return False
    return True
 def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: