fix: improve phonetic bracket replacement logic

- Replace _is_meaningful_bracket_content with _is_grammar_bracket_content,
  which uses a whitelist of grammar particles (with, about/of, auf, etc.)
- Check the IPA dictionary FIRST: if the word has IPA, treat brackets as phonetic
- Strip orphan brackets (no word before them) that are garbled IPA
- Preserve correct IPA (contains Unicode IPA chars) and grammar info
- Fix variable name bug (result → text)

Fixes: break [breik] is now correctly replaced, cross (with) is preserved,
and orphan [mais] and {'mani setva] are stripped.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 23:13:34 +01:00
parent 99573a46ef
commit edbdac3203
1 changed file with 85 additions and 30 deletions
@@ -4292,32 +4292,58 @@ def _fix_phonetic_brackets(
    return entries


-# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
-_GERMAN_BRACKET_PREFIXES = frozenset({
-    'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
-    'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
-    'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
+# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of)
+# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen)
+# These should NEVER be replaced with IPA.
+_GRAMMAR_BRACKET_WORDS = frozenset({
+    # English prepositions/particles commonly in vocab tables
+    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
+    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
+    # German prepositions/particles
+    'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin',
+    'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück',
+    # German verb prefixes (in parentheses before verb stems)
+    'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer',
+    # Abbreviations
+    'sth', 'sb', 'adj', 'adv',
 })


-def _is_meaningful_bracket_content(content: str) -> bool:
-    """Return True if bracket content is a meaningful word/prefix, not garbled IPA.
+def _is_grammar_bracket_content(content: str) -> bool:
+    """Return True if bracket content is grammar info or a German morpheme.

-    Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
-    Garbled IPA: {'tfatno, (cy, 1u], 'daens
+    Grammar info:  cross (with), complain (about/of), agree (on/with)
+    German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
+    NOT grammar:   [breik], {'tfatno], (cy), ['kju:kambo]
+
+    Strategy: check each slash-separated token. If ALL tokens are known
+    grammar words or German affixes, it's grammar info. Otherwise it
+    might be garbled IPA.
    """
    if not content:
        return False
-    # Must be pure letters (no digits, punctuation, IPA symbols)
-    if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
+
+    # Split on / for patterns like (about/of), (on/with)
+    tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
+    if not tokens:
        return False
-    # Known German prefix
-    if content.lower() in _GERMAN_BRACKET_PREFIXES:
-        return True
-    # Long enough to be a real word (not 1-2 char garbled IPA like "cy")
-    if len(content) >= 4:
-        return True
-    return False
+
+    for token in tokens:
+        # Each token must be pure letters
+        if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
+            return False
+        # Check if it's a known grammar word
+        if token in _GRAMMAR_BRACKET_WORDS:
+            continue
+        # German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
+        # But NOT things that look like garbled IPA transcriptions.
+        # Heuristic: if it's a common German suffix or a long word, keep it.
+        if len(token) >= 4:
+            continue
+        # Short unknown token — likely garbled IPA
+        return False
+
+    return True


 def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
@@ -4340,22 +4366,51 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
            logger.info(f"  phonetic replacer: SKIP (too many words) '{full_match}'")
            return full_match

-        # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
-        # Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
-        if _is_meaningful_bracket_content(bracket_content):
-            logger.info(f"  phonetic replacer: SKIP (meaningful content) '{full_match}'")
+        # Look up IPA for the word before brackets
+        ipa = _lookup_ipa(word, pronunciation)
+
+        if ipa:
+            # Word has IPA → bracket content is phonetic (garbled or correct).
+            # Exception: grammar particles like cross (with) — keep those.
+            if _is_grammar_bracket_content(bracket_content):
+                # Grammar info followed by garbled IPA? E.g. "cross (with) [kros]"
+                # Keep the grammar part, IPA will be handled as orphan bracket.
+                logger.info(f"  phonetic replacer: SKIP (grammar info) '{full_match}'")
+                return full_match
+            logger.info(f"  phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
+            return f"{word} [{ipa}]"
+
+        # No IPA for this word — keep grammar info, strip garbled IPA
+        if _is_grammar_bracket_content(bracket_content):
+            logger.info(f"  phonetic replacer: SKIP (grammar, no IPA) '{full_match}'")
            return full_match

-        # Look up in IPA dictionary
-        ipa = _lookup_ipa(word, pronunciation)
-        if not ipa:
-            logger.info(f"  phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
-            return full_match  # Keep original
+        logger.info(f"  phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
+        return full_match

-        logger.info(f"  phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
-        return f"{word} [{ipa}]"
+    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

-    return _PHONETIC_BRACKET_RE.sub(replacer, text)
+    # Second pass: strip remaining orphan brackets that are garbled IPA.
+    # These have no word before them (the main regex requires \b word \s* bracket).
+    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
+    # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
+    _IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ')
+
+    def _strip_orphan_bracket(m):
+        content = m.group(1).strip()
+        # Keep grammar info: (sich beschweren), (auf), (about/of)
+        if _is_grammar_bracket_content(content):
+            return m.group(0)
+        # Keep correct IPA (contains Unicode IPA characters)
+        if any(ch in _IPA_CHARS for ch in content):
+            return m.group(0)
+        logger.info(f"  phonetic: stripping orphan bracket '{m.group(0)}'")
+        return ''
+
+    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
+    text = text.strip()
+
+    return text


 def _assign_row_words_to_columns(