fix: column_text nur garbled IPA ersetzen, keine Einfuegung/Entfernung

Fuer column_text (Full-Page Overlay mit gemischtem EN+DE Text):
- Kein IPA einfuegen (wuerde Token-Count aendern, Overlay-Positionen brechen)
- Keine orphan brackets entfernen (sind oft deutsche Bedeutungen wie (probieren))
- Nur garbled IPA ersetzen (z.B. [teıst] -> [tˈeɪst])

column_en behaelt volle Verarbeitung (replace + strip + insert).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 23:05:37 +01:00
parent 4afd5bd8e8
commit aa7db43f02
1 changed files with 50 additions and 34 deletions
@@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool:
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
-def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
+def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
 ) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
    Args:
        strip_orphans: If True, strip orphan brackets that look like garbled IPA.
            Set to False for column_text where brackets may be German content.
    """
    if not IPA_AVAILABLE:
        return text
@@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    text = _PHONETIC_BRACKET_RE.sub(replacer, text)
-    # Second pass: strip remaining orphan brackets that are garbled IPA.
+    if strip_orphans:
-    # These have no word before them (the main regex requires \b word \s* bracket).
+        # Second pass: strip remaining orphan brackets that are garbled IPA.
-    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
+        # These have no word before them (the main regex requires \b word \s* bracket).
-    # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
+        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
-    def _strip_orphan_bracket(m):
+        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
-        content = m.group(1).strip()
+        def _strip_orphan_bracket(m):
-        # Keep grammar info: (sich beschweren), (about/of)
+            content = m.group(1).strip()
-        if _is_grammar_bracket_content(content):
+            # Keep grammar info: (sich beschweren), (about/of)
-            return m.group(0)
+            if _is_grammar_bracket_content(content):
-        # Keep correct IPA (contains Unicode IPA characters)
+                return m.group(0)
-        if any(ch in _IPA_CHARS for ch in content):
+            # Keep correct IPA (contains Unicode IPA characters)
-            return m.group(0)
+            if any(ch in _IPA_CHARS for ch in content):
-        # Keep real-word parentheticals like (probieren), (Profit), (Geld).
+                return m.group(0)
-        # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
+            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
-        # — they never contain a real word ≥4 letters with proper casing.
+            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
-        content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
+            # — they never contain a real word ≥4 letters with proper casing.
-        if len(content_alpha) >= 4:
+            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
-            return m.group(0)
+            if len(content_alpha) >= 4:
-        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
+                return m.group(0)
-        return ''
+            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''
        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()
    return text
@@ -952,17 +962,17 @@ def fix_cell_phonetics(
    (entry['english']).  But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.
-    This function:
+    Processing depends on column type:
-    1. Replaces garbled IPA brackets with correct dictionary IPA
+    - column_en: Full processing (replace garbled IPA + strip orphan brackets
-    2. Inserts missing IPA for English headwords that have no brackets
+      + insert missing IPA). Safe because these cells contain only English
-
+      headwords.
-    Only processes cells in English-like columns (column_en, column_text).
+    - column_text: Light processing (replace garbled IPA ONLY). No orphan
-    German columns are never processed (they contain meaningful parentheses).
+      bracket stripping (brackets may be German content like "(probieren)")
      and no IPA insertion (would add tokens and break overlay positioning).
    """
    if not IPA_AVAILABLE:
        return cells
    # Column types where IPA processing makes sense
    ipa_col_types = {'column_en', 'column_text'}
    replaced = 0
@@ -974,11 +984,17 @@ def fix_cell_phonetics(
        if not text.strip():
            continue
-        # Step 1: replace garbled IPA brackets
+        if col_type == 'column_en':
-        new_text = _replace_phonetics_in_text(text, pronunciation)
+            # Full processing: replace garbled IPA, strip orphan brackets,
-        # Step 2: insert missing IPA if no brackets were present
+            # insert missing IPA
-        if new_text == text:
+            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
-            new_text = _insert_missing_ipa(text, pronunciation)
+            if new_text == text:
                new_text = _insert_missing_ipa(text, pronunciation)
        else:
            # column_text: only replace garbled IPA brackets, nothing else.
            # No orphan stripping (would remove German parentheticals).
            # No IPA insertion (would add tokens, breaking overlay positioning).
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
        if new_text != text:
            logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")