fix: column_text nur garbled IPA ersetzen, keine Einfuegung/Entfernung

Fuer column_text (Full-Page Overlay mit gemischtem EN+DE Text):
- Kein IPA einfuegen (wuerde Token-Count aendern, Overlay-Positionen brechen)
- Keine orphan brackets entfernen (sind oft deutsche Bedeutungen wie (probieren))
- Nur garbled IPA ersetzen (z.B. [teıst] -> [tˈeɪst])

column_en behaelt volle Verarbeitung (replace + strip + insert).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 23:05:37 +01:00
parent 4afd5bd8e8
commit aa7db43f02
1 changed files with 50 additions and 34 deletions
@@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool:
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


-def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
+def _replace_phonetics_in_text(
+    text: str,
+    pronunciation: str = 'british',
+    strip_orphans: bool = True,
+) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
+
+    Args:
+        strip_orphans: If True, strip orphan brackets that look like garbled IPA.
+            Set to False for column_text where brackets may be German content.
    """
    if not IPA_AVAILABLE:
        return text
@@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

-    # Second pass: strip remaining orphan brackets that are garbled IPA.
-    # These have no word before them (the main regex requires \b word \s* bracket).
-    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
-    # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
-    def _strip_orphan_bracket(m):
-        content = m.group(1).strip()
-        # Keep grammar info: (sich beschweren), (about/of)
-        if _is_grammar_bracket_content(content):
-            return m.group(0)
-        # Keep correct IPA (contains Unicode IPA characters)
-        if any(ch in _IPA_CHARS for ch in content):
-            return m.group(0)
-        # Keep real-word parentheticals like (probieren), (Profit), (Geld).
-        # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
-        # — they never contain a real word ≥4 letters with proper casing.
-        content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
-        if len(content_alpha) >= 4:
-            return m.group(0)
-        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
-        return ''
+    if strip_orphans:
+        # Second pass: strip remaining orphan brackets that are garbled IPA.
+        # These have no word before them (the main regex requires \b word \s* bracket).
+        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
+        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
+        def _strip_orphan_bracket(m):
+            content = m.group(1).strip()
+            # Keep grammar info: (sich beschweren), (about/of)
+            if _is_grammar_bracket_content(content):
+                return m.group(0)
+            # Keep correct IPA (contains Unicode IPA characters)
+            if any(ch in _IPA_CHARS for ch in content):
+                return m.group(0)
+            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
+            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
+            # — they never contain a real word ≥4 letters with proper casing.
+            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
+            if len(content_alpha) >= 4:
+                return m.group(0)
+            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
+            return ''
+
+        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

-    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()

    return text
@@ -952,17 +962,17 @@ def fix_cell_phonetics(
    (entry['english']).  But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.

-    This function:
-    1. Replaces garbled IPA brackets with correct dictionary IPA
-    2. Inserts missing IPA for English headwords that have no brackets
-
-    Only processes cells in English-like columns (column_en, column_text).
-    German columns are never processed (they contain meaningful parentheses).
+    Processing depends on column type:
+    - column_en: Full processing (replace garbled IPA + strip orphan brackets
+      + insert missing IPA). Safe because these cells contain only English
+      headwords.
+    - column_text: Light processing (replace garbled IPA ONLY). No orphan
+      bracket stripping (brackets may be German content like "(probieren)")
+      and no IPA insertion (would add tokens and break overlay positioning).
    """
    if not IPA_AVAILABLE:
        return cells

-    # Column types where IPA processing makes sense
    ipa_col_types = {'column_en', 'column_text'}
    replaced = 0

@@ -974,11 +984,17 @@ def fix_cell_phonetics(
        if not text.strip():
            continue

-        # Step 1: replace garbled IPA brackets
-        new_text = _replace_phonetics_in_text(text, pronunciation)
-        # Step 2: insert missing IPA if no brackets were present
-        if new_text == text:
-            new_text = _insert_missing_ipa(text, pronunciation)
+        if col_type == 'column_en':
+            # Full processing: replace garbled IPA, strip orphan brackets,
+            # insert missing IPA
+            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
+            if new_text == text:
+                new_text = _insert_missing_ipa(text, pronunciation)
+        else:
+            # column_text: only replace garbled IPA brackets, nothing else.
+            # No orphan stripping (would remove German parentheticals).
+            # No IPA insertion (would add tokens, breaking overlay positioning).
+            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)

        if new_text != text:
            logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")