fix: column_text nur garbled IPA ersetzen, keine Einfügung/Entfernung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s

Für column_text (Full-Page Overlay mit gemischtem EN+DE Text):
- Kein IPA einfügen (würde den Token-Count ändern und damit die Overlay-Positionen brechen)
- Keine orphan brackets entfernen (das sind oft deutsche Bedeutungen wie "(probieren)")
- Nur garbled IPA ersetzen (z.B. [teıst] -> [tˈeɪst])

column_en behält die volle Verarbeitung (replace + strip + insert).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-11 23:05:37 +01:00
parent 4afd5bd8e8
commit aa7db43f02

View File

@@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool:
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens) return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: def _replace_phonetics_in_text(
text: str,
pronunciation: str = 'british',
strip_orphans: bool = True,
) -> str:
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA. """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno]. Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
We match any bracket type and replace with dictionary IPA if found. We match any bracket type and replace with dictionary IPA if found.
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved. Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
Args:
strip_orphans: If True, strip orphan brackets that look like garbled IPA.
Set to False for column_text where brackets may be German content.
""" """
if not IPA_AVAILABLE: if not IPA_AVAILABLE:
return text return text
@@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
text = _PHONETIC_BRACKET_RE.sub(replacer, text) text = _PHONETIC_BRACKET_RE.sub(replacer, text)
# Second pass: strip remaining orphan brackets that are garbled IPA. if strip_orphans:
# These have no word before them (the main regex requires \b word \s* bracket). # Second pass: strip remaining orphan brackets that are garbled IPA.
# Examples: "[mais]", "{'mani setva]", trailing "(kros]" # These have no word before them (the main regex requires \b word \s* bracket).
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
def _strip_orphan_bracket(m): # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
content = m.group(1).strip() def _strip_orphan_bracket(m):
# Keep grammar info: (sich beschweren), (about/of) content = m.group(1).strip()
if _is_grammar_bracket_content(content): # Keep grammar info: (sich beschweren), (about/of)
return m.group(0) if _is_grammar_bracket_content(content):
# Keep correct IPA (contains Unicode IPA characters) return m.group(0)
if any(ch in _IPA_CHARS for ch in content): # Keep correct IPA (contains Unicode IPA characters)
return m.group(0) if any(ch in _IPA_CHARS for ch in content):
# Keep real-word parentheticals like (probieren), (Profit), (Geld). return m.group(0)
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais) # Keep real-word parentheticals like (probieren), (Profit), (Geld).
# — they never contain a real word ≥4 letters with proper casing. # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content) # — they never contain a real word ≥4 letters with proper casing.
if len(content_alpha) >= 4: content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
return m.group(0) if len(content_alpha) >= 4:
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") return m.group(0)
return '' logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
return ''
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = text.strip() text = text.strip()
return text return text
@@ -952,17 +962,17 @@ def fix_cell_phonetics(
(entry['english']). But the overlay reads cell['text'] directly, so (entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too. phonetic fixes must be applied to cells too.
This function: Processing depends on column type:
1. Replaces garbled IPA brackets with correct dictionary IPA - column_en: Full processing (replace garbled IPA + strip orphan brackets
2. Inserts missing IPA for English headwords that have no brackets + insert missing IPA). Safe because these cells contain only English
headwords.
Only processes cells in English-like columns (column_en, column_text). - column_text: Light processing (replace garbled IPA ONLY). No orphan
German columns are never processed (they contain meaningful parentheses). bracket stripping (brackets may be German content like "(probieren)")
and no IPA insertion (would add tokens and break overlay positioning).
""" """
if not IPA_AVAILABLE: if not IPA_AVAILABLE:
return cells return cells
# Column types where IPA processing makes sense
ipa_col_types = {'column_en', 'column_text'} ipa_col_types = {'column_en', 'column_text'}
replaced = 0 replaced = 0
@@ -974,11 +984,17 @@ def fix_cell_phonetics(
if not text.strip(): if not text.strip():
continue continue
# Step 1: replace garbled IPA brackets if col_type == 'column_en':
new_text = _replace_phonetics_in_text(text, pronunciation) # Full processing: replace garbled IPA, strip orphan brackets,
# Step 2: insert missing IPA if no brackets were present # insert missing IPA
if new_text == text: new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
new_text = _insert_missing_ipa(text, pronunciation) if new_text == text:
new_text = _insert_missing_ipa(text, pronunciation)
else:
# column_text: only replace garbled IPA brackets, nothing else.
# No orphan stripping (would remove German parentheticals).
# No IPA insertion (would add tokens, breaking overlay positioning).
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
if new_text != text: if new_text != text:
logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'") logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'")