fix: in column_text nur garbled IPA ersetzen, keine Einfügung/Entfernung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s

Für column_text (Full-Page-Overlay mit gemischtem EN+DE-Text):
- Kein IPA einfügen (würde den Token-Count ändern und damit die Overlay-Positionen brechen)
- Keine Orphan-Brackets entfernen (sind oft deutsche Bedeutungen wie (probieren))
- Nur garbled IPA ersetzen (z. B. [teıst] -> [tˈeɪst])

column_en behält die volle Verarbeitung (replace + strip + insert).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-11 23:05:37 +01:00
parent 4afd5bd8e8
commit aa7db43f02

View File

@@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool:
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
def _replace_phonetics_in_text(
text: str,
pronunciation: str = 'british',
strip_orphans: bool = True,
) -> str:
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
We match any bracket type and replace with dictionary IPA if found.
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
Args:
strip_orphans: If True, strip orphan brackets that look like garbled IPA.
Set to False for column_text where brackets may be German content.
"""
if not IPA_AVAILABLE:
return text
@@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
# Second pass: strip remaining orphan brackets that are garbled IPA.
# These have no word before them (the main regex requires \b word \s* bracket).
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
def _strip_orphan_bracket(m):
content = m.group(1).strip()
# Keep grammar info: (sich beschweren), (about/of)
if _is_grammar_bracket_content(content):
return m.group(0)
# Keep correct IPA (contains Unicode IPA characters)
if any(ch in _IPA_CHARS for ch in content):
return m.group(0)
# Keep real-word parentheticals like (probieren), (Profit), (Geld).
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
# — they never contain a real word ≥4 letters with proper casing.
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
if len(content_alpha) >= 4:
return m.group(0)
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
return ''
if strip_orphans:
# Second pass: strip remaining orphan brackets that are garbled IPA.
# These have no word before them (the main regex requires \b word \s* bracket).
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
def _strip_orphan_bracket(m):
content = m.group(1).strip()
# Keep grammar info: (sich beschweren), (about/of)
if _is_grammar_bracket_content(content):
return m.group(0)
# Keep correct IPA (contains Unicode IPA characters)
if any(ch in _IPA_CHARS for ch in content):
return m.group(0)
# Keep real-word parentheticals like (probieren), (Profit), (Geld).
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
# — they never contain a real word ≥4 letters with proper casing.
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
if len(content_alpha) >= 4:
return m.group(0)
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
return ''
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = text.strip()
return text
@@ -952,17 +962,17 @@ def fix_cell_phonetics(
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
This function:
1. Replaces garbled IPA brackets with correct dictionary IPA
2. Inserts missing IPA for English headwords that have no brackets
Only processes cells in English-like columns (column_en, column_text).
German columns are never processed (they contain meaningful parentheses).
Processing depends on column type:
- column_en: Full processing (replace garbled IPA + strip orphan brackets
+ insert missing IPA). Safe because these cells contain only English
headwords.
- column_text: Light processing (replace garbled IPA ONLY). No orphan
bracket stripping (brackets may be German content like "(probieren)")
and no IPA insertion (would add tokens and break overlay positioning).
"""
if not IPA_AVAILABLE:
return cells
# Column types where IPA processing makes sense
ipa_col_types = {'column_en', 'column_text'}
replaced = 0
@@ -974,11 +984,17 @@ def fix_cell_phonetics(
if not text.strip():
continue
# Step 1: replace garbled IPA brackets
new_text = _replace_phonetics_in_text(text, pronunciation)
# Step 2: insert missing IPA if no brackets were present
if new_text == text:
new_text = _insert_missing_ipa(text, pronunciation)
if col_type == 'column_en':
# Full processing: replace garbled IPA, strip orphan brackets,
# insert missing IPA
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
if new_text == text:
new_text = _insert_missing_ipa(text, pronunciation)
else:
# column_text: only replace garbled IPA brackets, nothing else.
# No orphan stripping (would remove German parentheticals).
# No IPA insertion (would add tokens, breaking overlay positioning).
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
if new_text != text:
logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'")