fix: column_text nur garbled IPA ersetzen, keine Einfuegung/Entfernung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s
Fuer column_text (Full-Page Overlay mit gemischtem EN+DE Text):
- Kein IPA einfuegen (wuerde Token-Count aendern, Overlay-Positionen brechen)
- Keine orphan brackets entfernen (sind oft deutsche Bedeutungen wie (probieren))
- Nur garbled IPA ersetzen (z.B. [teıst] -> [tˈeɪst])

column_en behaelt volle Verarbeitung (replace + strip + insert).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool:
|
||||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
def _replace_phonetics_in_text(
|
||||
text: str,
|
||||
pronunciation: str = 'british',
|
||||
strip_orphans: bool = True,
|
||||
) -> str:
|
||||
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
|
||||
|
||||
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
|
||||
We match any bracket type and replace with dictionary IPA if found.
|
||||
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
|
||||
|
||||
Args:
|
||||
strip_orphans: If True, strip orphan brackets that look like garbled IPA.
|
||||
Set to False for column_text where brackets may be German content.
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return text
|
||||
@@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
||||
|
||||
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
|
||||
# Second pass: strip remaining orphan brackets that are garbled IPA.
|
||||
# These have no word before them (the main regex requires \b word \s* bracket).
|
||||
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
|
||||
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
|
||||
def _strip_orphan_bracket(m):
|
||||
content = m.group(1).strip()
|
||||
# Keep grammar info: (sich beschweren), (about/of)
|
||||
if _is_grammar_bracket_content(content):
|
||||
return m.group(0)
|
||||
# Keep correct IPA (contains Unicode IPA characters)
|
||||
if any(ch in _IPA_CHARS for ch in content):
|
||||
return m.group(0)
|
||||
# Keep real-word parentheticals like (probieren), (Profit), (Geld).
|
||||
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
|
||||
# — they never contain a real word ≥4 letters with proper casing.
|
||||
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
|
||||
if len(content_alpha) >= 4:
|
||||
return m.group(0)
|
||||
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
|
||||
return ''
|
||||
if strip_orphans:
|
||||
# Second pass: strip remaining orphan brackets that are garbled IPA.
|
||||
# These have no word before them (the main regex requires \b word \s* bracket).
|
||||
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
|
||||
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
|
||||
def _strip_orphan_bracket(m):
|
||||
content = m.group(1).strip()
|
||||
# Keep grammar info: (sich beschweren), (about/of)
|
||||
if _is_grammar_bracket_content(content):
|
||||
return m.group(0)
|
||||
# Keep correct IPA (contains Unicode IPA characters)
|
||||
if any(ch in _IPA_CHARS for ch in content):
|
||||
return m.group(0)
|
||||
# Keep real-word parentheticals like (probieren), (Profit), (Geld).
|
||||
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
|
||||
# — they never contain a real word ≥4 letters with proper casing.
|
||||
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
|
||||
if len(content_alpha) >= 4:
|
||||
return m.group(0)
|
||||
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
|
||||
return ''
|
||||
|
||||
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
|
||||
|
||||
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
@@ -952,17 +962,17 @@ def fix_cell_phonetics(
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
This function:
|
||||
1. Replaces garbled IPA brackets with correct dictionary IPA
|
||||
2. Inserts missing IPA for English headwords that have no brackets
|
||||
|
||||
Only processes cells in English-like columns (column_en, column_text).
|
||||
German columns are never processed (they contain meaningful parentheses).
|
||||
Processing depends on column type:
|
||||
- column_en: Full processing (replace garbled IPA + strip orphan brackets
|
||||
+ insert missing IPA). Safe because these cells contain only English
|
||||
headwords.
|
||||
- column_text: Light processing (replace garbled IPA ONLY). No orphan
|
||||
bracket stripping (brackets may be German content like "(probieren)")
|
||||
and no IPA insertion (would add tokens and break overlay positioning).
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return cells
|
||||
|
||||
# Column types where IPA processing makes sense
|
||||
ipa_col_types = {'column_en', 'column_text'}
|
||||
replaced = 0
|
||||
|
||||
@@ -974,11 +984,17 @@ def fix_cell_phonetics(
|
||||
if not text.strip():
|
||||
continue
|
||||
|
||||
# Step 1: replace garbled IPA brackets
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||
# Step 2: insert missing IPA if no brackets were present
|
||||
if new_text == text:
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
if col_type == 'column_en':
|
||||
# Full processing: replace garbled IPA, strip orphan brackets,
|
||||
# insert missing IPA
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
|
||||
if new_text == text:
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
else:
|
||||
# column_text: only replace garbled IPA brackets, nothing else.
|
||||
# No orphan stripping (would remove German parentheticals).
|
||||
# No IPA insertion (would add tokens, breaking overlay positioning).
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
|
||||
if new_text != text:
|
||||
logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
|
||||
|
||||
Reference in New Issue
Block a user