fix: broaden phonetic bracket regex to catch Tesseract-garbled IPA
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Tesseract mangles IPA square brackets into curly braces or parentheses
(e.g. China [ˈtʃaɪnə] → China {'tfatno]). The previous regex only
matched [...], missing all garbled variants.
- Match any bracket type: [...], {...}, (...) including mixed pairs
- Add _is_meaningful_bracket_content() to preserve legitimate German
prefixes and word parts like (zer)brechen and Tanz(veranstaltung)
- Trigger IPA replacement on any opening bracket character ([, {, or (), not just [
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4201,9 +4201,11 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
|
||||
|
||||
# --- D. Phonetic Bracket IPA Replacement ---
|
||||
|
||||
# Pattern: word [phonetic] or word (phonetic) — capture the word before brackets
|
||||
# Pattern: word followed by any bracket type containing phonetic content.
|
||||
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
|
||||
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
|
||||
_PHONETIC_BRACKET_RE = re.compile(
|
||||
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
|
||||
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
|
||||
)
|
||||
|
||||
|
||||
@@ -4274,25 +4276,64 @@ def _fix_phonetic_brackets(
|
||||
for entry in entries:
|
||||
for field in ('english', 'german', 'example'):
|
||||
text = entry.get(field, '') or ''
|
||||
if '[' not in text:
|
||||
# Check for any bracket type — Tesseract garbles [ into { or (
|
||||
if not any(ch in text for ch in '[{('):
|
||||
continue
|
||||
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
|
||||
_GERMAN_BRACKET_PREFIXES = frozenset({
|
||||
'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
|
||||
'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
|
||||
'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
|
||||
})
|
||||
|
||||
|
||||
def _is_meaningful_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is a meaningful word/prefix, not garbled IPA.
|
||||
|
||||
Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
|
||||
Garbled IPA: {'tfatno, (cy, 1u], 'daens
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
# Must be pure letters (no digits, punctuation, IPA symbols)
|
||||
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
|
||||
return False
|
||||
# Known German prefix
|
||||
if content.lower() in _GERMAN_BRACKET_PREFIXES:
|
||||
return True
|
||||
# Long enough to be a real word (not 1-2 char garbled IPA like "cy")
|
||||
if len(content) >= 4:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
"""Replace [phonetic] after words with dictionary IPA."""
|
||||
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
|
||||
|
||||
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
|
||||
We match any bracket type and replace with dictionary IPA if found.
|
||||
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return text
|
||||
|
||||
def replacer(match):
|
||||
word = match.group(1)
|
||||
ocr_phonetic = match.group(2)
|
||||
bracket_content = match.group(2).strip()
|
||||
|
||||
# Skip if bracket content looks like regular text (has spaces + capitals)
|
||||
if len(ocr_phonetic.split()) > 3:
|
||||
return match.group(0) # Keep original
|
||||
# Skip if bracket content looks like regular text (multiple words)
|
||||
if len(bracket_content.split()) > 3:
|
||||
return match.group(0)
|
||||
|
||||
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
||||
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
||||
if _is_meaningful_bracket_content(bracket_content):
|
||||
return match.group(0)
|
||||
|
||||
# Look up in IPA dictionary
|
||||
ipa = _lookup_ipa(word, pronunciation)
|
||||
|
||||
Reference in New Issue
Block a user