# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# Group 1 is the word before the bracket, group 2 the raw bracket content.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)


# German prefixes/words commonly in parentheses in vocab tables:
# (zer)brechen, Tanz(veranstaltung)
_GERMAN_BRACKET_PREFIXES = frozenset({
    'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
    'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
    'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
})


def _is_meaningful_bracket_content(content: str) -> bool:
    """Return True if bracket content is a meaningful word/prefix, not garbled IPA.

    Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
    Garbled IPA: {'tfatno, (cy, 1u], 'daens

    Args:
        content: Text found between the brackets, without the brackets
            themselves. It is not stripped here; surrounding whitespace
            makes the content non-meaningful.

    Returns:
        True when ``content`` consists only of letters and is either a known
        German prefix (case-insensitive) or at least four letters long;
        False otherwise, including for the empty string.
    """
    if not content:
        return False
    # Must be pure letters (no digits, punctuation, IPA symbols).
    # fullmatch is used instead of match(r'^...$') because `$` also matches
    # just before a trailing newline, which would let "abc\n" through and
    # then count the newline towards the length check below.
    if not re.fullmatch(r'[a-zA-ZäöüÄÖÜßé]+', content):
        return False
    # Known German prefix
    if content.lower() in _GERMAN_BRACKET_PREFIXES:
        return True
    # Long enough to be a real word; anything of 1-3 letters that is not a
    # known prefix (e.g. "cy") is treated as garbled IPA.
    return len(content) >= 4