diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 88061cd..a2bab0e 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets( if not IPA_AVAILABLE: return entries + # IPA phonetics only appear in the ENGLISH field of vocab tables. + # German and example fields contain meaningful parenthetical content: + # german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)" + # example: "(sich beschweren)", "(brauchen)", "(jammern)" + # These must NEVER be processed as phonetic transcriptions. replaced_count = 0 for entry in entries: - for field in ('english', 'german', 'example'): - text = entry.get(field, '') or '' - # Check for any bracket type — Tesseract garbles [ into { or ( - if not any(ch in text for ch in '[{('): - continue - new_text = _replace_phonetics_in_text(text, pronunciation) - if new_text != text: - logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'") - replaced_count += 1 - else: - logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'") - entry[field] = new_text + text = entry.get('english', '') or '' + if not any(ch in text for ch in '[{('): + continue + new_text = _replace_phonetics_in_text(text, pronunciation) + if new_text != text: + logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'") + replaced_count += 1 + entry['english'] = new_text logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries") return entries @@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({ def _is_grammar_bracket_content(content: str) -> bool: - """Return True if bracket content is grammar info or a German morpheme. + """Return True if bracket content is grammar info in the ENGLISH field. Grammar info: cross (with), complain (about/of), agree (on/with) - German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen) - NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo] + NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test] - Strategy: check each slash-separated token. If ALL tokens are known - grammar words or German affixes, it's grammar info. Otherwise it - might be garbled IPA. + Since we only process the English field, we only need to recognize + English grammar particles. Everything else is (garbled) IPA. """ if not content: return False @@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool: if not tokens: return False - for token in tokens: - # Each token must be pure letters - if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token): - return False - # Check if it's a known grammar word - if token in _GRAMMAR_BRACKET_WORDS: - continue - # German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen" - # But NOT things that look like garbled IPA transcriptions. - # Heuristic: if it's a common German suffix or a long word, keep it. - if len(token) >= 4: - continue - # Short unknown token — likely garbled IPA - return False - - return True + # ALL tokens must be known grammar words + return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens) def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: