diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 77ebf50..88075bb 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4273,14 +4273,22 @@ def _fix_phonetic_brackets( if not IPA_AVAILABLE: return entries + replaced_count = 0 for entry in entries: for field in ('english', 'german', 'example'): text = entry.get(field, '') or '' # Check for any bracket type — Tesseract garbles [ into { or ( if not any(ch in text for ch in '[{('): continue - entry[field] = _replace_phonetics_in_text(text, pronunciation) + new_text = _replace_phonetics_in_text(text, pronunciation) + if new_text != text: + logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'") + replaced_count += 1 + else: + logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'") + entry[field] = new_text + logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries") return entries @@ -4325,21 +4333,26 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str def replacer(match): word = match.group(1) bracket_content = match.group(2).strip() + full_match = match.group(0) # Skip if bracket content looks like regular text (multiple words) if len(bracket_content.split()) > 3: - return match.group(0) + logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'") + return full_match # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen, # Tanz(veranstaltung). These are real German morphemes, not garbled IPA. if _is_meaningful_bracket_content(bracket_content): - return match.group(0) + logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'") + return full_match # Look up in IPA dictionary ipa = _lookup_ipa(word, pronunciation) if not ipa: - return match.group(0) # Keep original + logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'") + return full_match # Keep original + logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'") return f"{word} [{ipa}]" return _PHONETIC_BRACKET_RE.sub(replacer, text)