diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 3d1f89e..ee43b73 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -876,6 +876,12 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str # Keep correct IPA (contains Unicode IPA characters) if any(ch in _IPA_CHARS for ch in content): return m.group(0) + # Keep real-word parentheticals like (probieren), (Profit), (Geld). + # Heuristic: keep any bracket whose content has ≥4 letters; casing is NOT checked, + # so 4-letter garbled IPA like (kros) or (mais) is kept too; only (cy) is stripped. + content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content) + if len(content_alpha) >= 4: + return m.group(0) logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") return ''