From 4afd5bd8e81c6bd0096df93990b7f0f78568e6c1 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 22:47:01 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20Klammerw=C3=B6rter=20wie=20(probieren),?= =?UTF-8?q?=20(Profit)=20nicht=20mehr=20als=20garbled=20IPA=20entfernen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _strip_orphan_bracket entfernte deutsche Bedeutungsangaben in Klammern, weil sie weder als Grammar-Partikel noch als IPA erkannt wurden. Fix: Klammerinhalte mit echten Wörtern (>=4 Buchstaben) werden behalten. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 3d1f89e..ee43b73 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -876,6 +876,12 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str # Keep correct IPA (contains Unicode IPA characters) if any(ch in _IPA_CHARS for ch in content): return m.group(0) + # Keep real-word parentheticals like (probieren), (Profit), (Geld). + # Garbled IPA fragments are usually short nonsense like (cy). NOTE: the check below only + # counts letters (no casing check), so 4-letter fragments like (kros), (mais) are kept too. + content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content) + if len(content_alpha) >= 4: + return m.group(0) logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") return ''