diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 6a2ca8e..ba2d1e1 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: if wj in ('–', '—', '-', '/', '|', ',', ';'): kept.extend(words[j:]) break + # Pure digits or numbering (e.g. "1", "2.", "3)") — keep + if re.match(r'^[\d.)\-]+$', wj): + kept.extend(words[j:]) + break # Starts with uppercase — likely German or proper noun clean_j = re.sub(r'[^a-zA-Z]', '', wj) if clean_j and clean_j[0].isupper(): @@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool: wj = words[j] if wj in ('–', '—', '-', '/', '|', ',', ';'): return False + # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA + if re.match(r'^[\d.)\-]+$', wj): + return False clean_j = re.sub(r'[^a-zA-Z]', '', wj) if clean_j and clean_j[0].isupper(): return False