Fix IPA stripping digits after headwords (Theme 1 → Theme)

_insert_missing_ipa stripped "1" from "Theme 1" because it treated the digit as garbled OCR phonetics. Now treats pure digits/numbering patterns (1, 2., 3)) as delimiters that stop the garble-stripping. Also fixes _has_non_dict_trailing which incorrectly flagged "Theme 1" as having non-dictionary trailing text. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 22:13:45 +02:00
parent 2e42167c73
commit cde13c9623
1 changed files with 7 additions and 0 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
+                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
+                if re.match(r'^[\d.)\-]+$', wj):
+                    kept.extend(words[j:])
+                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
@@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
        wj = words[j]
        if wj in ('–', '—', '-', '/', '|', ',', ';'):
            return False
+        # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
+        if re.match(r'^[\d.)\-]+$', wj):
+            return False
        clean_j = re.sub(r'[^a-zA-Z]', '', wj)
        if clean_j and clean_j[0].isupper():
            return False