From cde13c9623b87e14dc8910ccf8baed3e3479dbe2 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 22:13:45 +0200 Subject: [PATCH] =?UTF-8?q?Fix=20IPA=20stripping=20digits=20after=20headwo?= =?UTF-8?q?rds=20(Theme=201=20=E2=86=92=20Theme)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _insert_missing_ipa stripped "1" from "Theme 1" because it treated the digit as garbled OCR phonetics. Now treats pure digits/numbering patterns (1, 2., 3)) as delimiters that stop the garble-stripping. Also fixes _has_non_dict_trailing which incorrectly flagged "Theme 1" as having non-dictionary trailing text. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/cv_ocr_engines.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 6a2ca8e..ba2d1e1 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: if wj in ('–', '—', '-', '/', '|', ',', ';'): kept.extend(words[j:]) break + # Pure digits or numbering (e.g. "1", "2.", "3)") — keep + if re.match(r'^[\d.)\-]+$', wj): + kept.extend(words[j:]) + break # Starts with uppercase — likely German or proper noun clean_j = re.sub(r'[^a-zA-Z]', '', wj) if clean_j and clean_j[0].isupper(): @@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool: wj = words[j] if wj in ('–', '—', '-', '/', '|', ',', ';'): return False + # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA + if re.match(r'^[\d.)\-]+$', wj): + return False clean_j = re.sub(r'[^a-zA-Z]', '', wj) if clean_j and clean_j[0].isupper(): return False