From cde13c9623b87e14dc8910ccf8baed3e3479dbe2 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sat, 11 Apr 2026 22:13:45 +0200
Subject: [PATCH] =?UTF-8?q?Fix=20IPA=20stripping=20digits=20after=20headwo?=
 =?UTF-8?q?rds=20(Theme=201=20=E2=86=92=20Theme)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_insert_missing_ipa() stripped the "1" from "Theme 1" because it treated
the digit as garbled OCR phonetics. It now treats tokens made up only of
digits and numbering punctuation (e.g. "1", "2.", "3)") as delimiters
that stop the garble-stripping, so the token and everything after it
are kept.

Also fixes _has_non_dict_trailing(), which incorrectly flagged "Theme 1"
as having non-dictionary trailing text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 6a2ca8e..ba2d1e1 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
                 if wj in ('–', '—', '-', '/', '|', ',', ';'):
                     kept.extend(words[j:])
                     break
+                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
+                if re.match(r'^[\d.)\-]+$', wj):
+                    kept.extend(words[j:])
+                    break
                 # Starts with uppercase — likely German or proper noun
                 clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                 if clean_j and clean_j[0].isupper():
@@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
         wj = words[j]
         if wj in ('–', '—', '-', '/', '|', ',', ';'):
             return False
+        # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
+        if re.match(r'^[\d.)\-]+$', wj):
+            return False
         clean_j = re.sub(r'[^a-zA-Z]', '', wj)
         if clean_j and clean_j[0].isupper():
             return False