From ab2423bd107ab78bfc4b1547f98540e39d02c445 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 3 Mar 2026 16:46:45 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20protect=20numbered=20list=20prefixes=20f?=
 =?UTF-8?q?rom=201=E2=86=92I=20confusion=20in=20char=20fix=20step?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_CHAR_CONFUSION_RULES: the standalone "1" → "I" rule now skips "1." and "1,"
(lookahead (?![\d.,])); the "|" → "I" rule likewise skips "|." and "|,".
Cross-language fallback rule: same lookahead (?![\d.,]) added.
Fixes: "cross = 1. Kreuz" being converted to "cross = I. Kreuz" in Step 1.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1303c22..0613c95 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3429,11 +3429,12 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
 # Common OCR confusion pairs in vocabulary context
 _CHAR_CONFUSION_RULES = [
     # "1" at word start followed by lowercase → likely "I" or "l"
-    (re.compile(r'\b1([a-z])'), r'I\1'),        # 1ch → Ich, 1 want → I want
-    # Standalone "1" between words → "I" (English pronoun)
-    (re.compile(r'(?<!\d)\b1\b(?!\d)'), 'I'),   # "1 want" → "I want"
-    # "|" used as "I" or "l"
-    (re.compile(r'(?<!\|)\|(?!\|)'), 'I'),       # |ch → Ich
+    # No "."/"," guard needed here: the required lowercase letter already excludes "1. Kreuz"
+    (re.compile(r'\b1([a-z])'), r'I\1'),           # 1ch → Ich, 1want → Iwant
+    # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
+    (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'),  # "1 want" → "I want"
+    # "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
+    (re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'),    # |ch → Ich, | want → I want
 ]
 
 # Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
@@ -3463,7 +3464,7 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An
         de_lower_words = set(de.lower().replace(',', ' ').split())
         if de_lower_words & _DE_INDICATORS_FOR_EN_I:
             # Any remaining "1" in EN that looks like "I"
-            en = re.sub(r'\b1\b', 'I', en)
+            en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
 
         # Fix "y " artifact before repeated word: "y you" → "you"
         en = re.sub(r'\by\s+([a-z])', r'\1', en)