From ab2423bd107ab78bfc4b1547f98540e39d02c445 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Tue, 3 Mar 2026 16:46:45 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20protect=20numbered=20list=20prefixes=20f?=
 =?UTF-8?q?rom=201=E2=86=92I=20confusion=20in=20char=20fix=20step?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_CHAR_CONFUSION_RULES: standalone "1" → "I" now skips "1." and "1,"
Cross-language fallback rule: same lookahead (?![\d.,]) added

Fixes: "cross = 1. Kreuz" being converted to "cross = I. Kreuz" in Step 1

Co-Authored-By: Claude Sonnet 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1303c22..0613c95 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3429,11 +3429,12 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
 # Common OCR confusion pairs in vocabulary context
 _CHAR_CONFUSION_RULES = [
     # "1" at word start followed by lowercase → likely "I" or "l"
-    (re.compile(r'\b1([a-z])'), r'I\1'),  # 1ch → Ich, 1 want → I want
-    # Standalone "1" between words → "I" (English pronoun)
-    (re.compile(r'(? List[Dict[str, An
         de_lower_words = set(de.lower().replace(',', ' ').split())
         if de_lower_words & _DE_INDICATORS_FOR_EN_I:
             # Any remaining "1" in EN that looks like "I"
-            en = re.sub(r'\b1\b', 'I', en)
+            en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
         # Fix "y " artifact before repeated word: "y you" → "you"
         en = re.sub(r'\by\s+([a-z])', r'\1', en)