diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 1303c22..0613c95 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3429,11 +3429,12 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str # Common OCR confusion pairs in vocabulary context _CHAR_CONFUSION_RULES = [ # "1" at word start followed by lowercase → likely "I" or "l" - (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1 want → I want - # Standalone "1" between words → "I" (English pronoun) - (re.compile(r'(? List[Dict[str, An de_lower_words = set(de.lower().replace(',', ' ').split()) if de_lower_words & _DE_INDICATORS_FOR_EN_I: # Any remaining "1" in EN that looks like "I" - en = re.sub(r'\b1\b', 'I', en) + en = re.sub(r'\b1\b(?![\d.,])', 'I', en) # Fix "y " artifact before repeated word: "y you" → "you" en = re.sub(r'\by\s+([a-z])', r'\1', en)