fix: protect numbered list prefixes from 1→I confusion in char fix step

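_CHAR_CONFUSION_RULES: standalone "1" → "I" now skips "1." and "1,"
_CHAR_CONFUSION_RULES: "|" → "I" now skips "|." and "|," (OCR variants of the "1." / "1," list prefix)
Cross-language fallback rule: same lookahead (?![\d.,]) added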
Fixes: "cross = 1. Kreuz" being converted to "cross = I. Kreuz" in Step 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 16:46:45 +01:00
parent b914b6f49d
commit ab2423bd10

View File

@@ -3429,11 +3429,12 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
# Common OCR confusion pairs in vocabulary context
_CHAR_CONFUSION_RULES = [
# "1" at word start followed by lowercase → likely "I" or "l"
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1 want → I want
# Standalone "1" between words → "I" (English pronoun)
(re.compile(r'(?<!\d)\b1\b(?!\d)'), 'I'), # "1 want" → "I want"
# "|" used as "I" or "l"
(re.compile(r'(?<!\|)\|(?!\|)'), 'I'), # |ch → Ich
# Exception for the standalone "1" and "|" rules below: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
]
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
@@ -3463,7 +3464,7 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An
de_lower_words = set(de.lower().replace(',', ' ').split())
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
# Any remaining "1" in EN that looks like "I"
en = re.sub(r'\b1\b', 'I', en)
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
# Fix "y " artifact before repeated word: "y you" → "you"
en = re.sub(r'\by\s+([a-z])', r'\1', en)