fix: protect numbered list prefixes from 1→I confusion in char fix step
_CHAR_CONFUSION_RULES: the standalone "1" → "I" rule now skips "1." and "1,".
Cross-language fallback rule: the same lookahead (?![\d.,]) was added.
Fixes: "cross = 1. Kreuz" being converted to "cross = I. Kreuz" in Step 1.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3429,11 +3429,12 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
|
||||
# Common OCR confusion pairs in vocabulary context
|
||||
_CHAR_CONFUSION_RULES = [
|
||||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1 want → I want
|
||||
# Standalone "1" between words → "I" (English pronoun)
|
||||
(re.compile(r'(?<!\d)\b1\b(?!\d)'), 'I'), # "1 want" → "I want"
|
||||
# "|" used as "I" or "l"
|
||||
(re.compile(r'(?<!\|)\|(?!\|)'), 'I'), # |ch → Ich
|
||||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
]
|
||||
|
||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||
@@ -3463,7 +3464,7 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An
|
||||
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||||
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||||
# Any remaining "1" in EN that looks like "I"
|
||||
en = re.sub(r'\b1\b', 'I', en)
|
||||
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
|
||||
|
||||
# Fix "y " artifact before repeated word: "y you" → "you"
|
||||
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||||
|
||||
Reference in New Issue
Block a user