feat(ocr-pipeline): add cell text noise filter for OCR artifacts

Add _clean_cell_text() with three sub-filters to remove OCR noise:
- _is_garbage_text(): vowel/consonant ratio check for phantom row garbage
- _is_noise_tail_token(): dictionary-based trailing noise detection
- _RE_REAL_WORD check for cells with no real words (just fragments)

Handles balanced parentheses "(auf)" and trailing hyphens "under-"
as legitimate tokens while stripping noise like "Es)", "3", "ee", "B".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 10:19:31 +01:00
parent 2b1c499d54
commit 3028f421b4

View File

@@ -3112,6 +3112,190 @@ def _assign_row_words_to_columns(
return result
# Regex: at least 2 consecutive letters (Latin + umlauts + French accents).
# Matches the "real word" candidates inside a cell's text.
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Same alphabet, single character — used to extract/count the alphabetic
# characters of one token.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
# NOTE: a few entries ('bin', 'gut', 'hat', 'man', 'war', 'was', ...)
# occur in both the EN and DE sections; duplicates in a set literal are
# harmless.
_COMMON_SHORT_WORDS: set[str] = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}
def _is_noise_tail_token(token: str) -> bool:
    """Check if a token at the END of cell text is trailing OCR noise.

    Trailing fragments are very common OCR artifacts from image edges,
    borders, and neighbouring cells. This is more aggressive than a
    general word filter: any short token that isn't in the dictionary
    of common EN/DE words is considered noise.

    Examples of noise: "Es)", "3", "ee", "B"
    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"

    :param token: a single whitespace-delimited token from the cell text
    :return: True if the token should be stripped as noise
    """
    t = token.strip()
    if not t:
        return True
    # Keep ellipsis — both the three-dot form and the single Unicode
    # character. BUG FIX: the second tuple member was an empty string
    # (a mangled U+2026), i.e. dead code, because '' already returned
    # True above; a real '…' token was therefore misclassified as noise.
    if t in ('...', '\u2026'):
        return False
    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
    # (startswith('[') already covers the quoted ["… / ['… variants.)
    if t.startswith('['):
        return False
    # Pure non-alpha → noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
    if not alpha_chars:
        return True
    # Only the alphabetic characters take part in the dictionary lookup.
    cleaned = ''.join(alpha_chars)
    # Strip normal trailing punctuation before checking for internal noise.
    stripped_punct = re.sub(r'[.,;:!?]+$', '', t)  # "cupcakes." → "cupcakes"
    t_check = stripped_punct if stripped_punct else t
    # Check for legitimate punctuation patterns vs. real noise.
    # Legitimate: "(auf)", "under-", "(on)"
    # Noise: "Es)", "3d", "B|"
    # Strategy: strip balanced parens & trailing hyphens, THEN check residual.
    t_inner = t_check
    # Remove balanced parentheses wrapping the token: "(auf)" → "auf"
    if t_inner.startswith('(') and t_inner.endswith(')'):
        t_inner = t_inner[1:-1]
    # Remove trailing hyphen (word continuation): "under-" → "under"
    if t_inner.endswith('-'):
        t_inner = t_inner[:-1]
    # Now check: does the inner form still have non-alpha noise?
    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
    has_internal_noise = len(t_inner) > len(inner_alpha)
    # Long alpha words (4+ chars) without internal noise are likely real.
    if len(cleaned) >= 4 and not has_internal_noise:
        return False
    # Short words: check the dictionary (uses only alpha chars).
    if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
        return False
    # Default: short or suspicious → noise
    return True
def _is_garbage_text(text: str) -> bool:
    """Check if the entire cell text is OCR garbage from image areas.

    Garbage text = no recognizable dictionary word. Catches
    "(ci]oeu", "uanoaain." etc.

    :param text: full cell text
    :return: True if no word in the text looks like a real EN/DE word
    """
    words = _RE_REAL_WORD.findall(text)
    if not words:
        return True
    for w in words:
        wl = w.lower()
        # Known short word → not garbage
        if wl in _COMMON_SHORT_WORDS:
            return False
        # Long word (>= 4 chars): check vowel/consonant ratio.
        # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain"
        # or "cioeu" has unusual ratios (too many or too few vowels).
        if len(wl) >= 4:
            # BUG FIX: the vowel set previously omitted 'y' and the
            # accented vowels that _RE_REAL_WORD accepts as letters, so
            # valid words such as "myth" or "spry" counted 0 vowels and
            # the whole cell was wiped as garbage. ('ç' stays excluded —
            # it is a consonant.)
            vowels = sum(1 for c in wl if c in 'aeiouyäöüéèêëàâîïôûù')
            ratio = vowels / len(wl)
            if 0.15 <= ratio <= 0.65:
                return False  # plausible vowel ratio → real word
    return True
def _clean_cell_text(text: str) -> str:
    """Remove OCR noise from cell text. Generic filters:

    1. If the entire text has no real alphabetic word (>= 2 letters), clear.
    2. If the entire text is garbage (no dictionary word), clear.
    3. Strip trailing noise tokens from the end of the text.

    :param text: raw OCR text of one table cell
    :return: cleaned text, or '' when nothing survives the filters
    """
    cell = text.strip()
    if not cell:
        return ''
    # Filters 1 + 2: no real word at all, or nothing but garbage →
    # drop the whole cell.
    if _RE_REAL_WORD.search(cell) is None or _is_garbage_text(cell):
        return ''
    # Filter 3: peel noise tokens off the tail, one at a time.
    kept = cell.split()
    while kept:
        if not _is_noise_tail_token(kept[-1]):
            break
        del kept[-1]
    # ' '.join([]) == '' covers the "everything was noise" case.
    return ' '.join(kept)
def _ocr_single_cell(
row_idx: int,
col_idx: int,
@@ -3223,14 +3407,9 @@ def _ocr_single_cell(
used_engine = 'cell_ocr_fallback'
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
# If the cell text has no real alphabetic word (>= 2 letters), it's
# noise from image edges, borders, or artifacts. This catches
# fragments like "E:", "3", "u", "D", "ee", "2.77", "and )" etc.
# but keeps real short words like "Ei", "go", "an", "up".
if text.strip():
_has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
if not _has_real_word:
text = ''
text = _clean_cell_text(text)
if not text:
avg_conf = 0.0
return {