From 3028f421b4ee467eb35554176bd2652b0c5fa184 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 10:19:31 +0100 Subject: [PATCH] feat(ocr-pipeline): add cell text noise filter for OCR artifacts Add _clean_cell_text() with three sub-filters to remove OCR noise: - _is_garbage_text(): vowel/consonant ratio check for phantom row garbage - _is_noise_tail_token(): dictionary-based trailing noise detection - _RE_REAL_WORD check for cells with no real words (just fragments) Handles balanced parentheses "(auf)" and trailing hyphens "under-" as legitimate tokens while stripping noise like "Es)", "3", "ee", "B". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 193 ++++++++++++++++++- 1 file changed, 186 insertions(+), 7 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 6c0718e..5a02616 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3112,6 +3112,190 @@ def _assign_row_words_to_columns( return result +# Regex: at least 2 consecutive letters (Latin + umlauts + accents) +_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}') +_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]') + +# Common short EN/DE words (2-3 chars). Tokens at the end of a cell +# that do NOT appear here are treated as trailing OCR noise. 
+_COMMON_SHORT_WORDS: set = {
+    # EN 1-2 letter
+    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
+    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
+    'or', 'so', 'to', 'up', 'us', 'we',
+    # EN 3 letter
+    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
+    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
+    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
+    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
+    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
+    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
+    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
+    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
+    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
+    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
+    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
+    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
+    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
+    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
+    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
+    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
+    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
+    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
+    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
+    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
+    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
+    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
+    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
+    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
+    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
+    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
+    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
+    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
+    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
+    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
+    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
+    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
+    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
+    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
+    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
+    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
+    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
+    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
+    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
+    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
+    'zap', 'zip', 'zoo',
+    # DE 2-3 letter
+    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
+    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
+    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
+    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
+    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
+    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
+    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
+    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
+    'wut', 'zum', 'zur',
+}
+
+
+def _is_noise_tail_token(token: str) -> bool:
+    """Check if a token at the END of cell text is trailing OCR noise.
+
+    Trailing fragments are very common OCR artifacts from image edges,
+    borders, and neighbouring cells. This is more aggressive than a
+    general word filter: any short token that isn't in the dictionary
+    of common EN/DE words is considered noise.
+
+    Examples of noise: "Es)", "3", "ee", "B"
+    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
+    """
+    t = token.strip()
+    if not t:
+        return True
+
+    # Keep ellipsis
+    if t in ('...', '…'):
+        return False
+
+    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
+    if t.startswith('[') or t.startswith('["') or t.startswith("['"):
+        return False
+
+    # Pure non-alpha → noise ("3", ")", "|")
+    alpha_chars = _RE_ALPHA.findall(t)
+    if not alpha_chars:
+        return True
+
+    # Extract only alpha characters for dictionary lookup
+    cleaned = ''.join(alpha_chars)
+
+    # Strip normal trailing punctuation before checking for internal noise.
+    stripped_punct = re.sub(r'[.,;:!?]+$', '', t)  # "cupcakes." → "cupcakes"
+    t_check = stripped_punct if stripped_punct else t
+
+    # Check for legitimate punctuation patterns vs. real noise.
+    # Legitimate: "(auf)", "under-", "(on)"; "e.g." is rejected (inner dot)
+    # Noise: "Es)", "3d", "B|"
+    # Strategy: strip balanced parens & trailing hyphens, THEN check residual.
+    t_inner = t_check
+    # Remove balanced parentheses wrapping the token: "(auf)" → "auf"
+    if t_inner.startswith('(') and t_inner.endswith(')'):
+        t_inner = t_inner[1:-1]
+    # Remove trailing hyphen (word continuation): "under-" → "under"
+    if t_inner.endswith('-'):
+        t_inner = t_inner[:-1]
+    # Now check: does the inner form still have non-alpha noise?
+    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
+    has_internal_noise = len(t_inner) > len(inner_alpha)
+
+    # Long alpha words (4+ chars) without internal noise are likely real
+    if len(cleaned) >= 4 and not has_internal_noise:
+        return False
+
+    # Short words: check dictionary (uses only alpha chars)
+    if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
+        return False
+
+    # Default: short or suspicious → noise
+    return True
+
+
+def _is_garbage_text(text: str) -> bool:
+    """Check if entire cell text is OCR garbage from image areas.
+
+    Garbage text = no recognizable dictionary word. Catches
+    "(ci]oeu", "uanoaain." etc.
+    """
+    words = _RE_REAL_WORD.findall(text)
+    if not words:
+        return True
+
+    for w in words:
+        wl = w.lower()
+        # Known short word → not garbage
+        if wl in _COMMON_SHORT_WORDS:
+            return False
+        # Long word (>= 4 chars): check vowel/consonant ratio.
+        # Real EN/DE words have ~20-60% vowels; garbage like "uanoaain" or
+        # "cioeu" falls outside. NOTE: 'y' is never counted as a vowel here.
+        if len(wl) >= 4:
+            vowels = sum(1 for c in wl if c in 'aeiouäöü')
+            ratio = vowels / len(wl)
+            if 0.15 <= ratio <= 0.65:
+                return False  # plausible vowel ratio → real word
+
+    return True
+
+
+def _clean_cell_text(text: str) -> str:
+    """Remove OCR noise from cell text. Generic filters:
+
+    1. If the entire text has no real alphabetic word (>= 2 letters), clear.
+    2. If the entire text is garbage (no dictionary word), clear.
+    3. Strip trailing noise tokens from the end of the text.
+    """
+    stripped = text.strip()
+    if not stripped:
+        return ''
+
+    # --- Filter 1: No real word at all (fast path; subsumed by Filter 2) ---
+    if not _RE_REAL_WORD.search(stripped):
+        return ''
+
+    # --- Filter 2: Entire text is garbage ---
+    if _is_garbage_text(stripped):
+        return ''
+
+    # --- Filter 3: Strip trailing noise (NB: collapses inner whitespace) ---
+    tokens = stripped.split()
+    while tokens and _is_noise_tail_token(tokens[-1]):
+        tokens.pop()
+    if not tokens:
+        return ''
+
+    return ' '.join(tokens)
+
+
 def _ocr_single_cell(
     row_idx: int,
     col_idx: int,
@@ -3223,14 +3407,9 @@ def _ocr_single_cell(
         used_engine = 'cell_ocr_fallback'
 
     # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
-    # If the cell text has no real alphabetic word (>= 2 letters), it's
-    # noise from image edges, borders, or artifacts. This catches
-    # fragments like "E:", "3", "u", "D", "ee", "2.77", "and )" etc.
-    # but keeps real short words like "Ei", "go", "an", "up".
     if text.strip():
-        _has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
-        if not _has_real_word:
-            text = ''
+        text = _clean_cell_text(text)
+        if not text:
             avg_conf = 0.0
 
     return {