feat(ocr-pipeline): add cell text noise filter for OCR artifacts
Add _clean_cell_text() with three sub-filters to remove OCR noise:
- _is_garbage_text(): vowel/consonant ratio check for phantom row garbage
- _is_noise_tail_token(): dictionary-based trailing noise detection
- _RE_REAL_WORD check for cells with no real words (just fragments)

Handles balanced parentheses such as "(auf)" and trailing hyphens such as "under-" as legitimate tokens, while stripping noise like "Es)", "3", "ee", "B".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3112,6 +3112,190 @@ def _assign_row_words_to_columns(
|
||||
return result
|
||||
|
||||
|
||||
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
|
||||
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
|
||||
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
|
||||
|
||||
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
|
||||
# that do NOT appear here are treated as trailing OCR noise.
|
||||
_COMMON_SHORT_WORDS: set = {
|
||||
# EN 1-2 letter
|
||||
'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
|
||||
'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
|
||||
'or', 'so', 'to', 'up', 'us', 'we',
|
||||
# EN 3 letter
|
||||
'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
|
||||
'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
|
||||
'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
|
||||
'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
|
||||
'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
|
||||
'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
|
||||
'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
|
||||
'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
|
||||
'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
|
||||
'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
|
||||
'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
|
||||
'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
|
||||
'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
|
||||
'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
|
||||
'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
|
||||
'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
|
||||
'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
|
||||
'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
|
||||
'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
|
||||
'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
|
||||
'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
|
||||
'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
|
||||
'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
|
||||
'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
|
||||
'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
|
||||
'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
|
||||
'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
|
||||
'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
|
||||
'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
|
||||
'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
|
||||
'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
|
||||
'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
|
||||
'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
|
||||
'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
|
||||
'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
|
||||
'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
|
||||
'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
|
||||
'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
|
||||
'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
|
||||
'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
|
||||
'zap', 'zip', 'zoo',
|
||||
# DE 2-3 letter
|
||||
'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
|
||||
'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
|
||||
'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
|
||||
'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
|
||||
'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
|
||||
'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
|
||||
'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
|
||||
'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
|
||||
'wut', 'zum', 'zur',
|
||||
}
|
||||
|
||||
|
||||
# Suffixes that may follow the apostrophe in a short English contraction
# ("I'd", "I'll", "I'm", "we're", "he's", "I've").
_CONTRACTION_SUFFIXES = ('d', 'll', 'm', 're', 's', 've')


def _is_noise_tail_token(token: str) -> bool:
    """Return True if *token*, the last token of a cell, is trailing OCR noise.

    Trailing fragments are very common OCR artifacts from image edges,
    borders, and neighbouring cells. This check is deliberately more
    aggressive than a general word filter: a short token that is not in
    ``_COMMON_SHORT_WORDS`` is considered noise.

    A token is kept (``False``) when it is:
      * an ellipsis ("..." or "…");
      * a phonetic transcription, i.e. anything starting with "[";
      * a dotted single-letter abbreviation ("e.g.", "z.B.");
      * a word of 4+ letters with no stray characters, where apostrophes
        and hyphens *between two letters* do not count as stray
        ("don't", "well-known");
      * a short word found in ``_COMMON_SHORT_WORDS``;
      * a short contraction whose base word is in ``_COMMON_SHORT_WORDS``
        ("I'm", "it's", "he'd").

    Before these checks, trailing sentence punctuation, one pair of
    wrapping parentheses and one trailing hyphen are peeled off, so
    "sister.", "(auf)" and "under-" are judged as "sister", "auf", "under".

    Examples of noise: "Es)", "3", "ee", "B", "3d", "B|"
    Examples to keep:  "sister.", "cupcakes.", "...", "mice", "[eg]",
                       "(auf)", "under-", "e.g.", "it's"

    Args:
        token: A single whitespace-delimited token.

    Returns:
        True if the token should be stripped from the end of the cell text.
    """
    t = token.strip()
    if not t:
        return True

    # Keep ellipsis.
    if t in ('...', '…'):
        return False

    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
    if t.startswith('['):
        return False

    # Pure non-alpha -> noise ("3", ")", "|").
    alpha_chars = _RE_ALPHA.findall(t)
    if not alpha_chars:
        return True
    cleaned = ''.join(alpha_chars)

    # Peel off punctuation that legitimately surrounds a word. `core` cannot
    # become empty here because the token contains at least one letter.
    core = re.sub(r'[.,;:!?]+$', '', t)  # "cupcakes." -> "cupcakes"
    if core.startswith('(') and core.endswith(')'):
        core = core[1:-1]  # "(auf)" -> "auf"
    if core.endswith('-'):
        core = core[:-1]  # "under-" -> "under"

    # Dotted abbreviation made of single letters: "e.g", "z.B", "U.S.A".
    parts = core.split('.')
    if len(parts) > 1 and all(len(p) == 1 and _RE_ALPHA.match(p) for p in parts):
        return False

    # Apostrophes and hyphens that sit between two letters are part of the
    # word, not noise. Normalise the typographic apostrophe first.
    core = core.replace('’', "'")
    joiner = r"(?<={0})['-](?={0})".format(_RE_ALPHA.pattern)
    joined = re.sub(joiner, '', core)

    # Only non-letters were removed above, so the letters of `joined` are
    # exactly `cleaned`; anything extra is a stray character ("Es)", "3d").
    if len(joined) > len(cleaned):
        return True

    # Long alpha words (4+ letters) without stray characters are likely real.
    if len(cleaned) >= 4:
        return False

    # Plain short word: must be a known EN/DE word.
    if joined == core:
        return cleaned.lower() not in _COMMON_SHORT_WORDS

    # Short token containing a joiner: accept only "<known word>'<suffix>".
    base, apostrophe, suffix = core.partition("'")
    is_contraction = (
        bool(apostrophe)
        and base.lower() in _COMMON_SHORT_WORDS
        and suffix.lower() in _CONTRACTION_SUFFIXES
    )
    return not is_contraction
|
||||
|
||||
|
||||
def _is_garbage_text(text: str) -> bool:
    """Return True if the entire cell text looks like OCR garbage.

    The text is split into letter runs of 2+ characters (``_RE_REAL_WORD``).
    It is considered *not* garbage as soon as one run is either

      * a known short word from ``_COMMON_SHORT_WORDS``, or
      * at least 4 letters long with a plausible share of vowels
        (15%-65% of its letters).

    Text with no letter run at all, or with no run passing either test, is
    garbage. This catches strings such as "(ci]oeu" or "uanoaain.".

    Accented vowels count as vowels, because ``_RE_REAL_WORD`` accepts them
    as letters; otherwise a word like "déjà" would score 0% vowels. When a
    word has too few vowels, "y" is counted as a vowel as well, so words
    like "rhythm" or "hymn" are not rejected.

    Args:
        text: The cell text to classify.

    Returns:
        True if no plausible word was found in ``text``.
    """
    words = _RE_REAL_WORD.findall(text)
    if not words:
        return True

    for w in words:
        wl = w.lower()

        # Known short word -> not garbage.
        if wl in _COMMON_SHORT_WORDS:
            return False

        # Long word (>= 4 letters): check the vowel ratio. Garbage like
        # "uanoaain" has an unusual ratio (too many or too few vowels).
        if len(wl) >= 4:
            vowels = sum(1 for c in wl if c in 'aeiouäöüéèêëàâîïôûù')
            # "y" only acts as a fallback vowel: counting it always would
            # push words such as "yeah" above the upper bound.
            if vowels / len(wl) < 0.15:
                vowels += wl.count('y')
            if 0.15 <= vowels / len(wl) <= 0.65:
                return False  # plausible vowel ratio -> real word

    return True
|
||||
|
||||
|
||||
def _clean_cell_text(text: str) -> str:
    """Strip OCR noise from a cell's text and return what is left.

    Three generic filters run in order:

    1. Text without any run of two or more letters is cleared.
    2. Text that ``_is_garbage_text`` rejects as a whole is cleared.
    3. Tokens at the end of the text are dropped for as long as
       ``_is_noise_tail_token`` flags them.

    The surviving tokens are re-joined with single spaces. An empty string
    is returned when nothing survives.
    """
    content = text.strip()

    # Filters 1 and 2: nothing word-like in the cell, or all of it garbage.
    if not content or _RE_REAL_WORD.search(content) is None:
        return ''
    if _is_garbage_text(content):
        return ''

    # Filter 3: walk backwards from the end until a non-noise token is hit.
    words = content.split()
    keep = len(words)
    while keep and _is_noise_tail_token(words[keep - 1]):
        keep -= 1

    return ' '.join(words[:keep])
|
||||
|
||||
|
||||
def _ocr_single_cell(
|
||||
row_idx: int,
|
||||
col_idx: int,
|
||||
@@ -3223,14 +3407,9 @@ def _ocr_single_cell(
|
||||
used_engine = 'cell_ocr_fallback'
|
||||
|
||||
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
||||
# If the cell text has no real alphabetic word (>= 2 letters), it's
|
||||
# noise from image edges, borders, or artifacts. This catches
|
||||
# fragments like "E:", "3", "u", "D", "ee", "2.77", "and )" etc.
|
||||
# but keeps real short words like "Ei", "go", "an", "up".
|
||||
if text.strip():
|
||||
_has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
|
||||
if not _has_real_word:
|
||||
text = ''
|
||||
text = _clean_cell_text(text)
|
||||
if not text:
|
||||
avg_conf = 0.0
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user