fix(ocr-pipeline): filter OCR noise from image areas and artifacts
Two generic noise filters are added to _ocr_single_cell():

1. Word confidence filter: drops words with conf < 30 (a word with no conf value is treated as conf 0 and also dropped) before text assembly. Catches trailing artifacts like "Es)" after real text, and standalone noise from image edges.
2. Cell noise filter: clears a cell's text (and resets its average confidence to 0) when the text contains no run of two or more consecutive letters. Catches fragments like "E:", "3", "u", "D", "2.77" from image areas, while keeping real short words like "Ei", "go", "an". Note that a fragment such as "and )" still contains a letter run ("and"), so this filter alone does not clear it; it disappears only when the confidence filter drops its words.

Both filters apply to word-lookup AND cell-OCR fallback results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3162,6 +3162,13 @@ def _ocr_single_cell(
|
||||
words = preassigned_words if preassigned_words is not None else []
|
||||
used_engine = 'word_lookup'
|
||||
|
||||
# Filter low-confidence words (OCR noise from images/artifacts).
|
||||
# Tesseract gives low confidence to misread image edges, borders,
|
||||
# and other non-text elements.
|
||||
_MIN_WORD_CONF = 30
|
||||
if words:
|
||||
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||
|
||||
if words:
|
||||
# Use row height as Y-tolerance so all words within a single row
|
||||
# are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
|
||||
@@ -3181,8 +3188,6 @@ def _ocr_single_cell(
|
||||
# plausibly contain text.
|
||||
_run_fallback = False
|
||||
if not text.strip() and cell_w > 0 and cell_h > 0:
|
||||
# Quick pixel-density check: binarise the cell crop and count
|
||||
# dark pixels. Text cells typically have >2% ink coverage.
|
||||
if ocr_img is not None:
|
||||
crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
||||
if crop.size > 0:
|
||||
@@ -3203,6 +3208,9 @@ def _ocr_single_cell(
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
|
||||
if fallback_words:
|
||||
# Apply same confidence filter to fallback words
|
||||
fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||
if fallback_words:
|
||||
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
|
||||
fb_y_tol = max(10, int(fb_avg_h * 0.5))
|
||||
@@ -3214,6 +3222,17 @@ def _ocr_single_cell(
|
||||
)
|
||||
used_engine = 'cell_ocr_fallback'
|
||||
|
||||
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
||||
# If the cell text has no real alphabetic word (>= 2 letters), it's
|
||||
# noise from image edges, borders, or artifacts. This catches
|
||||
# fragments like "E:", "3", "u", "D", "2.77" (not "ee" or "and )": those contain a 2+ letter run)
|
||||
# but keeps real short words like "Ei", "go", "an", "up".
|
||||
if text.strip():
|
||||
_has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
|
||||
if not _has_real_word:
|
||||
text = ''
|
||||
avg_conf = 0.0
|
||||
|
||||
return {
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
|
||||
Reference in New Issue
Block a user