fix(ocr-pipeline): filter OCR noise from image areas and artifacts

Two generic noise filters added to _ocr_single_cell():

1. Word confidence filter (conf < 30): removes low-confidence words
   before text assembly.  Catches trailing artifacts like "Es)" after
   real text, and standalone noise from image edges.

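2. Cell noise filter: clears cells whose entire text has no real
   alphabetic word (>= 2 consecutive letters).  Catches fragments like
   "E:", "3", "u", "D", "2.77" from image areas, while keeping real
   short words like "Ei", "go", "an".  Note that any run of two or more
   letters counts as a word, so a fragment such as "and )" is NOT
   cleared by this filter.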

Both filters apply to word-lookup AND cell-OCR fallback results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 09:56:54 +01:00
parent 72cc77dcf4
commit 2b1c499d54

View File

@@ -3162,6 +3162,13 @@ def _ocr_single_cell(
words = preassigned_words if preassigned_words is not None else []
used_engine = 'word_lookup'
# Filter low-confidence words (OCR noise from images/artifacts).
# Tesseract gives low confidence to misread image edges, borders,
# and other non-text elements.
_MIN_WORD_CONF = 30
if words:
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
if words:
# Use row height as Y-tolerance so all words within a single row
# are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
@@ -3181,8 +3188,6 @@ def _ocr_single_cell(
# plausibly contain text.
_run_fallback = False
if not text.strip() and cell_w > 0 and cell_h > 0:
# Quick pixel-density check: binarise the cell crop and count
# dark pixels. Text cells typically have >2% ink coverage.
if ocr_img is not None:
crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
if crop.size > 0:
@@ -3203,6 +3208,9 @@ def _ocr_single_cell(
cell_lang = lang_map.get(col.type, lang)
fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
if fallback_words:
# Apply same confidence filter to fallback words
fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if fallback_words:
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
fb_y_tol = max(10, int(fb_avg_h * 0.5))
@@ -3214,6 +3222,17 @@ def _ocr_single_cell(
)
used_engine = 'cell_ocr_fallback'
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
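# If the cell text has no real alphabetic word (>= 2 consecutive
# letters), it's noise from image edges, borders, or artifacts. This
# catches fragments like "E:", "3", "u", "D", "2.77" etc. but keeps
# real short words like "Ei", "go", "an", "up".
# NOTE: any run of two or more letters counts as a word, so fragments
# such as "ee" or "and )" match the pattern and are NOT cleared here.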
if text.strip():
_has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
if not _has_real_word:
text = ''
avg_conf = 0.0
return {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,