fix(ocr-pipeline): filter OCR noise from image areas and artifacts

Two generic noise filters added to _ocr_single_cell():

1. Word confidence filter (conf < 30): removes low-confidence words before text assembly. Catches trailing artifacts like "Es)" after real text, and standalone noise from image edges.
2. Cell noise filter: clears cells whose text contains no run of two or more consecutive letters. Catches fragments like "E:", "3", "u", "D", "2.77" from image areas, while keeping real short words like "Ei", "go", "an". (Note: a fragment such as "and )" is not cleared by this check on its own, because "and" is a run of letters.)

Both filters apply to word-lookup AND cell-OCR fallback results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 09:56:54 +01:00
parent 72cc77dcf4
commit 2b1c499d54
1 changed files with 21 additions and 2 deletions
@@ -3162,6 +3162,13 @@ def _ocr_single_cell(
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

+    # Filter low-confidence words (OCR noise from images/artifacts).
+    # Tesseract gives low confidence to misread image edges, borders,
+    # and other non-text elements.
+    _MIN_WORD_CONF = 30
+    if words:
+        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
+
    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
@@ -3181,8 +3188,6 @@ def _ocr_single_cell(
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
-        # Quick pixel-density check: binarise the cell crop and count
-        # dark pixels.  Text cells typically have >2% ink coverage.
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
@@ -3203,6 +3208,9 @@ def _ocr_single_cell(
            cell_lang = lang_map.get(col.type, lang)
            fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)

+        if fallback_words:
+            # Apply same confidence filter to fallback words
+            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
@@ -3214,6 +3222,17 @@ def _ocr_single_cell(
                )
                used_engine = 'cell_ocr_fallback'

+    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
+    # If the cell text has no run of >= 2 consecutive letters, treat it as
+    # noise from image edges, borders, or artifacts.  This catches fragments
+    # like "E:", "3", "u", "D", "2.77" but keeps short words like "Ei", "go",
+    # "an", "up" (any 2-letter run passes, so "ee" or "and )" is kept too).
+    if text.strip():
+        _has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
+        if not _has_real_word:
+            text = ''
+            avg_conf = 0.0
+
    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,