From 2b1c499d547029f7aad909bfec7f5900ccf3908a Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 09:56:54 +0100
Subject: [PATCH] fix(ocr-pipeline): filter OCR noise from image areas and
 artifacts

Two generic noise filters added to _ocr_single_cell():

1. Word confidence filter (conf < 30): removes low-confidence words
   before text assembly.  Catches trailing artifacts like "Es)" after
   real text, and standalone noise from image edges.

2. Cell noise filter: clears cells whose text contains no run of at
   least two consecutive letters.  Catches fragments like "E:", "3",
   "u", "D", "2.77" from image areas, while keeping real short words
   like "Ei", "go", "an".  Fragments that still contain a 2+ letter
   run (e.g. "and )") are not cleared by this filter.

Both filters apply to word-lookup AND cell-OCR fallback results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 23 ++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index cedd4fa..6c0718e 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3162,6 +3162,13 @@ def _ocr_single_cell(
     words = preassigned_words if preassigned_words is not None else []
     used_engine = 'word_lookup'
 
+    # Filter low-confidence words (OCR noise from images/artifacts).
+    # Tesseract gives low confidence to misread image edges, borders,
+    # and other non-text elements.
+    _MIN_WORD_CONF = 30
+    if words:
+        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
+
     if words:
         # Use row height as Y-tolerance so all words within a single row
         # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
@@ -3181,8 +3188,6 @@ def _ocr_single_cell(
     # plausibly contain text.
     _run_fallback = False
     if not text.strip() and cell_w > 0 and cell_h > 0:
-        # Quick pixel-density check: binarise the cell crop and count
-        # dark pixels.  Text cells typically have >2% ink coverage.
         if ocr_img is not None:
             crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
             if crop.size > 0:
@@ -3203,6 +3208,9 @@ def _ocr_single_cell(
             cell_lang = lang_map.get(col.type, lang)
             fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
 
+        if fallback_words:
+            # Apply same confidence filter to fallback words
+            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
         if fallback_words:
             fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
             fb_y_tol = max(10, int(fb_avg_h * 0.5))
@@ -3214,6 +3222,17 @@ def _ocr_single_cell(
                 )
                 used_engine = 'cell_ocr_fallback'
 
+    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
+    # If the cell text has no run of >= 2 consecutive letters, treat it
+    # as noise from image edges, borders, or artifacts.  This catches
+    # fragments like "E:", "3", "u", "D", "2.77" etc. and keeps short
+    # words like "Ei", "go", "an", "up" (but also "ee" or "and )").
+    if text.strip():
+        _has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text))
+        if not _has_real_word:
+            text = ''
+            avg_conf = 0.0
+
     return {
         'cell_id': f"R{row_idx:02d}_C{col_idx}",
         'row_index': row_idx,