From 2b1c499d547029f7aad909bfec7f5900ccf3908a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 09:56:54 +0100 Subject: [PATCH] fix(ocr-pipeline): filter OCR noise from image areas and artifacts Two generic noise filters added to _ocr_single_cell(): 1. Word confidence filter (conf < 30): removes low-confidence words before text assembly. Catches trailing artifacts like "Es)" after real text, and standalone noise from image edges. 2. Cell noise filter: clears cells whose entire text contains no run of two or more consecutive letters. Catches fragments like "E:", "3", "u", "D", "2.77" from image areas, while keeping real short words like "Ei", "go", "an". Text that contains any run of two or more letters (e.g. "and )") passes this filter and is only removed when the word confidence filter drops it. Both filters apply to word-lookup AND cell-OCR fallback results. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 23 ++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index cedd4fa..6c0718e 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3162,6 +3162,13 @@ def _ocr_single_cell( words = preassigned_words if preassigned_words is not None else [] used_engine = 'word_lookup' + # Filter low-confidence words (OCR noise from images/artifacts). + # Tesseract gives low confidence to misread image edges, borders, + # and other non-text elements. + _MIN_WORD_CONF = 30 + if words: + words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] + if words: # Use row height as Y-tolerance so all words within a single row # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse" @@ -3181,8 +3188,6 @@ def _ocr_single_cell( # plausibly contain text. _run_fallback = False if not text.strip() and cell_w > 0 and cell_h > 0: - # Quick pixel-density check: binarise the cell crop and count - # dark pixels. Text cells typically have >2% ink coverage. 
if ocr_img is not None: crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] if crop.size > 0: @@ -3203,6 +3208,9 @@ def _ocr_single_cell( cell_lang = lang_map.get(col.type, lang) fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) + if fallback_words: + # Apply same confidence filter to fallback words + fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF] if fallback_words: fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words) fb_y_tol = max(10, int(fb_avg_h * 0.5)) @@ -3214,6 +3222,17 @@ def _ocr_single_cell( ) used_engine = 'cell_ocr_fallback' + # --- NOISE FILTER: clear cells that contain only OCR artifacts --- + # If the cell text has no run of >= 2 consecutive letters, it's + # noise from image edges, borders, or artifacts. This catches + # fragments like "E:", "3", "u", "D", "2.77" etc. but keeps real short + # words like "Ei", "go", "an", "up" (and letter runs like "ee", "and )"). + if text.strip(): + _has_real_word = bool(re.search(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}', text)) + if not _has_real_word: + text = '' + avg_conf = 0.0 + return { 'cell_id': f"R{row_idx:02d}_C{col_idx}", 'row_index': row_idx,