From 489835a279b8feb31572a51f3cfc4be366398a95 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 4 Mar 2026 11:38:12 +0100 Subject: [PATCH] fix: detect red/coloured markers in OCR pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for marker column content (e.g. red "!" marks): 1. Skip _clean_cell_text() noise filter for column_marker — it requires 2+ consecutive letters, which drops punctuation-only markers like "!" or "*". 2. For marker columns, detect coloured pixels via HSV saturation check (S>80) in addition to grayscale darkness. Create a binarized image in which every pixel that is dark (V<180) or saturated (S>60) becomes black foreground, so Tesseract can see red markers that appear near-white in standard grayscale conversion. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 43 +++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index a10a0c7..fb2d96c 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4621,10 +4621,39 @@ def _ocr_single_cell( # in an otherwise empty cell. dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size _run_fallback = dark_ratio > 0.005 + # For marker columns, also check the colour image — red/coloured + # markers appear near-white in grayscale but have high saturation. + if not _run_fallback and img_bgr is not None and col.type == 'column_marker': + bgr_crop = img_bgr[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] + if bgr_crop.size > 0: + hsv = cv2.cvtColor(bgr_crop, cv2.COLOR_BGR2HSV) + # Saturation > 80 indicates coloured (non-grey) pixels + sat_ratio = float(np.count_nonzero(hsv[:, :, 1] > 80)) / (hsv.shape[0] * hsv.shape[1]) + if sat_ratio > 0.005: + _run_fallback = True if _run_fallback: + # For marker columns with coloured content (e.g. 
red "!"), convert + # the BGR crop to a binarized grayscale that preserves saturated pixels. + _marker_ocr_img = ocr_img + if col.type == 'column_marker' and img_bgr is not None: + bgr_crop_full = img_bgr[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] + if bgr_crop_full.size > 0: + hsv = cv2.cvtColor(bgr_crop_full, cv2.COLOR_BGR2HSV) + # Create mask: either dark (V<180) or saturated (S>60) + dark_mask = hsv[:, :, 2] < 180 + sat_mask = hsv[:, :, 1] > 60 + combined = dark_mask | sat_mask + # Build grayscale: foreground=0 (black), background=255 (white) + marker_gray = np.full(combined.shape, 255, dtype=np.uint8) + marker_gray[combined] = 0 + # Place into a full-size image at the crop position + _marker_full = np.full_like(ocr_img, 255) + _marker_full[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] = marker_gray + _marker_ocr_img = _marker_full + # For narrow columns, upscale the crop before OCR - if is_narrow and ocr_img is not None: - _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] + if is_narrow and _marker_ocr_img is not None: + _crop_slice = _marker_ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] _upscaled = _ensure_minimum_crop_size(_crop_slice) if _upscaled is not _crop_slice: # Build a temporary full-size image with the upscaled crop @@ -4653,7 +4682,7 @@ def _ocr_single_cell( ) _cell_psm = _select_psm_for_column(col.type, col.width, row.height) cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, + fallback_words = ocr_region(_marker_ocr_img, cell_region, lang=cell_lang, psm=_cell_psm) else: cell_region = PageRegion( @@ -4670,7 +4699,7 @@ def _ocr_single_cell( else: _cell_psm = _select_psm_for_column(col.type, col.width, row.height) cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, + fallback_words = ocr_region(_marker_ocr_img, cell_region, lang=cell_lang, psm=_cell_psm) if fallback_words: @@ -4694,7 +4723,7 @@ def _ocr_single_cell( width=cell_w, 
height=cell_h, ) cell_lang = lang_map.get(col.type, lang) - psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7) + psm7_words = ocr_region(_marker_ocr_img, _fb_region, lang=cell_lang, psm=7) if psm7_words: psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] if psm7_words: @@ -4739,7 +4768,9 @@ def _ocr_single_cell( used_engine = 'row_strip_rapid' # --- NOISE FILTER: clear cells that contain only OCR artifacts --- - if text.strip(): + # Skip noise filter for marker columns — they legitimately contain + # only punctuation like "!" or "*" which _clean_cell_text would remove. + if text.strip() and col.type != 'column_marker': text = _clean_cell_text(text) if not text: avg_conf = 0.0