From 203b3c0e2db19d026839e5e39c941b30208be801 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 01:39:20 +0100 Subject: [PATCH] fix(ocr-pipeline): mask out images in row detection horizontal projection Build a word-coverage mask so only pixels near Tesseract word bounding boxes contribute to the horizontal projection. Image regions (high ink but no words) are treated as white, preventing illustrations from merging multiple vocabulary rows into one. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 6c8eeaa..4e25f67 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1312,9 +1312,24 @@ def detect_row_geometry( logger.warning("detect_row_geometry: content area too small") return [] - # --- Step 1: Horizontal projection profile --- + # --- Step 1: Horizontal projection profile (text-only, images masked out) --- content_strip = inv[top_y:bottom_y, left_x:right_x] - h_proj = np.sum(content_strip, axis=1).astype(float) + + # Build a word-coverage mask so that image regions (high ink density but no + # Tesseract words) are ignored. Only pixels within/near word bounding boxes + # contribute to the projection. This prevents large illustrations from + # merging multiple vocabulary rows into one. + WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words + word_mask = np.zeros((content_h, content_w), dtype=np.uint8) + for wd in word_dicts: + y1 = max(0, wd['top'] - WORD_PAD_Y) + y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y) + x1 = max(0, wd['left']) + x2 = min(content_w, wd['left'] + wd['width']) + word_mask[y1:y2, x1:x2] = 255 + + masked_strip = cv2.bitwise_and(content_strip, word_mask) + h_proj = np.sum(masked_strip, axis=1).astype(float) h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj # --- Step 2: Smoothing + threshold ---