From 203b3c0e2db19d026839e5e39c941b30208be801 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sat, 28 Feb 2026 01:39:20 +0100
Subject: [PATCH] fix(ocr-pipeline): mask out images in row detection
 horizontal projection

Build a word-coverage mask so that only pixels inside Tesseract word
bounding boxes (padded vertically by at least 4 px; no horizontal
padding) contribute to the horizontal projection. Image regions (high
ink density but no recognized words) are treated as white, which
prevents illustrations from merging multiple vocabulary rows into one.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 6c8eeaa..4e25f67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1312,9 +1312,24 @@ def detect_row_geometry(
         logger.warning("detect_row_geometry: content area too small")
         return []
 
-    # --- Step 1: Horizontal projection profile ---
+    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
     content_strip = inv[top_y:bottom_y, left_x:right_x]
-    h_proj = np.sum(content_strip, axis=1).astype(float)
+
+    # Build a word-coverage mask so that image regions (high ink density but no
+    # Tesseract words) are ignored.  Only pixels within/near word bounding boxes
+    # contribute to the projection.  This prevents large illustrations from
+    # merging multiple vocabulary rows into one.
+    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
+    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
+    for wd in word_dicts:
+        y1 = max(0, wd['top'] - WORD_PAD_Y)
+        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
+        x1 = max(0, wd['left'])
+        x2 = min(content_w, wd['left'] + wd['width'])
+        word_mask[y1:y2, x1:x2] = 255
+
+    masked_strip = cv2.bitwise_and(content_strip, word_mask)
+    h_proj = np.sum(masked_strip, axis=1).astype(float)
     h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
 
     # --- Step 2: Smoothing + threshold ---