fix(ocr-pipeline): mask out images in row detection horizontal projection

Build a word-coverage mask so that only pixels within or near Tesseract word bounding boxes contribute to the horizontal projection. Image regions (high ink density but no Tesseract words) are treated as white, which prevents illustrations from merging multiple vocabulary rows into one.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 01:39:20 +01:00
parent b58aecd081
commit 203b3c0e2d
1 changed files with 17 additions and 2 deletions
@@ -1312,9 +1312,24 @@ def detect_row_geometry(
        logger.warning("detect_row_geometry: content area too small")
        return []

-    # --- Step 1: Horizontal projection profile ---
+    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
-    h_proj = np.sum(content_strip, axis=1).astype(float)
+
+    # Build a word-coverage mask so that image regions (high ink density but no
+    # Tesseract words) are ignored.  Only pixels within/near word bounding boxes
+    # contribute to the projection.  This prevents large illustrations from
+    # merging multiple vocabulary rows into one.
+    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
+    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
+    for wd in word_dicts:
+        y1 = max(0, wd['top'] - WORD_PAD_Y)
+        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
+        x1 = max(0, wd['left'])
+        x2 = min(content_w, wd['left'] + wd['width'])
+        word_mask[y1:y2, x1:x2] = 255
+
+    masked_strip = cv2.bitwise_and(content_strip, word_mask)
+    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---