fix(ocr-pipeline): mask out images in row detection horizontal projection

Build a word-coverage mask so that only pixels within or near Tesseract
word bounding boxes contribute to the horizontal projection. Image regions
(high ink density but no Tesseract words) are treated as white, which
prevents illustrations from merging multiple vocabulary rows into one.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 01:39:20 +01:00
parent b58aecd081
commit 203b3c0e2d

View File

@@ -1312,9 +1312,24 @@ def detect_row_geometry(
logger.warning("detect_row_geometry: content area too small")
return []
# --- Step 1: Horizontal projection profile ---
# --- Step 1: Horizontal projection profile (text-only, images masked out) ---
content_strip = inv[top_y:bottom_y, left_x:right_x]
h_proj = np.sum(content_strip, axis=1).astype(float)
# Build a word-coverage mask so that image regions (high ink density but no
# Tesseract words) are ignored. Only pixels within/near word bounding boxes
# contribute to the projection. This prevents large illustrations from
# merging multiple vocabulary rows into one.
WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words
word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
for wd in word_dicts:
y1 = max(0, wd['top'] - WORD_PAD_Y)
y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
x1 = max(0, wd['left'])
x2 = min(content_w, wd['left'] + wd['width'])
word_mask[y1:y2, x1:x2] = 255
masked_strip = cv2.bitwise_and(content_strip, word_mask)
h_proj = np.sum(masked_strip, axis=1).astype(float)
h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
# --- Step 2: Smoothing + threshold ---