fix(ocr-pipeline): mask out images in row detection horizontal projection
Build a word-coverage mask so only pixels near Tesseract word bounding boxes contribute to the horizontal projection. Image regions (high ink but no words) are treated as white, preventing illustrations from merging multiple vocabulary rows into one.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1312,9 +1312,24 @@ def detect_row_geometry(
|
||||
logger.warning("detect_row_geometry: content area too small")
|
||||
return []
|
||||
|
||||
# --- Step 1: Horizontal projection profile ---
|
||||
# --- Step 1: Horizontal projection profile (text-only, images masked out) ---
|
||||
content_strip = inv[top_y:bottom_y, left_x:right_x]
|
||||
h_proj = np.sum(content_strip, axis=1).astype(float)
|
||||
|
||||
# Build a word-coverage mask so that image regions (high ink density but no
|
||||
# Tesseract words) are ignored. Only pixels within/near word bounding boxes
|
||||
# contribute to the projection. This prevents large illustrations from
|
||||
# merging multiple vocabulary rows into one.
|
||||
WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words
|
||||
word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
|
||||
for wd in word_dicts:
|
||||
y1 = max(0, wd['top'] - WORD_PAD_Y)
|
||||
y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
|
||||
x1 = max(0, wd['left'])
|
||||
x2 = min(content_w, wd['left'] + wd['width'])
|
||||
word_mask[y1:y2, x1:x2] = 255
|
||||
|
||||
masked_strip = cv2.bitwise_and(content_strip, word_mask)
|
||||
h_proj = np.sum(masked_strip, axis=1).astype(float)
|
||||
h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
|
||||
|
||||
# --- Step 2: Smoothing + threshold ---
|
||||
|
||||
Reference in New Issue
Block a user