diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 6c8eeaa..4e25f67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1312,9 +1312,24 @@ def detect_row_geometry(
         logger.warning("detect_row_geometry: content area too small")
         return []
 
-    # --- Step 1: Horizontal projection profile ---
+    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
     content_strip = inv[top_y:bottom_y, left_x:right_x]
-    h_proj = np.sum(content_strip, axis=1).astype(float)
+
+    # Build a word-coverage mask so that image regions (high ink density but no
+    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
+    # contribute to the projection. This prevents large illustrations from
+    # merging multiple vocabulary rows into one.
+    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
+    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
+    for wd in word_dicts:
+        y1 = max(0, wd['top'] - WORD_PAD_Y)
+        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
+        x1 = max(0, wd['left'])
+        x2 = min(content_w, wd['left'] + wd['width'])
+        word_mask[y1:y2, x1:x2] = 255
+
+    masked_strip = cv2.bitwise_and(content_strip, word_mask)
+    h_proj = np.sum(masked_strip, axis=1).astype(float)
     h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
 
     # --- Step 2: Smoothing + threshold ---