feat: region-based graphic detection with word-overlap filtering

New approach: dilate the color mask heavily (25x25 kernel) to merge nearby colored pixels into regions, then check each region's overlap with the OCR word boxes:

- >50% overlap with OCR word boxes → colored text → skip
- ≤50% overlap → colored image/graphic → keep

This detects balloon clusters as one "image" region instead of trying to classify individual shapes. Red words like "borrow/lend" are filtered out because they overlap with their word boxes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 14:49:15 +01:00
parent eeee61108a
commit 6668661895
1 changed files with 89 additions and 88 deletions
@@ -1,14 +1,13 @@
 """
 Graphical element detection for OCR pages.
-Two-pass approach:
+Region-based approach:
-  Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
+  1. Build a color mask (saturation channel — black text is invisible).
-           arrows, icons) on the saturation channel alone.  Black text has
+  2. Dilate heavily to merge nearby colored pixels into regions.
-           zero saturation and is invisible on this channel, so no word
+  3. For each region, check overlap with OCR word boxes:
-           exclusion is needed.
+       - High word overlap → colored text (skip)
-  Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
+       - Low word overlap  → colored graphic / image (keep)
-           OCR word boxes from the full ink mask and keeping only very large
+  4. Separately detect large black-ink illustrations via ink mask.
           remaining contours.
 Boxes and text colors are handled by cv_box_detect / cv_color_detect.
@@ -36,7 +35,7 @@ class GraphicElement:
    width: int
    height: int
    area: int
-    shape: str          # circle, illustration
+    shape: str          # image, illustration
    color_name: str     # dominant color or 'black'
    color_hex: str
    confidence: float
@@ -59,7 +58,7 @@ _COLOR_HEX = {
 }
-def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
+def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
    """Return (color_name, color_hex) for an HSV region."""
    if hsv_roi.size == 0:
        return "black", _COLOR_HEX["black"]
@@ -104,13 +103,10 @@ def detect_graphic_elements(
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
 ) -> List[GraphicElement]:
-    """Find non-text graphical elements on the page.
+    """Find non-text graphical regions on the page.
-    Two-pass approach:
+    Region-based: dilate color mask to form regions, then check word
-      Pass 1 (color): Find colored elements via saturation channel.
+    overlap to distinguish colored text from colored graphics.
                       No word exclusion needed — black text is invisible.
      Pass 2 (ink):    Find large black illustrations via ink mask minus
                       word exclusion.
    Args:
        img_bgr: BGR color image.
@@ -133,89 +129,104 @@ def detect_graphic_elements(
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []
    # --- Build word mask (for overlap checking) ---
    word_mask = np.zeros((h, w), dtype=np.uint8)
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)))
        y1 = max(0, int(wb.get("top", 0)))
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
        word_mask[y1:y2, x1:x2] = 255
    # =====================================================================
-    # PASS 1 — COLOR CHANNEL (no word exclusion needed)
+    # PASS 1 — COLORED IMAGE REGIONS
    # =====================================================================
-    # Saturated pixels = colored ink.  Black text has sat ≈ 0 → invisible.
+    # Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    # Exclude very bright backgrounds (white/near-white with color cast)
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
-    color_mask = cv2.bitwise_and(sat_mask, val_mask)
+    color_pixels = cv2.bitwise_and(sat_mask, val_mask)
-    # Only remove tiny speckle — NO closing, which would merge nearby
+    # Remove tiny speckle
    # colored elements into one giant blob spanning half the page.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
-    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
+    color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
-    contours_color, _ = cv2.findContours(
+    # Count raw colored pixels before dilation (for density check later)
-        color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    color_pixel_raw = color_pixels.copy()
    # Heavy dilation to merge nearby colored elements into regions.
    # A 25x25 kernel merges elements within ~12px of each other.
    kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
    region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
    contours_regions, _ = cv2.findContours(
        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
-    logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
+    logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
    for cnt in contours_color:
        area = cv2.contourArea(cnt)
        if area < 80:
            continue
    for cnt in contours_regions:
        bx, by, bw, bh = cv2.boundingRect(cnt)
-        if bw < 8 or bh < 8:
+
        # Skip tiny regions
        if bw < 15 or bh < 15:
            continue
-        # Skip page-spanning contours (background color cast / merged blobs)
+        # Skip page-spanning regions
-        if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10:
+        if bw > w * 0.5 or bh > h * 0.5:
            logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
            continue
-        perimeter = cv2.arcLength(cnt, True)
+        bbox_area = bw * bh
        circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        aspect = bw / bh if bh > 0 else 1.0
        min_dim = min(bw, bh)
-        # Colored circle / balloon
+        # Check: how much of this region's bounding box overlaps with words?
-        if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
+        roi_words = word_mask[by:by + bh, bx:bx + bw]
-            # Determine color
+        word_pixel_count = int(np.sum(roi_words > 0))
-            roi_hsv = hsv[by:by + bh, bx:bx + bw]
+        word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
            masked_hsv = roi_hsv[cnt_mask_roi > 0]
            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
-            conf = min(0.95, circularity)
+        # Check: how many actual colored pixels are in this region?
-            logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
+        roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
-                        bx, by, bw, bh, int(area), circularity, color_name)
+        color_pixel_count = int(np.sum(roi_color > 0))
-            candidates.append(GraphicElement(
+
-                x=bx, y=by, width=bw, height=bh,
+        # If most of the region is covered by word boxes → colored text, skip
-                area=int(area), shape="circle",
+        if word_overlap > 0.5:
-                color_name=color_name, color_hex=color_hex,
+            logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
-                confidence=conf, contour=cnt,
+                        bx, by, bw, bh, word_overlap * 100)
            ))
            continue
-        # Colored illustration (large colored region)
+        # Need a minimum number of colored pixels (not just dilated area)
-        if area > 2000 and min_dim > 20:
+        if color_pixel_count < 200:
            roi_hsv = hsv[by:by + bh, bx:bx + bw]
            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
            masked_hsv = roi_hsv[cnt_mask_roi > 0]
            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
            logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
                        bx, by, bw, bh, int(area), color_name)
            candidates.append(GraphicElement(
                x=bx, y=by, width=bw, height=bh,
                area=int(area), shape="illustration",
                color_name=color_name, color_hex=color_hex,
                confidence=0.6, contour=cnt,
            ))
            continue
        # Determine dominant color from the actual colored pixels
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        color_px_mask = roi_color > 0
        if np.sum(color_px_mask) > 0:
            masked_hsv = roi_hsv[color_px_mask]
            color_name, color_hex = _dominant_color(masked_hsv)
        else:
            color_name, color_hex = "black", _COLOR_HEX["black"]
        # Confidence based on color density and low word overlap
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0
        conf = min(0.95, 0.5 + density * 0.5)
        logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
                     "color_px=%d word_overlap=%.0f%% color=%s",
                     bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=color_pixel_count,
            shape="image",
            color_name=color_name, color_hex=color_hex,
            confidence=round(conf, 2), contour=cnt,
        ))
    # =====================================================================
-    # PASS 2 — INK (dark pixels) with word exclusion
+    # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
    # Only for large black illustrations (drawings in black ink).
    # =====================================================================
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-    # Build exclusion mask from words
+    # Exclude words and colored regions already found
    exclusion = np.zeros((h, w), dtype=np.uint8)
    word_pad = 5
    for wb in word_boxes:
@@ -225,7 +236,6 @@ def detect_graphic_elements(
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255
    # Also exclude detected box regions
    if detected_boxes:
        for box in detected_boxes:
            bbx = int(box.get("x", 0))
@@ -241,11 +251,8 @@ def detect_graphic_elements(
                exclusion[y1:y2, x1:x2] = 255
    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
    # Remove colored regions already found in pass 1
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
    # Only look for LARGE remaining regions (black illustrations)
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
@@ -254,14 +261,10 @@ def detect_graphic_elements(
    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)
        min_dim = min(bw, bh)
-        # Only large illustrations survive (area > 5000, min_dim > 40)
+        if area < 5000 or min(bw, bh) < 40:
        if area < 5000 or min_dim < 40:
            continue
-
+        if bw > w * 0.8 or bh > h * 0.8:
        # Skip page-spanning contours
        if bw > w * 0.8 and bh > h * 0.8:
            continue
        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
@@ -274,16 +277,14 @@ def detect_graphic_elements(
        ))
    # =====================================================================
-    # Deduplicate overlapping results and return
+    # Deduplicate and return
    # =====================================================================
    candidates.sort(key=lambda g: g.area, reverse=True)
    # Remove duplicates where bounding boxes overlap > 50%
    final: List[GraphicElement] = []
    for c in candidates:
        overlap = False
        for f in final:
            # Intersection
            ix1 = max(c.x, f.x)
            iy1 = max(c.y, f.y)
            ix2 = min(c.x + c.width, f.x + f.width)