feat: two-pass graphic detection (color channel + ink)

Pass 1 (color): Detect colored graphics on the HSV saturation channel. Black text is invisible on this channel, so no word exclusion is needed. Reliably catches colored balloons, arrows, and icons. Pass 2 (ink): Detect large black illustrations on the dark-ink mask with OCR word regions excluded. Only keeps contours with area >= 5000 and a shorter bounding-box side >= 40 px, to avoid text fragments. Fixes: all 5 balloons are now detectable (previously, word exclusion zones were eating colored graphics that overlapped nearby OCR words). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 14:30:33 +01:00
parent 86ae71fd65
commit 1653e7cff4
1 changed file with 153 additions and 147 deletions
@@ -1,11 +1,16 @@
 """
 Graphical element detection for OCR pages.

-Finds non-text visual elements (arrows, balloons, icons, illustrations)
-by subtracting known OCR word regions from the page ink and analysing
-remaining connected components via contour shape metrics.
+Two-pass approach:
+  Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
+           arrows, icons) on the saturation channel alone.  Black text has
+           zero saturation and is invisible on this channel, so no word
+           exclusion is needed.
+  Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
+           OCR word boxes from the full ink mask and keeping only very large
+           remaining contours.

-Works on both color and grayscale scans.
+Boxes and text colors are handled by cv_box_detect / cv_color_detect.

 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -31,11 +36,11 @@ class GraphicElement:
    width: int
    height: int
    area: int
-    shape: str          # arrow, circle, line, icon, illustration
+    shape: str          # circle, illustration
    color_name: str     # dominant color or 'black'
    color_hex: str
    confidence: float
-    contour: Any = field(default=None, repr=False)  # numpy contour, excluded from repr
+    contour: Any = field(default=None, repr=False)


 # ---------------------------------------------------------------------------
@@ -89,46 +94,6 @@ def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])


-# ---------------------------------------------------------------------------
-# Shape classification via contour analysis
-# ---------------------------------------------------------------------------
-
-def _classify_shape(
-    contour: np.ndarray,
-    bw: int,
-    bh: int,
-    area: float,
-) -> tuple:
-    """Classify contour shape → (shape_name, confidence).
-
-    Only detects high-confidence shapes that are clearly non-text:
-    - circle/balloon: high circularity (very reliable)
-    - illustration: large area (clearly a drawing/image)
-
-    Text fragments are classified as 'noise' and filtered out.
-    Boxes and colors are detected by separate modules.
-    """
-    perimeter = cv2.arcLength(contour, True)
-    circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
-    aspect = bw / bh if bh > 0 else 1.0
-    min_dim = min(bw, bh)
-
-    # --- Circle / balloon ---
-    # High circularity is the most reliable non-text indicator.
-    # Text characters rarely have circularity > 0.55.
-    if circularity > 0.55 and 0.5 < aspect < 2.0 and min_dim > 15:
-        conf = min(0.95, circularity)
-        return "circle", conf
-
-    # --- Illustration (drawing, image, large graphic) ---
-    # Large connected regions that survived word exclusion = genuine graphics.
-    if area > 3000 and min_dim > 30:
-        return "illustration", 0.6
-
-    # Everything else is likely a text fragment — skip
-    return "noise", 0.0
-
-
 # ---------------------------------------------------------------------------
 # Main detection
 # ---------------------------------------------------------------------------
@@ -137,24 +102,20 @@ def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
-    min_area: int = 80,
-    max_area_ratio: float = 0.25,
-    word_pad: int = 5,
    max_elements: int = 50,
 ) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

-    1. Build ink mask (dark + colored pixels).
-    2. Subtract OCR word regions and detected boxes.
-    3. Find connected components and classify shapes.
+    Two-pass approach:
+      Pass 1 (color): Find colored elements via saturation channel.
+                       No word exclusion needed — black text is invisible.
+      Pass 2 (ink):    Find large black illustrations via ink mask minus
+                       word exclusion.

    Args:
        img_bgr: BGR color image.
        word_boxes: List of OCR word dicts with left/top/width/height.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
-        min_area: Minimum contour area to keep (80 filters tiny noise).
-        max_area_ratio: Maximum area as fraction of image area.
-        word_pad: Padding around word boxes for exclusion (5px).
        max_elements: Maximum number of elements to return.

    Returns:
@@ -164,27 +125,100 @@ def detect_graphic_elements(
        return []

    h, w = img_bgr.shape[:2]
-    max_area = int(h * w * max_area_ratio)
+    img_area = h * w

    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes or []))

-    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
+    candidates: List[GraphicElement] = []

-    # --- 1. Build ink mask: dark pixels + saturated colored pixels ---
+    # =====================================================================
+    # PASS 1 — COLOR CHANNEL (no word exclusion needed)
+    # =====================================================================
+    # Saturated pixels = colored ink.  Black text has sat ≈ 0 → invisible.
+    sat_mask = (hsv[:, :, 1] > 50).astype(np.uint8) * 255
+    # Exclude very bright backgrounds (white/near-white with color cast)
+    val_mask = (hsv[:, :, 2] < 235).astype(np.uint8) * 255
+    color_mask = cv2.bitwise_and(sat_mask, val_mask)
+
+    # Morphological cleanup: close small gaps, remove speckle
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
+    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel)
+    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
+
+    contours_color, _ = cv2.findContours(
+        color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
+
+    for cnt in contours_color:
+        area = cv2.contourArea(cnt)
+        if area < 80:
+            continue
+
+        bx, by, bw, bh = cv2.boundingRect(cnt)
+        if bw < 8 or bh < 8:
+            continue
+
+        # Skip page-spanning contours (background color cast)
+        if bw > w * 0.8 and bh > h * 0.8:
+            continue
+
+        perimeter = cv2.arcLength(cnt, True)
+        circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
+        aspect = bw / bh if bh > 0 else 1.0
+        min_dim = min(bw, bh)
+
+        # Colored circle / balloon
+        if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
+            # Determine color
+            roi_hsv = hsv[by:by + bh, bx:bx + bw]
+            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
+            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
+            masked_hsv = roi_hsv[cnt_mask_roi > 0]
+            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
+
+            conf = min(0.95, circularity)
+            logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
+                        bx, by, bw, bh, int(area), circularity, color_name)
+            candidates.append(GraphicElement(
+                x=bx, y=by, width=bw, height=bh,
+                area=int(area), shape="circle",
+                color_name=color_name, color_hex=color_hex,
+                confidence=conf, contour=cnt,
+            ))
+            continue
+
+        # Colored illustration (large colored region)
+        if area > 2000 and min_dim > 20:
+            roi_hsv = hsv[by:by + bh, bx:bx + bw]
+            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
+            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
+            masked_hsv = roi_hsv[cnt_mask_roi > 0]
+            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
+
+            logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
+                        bx, by, bw, bh, int(area), color_name)
+            candidates.append(GraphicElement(
+                x=bx, y=by, width=bw, height=bh,
+                area=int(area), shape="illustration",
+                color_name=color_name, color_hex=color_hex,
+                confidence=0.6, contour=cnt,
+            ))
+            continue
+
+    # =====================================================================
+    # PASS 2 — INK (dark pixels) with word exclusion
+    # Only for large black illustrations (drawings in black ink).
+    # =====================================================================
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

-    # Saturated colored pixels (catches colored arrows, markers)
-    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
-    val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
-    color_ink = cv2.bitwise_and(sat_mask, val_mask)
-
-    ink_mask = cv2.bitwise_or(dark_mask, color_ink)
-
-    # --- 2. Build exclusion mask from OCR words ---
+    # Build exclusion mask from words
    exclusion = np.zeros((h, w), dtype=np.uint8)
-
+    word_pad = 5
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
@@ -192,110 +226,82 @@ def detect_graphic_elements(
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255

-    # Also exclude detected box interiors (they contain text, not graphics)
-    # But keep a border strip so arrows/icons at box edges are found
+    # Also exclude detected box interiors (inset by 8 px, so the box border strip is not excluded)
    if detected_boxes:
-        box_inset = 8
        for box in detected_boxes:
-            bx = int(box.get("x", 0))
-            by = int(box.get("y", 0))
+            bbx = int(box.get("x", 0))
+            bby = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
-            x1 = max(0, bx + box_inset)
-            y1 = max(0, by + box_inset)
-            x2 = min(w, bx + bbw - box_inset)
-            y2 = min(h, by + bbh - box_inset)
+            inset = 8
+            x1 = max(0, bbx + inset)
+            y1 = max(0, bby + inset)
+            x2 = min(w, bbx + bbw - inset)
+            y2 = min(h, bby + bbh - inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255

-    excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
-    logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)
+    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))

-    # Subtract exclusion from ink
-    graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))
+    # Remove every pixel covered by the pass-1 color mask (accepted or not) so colored ink is not re-detected here
+    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))

-    # --- 3. Morphological cleanup ---
-    # Close small gaps (connects arrow stroke + head) — but not too large
-    # to avoid reconnecting text fragments
-    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
-    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
-    # Remove small noise
-    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
-    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
-
-    # --- 4. Find contours ---
-    contours, _ = cv2.findContours(
-        graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    # Only look for LARGE remaining regions (black illustrations)
+    contours_ink, _ = cv2.findContours(
+        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
+    logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))

-    logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))
-
-    # --- 5. Analyse and classify ---
-    candidates: List[GraphicElement] = []
-    skip_reasons: Dict[str, int] = {}
-    for cnt in contours:
+    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
-        if area < min_area or area > max_area:
-            bx, by, bw, bh = cv2.boundingRect(cnt)
-            reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
-            logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
-            skip_reasons[f"area_filter"] = skip_reasons.get("area_filter", 0) + 1
-            continue
-
        bx, by, bw, bh = cv2.boundingRect(cnt)
-        if bw < 8 or bh < 8:
-            skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
+        min_dim = min(bw, bh)
+
+        # Only large illustrations survive (area >= 5000 and min_dim >= 40)
+        if area < 5000 or min_dim < 40:
            continue

-        # Skip elements that overlap significantly with the exclusion zone
-        roi_excl = exclusion[by:by + bh, bx:bx + bw]
-        excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
-        if excl_ratio > 0.4:
-            logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
-                         excl_ratio, bx, by, bw, bh, int(area))
-            skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
+        # Skip page-spanning contours
+        if bw > w * 0.8 and bh > h * 0.8:
            continue

-        # Classify shape
-        shape, conf = _classify_shape(cnt, bw, bh, area)
-
-        # Skip noise (too small or text-like)
-        if shape == "noise":
-            logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
-                         bx, by, bw, bh, int(area))
-            skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
-            continue
-
-        # Determine dominant color
-        roi_hsv = hsv[by:by + bh, bx:bx + bw]
-        cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
-        shifted_cnt = cnt - np.array([bx, by])
-        cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
-        masked_hsv = roi_hsv[cnt_mask > 0]
-        color_name, color_hex = _dominant_color(masked_hsv)
-
-        logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
-                    shape, bx, by, bw, bh, int(area), color_name, conf)
+        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
+                    bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
-            area=int(area),
-            shape=shape,
-            color_name=color_name,
-            color_hex=color_hex,
-            confidence=conf,
-            contour=cnt,
+            area=int(area), shape="illustration",
+            color_name="black", color_hex="#000000",
+            confidence=0.5, contour=cnt,
        ))

-    if skip_reasons:
-        logger.info("GraphicDetect: skipped contours: %s",
-                     ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))
-
-    # Sort by area descending, limit count
+    # =====================================================================
+    # Deduplicate overlapping results and return
+    # =====================================================================
    candidates.sort(key=lambda g: g.area, reverse=True)
-    result = candidates[:max_elements]
+
+    # Drop a candidate when its intersection with an already-kept one covers > 50% of the smaller bounding box
+    final: List[GraphicElement] = []
+    for c in candidates:
+        overlap = False
+        for f in final:
+            # Intersection
+            ix1 = max(c.x, f.x)
+            iy1 = max(c.y, f.y)
+            ix2 = min(c.x + c.width, f.x + f.width)
+            iy2 = min(c.y + c.height, f.y + f.height)
+            if ix2 > ix1 and iy2 > iy1:
+                inter = (ix2 - ix1) * (iy2 - iy1)
+                smaller = min(c.width * c.height, f.width * f.height)
+                if smaller > 0 and inter / smaller > 0.5:
+                    overlap = True
+                    break
+        if not overlap:
+            final.append(c)
+
+    result = final[:max_elements]

    if result:
-        shape_counts = {}
+        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(