feat: two-pass graphic detection (color channel + ink)

Pass 1 (color): Detect colored graphics on the HSV saturation channel. Black text is invisible on this channel, so no word exclusion is needed. Reliably catches colored balloons, arrows and icons. Pass 2 (ink): Detect large black illustrations on the dark-ink mask minus the word-exclusion zones. Only keeps contours with area >= 5000 and a shorter side >= 40 px, to avoid text fragments. Fixes: all 5 balloons are now detectable (previously the word-exclusion zones were eating colored graphics that overlapped with nearby OCR words). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 14:30:33 +01:00
parent 86ae71fd65
commit 1653e7cff4
1 changed files with 153 additions and 147 deletions
@@ -1,11 +1,16 @@
 """
 Graphical element detection for OCR pages.
-Finds non-text visual elements (arrows, balloons, icons, illustrations)
+Two-pass approach:
-by subtracting known OCR word regions from the page ink and analysing
+  Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
-remaining connected components via contour shape metrics.
+           arrows, icons) on the saturation channel alone.  Black text has
           zero saturation and is invisible on this channel, so no word
           exclusion is needed.
  Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
           OCR word boxes from the full ink mask and keeping only very large
           remaining contours.
-Works on both color and grayscale scans.
+Boxes and text colors are handled by cv_box_detect / cv_color_detect.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -31,11 +36,11 @@ class GraphicElement:
    width: int
    height: int
    area: int
-    shape: str          # arrow, circle, line, icon, illustration
+    shape: str          # circle, illustration
    color_name: str     # dominant color or 'black'
    color_hex: str
    confidence: float
-    contour: Any = field(default=None, repr=False)  # numpy contour, excluded from repr
+    contour: Any = field(default=None, repr=False)
 # ---------------------------------------------------------------------------
@@ -89,46 +94,6 @@ def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
 # ---------------------------------------------------------------------------
 # Shape classification via contour analysis
 # ---------------------------------------------------------------------------
 def _classify_shape(
    contour: np.ndarray,
    bw: int,
    bh: int,
    area: float,
 ) -> tuple:
    """Label a contour as 'circle', 'illustration' or 'noise'.
    Args:
        contour: Contour points, passed to cv2.arcLength as a closed curve.
        bw: Bounding-box width of the contour.
        bh: Bounding-box height of the contour.
        area: Contour area supplied by the caller.
    Returns:
        (shape_name, confidence) where shape_name is one of:
        - 'circle': roundish, roughly square bounding box, shorter side > 15;
          confidence is the circularity capped at 0.95.
        - 'illustration': area > 3000 and shorter side > 30; confidence 0.6.
        - 'noise': anything else; confidence 0.0 (callers are expected to
          drop these as probable text fragments).
    """
    outline_len = cv2.arcLength(contour, True)
    # Isoperimetric quotient: 1.0 for a perfect circle, smaller for ragged or
    # elongated shapes.  A zero-length outline gets 0 to avoid dividing by zero.
    if outline_len > 0:
        roundness = (4 * np.pi * area) / (outline_len * outline_len)
    else:
        roundness = 0
    # A zero-height box is treated as square so the ratio test stays defined.
    box_ratio = bw / bh if bh > 0 else 1.0
    shortest_side = min(bw, bh)
    # --- Circle / balloon ---
    # Roundness above 0.55 is used as the main "not text" signal; the ratio
    # and size checks reject thin slivers and tiny dots.
    is_round = roundness > 0.55
    is_balanced = 0.5 < box_ratio < 2.0
    if is_round and is_balanced and shortest_side > 15:
        return "circle", min(0.95, roundness)
    # --- Illustration (drawing, image, large graphic) ---
    # A big blob with a substantial shorter side is accepted as a graphic.
    is_big = area > 3000
    if is_big and shortest_side > 30:
        return "illustration", 0.6
    # Whatever is left is assumed to be a text fragment.
    return "noise", 0.0
 # ---------------------------------------------------------------------------
 # Main detection
 # ---------------------------------------------------------------------------
@@ -137,24 +102,20 @@ def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    min_area: int = 80,
    max_area_ratio: float = 0.25,
    word_pad: int = 5,
    max_elements: int = 50,
 ) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.
-    1. Build ink mask (dark + colored pixels).
+    Two-pass approach:
-    2. Subtract OCR word regions and detected boxes.
+      Pass 1 (color): Find colored elements via saturation channel.
-    3. Find connected components and classify shapes.
+                       No word exclusion needed — black text is invisible.
      Pass 2 (ink):    Find large black illustrations via ink mask minus
                       word exclusion.
    Args:
        img_bgr: BGR color image.
        word_boxes: List of OCR word dicts with left/top/width/height.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
        min_area: Minimum contour area to keep (80 filters tiny noise).
        max_area_ratio: Maximum area as fraction of image area.
        word_pad: Padding around word boxes for exclusion (5px).
        max_elements: Maximum number of elements to return.
    Returns:
@@ -164,27 +125,100 @@ def detect_graphic_elements(
        return []
    h, w = img_bgr.shape[:2]
-    max_area = int(h * w * max_area_ratio)
+    img_area = h * w
    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes or []))
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []
-    # --- 1. Build ink mask: dark pixels + saturated colored pixels ---
+    # =====================================================================
    # PASS 1 — COLOR CHANNEL (no word exclusion needed)
    # =====================================================================
    # Saturated pixels = colored ink.  Black text has sat ≈ 0 → invisible.
    sat_mask = (hsv[:, :, 1] > 50).astype(np.uint8) * 255
    # Exclude very bright backgrounds (white/near-white with color cast)
    val_mask = (hsv[:, :, 2] < 235).astype(np.uint8) * 255
    color_mask = cv2.bitwise_and(sat_mask, val_mask)
    # Morphological cleanup: close small gaps, remove speckle
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel)
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
    contours_color, _ = cv2.findContours(
        color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
    for cnt in contours_color:
        area = cv2.contourArea(cnt)
        if area < 80:
            continue
        bx, by, bw, bh = cv2.boundingRect(cnt)
        if bw < 8 or bh < 8:
            continue
        # Skip page-spanning contours (background color cast)
        if bw > w * 0.8 and bh > h * 0.8:
            continue
        perimeter = cv2.arcLength(cnt, True)
        circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        aspect = bw / bh if bh > 0 else 1.0
        min_dim = min(bw, bh)
        # Colored circle / balloon
        if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
            # Determine color
            roi_hsv = hsv[by:by + bh, bx:bx + bw]
            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
            masked_hsv = roi_hsv[cnt_mask_roi > 0]
            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
            conf = min(0.95, circularity)
            logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
                        bx, by, bw, bh, int(area), circularity, color_name)
            candidates.append(GraphicElement(
                x=bx, y=by, width=bw, height=bh,
                area=int(area), shape="circle",
                color_name=color_name, color_hex=color_hex,
                confidence=conf, contour=cnt,
            ))
            continue
        # Colored illustration (large colored region)
        if area > 2000 and min_dim > 20:
            roi_hsv = hsv[by:by + bh, bx:bx + bw]
            cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
            cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
            masked_hsv = roi_hsv[cnt_mask_roi > 0]
            color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
            logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
                        bx, by, bw, bh, int(area), color_name)
            candidates.append(GraphicElement(
                x=bx, y=by, width=bw, height=bh,
                area=int(area), shape="illustration",
                color_name=color_name, color_hex=color_hex,
                confidence=0.6, contour=cnt,
            ))
            continue
    # =====================================================================
    # PASS 2 — INK (dark pixels) with word exclusion
    # Only for large black illustrations (drawings in black ink).
    # =====================================================================
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-    # Saturated colored pixels (catches colored arrows, markers)
+    # Build exclusion mask from words
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
    color_ink = cv2.bitwise_and(sat_mask, val_mask)
    ink_mask = cv2.bitwise_or(dark_mask, color_ink)
    # --- 2. Build exclusion mask from OCR words ---
    exclusion = np.zeros((h, w), dtype=np.uint8)
-
+    word_pad = 5
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
@@ -192,110 +226,82 @@ def detect_graphic_elements(
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255
-    # Also exclude detected box interiors (they contain text, not graphics)
+    # Also exclude detected box regions
    # But keep a border strip so arrows/icons at box edges are found
    if detected_boxes:
        box_inset = 8
        for box in detected_boxes:
-            bx = int(box.get("x", 0))
+            bbx = int(box.get("x", 0))
-            by = int(box.get("y", 0))
+            bby = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
-            x1 = max(0, bx + box_inset)
+            inset = 8
-            y1 = max(0, by + box_inset)
+            x1 = max(0, bbx + inset)
-            x2 = min(w, bx + bbw - box_inset)
+            y1 = max(0, bby + inset)
-            y2 = min(h, by + bbh - box_inset)
+            x2 = min(w, bbx + bbw - inset)
            y2 = min(h, bby + bbh - inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255
-    excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
+    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)
-    # Subtract exclusion from ink
+    # Remove colored regions already found in pass 1
-    graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))
+    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
-    # --- 3. Morphological cleanup ---
+    # Only look for LARGE remaining regions (black illustrations)
-    # Close small gaps (connects arrow stroke + head) — but not too large
+    contours_ink, _ = cv2.findContours(
-    # to avoid reconnecting text fragments
+        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
    # Remove small noise
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
    # --- 4. Find contours ---
    contours, _ = cv2.findContours(
        graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
-    logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))
+    for cnt in contours_ink:
    # --- 5. Analyse and classify ---
    candidates: List[GraphicElement] = []
    skip_reasons: Dict[str, int] = {}
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            bx, by, bw, bh = cv2.boundingRect(cnt)
            reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
            logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
            skip_reasons[f"area_filter"] = skip_reasons.get("area_filter", 0) + 1
            continue
        bx, by, bw, bh = cv2.boundingRect(cnt)
-        if bw < 8 or bh < 8:
+        min_dim = min(bw, bh)
-            skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
+
        # Only large illustrations survive (area > 5000, min_dim > 40)
        if area < 5000 or min_dim < 40:
            continue
-        # Skip elements that overlap significantly with the exclusion zone
+        # Skip page-spanning contours
-        roi_excl = exclusion[by:by + bh, bx:bx + bw]
+        if bw > w * 0.8 and bh > h * 0.8:
        excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
        if excl_ratio > 0.4:
            logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
                         excl_ratio, bx, by, bw, bh, int(area))
            skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
            continue
-        # Classify shape
+        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
-        shape, conf = _classify_shape(cnt, bw, bh, area)
+                    bx, by, bw, bh, int(area))
        # Skip noise (too small or text-like)
        if shape == "noise":
            logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
                         bx, by, bw, bh, int(area))
            skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
            continue
        # Determine dominant color
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
        shifted_cnt = cnt - np.array([bx, by])
        cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
        masked_hsv = roi_hsv[cnt_mask > 0]
        color_name, color_hex = _dominant_color(masked_hsv)
        logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
                    shape, bx, by, bw, bh, int(area), color_name, conf)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
-            area=int(area),
+            area=int(area), shape="illustration",
-            shape=shape,
+            color_name="black", color_hex="#000000",
-            color_name=color_name,
+            confidence=0.5, contour=cnt,
            color_hex=color_hex,
            confidence=conf,
            contour=cnt,
        ))
-    if skip_reasons:
+    # =====================================================================
-        logger.info("GraphicDetect: skipped contours: %s",
+    # Deduplicate overlapping results and return
-                     ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))
+    # =====================================================================
    # Sort by area descending, limit count
    candidates.sort(key=lambda g: g.area, reverse=True)
-    result = candidates[:max_elements]
+
    # Remove duplicates where bounding boxes overlap > 50%
    final: List[GraphicElement] = []
    for c in candidates:
        overlap = False
        for f in final:
            # Intersection
            ix1 = max(c.x, f.x)
            iy1 = max(c.y, f.y)
            ix2 = min(c.x + c.width, f.x + f.width)
            iy2 = min(c.y + c.height, f.y + f.height)
            if ix2 > ix1 and iy2 > iy1:
                inter = (ix2 - ix1) * (iy2 - iy1)
                smaller = min(c.width * c.height, f.width * f.height)
                if smaller > 0 and inter / smaller > 0.5:
                    overlap = True
                    break
        if not overlap:
            final.append(c)
    result = final[:max_elements]
    if result:
-        shape_counts = {}
+        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(