Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
The 5x5 MORPH_CLOSE was connecting scattered color pixels into one page-spanning contour that swallowed individual balloons.
Fix:
- Remove MORPH_CLOSE, keep only MORPH_OPEN for speckle removal
- Lower sat threshold 50→40 to catch more colored elements
- Filter contours spanning >50% of width OR height (was AND)
- Filter contours >10% of image area
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
315 lines
11 KiB
Python
315 lines
11 KiB
Python
"""
|
|
Graphical element detection for OCR pages.
|
|
|
|
Two-pass approach:
|
|
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
|
|
arrows, icons) on the saturation channel alone. Black text has
|
|
zero saturation and is invisible on this channel, so no word
|
|
exclusion is needed.
|
|
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
|
|
OCR word boxes from the full ink mask and keeping only very large
|
|
remaining contours.
|
|
|
|
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
__all__ = ["detect_graphic_elements", "GraphicElement"]
|
|
|
|
|
|
@dataclass
class GraphicElement:
    """A detected non-text graphical element.

    Instances are produced by ``detect_graphic_elements``; the field
    comments below describe the values as that function populates them.
    Field order is part of the constructor signature and must not change.
    """
    # Left edge of the contour's bounding rectangle (image pixel coordinates).
    x: int
    # Top edge of the contour's bounding rectangle (image pixel coordinates).
    y: int
    # Bounding-rectangle width in pixels.
    width: int
    # Bounding-rectangle height in pixels.
    height: int
    # Contour area (cv2.contourArea) truncated to int — not width * height.
    area: int
    # Shape label: "circle" or "illustration".
    shape: str  # circle, illustration
    # Name of the dominant color, or 'black' for ink-pass / unsaturated regions.
    color_name: str  # dominant color or 'black'
    # Hex string ("#rrggbb") associated with color_name.
    color_hex: str
    # Heuristic score: min(0.95, circularity) for circles, 0.6 for colored
    # illustrations, 0.5 for black-ink illustrations.
    confidence: float
    # Raw contour as returned by cv2.findContours; excluded from repr()
    # because the point array would make the output unreadable.
    contour: Any = field(default=None, repr=False)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Color helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Display hex value for every color name this module can report.
_COLOR_HEX = {
    "black": "#000000",
    "gray": "#6b7280",
    "red": "#dc2626",
    "orange": "#ea580c",
    "yellow": "#ca8a04",
    "green": "#16a34a",
    "blue": "#2563eb",
    "purple": "#9333ea",
}


def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
    """Return ``(color_name, color_hex)`` for an HSV region.

    Only pixels whose saturation exceeds *sat_threshold* vote on the hue.
    The region is reported as black when it is empty, when fewer than 15 %
    of its pixels are saturated, or when fewer than three saturated pixels
    remain.

    Args:
        hsv_roi: Array of HSV pixels; any shape that reshapes to ``(-1, 3)``
            with channels ordered H, S, V. The hue thresholds assume
            OpenCV's 8-bit hue scale (0-179).
        sat_threshold: Saturation above which a pixel counts as colored.

    Returns:
        Tuple of the color name and its entry in ``_COLOR_HEX``.
    """
    black = ("black", _COLOR_HEX["black"])
    if hsv_roi.size == 0:
        return black

    pixels = hsv_roi.reshape(-1, 3)
    saturated = pixels[pixels[:, 1] > sat_threshold]
    if len(saturated) / len(pixels) < 0.15 or len(saturated) < 3:
        return black

    hues = saturated[:, 0]

    # Hue is circular: red occupies both ends of the scale.  A plain median
    # over a red region whose pixels straddle the wrap-around lands in the
    # middle of the scale (green/blue), so settle the red case by majority
    # vote before falling back to the median.
    in_red_band = (hues < 10) | (hues > 170)
    if 2 * int(np.count_nonzero(in_red_band)) > len(hues):
        return "red", _COLOR_HEX["red"]

    med_hue = float(np.median(hues))
    if med_hue < 10 or med_hue > 170:
        name = "red"
    elif med_hue < 25:
        name = "orange"
    elif med_hue < 35:
        name = "yellow"
    elif med_hue < 85:
        name = "green"
    elif med_hue < 130:
        name = "blue"
    else:
        name = "purple"

    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _contour_color(hsv: np.ndarray, cnt: np.ndarray, bbox: tuple) -> tuple:
    """Return (color_name, color_hex) of the pixels enclosed by *cnt*."""
    bx, by, bw, bh = bbox
    roi_hsv = hsv[by:by + bh, bx:bx + bw]
    cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
    # Shift the contour into ROI coordinates, then rasterise it filled so
    # only pixels inside the contour vote on the color.
    cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
    masked_hsv = roi_hsv[cnt_mask_roi > 0]
    return _dominant_color(masked_hsv, sat_threshold=30)


def _build_exclusion_mask(
    h: int,
    w: int,
    word_boxes: List[Dict],
    detected_boxes: List[Dict],
) -> np.ndarray:
    """Return a uint8 mask (255 = ignore) covering OCR words and box interiors."""
    exclusion = np.zeros((h, w), dtype=np.uint8)

    # Words are grown by a small margin.
    word_pad = 5
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        # A box lying left of / above the page yields a negative x2 / y2,
        # which NumPy would interpret as "counted from the end" and mask
        # almost the whole row or column.  Skip such degenerate spans.
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    # Detected boxes are shrunk by an inset, so a band along each edge of
    # the box stays outside the exclusion.
    inset = 8
    for box in detected_boxes:
        bbx = int(box.get("x", 0))
        bby = int(box.get("y", 0))
        bbw = int(box.get("w", box.get("width", 0)))
        bbh = int(box.get("h", box.get("height", 0)))
        x1 = max(0, bbx + inset)
        y1 = max(0, bby + inset)
        x2 = min(w, bbx + bbw - inset)
        y2 = min(h, bby + bbh - inset)
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    return exclusion


def _boxes_overlap(a: "GraphicElement", b: "GraphicElement") -> bool:
    """True when the bbox intersection covers > 50% of the smaller bbox."""
    ix1 = max(a.x, b.x)
    iy1 = max(a.y, b.y)
    ix2 = min(a.x + a.width, b.x + b.width)
    iy2 = min(a.y + a.height, b.y + b.height)
    if ix2 <= ix1 or iy2 <= iy1:
        return False
    inter = (ix2 - ix1) * (iy2 - iy1)
    smaller = min(a.width * a.height, b.width * b.height)
    return smaller > 0 and inter / smaller > 0.5


def _suppress_overlaps(candidates: List["GraphicElement"]) -> List["GraphicElement"]:
    """Keep candidates largest-first, dropping any that overlap a kept one."""
    kept: List[GraphicElement] = []
    for cand in sorted(candidates, key=lambda g: g.area, reverse=True):
        if not any(_boxes_overlap(cand, prev) for prev in kept):
            kept.append(cand)
    return kept


def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

    Two-pass approach:
      Pass 1 (color): Find colored elements via saturation channel.
                      No word exclusion needed — black text is invisible.
      Pass 2 (ink):   Find large black illustrations via ink mask minus
                      word exclusion.

    Args:
        img_bgr: BGR color image. ``None`` or a zero-size image yields ``[]``.
        word_boxes: List of OCR word dicts with left/top/width/height.
            ``None`` is treated as an empty list.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
        max_elements: Maximum number of elements to return; negative values
            are treated as 0.

    Returns:
        List of GraphicElement, sorted by area descending.
    """
    if img_bgr is None or img_bgr.size == 0:
        return []

    word_boxes = word_boxes or []
    detected_boxes = detected_boxes or []

    h, w = img_bgr.shape[:2]
    img_area = h * w

    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes))

    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []

    # =====================================================================
    # PASS 1 — COLOR CHANNEL (no word exclusion needed)
    # =====================================================================
    # Saturated pixels = colored ink.  Black text has sat ≈ 0 → invisible.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    # Exclude very bright backgrounds (white/near-white with color cast)
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
    color_mask = cv2.bitwise_and(sat_mask, val_mask)

    # Only remove tiny speckle — NO closing, which would merge nearby
    # colored elements into one giant blob spanning half the page.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)

    contours_color, _ = cv2.findContours(
        color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))

    for cnt in contours_color:
        area = cv2.contourArea(cnt)
        if area < 80:
            continue

        bx, by, bw, bh = cv2.boundingRect(cnt)
        if bw < 8 or bh < 8:
            continue

        # Skip page-spanning contours (background color cast / merged blobs)
        if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10:
            continue

        perimeter = cv2.arcLength(cnt, True)
        circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        aspect = bw / bh if bh > 0 else 1.0
        min_dim = min(bw, bh)

        # Classify first; the (comparatively expensive) color lookup is
        # only done for contours that are actually accepted.
        if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
            # Colored circle / balloon
            shape = "circle"
            conf = min(0.95, circularity)
        elif area > 2000 and min_dim > 20:
            # Colored illustration (large colored region)
            shape = "illustration"
            conf = 0.6
        else:
            continue

        color_name, color_hex = _contour_color(hsv, cnt, (bx, by, bw, bh))

        if shape == "circle":
            logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
                        bx, by, bw, bh, int(area), circularity, color_name)
        else:
            logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
                        bx, by, bw, bh, int(area), color_name)

        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape=shape,
            color_name=color_name, color_hex=color_hex,
            confidence=conf, contour=cnt,
        ))

    # =====================================================================
    # PASS 2 — INK (dark pixels) with word exclusion
    # Only for large black illustrations (drawings in black ink).
    # =====================================================================
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Mask out OCR words and detected box interiors.
    exclusion = _build_exclusion_mask(h, w, word_boxes, detected_boxes)
    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))

    # Remove colored regions already found in pass 1
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))

    # Only look for LARGE remaining regions (black illustrations)
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))

    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)
        min_dim = min(bw, bh)

        # Only large illustrations survive (area > 5000, min_dim > 40)
        if area < 5000 or min_dim < 40:
            continue

        # Skip page-spanning contours
        if bw > w * 0.8 and bh > h * 0.8:
            continue

        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
                    bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",
            color_name="black", color_hex="#000000",
            confidence=0.5, contour=cnt,
        ))

    # =====================================================================
    # Deduplicate overlapping results and return
    # =====================================================================
    final = _suppress_overlaps(candidates)

    # Clamp so a negative limit cannot turn into "all but the last N".
    result = final[:max(0, max_elements)]

    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")

    return result
|