feat: two-pass graphic detection (color channel + ink)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

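Pass 1 (color): Detect colored graphics on the HSV saturation channel.
Black text is invisible on this channel, so no word exclusion is needed.
Catches colored balloons reliably; colored arrows and icons are kept only
when they are roughly circular or large (area > 2000 px, shorter side > 20 px).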

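Pass 2 (ink): Detect large black illustrations on the dark-ink mask after
subtracting OCR word boxes, detected box interiors, and the pass-1 color
mask. Only keeps contours with area >= 5000 px and a shorter side >= 40 px
to avoid text fragments.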

Fixes: all 5 balloons now detectable (previously word exclusion zones
were eating colored graphics that overlapped with nearby OCR words).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 14:30:33 +01:00
parent 86ae71fd65
commit 1653e7cff4

View File

@@ -1,11 +1,16 @@
""" """
Graphical element detection for OCR pages. Graphical element detection for OCR pages.
Finds non-text visual elements (arrows, balloons, icons, illustrations) Two-pass approach:
by subtracting known OCR word regions from the page ink and analysing Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
remaining connected components via contour shape metrics. arrows, icons) on the saturation channel alone. Black text has
zero saturation and is invisible on this channel, so no word
exclusion is needed.
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
OCR word boxes from the full ink mask and keeping only very large
remaining contours.
Works on both color and grayscale scans. Boxes and text colors are handled by cv_box_detect / cv_color_detect.
Lizenz: Apache 2.0 (kommerziell nutzbar) Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -31,11 +36,11 @@ class GraphicElement:
width: int width: int
height: int height: int
area: int area: int
shape: str # arrow, circle, line, icon, illustration shape: str # circle, illustration
color_name: str # dominant color or 'black' color_name: str # dominant color or 'black'
color_hex: str color_hex: str
confidence: float confidence: float
contour: Any = field(default=None, repr=False) # numpy contour, excluded from repr contour: Any = field(default=None, repr=False)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -89,46 +94,6 @@ def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"]) return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
# ---------------------------------------------------------------------------
# Shape classification via contour analysis
# ---------------------------------------------------------------------------
def _classify_shape(
contour: np.ndarray,
bw: int,
bh: int,
area: float,
) -> tuple:
"""Classify contour shape → (shape_name, confidence).
Only detects high-confidence shapes that are clearly non-text:
- circle/balloon: high circularity (very reliable)
- illustration: large area (clearly a drawing/image)
Text fragments are classified as 'noise' and filtered out.
Boxes and colors are detected by separate modules.
"""
perimeter = cv2.arcLength(contour, True)
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
aspect = bw / bh if bh > 0 else 1.0
min_dim = min(bw, bh)
# --- Circle / balloon ---
# High circularity is the most reliable non-text indicator.
# Text characters rarely have circularity > 0.55.
if circularity > 0.55 and 0.5 < aspect < 2.0 and min_dim > 15:
conf = min(0.95, circularity)
return "circle", conf
# --- Illustration (drawing, image, large graphic) ---
# Large connected regions that survived word exclusion = genuine graphics.
if area > 3000 and min_dim > 30:
return "illustration", 0.6
# Everything else is likely a text fragment — skip
return "noise", 0.0
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Main detection # Main detection
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -137,24 +102,20 @@ def detect_graphic_elements(
img_bgr: np.ndarray, img_bgr: np.ndarray,
word_boxes: List[Dict], word_boxes: List[Dict],
detected_boxes: Optional[List[Dict]] = None, detected_boxes: Optional[List[Dict]] = None,
min_area: int = 80,
max_area_ratio: float = 0.25,
word_pad: int = 5,
max_elements: int = 50, max_elements: int = 50,
) -> List[GraphicElement]: ) -> List[GraphicElement]:
"""Find non-text graphical elements on the page. """Find non-text graphical elements on the page.
1. Build ink mask (dark + colored pixels). Two-pass approach:
2. Subtract OCR word regions and detected boxes. Pass 1 (color): Find colored elements via saturation channel.
3. Find connected components and classify shapes. No word exclusion needed — black text is invisible.
Pass 2 (ink): Find large black illustrations via ink mask minus
word exclusion.
Args: Args:
img_bgr: BGR color image. img_bgr: BGR color image.
word_boxes: List of OCR word dicts with left/top/width/height. word_boxes: List of OCR word dicts with left/top/width/height.
detected_boxes: Optional list of detected box dicts (x/y/w/h). detected_boxes: Optional list of detected box dicts (x/y/w/h).
min_area: Minimum contour area to keep (80 filters tiny noise).
max_area_ratio: Maximum area as fraction of image area.
word_pad: Padding around word boxes for exclusion (5px).
max_elements: Maximum number of elements to return. max_elements: Maximum number of elements to return.
Returns: Returns:
@@ -164,27 +125,100 @@ def detect_graphic_elements(
return [] return []
h, w = img_bgr.shape[:2] h, w = img_bgr.shape[:2]
max_area = int(h * w * max_area_ratio) img_area = h * w
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
w, h, len(word_boxes), len(detected_boxes or [])) w, h, len(word_boxes), len(detected_boxes or []))
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = []
# --- 1. Build ink mask: dark pixels + saturated colored pixels --- # =====================================================================
# PASS 1 — COLOR CHANNEL (no word exclusion needed)
# =====================================================================
# Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible.
sat_mask = (hsv[:, :, 1] > 50).astype(np.uint8) * 255
# Exclude very bright backgrounds (white/near-white with color cast)
val_mask = (hsv[:, :, 2] < 235).astype(np.uint8) * 255
color_mask = cv2.bitwise_and(sat_mask, val_mask)
# Morphological cleanup: close small gaps, remove speckle
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel)
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
contours_color, _ = cv2.findContours(
color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
for cnt in contours_color:
area = cv2.contourArea(cnt)
if area < 80:
continue
bx, by, bw, bh = cv2.boundingRect(cnt)
if bw < 8 or bh < 8:
continue
# Skip page-spanning contours (background color cast)
if bw > w * 0.8 and bh > h * 0.8:
continue
perimeter = cv2.arcLength(cnt, True)
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
aspect = bw / bh if bh > 0 else 1.0
min_dim = min(bw, bh)
# Colored circle / balloon
if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
# Determine color
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
conf = min(0.95, circularity)
logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
bx, by, bw, bh, int(area), circularity, color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="circle",
color_name=color_name, color_hex=color_hex,
confidence=conf, contour=cnt,
))
continue
# Colored illustration (large colored region)
if area > 2000 and min_dim > 20:
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
bx, by, bw, bh, int(area), color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration",
color_name=color_name, color_hex=color_hex,
confidence=0.6, contour=cnt,
))
continue
# =====================================================================
# PASS 2 — INK (dark pixels) with word exclusion
# Only for large black illustrations (drawings in black ink).
# =====================================================================
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Saturated colored pixels (catches colored arrows, markers) # Build exclusion mask from words
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
color_ink = cv2.bitwise_and(sat_mask, val_mask)
ink_mask = cv2.bitwise_or(dark_mask, color_ink)
# --- 2. Build exclusion mask from OCR words ---
exclusion = np.zeros((h, w), dtype=np.uint8) exclusion = np.zeros((h, w), dtype=np.uint8)
word_pad = 5
for wb in word_boxes: for wb in word_boxes:
x1 = max(0, int(wb.get("left", 0)) - word_pad) x1 = max(0, int(wb.get("left", 0)) - word_pad)
y1 = max(0, int(wb.get("top", 0)) - word_pad) y1 = max(0, int(wb.get("top", 0)) - word_pad)
@@ -192,110 +226,82 @@ def detect_graphic_elements(
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
exclusion[y1:y2, x1:x2] = 255 exclusion[y1:y2, x1:x2] = 255
# Also exclude detected box interiors (they contain text, not graphics) # Also exclude detected box regions
# But keep a border strip so arrows/icons at box edges are found
if detected_boxes: if detected_boxes:
box_inset = 8
for box in detected_boxes: for box in detected_boxes:
bx = int(box.get("x", 0)) bbx = int(box.get("x", 0))
by = int(box.get("y", 0)) bby = int(box.get("y", 0))
bbw = int(box.get("w", box.get("width", 0))) bbw = int(box.get("w", box.get("width", 0)))
bbh = int(box.get("h", box.get("height", 0))) bbh = int(box.get("h", box.get("height", 0)))
x1 = max(0, bx + box_inset) inset = 8
y1 = max(0, by + box_inset) x1 = max(0, bbx + inset)
x2 = min(w, bx + bbw - box_inset) y1 = max(0, bby + inset)
y2 = min(h, by + bbh - box_inset) x2 = min(w, bbx + bbw - inset)
y2 = min(h, bby + bbh - inset)
if x2 > x1 and y2 > y1: if x2 > x1 and y2 > y1:
exclusion[y1:y2, x1:x2] = 255 exclusion[y1:y2, x1:x2] = 255
excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0 ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)
# Subtract exclusion from ink # Remove colored regions already found in pass 1
graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion)) ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
# --- 3. Morphological cleanup --- # Only look for LARGE remaining regions (black illustrations)
# Close small gaps (connects arrow stroke + head) — but not too large contours_ink, _ = cv2.findContours(
# to avoid reconnecting text fragments ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
# Remove small noise
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
# --- 4. Find contours ---
contours, _ = cv2.findContours(
graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
logger.info("GraphicDetect: %d raw contours after exclusion", len(contours)) for cnt in contours_ink:
# --- 5. Analyse and classify ---
candidates: List[GraphicElement] = []
skip_reasons: Dict[str, int] = {}
for cnt in contours:
area = cv2.contourArea(cnt) area = cv2.contourArea(cnt)
if area < min_area or area > max_area:
bx, by, bw, bh = cv2.boundingRect(cnt)
reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
skip_reasons[f"area_filter"] = skip_reasons.get("area_filter", 0) + 1
continue
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
if bw < 8 or bh < 8: min_dim = min(bw, bh)
skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
# Only large illustrations survive (area >= 5000, min_dim >= 40)
if area < 5000 or min_dim < 40:
continue continue
# Skip elements that overlap significantly with the exclusion zone # Skip page-spanning contours
roi_excl = exclusion[by:by + bh, bx:bx + bw] if bw > w * 0.8 and bh > h * 0.8:
excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
if excl_ratio > 0.4:
logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
excl_ratio, bx, by, bw, bh, int(area))
skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
continue continue
# Classify shape logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
shape, conf = _classify_shape(cnt, bw, bh, area) bx, by, bw, bh, int(area))
# Skip noise (too small or text-like)
if shape == "noise":
logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
bx, by, bw, bh, int(area))
skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
continue
# Determine dominant color
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
shifted_cnt = cnt - np.array([bx, by])
cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask > 0]
color_name, color_hex = _dominant_color(masked_hsv)
logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
shape, bx, by, bw, bh, int(area), color_name, conf)
candidates.append(GraphicElement( candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, x=bx, y=by, width=bw, height=bh,
area=int(area), area=int(area), shape="illustration",
shape=shape, color_name="black", color_hex="#000000",
color_name=color_name, confidence=0.5, contour=cnt,
color_hex=color_hex,
confidence=conf,
contour=cnt,
)) ))
if skip_reasons: # =====================================================================
logger.info("GraphicDetect: skipped contours: %s", # Deduplicate overlapping results and return
", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items()))) # =====================================================================
# Sort by area descending, limit count
candidates.sort(key=lambda g: g.area, reverse=True) candidates.sort(key=lambda g: g.area, reverse=True)
result = candidates[:max_elements]
# Remove duplicates where bounding boxes overlap > 50%
final: List[GraphicElement] = []
for c in candidates:
overlap = False
for f in final:
# Intersection
ix1 = max(c.x, f.x)
iy1 = max(c.y, f.y)
ix2 = min(c.x + c.width, f.x + f.width)
iy2 = min(c.y + c.height, f.y + f.height)
if ix2 > ix1 and iy2 > iy1:
inter = (ix2 - ix1) * (iy2 - iy1)
smaller = min(c.width * c.height, f.width * f.height)
if smaller > 0 and inter / smaller > 0.5:
overlap = True
break
if not overlap:
final.append(c)
result = final[:max_elements]
if result: if result:
shape_counts = {} shape_counts: Dict[str, int] = {}
for g in result: for g in result:
shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1 shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
logger.info( logger.info(