Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
340 lines
12 KiB
Python
340 lines
12 KiB
Python
"""
|
|
Graphical element detection for OCR pages.
|
|
|
|
Finds non-text visual elements (arrows, balloons, icons, illustrations)
|
|
by subtracting known OCR word regions from the page ink and analysing
|
|
remaining connected components via contour shape metrics.
|
|
|
|
Works on both color and grayscale scans.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
# Module-level logger, keyed by this module's import name.
logger = logging.getLogger(__name__)

# Names exported by a star-import of this module.
__all__ = ["detect_graphic_elements", "GraphicElement"]
|
|
|
|
|
|
@dataclass
class GraphicElement:
    """One non-text graphic found on a page.

    Holds the bounding box, the contour area, the classified shape, the
    dominant color and a confidence score. Fields are positional in the
    order declared below.
    """

    # Bounding box: top-left corner and size, in pixels.
    x: int
    y: int
    width: int
    height: int

    # Contour area (may be smaller than width * height).
    area: int

    # Shape label: arrow, circle, line, icon or illustration.
    shape: str

    # Dominant color name ('black' when no color dominates) and its hex code.
    color_name: str
    color_hex: str

    # Classification confidence.
    confidence: float

    # Raw numpy contour; optional and left out of repr() to keep it short.
    contour: Any = field(default=None, repr=False)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Color helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_COLOR_HEX = {
|
|
"black": "#000000",
|
|
"gray": "#6b7280",
|
|
"red": "#dc2626",
|
|
"orange": "#ea580c",
|
|
"yellow": "#ca8a04",
|
|
"green": "#16a34a",
|
|
"blue": "#2563eb",
|
|
"purple": "#9333ea",
|
|
}
|
|
|
|
|
|
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
|
|
"""Return (color_name, color_hex) for an HSV region."""
|
|
if hsv_roi.size == 0:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
pixels = hsv_roi.reshape(-1, 3)
|
|
sat = pixels[:, 1]
|
|
sat_mask = sat > sat_threshold
|
|
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
|
|
|
|
if sat_ratio < 0.15:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
sat_pixels = pixels[sat_mask]
|
|
if len(sat_pixels) < 3:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
med_hue = float(np.median(sat_pixels[:, 0]))
|
|
|
|
if med_hue < 10 or med_hue > 170:
|
|
name = "red"
|
|
elif med_hue < 25:
|
|
name = "orange"
|
|
elif med_hue < 35:
|
|
name = "yellow"
|
|
elif med_hue < 85:
|
|
name = "green"
|
|
elif med_hue < 130:
|
|
name = "blue"
|
|
else:
|
|
name = "purple"
|
|
|
|
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Shape classification via contour analysis
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _classify_shape(
    contour: np.ndarray,
    bw: int,
    bh: int,
    area: float,
) -> tuple:
    """Label a contour with a shape name and a confidence score.

    The decision uses circularity, bounding-box aspect ratio, solidity
    (area divided by convex-hull area) and the vertex count of a polygon
    approximation. Checks run in a fixed order - circle, arrow, line,
    illustration, icon - and the first match wins. Minimum-size conditions
    keep small, text-like fragments from being reported as graphics.

    Args:
        contour: Contour points as consumed by the OpenCV contour functions.
        bw: Bounding-box width in pixels.
        bh: Bounding-box height in pixels.
        area: Contour area.

    Returns:
        ``(shape_name, confidence)``; ``("noise", 0.0)`` when nothing matches.
    """
    short_side = min(bw, bh)
    long_side = max(bw, bh)

    if bh > 0:
        aspect_ratio = bw / bh
    else:
        aspect_ratio = 1.0

    outline_len = cv2.arcLength(contour, True)
    if outline_len > 0:
        roundness = (4 * np.pi * area) / (outline_len * outline_len)
    else:
        roundness = 0

    convex_area = cv2.contourArea(cv2.convexHull(contour))
    fill_ratio = area / convex_area if convex_area > 0 else 0

    # Polygon approximation with a tolerance of 3% of the perimeter.
    polygon = cv2.approxPolyDP(contour, 0.03 * outline_len, True)
    corner_count = len(polygon)

    # Circle / balloon: checked first; the size floor rejects dots and periods.
    if roundness > 0.70 and 0.6 < aspect_ratio < 1.7 and short_side > 25:
        return "circle", min(0.95, roundness)

    # Arrow: sizable, moderately solid, few vertices, clearly non-circular,
    # and with at least two convexity defects of which one is deep.
    arrow_like = (
        short_side > 20
        and long_side > 30
        and 5 <= corner_count <= 9
        and 0.35 < fill_ratio < 0.80
        and roundness < 0.35
    )
    if arrow_like:
        hull_indices = cv2.convexHull(contour, returnPoints=False)
        if len(hull_indices) >= 4:
            try:
                notches = cv2.convexityDefects(contour, hull_indices)
                if notches is not None and len(notches) >= 2:
                    # Defect depth is stored as fixed point with 8 fractional bits.
                    deepest = max(row[0][3] for row in notches) / 256.0
                    if deepest > short_side * 0.25:
                        return "arrow", min(0.75, 0.5 + deepest / long_side)
            except cv2.error:
                # Best effort: fall through to the remaining checks.
                pass

    # Line (decorative rule, separator): long enough not to be a dash/hyphen.
    is_elongated = aspect_ratio > 6.0 or aspect_ratio < 1 / 6.0
    if is_elongated and long_side > 40:
        return "line", 0.7

    # Size-based fallbacks, largest first: illustration, then generic icon.
    size_classes = (
        (3000, 30, "illustration", 0.6),
        (500, 15, "icon", 0.4),
    )
    for area_floor, side_floor, label, score in size_classes:
        if area > area_floor and short_side > side_floor:
            return label, score

    # Everything else is too small or text-like.
    return "noise", 0.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _build_ink_mask(gray: np.ndarray, hsv: np.ndarray) -> np.ndarray:
    """Return a 0/255 mask of ink: Otsu-dark pixels plus saturated colored pixels."""
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Saturated, not-too-bright pixels catch colored arrows and markers.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
    color_ink = cv2.bitwise_and(sat_mask, val_mask)

    return cv2.bitwise_or(dark_mask, color_ink)


def _build_exclusion_mask(
    h: int,
    w: int,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]],
    word_pad: int,
) -> np.ndarray:
    """Return a 0/255 mask covering padded OCR words and inset box interiors."""
    exclusion = np.zeros((h, w), dtype=np.uint8)

    for wb in word_boxes:
        left = wb.get("left", 0)
        top = wb.get("top", 0)
        x1 = max(0, int(left) - word_pad)
        y1 = max(0, int(top) - word_pad)
        x2 = min(w, int(left + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(top + wb.get("height", 0)) + word_pad)
        # A box lying outside the image can yield a negative x2/y2, which
        # NumPy would read as "from the end" and exclude most of the page.
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    # Detected box interiors contain text, not graphics. A border strip is
    # kept so arrows/icons sitting on the box edges are still found.
    if detected_boxes:
        box_inset = 8
        for box in detected_boxes:
            bx = int(box.get("x", 0))
            by = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
            x1 = max(0, bx + box_inset)
            y1 = max(0, by + box_inset)
            x2 = min(w, bx + bbw - box_inset)
            y2 = min(h, by + bbh - box_inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255

    return exclusion


def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    min_area: int = 200,
    max_area_ratio: float = 0.25,
    word_pad: int = 10,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

    1. Build an ink mask (dark + colored pixels).
    2. Subtract OCR word regions and detected boxes.
    3. Clean up morphologically, find external contours, classify shapes.

    Args:
        img_bgr: BGR color image. A single-channel (grayscale) or BGRA image
            is converted to BGR first. ``None`` or an empty array yields ``[]``.
        word_boxes: List of OCR word dicts with left/top/width/height.
            ``None`` is treated as an empty list.
        detected_boxes: Optional list of detected box dicts (x/y/w/h, with
            width/height accepted as fallbacks for w/h).
        min_area: Minimum contour area to keep (200 filters text fragments).
        max_area_ratio: Maximum area as fraction of image area.
        word_pad: Padding around word boxes for exclusion (10px covers font edges).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending.
    """
    if img_bgr is None or img_bgr.size == 0:
        return []

    word_boxes = word_boxes or []

    # Normalize channel layout so the BGR conversions below always apply.
    if img_bgr.ndim == 2 or (img_bgr.ndim == 3 and img_bgr.shape[2] == 1):
        img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)
    elif img_bgr.ndim == 3 and img_bgr.shape[2] == 4:
        img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_BGRA2BGR)

    h, w = img_bgr.shape[:2]
    max_area = int(h * w * max_area_ratio)

    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes or []))

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # --- 1. Ink mask: dark pixels + saturated colored pixels ---
    ink_mask = _build_ink_mask(gray, hsv)

    # --- 2. Exclusion mask from OCR words and detected boxes ---
    exclusion = _build_exclusion_mask(h, w, word_boxes, detected_boxes, word_pad)

    excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
    logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)

    # Subtract exclusion from ink
    graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))

    # --- 3. Morphological cleanup ---
    # Close small gaps (connects arrow stroke + head) with a small kernel so
    # text fragments are not reconnected, then open to remove small noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel)
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel)

    # --- 4. Find contours ---
    contours, _ = cv2.findContours(
        graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))

    # --- 5. Analyse and classify ---
    candidates: List[GraphicElement] = []
    skip_reasons: Dict[str, int] = {}
    for cnt in contours:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)

        if area < min_area or area > max_area:
            reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
            logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
            skip_reasons["area_filter"] = skip_reasons.get("area_filter", 0) + 1
            continue

        if bw < 8 or bh < 8:
            skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
            continue

        # Skip elements whose bounding box overlaps the exclusion zone heavily
        roi_excl = exclusion[by:by + bh, bx:bx + bw]
        excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
        if excl_ratio > 0.4:
            logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
                        excl_ratio, bx, by, bw, bh, int(area))
            skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
            continue

        shape, conf = _classify_shape(cnt, bw, bh, area)

        # Skip noise (too small or text-like)
        if shape == "noise":
            logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
                        bx, by, bw, bh, int(area))
            skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
            continue

        # Dominant color of the pixels inside the filled contour only
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
        shifted_cnt = cnt - np.array([bx, by])
        cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
        masked_hsv = roi_hsv[cnt_mask > 0]
        color_name, color_hex = _dominant_color(masked_hsv)

        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area),
            shape=shape,
            color_name=color_name,
            color_hex=color_hex,
            confidence=conf,
            contour=cnt,
        ))

    if skip_reasons:
        logger.info("GraphicDetect: skipped contours: %s",
                    ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))

    # Sort by area descending, limit count
    candidates.sort(key=lambda g: g.area, reverse=True)
    result = candidates[:max_elements]

    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")

    return result
|