Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m6s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 22s
Text fragments after word exclusion are indistinguishable from arrows and icons via contour metrics. Since the goal is detecting graphics, images, boxes and colors (not arrows/icons), simplify to only: - circle/balloon (circularity > 0.55 — very reliable) - illustration (area > 3000 — clearly non-text) Boxes and colors are handled by cv_box_detect and cv_color_detect. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
310 lines
11 KiB
Python
310 lines
11 KiB
Python
"""
|
|
Graphical element detection for OCR pages.
|
|
|
|
Finds non-text visual elements (arrows, balloons, icons, illustrations)
|
|
by subtracting known OCR word regions from the page ink and analysing
|
|
remaining connected components via contour shape metrics.
|
|
|
|
Works on both color and grayscale scans.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import cv2
import numpy as np

# One logger per module, keyed by the dotted module path (PEP 282 convention).
logger = logging.getLogger(__name__)

# Explicit public API of this module.
__all__ = ["detect_graphic_elements", "GraphicElement"]
|
|
|
|
|
|
@dataclass
class GraphicElement:
    """A detected non-text graphical element on an OCR page.

    Coordinates are pixel positions in the source image. ``contour`` keeps
    the raw OpenCV contour for downstream use but is excluded from ``repr``
    to keep log output readable.
    """

    x: int  # bounding-box left edge (px)
    y: int  # bounding-box top edge (px)
    width: int  # bounding-box width (px)
    height: int  # bounding-box height (px)
    area: int  # contour area (px^2), not the bounding-box area
    shape: str  # shape category, e.g. "circle", "illustration"
    color_name: str  # dominant color name, or "black" for unsaturated ink
    color_hex: str  # hex code matching color_name, e.g. "#dc2626"
    confidence: float  # classification confidence in [0, 1]
    contour: Any = field(default=None, repr=False)  # raw numpy contour, excluded from repr
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Color helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_COLOR_HEX = {
|
|
"black": "#000000",
|
|
"gray": "#6b7280",
|
|
"red": "#dc2626",
|
|
"orange": "#ea580c",
|
|
"yellow": "#ca8a04",
|
|
"green": "#16a34a",
|
|
"blue": "#2563eb",
|
|
"purple": "#9333ea",
|
|
}
|
|
|
|
|
|
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
|
|
"""Return (color_name, color_hex) for an HSV region."""
|
|
if hsv_roi.size == 0:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
pixels = hsv_roi.reshape(-1, 3)
|
|
sat = pixels[:, 1]
|
|
sat_mask = sat > sat_threshold
|
|
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
|
|
|
|
if sat_ratio < 0.15:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
sat_pixels = pixels[sat_mask]
|
|
if len(sat_pixels) < 3:
|
|
return "black", _COLOR_HEX["black"]
|
|
|
|
med_hue = float(np.median(sat_pixels[:, 0]))
|
|
|
|
if med_hue < 10 or med_hue > 170:
|
|
name = "red"
|
|
elif med_hue < 25:
|
|
name = "orange"
|
|
elif med_hue < 35:
|
|
name = "yellow"
|
|
elif med_hue < 85:
|
|
name = "green"
|
|
elif med_hue < 130:
|
|
name = "blue"
|
|
else:
|
|
name = "purple"
|
|
|
|
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Shape classification via contour analysis
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _classify_shape(
    contour: np.ndarray,
    bw: int,
    bh: int,
    area: float,
) -> tuple:
    """Classify a contour into ``(shape_name, confidence)``.

    Only two high-confidence, clearly non-text categories are reported:

    - ``"circle"``: high circularity (balloons, round markers)
    - ``"illustration"``: large area (drawings, embedded images)

    Everything else is labelled ``"noise"`` with confidence 0.0 so the
    caller can discard probable text fragments. Boxes and colored regions
    are handled by separate detection modules.
    """
    perimeter = cv2.arcLength(contour, True)
    if perimeter > 0:
        circularity = (4 * np.pi * area) / (perimeter * perimeter)
    else:
        circularity = 0
    aspect = 1.0 if not bh > 0 else bw / bh
    min_dim = min(bw, bh)

    # Circle / balloon: text glyphs rarely exceed circularity 0.55, so this
    # is the most reliable non-text signal. Also require a roughly square
    # bounding box and a minimum size.
    looks_round = circularity > 0.55 and 0.5 < aspect < 2.0 and min_dim > 15
    if looks_round:
        return "circle", min(0.95, circularity)

    # Illustration: a region this large that survived word exclusion is
    # almost certainly a genuine drawing or embedded image.
    if area > 3000 and min_dim > 30:
        return "illustration", 0.6

    # Probable text fragment -- the caller filters these out.
    return "noise", 0.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    min_area: int = 80,
    max_area_ratio: float = 0.25,
    word_pad: int = 5,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

    Pipeline:
        1. Build an ink mask (dark pixels + saturated colored pixels).
        2. Subtract OCR word regions and detected box interiors.
        3. Find connected components, classify shapes, determine colors.

    Args:
        img_bgr: BGR color image.
        word_boxes: OCR word dicts with "left"/"top"/"width"/"height" keys.
        detected_boxes: Optional detected box dicts with "x"/"y" and
            "w"/"h" (or "width"/"height") keys.
        min_area: Minimum contour area to keep (80 filters tiny noise).
        max_area_ratio: Maximum contour area as a fraction of image area.
        word_pad: Padding in px around word boxes for exclusion (5px).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending, at most
        ``max_elements`` long. Empty list when ``img_bgr`` is None.
    """
    if img_bgr is None:
        return []

    h, w = img_bgr.shape[:2]
    max_area = int(h * w * max_area_ratio)

    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes or []))

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # --- 1. Build ink mask: dark pixels + saturated colored pixels ---
    # Otsu picks the dark/light threshold automatically per page.
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Saturated, not-too-bright pixels catch colored marks (arrows, markers)
    # that may be too light to land in the Otsu dark mask.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
    color_ink = cv2.bitwise_and(sat_mask, val_mask)

    ink_mask = cv2.bitwise_or(dark_mask, color_ink)

    # --- 2. Build exclusion mask from OCR words ---
    exclusion = np.zeros((h, w), dtype=np.uint8)

    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255

    # Also exclude detected box interiors (they contain text, not graphics)
    # but keep a border strip so arrows/icons at box edges are still found.
    if detected_boxes:
        box_inset = 8
        for box in detected_boxes:
            bx = int(box.get("x", 0))
            by = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
            x1 = max(0, bx + box_inset)
            y1 = max(0, by + box_inset)
            x2 = min(w, bx + bbw - box_inset)
            y2 = min(h, by + bbh - box_inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255

    excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
    logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)

    # Subtract exclusion from ink: what remains is candidate graphics.
    graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))

    # --- 3. Morphological cleanup ---
    # Close small gaps (connects arrow stroke + head) with a small kernel
    # so text fragments are not reconnected into large blobs.
    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
    # Remove remaining speckle noise.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)

    # --- 4. Find contours ---
    contours, _ = cv2.findContours(
        graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))

    # --- 5. Analyse and classify ---
    candidates: List[GraphicElement] = []
    skip_reasons: Dict[str, int] = {}
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            bx, by, bw, bh = cv2.boundingRect(cnt)
            reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
            logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
            skip_reasons["area_filter"] = skip_reasons.get("area_filter", 0) + 1
            continue

        bx, by, bw, bh = cv2.boundingRect(cnt)
        if bw < 8 or bh < 8:
            skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
            continue

        # Skip elements whose bounding box overlaps the exclusion zone
        # heavily: these are mostly residual strokes around word boxes.
        roi_excl = exclusion[by:by + bh, bx:bx + bw]
        excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
        if excl_ratio > 0.4:
            logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
                        excl_ratio, bx, by, bw, bh, int(area))
            skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
            continue

        # Classify shape ("noise" means probable text fragment).
        shape, conf = _classify_shape(cnt, bw, bh, area)

        if shape == "noise":
            logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
                        bx, by, bw, bh, int(area))
            skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
            continue

        # Determine dominant color from pixels inside the contour only;
        # a plain bounding-box sample would pick up background pixels too.
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
        shifted_cnt = cnt - np.array([bx, by])
        cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
        masked_hsv = roi_hsv[cnt_mask > 0]
        color_name, color_hex = _dominant_color(masked_hsv)

        logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
                    shape, bx, by, bw, bh, int(area), color_name, conf)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area),
            shape=shape,
            color_name=color_name,
            color_hex=color_hex,
            confidence=conf,
            contour=cnt,
        ))

    if skip_reasons:
        logger.info("GraphicDetect: skipped contours: %s",
                    ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))

    # Sort by area descending and cap the result count.
    candidates.sort(key=lambda g: g.area, reverse=True)
    result = candidates[:max_elements]

    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")

    return result
|