Files
breakpilot-lehrer/klausur-service/backend/cv_graphic_detect.py
Benjamin Admin eeee61108a
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
fix: remove morph close that merged balloons into giant blob
The 5x5 MORPH_CLOSE was connecting scattered color pixels into one
page-spanning contour that swallowed individual balloons. Fix:
- Remove MORPH_CLOSE, keep only MORPH_OPEN for speckle removal
- Lower sat threshold 50→40 to catch more colored elements
- Filter contours spanning >50% of width OR height (was AND)
- Filter contours >10% of image area

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 14:42:51 +01:00

315 lines
11 KiB
Python

"""
Graphical element detection for OCR pages.
Two-pass approach:
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
arrows, icons) on the saturation channel alone. Black text has
zero saturation and is invisible on this channel, so no word
exclusion is needed.
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
OCR word boxes from the full ink mask and keeping only very large
remaining contours.
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
logger = logging.getLogger(__name__)
__all__ = ["detect_graphic_elements", "GraphicElement"]
@dataclass
class GraphicElement:
    """A detected non-text graphical element.

    The geometry fields describe the element's bounding box in image pixel
    coordinates. ``contour`` carries the raw contour array for callers that
    need the exact outline; it is excluded from ``repr`` and from equality
    comparison, because comparing NumPy arrays inside the generated
    ``__eq__`` raises ``ValueError`` (ambiguous truth value).
    """

    x: int  # left edge of the bounding box
    y: int  # top edge of the bounding box
    width: int  # bounding-box width
    height: int  # bounding-box height
    area: int  # contour area (not width * height)
    shape: str  # "circle" or "illustration"
    color_name: str  # dominant color name, or "black"
    color_hex: str  # hex string for color_name, e.g. "#dc2626"
    confidence: float  # heuristic score assigned by the detector
    contour: Any = field(default=None, repr=False, compare=False)
# ---------------------------------------------------------------------------
# Color helpers
# ---------------------------------------------------------------------------
_COLOR_HEX = {
"black": "#000000",
"gray": "#6b7280",
"red": "#dc2626",
"orange": "#ea580c",
"yellow": "#ca8a04",
"green": "#16a34a",
"blue": "#2563eb",
"purple": "#9333ea",
}
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
"""Return (color_name, color_hex) for an HSV region."""
if hsv_roi.size == 0:
return "black", _COLOR_HEX["black"]
pixels = hsv_roi.reshape(-1, 3)
sat = pixels[:, 1]
sat_mask = sat > sat_threshold
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
if sat_ratio < 0.15:
return "black", _COLOR_HEX["black"]
sat_pixels = pixels[sat_mask]
if len(sat_pixels) < 3:
return "black", _COLOR_HEX["black"]
med_hue = float(np.median(sat_pixels[:, 0]))
if med_hue < 10 or med_hue > 170:
name = "red"
elif med_hue < 25:
name = "orange"
elif med_hue < 35:
name = "yellow"
elif med_hue < 85:
name = "green"
elif med_hue < 130:
name = "blue"
else:
name = "purple"
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
# ---------------------------------------------------------------------------
# Main detection
# ---------------------------------------------------------------------------
def _contour_color(hsv: np.ndarray, cnt: np.ndarray,
                   bx: int, by: int, bw: int, bh: int) -> tuple:
    """Return (color_name, color_hex) sampled from the pixels inside *cnt*."""
    roi_hsv = hsv[by:by + bh, bx:bx + bw]
    # Rasterise the contour into a mask local to its bounding box so only
    # pixels inside the shape (not the box corners) are sampled.
    cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
    cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
    return _dominant_color(roi_hsv[cnt_mask_roi > 0], sat_threshold=30)


def _build_exclusion_mask(h: int, w: int,
                          word_boxes: List[Dict],
                          detected_boxes: List[Dict],
                          word_pad: int = 5,
                          box_inset: int = 8) -> np.ndarray:
    """Return a uint8 mask (255 = excluded) covering words and box interiors."""
    exclusion = np.zeros((h, w), dtype=np.uint8)

    # Word boxes are grown by word_pad on every side.
    for wb in word_boxes:
        left = wb.get("left", 0)
        top = wb.get("top", 0)
        x1 = max(0, int(left) - word_pad)
        y1 = max(0, int(top) - word_pad)
        x2 = min(w, int(left + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(top + wb.get("height", 0)) + word_pad)
        # Skip degenerate rectangles: a negative x2/y2 would otherwise be
        # interpreted as a from-the-end slice and mask most of the page.
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    # Detected boxes are shrunk by box_inset on every side before masking.
    for box in detected_boxes:
        bbx = int(box.get("x", 0))
        bby = int(box.get("y", 0))
        bbw = int(box.get("w", box.get("width", 0)))
        bbh = int(box.get("h", box.get("height", 0)))
        x1 = max(0, bbx + box_inset)
        y1 = max(0, bby + box_inset)
        x2 = min(w, bbx + bbw - box_inset)
        y2 = min(h, bby + bbh - box_inset)
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    return exclusion


def _drop_overlapping(candidates: List[GraphicElement]) -> List[GraphicElement]:
    """Keep each candidate unless it overlaps an already kept one by > 50%.

    Overlap is the bounding-box intersection divided by the smaller of the
    two bounding-box areas. Candidates are visited in the given order, so
    earlier entries win.
    """
    kept: List[GraphicElement] = []
    for c in candidates:
        c_box_area = c.width * c.height
        for f in kept:
            iw = min(c.x + c.width, f.x + f.width) - max(c.x, f.x)
            ih = min(c.y + c.height, f.y + f.height) - max(c.y, f.y)
            if iw <= 0 or ih <= 0:
                continue
            smaller = min(c_box_area, f.width * f.height)
            if smaller > 0 and (iw * ih) / smaller > 0.5:
                break
        else:
            kept.append(c)
    return kept


def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

    Two-pass approach:
      Pass 1 (color): Find colored elements via the saturation channel.
                      No word exclusion needed — black text is invisible.
      Pass 2 (ink):   Find large black illustrations via the ink mask minus
                      word/box exclusion and minus the Pass 1 color mask.

    Args:
        img_bgr: BGR color image.
        word_boxes: List of OCR word dicts with left/top/width/height.
            ``None`` is treated as an empty list.
        detected_boxes: Optional list of detected box dicts (x/y/w/h, with
            width/height accepted as fallbacks for w/h).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by contour area descending, with
        heavily overlapping detections removed. Empty if the image is
        ``None`` or has no pixels.
    """
    if img_bgr is None or img_bgr.size == 0:
        return []
    word_boxes = word_boxes or []
    detected_boxes = detected_boxes or []

    h, w = img_bgr.shape[:2]
    img_area = h * w
    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes))

    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []

    # =====================================================================
    # PASS 1 — COLOR CHANNEL (no word exclusion needed)
    # =====================================================================
    # Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    # Exclude very bright backgrounds (white/near-white with color cast)
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
    color_mask = cv2.bitwise_and(sat_mask, val_mask)

    # Only remove tiny speckle — NO closing, which would merge nearby
    # colored elements into one giant blob spanning half the page.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)

    contours_color, _ = cv2.findContours(
        color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))

    for cnt in contours_color:
        area = cv2.contourArea(cnt)
        if area < 80:
            continue
        bx, by, bw, bh = cv2.boundingRect(cnt)
        if bw < 8 or bh < 8:
            continue
        # Skip page-spanning contours (background color cast / merged blobs)
        if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10:
            continue

        perimeter = cv2.arcLength(cnt, True)
        # 4*pi*A / P^2 is 1.0 for a perfect circle and smaller otherwise.
        circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        aspect = bw / bh if bh > 0 else 1.0
        min_dim = min(bw, bh)

        # Colored circle / balloon
        if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
            color_name, color_hex = _contour_color(hsv, cnt, bx, by, bw, bh)
            conf = min(0.95, circularity)
            logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
                        bx, by, bw, bh, int(area), circularity, color_name)
            candidates.append(GraphicElement(
                x=bx, y=by, width=bw, height=bh,
                area=int(area), shape="circle",
                color_name=color_name, color_hex=color_hex,
                confidence=conf, contour=cnt,
            ))
            continue

        # Colored illustration (large colored region)
        if area > 2000 and min_dim > 20:
            color_name, color_hex = _contour_color(hsv, cnt, bx, by, bw, bh)
            logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
                        bx, by, bw, bh, int(area), color_name)
            candidates.append(GraphicElement(
                x=bx, y=by, width=bw, height=bh,
                area=int(area), shape="illustration",
                color_name=color_name, color_hex=color_hex,
                confidence=0.6, contour=cnt,
            ))

    # =====================================================================
    # PASS 2 — INK (dark pixels) with word exclusion
    # Only for large black illustrations (drawings in black ink).
    # =====================================================================
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    exclusion = _build_exclusion_mask(h, w, word_boxes, detected_boxes)
    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    # Remove colored regions already found in pass 1
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))

    # Only look for LARGE remaining regions (black illustrations)
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))

    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)
        min_dim = min(bw, bh)
        # Only large illustrations survive (area >= 5000, min_dim >= 40)
        if area < 5000 or min_dim < 40:
            continue
        # Skip page-spanning contours
        if bw > w * 0.8 and bh > h * 0.8:
            continue
        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
                    bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",
            color_name="black", color_hex="#000000",
            confidence=0.5, contour=cnt,
        ))

    # =====================================================================
    # Deduplicate overlapping results and return
    # =====================================================================
    # Largest first, so the bigger element wins when two overlap.
    candidates.sort(key=lambda g: g.area, reverse=True)
    result = _drop_overlapping(candidates)[:max_elements]

    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")
    return result