Files
breakpilot-lehrer/klausur-service/backend/cv_graphic_detect.py
Benjamin Admin ba513968c5
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 18s
fix: relax graphic detection for small circles/balloons
- Lower min_area from 200 to 80 (small balloons ~100-300px²)
- Lower word_pad from 10 to 5 (10px was eating nearby graphics)
- Relax circle detection: circularity>0.55, min_dim>15 (was 0.70/25)
- Text fragments still filtered by _classify_shape noise threshold
- Add ACCEPT logging for debugging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 14:00:09 +01:00

342 lines
12 KiB
Python

"""
Graphical element detection for OCR pages.
Finds non-text visual elements (arrows, balloons, icons, illustrations)
by subtracting known OCR word regions from the page ink and analysing
remaining connected components via contour shape metrics.
Works on both color and grayscale scans.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Names exported by a star-import of this module; underscore-prefixed helpers stay private.
__all__ = ["detect_graphic_elements", "GraphicElement"]
@dataclass
class GraphicElement:
    """Record describing one non-text graphic found on a page.

    Instances are produced by ``detect_graphic_elements``: the bounding box
    comes from the contour's bounding rectangle, ``area`` from the contour
    area, and ``shape``/``confidence`` from the shape classifier.
    """

    # Bounding box: top-left corner and extent.
    x: int
    y: int
    width: int
    height: int

    # Contour area (truncated to int by the detector).
    area: int

    # Shape label: arrow, circle, line, icon or illustration.
    shape: str

    # Dominant color name (or 'black') and its matching hex code.
    color_name: str
    color_hex: str

    # Classifier confidence for ``shape``.
    confidence: float

    # Raw numpy contour; left out of repr() to keep log output short.
    contour: Any = field(repr=False, default=None)
# ---------------------------------------------------------------------------
# Color helpers
# ---------------------------------------------------------------------------
# Display hex code for every color name the detector can report.
_COLOR_HEX = {
    "black": "#000000",
    "gray": "#6b7280",
    "red": "#dc2626",
    "orange": "#ea580c",
    "yellow": "#ca8a04",
    "green": "#16a34a",
    "blue": "#2563eb",
    "purple": "#9333ea",
}


def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
    """Return ``(color_name, color_hex)`` for an HSV region.

    Args:
        hsv_roi: HSV pixels in any shape that reshapes to ``(-1, 3)``.
            Hue is assumed to use OpenCV's 8-bit scale (0..179), which is
            what the thresholds below are written for.
        sat_threshold: A pixel counts as "colored" when its saturation is
            strictly above this value.

    Returns:
        ``("black", "#000000")`` when the region is empty, when fewer than
        15% of its pixels are colored, or when fewer than 3 colored pixels
        exist; otherwise the name/hex of the median hue of the colored pixels.
    """
    black = ("black", _COLOR_HEX["black"])
    if hsv_roi.size == 0:
        return black

    pixels = hsv_roi.reshape(-1, 3)
    sat_mask = pixels[:, 1] > sat_threshold
    sat_ratio = np.sum(sat_mask) / len(pixels)
    if sat_ratio < 0.15:
        return black

    sat_pixels = pixels[sat_mask]
    if len(sat_pixels) < 3:
        return black

    # Hue is circular: red sits on both sides of the 0/179 wrap, so a plain
    # median over e.g. [1, 2, 3, 176, 177, 178] lands near 90 ("blue").
    # Take the median of offsets around the circular mean hue instead.
    # Cast first: uint8 arithmetic would overflow in the offset computation.
    hue = sat_pixels[:, 0].astype(np.float64)
    angles = hue * (np.pi / 90.0)  # 180 hue units == full circle
    mean_angle = np.arctan2(np.sin(angles).mean(), np.cos(angles).mean())
    # Integer center keeps the arithmetic exact for integer hues, so values
    # sitting exactly on a threshold classify the same as a plain median.
    center = int(round(float(mean_angle) * 90.0 / np.pi)) % 180
    offsets = (hue - center + 90) % 180 - 90  # signed distance in [-90, 90)
    med_hue = (center + float(np.median(offsets))) % 180

    if med_hue < 10 or med_hue > 170:
        name = "red"
    elif med_hue < 25:
        name = "orange"
    elif med_hue < 35:
        name = "yellow"
    elif med_hue < 85:
        name = "green"
    elif med_hue < 130:
        name = "blue"
    else:
        name = "purple"
    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
# ---------------------------------------------------------------------------
# Shape classification via contour analysis
# ---------------------------------------------------------------------------
def _classify_shape(
    contour: np.ndarray,
    bw: int,
    bh: int,
    area: float,
) -> tuple:
    """Label a contour and return ``(shape_name, confidence)``.

    The decision combines circularity, bounding-box aspect ratio, solidity
    (area / convex-hull area) and the vertex count of a polygon
    approximation. Rules are tried in a fixed order -- circle, arrow, line,
    illustration, icon -- and each carries a minimum size so that small text
    fragments end up as ``("noise", 0.0)``.

    Args:
        contour: Contour points as accepted by the OpenCV contour functions.
        bw: Bounding-box width of the contour.
        bh: Bounding-box height of the contour.
        area: Contour area.
    """
    short_side, long_side = min(bw, bh), max(bw, bh)
    aspect = 1.0 if bh <= 0 else bw / bh

    perimeter = cv2.arcLength(contour, True)
    circularity = 0
    if perimeter > 0:
        circularity = (4 * np.pi * area) / (perimeter * perimeter)

    hull_area = cv2.contourArea(cv2.convexHull(contour))
    solidity = area / hull_area if hull_area > 0 else 0

    # Polygon approximation with a tolerance of 3% of the perimeter.
    vertices = len(cv2.approxPolyDP(contour, 0.03 * perimeter, True))

    # Circle / balloon: tried first; must exceed 15px so dots and periods
    # are not reported.
    if short_side > 15 and 0.5 < aspect < 2.0 and circularity > 0.55:
        return "circle", min(0.95, circularity)

    # Arrow: deliberately strict -- sizable, few vertices, clearly concave.
    arrow_like = (
        short_side > 20
        and long_side > 30
        and 5 <= vertices <= 9
        and 0.35 < solidity < 0.80
        and circularity < 0.35
    )
    if arrow_like:
        hull_indices = cv2.convexHull(contour, returnPoints=False)
        if len(hull_indices) >= 4:
            try:
                defects = cv2.convexityDefects(contour, hull_indices)
            except cv2.error:
                # Best effort: if OpenCV rejects the contour, fall through
                # to the remaining rules.
                defects = None
            if defects is not None and len(defects) >= 2:
                # Defect depth is stored as fixed point with 8 fractional bits.
                deepest = max(row[0][3] for row in defects) / 256.0
                if deepest > short_side * 0.25:
                    return "arrow", min(0.75, 0.5 + deepest / long_side)

    # Line (decorative rule, separator): long enough not to be a dash/hyphen.
    elongated = aspect > 6.0 or aspect < 1 / 6.0
    if elongated and long_side > 40:
        return "line", 0.7

    # Larger illustration (drawing, image).
    if area > 3000 and short_side > 30:
        return "illustration", 0.6

    # Generic icon (moderate size, non-text shape).
    if area > 500 and short_side > 15:
        return "icon", 0.4

    # Anything left is too small or text-like.
    return "noise", 0.0
# ---------------------------------------------------------------------------
# Main detection
# ---------------------------------------------------------------------------
def _build_ink_mask(gray: np.ndarray, hsv: np.ndarray) -> np.ndarray:
    """Return a 0/255 mask of ink: Otsu-dark pixels plus saturated colored pixels."""
    _, dark_mask = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
    )
    # Saturated, not-too-bright pixels catch colored arrows and markers that
    # the darkness threshold alone would miss.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
    color_ink = cv2.bitwise_and(sat_mask, val_mask)
    return cv2.bitwise_or(dark_mask, color_ink)


def _build_exclusion_mask(
    h: int,
    w: int,
    word_boxes: List[Dict],
    detected_boxes: List[Dict],
    word_pad: int,
) -> np.ndarray:
    """Return a 0/255 mask covering padded OCR words and detected-box interiors."""
    exclusion = np.zeros((h, w), dtype=np.uint8)

    for wb in word_boxes:
        left = wb.get("left", 0)
        top = wb.get("top", 0)
        x1 = max(0, int(left) - word_pad)
        y1 = max(0, int(top) - word_pad)
        x2 = min(w, int(left + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(top + wb.get("height", 0)) + word_pad)
        # A box lying off the top/left of the page yields a negative end
        # index, which numpy would read as "count from the far edge" and
        # blank out most of the page. Only paint proper rectangles.
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    # Detected box interiors contain text, not graphics -- but keep a border
    # strip so arrows/icons sitting on the box edge are still found.
    box_inset = 8
    for box in detected_boxes:
        bx = int(box.get("x", 0))
        by = int(box.get("y", 0))
        bbw = int(box.get("w", box.get("width", 0)))
        bbh = int(box.get("h", box.get("height", 0)))
        x1 = max(0, bx + box_inset)
        y1 = max(0, by + box_inset)
        x2 = min(w, bx + bbw - box_inset)
        y2 = min(h, by + bbh - box_inset)
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    return exclusion


def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    min_area: int = 80,
    max_area_ratio: float = 0.25,
    word_pad: int = 5,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical elements on the page.

    1. Build ink mask (dark + colored pixels).
    2. Subtract OCR word regions and detected boxes.
    3. Find connected components and classify shapes.

    Args:
        img_bgr: BGR color image. A single-channel (grayscale) array is also
            accepted and converted to BGR first.
        word_boxes: List of OCR word dicts with left/top/width/height.
            ``None`` is treated as an empty list.
        detected_boxes: Optional list of detected box dicts (x/y/w/h, with
            width/height accepted as fallback keys).
        min_area: Minimum contour area to keep (80 filters tiny noise).
        max_area_ratio: Maximum area as fraction of image area.
        word_pad: Padding around word boxes for exclusion (5px).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending. Empty when the
        image is ``None`` or has no pixels.
    """
    if img_bgr is None or img_bgr.size == 0:
        return []

    word_boxes = word_boxes or []
    detected_boxes = detected_boxes or []

    # The BGR->GRAY/HSV conversions below reject single-channel input, so
    # promote grayscale scans to 3 channels first.
    if img_bgr.ndim == 2 or (img_bgr.ndim == 3 and img_bgr.shape[2] == 1):
        img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)

    h, w = img_bgr.shape[:2]
    max_area = int(h * w * max_area_ratio)

    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                w, h, len(word_boxes), len(detected_boxes))

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # --- 1. Ink mask: dark pixels + saturated colored pixels ---
    ink_mask = _build_ink_mask(gray, hsv)

    # --- 2. Exclusion mask from OCR words and detected boxes ---
    exclusion = _build_exclusion_mask(h, w, word_boxes, detected_boxes, word_pad)

    excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w))
    logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)

    # Subtract exclusion from ink
    graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))

    # --- 3. Morphological cleanup ---
    # Close small gaps (connects arrow stroke + head) -- but keep the kernel
    # small to avoid reconnecting text fragments. Then open to drop specks.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel)
    graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel)

    # --- 4. Find contours ---
    contours, _ = cv2.findContours(
        graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))

    # --- 5. Analyse and classify ---
    candidates: List[GraphicElement] = []
    skip_reasons: Dict[str, int] = {}

    for cnt in contours:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)

        if area < min_area or area > max_area:
            reason = (f"area={int(area)}<{min_area}" if area < min_area
                      else f"area={int(area)}>{max_area}")
            logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d",
                        reason, bx, by, bw, bh)
            skip_reasons["area_filter"] = skip_reasons.get("area_filter", 0) + 1
            continue

        if bw < 8 or bh < 8:
            skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
            continue

        # Skip elements whose bounding box overlaps significantly with the
        # exclusion zone.
        roi_excl = exclusion[by:by + bh, bx:bx + bw]
        excl_ratio = np.sum(roi_excl > 0) / (bw * bh)
        if excl_ratio > 0.4:
            logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
                        excl_ratio, bx, by, bw, bh, int(area))
            skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
            continue

        shape, conf = _classify_shape(cnt, bw, bh, area)

        # Skip noise (too small or text-like)
        if shape == "noise":
            logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
                        bx, by, bw, bh, int(area))
            skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
            continue

        # Dominant color of the pixels inside the filled contour only: the
        # contour is shifted into ROI coordinates and rasterised as a mask.
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
        shifted_cnt = cnt - np.array([bx, by])
        cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
        masked_hsv = roi_hsv[cnt_mask > 0]
        color_name, color_hex = _dominant_color(masked_hsv)

        logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
                    shape, bx, by, bw, bh, int(area), color_name, conf)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area),
            shape=shape,
            color_name=color_name,
            color_hex=color_hex,
            confidence=conf,
            contour=cnt,
        ))

    if skip_reasons:
        logger.info("GraphicDetect: skipped contours: %s",
                    ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))

    # Sort by area descending, limit count
    candidates.sort(key=lambda g: g.area, reverse=True)
    result = candidates[:max_elements]

    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")

    return result