Files
breakpilot-lehrer/klausur-service/backend/cv_graphic_detect.py
Benjamin Admin be7f5f1872 feat: Sprint 2 — TrOCR ONNX, PP-DocLayout, Model Management
D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
C3: TrOCR-ONNX.md documentation
C4: OCR-Pipeline.md ONNX section added
C5: mkdocs.yml nav updated, optimum added to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 09:53:02 +01:00

423 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Graphical element detection for OCR pages.
Region-based approach:
1. Build a color mask (saturation channel — black text is invisible).
2. Dilate heavily to merge nearby colored pixels into regions.
3. For each region, check overlap with OCR word boxes:
- High word overlap → colored text (skip)
- Low word overlap → colored graphic / image (keep)
4. Separately detect large black-ink illustrations via ink mask.
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
logger = logging.getLogger(__name__)
__all__ = ["detect_graphic_elements", "GraphicElement"]
@dataclass
class GraphicElement:
    """A detected non-text graphical element.

    Represents one axis-aligned region found on the page, either by the
    PP-DocLayout ONNX backend or by the OpenCV color/ink fallback in
    detect_graphic_elements.
    """
    x: int                 # left edge of the bounding box, in pixels
    y: int                 # top edge of the bounding box, in pixels
    width: int             # bounding-box width, in pixels
    height: int            # bounding-box height, in pixels
    area: int              # colored-pixel count (PASS 1) or contour area / bbox area, depending on origin
    shape: str             # image, illustration
    color_name: str        # dominant color or 'black'
    color_hex: str         # hex code matching color_name (see _COLOR_HEX)
    confidence: float      # detector confidence in [0, 1]
    # Raw OpenCV contour when produced by the fallback path; None for
    # PP-DocLayout results. Excluded from repr (large numpy array).
    contour: Any = field(default=None, repr=False)
# ---------------------------------------------------------------------------
# Color helpers
# ---------------------------------------------------------------------------
_COLOR_HEX = {
"black": "#000000",
"gray": "#6b7280",
"red": "#dc2626",
"orange": "#ea580c",
"yellow": "#ca8a04",
"green": "#16a34a",
"blue": "#2563eb",
"purple": "#9333ea",
}
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
"""Return (color_name, color_hex) for an HSV region."""
if hsv_roi.size == 0:
return "black", _COLOR_HEX["black"]
pixels = hsv_roi.reshape(-1, 3)
sat = pixels[:, 1]
sat_mask = sat > sat_threshold
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
if sat_ratio < 0.15:
return "black", _COLOR_HEX["black"]
sat_pixels = pixels[sat_mask]
if len(sat_pixels) < 3:
return "black", _COLOR_HEX["black"]
med_hue = float(np.median(sat_pixels[:, 0]))
if med_hue < 10 or med_hue > 170:
name = "red"
elif med_hue < 25:
name = "orange"
elif med_hue < 35:
name = "yellow"
elif med_hue < 85:
name = "green"
elif med_hue < 130:
name = "blue"
else:
name = "purple"
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
# ---------------------------------------------------------------------------
# Main detection
# ---------------------------------------------------------------------------
def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical regions on the page.

    Region-based: dilate color mask to form regions, then check word
    overlap to distinguish colored text from colored graphics.

    Args:
        img_bgr: BGR color image.
        word_boxes: List of OCR word dicts with left/top/width/height.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending.
    """
    if img_bgr is None:
        return []

    # ------------------------------------------------------------------
    # Try PP-DocLayout ONNX first if available
    # ------------------------------------------------------------------
    # Backend selection via env var: "doclayout" / "auto" try the ONNX
    # model first; any other value (e.g. "opencv") skips straight to the
    # heuristic fallback below.
    import os
    backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
    if backend in ("doclayout", "auto"):
        try:
            from cv_doclayout_detect import detect_layout_regions, is_doclayout_available
            if is_doclayout_available():
                regions = detect_layout_regions(img_bgr)
                if regions:
                    # Map known layout labels to (shape, color_name, color_hex);
                    # unknown labels keep their label as shape and render gray.
                    # NOTE(review): assumes regions expose x/y/width/height/label/
                    # confidence — confirm against cv_doclayout_detect.
                    _LABEL_TO_COLOR = {
                        "figure": ("image", "green", _COLOR_HEX.get("green", "#16a34a")),
                        "table": ("image", "blue", _COLOR_HEX.get("blue", "#2563eb")),
                    }
                    converted: List[GraphicElement] = []
                    for r in regions:
                        shape, color_name, color_hex = _LABEL_TO_COLOR.get(
                            r.label,
                            (r.label, "gray", _COLOR_HEX.get("gray", "#6b7280")),
                        )
                        converted.append(GraphicElement(
                            x=r.x,
                            y=r.y,
                            width=r.width,
                            height=r.height,
                            area=r.width * r.height,
                            shape=shape,
                            color_name=color_name,
                            color_hex=color_hex,
                            confidence=r.confidence,
                            contour=None,
                        ))
                    converted.sort(key=lambda g: g.area, reverse=True)
                    result = converted[:max_elements]
                    if result:
                        shape_counts: Dict[str, int] = {}
                        for g in result:
                            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
                        logger.info(
                            "GraphicDetect (PP-DocLayout): %d elements (%s)",
                            len(result),
                            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
                        )
                    # Returns even when result is empty after slicing; only a
                    # missing model / empty `regions` falls through to OpenCV.
                    return result
        except Exception as e:
            # Best-effort: any import/inference failure degrades to the
            # OpenCV heuristics rather than failing the page.
            logger.warning("PP-DocLayout failed, falling back to OpenCV: %s", e)

    # ------------------------------------------------------------------
    # OpenCV fallback (original logic)
    # ------------------------------------------------------------------
    h, w = img_bgr.shape[:2]
    logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                 w, h, len(word_boxes), len(detected_boxes or []))
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []

    # --- Build word mask (for overlap checking) ---
    # Binary mask with 255 inside every OCR word bbox, clamped to the image.
    word_mask = np.zeros((h, w), dtype=np.uint8)
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)))
        y1 = max(0, int(wb.get("top", 0)))
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
        word_mask[y1:y2, x1:x2] = 255

    # =====================================================================
    # PASS 1 — COLORED IMAGE REGIONS
    # =====================================================================
    # Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
    # val < 240 also excludes near-white paper highlights.
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
    color_pixels = cv2.bitwise_and(sat_mask, val_mask)
    # Remove tiny speckle
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
    # Count raw colored pixels before dilation (for density check later)
    color_pixel_raw = color_pixels.copy()
    # Heavy dilation to merge nearby colored elements into regions.
    # A 25x25 kernel merges elements within ~12px of each other.
    kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
    region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
    contours_regions, _ = cv2.findContours(
        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))

    for cnt in contours_regions:
        bx, by, bw, bh = cv2.boundingRect(cnt)
        # Skip tiny regions
        if bw < 15 or bh < 15:
            continue
        # Skip page-spanning regions
        if bw > w * 0.6 or bh > h * 0.6:
            logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
            continue
        bbox_area = bw * bh
        # Check: how much of this region's bounding box overlaps with words?
        roi_words = word_mask[by:by + bh, bx:bx + bw]
        word_pixel_count = int(np.sum(roi_words > 0))
        word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
        # Check: how many OCR word centroids fall inside this region?
        # Colored text that OCR detected will have multiple centroids inside.
        # Actual images may have 0-1 spurious OCR artifacts.
        word_centroid_count = sum(
            1 for wb in word_boxes
            if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw
                and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh)
        )
        # Check: how many actual colored pixels are in this region?
        # (pre-dilation mask, so the dilation padding doesn't inflate it)
        roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
        color_pixel_count = int(np.sum(roi_color > 0))
        # Color pixel density (before any skip checks so we can log it)
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0

        # --- Skip heuristics for colored TEXT (not images) ---
        # (a) High word-box pixel overlap → clearly text
        if word_overlap > 0.40:
            logger.info(
                "GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
                "overlap=%.0f%% centroids=%d",
                bx, by, bw, bh, word_overlap * 100, word_centroid_count,
            )
            continue
        # (b) Multiple OCR words detected inside → colored text
        # (images rarely produce 2+ confident word detections)
        if word_centroid_count >= 2:
            logger.info(
                "GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%% density=%.0f%%",
                bx, by, bw, bh, word_centroid_count,
                word_overlap * 100, density * 100,
            )
            continue
        # (c) Even 1 word + some pixel overlap → likely text
        if word_centroid_count >= 1 and word_overlap > 0.10:
            logger.info(
                "GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%%",
                bx, by, bw, bh, word_centroid_count, word_overlap * 100,
            )
            continue
        # Need a minimum number of colored pixels (not just dilated area)
        if color_pixel_count < 200:
            continue
        # (d) Very low density → thin strokes, almost certainly text.
        # Large regions (photos/illustrations) can have low color density
        # because most pixels are grayscale ink. Use a lower threshold
        # for regions bigger than 100×80 px.
        _min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
        if density < _min_density:
            logger.info(
                "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
                "density=%.0f%% (min=%.0f%%, likely colored text)",
                bx, by, bw, bh, density * 100, _min_density * 100,
            )
            continue
        # (e) Moderate density + small height → colored text line
        if density < 0.35 and bh < h * 0.05:
            logger.info(
                "GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
                "density=%.0f%% height=%.1f%%",
                bx, by, bw, bh, density * 100, 100.0 * bh / h,
            )
            continue

        # Determine dominant color from the actual colored pixels
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        color_px_mask = roi_color > 0
        if np.sum(color_px_mask) > 0:
            masked_hsv = roi_hsv[color_px_mask]
            color_name, color_hex = _dominant_color(masked_hsv)
        else:
            color_name, color_hex = "black", _COLOR_HEX["black"]
        # Confidence based on color density and low word overlap
        conf = min(0.95, 0.5 + density * 0.5)
        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
                     bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=color_pixel_count,
            shape="image",
            color_name=color_name, color_hex=color_hex,
            confidence=round(conf, 2), contour=cnt,
        ))

    # =====================================================================
    # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
    # =====================================================================
    # Otsu-thresholded inverse gray image: dark ink → 255.
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Exclude words and colored regions already found
    exclusion = np.zeros((h, w), dtype=np.uint8)
    word_pad = 5
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255
    if detected_boxes:
        for box in detected_boxes:
            bbx = int(box.get("x", 0))
            bby = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
            # Inset so only the box INTERIOR is excluded — the border
            # strokes themselves stay visible to the ink pass.
            inset = 8
            x1 = max(0, bbx + inset)
            y1 = max(0, bby + inset)
            x2 = min(w, bbx + bbw - inset)
            y2 = min(h, bby + bbh - inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255
    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    # Also remove colored pixels so PASS 1 regions aren't re-detected here.
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)
        # Only sizable, roughly compact blobs qualify as illustrations.
        if area < 5000 or min(bw, bh) < 40:
            continue
        # Skip near-page-sized contours (scan borders, shadows).
        if bw > w * 0.8 or bh > h * 0.8:
            continue
        logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
                     bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",
            color_name="black", color_hex="#000000",
            confidence=0.5, contour=cnt,
        ))

    # =====================================================================
    # Deduplicate and return
    # =====================================================================
    # Greedy keep-largest: drop a candidate when >50% of the SMALLER of the
    # two bboxes is covered by an already-kept element.
    candidates.sort(key=lambda g: g.area, reverse=True)
    final: List[GraphicElement] = []
    for c in candidates:
        overlap = False
        for f in final:
            ix1 = max(c.x, f.x)
            iy1 = max(c.y, f.y)
            ix2 = min(c.x + c.width, f.x + f.width)
            iy2 = min(c.y + c.height, f.y + f.height)
            if ix2 > ix1 and iy2 > iy1:
                inter = (ix2 - ix1) * (iy2 - iy1)
                smaller = min(c.width * c.height, f.width * f.height)
                if smaller > 0 and inter / smaller > 0.5:
                    overlap = True
                    break
        if not overlap:
            final.append(c)
    result = final[:max_elements]
    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")
    return result