# NOTE: summary pasted from the introducing commit message:
#   D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
#   D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
#   B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
#   A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
#   A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
#   B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
#   C3: TrOCR-ONNX.md documentation
#   C4: OCR-Pipeline.md ONNX section added
#   C5: mkdocs.yml nav updated, optimum added to requirements.txt
"""
|
||
Graphical element detection for OCR pages.
|
||
|
||
Region-based approach:
|
||
1. Build a color mask (saturation channel — black text is invisible).
|
||
2. Dilate heavily to merge nearby colored pixels into regions.
|
||
3. For each region, check overlap with OCR word boxes:
|
||
- High word overlap → colored text (skip)
|
||
- Low word overlap → colored graphic / image (keep)
|
||
4. Separately detect large black-ink illustrations via ink mask.
|
||
|
||
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
import cv2
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
__all__ = ["detect_graphic_elements", "GraphicElement"]
|
||
|
||
|
||
@dataclass
|
||
class GraphicElement:
|
||
"""A detected non-text graphical element."""
|
||
x: int
|
||
y: int
|
||
width: int
|
||
height: int
|
||
area: int
|
||
shape: str # image, illustration
|
||
color_name: str # dominant color or 'black'
|
||
color_hex: str
|
||
confidence: float
|
||
contour: Any = field(default=None, repr=False)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Color helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_COLOR_HEX = {
|
||
"black": "#000000",
|
||
"gray": "#6b7280",
|
||
"red": "#dc2626",
|
||
"orange": "#ea580c",
|
||
"yellow": "#ca8a04",
|
||
"green": "#16a34a",
|
||
"blue": "#2563eb",
|
||
"purple": "#9333ea",
|
||
}
|
||
|
||
|
||
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
|
||
"""Return (color_name, color_hex) for an HSV region."""
|
||
if hsv_roi.size == 0:
|
||
return "black", _COLOR_HEX["black"]
|
||
|
||
pixels = hsv_roi.reshape(-1, 3)
|
||
sat = pixels[:, 1]
|
||
sat_mask = sat > sat_threshold
|
||
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
|
||
|
||
if sat_ratio < 0.15:
|
||
return "black", _COLOR_HEX["black"]
|
||
|
||
sat_pixels = pixels[sat_mask]
|
||
if len(sat_pixels) < 3:
|
||
return "black", _COLOR_HEX["black"]
|
||
|
||
med_hue = float(np.median(sat_pixels[:, 0]))
|
||
|
||
if med_hue < 10 or med_hue > 170:
|
||
name = "red"
|
||
elif med_hue < 25:
|
||
name = "orange"
|
||
elif med_hue < 35:
|
||
name = "yellow"
|
||
elif med_hue < 85:
|
||
name = "green"
|
||
elif med_hue < 130:
|
||
name = "blue"
|
||
else:
|
||
name = "purple"
|
||
|
||
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main detection
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def detect_graphic_elements(
|
||
img_bgr: np.ndarray,
|
||
word_boxes: List[Dict],
|
||
detected_boxes: Optional[List[Dict]] = None,
|
||
max_elements: int = 50,
|
||
) -> List[GraphicElement]:
|
||
"""Find non-text graphical regions on the page.
|
||
|
||
Region-based: dilate color mask to form regions, then check word
|
||
overlap to distinguish colored text from colored graphics.
|
||
|
||
Args:
|
||
img_bgr: BGR color image.
|
||
word_boxes: List of OCR word dicts with left/top/width/height.
|
||
detected_boxes: Optional list of detected box dicts (x/y/w/h).
|
||
max_elements: Maximum number of elements to return.
|
||
|
||
Returns:
|
||
List of GraphicElement, sorted by area descending.
|
||
"""
|
||
if img_bgr is None:
|
||
return []
|
||
|
||
# ------------------------------------------------------------------
|
||
# Try PP-DocLayout ONNX first if available
|
||
# ------------------------------------------------------------------
|
||
import os
|
||
backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
|
||
if backend in ("doclayout", "auto"):
|
||
try:
|
||
from cv_doclayout_detect import detect_layout_regions, is_doclayout_available
|
||
if is_doclayout_available():
|
||
regions = detect_layout_regions(img_bgr)
|
||
if regions:
|
||
_LABEL_TO_COLOR = {
|
||
"figure": ("image", "green", _COLOR_HEX.get("green", "#16a34a")),
|
||
"table": ("image", "blue", _COLOR_HEX.get("blue", "#2563eb")),
|
||
}
|
||
converted: List[GraphicElement] = []
|
||
for r in regions:
|
||
shape, color_name, color_hex = _LABEL_TO_COLOR.get(
|
||
r.label,
|
||
(r.label, "gray", _COLOR_HEX.get("gray", "#6b7280")),
|
||
)
|
||
converted.append(GraphicElement(
|
||
x=r.x,
|
||
y=r.y,
|
||
width=r.width,
|
||
height=r.height,
|
||
area=r.width * r.height,
|
||
shape=shape,
|
||
color_name=color_name,
|
||
color_hex=color_hex,
|
||
confidence=r.confidence,
|
||
contour=None,
|
||
))
|
||
converted.sort(key=lambda g: g.area, reverse=True)
|
||
result = converted[:max_elements]
|
||
if result:
|
||
shape_counts: Dict[str, int] = {}
|
||
for g in result:
|
||
shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
|
||
logger.info(
|
||
"GraphicDetect (PP-DocLayout): %d elements (%s)",
|
||
len(result),
|
||
", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
|
||
)
|
||
return result
|
||
except Exception as e:
|
||
logger.warning("PP-DocLayout failed, falling back to OpenCV: %s", e)
|
||
# ------------------------------------------------------------------
|
||
# OpenCV fallback (original logic)
|
||
# ------------------------------------------------------------------
|
||
|
||
h, w = img_bgr.shape[:2]
|
||
|
||
logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
|
||
w, h, len(word_boxes), len(detected_boxes or []))
|
||
|
||
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
||
candidates: List[GraphicElement] = []
|
||
|
||
# --- Build word mask (for overlap checking) ---
|
||
word_mask = np.zeros((h, w), dtype=np.uint8)
|
||
for wb in word_boxes:
|
||
x1 = max(0, int(wb.get("left", 0)))
|
||
y1 = max(0, int(wb.get("top", 0)))
|
||
x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
|
||
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
|
||
word_mask[y1:y2, x1:x2] = 255
|
||
|
||
# =====================================================================
|
||
# PASS 1 — COLORED IMAGE REGIONS
|
||
# =====================================================================
|
||
# Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
|
||
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
|
||
val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
|
||
color_pixels = cv2.bitwise_and(sat_mask, val_mask)
|
||
|
||
# Remove tiny speckle
|
||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
||
color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
|
||
|
||
# Count raw colored pixels before dilation (for density check later)
|
||
color_pixel_raw = color_pixels.copy()
|
||
|
||
# Heavy dilation to merge nearby colored elements into regions.
|
||
# A 25x25 kernel merges elements within ~12px of each other.
|
||
kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
|
||
region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
|
||
|
||
contours_regions, _ = cv2.findContours(
|
||
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||
)
|
||
logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
|
||
|
||
for cnt in contours_regions:
|
||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||
|
||
# Skip tiny regions
|
||
if bw < 15 or bh < 15:
|
||
continue
|
||
|
||
# Skip page-spanning regions
|
||
if bw > w * 0.6 or bh > h * 0.6:
|
||
logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
|
||
continue
|
||
|
||
bbox_area = bw * bh
|
||
|
||
# Check: how much of this region's bounding box overlaps with words?
|
||
roi_words = word_mask[by:by + bh, bx:bx + bw]
|
||
word_pixel_count = int(np.sum(roi_words > 0))
|
||
word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
|
||
|
||
# Check: how many OCR word centroids fall inside this region?
|
||
# Colored text that OCR detected will have multiple centroids inside.
|
||
# Actual images may have 0-1 spurious OCR artifacts.
|
||
word_centroid_count = sum(
|
||
1 for wb in word_boxes
|
||
if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw
|
||
and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh)
|
||
)
|
||
|
||
# Check: how many actual colored pixels are in this region?
|
||
roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
|
||
color_pixel_count = int(np.sum(roi_color > 0))
|
||
|
||
# Color pixel density (before any skip checks so we can log it)
|
||
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
|
||
|
||
# --- Skip heuristics for colored TEXT (not images) ---
|
||
|
||
# (a) High word-box pixel overlap → clearly text
|
||
if word_overlap > 0.40:
|
||
logger.info(
|
||
"GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
|
||
"overlap=%.0f%% centroids=%d",
|
||
bx, by, bw, bh, word_overlap * 100, word_centroid_count,
|
||
)
|
||
continue
|
||
|
||
# (b) Multiple OCR words detected inside → colored text
|
||
# (images rarely produce 2+ confident word detections)
|
||
if word_centroid_count >= 2:
|
||
logger.info(
|
||
"GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
|
||
"centroids=%d overlap=%.0f%% density=%.0f%%",
|
||
bx, by, bw, bh, word_centroid_count,
|
||
word_overlap * 100, density * 100,
|
||
)
|
||
continue
|
||
|
||
# (c) Even 1 word + some pixel overlap → likely text
|
||
if word_centroid_count >= 1 and word_overlap > 0.10:
|
||
logger.info(
|
||
"GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
|
||
"centroids=%d overlap=%.0f%%",
|
||
bx, by, bw, bh, word_centroid_count, word_overlap * 100,
|
||
)
|
||
continue
|
||
|
||
# Need a minimum number of colored pixels (not just dilated area)
|
||
if color_pixel_count < 200:
|
||
continue
|
||
|
||
# (d) Very low density → thin strokes, almost certainly text.
|
||
# Large regions (photos/illustrations) can have low color density
|
||
# because most pixels are grayscale ink. Use a lower threshold
|
||
# for regions bigger than 100×80 px.
|
||
_min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
|
||
if density < _min_density:
|
||
logger.info(
|
||
"GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
|
||
"density=%.0f%% (min=%.0f%%, likely colored text)",
|
||
bx, by, bw, bh, density * 100, _min_density * 100,
|
||
)
|
||
continue
|
||
|
||
# (e) Moderate density + small height → colored text line
|
||
if density < 0.35 and bh < h * 0.05:
|
||
logger.info(
|
||
"GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
|
||
"density=%.0f%% height=%.1f%%",
|
||
bx, by, bw, bh, density * 100, 100.0 * bh / h,
|
||
)
|
||
continue
|
||
|
||
# Determine dominant color from the actual colored pixels
|
||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||
color_px_mask = roi_color > 0
|
||
if np.sum(color_px_mask) > 0:
|
||
masked_hsv = roi_hsv[color_px_mask]
|
||
color_name, color_hex = _dominant_color(masked_hsv)
|
||
else:
|
||
color_name, color_hex = "black", _COLOR_HEX["black"]
|
||
|
||
# Confidence based on color density and low word overlap
|
||
conf = min(0.95, 0.5 + density * 0.5)
|
||
|
||
logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
|
||
bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
|
||
candidates.append(GraphicElement(
|
||
x=bx, y=by, width=bw, height=bh,
|
||
area=color_pixel_count,
|
||
shape="image",
|
||
color_name=color_name, color_hex=color_hex,
|
||
confidence=round(conf, 2), contour=cnt,
|
||
))
|
||
|
||
# =====================================================================
|
||
# PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
|
||
# =====================================================================
|
||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
||
|
||
# Exclude words and colored regions already found
|
||
exclusion = np.zeros((h, w), dtype=np.uint8)
|
||
word_pad = 5
|
||
for wb in word_boxes:
|
||
x1 = max(0, int(wb.get("left", 0)) - word_pad)
|
||
y1 = max(0, int(wb.get("top", 0)) - word_pad)
|
||
x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
|
||
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
|
||
exclusion[y1:y2, x1:x2] = 255
|
||
|
||
if detected_boxes:
|
||
for box in detected_boxes:
|
||
bbx = int(box.get("x", 0))
|
||
bby = int(box.get("y", 0))
|
||
bbw = int(box.get("w", box.get("width", 0)))
|
||
bbh = int(box.get("h", box.get("height", 0)))
|
||
inset = 8
|
||
x1 = max(0, bbx + inset)
|
||
y1 = max(0, bby + inset)
|
||
x2 = min(w, bbx + bbw - inset)
|
||
y2 = min(h, bby + bbh - inset)
|
||
if x2 > x1 and y2 > y1:
|
||
exclusion[y1:y2, x1:x2] = 255
|
||
|
||
ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
|
||
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
|
||
|
||
contours_ink, _ = cv2.findContours(
|
||
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||
)
|
||
logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
|
||
|
||
for cnt in contours_ink:
|
||
area = cv2.contourArea(cnt)
|
||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||
|
||
if area < 5000 or min(bw, bh) < 40:
|
||
continue
|
||
if bw > w * 0.8 or bh > h * 0.8:
|
||
continue
|
||
|
||
logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
|
||
bx, by, bw, bh, int(area))
|
||
candidates.append(GraphicElement(
|
||
x=bx, y=by, width=bw, height=bh,
|
||
area=int(area), shape="illustration",
|
||
color_name="black", color_hex="#000000",
|
||
confidence=0.5, contour=cnt,
|
||
))
|
||
|
||
# =====================================================================
|
||
# Deduplicate and return
|
||
# =====================================================================
|
||
candidates.sort(key=lambda g: g.area, reverse=True)
|
||
|
||
final: List[GraphicElement] = []
|
||
for c in candidates:
|
||
overlap = False
|
||
for f in final:
|
||
ix1 = max(c.x, f.x)
|
||
iy1 = max(c.y, f.y)
|
||
ix2 = min(c.x + c.width, f.x + f.width)
|
||
iy2 = min(c.y + c.height, f.y + f.height)
|
||
if ix2 > ix1 and iy2 > iy1:
|
||
inter = (ix2 - ix1) * (iy2 - iy1)
|
||
smaller = min(c.width * c.height, f.width * f.height)
|
||
if smaller > 0 and inter / smaller > 0.5:
|
||
overlap = True
|
||
break
|
||
if not overlap:
|
||
final.append(c)
|
||
|
||
result = final[:max_elements]
|
||
|
||
if result:
|
||
shape_counts: Dict[str, int] = {}
|
||
for g in result:
|
||
shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
|
||
logger.info(
|
||
"GraphicDetect: %d elements found (%s)",
|
||
len(result),
|
||
", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
|
||
)
|
||
else:
|
||
logger.info("GraphicDetect: no graphic elements found")
|
||
|
||
return result
|