Files
breakpilot-lehrer/klausur-service/backend/cv_doclayout_detect.py
Benjamin Admin be7f5f1872 feat: Sprint 2 — TrOCR ONNX, PP-DocLayout, Model Management
D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
C3: TrOCR-ONNX.md documentation
C4: OCR-Pipeline.md ONNX section added
C5: mkdocs.yml nav updated, optimum added to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 09:53:02 +01:00

414 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PP-DocLayout ONNX Document Layout Detection.
Uses PP-DocLayout ONNX model to detect document structure regions:
table, figure, title, text, list, header, footer, equation, reference, abstract
Fallback: If ONNX model not available, returns empty list (caller should
fall back to OpenCV-based detection in cv_graphic_detect.py).
DATENSCHUTZ (data privacy): All processing happens locally.
"""
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
# Module-level logger; configuration is left to the application.
logger = logging.getLogger(__name__)
# Public API of this module.
__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------
# The position in this list is the model's raw class index (see _postprocess).
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class LayoutRegion:
    """A detected document layout region, in original-image pixel coordinates."""
    # Bounding box: top-left corner plus size, in pixels of the input image.
    x: int
    y: int
    width: int
    height: int
    label: str  # table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimal places
    label_index: int  # raw class index
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------
# Candidate model locations, checked in order by _find_model_path().
_MODEL_SEARCH_PATHS = [
    # 1. Explicit environment variable
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
    # 2. Docker default cache path
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
    # 3. Local dev relative to working directory
    "models/onnx/pp-doclayout/model.onnx",
]
# Lazy-load state shared by _load_onnx_session(); acts as a module singleton.
_onnx_session: Optional[object] = None  # ort.InferenceSession once loaded
_model_path: Optional[str] = None       # resolved path of the loaded model
_load_attempted: bool = False           # True after the first load attempt
_load_error: Optional[str] = None       # human-readable reason a load failed
def _find_model_path() -> Optional[str]:
    """Return the first existing ONNX model file from the search paths, or None."""
    candidates = (Path(entry) for entry in _MODEL_SEARCH_PATHS if entry)
    for candidate in candidates:
        if candidate.is_file():
            return str(candidate.resolve())
    return None
def _load_onnx_session():
    """Load the ONNX InferenceSession once; subsequent calls return the cache.

    On any failure the session stays None and ``_load_error`` records why;
    callers are expected to fall back to the OpenCV-based detector.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error
    if not _load_attempted:
        _load_attempted = True
        path = _find_model_path()
        if path is None:
            _load_error = "ONNX model not found in any search path"
            logger.info("PP-DocLayout: %s", _load_error)
            return None
        try:
            import onnxruntime as ort  # type: ignore[import-untyped]
            opts = ort.SessionOptions()
            opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # Prefer CPU keeps the GPU free for OCR / LLM.
            _onnx_session = ort.InferenceSession(
                path, opts, providers=["CPUExecutionProvider"]
            )
            _model_path = path
            logger.info("PP-DocLayout: model loaded from %s", path)
        except ImportError:
            _load_error = "onnxruntime not installed"
            logger.info("PP-DocLayout: %s", _load_error)
        except Exception as exc:
            _load_error = str(exc)
            logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)
    return _onnx_session
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
def is_doclayout_available() -> bool:
    """Return True if the ONNX model can be loaded successfully."""
    session = _load_onnx_session()
    return session is not None
def get_doclayout_status() -> Dict:
    """Return diagnostic information about the DocLayout backend."""
    _load_onnx_session()  # make sure a load was attempted at least once
    status = {
        "available": _onnx_session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        "classes": DOCLAYOUT_CLASSES,
        "class_count": len(DOCLAYOUT_CLASSES),
    }
    return status
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------
# Fixed network input resolution; images are letterboxed to this square size.
_INPUT_SIZE: int = 800  # PP-DocLayout expects 800x800
def preprocess_image(img_bgr: np.ndarray) -> tuple:
    """Resize + normalize image for PP-DocLayout ONNX input.

    The image is letterboxed: scaled to fit 800x800 preserving aspect
    ratio, then centered on a gray (value 114) canvas.

    Returns:
        (input_tensor, scale, pad_x, pad_y)
        input_tensor: float32 array of shape (1, 3, 800, 800), values in [0, 1].
        scale: single uniform scale factor applied to both axes.
        pad_x, pad_y: letterbox offsets inside the 800x800 frame.
        scale/pad allow mapping boxes back to original coords.
    """
    orig_h, orig_w = img_bgr.shape[:2]
    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    new_w = int(orig_w * scale)
    new_h = int(orig_h * scale)
    import cv2  # local import — cv2 is always available in this service
    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114)
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
    # Normalize to [0, 1] float32
    # NOTE(review): no ImageNet mean/std normalization is applied here —
    # confirm the exported ONNX graph embeds it, otherwise scores may be off.
    blob = padded.astype(np.float32) / 255.0
    # HWC → CHW
    blob = blob.transpose(2, 0, 1)
    # Add batch dimension → (1, 3, 800, 800)
    blob = np.expand_dims(blob, axis=0)
    return blob, scale, pad_x, pad_y
# ---------------------------------------------------------------------------
# Non-Maximum Suppression (NMS)
# ---------------------------------------------------------------------------
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
"""Compute IoU between two boxes [x1, y1, x2, y2]."""
ix1 = max(box_a[0], box_b[0])
iy1 = max(box_a[1], box_b[1])
ix2 = min(box_a[2], box_b[2])
iy2 = min(box_a[3], box_b[3])
inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
if inter == 0:
return 0.0
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
union = area_a + area_b - inter
return inter / union if union > 0 else 0.0
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy Non-Maximum Suppression.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: Overlap threshold for suppression.

    Returns:
        List of kept indices (highest score first).
    """
    if len(boxes) == 0:
        return []

    def _iou(a, b) -> float:
        # IoU of two [x1, y1, x2, y2] boxes; 0.0 when they do not overlap.
        inter_w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
        inter_h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = inter_w * inter_h
        if inter == 0:
            return 0.0
        union = ((a[2] - a[0]) * (a[3] - a[1])
                 + (b[2] - b[0]) * (b[3] - b[1]) - inter)
        return inter / union if union > 0 else 0.0

    pending = np.argsort(scores)[::-1].tolist()  # indices, best score first
    kept: List[int] = []
    while pending:
        best, pending = pending[0], pending[1:]
        kept.append(best)
        # Keep only the boxes that do not overlap the winner too much.
        pending = [j for j in pending if _iou(boxes[best], boxes[j]) < iou_threshold]
    return kept
# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into LayoutRegion list.

    PP-DocLayout ONNX typically outputs one tensor of shape
    (1, N, 6) or three tensors (boxes, scores, class_ids).
    We handle both common formats.

    Args:
        outputs: raw tensor list from session.run().
        scale, pad_x, pad_y: letterbox parameters from preprocess_image().
        orig_w, orig_h: original image dimensions, used to clamp boxes.
        confidence_threshold: minimum score a detection must reach.
        max_regions: cap on the number of returned regions.

    Returns:
        Regions in original-image coordinates, sorted by confidence descending.
    """
    regions: List[LayoutRegion] = []
    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A lone detection squeezes down to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []
        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            # Final score = objectness * best per-class probability.
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []
    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # Single detection: squeeze dropped the row axis — rebuild arrays.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []
    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]
    if len(boxes) == 0:
        return []
    # --- NMS ---
    # NOTE(review): NMS is class-agnostic here, so an overlapping box of a
    # different class gets suppressed too — confirm this is intended.
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]
    # --- Scale boxes back to original image coordinates ---
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        # Remove padding offset
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale
        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))
        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            # Discard tiny boxes (under 5 px in either dimension).
            continue
        cls_idx = int(classes[i])
        # Map class index to label; out-of-range indices get a synthetic name.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"
        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))
    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
# ---------------------------------------------------------------------------
# Main detection function
# ---------------------------------------------------------------------------
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending.
        Returns empty list if model is not available.
    """
    session = _load_onnx_session()
    # Bail out early when the backend or the input is unusable.
    if session is None or img_bgr is None or img_bgr.size == 0:
        return []
    height, width = img_bgr.shape[:2]
    # Pre-process (letterbox to the model's fixed input size)
    tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)
    # Run inference
    try:
        feed = {session.get_inputs()[0].name: tensor}
        raw_outputs = session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []
    # Post-process back into original-image coordinates
    regions = _postprocess(
        raw_outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=width,
        orig_h=height,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )
    if not regions:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return regions
    # Summarize per-label counts for the log line.
    counts: Dict[str, int] = {}
    for region in regions:
        counts[region.label] = counts.get(region.label, 0) + 1
    logger.info(
        "PP-DocLayout: %d regions (%s)",
        len(regions),
        ", ".join(f"{k}: {v}" for k, v in sorted(counts.items())),
    )
    return regions