feat: Sprint 2 — TrOCR ONNX, PP-DocLayout, Model Management

D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
C3: TrOCR-ONNX.md documentation
C4: OCR-Pipeline.md ONNX section added
C5: mkdocs.yml nav updated, optimum added to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-23 09:53:02 +01:00
parent c695b659fb
commit be7f5f1872
16 changed files with 3616 additions and 60 deletions

View File

@@ -0,0 +1,413 @@
"""
PP-DocLayout ONNX Document Layout Detection.
Uses PP-DocLayout ONNX model to detect document structure regions:
table, figure, title, text, list, header, footer, equation, reference, abstract
Fallback: If ONNX model not available, returns empty list (caller should
fall back to OpenCV-based detection in cv_graphic_detect.py).
DATENSCHUTZ: Die gesamte Verarbeitung erfolgt lokal.
"""
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
logger = logging.getLogger(__name__)
# Public API of this module; everything else is an internal helper.
__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------
# Index position is assumed to match the class ids emitted by the exported
# ONNX model (TODO confirm against the export script); _postprocess maps raw
# class indices into these names, falling back to "class_<i>" when out of range.
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class LayoutRegion:
    """A detected document layout region, in original-image pixel coordinates."""
    x: int  # left edge
    y: int  # top edge
    width: int  # region width in pixels (regions narrower than 5 px are dropped)
    height: int  # region height in pixels (regions shorter than 5 px are dropped)
    label: str  # human-readable class name: table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimals in _postprocess
    label_index: int  # raw class index as emitted by the model
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------
_MODEL_SEARCH_PATHS = [
# 1. Explicit environment variable
os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
# 2. Docker default cache path
"/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
# 3. Local dev relative to working directory
"models/onnx/pp-doclayout/model.onnx",
]
_onnx_session: Optional[object] = None
_model_path: Optional[str] = None
_load_attempted: bool = False
_load_error: Optional[str] = None
def _find_model_path() -> Optional[str]:
"""Search for the ONNX model file in known locations."""
for p in _MODEL_SEARCH_PATHS:
if p and Path(p).is_file():
return str(Path(p).resolve())
return None
def _load_onnx_session():
    """Create the onnxruntime session on first call; cache the outcome.

    Only the first invocation does any work — success or failure is
    remembered via _load_attempted / _load_error, so a missing model or
    runtime is reported once and never retried.

    Returns:
        The cached InferenceSession, or None when loading failed.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error
    if _load_attempted:
        return _onnx_session
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        import onnxruntime as ort  # type: ignore[import-untyped]
    except ImportError:
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # CPU-only on purpose: keeps the GPU free for OCR / LLM workloads.
        _onnx_session = ort.InferenceSession(
            path, opts, providers=["CPUExecutionProvider"]
        )
    except Exception as exc:
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)
    else:
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    return _onnx_session
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
def is_doclayout_available() -> bool:
    """True when the ONNX session is (or can be) loaded successfully."""
    session = _load_onnx_session()
    return session is not None
def get_doclayout_status() -> Dict:
    """Return a diagnostic snapshot of the DocLayout ONNX backend.

    Forces a (cached) load attempt first so the reported availability,
    model path and error reflect the actual backend state.
    """
    session = _load_onnx_session()
    return {
        "available": session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        "classes": DOCLAYOUT_CLASSES,
        "class_count": len(DOCLAYOUT_CLASSES),
    }
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------
_INPUT_SIZE = 800  # PP-DocLayout expects a square 800x800 input


def preprocess_image(img_bgr: np.ndarray) -> "tuple[np.ndarray, float, int, int]":
    """Resize, letterbox-pad and normalize an image for PP-DocLayout input.

    The image is scaled to fit inside _INPUT_SIZE x _INPUT_SIZE while keeping
    its aspect ratio, centred on a gray (114) canvas, normalized to [0, 1]
    float32 and laid out as a (1, 3, 800, 800) NCHW tensor.

    Args:
        img_bgr: BGR color image (OpenCV convention), shape (H, W, 3).

    Returns:
        (input_tensor, scale, pad_x, pad_y) — a single uniform scale factor
        plus the padding offsets, which together allow mapping detected boxes
        back to original image coordinates.

    Raises:
        ValueError: if the image has zero width or height.
    """
    orig_h, orig_w = img_bgr.shape[:2]
    if orig_w == 0 or orig_h == 0:
        # Fail loudly instead of surfacing an opaque ZeroDivisionError below.
        raise ValueError(f"preprocess_image: empty image ({orig_w}x{orig_h})")
    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio.
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    # Clamp to >= 1 px: extreme aspect ratios could otherwise round a side
    # down to 0, which cv2.resize rejects.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))
    import cv2  # local import — cv2 is always available in this service
    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114), centring the content.
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
    # Normalize to [0, 1] float32.
    blob = padded.astype(np.float32) / 255.0
    # HWC -> CHW.
    blob = blob.transpose(2, 0, 1)
    # Add batch dimension -> (1, 3, 800, 800).
    blob = np.expand_dims(blob, axis=0)
    return blob, scale, pad_x, pad_y
# ---------------------------------------------------------------------------
# Non-Maximum Suppression (NMS)
# ---------------------------------------------------------------------------
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
"""Compute IoU between two boxes [x1, y1, x2, y2]."""
ix1 = max(box_a[0], box_b[0])
iy1 = max(box_a[1], box_b[1])
ix2 = min(box_a[2], box_b[2])
iy2 = min(box_a[3], box_b[3])
inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
if inter == 0:
return 0.0
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
union = area_a + area_b - inter
return inter / union if union > 0 else 0.0
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Greedy Non-Maximum Suppression over corner-format boxes.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: boxes overlapping a kept box by at least this IoU
            are discarded.

    Returns:
        Indices of the surviving boxes, highest score first.
    """
    if len(boxes) == 0:
        return []

    def iou(a, b) -> float:
        # Intersection rectangle; an empty overlap short-circuits to 0.
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
        if inter == 0:
            return 0.0
        union = ((a[2] - a[0]) * (a[3] - a[1])
                 + (b[2] - b[0]) * (b[3] - b[1]) - inter)
        return inter / union if union > 0 else 0.0

    candidates = np.argsort(scores)[::-1].tolist()
    kept: List[int] = []
    while candidates:
        best = candidates.pop(0)
        kept.append(best)
        # Drop everything that overlaps the winner too strongly.
        candidates = [j for j in candidates
                      if iou(boxes[best], boxes[j]) < iou_threshold]
    return kept
# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into a LayoutRegion list.

    PP-DocLayout ONNX typically outputs one tensor of shape
    (1, N, 6) or three tensors (boxes, scores, class_ids).
    We handle both common formats.

    Args:
        outputs: raw output tensors from session.run().
        scale: uniform resize factor applied in preprocess_image.
        pad_x: horizontal letterbox padding applied after resize, in pixels.
        pad_y: vertical letterbox padding applied after resize, in pixels.
        orig_w: original image width, used for clamping.
        orig_h: original image height, used for clamping.
        confidence_threshold: detections scoring below this are dropped.
        max_regions: cap on the number of returned regions.

    Returns:
        Regions mapped back to original image coordinates, sorted by
        confidence descending; empty list for unrecognized output formats.
    """
    regions: List[LayoutRegion] = []
    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A single detection squeezes down to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []
        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            # (YOLO-style head: final score = objectness * best class score)
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []
    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # Single detection: squeeze collapsed to scalars/1-D; re-wrap.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []
    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]
    if len(boxes) == 0:
        return []
    # --- NMS --- (fixed IoU threshold; boxes are still in padded 800x800 space)
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]
    # --- Scale boxes back to original image coordinates ---
    # NOTE(review): assumes box coords are in the padded model-input space —
    # confirm against the export; some exports emit already-rescaled coords.
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        # Remove padding offset, then undo the resize.
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale
        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))
        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            # Discard degenerate slivers produced by clamping/rounding.
            continue
        cls_idx = int(classes[i])
        # Out-of-range class ids keep a synthetic label instead of crashing.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"
        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))
    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
# ---------------------------------------------------------------------------
# Main detection function
# ---------------------------------------------------------------------------
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using the PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending. Empty list
        when the model is unavailable, the image is empty, or inference
        raises.
    """
    session = _load_onnx_session()
    if session is None:
        return []
    if img_bgr is None or img_bgr.size == 0:
        return []

    src_h, src_w = img_bgr.shape[:2]
    tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)

    try:
        feed = {session.get_inputs()[0].name: tensor}
        outputs = session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    regions = _postprocess(
        outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=src_w,
        orig_h=src_h,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not regions:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return regions

    # Summarise per-label counts for the info log line.
    counts: Dict[str, int] = {}
    for region in regions:
        counts[region.label] = counts.get(region.label, 0) + 1
    logger.info(
        "PP-DocLayout: %d regions (%s)",
        len(regions),
        ", ".join(f"{k}: {v}" for k, v in sorted(counts.items())),
    )
    return regions