D2: TrOCR ONNX export script (printed + handwritten, int8 quantization) D3: PP-DocLayout ONNX export script (download or Docker-based conversion) B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config) A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND) A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND) B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding C3: TrOCR-ONNX.md documentation C4: OCR-Pipeline.md ONNX section added C5: mkdocs.yml nav updated, optimum added to requirements.txt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
414 lines
12 KiB
Python
414 lines
12 KiB
Python
"""
|
||
PP-DocLayout ONNX Document Layout Detection.
|
||
|
||
Uses PP-DocLayout ONNX model to detect document structure regions:
|
||
table, figure, title, text, list, header, footer, equation, reference, abstract
|
||
|
||
Fallback: If ONNX model not available, returns empty list (caller should
|
||
fall back to OpenCV-based detection in cv_graphic_detect.py).
|
||
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
import os
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional
|
||
|
||
import numpy as np
|
||
|
||
# Module-level logger; callers configure handlers/levels globally.
logger = logging.getLogger(__name__)

# Public API of this module.
__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]
|
||
|
||
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------

# Position in this list corresponds to the class index emitted by the model
# (see _postprocess, which indexes into it with the raw class id).
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data types
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@dataclass
class LayoutRegion:
    """A detected document layout region, in original-image pixel coordinates."""

    x: int  # left edge
    y: int  # top edge
    width: int
    height: int
    label: str  # table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimals in _postprocess
    label_index: int  # raw class index
||
|
||
|
||
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------

# Candidate model locations, probed in order by _find_model_path().
_MODEL_SEARCH_PATHS = [
    # 1. Explicit environment variable
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
    # 2. Docker default cache path
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
    # 3. Local dev relative to working directory
    "models/onnx/pp-doclayout/model.onnx",
]

# Lazy-loading state, written once by _load_onnx_session():
_onnx_session: Optional[object] = None  # onnxruntime.InferenceSession when loaded
_model_path: Optional[str] = None  # resolved path of the loaded model
_load_attempted: bool = False  # True once a load was attempted (success or failure)
_load_error: Optional[str] = None  # human-readable reason when loading failed
|
||
|
||
|
||
def _find_model_path() -> Optional[str]:
|
||
"""Search for the ONNX model file in known locations."""
|
||
for p in _MODEL_SEARCH_PATHS:
|
||
if p and Path(p).is_file():
|
||
return str(Path(p).resolve())
|
||
return None
|
||
|
||
|
||
def _load_onnx_session():
    """Lazy-load the ONNX runtime session (once).

    The result (including failure) is cached: _load_attempted guards so a
    missing model or missing onnxruntime is probed only once per process.
    Diagnostics are recorded in _load_error / _model_path for
    get_doclayout_status().

    Returns:
        The cached onnxruntime.InferenceSession, or None if unavailable.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error

    if _load_attempted:
        return _onnx_session

    # Mark before trying so a failure is not retried on every call.
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        import onnxruntime as ort  # type: ignore[import-untyped]

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Prefer CPU – keeps the GPU free for OCR / LLM.
        providers = ["CPUExecutionProvider"]
        _onnx_session = ort.InferenceSession(path, sess_options, providers=providers)
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    except ImportError:
        # onnxruntime is an optional dependency; absence is not an error.
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
    except Exception as exc:
        # Corrupt model, incompatible opset, etc. — degrade gracefully.
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)

    return _onnx_session
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Public helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def is_doclayout_available() -> bool:
    """Return True if the ONNX model can be loaded successfully."""
    session = _load_onnx_session()
    return session is not None
|
||
|
||
|
||
def get_doclayout_status() -> Dict:
    """Return diagnostic information about the DocLayout backend."""
    # Force a load attempt so the reported state reflects reality.
    _load_onnx_session()
    status = {
        "available": _onnx_session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        "classes": DOCLAYOUT_CLASSES,
    }
    status["class_count"] = len(DOCLAYOUT_CLASSES)
    return status
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------

# Model input resolution: images are letterboxed into a square of this size.
_INPUT_SIZE = 800  # PP-DocLayout expects 800x800
|
||
|
||
|
||
def preprocess_image(img_bgr: np.ndarray) -> tuple:
    """Resize + normalize image for PP-DocLayout ONNX input.

    The image is scaled to fit within _INPUT_SIZE x _INPUT_SIZE while
    keeping its aspect ratio, then centred on a gray (114) canvas
    (letterboxing).

    Args:
        img_bgr: BGR color image of shape (H, W, 3) — presumably uint8,
            since values are divided by 255 below; verify against callers.

    Returns:
        (input_tensor, scale, pad_x, pad_y) where input_tensor has shape
        (1, 3, _INPUT_SIZE, _INPUT_SIZE) float32 in [0, 1], and scale /
        pad_x / pad_y allow mapping model-space boxes back to original
        coordinates via orig = (model_coord - pad) / scale.
    """
    orig_h, orig_w = img_bgr.shape[:2]

    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    new_w = int(orig_w * scale)
    new_h = int(orig_h * scale)

    import cv2  # local import — cv2 is always available in this service
    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114)
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized

    # Normalize to [0, 1] float32
    blob = padded.astype(np.float32) / 255.0

    # HWC → CHW
    blob = blob.transpose(2, 0, 1)

    # Add batch dimension → (1, 3, 800, 800)
    blob = np.expand_dims(blob, axis=0)

    return blob, scale, pad_x, pad_y
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Non-Maximum Suppression (NMS)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
|
||
"""Compute IoU between two boxes [x1, y1, x2, y2]."""
|
||
ix1 = max(box_a[0], box_b[0])
|
||
iy1 = max(box_a[1], box_b[1])
|
||
ix2 = min(box_a[2], box_b[2])
|
||
iy2 = min(box_a[3], box_b[3])
|
||
|
||
inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
|
||
if inter == 0:
|
||
return 0.0
|
||
|
||
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
|
||
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
|
||
union = area_a + area_b - inter
|
||
return inter / union if union > 0 else 0.0
|
||
|
||
|
||
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy Non-Maximum Suppression.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: Overlap threshold for suppression.

    Returns:
        List of kept indices.
    """
    if len(boxes) == 0:
        return []

    # Candidates sorted best-first; repeatedly accept the top candidate and
    # drop every remaining one that overlaps it at or above the threshold.
    candidates = np.argsort(scores)[::-1].tolist()
    keep: List[int] = []

    while candidates:
        best, rest = candidates[0], candidates[1:]
        keep.append(best)
        candidates = [
            idx for idx in rest
            if _compute_iou(boxes[best], boxes[idx]) < iou_threshold
        ]

    return keep
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Post-processing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into LayoutRegion list.

    PP-DocLayout ONNX typically outputs one tensor of shape
    (1, N, 6) or three tensors (boxes, scores, class_ids).
    We handle both common formats.

    Args:
        outputs: Raw tensor list returned by session.run().
        scale: Resize factor applied by preprocess_image().
        pad_x: Horizontal letterbox offset in model pixels.
        pad_y: Vertical letterbox offset in model pixels.
        orig_w: Original image width, used for clamping.
        orig_h: Original image height, used for clamping.
        confidence_threshold: Minimum score to keep a detection.
        max_regions: Maximum number of regions returned.

    Returns:
        LayoutRegion list sorted by confidence descending (possibly empty).
    """
    regions: List[LayoutRegion] = []

    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A lone detection squeezes to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []

        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            # NOTE(review): this assumes corner-format boxes here too; if the
            # exported model emits center/width (YOLO-style) boxes this
            # mapping is wrong — verify against the actual export.
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            # Final score = objectness * best per-class confidence.
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []

    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # A lone detection squeezed to scalars/1-D; rebuild the arrays.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []

    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]

    if len(boxes) == 0:
        return []

    # --- NMS ---
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]

    # --- Scale boxes back to original image coordinates ---
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]

        # Remove padding offset (inverse of preprocess_image letterboxing)
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale

        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))

        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            # Discard degenerate boxes (noise or fully clipped slivers).
            continue

        cls_idx = int(classes[i])
        # Out-of-range class ids get a synthetic "class_N" label instead of crashing.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"

        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))

    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main detection function
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending.
        Returns empty list if model is not available.
    """
    session = _load_onnx_session()
    if session is None:
        return []

    if img_bgr is None or img_bgr.size == 0:
        return []

    height, width = img_bgr.shape[:2]

    # Letterbox/normalize the image into the model's input tensor.
    blob, scale, pad_x, pad_y = preprocess_image(img_bgr)

    # Run inference; any runtime failure degrades to "no detections".
    try:
        feed = {session.get_inputs()[0].name: blob}
        raw_outputs = session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    # Decode, filter, NMS, and map boxes back to original coordinates.
    detected = _postprocess(
        raw_outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=width,
        orig_h=height,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not detected:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return detected

    # Summarize per-label counts for the log line.
    counts: Dict[str, int] = {}
    for region in detected:
        counts[region.label] = counts.get(region.label, 0) + 1
    summary = ", ".join(f"{k}: {v}" for k, v in sorted(counts.items()))
    logger.info(
        "PP-DocLayout: %d regions (%s)",
        len(detected),
        summary,
    )

    return detected
|