feat: Sprint 2 — TrOCR ONNX, PP-DocLayout, Model Management

D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
C3: TrOCR-ONNX.md documentation
C4: OCR-Pipeline.md ONNX section added
C5: mkdocs.yml nav updated, optimum added to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-23 09:53:02 +01:00
parent c695b659fb
commit be7f5f1872
16 changed files with 3616 additions and 60 deletions

View File

@@ -0,0 +1,413 @@
"""
PP-DocLayout ONNX Document Layout Detection.
Uses PP-DocLayout ONNX model to detect document structure regions:
table, figure, title, text, list, header, footer, equation, reference, abstract
Fallback: If ONNX model not available, returns empty list (caller should
fall back to OpenCV-based detection in cv_graphic_detect.py).
DATENSCHUTZ: Die gesamte Verarbeitung erfolgt lokal.
"""
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
logger = logging.getLogger(__name__)
# Public API of this module; everything else is an internal helper.
__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------
# Index position is assumed to match the class ids emitted by the exported
# ONNX model (TODO confirm against the export script); _postprocess maps raw
# class indices into these names, falling back to "class_<i>" when out of range.
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class LayoutRegion:
    """A detected document layout region, in original-image pixel coordinates."""
    x: int  # left edge
    y: int  # top edge
    width: int  # region width in pixels (regions narrower than 5 px are dropped)
    height: int  # region height in pixels (regions shorter than 5 px are dropped)
    label: str  # human-readable class name: table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimals in _postprocess
    label_index: int  # raw class index as emitted by the model
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------
_MODEL_SEARCH_PATHS = [
# 1. Explicit environment variable
os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
# 2. Docker default cache path
"/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
# 3. Local dev relative to working directory
"models/onnx/pp-doclayout/model.onnx",
]
_onnx_session: Optional[object] = None
_model_path: Optional[str] = None
_load_attempted: bool = False
_load_error: Optional[str] = None
def _find_model_path() -> Optional[str]:
"""Search for the ONNX model file in known locations."""
for p in _MODEL_SEARCH_PATHS:
if p and Path(p).is_file():
return str(Path(p).resolve())
return None
def _load_onnx_session():
    """Create the onnxruntime session on first call; cache the outcome.

    Only the first invocation does any work — success or failure is
    remembered via _load_attempted / _load_error, so a missing model or
    runtime is reported once and never retried.

    Returns:
        The cached InferenceSession, or None when loading failed.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error
    if _load_attempted:
        return _onnx_session
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        import onnxruntime as ort  # type: ignore[import-untyped]
    except ImportError:
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # CPU-only on purpose: keeps the GPU free for OCR / LLM workloads.
        _onnx_session = ort.InferenceSession(
            path, opts, providers=["CPUExecutionProvider"]
        )
    except Exception as exc:
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)
    else:
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    return _onnx_session
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
def is_doclayout_available() -> bool:
    """True when the ONNX session is (or can be) loaded successfully."""
    session = _load_onnx_session()
    return session is not None
def get_doclayout_status() -> Dict:
    """Return a diagnostic snapshot of the DocLayout ONNX backend.

    Forces a (cached) load attempt first so the reported availability,
    model path and error reflect the actual backend state.
    """
    session = _load_onnx_session()
    return {
        "available": session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        "classes": DOCLAYOUT_CLASSES,
        "class_count": len(DOCLAYOUT_CLASSES),
    }
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------
_INPUT_SIZE = 800  # PP-DocLayout expects a square 800x800 input


def preprocess_image(img_bgr: np.ndarray) -> "tuple[np.ndarray, float, int, int]":
    """Resize, letterbox-pad and normalize an image for PP-DocLayout input.

    The image is scaled to fit inside _INPUT_SIZE x _INPUT_SIZE while keeping
    its aspect ratio, centred on a gray (114) canvas, normalized to [0, 1]
    float32 and laid out as a (1, 3, 800, 800) NCHW tensor.

    Args:
        img_bgr: BGR color image (OpenCV convention), shape (H, W, 3).

    Returns:
        (input_tensor, scale, pad_x, pad_y) — a single uniform scale factor
        plus the padding offsets, which together allow mapping detected boxes
        back to original image coordinates.

    Raises:
        ValueError: if the image has zero width or height.
    """
    orig_h, orig_w = img_bgr.shape[:2]
    if orig_w == 0 or orig_h == 0:
        # Fail loudly instead of surfacing an opaque ZeroDivisionError below.
        raise ValueError(f"preprocess_image: empty image ({orig_w}x{orig_h})")
    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio.
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    # Clamp to >= 1 px: extreme aspect ratios could otherwise round a side
    # down to 0, which cv2.resize rejects.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))
    import cv2  # local import — cv2 is always available in this service
    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114), centring the content.
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
    # Normalize to [0, 1] float32.
    blob = padded.astype(np.float32) / 255.0
    # HWC -> CHW.
    blob = blob.transpose(2, 0, 1)
    # Add batch dimension -> (1, 3, 800, 800).
    blob = np.expand_dims(blob, axis=0)
    return blob, scale, pad_x, pad_y
# ---------------------------------------------------------------------------
# Non-Maximum Suppression (NMS)
# ---------------------------------------------------------------------------
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
"""Compute IoU between two boxes [x1, y1, x2, y2]."""
ix1 = max(box_a[0], box_b[0])
iy1 = max(box_a[1], box_b[1])
ix2 = min(box_a[2], box_b[2])
iy2 = min(box_a[3], box_b[3])
inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
if inter == 0:
return 0.0
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
union = area_a + area_b - inter
return inter / union if union > 0 else 0.0
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Greedy Non-Maximum Suppression over corner-format boxes.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: boxes overlapping a kept box by at least this IoU
            are discarded.

    Returns:
        Indices of the surviving boxes, highest score first.
    """
    if len(boxes) == 0:
        return []

    def iou(a, b) -> float:
        # Intersection rectangle; an empty overlap short-circuits to 0.
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
        if inter == 0:
            return 0.0
        union = ((a[2] - a[0]) * (a[3] - a[1])
                 + (b[2] - b[0]) * (b[3] - b[1]) - inter)
        return inter / union if union > 0 else 0.0

    candidates = np.argsort(scores)[::-1].tolist()
    kept: List[int] = []
    while candidates:
        best = candidates.pop(0)
        kept.append(best)
        # Drop everything that overlaps the winner too strongly.
        candidates = [j for j in candidates
                      if iou(boxes[best], boxes[j]) < iou_threshold]
    return kept
# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into a LayoutRegion list.

    PP-DocLayout ONNX typically outputs one tensor of shape
    (1, N, 6) or three tensors (boxes, scores, class_ids).
    We handle both common formats.

    Args:
        outputs: raw output tensors from session.run().
        scale: uniform resize factor applied in preprocess_image.
        pad_x: horizontal letterbox padding applied after resize, in pixels.
        pad_y: vertical letterbox padding applied after resize, in pixels.
        orig_w: original image width, used for clamping.
        orig_h: original image height, used for clamping.
        confidence_threshold: detections scoring below this are dropped.
        max_regions: cap on the number of returned regions.

    Returns:
        Regions mapped back to original image coordinates, sorted by
        confidence descending; empty list for unrecognized output formats.
    """
    regions: List[LayoutRegion] = []
    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A single detection squeezes down to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []
        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            # (YOLO-style head: final score = objectness * best class score)
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []
    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # Single detection: squeeze collapsed to scalars/1-D; re-wrap.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []
    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]
    if len(boxes) == 0:
        return []
    # --- NMS --- (fixed IoU threshold; boxes are still in padded 800x800 space)
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]
    # --- Scale boxes back to original image coordinates ---
    # NOTE(review): assumes box coords are in the padded model-input space —
    # confirm against the export; some exports emit already-rescaled coords.
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        # Remove padding offset, then undo the resize.
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale
        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))
        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            # Discard degenerate slivers produced by clamping/rounding.
            continue
        cls_idx = int(classes[i])
        # Out-of-range class ids keep a synthetic label instead of crashing.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"
        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))
    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
# ---------------------------------------------------------------------------
# Main detection function
# ---------------------------------------------------------------------------
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using the PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending. Empty list
        when the model is unavailable, the image is empty, or inference
        raises.
    """
    session = _load_onnx_session()
    if session is None:
        return []
    if img_bgr is None or img_bgr.size == 0:
        return []

    src_h, src_w = img_bgr.shape[:2]
    tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)

    try:
        feed = {session.get_inputs()[0].name: tensor}
        outputs = session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    regions = _postprocess(
        outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=src_w,
        orig_h=src_h,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not regions:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return regions

    # Summarise per-label counts for the info log line.
    counts: Dict[str, int] = {}
    for region in regions:
        counts[region.label] = counts.get(region.label, 0) + 1
    logger.info(
        "PP-DocLayout: %d regions (%s)",
        len(regions),
        ", ".join(f"{k}: {v}" for k, v in sorted(counts.items())),
    )
    return regions