D2: TrOCR ONNX export script (printed + handwritten, int8 quantization) D3: PP-DocLayout ONNX export script (download or Docker-based conversion) B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config) A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND) A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND) B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding C3: TrOCR-ONNX.md documentation C4: OCR-Pipeline.md ONNX section added C5: mkdocs.yml nav updated, optimum added to requirements.txt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
414 lines
12 KiB
Python
414 lines
12 KiB
Python
"""
|
||
PP-DocLayout ONNX Document Layout Detection.
|
||
|
||
Uses PP-DocLayout ONNX model to detect document structure regions:
|
||
table, figure, title, text, list, header, footer, equation, reference, abstract
|
||
|
||
Fallback: If ONNX model not available, returns empty list (caller should
|
||
fall back to OpenCV-based detection in cv_graphic_detect.py).
|
||
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
import os
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional
|
||
|
||
import numpy as np
|
||
|
||
# Module-level logger; callers configure handlers/levels globally.
logger = logging.getLogger(__name__)

# Public API of this module.
__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]
|
||
|
||
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------

# Position in this list corresponds to the class index emitted by the model
# (see _postprocess, which indexes into it with the raw class id).
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data types
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@dataclass
class LayoutRegion:
    """A detected document layout region, in original-image pixel coordinates."""

    x: int  # left edge
    y: int  # top edge
    width: int
    height: int
    label: str  # table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimals in _postprocess
    label_index: int  # raw class index
||
|
||
|
||
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------

# Candidate model locations, probed in order by _find_model_path().
_MODEL_SEARCH_PATHS = [
    # 1. Explicit environment variable
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
    # 2. Docker default cache path
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
    # 3. Local dev relative to working directory
    "models/onnx/pp-doclayout/model.onnx",
]

# Lazy-loading state, written once by _load_onnx_session():
_onnx_session: Optional[object] = None  # onnxruntime.InferenceSession when loaded
_model_path: Optional[str] = None  # resolved path of the loaded model
_load_attempted: bool = False  # True once a load was attempted (success or failure)
_load_error: Optional[str] = None  # human-readable reason when loading failed
|
||
|
||
|
||
def _find_model_path() -> Optional[str]:
|
||
"""Search for the ONNX model file in known locations."""
|
||
for p in _MODEL_SEARCH_PATHS:
|
||
if p and Path(p).is_file():
|
||
return str(Path(p).resolve())
|
||
return None
|
||
|
||
|
||
def _load_onnx_session():
    """Lazy-load the ONNX runtime session (once).

    The result (including failure) is cached: _load_attempted guards so a
    missing model or missing onnxruntime is probed only once per process.
    Diagnostics are recorded in _load_error / _model_path for
    get_doclayout_status().

    Returns:
        The cached onnxruntime.InferenceSession, or None if unavailable.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error

    if _load_attempted:
        return _onnx_session

    # Mark before trying so a failure is not retried on every call.
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        import onnxruntime as ort  # type: ignore[import-untyped]

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Prefer CPU – keeps the GPU free for OCR / LLM.
        providers = ["CPUExecutionProvider"]
        _onnx_session = ort.InferenceSession(path, sess_options, providers=providers)
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    except ImportError:
        # onnxruntime is an optional dependency; absence is not an error.
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
    except Exception as exc:
        # Corrupt model, incompatible opset, etc. — degrade gracefully.
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)

    return _onnx_session
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Public helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def is_doclayout_available() -> bool:
    """Return True if the ONNX model can be loaded successfully."""
    session = _load_onnx_session()
    return session is not None
|
||
|
||
|
||
def get_doclayout_status() -> Dict:
    """Return diagnostic information about the DocLayout backend."""
    # Force a load attempt so the reported state reflects reality.
    _load_onnx_session()
    status = {
        "available": _onnx_session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        "classes": DOCLAYOUT_CLASSES,
    }
    status["class_count"] = len(DOCLAYOUT_CLASSES)
    return status
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------

# Model input resolution: images are letterboxed into a square of this size.
_INPUT_SIZE = 800  # PP-DocLayout expects 800x800
|
||
|
||
|
||
def preprocess_image(img_bgr: np.ndarray) -> tuple:
    """Resize + normalize image for PP-DocLayout ONNX input.

    The image is scaled to fit within _INPUT_SIZE x _INPUT_SIZE while
    keeping its aspect ratio, then centred on a gray (114) canvas
    (letterboxing).

    Args:
        img_bgr: BGR color image of shape (H, W, 3) — presumably uint8,
            since values are divided by 255 below; verify against callers.

    Returns:
        (input_tensor, scale, pad_x, pad_y) where input_tensor has shape
        (1, 3, _INPUT_SIZE, _INPUT_SIZE) float32 in [0, 1], and scale /
        pad_x / pad_y allow mapping model-space boxes back to original
        coordinates via orig = (model_coord - pad) / scale.
    """
    orig_h, orig_w = img_bgr.shape[:2]

    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    new_w = int(orig_w * scale)
    new_h = int(orig_h * scale)

    import cv2  # local import — cv2 is always available in this service
    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114)
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized

    # Normalize to [0, 1] float32
    blob = padded.astype(np.float32) / 255.0

    # HWC → CHW
    blob = blob.transpose(2, 0, 1)

    # Add batch dimension → (1, 3, 800, 800)
    blob = np.expand_dims(blob, axis=0)

    return blob, scale, pad_x, pad_y
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Non-Maximum Suppression (NMS)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
|
||
"""Compute IoU between two boxes [x1, y1, x2, y2]."""
|
||
ix1 = max(box_a[0], box_b[0])
|
||
iy1 = max(box_a[1], box_b[1])
|
||
ix2 = min(box_a[2], box_b[2])
|
||
iy2 = min(box_a[3], box_b[3])
|
||
|
||
inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
|
||
if inter == 0:
|
||
return 0.0
|
||
|
||
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
|
||
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
|
||
union = area_a + area_b - inter
|
||
return inter / union if union > 0 else 0.0
|
||
|
||
|
||
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy Non-Maximum Suppression.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: Overlap threshold for suppression.

    Returns:
        List of kept indices.
    """
    if len(boxes) == 0:
        return []

    # Candidates sorted best-first; repeatedly accept the top candidate and
    # drop every remaining one that overlaps it at or above the threshold.
    candidates = np.argsort(scores)[::-1].tolist()
    keep: List[int] = []

    while candidates:
        best, rest = candidates[0], candidates[1:]
        keep.append(best)
        candidates = [
            idx for idx in rest
            if _compute_iou(boxes[best], boxes[idx]) < iou_threshold
        ]

    return keep
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Post-processing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into LayoutRegion list.

    PP-DocLayout ONNX typically outputs one tensor of shape
    (1, N, 6) or three tensors (boxes, scores, class_ids).
    We handle both common formats.

    Args:
        outputs: Raw tensor list returned by session.run().
        scale: Resize factor applied by preprocess_image().
        pad_x: Horizontal letterbox offset in model pixels.
        pad_y: Vertical letterbox offset in model pixels.
        orig_w: Original image width, used for clamping.
        orig_h: Original image height, used for clamping.
        confidence_threshold: Minimum score to keep a detection.
        max_regions: Maximum number of regions returned.

    Returns:
        LayoutRegion list sorted by confidence descending (possibly empty).
    """
    regions: List[LayoutRegion] = []

    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A lone detection squeezes to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []

        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            # NOTE(review): this assumes corner-format boxes here too; if the
            # exported model emits center/width (YOLO-style) boxes this
            # mapping is wrong — verify against the actual export.
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            # Final score = objectness * best per-class confidence.
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []

    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # A lone detection squeezed to scalars/1-D; rebuild the arrays.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []

    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]

    if len(boxes) == 0:
        return []

    # --- NMS ---
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]

    # --- Scale boxes back to original image coordinates ---
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]

        # Remove padding offset (inverse of preprocess_image letterboxing)
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale

        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))

        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            # Discard degenerate boxes (noise or fully clipped slivers).
            continue

        cls_idx = int(classes[i])
        # Out-of-range class ids get a synthetic "class_N" label instead of crashing.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"

        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))

    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main detection function
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending.
        Returns empty list if model is not available.
    """
    session = _load_onnx_session()
    if session is None:
        return []

    if img_bgr is None or img_bgr.size == 0:
        return []

    height, width = img_bgr.shape[:2]

    # Letterbox/normalize the image into the model's input tensor.
    blob, scale, pad_x, pad_y = preprocess_image(img_bgr)

    # Run inference; any runtime failure degrades to "no detections".
    try:
        feed = {session.get_inputs()[0].name: blob}
        raw_outputs = session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    # Decode, filter, NMS, and map boxes back to original coordinates.
    detected = _postprocess(
        raw_outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=width,
        orig_h=height,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not detected:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return detected

    # Summarize per-label counts for the log line.
    counts: Dict[str, int] = {}
    for region in detected:
        counts[region.label] = counts.get(region.label, 0) + 1
    summary = ", ".join(f"{k}: {v}" for k, v in sorted(counts.items()))
    logger.info(
        "PP-DocLayout: %d regions (%s)",
        len(detected),
        summary,
    )

    return detected
|