"""
PP-DocLayout ONNX Document Layout Detection.

Uses the PP-DocLayout ONNX model to detect document structure regions:
    table, figure, title, text, list, header, footer, equation,
    reference, abstract

Fallback: if the ONNX model is not available, an empty list is returned
(the caller should fall back to the OpenCV-based detection in
cv_graphic_detect.py).

PRIVACY: All processing happens locally.
"""

import logging
import os
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

__all__ = [
    "detect_layout_regions",
    "is_doclayout_available",
    "get_doclayout_status",
    "LayoutRegion",
    "DOCLAYOUT_CLASSES",
]

# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------

DOCLAYOUT_CLASSES = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]


# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclass
class LayoutRegion:
    """A detected document layout region in original-image pixel coordinates."""

    x: int  # left edge
    y: int  # top edge
    width: int
    height: int
    label: str  # table, figure, title, text, list, etc.
    confidence: float  # detection score, rounded to 4 decimals
    label_index: int  # raw class index


# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------

# Candidate locations, checked in order. The environment variable is read
# once, at import time.
_MODEL_SEARCH_PATHS = [
    # 1. Explicit environment variable
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
    # 2. Docker default cache path
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
    # 3. Local dev relative to working directory
    "models/onnx/pp-doclayout/model.onnx",
]

# Lazy-load state; written only by _load_onnx_session().
_onnx_session: Optional[object] = None
_model_path: Optional[str] = None
_load_attempted: bool = False
_load_error: Optional[str] = None


def _find_model_path() -> Optional[str]:
    """Return the resolved path of the first existing model file, else None."""
    for candidate in _MODEL_SEARCH_PATHS:
        # Empty entries come from an unset DOCLAYOUT_ONNX_PATH.
        if candidate and Path(candidate).is_file():
            return str(Path(candidate).resolve())
    return None


def _load_onnx_session():
    """Lazy-load the ONNX runtime session (attempted at most once).

    Returns:
        The cached ``onnxruntime.InferenceSession`` or ``None`` when the
        model file or ``onnxruntime`` is missing or loading failed. The
        reason is stored in ``_load_error``.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error

    if _load_attempted:
        return _onnx_session
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        import onnxruntime as ort  # type: ignore[import-untyped]

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Prefer CPU – keeps the GPU free for OCR / LLM.
        providers = ["CPUExecutionProvider"]
        _onnx_session = ort.InferenceSession(path, sess_options, providers=providers)
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    except ImportError:
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
    except Exception as exc:
        # Deliberate best-effort boundary: any load failure disables the
        # backend instead of crashing the caller.
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)

    return _onnx_session


# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------

def is_doclayout_available() -> bool:
    """Return True if the ONNX model can be loaded successfully."""
    return _load_onnx_session() is not None


def get_doclayout_status() -> Dict:
    """Return diagnostic information about the DocLayout backend.

    Triggers the (one-time) load attempt so the reported state is final.
    """
    _load_onnx_session()  # ensure we tried
    return {
        "available": _onnx_session is not None,
        "model_path": _model_path,
        "load_error": _load_error,
        # Copy so callers cannot mutate the module-level class table.
        "classes": list(DOCLAYOUT_CLASSES),
        "class_count": len(DOCLAYOUT_CLASSES),
    }


# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------

_INPUT_SIZE = 800  # PP-DocLayout expects 800x800


def preprocess_image(img_bgr: np.ndarray) -> Tuple[np.ndarray, float, int, int]:
    """Letterbox-resize and normalize an image for PP-DocLayout ONNX input.

    Args:
        img_bgr: Non-empty image as (H, W, 3) BGR. Grayscale (H, W) /
            (H, W, 1) and 4-channel (H, W, 4) inputs are converted to
            three channels first.

    Returns:
        ``(input_tensor, scale, pad_x, pad_y)`` where ``input_tensor`` is a
        float32 array of shape (1, 3, _INPUT_SIZE, _INPUT_SIZE) scaled to
        [0, 1], ``scale`` is the uniform resize factor, and ``pad_x`` /
        ``pad_y`` are the left/top padding in pixels. Together they allow
        mapping boxes back to original coordinates.
    """
    import cv2  # local import — cv2 is always available in this service

    # Normalise channel layout so the padded-canvas assignment cannot fail.
    if img_bgr.ndim == 2:
        img_bgr = np.stack([img_bgr] * 3, axis=-1)
    elif img_bgr.shape[2] == 1:
        img_bgr = np.repeat(img_bgr, 3, axis=2)
    elif img_bgr.shape[2] == 4:
        # Drop the alpha channel; cv2 wants a contiguous buffer.
        img_bgr = np.ascontiguousarray(img_bgr[:, :, :3])

    orig_h, orig_w = img_bgr.shape[:2]

    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    # Clamp to >= 1 px: extreme aspect ratios would otherwise truncate to 0
    # and cv2.resize rejects empty target sizes.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))

    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114), image centered
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized

    # Normalize to [0, 1] float32
    blob = padded.astype(np.float32) / 255.0
    # HWC → CHW
    blob = blob.transpose(2, 0, 1)
    # Add batch dimension → (1, 3, 800, 800)
    blob = np.expand_dims(blob, axis=0)

    return blob, scale, pad_x, pad_y


# ---------------------------------------------------------------------------
# Non-Maximum Suppression (NMS)
# ---------------------------------------------------------------------------

def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
    """Compute IoU between two boxes [x1, y1, x2, y2].

    Returns 0.0 when the boxes do not overlap or the union is not positive.
    """
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])

    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    if inter == 0:
        return 0.0

    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter

    return inter / union if union > 0 else 0.0


def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy, class-agnostic Non-Maximum Suppression.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: A box is dropped when its IoU with an already kept
            box is >= this value.

    Returns:
        List of kept indices into ``boxes``, highest score first.
    """
    if len(boxes) == 0:
        return []

    boxes = np.asarray(boxes)
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    order = np.argsort(scores)[::-1]
    keep: List[int] = []

    while order.size > 0:
        best = int(order[0])
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break

        # IoU of the kept box against all remaining candidates at once
        # (same formula as _compute_iou, vectorised).
        inter_w = np.maximum(0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        inter = inter_w * inter_h
        union = areas[best] + areas[rest] - inter

        # IoU stays 0 where there is no overlap or the union is degenerate.
        iou = np.zeros(rest.shape, dtype=np.float64)
        np.divide(inter, union, out=iou, where=(inter > 0) & (union > 0))

        order = rest[iou < iou_threshold]

    return keep


# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------

def _split_outputs(outputs: list) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Normalise raw ONNX outputs to ``(boxes (N, 4), scores (N,), classes (N,))``.

    Returns None when there are no detections or the layout is not
    recognised (a warning is logged in the latter case).
    """
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        # NOTE(review): some detector exports order the columns as
        # [class_id, score, x1, y1, x2, y2] — confirm against the deployed model.
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            raw = raw.reshape(1, -1)
        if raw.ndim != 2:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return None
        if raw.shape[0] == 0:
            return None

        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            return raw[:, :4], raw[:, 4], raw[:, 5].astype(int)
        if raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            cls_scores = raw[:, 5:]
            classes = np.argmax(cls_scores, axis=1)
            scores = raw[:, 4] * np.max(cls_scores, axis=1)
            return raw[:, :4], scores, classes

        logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
        return None

    if len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        boxes = np.squeeze(outputs[0])
        if boxes.ndim == 1:
            # A single detection collapses to (4,) after squeeze.
            boxes = boxes.reshape(1, -1)
        scores = np.atleast_1d(np.squeeze(outputs[1]))
        classes = np.atleast_1d(np.squeeze(outputs[2]).astype(int))

        consistent = (
            boxes.ndim == 2
            and boxes.shape[1] == 4
            and scores.shape == (boxes.shape[0],)
            and classes.shape == (boxes.shape[0],)
        )
        if not consistent:
            logger.warning(
                "PP-DocLayout: inconsistent output shapes %s / %s / %s",
                boxes.shape, scores.shape, classes.shape,
            )
            return None
        return boxes, scores, classes

    logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
    return None


def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into a LayoutRegion list.

    Handles one tensor of shape (1, N, 6) / (1, N, 5+num_classes) or three
    tensors (boxes, scores, class_ids).

    Args:
        outputs: Raw arrays returned by the ONNX session.
        scale, pad_x, pad_y: Letterbox parameters from ``preprocess_image``.
        orig_w, orig_h: Size of the original image; boxes are clamped to it.
        confidence_threshold: Minimum score to keep a detection.
        max_regions: Maximum number of regions returned.

    Returns:
        Regions in original-image coordinates, sorted by confidence
        descending. Regions smaller than 5 px in either dimension are dropped.
    """
    parsed = _split_outputs(outputs)
    if parsed is None:
        return []
    all_boxes, all_scores, all_classes = parsed

    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]

    if len(boxes) == 0:
        return []

    # --- NMS ---
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)

    # --- Scale boxes back to original image coordinates ---
    regions: List[LayoutRegion] = []
    for idx in keep_idxs:
        x1, y1, x2, y2 = boxes[idx]

        # Remove padding offset, undo resize
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale

        # Clamp to original dimensions. The max(0, min(...)) order also maps
        # NaN coordinates to 0 instead of propagating them.
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))

        # Round the corners, then derive the size from them, so that
        # x + width / y + height can never exceed the image bounds.
        left, top = int(round(x1)), int(round(y1))
        right, bottom = int(round(x2)), int(round(y2))
        w = right - left
        h = bottom - top

        if w < 5 or h < 5:
            continue

        cls_idx = int(classes[idx])
        if 0 <= cls_idx < len(DOCLAYOUT_CLASSES):
            label = DOCLAYOUT_CLASSES[cls_idx]
        else:
            label = f"class_{cls_idx}"

        regions.append(LayoutRegion(
            x=left,
            y=top,
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[idx]), 4),
            label_index=cls_idx,
        ))

    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]


# ---------------------------------------------------------------------------
# Main detection function
# ---------------------------------------------------------------------------

def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Detect document layout regions using the PP-DocLayout ONNX model.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        List of LayoutRegion sorted by confidence descending. Returns an
        empty list if the model is not available, the image is missing or
        empty, or inference fails.
    """
    session = _load_onnx_session()
    if session is None:
        return []

    if img_bgr is None or img_bgr.size == 0:
        return []
    if img_bgr.ndim not in (2, 3):
        logger.warning("PP-DocLayout: unsupported image shape %s", img_bgr.shape)
        return []

    orig_h, orig_w = img_bgr.shape[:2]

    # Pre-process
    input_tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)

    # Run inference
    try:
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: input_tensor})
    except Exception as exc:
        # Best-effort: the caller falls back to another detector on [].
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    # Post-process
    regions = _postprocess(
        outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=orig_w,
        orig_h=orig_h,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if regions:
        # Only build the per-label summary when it will actually be logged.
        if logger.isEnabledFor(logging.INFO):
            label_counts = Counter(r.label for r in regions)
            logger.info(
                "PP-DocLayout: %d regions (%s)",
                len(regions),
                ", ".join(f"{k}: {v}" for k, v in sorted(label_counts.items())),
            )
    else:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)

    return regions