klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
233 lines
7.2 KiB
Python
Executable File
233 lines
7.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
PP-DocLayout ONNX Export — exports PP-DocLayout model to ONNX for document layout detection.
|
|
|
|
PP-DocLayout detects: table, figure, title, text, list regions on document pages.
|
|
Since PaddlePaddle doesn't work natively on ARM Mac, this script either:
|
|
1. Downloads a pre-exported ONNX model
|
|
2. Uses Docker (linux/amd64) for the conversion
|
|
|
|
Usage:
|
|
python scripts/export-doclayout-onnx.py
|
|
python scripts/export-doclayout-onnx.py --method docker
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Root logging config: timestamped, level-tagged messages for all script output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("export-doclayout")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# 10 PP-DocLayout class labels in standard order
# (presumably index in this list == class id emitted by the model —
# TODO confirm against the exporter's label map).
CLASS_LABELS = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]

# Expected input shape for the model (batch, channels, height, width).
# Used as a fallback when the exported model's input rank is unexpected.
MODEL_INPUT_SHAPE = (1, 3, 800, 800)

# Import methods from sibling module
# NOTE(review): kept mid-file rather than in the top import block; PEP 8
# would place it at the top — verify there is no import-time dependency on
# the logging/constant setup above before moving it.
from doclayout_export_methods import (
    try_download,
    try_docker,
    write_metadata,
    sha256_file,
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Verification
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def verify_onnx(model_path: Path) -> bool:
    """Load the ONNX model with onnxruntime, run a dummy inference, check outputs.

    Returns True when the model loads and produces at least one output tensor
    (a warning is logged but the check still passes when output shapes look
    unusual, since shapes differ between export variants). Returns False on
    missing dependencies, load failure, or inference failure.
    """
    log.info("Verifying ONNX model: %s", model_path)

    # Both deps are optional at install time, so fail softly with a hint.
    try:
        import numpy as np
    except ImportError:
        log.error("numpy is required for verification: pip install numpy")
        return False

    try:
        import onnxruntime as ort
    except ImportError:
        log.error("onnxruntime is required for verification: pip install onnxruntime")
        return False

    try:
        # Load the model
        opts = ort.SessionOptions()
        opts.log_severity_level = 3  # suppress verbose logs
        session = ort.InferenceSession(str(model_path), sess_options=opts)

        # Inspect inputs
        inputs = session.get_inputs()
        log.info(" Model inputs:")
        for inp in inputs:
            log.info(" %s: shape=%s dtype=%s", inp.name, inp.shape, inp.type)

        # Inspect outputs
        outputs = session.get_outputs()
        log.info(" Model outputs:")
        for out in outputs:
            log.info(" %s: shape=%s dtype=%s", out.name, out.shape, out.type)

        # Build dummy input
        input_name = inputs[0].name
        input_shape = inputs[0].shape

        # Replace dynamic dims (None or symbolic strings in ONNX) with
        # concrete sizes: batch=1, channels=3, spatial dims 800 (NCHW).
        concrete_shape = []
        for i, dim in enumerate(input_shape):
            if isinstance(dim, int) and dim > 0:
                concrete_shape.append(dim)
            elif i == 0:
                concrete_shape.append(1)    # batch
            elif i == 1:
                concrete_shape.append(3)    # channels
            else:
                concrete_shape.append(800)  # height / width
        concrete_shape = tuple(concrete_shape)

        # Unexpected rank: fall back to the documented default input shape.
        if len(concrete_shape) != 4:
            concrete_shape = MODEL_INPUT_SHAPE

        log.info(" Running dummy inference with shape %s ...", concrete_shape)
        dummy = np.random.randn(*concrete_shape).astype(np.float32)
        result = session.run(None, {input_name: dummy})

        log.info(" Inference succeeded — %d output tensors:", len(result))
        for i, r in enumerate(result):
            arr = np.asarray(r)
            log.info(" output[%d]: shape=%s dtype=%s", i, arr.shape, arr.dtype)

        if not result:
            log.error(" Model produced no outputs!")
            return False

        def _plausible(arr: "np.ndarray") -> bool:
            """True if *arr* looks like a layout-detection output.

            Either a rank>=2 tensor with a detection-like dim (>= 4, e.g.
            bbox coords — this also accepts zero-detection outputs such as
            shape (0, 6)), or simply any non-empty tensor of rank >= 1.
            """
            if arr.ndim >= 2 and any(d >= 4 for d in arr.shape):
                return True
            return arr.ndim >= 1 and arr.size > 0

        if any(_plausible(np.asarray(r)) for r in result):
            log.info(" Verification PASSED")
            return True

        log.warning(" Output shapes look unexpected, but model loaded OK.")
        log.warning(" Treating as PASSED (shapes may differ by export variant).")
        return True

    except Exception as exc:
        log.error(" Verification FAILED: %s", exc)
        return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: reuse or export the ONNX model, write metadata, verify.

    Returns a process exit status: 0 on success, 1 on export or verification
    failure.
    """
    ap = argparse.ArgumentParser(
        description="Export PP-DocLayout model to ONNX for document layout detection.",
    )
    ap.add_argument(
        "--output-dir",
        type=Path,
        default=Path("models/onnx/pp-doclayout"),
        help="Directory for the exported ONNX model (default: models/onnx/pp-doclayout/)",
    )
    ap.add_argument(
        "--method",
        choices=["auto", "download", "docker"],
        default="auto",
        help="Export method: auto (try download then docker), download, or docker.",
    )
    ap.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip ONNX model verification after export.",
    )
    args = ap.parse_args()

    output_dir: Path = args.output_dir
    model_path = output_dir / "model.onnx"

    # A model on disk short-circuits the export: verify it (unless skipped)
    # and exit without touching it.
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1 << 20)
        log.info("Model already exists: %s (%.1f MB)", model_path, size_mb)
        log.info("Delete it first if you want to re-export.")
        if not args.skip_verify and not verify_onnx(model_path):
            log.error("Existing model failed verification!")
            return 1
        return 0

    # Build the ordered list of export attempts implied by --method,
    # then stop at the first one that succeeds.
    attempts = []
    if args.method in ("auto", "download"):
        attempts.append(("download", try_download))
    if args.method in ("auto", "docker"):
        attempts.append(("docker", try_docker))

    used_method = None
    for method_name, attempt in attempts:
        if attempt(output_dir):
            used_method = method_name
            break

    if used_method is None:
        log.error("All export methods failed.")
        hints = {
            "download": "Hint: try --method docker to convert via Docker (linux/amd64).",
            "docker": "Hint: ensure Docker is running and has internet access.",
        }
        log.info(hints.get(args.method, "Hint: check your internet connection and Docker installation."))
        return 1

    # Write metadata.
    write_metadata(output_dir, used_method, CLASS_LABELS, MODEL_INPUT_SHAPE)

    # Verify.
    if args.skip_verify:
        log.info("Skipping verification (--skip-verify).")
    elif not verify_onnx(model_path):
        log.error("Exported model failed verification!")
        log.info("The file is kept at %s — inspect manually.", model_path)
        return 1

    log.info("Done. Model ready at %s", model_path)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|