Files
breakpilot-lehrer/scripts/export-doclayout-onnx.py
Benjamin Admin be7f5f1872 feat: Sprint 2 — TrOCR ONNX, PP-DocLayout, Model Management
D2: TrOCR ONNX export script (printed + handwritten, int8 quantization)
D3: PP-DocLayout ONNX export script (download or Docker-based conversion)
B3: Model Management admin page (PyTorch vs ONNX status, benchmarks, config)
A4: TrOCR ONNX service with runtime routing (auto/pytorch/onnx via TROCR_BACKEND)
A5: PP-DocLayout ONNX detection with OpenCV fallback (via GRAPHIC_DETECT_BACKEND)
B4: Structure Detection UI toggle (OpenCV vs PP-DocLayout) with class color coding
C3: TrOCR-ONNX.md documentation
C4: OCR-Pipeline.md ONNX section added
C5: mkdocs.yml nav updated, optimum added to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 09:53:02 +01:00

547 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
"""
PP-DocLayout ONNX Export — exports PP-DocLayout model to ONNX for document layout detection.
PP-DocLayout detects: table, figure, title, text, list regions on document pages.
Since PaddlePaddle doesn't work natively on ARM Mac, this script either:
1. Downloads a pre-exported ONNX model
2. Uses Docker (linux/amd64) for the conversion
Usage:
python scripts/export-doclayout-onnx.py
python scripts/export-doclayout-onnx.py --method docker
"""
import argparse
import hashlib
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import urllib.request
from pathlib import Path
# Configure root logging once at import time: concise timestamped lines
# (HH:MM:SS) for everything this script prints via `log`.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by every function below.
log = logging.getLogger("export-doclayout")
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# 10 PP-DocLayout class labels in standard order.  Written into metadata.json
# by write_metadata() so downstream consumers know the class index mapping.
CLASS_LABELS = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]
# Known download sources for pre-exported ONNX models.
# Ordered by preference — first successful download wins.
# NOTE(review): both entries currently point at the SAME HuggingFace URL, so
# the second "mirror" entry is only a retry of the first, not an independent
# fallback — substitute a genuine mirror URL when one is known.
DOWNLOAD_SOURCES = [
    {
        "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # populated once a known-good hash is available
    },
    {
        "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # no known-good hash for this source yet either
    },
]
# Paddle inference model URLs (for Docker-based conversion).
PADDLE_MODEL_URL = (
    "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar"
)
# Expected input shape for the model (batch, channels, height, width).
MODEL_INPUT_SHAPE = (1, 3, 800, 800)
# Docker image name used for conversion.
DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1 << 20)  # hash 1 MiB at a time
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Fetch *url* into *dest* with console progress. Returns True on success.

    On any failure the partially written file is removed and False is returned
    (the caller is expected to try the next source).
    """
    label = desc if desc else url.rsplit("/", 1)[-1]
    log.info("Downloading %s ...", label)
    log.info(" URL: %s", url)
    try:
        request = urllib.request.Request(url, headers={"User-Agent": "breakpilot-export/1.0"})
        with urllib.request.urlopen(request, timeout=120) as response:
            length_header = response.headers.get("Content-Length")
            total = int(length_header) if length_header else None
            received = 0
            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, "wb") as sink:
                # Stream in 256 KB chunks; iter() stops on the empty sentinel.
                for chunk in iter(lambda: response.read(1 << 18), b""):
                    sink.write(chunk)
                    received += len(chunk)
                    if total:
                        pct = received * 100 / total
                        mb = received / (1 << 20)
                        total_mb = total / (1 << 20)
                        print(
                            f"\r {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)",
                            end="",
                            flush=True,
                        )
            if total:
                print()  # newline after progress
        size_mb = dest.stat().st_size / (1 << 20)
        log.info(" Downloaded %.1f MB -> %s", size_mb, dest)
        return True
    except Exception as exc:
        # Best-effort by design: log, clean up the partial file, report failure.
        log.warning(" Download failed: %s", exc)
        if dest.exists():
            dest.unlink()
        return False
def verify_onnx(model_path: Path) -> bool:
    """Sanity-check an exported ONNX model.

    Loads the model with onnxruntime, runs one dummy forward pass with random
    float32 input, and logs the input/output signatures.  Deliberately lenient:
    as long as inference runs and produces at least one output, the model is
    accepted (export variants differ in output layout).  Returns False only
    when dependencies are missing, loading fails, or inference raises.
    """
    log.info("Verifying ONNX model: %s", model_path)
    try:
        import numpy as np
    except ImportError:
        log.error("numpy is required for verification: pip install numpy")
        return False
    try:
        import onnxruntime as ort
    except ImportError:
        log.error("onnxruntime is required for verification: pip install onnxruntime")
        return False
    try:
        opts = ort.SessionOptions()
        opts.log_severity_level = 3  # suppress verbose logs
        session = ort.InferenceSession(str(model_path), sess_options=opts)
        model_inputs = session.get_inputs()
        log.info(" Model inputs:")
        for spec in model_inputs:
            log.info(" %s: shape=%s dtype=%s", spec.name, spec.shape, spec.type)
        log.info(" Model outputs:")
        for spec in session.get_outputs():
            log.info(" %s: shape=%s dtype=%s", spec.name, spec.shape, spec.type)
        # Build a dummy tensor for the first input, replacing dynamic dims
        # (None or symbolic strings) with concrete sizes: batch=1, channels=3,
        # 800 for spatial axes.
        primary = model_inputs[0]
        concrete = []
        for axis, dim in enumerate(primary.shape):
            if isinstance(dim, int) and dim > 0:
                concrete.append(dim)
            else:
                concrete.append({0: 1, 1: 3}.get(axis, 800))
        # Fall back to the standard shape if the rank looks wrong.
        shape = tuple(concrete) if len(concrete) == 4 else MODEL_INPUT_SHAPE
        log.info(" Running dummy inference with shape %s ...", shape)
        dummy = np.random.randn(*shape).astype(np.float32)
        result = session.run(None, {primary.name: dummy})
        log.info(" Inference succeeded — %d output tensors:", len(result))
        for idx, tensor in enumerate(result):
            arr = np.asarray(tensor)
            log.info(" output[%d]: shape=%s dtype=%s", idx, arr.shape, arr.dtype)
        if not result:
            log.error(" Model produced no outputs!")
            return False
        # Accept any non-empty output, or anything 2-D+ with a dim >= 4
        # (typical detection layouts: (1, N, 6), (N, 4), (N, 6), ...).
        plausible = False
        for tensor in result:
            arr = np.asarray(tensor)
            if arr.ndim >= 2 and any(d >= 4 for d in arr.shape):
                plausible = True
            if arr.ndim >= 1 and arr.size > 0:
                plausible = True
        if plausible:
            log.info(" Verification PASSED")
        else:
            log.warning(" Output shapes look unexpected, but model loaded OK.")
            log.warning(" Treating as PASSED (shapes may differ by export variant).")
        return True
    except Exception as exc:
        log.error(" Verification FAILED: %s", exc)
        return False
# ---------------------------------------------------------------------------
# Method: Download
# ---------------------------------------------------------------------------
def try_download(output_dir: Path) -> bool:
    """Walk DOWNLOAD_SOURCES in order; keep the first usable model.

    Each candidate is fetched to a hidden .tmp file, hash-checked when a
    known-good SHA-256 exists, size-checked, and only then moved into place
    as model.onnx.  Returns True as soon as one source succeeds.
    """
    log.info("=== Method: DOWNLOAD ===")
    output_dir.mkdir(parents=True, exist_ok=True)
    final_path = output_dir / "model.onnx"
    for src in DOWNLOAD_SOURCES:
        log.info("Trying source: %s", src["name"])
        staging = output_dir / f".{src['filename']}.tmp"
        if not download_file(src["url"], staging, desc=src["name"]):
            continue
        # Verify integrity when a reference hash is recorded for this source.
        expected = src["sha256"]
        if expected:
            actual = sha256_file(staging)
            if actual != expected:
                log.warning(
                    " SHA-256 mismatch: expected %s, got %s",
                    expected,
                    actual,
                )
                staging.unlink()
                continue
        # Anything under 1 MB is almost certainly an error page, not a model.
        nbytes = staging.stat().st_size
        if nbytes < 1 << 20:
            log.warning(" File too small (%.1f KB) — probably not a valid model.", nbytes / 1024)
            staging.unlink()
            continue
        shutil.move(str(staging), str(final_path))
        log.info("Model saved to %s (%.1f MB)", final_path, final_path.stat().st_size / (1 << 20))
        return True
    log.warning("All download sources failed.")
    return False
# ---------------------------------------------------------------------------
# Method: Docker
# ---------------------------------------------------------------------------
# Dockerfile for the Docker-based conversion path.
#
# Bug fix: the original embedded multi-line Python via `RUN python3 -c "`
# spanning many raw lines — invalid Dockerfile syntax, since every newline
# terminates a RUN instruction unless escaped.  The scripts now use BuildKit
# here-documents (`RUN python3 <<'EOF'`), enabled by the `# syntax=` directive
# on the first line; this also removes the fragile `\"` escaping inside the
# final print.  PADDLE_MODEL_URL_PLACEHOLDER is substituted below.
DOCKERFILE_CONTENT = r"""
# syntax=docker/dockerfile:1
FROM --platform=linux/amd64 python:3.11-slim
RUN pip install --no-cache-dir \
    paddlepaddle==3.0.0 \
    paddle2onnx==1.3.1 \
    onnx==1.17.0 \
    requests
WORKDIR /work
# Download + extract the PP-DocLayout Paddle inference model.
RUN python3 <<'EOF'
import urllib.request, tarfile, os
url = 'PADDLE_MODEL_URL_PLACEHOLDER'
print(f'Downloading {url} ...')
dest = '/work/pp_doclayout.tar'
urllib.request.urlretrieve(url, dest)
print('Extracting ...')
with tarfile.open(dest) as t:
    t.extractall('/work/paddle_model')
os.remove(dest)
# List what we extracted
for root, dirs, files in os.walk('/work/paddle_model'):
    for f in files:
        fp = os.path.join(root, f)
        sz = os.path.getsize(fp)
        print(f' {fp} ({sz} bytes)')
EOF
# Convert Paddle model to ONNX.
# paddle2onnx expects model_dir with model.pdmodel + model.pdiparams
RUN python3 <<'EOF'
import os, glob, subprocess
# Find the inference model files
model_dir = '/work/paddle_model'
pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True)
pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True)
if not pdmodel_files:
    raise FileNotFoundError('No .pdmodel file found in extracted archive')
pdmodel = pdmodel_files[0]
pdiparams = pdiparams_files[0] if pdiparams_files else None
model_dir_actual = os.path.dirname(pdmodel)
pdmodel_name = os.path.basename(pdmodel).replace('.pdmodel', '')
print(f'Found model: {pdmodel}')
print(f'Found params: {pdiparams}')
print(f'Model dir: {model_dir_actual}')
print(f'Model name prefix: {pdmodel_name}')
cmd = [
    'paddle2onnx',
    '--model_dir', model_dir_actual,
    '--model_filename', os.path.basename(pdmodel),
]
if pdiparams:
    cmd += ['--params_filename', os.path.basename(pdiparams)]
cmd += [
    '--save_file', '/work/output/model.onnx',
    '--opset_version', '14',
    '--enable_onnx_checker', 'True',
]
os.makedirs('/work/output', exist_ok=True)
print('Running: ' + ' '.join(cmd))
subprocess.run(cmd, check=True)
out_size = os.path.getsize('/work/output/model.onnx')
print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)')
EOF
CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"]
""".replace(
    "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL
)
def try_docker(output_dir: Path) -> bool:
    """Convert the Paddle model to ONNX inside a linux/amd64 Docker container.

    Builds a throwaway image (see DOCKERFILE_CONTENT) that downloads the
    Paddle inference model and runs paddle2onnx, then runs the image with
    *output_dir* mounted as /output so its CMD can copy model.onnx out.

    Fix: subprocess timeouts on the build/run steps are now caught — before,
    a ``subprocess.TimeoutExpired`` escaped as an uncaught traceback instead
    of returning False like every other failure path.

    Returns True when output_dir/model.onnx exists afterwards, else False.
    """
    log.info("=== Method: DOCKER (linux/amd64) ===")
    # Check Docker is available (hard-coded path as a last-resort fallback).
    docker_bin = shutil.which("docker") or "/usr/local/bin/docker"
    try:
        subprocess.run(
            [docker_bin, "version"],
            capture_output=True,
            check=True,
            timeout=15,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
        log.error("Docker is not available: %s", exc)
        return False
    output_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="doclayout-export-") as tmp:
        build_ctx = Path(tmp)
        # Write Dockerfile into the temporary build context.
        dockerfile_path = build_ctx / "Dockerfile"
        dockerfile_path.write_text(DOCKERFILE_CONTENT)
        log.info("Wrote Dockerfile to %s", dockerfile_path)
        # Build image (output streamed to the terminal).
        log.info("Building Docker image (this downloads ~2 GB, may take a while) ...")
        build_cmd = [
            docker_bin, "build",
            "--platform", "linux/amd64",
            "-t", DOCKER_IMAGE_TAG,
            "-f", str(dockerfile_path),
            str(build_ctx),
        ]
        log.info(" %s", " ".join(build_cmd))
        try:
            build_result = subprocess.run(
                build_cmd,
                capture_output=False,  # stream output to terminal
                timeout=1200,  # 20 min
            )
        except subprocess.TimeoutExpired:
            log.error("Docker build timed out after 1200 s.")
            return False
        if build_result.returncode != 0:
            log.error("Docker build failed (exit code %d).", build_result.returncode)
            return False
        # Run container — mount output_dir as /output; the CMD copies model.onnx there.
        log.info("Running conversion container ...")
        run_cmd = [
            docker_bin, "run",
            "--rm",
            "--platform", "linux/amd64",
            "-v", f"{output_dir.resolve()}:/output",
            DOCKER_IMAGE_TAG,
        ]
        log.info(" %s", " ".join(run_cmd))
        try:
            run_result = subprocess.run(
                run_cmd,
                capture_output=False,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            log.error("Docker run timed out after 300 s.")
            return False
        if run_result.returncode != 0:
            log.error("Docker run failed (exit code %d).", run_result.returncode)
            return False
    model_path = output_dir / "model.onnx"
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1 << 20)
        log.info("Model exported: %s (%.1f MB)", model_path, size_mb)
        return True
    log.error("Expected output file not found: %s", model_path)
    return False
# ---------------------------------------------------------------------------
# Write metadata
# ---------------------------------------------------------------------------
def write_metadata(output_dir: Path, method: str) -> None:
    """Write metadata.json next to model.onnx for provenance tracking.

    Records the export method, class-index labels, expected input shape,
    file size and SHA-256 of the model.  Silently no-ops when the model
    file does not exist (nothing to describe).

    Fix: the JSON file is now written with an explicit UTF-8 encoding —
    the bare ``open(..., "w")`` used the platform locale encoding, which
    can differ across machines.
    """
    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        return
    meta = {
        "model": "PP-DocLayout",
        "format": "ONNX",
        "export_method": method,
        "class_labels": CLASS_LABELS,
        "input_shape": list(MODEL_INPUT_SHAPE),
        "file_size_bytes": model_path.stat().st_size,
        "sha256": sha256_file(model_path),
    }
    meta_path = output_dir / "metadata.json"
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    log.info("Metadata written to %s", meta_path)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: export PP-DocLayout to ONNX, then verify the result."""
    parser = argparse.ArgumentParser(
        description="Export PP-DocLayout model to ONNX for document layout detection.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("models/onnx/pp-doclayout"),
        help="Directory for the exported ONNX model (default: models/onnx/pp-doclayout/)",
    )
    parser.add_argument(
        "--method",
        choices=["auto", "download", "docker"],
        default="auto",
        help="Export method: auto (try download then docker), download, or docker.",
    )
    parser.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip ONNX model verification after export.",
    )
    args = parser.parse_args()
    output_dir: Path = args.output_dir
    model_path = output_dir / "model.onnx"

    # Fast path: a previously exported model is already in place.
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1 << 20)
        log.info("Model already exists: %s (%.1f MB)", model_path, size_mb)
        log.info("Delete it first if you want to re-export.")
        if not args.skip_verify and not verify_onnx(model_path):
            log.error("Existing model failed verification!")
            return 1
        return 0

    # Assemble the export strategies implied by --method; first success wins.
    strategies = []
    if args.method in ("auto", "download"):
        strategies.append(("download", try_download))
    if args.method in ("auto", "docker"):
        strategies.append(("docker", try_docker))
    used_method = None
    for name, attempt in strategies:
        if attempt(output_dir):
            used_method = name
            break

    if used_method is None:
        log.error("All export methods failed.")
        hints = {
            "download": "Hint: try --method docker to convert via Docker (linux/amd64).",
            "docker": "Hint: ensure Docker is running and has internet access.",
        }
        log.info(hints.get(args.method, "Hint: check your internet connection and Docker installation."))
        return 1

    # Record provenance next to the exported model.
    write_metadata(output_dir, used_method)

    # Verify the fresh export unless explicitly skipped.
    if args.skip_verify:
        log.info("Skipping verification (--skip-verify).")
    elif not verify_onnx(model_path):
        log.error("Exported model failed verification!")
        log.info("The file is kept at %s — inspect manually.", model_path)
        return 1
    log.info("Done. Model ready at %s", model_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())