#!/usr/bin/env python3
"""PP-DocLayout ONNX Export.

Exports the PP-DocLayout model to ONNX for document layout detection.
PP-DocLayout detects: table, figure, title, text, list regions on document
pages.

Since PaddlePaddle doesn't work natively on ARM Mac, this script either:
1. Downloads a pre-exported ONNX model
2. Uses Docker (linux/amd64) for the conversion

Usage:
    python scripts/export-doclayout-onnx.py
    python scripts/export-doclayout-onnx.py --method docker
"""

import argparse
import hashlib
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import urllib.request
from pathlib import Path

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("export-doclayout")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# 10 PP-DocLayout class labels in standard order.
CLASS_LABELS = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]

# Known download sources for pre-exported ONNX models.
# Ordered by preference — the first successful download wins.
# NOTE(review): both entries currently point at the *same* URL, so the second
# acts only as a retry, not a true mirror — replace with a real mirror URL
# when one is known.
DOWNLOAD_SOURCES = [
    {
        "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # populated once a known-good hash is available
    },
    {
        "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,
    },
]

# Paddle inference model URL (for Docker-based conversion).
PADDLE_MODEL_URL = (
    "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar"
)

# Expected input shape for the model (batch, channels, height, width).
MODEL_INPUT_SHAPE = (1, 3, 800, 800)  # (batch, channels, height, width)

# Docker image name used for conversion.
DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def sha256_file(path: Path) -> str:
    """Compute the SHA-256 hex digest of *path*, reading in 1 MB chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Download *url* to *dest* with console progress reporting.

    Returns True on success. On any failure, logs a warning, deletes any
    partial file at *dest*, and returns False (callers try the next source).
    """
    label = desc or url.split("/")[-1]
    log.info("Downloading %s ...", label)
    log.info("  URL: %s", url)
    try:
        req = urllib.request.Request(
            url, headers={"User-Agent": "breakpilot-export/1.0"}
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            total = resp.headers.get("Content-Length")
            total = int(total) if total else None
            downloaded = 0
            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, "wb") as f:
                while True:
                    chunk = resp.read(1 << 18)  # 256 KB
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        pct = downloaded * 100 / total
                        mb = downloaded / (1 << 20)
                        total_mb = total / (1 << 20)
                        print(
                            f"\r  {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)",
                            end="",
                            flush=True,
                        )
        if total:
            print()  # newline after the \r progress line
        size_mb = dest.stat().st_size / (1 << 20)
        log.info("  Downloaded %.1f MB -> %s", size_mb, dest)
        return True
    except Exception as exc:
        log.warning("  Download failed: %s", exc)
        if dest.exists():
            dest.unlink()
        return False


def verify_onnx(model_path: Path) -> bool:
    """Load the ONNX model with onnxruntime, run a dummy inference, check outputs.

    Returns False when numpy/onnxruntime are missing or inference raises;
    otherwise True as long as the model produces at least one output tensor
    (the shape check is deliberately lenient — ONNX export variants differ).
    """
    log.info("Verifying ONNX model: %s", model_path)
    try:
        import numpy as np
    except ImportError:
        log.error("numpy is required for verification: pip install numpy")
        return False
    try:
        import onnxruntime as ort
    except ImportError:
        log.error("onnxruntime is required for verification: pip install onnxruntime")
        return False

    try:
        # Load the model with verbose logging suppressed.
        opts = ort.SessionOptions()
        opts.log_severity_level = 3  # suppress verbose logs
        session = ort.InferenceSession(str(model_path), sess_options=opts)

        # Inspect inputs.
        inputs = session.get_inputs()
        log.info("  Model inputs:")
        for inp in inputs:
            log.info("    %s: shape=%s dtype=%s", inp.name, inp.shape, inp.type)

        # Inspect outputs.
        outputs = session.get_outputs()
        log.info("  Model outputs:")
        for out in outputs:
            log.info("    %s: shape=%s dtype=%s", out.name, out.shape, out.type)

        # Build dummy input — use the first input's name and expected shape.
        input_name = inputs[0].name
        input_shape = inputs[0].shape

        # Replace dynamic dims (strings or None) with concrete sizes.
        concrete_shape = []
        for i, dim in enumerate(input_shape):
            if isinstance(dim, int) and dim > 0:
                concrete_shape.append(dim)
            elif i == 0:
                concrete_shape.append(1)  # batch
            elif i == 1:
                concrete_shape.append(3)  # channels
            else:
                concrete_shape.append(800)  # spatial
        concrete_shape = tuple(concrete_shape)

        # Fallback if shape looks wrong — use standard MODEL_INPUT_SHAPE.
        if len(concrete_shape) != 4:
            concrete_shape = MODEL_INPUT_SHAPE

        log.info("  Running dummy inference with shape %s ...", concrete_shape)
        dummy = np.random.randn(*concrete_shape).astype(np.float32)
        result = session.run(None, {input_name: dummy})

        log.info("  Inference succeeded — %d output tensors:", len(result))
        for i, r in enumerate(result):
            arr = np.asarray(r)
            log.info("    output[%d]: shape=%s dtype=%s", i, arr.shape, arr.dtype)

        # Basic sanity check: the model must produce *something*.
        if len(result) == 0:
            log.error("  Model produced no outputs!")
            return False

        # Look for at least one output with a bounding-box-like shape (N, 4)
        # or any non-empty tensor. Be lenient — different exports vary; a
        # non-matching shape only downgrades the log message, never fails.
        has_plausible_output = False
        for r in result:
            arr = np.asarray(r)
            # Common detection output shapes: (1, N, 6), (N, 4), (N, 6),
            # (1, N, 5+C), etc.
            if arr.ndim >= 2 and any(d >= 4 for d in arr.shape):
                has_plausible_output = True
            # Some models output (N,) labels or scores.
            if arr.ndim >= 1 and arr.size > 0:
                has_plausible_output = True

        if has_plausible_output:
            log.info("  Verification PASSED")
        else:
            log.warning("  Output shapes look unexpected, but model loaded OK.")
            log.warning("  Treating as PASSED (shapes may differ by export variant).")
        return True
    except Exception as exc:
        log.error("  Verification FAILED: %s", exc)
        return False


# ---------------------------------------------------------------------------
# Method: Download
# ---------------------------------------------------------------------------


def try_download(output_dir: Path) -> bool:
    """Attempt to download a pre-exported ONNX model. Returns True on success.

    Tries each entry in DOWNLOAD_SOURCES in order; verifies the SHA-256 when
    one is known, and rejects implausibly small files (likely HTML error
    pages), before moving the file into place as ``model.onnx``.
    """
    log.info("=== Method: DOWNLOAD ===")
    output_dir.mkdir(parents=True, exist_ok=True)
    model_path = output_dir / "model.onnx"

    for source in DOWNLOAD_SOURCES:
        log.info("Trying source: %s", source["name"])
        tmp_path = output_dir / f".{source['filename']}.tmp"
        if not download_file(source["url"], tmp_path, desc=source["name"]):
            continue

        # Check SHA-256 if known.
        if source["sha256"]:
            actual_hash = sha256_file(tmp_path)
            if actual_hash != source["sha256"]:
                log.warning(
                    "  SHA-256 mismatch: expected %s, got %s",
                    source["sha256"],
                    actual_hash,
                )
                tmp_path.unlink()
                continue

        # Basic sanity: file should be > 1 MB (a real ONNX model, not an
        # error page).
        size = tmp_path.stat().st_size
        if size < 1 << 20:
            log.warning(
                "  File too small (%.1f KB) — probably not a valid model.",
                size / 1024,
            )
            tmp_path.unlink()
            continue

        # Move into place.
        shutil.move(str(tmp_path), str(model_path))
        log.info(
            "Model saved to %s (%.1f MB)",
            model_path,
            model_path.stat().st_size / (1 << 20),
        )
        return True

    log.warning("All download sources failed.")
    return False


# ---------------------------------------------------------------------------
# Method: Docker
# ---------------------------------------------------------------------------

# Dockerfile template for the linux/amd64 conversion image.
#
# FIX: the previous version used multi-line `RUN python3 -c "…"` instructions.
# A Dockerfile RUN instruction does not continue across unescaped newlines, so
# that Dockerfile failed to parse. BuildKit heredocs (`RUN python3 <<'PY'`)
# feed the script to python3 via stdin with no shell escaping needed — this
# also removes the fragile nested-quote escapes the -c form required.
# Requires BuildKit (try_docker sets DOCKER_BUILDKIT=1).
#
# PADDLE_MODEL_URL_PLACEHOLDER is substituted at build time in try_docker().
DOCKERFILE_CONTENT = r"""# syntax=docker/dockerfile:1
FROM --platform=linux/amd64 python:3.11-slim

RUN pip install --no-cache-dir \
    paddlepaddle==3.0.0 \
    paddle2onnx==1.3.1 \
    onnx==1.17.0 \
    requests

WORKDIR /work

# Download + extract the PP-DocLayout Paddle inference model.
RUN python3 <<'PY'
import urllib.request, tarfile, os

url = 'PADDLE_MODEL_URL_PLACEHOLDER'
print(f'Downloading {url} ...')
dest = '/work/pp_doclayout.tar'
urllib.request.urlretrieve(url, dest)
print('Extracting ...')
with tarfile.open(dest) as t:
    t.extractall('/work/paddle_model')
os.remove(dest)

# List what we extracted
for root, dirs, files in os.walk('/work/paddle_model'):
    for f in files:
        fp = os.path.join(root, f)
        sz = os.path.getsize(fp)
        print(f'  {fp} ({sz} bytes)')
PY

# Convert Paddle model to ONNX.
# paddle2onnx expects model_dir with model.pdmodel + model.pdiparams
RUN python3 <<'PY'
import os, glob, subprocess

# Find the inference model files
model_dir = '/work/paddle_model'
pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True)
pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True)
if not pdmodel_files:
    raise FileNotFoundError('No .pdmodel file found in extracted archive')

pdmodel = pdmodel_files[0]
pdiparams = pdiparams_files[0] if pdiparams_files else None
model_dir_actual = os.path.dirname(pdmodel)
pdmodel_name = os.path.basename(pdmodel).replace('.pdmodel', '')
print(f'Found model: {pdmodel}')
print(f'Found params: {pdiparams}')
print(f'Model dir: {model_dir_actual}')
print(f'Model name prefix: {pdmodel_name}')

cmd = [
    'paddle2onnx',
    '--model_dir', model_dir_actual,
    '--model_filename', os.path.basename(pdmodel),
]
if pdiparams:
    cmd += ['--params_filename', os.path.basename(pdiparams)]
cmd += [
    '--save_file', '/work/output/model.onnx',
    '--opset_version', '14',
    '--enable_onnx_checker', 'True',
]
os.makedirs('/work/output', exist_ok=True)
print('Running: ' + ' '.join(cmd))
subprocess.run(cmd, check=True)
out_size = os.path.getsize('/work/output/model.onnx')
print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)')
PY

CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"]
"""


def try_docker(output_dir: Path) -> bool:
    """Build a Docker image to convert the Paddle model to ONNX.

    Returns True on success. The built image's CMD copies the converted
    model into *output_dir*, which is bind-mounted as /output.
    """
    log.info("=== Method: DOCKER (linux/amd64) ===")

    # Check Docker is available.
    docker_bin = shutil.which("docker") or "/usr/local/bin/docker"
    try:
        subprocess.run(
            [docker_bin, "version"],
            capture_output=True,
            check=True,
            timeout=15,
        )
    except (
        subprocess.CalledProcessError,
        FileNotFoundError,
        subprocess.TimeoutExpired,
    ) as exc:
        log.error("Docker is not available: %s", exc)
        return False

    output_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(prefix="doclayout-export-") as tmpdir:
        build_ctx = Path(tmpdir)

        # Write the Dockerfile, substituting the model URL into the template.
        dockerfile_path = build_ctx / "Dockerfile"
        dockerfile_path.write_text(
            DOCKERFILE_CONTENT.replace(
                "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL
            )
        )
        log.info("Wrote Dockerfile to %s", dockerfile_path)

        # The heredocs in the Dockerfile need BuildKit.
        env = {**os.environ, "DOCKER_BUILDKIT": "1"}

        # Build image.
        log.info("Building Docker image (this downloads ~2 GB, may take a while) ...")
        build_cmd = [
            docker_bin,
            "build",
            "--platform", "linux/amd64",
            "-t", DOCKER_IMAGE_TAG,
            "-f", str(dockerfile_path),
            str(build_ctx),
        ]
        log.info("  %s", " ".join(build_cmd))
        build_result = subprocess.run(
            build_cmd,
            capture_output=False,  # stream output to terminal
            timeout=1200,  # 20 min
            env=env,
        )
        if build_result.returncode != 0:
            log.error("Docker build failed (exit code %d).", build_result.returncode)
            return False

        # Run container — mount output_dir as /output; the CMD copies
        # model.onnx there.
        log.info("Running conversion container ...")
        run_cmd = [
            docker_bin,
            "run",
            "--rm",
            "--platform", "linux/amd64",
            "-v", f"{output_dir.resolve()}:/output",
            DOCKER_IMAGE_TAG,
        ]
        log.info("  %s", " ".join(run_cmd))
        run_result = subprocess.run(
            run_cmd,
            capture_output=False,
            timeout=300,
        )
        if run_result.returncode != 0:
            log.error("Docker run failed (exit code %d).", run_result.returncode)
            return False

    model_path = output_dir / "model.onnx"
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1 << 20)
        log.info("Model exported: %s (%.1f MB)", model_path, size_mb)
        return True
    else:
        log.error("Expected output file not found: %s", model_path)
        return False


# ---------------------------------------------------------------------------
# Write metadata
# ---------------------------------------------------------------------------


def write_metadata(output_dir: Path, method: str) -> None:
    """Write a metadata JSON next to the model for provenance tracking.

    Silently does nothing when model.onnx is absent (export failed upstream).
    """
    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        return
    meta = {
        "model": "PP-DocLayout",
        "format": "ONNX",
        "export_method": method,
        "class_labels": CLASS_LABELS,
        "input_shape": list(MODEL_INPUT_SHAPE),
        "file_size_bytes": model_path.stat().st_size,
        "sha256": sha256_file(model_path),
    }
    meta_path = output_dir / "metadata.json"
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)
    log.info("Metadata written to %s", meta_path)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main() -> int:
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(
        description="Export PP-DocLayout model to ONNX for document layout detection.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("models/onnx/pp-doclayout"),
        help="Directory for the exported ONNX model (default: models/onnx/pp-doclayout/)",
    )
    parser.add_argument(
        "--method",
        choices=["auto", "download", "docker"],
        default="auto",
        help="Export method: auto (try download then docker), download, or docker.",
    )
    parser.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip ONNX model verification after export.",
    )
    args = parser.parse_args()

    output_dir: Path = args.output_dir
    model_path = output_dir / "model.onnx"

    # Check if model already exists — never silently overwrite.
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1 << 20)
        log.info("Model already exists: %s (%.1f MB)", model_path, size_mb)
        log.info("Delete it first if you want to re-export.")
        if not args.skip_verify:
            if not verify_onnx(model_path):
                log.error("Existing model failed verification!")
                return 1
        return 0

    # Try the requested method(s); "auto" falls back to docker on failure.
    success = False
    used_method = None
    if args.method in ("auto", "download"):
        success = try_download(output_dir)
        if success:
            used_method = "download"
    if not success and args.method in ("auto", "docker"):
        success = try_docker(output_dir)
        if success:
            used_method = "docker"

    if not success:
        log.error("All export methods failed.")
        if args.method == "download":
            log.info("Hint: try --method docker to convert via Docker (linux/amd64).")
        elif args.method == "docker":
            log.info("Hint: ensure Docker is running and has internet access.")
        else:
            log.info("Hint: check your internet connection and Docker installation.")
        return 1

    # Write metadata.
    write_metadata(output_dir, used_method)

    # Verify.
    if not args.skip_verify:
        if not verify_onnx(model_path):
            log.error("Exported model failed verification!")
            log.info("The file is kept at %s — inspect manually.", model_path)
            return 1
    else:
        log.info("Skipping verification (--skip-verify).")

    log.info("Done. Model ready at %s", model_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())