""" PP-DocLayout ONNX Export Methods Download and Docker-based conversion methods for PP-DocLayout model. Extracted from export-doclayout-onnx.py. """ import hashlib import json import logging import shutil import subprocess import tempfile import urllib.request from pathlib import Path log = logging.getLogger("export-doclayout") # Known download sources for pre-exported ONNX models. DOWNLOAD_SOURCES = [ { "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)", "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx", "filename": "model.onnx", "sha256": None, }, { "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)", "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx", "filename": "model.onnx", "sha256": None, }, ] # Paddle inference model URLs (for Docker-based conversion). PADDLE_MODEL_URL = ( "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar" ) # Docker image name used for conversion. DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest" def sha256_file(path: Path) -> str: """Compute SHA-256 hex digest for a file.""" h = hashlib.sha256() with open(path, "rb") as f: for chunk in iter(lambda: f.read(1 << 20), b""): h.update(chunk) return h.hexdigest() def download_file(url: str, dest: Path, desc: str = "") -> bool: """Download a file with progress reporting. Returns True on success.""" label = desc or url.split("/")[-1] log.info("Downloading %s ...", label) log.info(" URL: %s", url) try: req = urllib.request.Request(url, headers={"User-Agent": "breakpilot-export/1.0"}) with urllib.request.urlopen(req, timeout=120) as resp: total = resp.headers.get("Content-Length") total = int(total) if total else None downloaded = 0 dest.parent.mkdir(parents=True, exist_ok=True) with open(dest, "wb") as f: while True: chunk = resp.read(1 << 18) # 256 KB if not chunk: break f.write(chunk) downloaded += len(chunk) if total: pct = downloaded * 100 / total mb = downloaded / (1 << 20) total_mb = total / (1 << 20) print( f"\r {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)", end="", flush=True, ) if total: print() # newline after progress size_mb = dest.stat().st_size / (1 << 20) log.info(" Downloaded %.1f MB -> %s", size_mb, dest) return True except Exception as exc: log.warning(" Download failed: %s", exc) if dest.exists(): dest.unlink() return False def try_download(output_dir: Path) -> bool: """Attempt to download a pre-exported ONNX model. Returns True on success.""" log.info("=== Method: DOWNLOAD ===") output_dir.mkdir(parents=True, exist_ok=True) model_path = output_dir / "model.onnx" for source in DOWNLOAD_SOURCES: log.info("Trying source: %s", source["name"]) tmp_path = output_dir / f".{source['filename']}.tmp" if not download_file(source["url"], tmp_path, desc=source["name"]): continue # Check SHA-256 if known. if source["sha256"]: actual_hash = sha256_file(tmp_path) if actual_hash != source["sha256"]: log.warning( " SHA-256 mismatch: expected %s, got %s", source["sha256"], actual_hash, ) tmp_path.unlink() continue # Basic sanity: file should be > 1 MB. size = tmp_path.stat().st_size if size < 1 << 20: log.warning(" File too small (%.1f KB) — probably not a valid model.", size / 1024) tmp_path.unlink() continue # Move into place. shutil.move(str(tmp_path), str(model_path)) log.info("Model saved to %s (%.1f MB)", model_path, model_path.stat().st_size / (1 << 20)) return True log.warning("All download sources failed.") return False DOCKERFILE_CONTENT = r""" FROM --platform=linux/amd64 python:3.11-slim RUN pip install --no-cache-dir \ paddlepaddle==3.0.0 \ paddle2onnx==1.3.1 \ onnx==1.17.0 \ requests WORKDIR /work # Download + extract the PP-DocLayout Paddle inference model. RUN python3 -c " import urllib.request, tarfile, os url = 'PADDLE_MODEL_URL_PLACEHOLDER' print(f'Downloading {url} ...') dest = '/work/pp_doclayout.tar' urllib.request.urlretrieve(url, dest) print('Extracting ...') with tarfile.open(dest) as t: t.extractall('/work/paddle_model') os.remove(dest) # List what we extracted for root, dirs, files in os.walk('/work/paddle_model'): for f in files: fp = os.path.join(root, f) sz = os.path.getsize(fp) print(f' {fp} ({sz} bytes)') " # Convert Paddle model to ONNX. RUN python3 -c " import os, glob, subprocess # Find the inference model files model_dir = '/work/paddle_model' pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True) pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True) if not pdmodel_files: raise FileNotFoundError('No .pdmodel file found in extracted archive') pdmodel = pdmodel_files[0] pdiparams = pdiparams_files[0] if pdiparams_files else None model_dir_actual = os.path.dirname(pdmodel) pdmodel_name = os.path.basename(pdmodel).replace('.pdmodel', '') print(f'Found model: {pdmodel}') print(f'Found params: {pdiparams}') print(f'Model dir: {model_dir_actual}') print(f'Model name prefix: {pdmodel_name}') cmd = [ 'paddle2onnx', '--model_dir', model_dir_actual, '--model_filename', os.path.basename(pdmodel), ] if pdiparams: cmd += ['--params_filename', os.path.basename(pdiparams)] cmd += [ '--save_file', '/work/output/model.onnx', '--opset_version', '14', '--enable_onnx_checker', 'True', ] os.makedirs('/work/output', exist_ok=True) print(f'Running: {\" \".join(cmd)}') subprocess.run(cmd, check=True) out_size = os.path.getsize('/work/output/model.onnx') print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)') " CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"] """.replace( "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL ) def try_docker(output_dir: Path) -> bool: """Build a Docker image to convert the Paddle model to ONNX. Returns True on success.""" log.info("=== Method: DOCKER (linux/amd64) ===") # Check Docker is available. docker_bin = shutil.which("docker") or "/usr/local/bin/docker" try: subprocess.run( [docker_bin, "version"], capture_output=True, check=True, timeout=15, ) except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as exc: log.error("Docker is not available: %s", exc) return False output_dir.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="doclayout-export-") as tmpdir: tmpdir = Path(tmpdir) # Write Dockerfile. dockerfile_path = tmpdir / "Dockerfile" dockerfile_path.write_text(DOCKERFILE_CONTENT) log.info("Wrote Dockerfile to %s", dockerfile_path) # Build image. log.info("Building Docker image (this downloads ~2 GB, may take a while) ...") build_cmd = [ docker_bin, "build", "--platform", "linux/amd64", "-t", DOCKER_IMAGE_TAG, "-f", str(dockerfile_path), str(tmpdir), ] log.info(" %s", " ".join(build_cmd)) build_result = subprocess.run( build_cmd, capture_output=False, timeout=1200, ) if build_result.returncode != 0: log.error("Docker build failed (exit code %d).", build_result.returncode) return False # Run container. log.info("Running conversion container ...") run_cmd = [ docker_bin, "run", "--rm", "--platform", "linux/amd64", "-v", f"{output_dir.resolve()}:/output", DOCKER_IMAGE_TAG, ] log.info(" %s", " ".join(run_cmd)) run_result = subprocess.run( run_cmd, capture_output=False, timeout=300, ) if run_result.returncode != 0: log.error("Docker run failed (exit code %d).", run_result.returncode) return False model_path = output_dir / "model.onnx" if model_path.exists(): size_mb = model_path.stat().st_size / (1 << 20) log.info("Model exported: %s (%.1f MB)", model_path, size_mb) return True else: log.error("Expected output file not found: %s", model_path) return False def write_metadata(output_dir: Path, method: str, class_labels: list, model_input_shape: tuple) -> None: """Write a metadata JSON next to the model for provenance tracking.""" model_path = output_dir / "model.onnx" if not model_path.exists(): return meta = { "model": "PP-DocLayout", "format": "ONNX", "export_method": method, "class_labels": class_labels, "input_shape": list(model_input_shape), "file_size_bytes": model_path.stat().st_size, "sha256": sha256_file(model_path), } meta_path = output_dir / "metadata.json" with open(meta_path, "w") as f: json.dump(meta, f, indent=2) log.info("Metadata written to %s", meta_path)