Files
breakpilot-lehrer/scripts/export-doclayout-onnx.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

233 lines
7.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
PP-DocLayout ONNX Export — exports PP-DocLayout model to ONNX for document layout detection.
PP-DocLayout detects: table, figure, title, text, list regions on document pages.
Since PaddlePaddle doesn't work natively on ARM Mac, this script either:
1. Downloads a pre-exported ONNX model
2. Uses Docker (linux/amd64) for the conversion
Usage:
python scripts/export-doclayout-onnx.py
python scripts/export-doclayout-onnx.py --method docker
"""
import argparse
import logging
import sys
from pathlib import Path

# Export-method helpers live in a sibling module next to this script.
# (Moved up here from below the constants: PEP 8 keeps all imports at the
# top of the file; nothing above the original position depended on them.)
from doclayout_export_methods import (
    try_download,
    try_docker,
    write_metadata,
    sha256_file,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("export-doclayout")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# The 10 PP-DocLayout class labels, in the model's standard output order.
CLASS_LABELS = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]

# Expected input shape for the model (batch, channels, height, width).
MODEL_INPUT_SHAPE = (1, 3, 800, 800)
# ---------------------------------------------------------------------------
# Verification
# ---------------------------------------------------------------------------
def verify_onnx(model_path: Path) -> bool:
    """Smoke-test an exported ONNX model.

    Loads *model_path* with onnxruntime, logs the model's input/output
    signatures, runs one inference on random float32 data, and checks that
    at least one output tensor looks plausible.

    Returns:
        True when the model loads and inference produces output;
        False when numpy/onnxruntime is missing, loading fails, the model
        yields no outputs, or inference raises.
    """
    log.info("Verifying ONNX model: %s", model_path)
    try:
        import numpy as np
    except ImportError:
        log.error("numpy is required for verification: pip install numpy")
        return False
    try:
        import onnxruntime as ort
    except ImportError:
        log.error("onnxruntime is required for verification: pip install onnxruntime")
        return False
    try:
        # Load the model with ORT chatter suppressed.
        opts = ort.SessionOptions()
        opts.log_severity_level = 3  # suppress verbose logs
        session = ort.InferenceSession(str(model_path), sess_options=opts)

        # Log the declared input/output signatures for the operator's benefit.
        inputs = session.get_inputs()
        log.info(" Model inputs:")
        for inp in inputs:
            log.info(" %s: shape=%s dtype=%s", inp.name, inp.shape, inp.type)
        outputs = session.get_outputs()
        log.info(" Model outputs:")
        for out in outputs:
            log.info(" %s: shape=%s dtype=%s", out.name, out.shape, out.type)

        # Build a dummy input, resolving any dynamic dims to concrete sizes.
        input_name = inputs[0].name
        concrete_shape = _concrete_input_shape(inputs[0].shape)
        log.info(" Running dummy inference with shape %s ...", concrete_shape)
        dummy = np.random.randn(*concrete_shape).astype(np.float32)
        result = session.run(None, {input_name: dummy})

        log.info(" Inference succeeded — %d output tensors:", len(result))
        for i, r in enumerate(result):
            arr = np.asarray(r)
            log.info(" output[%d]: shape=%s dtype=%s", i, arr.shape, arr.dtype)
        if len(result) == 0:
            log.error(" Model produced no outputs!")
            return False

        # An output is plausible when it is a >=2-D tensor with some axis of
        # size >= 4 (e.g. bbox coordinates) OR simply a non-empty tensor.
        # Both conditions are kept: a shape like (0, 5) satisfies only the
        # first, a shape like (3,) only the second.
        has_plausible_output = any(
            (arr.ndim >= 2 and any(d >= 4 for d in arr.shape))
            or (arr.ndim >= 1 and arr.size > 0)
            for arr in map(np.asarray, result)
        )
        if has_plausible_output:
            log.info(" Verification PASSED")
            return True
        log.warning(" Output shapes look unexpected, but model loaded OK.")
        log.warning(" Treating as PASSED (shapes may differ by export variant).")
        return True
    except Exception as exc:
        log.error(" Verification FAILED: %s", exc)
        return False


def _concrete_input_shape(input_shape) -> tuple:
    """Replace dynamic/unknown dims in an ORT input shape with concrete sizes.

    Dynamic dims come back from onnxruntime as strings or None; substitute
    NCHW defaults (batch=1, channels=3, spatial=800). Falls back to
    MODEL_INPUT_SHAPE if the resolved shape is not 4-D.
    """
    concrete = []
    for i, dim in enumerate(input_shape):
        if isinstance(dim, int) and dim > 0:
            concrete.append(dim)  # static dim declared by the model
        elif i == 0:
            concrete.append(1)    # batch
        elif i == 1:
            concrete.append(3)    # channels
        else:
            concrete.append(800)  # spatial (H/W)
    if len(concrete) != 4:
        return MODEL_INPUT_SHAPE
    return tuple(concrete)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: export (or reuse) the model, then optionally verify.

    Returns a process exit code: 0 on success, 1 on export or
    verification failure.
    """
    parser = argparse.ArgumentParser(
        description="Export PP-DocLayout model to ONNX for document layout detection.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("models/onnx/pp-doclayout"),
        help="Directory for the exported ONNX model (default: models/onnx/pp-doclayout/)",
    )
    parser.add_argument(
        "--method",
        choices=["auto", "download", "docker"],
        default="auto",
        help="Export method: auto (try download then docker), download, or docker.",
    )
    parser.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip ONNX model verification after export.",
    )
    args = parser.parse_args()

    out_dir: Path = args.output_dir
    onnx_path = out_dir / "model.onnx"

    # Fast path: never overwrite a previously exported model.
    if onnx_path.exists():
        size_mb = onnx_path.stat().st_size / (1 << 20)
        log.info("Model already exists: %s (%.1f MB)", onnx_path, size_mb)
        log.info("Delete it first if you want to re-export.")
        if not args.skip_verify and not verify_onnx(onnx_path):
            log.error("Existing model failed verification!")
            return 1
        return 0

    # Try the export methods in order; remember which one succeeded.
    used_method = None
    if args.method in ("auto", "download") and try_download(out_dir):
        used_method = "download"
    if used_method is None and args.method in ("auto", "docker") and try_docker(out_dir):
        used_method = "docker"

    if used_method is None:
        log.error("All export methods failed.")
        hints = {
            "download": "Hint: try --method docker to convert via Docker (linux/amd64).",
            "docker": "Hint: ensure Docker is running and has internet access.",
        }
        log.info(hints.get(args.method, "Hint: check your internet connection and Docker installation."))
        return 1

    # Record how/what we exported alongside the model file.
    write_metadata(out_dir, used_method, CLASS_LABELS, MODEL_INPUT_SHAPE)

    if args.skip_verify:
        log.info("Skipping verification (--skip-verify).")
    elif not verify_onnx(onnx_path):
        log.error("Exported model failed verification!")
        log.info("The file is kept at %s — inspect manually.", onnx_path)
        return 1

    log.info("Done. Model ready at %s", onnx_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())