Files
breakpilot-lehrer/scripts/doclayout_export_methods.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

312 lines
9.8 KiB
Python

"""
PP-DocLayout ONNX Export Methods
Download and Docker-based conversion methods for PP-DocLayout model.
Extracted from export-doclayout-onnx.py.
"""
import hashlib
import json
import logging
import shutil
import subprocess
import tempfile
import urllib.request
from pathlib import Path
log = logging.getLogger("export-doclayout")
# Known download sources for pre-exported ONNX models, tried in order.
# Each entry: human-readable name, download URL, target filename, and an
# optional SHA-256 checksum (verification is skipped when sha256 is None).
# NOTE(review): both entries currently point at the *same* HuggingFace URL —
# the "RapidOCR mirror" looks like a placeholder; confirm the real mirror URL.
DOWNLOAD_SOURCES = [
    {
        "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # no published checksum available
    },
    {
        "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # no published checksum available
    },
]
# Paddle inference model URLs (for Docker-based conversion).
PADDLE_MODEL_URL = (
    "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar"
)
# Docker image name used for conversion.
DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest"
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    Reads in 1 MiB chunks so arbitrarily large model files can be hashed
    without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while block := fh.read(1 << 20):
            digest.update(block)
    return digest.hexdigest()
def download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Download *url* to *dest* with console progress reporting.

    Returns True on success; on any failure logs a warning, removes the
    partial file, and returns False.
    """
    label = desc if desc else url.rsplit("/", 1)[-1]
    log.info("Downloading %s ...", label)
    log.info(" URL: %s", url)
    try:
        request = urllib.request.Request(
            url, headers={"User-Agent": "breakpilot-export/1.0"}
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            length_header = response.headers.get("Content-Length")
            expected = int(length_header) if length_header else None
            received = 0
            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, "wb") as out:
                # Stream in 256 KB chunks; progress only when size is known.
                while chunk := response.read(1 << 18):
                    out.write(chunk)
                    received += len(chunk)
                    if expected:
                        mb = received / (1 << 20)
                        total_mb = expected / (1 << 20)
                        pct = received * 100 / expected
                        print(
                            f"\r {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)",
                            end="",
                            flush=True,
                        )
            if expected:
                print()  # newline after progress
        size_mb = dest.stat().st_size / (1 << 20)
        log.info(" Downloaded %.1f MB -> %s", size_mb, dest)
        return True
    except Exception as exc:
        # Best-effort: any failure (HTTP, timeout, disk) aborts this source.
        log.warning(" Download failed: %s", exc)
        if dest.exists():
            dest.unlink()
        return False
def try_download(output_dir: Path) -> bool:
    """Attempt to download a pre-exported ONNX model. Returns True on success.

    Iterates DOWNLOAD_SOURCES in order; the first source that downloads,
    passes the optional checksum, and clears a minimum-size sanity check
    wins. Downloads go to a hidden .tmp file first so a failed attempt
    never clobbers an existing model.onnx.
    """
    log.info("=== Method: DOWNLOAD ===")
    output_dir.mkdir(parents=True, exist_ok=True)
    final_path = output_dir / "model.onnx"
    for src in DOWNLOAD_SOURCES:
        log.info("Trying source: %s", src["name"])
        staging = output_dir / f".{src['filename']}.tmp"
        if not download_file(src["url"], staging, desc=src["name"]):
            continue
        # Verify SHA-256 when the source declares one.
        expected_hash = src["sha256"]
        if expected_hash:
            actual = sha256_file(staging)
            if actual != expected_hash:
                log.warning(
                    " SHA-256 mismatch: expected %s, got %s",
                    expected_hash,
                    actual,
                )
                staging.unlink()
                continue
        # Sanity check: anything under 1 MB is not a plausible model file.
        if staging.stat().st_size < (1 << 20):
            log.warning(
                " File too small (%.1f KB) — probably not a valid model.",
                staging.stat().st_size / 1024,
            )
            staging.unlink()
            continue
        shutil.move(str(staging), str(final_path))
        log.info(
            "Model saved to %s (%.1f MB)",
            final_path,
            final_path.stat().st_size / (1 << 20),
        )
        return True
    log.warning("All download sources failed.")
    return False
# Dockerfile used by try_docker() to convert the Paddle model to ONNX.
#
# FIX: the previous version embedded multi-line `RUN python3 -c "..."`
# scripts, which is invalid Dockerfile syntax — a RUN instruction ends at
# the first unescaped newline, so the build could never succeed.  It also
# contained `{\" \".join(cmd)}` inside an f-string expression, a
# SyntaxError on the python:3.11 base image (backslashes in f-string
# expressions require 3.12+).  Both scripts are now fed via BuildKit
# heredocs (`RUN python3 <<'PYEOF'`); the `# syntax` directive pins a
# dockerfile frontend that supports heredocs (BuildKit is the default
# builder in Docker 23+).
DOCKERFILE_CONTENT = r"""# syntax=docker/dockerfile:1
FROM --platform=linux/amd64 python:3.11-slim
RUN pip install --no-cache-dir \
    paddlepaddle==3.0.0 \
    paddle2onnx==1.3.1 \
    onnx==1.17.0 \
    requests
WORKDIR /work
# Download + extract the PP-DocLayout Paddle inference model.
RUN python3 <<'PYEOF'
import urllib.request, tarfile, os
url = 'PADDLE_MODEL_URL_PLACEHOLDER'
print(f'Downloading {url} ...')
dest = '/work/pp_doclayout.tar'
urllib.request.urlretrieve(url, dest)
print('Extracting ...')
with tarfile.open(dest) as t:
    t.extractall('/work/paddle_model')
os.remove(dest)
# List what we extracted.
for root, dirs, files in os.walk('/work/paddle_model'):
    for f in files:
        fp = os.path.join(root, f)
        sz = os.path.getsize(fp)
        print(f'  {fp} ({sz} bytes)')
PYEOF
# Convert the Paddle model to ONNX.
RUN python3 <<'PYEOF'
import os, glob, subprocess
# Find the inference model files.
model_dir = '/work/paddle_model'
pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True)
pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True)
if not pdmodel_files:
    raise FileNotFoundError('No .pdmodel file found in extracted archive')
pdmodel = pdmodel_files[0]
pdiparams = pdiparams_files[0] if pdiparams_files else None
model_dir_actual = os.path.dirname(pdmodel)
print(f'Found model: {pdmodel}')
print(f'Found params: {pdiparams}')
print(f'Model dir: {model_dir_actual}')
cmd = [
    'paddle2onnx',
    '--model_dir', model_dir_actual,
    '--model_filename', os.path.basename(pdmodel),
]
if pdiparams:
    cmd += ['--params_filename', os.path.basename(pdiparams)]
cmd += [
    '--save_file', '/work/output/model.onnx',
    '--opset_version', '14',
    '--enable_onnx_checker', 'True',
]
os.makedirs('/work/output', exist_ok=True)
# Join outside the f-string: backslashes in f-string expressions are a
# SyntaxError before Python 3.12.
joined = ' '.join(cmd)
print(f'Running: {joined}')
subprocess.run(cmd, check=True)
out_size = os.path.getsize('/work/output/model.onnx')
print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)')
PYEOF
CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"]
""".replace(
    "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL
)
def try_docker(output_dir: Path) -> bool:
    """Build a Docker image to convert the Paddle model to ONNX. Returns True on success.

    Writes DOCKERFILE_CONTENT into a temp build context, builds an amd64
    image, then runs it with *output_dir* bind-mounted at /output so the
    container's CMD copies model.onnx out to the host.
    """
    log.info("=== Method: DOCKER (linux/amd64) ===")
    # Verify a working Docker daemon before doing anything expensive.
    docker_bin = shutil.which("docker") or "/usr/local/bin/docker"
    try:
        subprocess.run(
            [docker_bin, "version"],
            capture_output=True,
            check=True,
            timeout=15,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
        log.error("Docker is not available: %s", exc)
        return False

    output_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="doclayout-export-") as scratch:
        build_ctx = Path(scratch)
        dockerfile = build_ctx / "Dockerfile"
        dockerfile.write_text(DOCKERFILE_CONTENT)
        log.info("Wrote Dockerfile to %s", dockerfile)

        # Build stage (streams output straight to the console).
        log.info("Building Docker image (this downloads ~2 GB, may take a while) ...")
        build_cmd = [
            docker_bin, "build",
            "--platform", "linux/amd64",
            "-t", DOCKER_IMAGE_TAG,
            "-f", str(dockerfile),
            str(build_ctx),
        ]
        log.info(" %s", " ".join(build_cmd))
        build_rc = subprocess.run(build_cmd, capture_output=False, timeout=1200).returncode
        if build_rc != 0:
            log.error("Docker build failed (exit code %d).", build_rc)
            return False

        # Run stage: the container copies the converted model into /output.
        log.info("Running conversion container ...")
        run_cmd = [
            docker_bin, "run",
            "--rm",
            "--platform", "linux/amd64",
            "-v", f"{output_dir.resolve()}:/output",
            DOCKER_IMAGE_TAG,
        ]
        log.info(" %s", " ".join(run_cmd))
        run_rc = subprocess.run(run_cmd, capture_output=False, timeout=300).returncode
        if run_rc != 0:
            log.error("Docker run failed (exit code %d).", run_rc)
            return False

    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        log.error("Expected output file not found: %s", model_path)
        return False
    size_mb = model_path.stat().st_size / (1 << 20)
    log.info("Model exported: %s (%.1f MB)", model_path, size_mb)
    return True
def write_metadata(output_dir: Path, method: str, class_labels: list, model_input_shape: tuple) -> None:
    """Write a metadata JSON next to the model for provenance tracking.

    Silently does nothing when output_dir/model.onnx is absent (i.e. no
    export method succeeded).
    """
    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        # Nothing to describe — export must have failed upstream.
        return
    metadata = {
        "model": "PP-DocLayout",
        "format": "ONNX",
        "export_method": method,
        "class_labels": class_labels,
        "input_shape": list(model_input_shape),
        "file_size_bytes": model_path.stat().st_size,
        "sha256": sha256_file(model_path),
    }
    meta_path = output_dir / "metadata.json"
    meta_path.write_text(json.dumps(metadata, indent=2))
    log.info("Metadata written to %s", meta_path)