[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,15 +13,8 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(
|
||||
@@ -49,92 +42,23 @@ CLASS_LABELS = [
|
||||
"abstract",
|
||||
]
|
||||
|
||||
# Known download sources for pre-exported ONNX models.
# Ordered by preference — first successful download wins.
# NOTE(review): both entries below currently point at the *same* HuggingFace
# URL; the "RapidOCR mirror" entry looks like a copy-paste placeholder —
# confirm the intended mirror URL before relying on the fallback.
DOWNLOAD_SOURCES = [
    {
        "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # populated once a known-good hash is available
    },
    {
        "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,
    },
]

# Paddle inference model URLs (for Docker-based conversion).
PADDLE_MODEL_URL = (
    "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar"
)

# Expected input shape for the model (batch, channels, height, width).
MODEL_INPUT_SHAPE = (1, 3, 800, 800)

# Docker image name used for conversion.
DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest"

# Import methods from sibling module
# NOTE(review): the names imported here (try_download, try_docker,
# write_metadata, sha256_file) are ALSO defined later in this same file, so
# the local definitions shadow this import at runtime. This looks like a
# mid-refactor duplicate — either the import or the local definitions should
# be removed once the split is complete.
from doclayout_export_methods import (
    try_download,
    try_docker,
    write_metadata,
    sha256_file,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Helpers & verification
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    Reads in 1 MB blocks so arbitrarily large files can be hashed with a
    flat memory footprint.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            block = fh.read(1 << 20)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Stream *url* into *dest* with progress reporting.

    Returns True on success. On any failure the partially written file is
    removed and False is returned — callers can simply try the next source.
    """
    label = desc or url.split("/")[-1]
    log.info("Downloading %s ...", label)
    log.info(" URL: %s", url)

    try:
        request = urllib.request.Request(url, headers={"User-Agent": "breakpilot-export/1.0"})
        with urllib.request.urlopen(request, timeout=120) as resp:
            header = resp.headers.get("Content-Length")
            total = int(header) if header else None
            received = 0

            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, "wb") as out:
                # 256 KB chunks keep memory flat for multi-hundred-MB models.
                for chunk in iter(lambda: resp.read(1 << 18), b""):
                    out.write(chunk)
                    received += len(chunk)
                    if total:
                        mb = received / (1 << 20)
                        total_mb = total / (1 << 20)
                        pct = received * 100 / total
                        print(
                            f"\r {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)",
                            end="",
                            flush=True,
                        )
            if total:
                print()  # newline after the carriage-return progress line

        size_mb = dest.stat().st_size / (1 << 20)
        log.info(" Downloaded %.1f MB -> %s", size_mb, dest)
        return True

    except Exception as exc:
        log.warning(" Download failed: %s", exc)
        if dest.exists():
            dest.unlink()
        return False
|
||||
|
||||
|
||||
def verify_onnx(model_path: Path) -> bool:
|
||||
"""Load the ONNX model with onnxruntime, run a dummy inference, check outputs."""
|
||||
log.info("Verifying ONNX model: %s", model_path)
|
||||
@@ -169,24 +93,23 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
for out in outputs:
|
||||
log.info(" %s: shape=%s dtype=%s", out.name, out.shape, out.type)
|
||||
|
||||
# Build dummy input — use the first input's name and expected shape.
|
||||
# Build dummy input
|
||||
input_name = inputs[0].name
|
||||
input_shape = inputs[0].shape
|
||||
|
||||
# Replace dynamic dims (strings or None) with concrete sizes.
|
||||
# Replace dynamic dims with concrete sizes.
|
||||
concrete_shape = []
|
||||
for i, dim in enumerate(input_shape):
|
||||
if isinstance(dim, (int,)) and dim > 0:
|
||||
concrete_shape.append(dim)
|
||||
elif i == 0:
|
||||
concrete_shape.append(1) # batch
|
||||
concrete_shape.append(1)
|
||||
elif i == 1:
|
||||
concrete_shape.append(3) # channels
|
||||
concrete_shape.append(3)
|
||||
else:
|
||||
concrete_shape.append(800) # spatial
|
||||
concrete_shape.append(800)
|
||||
concrete_shape = tuple(concrete_shape)
|
||||
|
||||
# Fallback if shape looks wrong — use standard MODEL_INPUT_SHAPE.
|
||||
if len(concrete_shape) != 4:
|
||||
concrete_shape = MODEL_INPUT_SHAPE
|
||||
|
||||
@@ -199,20 +122,15 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
arr = np.asarray(r)
|
||||
log.info(" output[%d]: shape=%s dtype=%s", i, arr.shape, arr.dtype)
|
||||
|
||||
# Basic sanity checks
|
||||
if len(result) == 0:
|
||||
log.error(" Model produced no outputs!")
|
||||
return False
|
||||
|
||||
# Check for at least one output with a bounding-box-like shape (N, 4) or
|
||||
# a detection-like structure. Be lenient — different ONNX exports vary.
|
||||
has_plausible_output = False
|
||||
for r in result:
|
||||
arr = np.asarray(r)
|
||||
# Common detection output shapes: (1, N, 6), (N, 4), (N, 6), (1, N, 5+C), etc.
|
||||
if arr.ndim >= 2 and any(d >= 4 for d in arr.shape):
|
||||
has_plausible_output = True
|
||||
# Some models output (N,) labels or scores
|
||||
if arr.ndim >= 1 and arr.size > 0:
|
||||
has_plausible_output = True
|
||||
|
||||
@@ -229,238 +147,6 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Method: Download
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def try_download(output_dir: Path) -> bool:
    """Try each known source for a pre-exported ONNX model.

    Sources are attempted in DOWNLOAD_SOURCES order; the first candidate
    that downloads and passes the hash/size sanity checks is moved into
    place as ``model.onnx``. Returns True on success.
    """
    log.info("=== Method: DOWNLOAD ===")

    output_dir.mkdir(parents=True, exist_ok=True)
    model_path = output_dir / "model.onnx"

    for src in DOWNLOAD_SOURCES:
        log.info("Trying source: %s", src["name"])
        staging = output_dir / f".{src['filename']}.tmp"

        if not download_file(src["url"], staging, desc=src["name"]):
            continue

        # Verify against the pinned hash when one is known.
        expected = src["sha256"]
        if expected:
            digest = sha256_file(staging)
            if digest != expected:
                log.warning(
                    " SHA-256 mismatch: expected %s, got %s",
                    expected,
                    digest,
                )
                staging.unlink()
                continue

        # Anything under 1 MB is almost certainly an error page, not a model.
        nbytes = staging.stat().st_size
        if nbytes < 1 << 20:
            log.warning(" File too small (%.1f KB) — probably not a valid model.", nbytes / 1024)
            staging.unlink()
            continue

        shutil.move(str(staging), str(model_path))
        log.info("Model saved to %s (%.1f MB)", model_path, model_path.stat().st_size / (1 << 20))
        return True

    log.warning("All download sources failed.")
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Method: Docker
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Dockerfile used by try_docker(): installs Paddle tooling, downloads and
# extracts the PP-DocLayout inference model, converts it with paddle2onnx,
# and the image CMD copies the result into the mounted /output volume.
# PADDLE_MODEL_URL_PLACEHOLDER is substituted below via str.replace().
#
# NOTE(review): the multi-line `RUN python3 -c "` instructions span many
# lines without `\` continuations or BuildKit heredoc syntax; standard
# Dockerfile parsing ends a RUN at the first newline — confirm this image
# actually builds, or convert these to `RUN <<'EOF'` heredocs.
DOCKERFILE_CONTENT = r"""
FROM --platform=linux/amd64 python:3.11-slim

RUN pip install --no-cache-dir \
    paddlepaddle==3.0.0 \
    paddle2onnx==1.3.1 \
    onnx==1.17.0 \
    requests

WORKDIR /work

# Download + extract the PP-DocLayout Paddle inference model.
RUN python3 -c "
import urllib.request, tarfile, os
url = 'PADDLE_MODEL_URL_PLACEHOLDER'
print(f'Downloading {url} ...')
dest = '/work/pp_doclayout.tar'
urllib.request.urlretrieve(url, dest)
print('Extracting ...')
with tarfile.open(dest) as t:
    t.extractall('/work/paddle_model')
os.remove(dest)
# List what we extracted
for root, dirs, files in os.walk('/work/paddle_model'):
    for f in files:
        fp = os.path.join(root, f)
        sz = os.path.getsize(fp)
        print(f' {fp} ({sz} bytes)')
"

# Convert Paddle model to ONNX.
# paddle2onnx expects model_dir with model.pdmodel + model.pdiparams
RUN python3 -c "
import os, glob, subprocess

# Find the inference model files
model_dir = '/work/paddle_model'
pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True)
pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True)

if not pdmodel_files:
    raise FileNotFoundError('No .pdmodel file found in extracted archive')

pdmodel = pdmodel_files[0]
pdiparams = pdiparams_files[0] if pdiparams_files else None
model_dir_actual = os.path.dirname(pdmodel)
pdmodel_name = os.path.basename(pdmodel).replace('.pdmodel', '')

print(f'Found model: {pdmodel}')
print(f'Found params: {pdiparams}')
print(f'Model dir: {model_dir_actual}')
print(f'Model name prefix: {pdmodel_name}')

cmd = [
    'paddle2onnx',
    '--model_dir', model_dir_actual,
    '--model_filename', os.path.basename(pdmodel),
]
if pdiparams:
    cmd += ['--params_filename', os.path.basename(pdiparams)]
cmd += [
    '--save_file', '/work/output/model.onnx',
    '--opset_version', '14',
    '--enable_onnx_checker', 'True',
]

os.makedirs('/work/output', exist_ok=True)
print(f'Running: {\" \".join(cmd)}')
subprocess.run(cmd, check=True)

out_size = os.path.getsize('/work/output/model.onnx')
print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)')
"

CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"]
""".replace(
    "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL
)
|
||||
|
||||
|
||||
def try_docker(output_dir: Path) -> bool:
    """Convert the Paddle model to ONNX via a linux/amd64 Docker build.

    Builds a throwaway converter image from DOCKERFILE_CONTENT, then runs
    it with *output_dir* mounted at /output (the image CMD copies
    model.onnx there). Returns True when the model file appears.
    """
    log.info("=== Method: DOCKER (linux/amd64) ===")

    # Bail out early when the Docker CLI is missing or unresponsive.
    docker_bin = shutil.which("docker") or "/usr/local/bin/docker"
    try:
        subprocess.run(
            [docker_bin, "version"],
            capture_output=True,
            check=True,
            timeout=15,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
        log.error("Docker is not available: %s", exc)
        return False

    output_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(prefix="doclayout-export-") as scratch:
        workdir = Path(scratch)

        # Materialize the Dockerfile in the scratch build context.
        dockerfile_path = workdir / "Dockerfile"
        dockerfile_path.write_text(DOCKERFILE_CONTENT)
        log.info("Wrote Dockerfile to %s", dockerfile_path)

        log.info("Building Docker image (this downloads ~2 GB, may take a while) ...")
        build_cmd = [
            docker_bin, "build",
            "--platform", "linux/amd64",
            "-t", DOCKER_IMAGE_TAG,
            "-f", str(dockerfile_path),
            str(workdir),
        ]
        log.info(" %s", " ".join(build_cmd))
        # Stream build output straight to the terminal; cap at 20 minutes.
        build_proc = subprocess.run(build_cmd, capture_output=False, timeout=1200)
        if build_proc.returncode != 0:
            log.error("Docker build failed (exit code %d).", build_proc.returncode)
            return False

        log.info("Running conversion container ...")
        # Mount output_dir as /output; the image CMD copies model.onnx there.
        run_cmd = [
            docker_bin, "run",
            "--rm",
            "--platform", "linux/amd64",
            "-v", f"{output_dir.resolve()}:/output",
            DOCKER_IMAGE_TAG,
        ]
        log.info(" %s", " ".join(run_cmd))
        run_proc = subprocess.run(run_cmd, capture_output=False, timeout=300)
        if run_proc.returncode != 0:
            log.error("Docker run failed (exit code %d).", run_proc.returncode)
            return False

    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        log.error("Expected output file not found: %s", model_path)
        return False
    size_mb = model_path.stat().st_size / (1 << 20)
    log.info("Model exported: %s (%.1f MB)", model_path, size_mb)
    return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Write metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def write_metadata(output_dir: Path, method: str, class_labels=None, input_shape=None) -> None:
    """Write a metadata JSON next to the model for provenance tracking.

    Silently returns when ``output_dir/model.onnx`` does not exist.

    Args:
        output_dir: Directory containing ``model.onnx``; ``metadata.json``
            is written alongside it.
        method: Export method that produced the model (e.g. "download",
            "docker").
        class_labels: Label list to record; defaults to the module-level
            CLASS_LABELS. (The refactored caller passes this explicitly —
            see ``write_metadata(output_dir, used_method, CLASS_LABELS,
            MODEL_INPUT_SHAPE)`` in main(); the old 2-arg form keeps
            working via these defaults.)
        input_shape: Model input shape to record; defaults to the
            module-level MODEL_INPUT_SHAPE.
    """
    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        return

    meta = {
        "model": "PP-DocLayout",
        "format": "ONNX",
        "export_method": method,
        # Fall back to module globals so existing 2-arg callers still work.
        "class_labels": CLASS_LABELS if class_labels is None else list(class_labels),
        "input_shape": list(MODEL_INPUT_SHAPE if input_shape is None else input_shape),
        "file_size_bytes": model_path.stat().st_size,
        # Provenance hash of the exact exported artifact.
        "sha256": sha256_file(model_path),
    }
    meta_path = output_dir / "metadata.json"
    # Explicit encoding keeps the file UTF-8 regardless of platform locale.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    log.info("Metadata written to %s", meta_path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -527,7 +213,7 @@ def main() -> int:
|
||||
return 1
|
||||
|
||||
# Write metadata.
|
||||
write_metadata(output_dir, used_method)
|
||||
write_metadata(output_dir, used_method, CLASS_LABELS, MODEL_INPUT_SHAPE)
|
||||
|
||||
# Verify.
|
||||
if not args.skip_verify:
|
||||
|
||||
Reference in New Issue
Block a user