[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,15 +13,8 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(
|
||||
@@ -49,92 +42,23 @@ CLASS_LABELS = [
|
||||
"abstract",
|
||||
]
|
||||
|
||||
# Known download sources for pre-exported ONNX models.
# Ordered by preference — first successful download wins.
# NOTE(review): both entries below currently point at the *same* HuggingFace
# URL; the "RapidOCR mirror" entry looks like a copy-paste placeholder —
# confirm the intended mirror URL before relying on the fallback.
DOWNLOAD_SOURCES = [
    {
        "name": "PaddleOCR PP-DocLayout (ppyoloe_plus_sod, HuggingFace)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,  # populated once a known-good hash is available
    },
    {
        "name": "PaddleOCR PP-DocLayout (RapidOCR mirror)",
        "url": "https://huggingface.co/SWHL/PP-DocLayout/resolve/main/pp_doclayout_onnx/model.onnx",
        "filename": "model.onnx",
        "sha256": None,
    },
]

# Paddle inference model URLs (for Docker-based conversion).
PADDLE_MODEL_URL = (
    "https://paddleocr.bj.bcebos.com/PP-DocLayout/PP-DocLayout_plus.tar"
)

# Expected input shape for the model (batch, channels, height, width).
MODEL_INPUT_SHAPE = (1, 3, 800, 800)

# Docker image name used for conversion.
DOCKER_IMAGE_TAG = "breakpilot/paddle2onnx-converter:latest"

# Import methods from sibling module
# NOTE(review): the names imported here (try_download, try_docker,
# write_metadata, sha256_file) are ALSO defined later in this same file, so
# the local definitions shadow this import at runtime. This looks like a
# mid-refactor duplicate — either the import or the local definitions should
# be removed once the split is complete.
from doclayout_export_methods import (
    try_download,
    try_docker,
    write_metadata,
    sha256_file,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Helpers & verification
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    Reads in 1 MB blocks so arbitrarily large files can be hashed with a
    flat memory footprint.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            block = fh.read(1 << 20)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Stream *url* into *dest* with progress reporting.

    Returns True on success. On any failure the partially written file is
    removed and False is returned — callers can simply try the next source.
    """
    label = desc or url.split("/")[-1]
    log.info("Downloading %s ...", label)
    log.info(" URL: %s", url)

    try:
        request = urllib.request.Request(url, headers={"User-Agent": "breakpilot-export/1.0"})
        with urllib.request.urlopen(request, timeout=120) as resp:
            header = resp.headers.get("Content-Length")
            total = int(header) if header else None
            received = 0

            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, "wb") as out:
                # 256 KB chunks keep memory flat for multi-hundred-MB models.
                for chunk in iter(lambda: resp.read(1 << 18), b""):
                    out.write(chunk)
                    received += len(chunk)
                    if total:
                        mb = received / (1 << 20)
                        total_mb = total / (1 << 20)
                        pct = received * 100 / total
                        print(
                            f"\r {mb:.1f}/{total_mb:.1f} MB ({pct:.0f}%)",
                            end="",
                            flush=True,
                        )
            if total:
                print()  # newline after the carriage-return progress line

        size_mb = dest.stat().st_size / (1 << 20)
        log.info(" Downloaded %.1f MB -> %s", size_mb, dest)
        return True

    except Exception as exc:
        log.warning(" Download failed: %s", exc)
        if dest.exists():
            dest.unlink()
        return False
|
||||
|
||||
|
||||
def verify_onnx(model_path: Path) -> bool:
|
||||
"""Load the ONNX model with onnxruntime, run a dummy inference, check outputs."""
|
||||
log.info("Verifying ONNX model: %s", model_path)
|
||||
@@ -169,24 +93,23 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
for out in outputs:
|
||||
log.info(" %s: shape=%s dtype=%s", out.name, out.shape, out.type)
|
||||
|
||||
# Build dummy input — use the first input's name and expected shape.
|
||||
# Build dummy input
|
||||
input_name = inputs[0].name
|
||||
input_shape = inputs[0].shape
|
||||
|
||||
# Replace dynamic dims (strings or None) with concrete sizes.
|
||||
# Replace dynamic dims with concrete sizes.
|
||||
concrete_shape = []
|
||||
for i, dim in enumerate(input_shape):
|
||||
if isinstance(dim, (int,)) and dim > 0:
|
||||
concrete_shape.append(dim)
|
||||
elif i == 0:
|
||||
concrete_shape.append(1) # batch
|
||||
concrete_shape.append(1)
|
||||
elif i == 1:
|
||||
concrete_shape.append(3) # channels
|
||||
concrete_shape.append(3)
|
||||
else:
|
||||
concrete_shape.append(800) # spatial
|
||||
concrete_shape.append(800)
|
||||
concrete_shape = tuple(concrete_shape)
|
||||
|
||||
# Fallback if shape looks wrong — use standard MODEL_INPUT_SHAPE.
|
||||
if len(concrete_shape) != 4:
|
||||
concrete_shape = MODEL_INPUT_SHAPE
|
||||
|
||||
@@ -199,20 +122,15 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
arr = np.asarray(r)
|
||||
log.info(" output[%d]: shape=%s dtype=%s", i, arr.shape, arr.dtype)
|
||||
|
||||
# Basic sanity checks
|
||||
if len(result) == 0:
|
||||
log.error(" Model produced no outputs!")
|
||||
return False
|
||||
|
||||
# Check for at least one output with a bounding-box-like shape (N, 4) or
|
||||
# a detection-like structure. Be lenient — different ONNX exports vary.
|
||||
has_plausible_output = False
|
||||
for r in result:
|
||||
arr = np.asarray(r)
|
||||
# Common detection output shapes: (1, N, 6), (N, 4), (N, 6), (1, N, 5+C), etc.
|
||||
if arr.ndim >= 2 and any(d >= 4 for d in arr.shape):
|
||||
has_plausible_output = True
|
||||
# Some models output (N,) labels or scores
|
||||
if arr.ndim >= 1 and arr.size > 0:
|
||||
has_plausible_output = True
|
||||
|
||||
@@ -229,238 +147,6 @@ def verify_onnx(model_path: Path) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Method: Download
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def try_download(output_dir: Path) -> bool:
    """Try each known source for a pre-exported ONNX model.

    Sources are attempted in DOWNLOAD_SOURCES order; the first candidate
    that downloads and passes the hash/size sanity checks is moved into
    place as ``model.onnx``. Returns True on success.
    """
    log.info("=== Method: DOWNLOAD ===")

    output_dir.mkdir(parents=True, exist_ok=True)
    model_path = output_dir / "model.onnx"

    for src in DOWNLOAD_SOURCES:
        log.info("Trying source: %s", src["name"])
        staging = output_dir / f".{src['filename']}.tmp"

        if not download_file(src["url"], staging, desc=src["name"]):
            continue

        # Verify against the pinned hash when one is known.
        expected = src["sha256"]
        if expected:
            digest = sha256_file(staging)
            if digest != expected:
                log.warning(
                    " SHA-256 mismatch: expected %s, got %s",
                    expected,
                    digest,
                )
                staging.unlink()
                continue

        # Anything under 1 MB is almost certainly an error page, not a model.
        nbytes = staging.stat().st_size
        if nbytes < 1 << 20:
            log.warning(" File too small (%.1f KB) — probably not a valid model.", nbytes / 1024)
            staging.unlink()
            continue

        shutil.move(str(staging), str(model_path))
        log.info("Model saved to %s (%.1f MB)", model_path, model_path.stat().st_size / (1 << 20))
        return True

    log.warning("All download sources failed.")
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Method: Docker
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Dockerfile used by try_docker(): installs Paddle tooling, downloads and
# extracts the PP-DocLayout inference model, converts it with paddle2onnx,
# and the image CMD copies the result into the mounted /output volume.
# PADDLE_MODEL_URL_PLACEHOLDER is substituted below via str.replace().
#
# NOTE(review): the multi-line `RUN python3 -c "` instructions span many
# lines without `\` continuations or BuildKit heredoc syntax; standard
# Dockerfile parsing ends a RUN at the first newline — confirm this image
# actually builds, or convert these to `RUN <<'EOF'` heredocs.
DOCKERFILE_CONTENT = r"""
FROM --platform=linux/amd64 python:3.11-slim

RUN pip install --no-cache-dir \
    paddlepaddle==3.0.0 \
    paddle2onnx==1.3.1 \
    onnx==1.17.0 \
    requests

WORKDIR /work

# Download + extract the PP-DocLayout Paddle inference model.
RUN python3 -c "
import urllib.request, tarfile, os
url = 'PADDLE_MODEL_URL_PLACEHOLDER'
print(f'Downloading {url} ...')
dest = '/work/pp_doclayout.tar'
urllib.request.urlretrieve(url, dest)
print('Extracting ...')
with tarfile.open(dest) as t:
    t.extractall('/work/paddle_model')
os.remove(dest)
# List what we extracted
for root, dirs, files in os.walk('/work/paddle_model'):
    for f in files:
        fp = os.path.join(root, f)
        sz = os.path.getsize(fp)
        print(f' {fp} ({sz} bytes)')
"

# Convert Paddle model to ONNX.
# paddle2onnx expects model_dir with model.pdmodel + model.pdiparams
RUN python3 -c "
import os, glob, subprocess

# Find the inference model files
model_dir = '/work/paddle_model'
pdmodel_files = glob.glob(os.path.join(model_dir, '**', '*.pdmodel'), recursive=True)
pdiparams_files = glob.glob(os.path.join(model_dir, '**', '*.pdiparams'), recursive=True)

if not pdmodel_files:
    raise FileNotFoundError('No .pdmodel file found in extracted archive')

pdmodel = pdmodel_files[0]
pdiparams = pdiparams_files[0] if pdiparams_files else None
model_dir_actual = os.path.dirname(pdmodel)
pdmodel_name = os.path.basename(pdmodel).replace('.pdmodel', '')

print(f'Found model: {pdmodel}')
print(f'Found params: {pdiparams}')
print(f'Model dir: {model_dir_actual}')
print(f'Model name prefix: {pdmodel_name}')

cmd = [
    'paddle2onnx',
    '--model_dir', model_dir_actual,
    '--model_filename', os.path.basename(pdmodel),
]
if pdiparams:
    cmd += ['--params_filename', os.path.basename(pdiparams)]
cmd += [
    '--save_file', '/work/output/model.onnx',
    '--opset_version', '14',
    '--enable_onnx_checker', 'True',
]

os.makedirs('/work/output', exist_ok=True)
print(f'Running: {\" \".join(cmd)}')
subprocess.run(cmd, check=True)

out_size = os.path.getsize('/work/output/model.onnx')
print(f'Conversion done: /work/output/model.onnx ({out_size} bytes)')
"

CMD ["cp", "-v", "/work/output/model.onnx", "/output/model.onnx"]
""".replace(
    "PADDLE_MODEL_URL_PLACEHOLDER", PADDLE_MODEL_URL
)
|
||||
|
||||
|
||||
def try_docker(output_dir: Path) -> bool:
    """Convert the Paddle model to ONNX via a linux/amd64 Docker build.

    Builds a throwaway converter image from DOCKERFILE_CONTENT, then runs
    it with *output_dir* mounted at /output (the image CMD copies
    model.onnx there). Returns True when the model file appears.
    """
    log.info("=== Method: DOCKER (linux/amd64) ===")

    # Bail out early when the Docker CLI is missing or unresponsive.
    docker_bin = shutil.which("docker") or "/usr/local/bin/docker"
    try:
        subprocess.run(
            [docker_bin, "version"],
            capture_output=True,
            check=True,
            timeout=15,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
        log.error("Docker is not available: %s", exc)
        return False

    output_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(prefix="doclayout-export-") as scratch:
        workdir = Path(scratch)

        # Materialize the Dockerfile in the scratch build context.
        dockerfile_path = workdir / "Dockerfile"
        dockerfile_path.write_text(DOCKERFILE_CONTENT)
        log.info("Wrote Dockerfile to %s", dockerfile_path)

        log.info("Building Docker image (this downloads ~2 GB, may take a while) ...")
        build_cmd = [
            docker_bin, "build",
            "--platform", "linux/amd64",
            "-t", DOCKER_IMAGE_TAG,
            "-f", str(dockerfile_path),
            str(workdir),
        ]
        log.info(" %s", " ".join(build_cmd))
        # Stream build output straight to the terminal; cap at 20 minutes.
        build_proc = subprocess.run(build_cmd, capture_output=False, timeout=1200)
        if build_proc.returncode != 0:
            log.error("Docker build failed (exit code %d).", build_proc.returncode)
            return False

        log.info("Running conversion container ...")
        # Mount output_dir as /output; the image CMD copies model.onnx there.
        run_cmd = [
            docker_bin, "run",
            "--rm",
            "--platform", "linux/amd64",
            "-v", f"{output_dir.resolve()}:/output",
            DOCKER_IMAGE_TAG,
        ]
        log.info(" %s", " ".join(run_cmd))
        run_proc = subprocess.run(run_cmd, capture_output=False, timeout=300)
        if run_proc.returncode != 0:
            log.error("Docker run failed (exit code %d).", run_proc.returncode)
            return False

    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        log.error("Expected output file not found: %s", model_path)
        return False
    size_mb = model_path.stat().st_size / (1 << 20)
    log.info("Model exported: %s (%.1f MB)", model_path, size_mb)
    return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Write metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def write_metadata(output_dir: Path, method: str, class_labels=None, input_shape=None) -> None:
    """Write a metadata JSON next to the model for provenance tracking.

    Silently returns when ``output_dir/model.onnx`` does not exist.

    Args:
        output_dir: Directory containing ``model.onnx``; ``metadata.json``
            is written alongside it.
        method: Export method that produced the model (e.g. "download",
            "docker").
        class_labels: Label list to record; defaults to the module-level
            CLASS_LABELS. (The refactored caller passes this explicitly —
            see ``write_metadata(output_dir, used_method, CLASS_LABELS,
            MODEL_INPUT_SHAPE)`` in main(); the old 2-arg form keeps
            working via these defaults.)
        input_shape: Model input shape to record; defaults to the
            module-level MODEL_INPUT_SHAPE.
    """
    model_path = output_dir / "model.onnx"
    if not model_path.exists():
        return

    meta = {
        "model": "PP-DocLayout",
        "format": "ONNX",
        "export_method": method,
        # Fall back to module globals so existing 2-arg callers still work.
        "class_labels": CLASS_LABELS if class_labels is None else list(class_labels),
        "input_shape": list(MODEL_INPUT_SHAPE if input_shape is None else input_shape),
        "file_size_bytes": model_path.stat().st_size,
        # Provenance hash of the exact exported artifact.
        "sha256": sha256_file(model_path),
    }
    meta_path = output_dir / "metadata.json"
    # Explicit encoding keeps the file UTF-8 regardless of platform locale.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    log.info("Metadata written to %s", meta_path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -527,7 +213,7 @@ def main() -> int:
|
||||
return 1
|
||||
|
||||
# Write metadata.
|
||||
write_metadata(output_dir, used_method)
|
||||
write_metadata(output_dir, used_method, CLASS_LABELS, MODEL_INPUT_SHAPE)
|
||||
|
||||
# Verify.
|
||||
if not args.skip_verify:
|
||||
|
||||
Reference in New Issue
Block a user