feat: Sprint 1 — IPA hardening, regression framework, ground-truth review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s

Track A (Backend):
- Compound word IPA decomposition (schoolbag→school+bag)
- Trailing garbled IPA fragment removal after brackets (R21 fix)
- Regression runner with DB persistence, history endpoints
- Page crop determinism verified with tests

Track B (Frontend):
- OCR Regression dashboard (/ai/ocr-regression)
- Ground Truth Review workflow (/ai/ocr-ground-truth)
  with split-view, confidence highlighting, inline edit,
  batch mark, progress tracking

Track C (Docs):
- OCR-Pipeline.md v5.0 (Steps 5e-5h)
- Regression testing guide
- mkdocs.yml nav update

Track D (Infra):
- TrOCR baseline benchmark script
- run-regression.sh shell script
- Migration 008: regression_runs table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-23 09:21:27 +01:00
parent f5d5d6c59c
commit a1e079b911
13 changed files with 1796 additions and 15 deletions

163
scripts/benchmark-trocr.py Executable file
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
TrOCR Baseline Benchmark — measures PyTorch TrOCR performance.
Metrics:
- RAM usage (RSS) before and after model load
- Inference time per line (min, max, mean, p50, p95)
- Model size on disk
Output: JSON report to stdout (redirect to file for Sprint 2 comparison).
Usage:
python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime
# Add backend to path for imports.
# NOTE(review): assumes this script lives in scripts/ with klausur-service/backend
# as a sibling of scripts/ one level up — confirm against repo layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))
def get_rss_mb():
    """Return this process's *peak* RSS (high-water mark) in MB.

    Fix: the previous docstring said "current" RSS, but ``ru_maxrss`` is the
    peak resident set size — later readings can only grow, never shrink.
    Callers comparing before/after snapshots measure peak deltas.

    Units differ by platform: Linux reports kilobytes, macOS bytes.
    """
    import resource  # POSIX-only; imported lazily so the module still loads elsewhere
    usage = resource.getrusage(resource.RUSAGE_SELF)
    rss = usage.ru_maxrss
    if sys.platform == 'darwin':
        return rss / (1024 * 1024)  # bytes to MB on macOS
    return rss / 1024  # KB to MB on Linux
def get_model_size_mb(model_name, cache_dir=None):
    """Estimate a model's on-disk size from the HuggingFace cache.

    Args:
        model_name: HF model id, e.g. "microsoft/trocr-base-printed".
        cache_dir: cache root to scan; defaults to ~/.cache/huggingface/hub
            (parameter added for testability — default preserves old behavior).

    Returns:
        Total size in MB of regular files under directories whose path
        contains the model id, or 0.0 if the cache/model is absent.
    """
    if cache_dir is None:
        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    total = 0
    model_dir_pattern = model_name.replace('/', '--')
    for root, dirs, files in os.walk(cache_dir):
        if model_dir_pattern not in root:
            continue
        for f in files:
            path = os.path.join(root, f)
            # HF hub caches store snapshots/ entries as symlinks into blobs/;
            # getsize() follows links, so counting them would double the total.
            if os.path.islink(path):
                continue
            try:
                total += os.path.getsize(path)
            except OSError:
                pass  # file vanished mid-scan; keep the best-effort estimate
    return total / (1024 * 1024)  # bytes to MB
def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
    """Run the TrOCR benchmark and return a JSON-serializable report dict.

    Loads the model, runs one uncounted warm-up pass, then times
    ``num_runs`` single-image inference passes on synthetic text-line
    images. On model-load failure returns ``{"error": ...}`` instead of
    raising so the caller can still emit a JSON report.

    Args:
        model_name: HuggingFace model id to benchmark.
        num_runs: number of timed inference passes.
    """
    # Fix: removed unused `import numpy as np` (never referenced).
    from datetime import timezone
    from PIL import Image
    rss_before = get_rss_mb()

    # --- Load model ---
    print(f"Loading model: {model_name}", file=sys.stderr)
    load_start = time.monotonic()
    try:
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        model.eval()
    except Exception as e:
        return {"error": f"Failed to load model: {e}"}
    load_time = time.monotonic() - load_start
    rss_after_load = get_rss_mb()
    model_size = get_model_size_mb(model_name)
    print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr)

    # --- Synthetic test images (text-line sized) ---
    # In production these would be real cropped text lines; a dark rectangle
    # stands in for text, varied per image so runs are not byte-identical.
    test_images = []
    for i in range(num_runs):
        w, h = 384, 48  # typical TrOCR input size
        img = Image.new('RGB', (w, h), 'white')
        pixels = img.load()
        for x in range(50 + i * 10, 200 + i * 5):
            for y in range(10, 38):
                pixels[x, y] = (30, 30, 30)
        test_images.append(img)

    # --- Warm-up run (not counted) ---
    print("Warm-up...", file=sys.stderr)
    import torch
    with torch.no_grad():
        pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values
        _ = model.generate(pixel_values, max_new_tokens=50)

    # --- Timed runs ---
    print(f"Running {num_runs} inference passes...", file=sys.stderr)
    times_ms = []
    for i, img in enumerate(test_images):
        start = time.monotonic()
        with torch.no_grad():
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values, max_new_tokens=50)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        elapsed_ms = (time.monotonic() - start) * 1000
        times_ms.append(elapsed_ms)
        print(f" Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr)
    # NOTE(review): get_rss_mb reports the *peak* RSS, so this value can
    # never drop below after_load even if memory was released.
    rss_after_inference = get_rss_mb()

    # --- Stats ---
    times_sorted = sorted(times_ms)
    p50_idx = len(times_sorted) // 2
    p95_idx = int(len(times_sorted) * 0.95)  # clamped at use site below
    report = {
        "benchmark": "trocr-baseline",
        # Fix: datetime.utcnow() is deprecated (3.12+); use an aware datetime
        # and keep the original "Z"-suffixed ISO format.
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "model": model_name,
        "backend": "pytorch",
        "quantization": "float32",
        "num_runs": num_runs,
        "model_size_mb": round(model_size, 1),
        "ram_mb": {
            "before_load": round(rss_before, 1),
            "after_load": round(rss_after_load, 1),
            "after_inference": round(rss_after_inference, 1),
            "model_delta": round(rss_after_load - rss_before, 1),
        },
        "load_time_seconds": round(load_time, 2),
        "inference_ms": {
            "min": round(min(times_ms), 1),
            "max": round(max(times_ms), 1),
            "mean": round(sum(times_ms) / len(times_ms), 1),
            "p50": round(times_sorted[p50_idx], 1),
            "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1),
        },
        "times_ms": [round(t, 1) for t in times_ms],
        "platform": {
            "python": sys.version.split()[0],
            "os": sys.platform,
        },
    }
    return report
def main():
    """CLI entry point: parse args, run the benchmark, print the JSON report.

    Exits with status 1 when the benchmark reports a load failure so that
    CI and shell callers can detect it (previously the script exited 0
    even when the model failed to load).
    """
    parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark")
    parser.add_argument("--model", default="microsoft/trocr-base-printed",
                        help="HuggingFace model name")
    parser.add_argument("--runs", type=int, default=10,
                        help="Number of inference runs")
    args = parser.parse_args()
    report = benchmark_trocr(model_name=args.model, num_runs=args.runs)
    # Report always goes to stdout so redirection to a file keeps working.
    print(json.dumps(report, indent=2))
    if "error" in report:
        sys.exit(1)


if __name__ == "__main__":
    main()

61
scripts/run-regression.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Run OCR pipeline regression tests and exit non-zero on failure.
#
# Usage:
#   ./scripts/run-regression.sh                       # default: macmini:8086
#   ./scripts/run-regression.sh http://localhost:8086
#
# Exit codes:
#   0 = all pass
#   1 = failures or errors
#   2 = connection error or unparseable response
set -euo pipefail

BASE_URL="${1:-http://macmini:8086}"
ENDPOINT="${BASE_URL}/api/v1/ocr-pipeline/regression/run?triggered_by=script"

echo "=== OCR Pipeline Regression Suite ==="
echo "Endpoint: ${ENDPOINT}"
echo ""

# -f makes curl fail on HTTP error statuses as well as network errors.
RESPONSE=$(curl -sf -X POST "${ENDPOINT}" -H "Content-Type: application/json" 2>&1) || {
  echo "ERROR: Could not reach ${ENDPOINT}"
  exit 2
}

# Fix: parse the response ONCE instead of forking six python3 processes,
# and use .get() defaults so a missing key prints "?" rather than aborting
# mid-report with a Python traceback under `set -e`.  The heredoc is the
# Python program (stdin); the payload travels via the environment, which
# also avoids any quoting issues.  Python's exit code is the verdict.
RC=0
RESPONSE="${RESPONSE}" python3 <<'PY' || RC=$?
import json
import os
import sys

try:
    data = json.loads(os.environ["RESPONSE"])
except ValueError:
    print("ERROR: response is not valid JSON")
    sys.exit(2)

summary = data.get("summary", {})
status = data.get("status", "?")
print(f"Status: {status}")
print(f"Total: {summary.get('total', '?')}")
print(f"Passed: {summary.get('passed', '?')}")
print(f"Failed: {summary.get('failed', '?')}")
print(f"Errors: {summary.get('errors', '?')}")
print(f"Duration: {data.get('duration_ms', '?')}ms")
print()

if status == "pass":
    print("PASS — All regression tests passed.")
    sys.exit(0)

print("FAIL — Regression failures detected!")
# Print failure details
for r in data.get("results", []):
    if r.get("status") != "pass":
        print(f"  {r.get('status', '?').upper()}: {r.get('name', r.get('session_id'))}")
        if "error" in r:
            print(f"    Error: {r['error']}")
        ds = r.get("diff_summary", {})
        if ds:
            print(f"    Structural: {ds.get('structural_changes', 0)}, Text: {ds.get('text_changes', 0)}, Missing: {ds.get('cells_missing', 0)}, Added: {ds.get('cells_added', 0)}")
sys.exit(1)
PY
exit "${RC}"