feat: Sprint 1 — IPA hardening, regression framework, ground-truth review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s

Track A (Backend):
- Compound word IPA decomposition (schoolbag→school+bag)
- Trailing garbled IPA fragment removal after brackets (R21 fix)
- Regression runner with DB persistence, history endpoints
- Page crop determinism verified with tests

Track B (Frontend):
- OCR Regression dashboard (/ai/ocr-regression)
- Ground Truth Review workflow (/ai/ocr-ground-truth)
  with split-view, confidence highlighting, inline edit,
  batch mark, progress tracking

Track C (Docs):
- OCR-Pipeline.md v5.0 (Steps 5e-5h)
- Regression testing guide
- mkdocs.yml nav update

Track D (Infra):
- TrOCR baseline benchmark script
- run-regression.sh shell script
- Migration 008: regression_runs table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-23 09:21:27 +01:00
parent f5d5d6c59c
commit a1e079b911
13 changed files with 1796 additions and 15 deletions

163
scripts/benchmark-trocr.py Executable file
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
TrOCR Baseline Benchmark — measures PyTorch TrOCR performance.
Metrics:
- RAM usage (RSS) before and after model load
- Inference time per line (min, max, mean, p50, p95)
- Model size on disk
Output: JSON report to stdout (redirect to file for Sprint 2 comparison).
Usage:
python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime
# Add backend to path for imports.
# NOTE(review): assumes this script lives in scripts/ with klausur-service/backend
# as a sibling of scripts/ one level up — confirm against repo layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))
def get_rss_mb():
    """Return this process's *peak* RSS (high-water mark) in MB.

    Fix: the previous docstring said "current" RSS, but ``ru_maxrss`` is the
    peak resident set size — later readings can only grow, never shrink.
    Callers comparing before/after snapshots measure peak deltas.

    Units differ by platform: Linux reports kilobytes, macOS bytes.
    """
    import resource  # POSIX-only; imported lazily so the module still loads elsewhere
    usage = resource.getrusage(resource.RUSAGE_SELF)
    rss = usage.ru_maxrss
    if sys.platform == 'darwin':
        return rss / (1024 * 1024)  # bytes to MB on macOS
    return rss / 1024  # KB to MB on Linux
def get_model_size_mb(model_name, cache_dir=None):
    """Estimate a model's on-disk size from the HuggingFace cache.

    Args:
        model_name: HF model id, e.g. "microsoft/trocr-base-printed".
        cache_dir: cache root to scan; defaults to ~/.cache/huggingface/hub
            (parameter added for testability — default preserves old behavior).

    Returns:
        Total size in MB of regular files under directories whose path
        contains the model id, or 0.0 if the cache/model is absent.
    """
    if cache_dir is None:
        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    total = 0
    model_dir_pattern = model_name.replace('/', '--')
    for root, dirs, files in os.walk(cache_dir):
        if model_dir_pattern not in root:
            continue
        for f in files:
            path = os.path.join(root, f)
            # HF hub caches store snapshots/ entries as symlinks into blobs/;
            # getsize() follows links, so counting them would double the total.
            if os.path.islink(path):
                continue
            try:
                total += os.path.getsize(path)
            except OSError:
                pass  # file vanished mid-scan; keep the best-effort estimate
    return total / (1024 * 1024)  # bytes to MB
def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
    """Run the TrOCR benchmark and return a JSON-serializable report dict.

    Loads the model, runs one uncounted warm-up pass, then times
    ``num_runs`` single-image inference passes on synthetic text-line
    images. On model-load failure returns ``{"error": ...}`` instead of
    raising so the caller can still emit a JSON report.

    Args:
        model_name: HuggingFace model id to benchmark.
        num_runs: number of timed inference passes.
    """
    # Fix: removed unused `import numpy as np` (never referenced).
    from datetime import timezone
    from PIL import Image
    rss_before = get_rss_mb()

    # --- Load model ---
    print(f"Loading model: {model_name}", file=sys.stderr)
    load_start = time.monotonic()
    try:
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        model.eval()
    except Exception as e:
        return {"error": f"Failed to load model: {e}"}
    load_time = time.monotonic() - load_start
    rss_after_load = get_rss_mb()
    model_size = get_model_size_mb(model_name)
    print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr)

    # --- Synthetic test images (text-line sized) ---
    # In production these would be real cropped text lines; a dark rectangle
    # stands in for text, varied per image so runs are not byte-identical.
    test_images = []
    for i in range(num_runs):
        w, h = 384, 48  # typical TrOCR input size
        img = Image.new('RGB', (w, h), 'white')
        pixels = img.load()
        for x in range(50 + i * 10, 200 + i * 5):
            for y in range(10, 38):
                pixels[x, y] = (30, 30, 30)
        test_images.append(img)

    # --- Warm-up run (not counted) ---
    print("Warm-up...", file=sys.stderr)
    import torch
    with torch.no_grad():
        pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values
        _ = model.generate(pixel_values, max_new_tokens=50)

    # --- Timed runs ---
    print(f"Running {num_runs} inference passes...", file=sys.stderr)
    times_ms = []
    for i, img in enumerate(test_images):
        start = time.monotonic()
        with torch.no_grad():
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values, max_new_tokens=50)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        elapsed_ms = (time.monotonic() - start) * 1000
        times_ms.append(elapsed_ms)
        print(f" Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr)
    # NOTE(review): get_rss_mb reports the *peak* RSS, so this value can
    # never drop below after_load even if memory was released.
    rss_after_inference = get_rss_mb()

    # --- Stats ---
    times_sorted = sorted(times_ms)
    p50_idx = len(times_sorted) // 2
    p95_idx = int(len(times_sorted) * 0.95)  # clamped at use site below
    report = {
        "benchmark": "trocr-baseline",
        # Fix: datetime.utcnow() is deprecated (3.12+); use an aware datetime
        # and keep the original "Z"-suffixed ISO format.
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "model": model_name,
        "backend": "pytorch",
        "quantization": "float32",
        "num_runs": num_runs,
        "model_size_mb": round(model_size, 1),
        "ram_mb": {
            "before_load": round(rss_before, 1),
            "after_load": round(rss_after_load, 1),
            "after_inference": round(rss_after_inference, 1),
            "model_delta": round(rss_after_load - rss_before, 1),
        },
        "load_time_seconds": round(load_time, 2),
        "inference_ms": {
            "min": round(min(times_ms), 1),
            "max": round(max(times_ms), 1),
            "mean": round(sum(times_ms) / len(times_ms), 1),
            "p50": round(times_sorted[p50_idx], 1),
            "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1),
        },
        "times_ms": [round(t, 1) for t in times_ms],
        "platform": {
            "python": sys.version.split()[0],
            "os": sys.platform,
        },
    }
    return report
def main():
    """CLI entry point: parse args, run the benchmark, print the JSON report.

    Exits with status 1 when the benchmark reports a load failure so that
    CI and shell callers can detect it (previously the script exited 0
    even when the model failed to load).
    """
    parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark")
    parser.add_argument("--model", default="microsoft/trocr-base-printed",
                        help="HuggingFace model name")
    parser.add_argument("--runs", type=int, default=10,
                        help="Number of inference runs")
    args = parser.parse_args()
    report = benchmark_trocr(model_name=args.model, num_runs=args.runs)
    # Report always goes to stdout so redirection to a file keeps working.
    print(json.dumps(report, indent=2))
    if "error" in report:
        sys.exit(1)


if __name__ == "__main__":
    main()

61
scripts/run-regression.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Run OCR pipeline regression tests and exit non-zero on failure.
#
# Usage:
#   ./scripts/run-regression.sh                       # default: macmini:8086
#   ./scripts/run-regression.sh http://localhost:8086
#
# Exit codes:
#   0 = all pass
#   1 = failures or errors
#   2 = connection error or unparseable response
set -euo pipefail

BASE_URL="${1:-http://macmini:8086}"
ENDPOINT="${BASE_URL}/api/v1/ocr-pipeline/regression/run?triggered_by=script"

echo "=== OCR Pipeline Regression Suite ==="
echo "Endpoint: ${ENDPOINT}"
echo ""

# -f makes curl fail on HTTP error statuses as well as network errors.
RESPONSE=$(curl -sf -X POST "${ENDPOINT}" -H "Content-Type: application/json" 2>&1) || {
  echo "ERROR: Could not reach ${ENDPOINT}"
  exit 2
}

# Fix: parse the response ONCE instead of forking six python3 processes,
# and use .get() defaults so a missing key prints "?" rather than aborting
# mid-report with a Python traceback under `set -e`.  The heredoc is the
# Python program (stdin); the payload travels via the environment, which
# also avoids any quoting issues.  Python's exit code is the verdict.
RC=0
RESPONSE="${RESPONSE}" python3 <<'PY' || RC=$?
import json
import os
import sys

try:
    data = json.loads(os.environ["RESPONSE"])
except ValueError:
    print("ERROR: response is not valid JSON")
    sys.exit(2)

summary = data.get("summary", {})
status = data.get("status", "?")
print(f"Status: {status}")
print(f"Total: {summary.get('total', '?')}")
print(f"Passed: {summary.get('passed', '?')}")
print(f"Failed: {summary.get('failed', '?')}")
print(f"Errors: {summary.get('errors', '?')}")
print(f"Duration: {data.get('duration_ms', '?')}ms")
print()

if status == "pass":
    print("PASS — All regression tests passed.")
    sys.exit(0)

print("FAIL — Regression failures detected!")
# Print failure details
for r in data.get("results", []):
    if r.get("status") != "pass":
        print(f"  {r.get('status', '?').upper()}: {r.get('name', r.get('session_id'))}")
        if "error" in r:
            print(f"    Error: {r['error']}")
        ds = r.get("diff_summary", {})
        if ds:
            print(f"    Structural: {ds.get('structural_changes', 0)}, Text: {ds.get('text_changes', 0)}, Missing: {ds.get('cells_missing', 0)}, Added: {ds.get('cells_added', 0)}")
sys.exit(1)
PY
exit "${RC}"