feat: Sprint 1 — IPA hardening, regression framework, ground-truth review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Track A (Backend): - Compound word IPA decomposition (schoolbag→school+bag) - Trailing garbled IPA fragment removal after brackets (R21 fix) - Regression runner with DB persistence, history endpoints - Page crop determinism verified with tests Track B (Frontend): - OCR Regression dashboard (/ai/ocr-regression) - Ground Truth Review workflow (/ai/ocr-ground-truth) with split-view, confidence highlighting, inline edit, batch mark, progress tracking Track C (Docs): - OCR-Pipeline.md v5.0 (Steps 5e-5h) - Regression testing guide - mkdocs.yml nav update Track D (Infra): - TrOCR baseline benchmark script - run-regression.sh shell script - Migration 008: regression_runs table Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
163
scripts/benchmark-trocr.py
Executable file
163
scripts/benchmark-trocr.py
Executable file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TrOCR Baseline Benchmark — measures PyTorch TrOCR performance.
|
||||
|
||||
Metrics:
|
||||
- RAM usage (RSS) before and after model load
|
||||
- Inference time per line (min, max, mean, p50, p95)
|
||||
- Model size on disk
|
||||
|
||||
Output: JSON report to stdout (redirect to file for Sprint 2 comparison).
|
||||
|
||||
Usage:
|
||||
python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
|
||||
python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
# Add backend to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))
|
||||
|
||||
|
||||
def get_rss_mb():
    """Return this process's peak RSS (ru_maxrss) in MB.

    NOTE(review): ru_maxrss is the *peak* resident set size, not the
    instantaneous one; for before/after deltas in a monotonically growing
    benchmark this is close enough — confirm if point-in-time RSS is needed.
    """
    import resource  # Unix-only; imported lazily, as in the original

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # The unit of ru_maxrss differs by platform: bytes on macOS, KB on Linux.
    if sys.platform == 'darwin':
        return peak / (1024 * 1024)
    return peak / 1024
|
||||
|
||||
|
||||
def get_model_size_mb(model_name, cache_dir=None):
    """Estimate a model's on-disk size from the HuggingFace cache.

    Walks the cache tree and sums the sizes of regular files under any
    directory whose path contains the cache-mangled model name
    (``org/name`` -> ``org--name``).

    Symlinks are skipped: HF snapshot entries are symlinks into ``blobs/``,
    so following them (as ``os.path.getsize`` does) would double-count
    every weight file.

    Args:
        model_name: HuggingFace model id, e.g. "microsoft/trocr-base-printed".
        cache_dir: Cache root to scan; defaults to ~/.cache/huggingface/hub.

    Returns:
        Total size in MB (0.0 when the model is not cached).
    """
    if cache_dir is None:
        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    model_dir_pattern = model_name.replace('/', '--')
    total = 0
    for root, _dirs, files in os.walk(cache_dir):
        if model_dir_pattern in root:
            for f in files:
                path = os.path.join(root, f)
                # Count only real files, not snapshot symlinks into blobs/.
                if not os.path.islink(path):
                    total += os.path.getsize(path)
    return total / (1024 * 1024)  # bytes to MB
|
||||
|
||||
|
||||
def _make_test_images(num_runs):
    """Build ``num_runs`` synthetic white 384x48 line images, each with a
    dark rectangle of per-image varying width standing in for text.

    In production these would be real cropped text lines; synthetic input
    is enough for a latency/RAM baseline.
    """
    from PIL import Image

    images = []
    for i in range(num_runs):
        w, h = 384, 48  # typical TrOCR input size
        img = Image.new('RGB', (w, h), 'white')
        pixels = img.load()
        # Simple dark region to simulate text; width varies per image.
        for x in range(50 + i * 10, 200 + i * 5):
            for y in range(10, 38):
                pixels[x, y] = (30, 30, 30)
        images.append(img)
    return images


def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
    """Run the TrOCR benchmark and return a JSON-serializable report dict.

    Loads the model, runs one uncounted warm-up pass, then times
    ``num_runs`` single-image inference passes, reporting RAM deltas,
    load time, model size and latency percentiles. Progress goes to
    stderr so stdout stays clean for the JSON report.

    Args:
        model_name: HuggingFace model id to benchmark.
        num_runs: Number of timed inference passes (must be >= 1).

    Returns:
        Report dict, or ``{"error": ...}`` when loading fails or
        ``num_runs`` is invalid.
    """
    # Heavy/optional imports stay local so the module imports without them.
    from datetime import timezone

    if num_runs < 1:
        # min()/max() over an empty sample list would crash below.
        return {"error": f"num_runs must be >= 1, got {num_runs}"}

    rss_before = get_rss_mb()

    # Load model
    print(f"Loading model: {model_name}", file=sys.stderr)
    load_start = time.monotonic()

    try:
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        model.eval()
    except Exception as e:
        return {"error": f"Failed to load model: {e}"}

    load_time = time.monotonic() - load_start
    rss_after_load = get_rss_mb()
    model_size = get_model_size_mb(model_name)

    print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr)

    test_images = _make_test_images(num_runs)

    # Warm-up run (not counted) — first generate() pays one-time setup costs.
    print("Warm-up...", file=sys.stderr)
    import torch
    with torch.no_grad():
        pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values
        _ = model.generate(pixel_values, max_new_tokens=50)

    # Benchmark runs
    print(f"Running {num_runs} inference passes...", file=sys.stderr)
    times_ms = []
    for i, img in enumerate(test_images):
        start = time.monotonic()
        with torch.no_grad():
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values, max_new_tokens=50)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        elapsed_ms = (time.monotonic() - start) * 1000
        times_ms.append(elapsed_ms)
        print(f" Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr)

    rss_after_inference = get_rss_mb()

    # Percentile indices on the sorted samples; p95 is clamped for small runs.
    times_sorted = sorted(times_ms)
    p50_idx = len(times_sorted) // 2
    p95_idx = int(len(times_sorted) * 0.95)

    report = {
        "benchmark": "trocr-baseline",
        # Timezone-aware UTC (utcnow() is deprecated); same "...Z" format.
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "model": model_name,
        "backend": "pytorch",
        "quantization": "float32",
        "num_runs": num_runs,
        "model_size_mb": round(model_size, 1),
        "ram_mb": {
            "before_load": round(rss_before, 1),
            "after_load": round(rss_after_load, 1),
            "after_inference": round(rss_after_inference, 1),
            "model_delta": round(rss_after_load - rss_before, 1),
        },
        "load_time_seconds": round(load_time, 2),
        "inference_ms": {
            "min": round(min(times_ms), 1),
            "max": round(max(times_ms), 1),
            "mean": round(sum(times_ms) / len(times_ms), 1),
            "p50": round(times_sorted[p50_idx], 1),
            "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1),
        },
        "times_ms": [round(t, 1) for t in times_ms],
        "platform": {
            "python": sys.version.split()[0],
            "os": sys.platform,
        },
    }

    return report
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, run the benchmark, dump JSON."""
    arg_parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark")
    arg_parser.add_argument(
        "--model",
        default="microsoft/trocr-base-printed",
        help="HuggingFace model name",
    )
    arg_parser.add_argument(
        "--runs",
        type=int,
        default=10,
        help="Number of inference runs",
    )
    opts = arg_parser.parse_args()

    # The report goes to stdout (progress logging is on stderr), so the
    # output remains redirectable to a JSON file.
    result = benchmark_trocr(model_name=opts.model, num_runs=opts.runs)
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
|
||||
61
scripts/run-regression.sh
Executable file
61
scripts/run-regression.sh
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env bash
# Run OCR pipeline regression tests and exit non-zero on failure.
#
# Usage:
#   ./scripts/run-regression.sh                     # default: macmini:8086
#   ./scripts/run-regression.sh http://localhost:8086
#
# Exit codes:
#   0 = all pass
#   1 = failures or errors
#   2 = connection error

set -euo pipefail

BASE_URL="${1:-http://macmini:8086}"
ENDPOINT="${BASE_URL}/api/v1/ocr-pipeline/regression/run?triggered_by=script"

echo "=== OCR Pipeline Regression Suite ==="
echo "Endpoint: ${ENDPOINT}"
echo ""

# Fix: do NOT merge curl's stderr into RESPONSE (the original used 2>&1) —
# any curl warning would corrupt the JSON parsed below. Stdout stays pure.
RESPONSE=$(curl -sf -X POST "${ENDPOINT}" -H "Content-Type: application/json") || {
    echo "ERROR: Could not reach ${ENDPOINT}"
    exit 2
}

# Parse all summary fields in a single python3 pass (was six invocations).
PARSED=$(echo "${RESPONSE}" | python3 -c "
import sys, json
d = json.load(sys.stdin)
s = d['summary']
print(d['status'], s['total'], s['passed'], s['failed'], s['errors'], d.get('duration_ms', '?'))
") || {
    echo "ERROR: Unexpected response from ${ENDPOINT}"
    exit 1
}
read -r STATUS TOTAL PASSED FAILED ERRORS DURATION <<< "${PARSED}"

echo "Status: ${STATUS}"
echo "Total: ${TOTAL}"
echo "Passed: ${PASSED}"
echo "Failed: ${FAILED}"
echo "Errors: ${ERRORS}"
echo "Duration: ${DURATION}ms"
echo ""

if [ "${STATUS}" = "pass" ]; then
    echo "PASS — All regression tests passed."
    exit 0
else
    echo "FAIL — Regression failures detected!"
    # Print per-test failure details from the results array.
    echo "${RESPONSE}" | python3 -c "
import sys, json
data = json.load(sys.stdin)
for r in data.get('results', []):
    if r['status'] != 'pass':
        print(f\" {r['status'].upper()}: {r.get('name', r['session_id'])}\")
        if 'error' in r:
            print(f\" Error: {r['error']}\")
        ds = r.get('diff_summary', {})
        if ds:
            print(f\" Structural: {ds.get('structural_changes', 0)}, Text: {ds.get('text_changes', 0)}, Missing: {ds.get('cells_missing', 0)}, Added: {ds.get('cells_added', 0)}\")
"
    exit 1
fi
|
||||
Reference in New Issue
Block a user