Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Track A (Backend): - Compound word IPA decomposition (schoolbag→school+bag) - Trailing garbled IPA fragment removal after brackets (R21 fix) - Regression runner with DB persistence, history endpoints - Page crop determinism verified with tests Track B (Frontend): - OCR Regression dashboard (/ai/ocr-regression) - Ground Truth Review workflow (/ai/ocr-ground-truth) with split-view, confidence highlighting, inline edit, batch mark, progress tracking Track C (Docs): - OCR-Pipeline.md v5.0 (Steps 5e-5h) - Regression testing guide - mkdocs.yml nav update Track D (Infra): - TrOCR baseline benchmark script - run-regression.sh shell script - Migration 008: regression_runs table Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
164 lines
5.5 KiB
Python
Executable File
164 lines
5.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
TrOCR Baseline Benchmark — measures PyTorch TrOCR performance.
|
|
|
|
Metrics:
|
|
- RAM usage (RSS) before and after model load
|
|
- Inference time per line (min, max, mean, p50, p95)
|
|
- Model size on disk
|
|
|
|
Output: JSON report to stdout (redirect to file for Sprint 2 comparison).
|
|
|
|
Usage:
|
|
python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
|
|
python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
|
|
# Add backend to path for imports
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))
|
|
|
|
|
|
def get_rss_mb():
    """Return the process's peak RSS (maximum resident set size) in MB.

    Note: ``ru_maxrss`` is the *peak* RSS since process start, not the
    instantaneous value. That is sufficient for this benchmark because
    memory usage only grows as the model is loaded and run, so the peak
    tracks the "current" footprint at each measurement point.
    """
    import resource
    usage = resource.getrusage(resource.RUSAGE_SELF)
    rss = usage.ru_maxrss
    # ru_maxrss units are platform-dependent: bytes on macOS, KB on Linux.
    if sys.platform == 'darwin':
        return rss / (1024 * 1024)  # bytes to MB on macOS
    return rss / 1024  # KB to MB on Linux
|
|
|
|
|
|
def get_model_size_mb(model_name):
    """Estimate a model's on-disk size (MB) from the HuggingFace cache.

    Walks ``~/.cache/huggingface/hub`` and sums the sizes of all files
    under directories whose path contains the cache-style model name
    (``org/name`` is stored as ``org--name``). Returns 0.0 when the
    cache or the model is absent.
    """
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    model_dir_pattern = model_name.replace('/', '--')
    total = 0
    for root, _dirs, files in os.walk(cache_dir):
        if model_dir_pattern in root:
            for f in files:
                try:
                    total += os.path.getsize(os.path.join(root, f))
                except OSError:
                    # HF cache dirs contain symlinked blobs; broken links
                    # or partial downloads must not abort the benchmark.
                    continue
    return total / (1024 * 1024)  # bytes to MB
|
|
|
|
|
|
def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
    """Run the TrOCR benchmark and return a JSON-serializable results dict.

    Loads the given HuggingFace TrOCR model, runs ``num_runs`` single-image
    inference passes over synthetic text-line images (after one uncounted
    warm-up pass), and reports RAM usage, load time, per-run latency
    statistics, and model size on disk.

    Args:
        model_name: HuggingFace model identifier to load.
        num_runs: Number of timed inference passes.

    Returns:
        dict: Benchmark report, or ``{"error": ...}`` when the model
        cannot be loaded (missing dependency, no network, bad name, ...).
    """
    from datetime import timezone
    from PIL import Image

    rss_before = get_rss_mb()

    # Load model (timed separately from inference).
    print(f"Loading model: {model_name}", file=sys.stderr)
    load_start = time.monotonic()

    try:
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        model.eval()  # inference mode (disables dropout etc.)
    except Exception as e:
        # Best-effort: report the failure instead of crashing the script.
        return {"error": f"Failed to load model: {e}"}

    load_time = time.monotonic() - load_start
    rss_after_load = get_rss_mb()
    model_size = get_model_size_mb(model_name)

    print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr)

    # Create synthetic test images (text line images).
    # In production these would be real cropped text lines; here a dark
    # rectangle whose bounds vary per run stands in for text so each
    # image differs slightly.
    test_images = []
    for i in range(num_runs):
        w, h = 384, 48  # typical TrOCR input size
        img = Image.new('RGB', (w, h), 'white')
        pixels = img.load()
        # Simple dark region to simulate text; start/end shift with i.
        for x in range(50 + i * 10, 200 + i * 5):
            for y in range(10, 38):
                pixels[x, y] = (30, 30, 30)
        test_images.append(img)

    # Warm-up run (not counted) so lazy initialization (kernel compilation,
    # first-use allocations) doesn't inflate the first timed run.
    print("Warm-up...", file=sys.stderr)
    import torch
    with torch.no_grad():
        pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values
        _ = model.generate(pixel_values, max_new_tokens=50)

    # Benchmark runs: each timing covers preprocessing + generation + decode.
    print(f"Running {num_runs} inference passes...", file=sys.stderr)
    times_ms = []
    for i, img in enumerate(test_images):
        start = time.monotonic()
        with torch.no_grad():
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values, max_new_tokens=50)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        elapsed_ms = (time.monotonic() - start) * 1000
        times_ms.append(elapsed_ms)
        print(f" Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr)

    rss_after_inference = get_rss_mb()

    # Compute latency stats; p95 index is clamped below for small samples.
    times_sorted = sorted(times_ms)
    p50_idx = len(times_sorted) // 2
    p95_idx = int(len(times_sorted) * 0.95)

    report = {
        "benchmark": "trocr-baseline",
        # datetime.utcnow() is deprecated; use an aware UTC timestamp and
        # render it with the conventional trailing "Z".
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "model": model_name,
        "backend": "pytorch",
        "quantization": "float32",
        "num_runs": num_runs,
        "model_size_mb": round(model_size, 1),
        "ram_mb": {
            "before_load": round(rss_before, 1),
            "after_load": round(rss_after_load, 1),
            "after_inference": round(rss_after_inference, 1),
            "model_delta": round(rss_after_load - rss_before, 1),
        },
        "load_time_seconds": round(load_time, 2),
        "inference_ms": {
            "min": round(min(times_ms), 1),
            "max": round(max(times_ms), 1),
            "mean": round(sum(times_ms) / len(times_ms), 1),
            "p50": round(times_sorted[p50_idx], 1),
            "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1),
        },
        "times_ms": [round(t, 1) for t in times_ms],
        "platform": {
            "python": sys.version.split()[0],
            "os": sys.platform,
        },
    }

    return report
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the benchmark, print JSON."""
    arg_parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark")
    arg_parser.add_argument(
        "--model",
        default="microsoft/trocr-base-printed",
        help="HuggingFace model name",
    )
    arg_parser.add_argument(
        "--runs",
        type=int,
        default=10,
        help="Number of inference runs",
    )
    opts = arg_parser.parse_args()

    result = benchmark_trocr(model_name=opts.model, num_runs=opts.runs)
    print(json.dumps(result, indent=2))
|
|
|
|
|
|
# Script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|