#!/usr/bin/env python3
"""
TrOCR Baseline Benchmark — measures PyTorch TrOCR performance.

Metrics:
- RAM usage (RSS) before and after model load
- Inference time per line (min, max, mean, p50, p95)
- Model size on disk

Output: JSON report to stdout (redirect to file for Sprint 2 comparison).

Usage:
    python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
    python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime

# Add backend to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))


def get_rss_mb():
    """Return the process peak RSS (ru_maxrss) in MB."""
    import resource

    # resource.getrusage reports ru_maxrss in KB on Linux, bytes on macOS
    usage = resource.getrusage(resource.RUSAGE_SELF)
    rss = usage.ru_maxrss
    if sys.platform == 'darwin':
        return rss / (1024 * 1024)  # bytes to MB on macOS
    return rss / 1024  # KB to MB on Linux


def get_model_size_mb(model_name):
    """Estimate model size from the HuggingFace cache."""
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    total = 0
    model_dir_pattern = model_name.replace('/', '--')
    for root, dirs, files in os.walk(cache_dir):
        if model_dir_pattern in root:
            for f in files:
                total += os.path.getsize(os.path.join(root, f))
    return total / (1024 * 1024)  # bytes to MB


def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
    """Run the TrOCR benchmark and return a results dict."""
    from PIL import Image

    rss_before = get_rss_mb()

    # Load model
    print(f"Loading model: {model_name}", file=sys.stderr)
    load_start = time.monotonic()
    try:
        import torch
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        model.eval()
    except Exception as e:
        return {"error": f"Failed to load model: {e}"}

    load_time = time.monotonic() - load_start
    rss_after_load = get_rss_mb()
    model_size = get_model_size_mb(model_name)
    print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr)

    # Create synthetic test images (text line images).
    # In production these would be real cropped text lines.
    test_images = []
    w, h = 384, 48  # typical TrOCR input size
    for i in range(num_runs):
        # Simple white image with a dark region to simulate a line of text
        img = Image.new('RGB', (w, h), 'white')
        pixels = img.load()
        # Vary the dark region per run; clamp to the image width so that
        # large --runs values do not index past the right edge
        x_start = min(50 + i * 10, w - 1)
        x_end = min(200 + i * 5, w)
        for x in range(x_start, x_end):
            for y in range(10, 38):
                pixels[x, y] = (30, 30, 30)
        test_images.append(img)

    # Warm-up run (not counted)
    print("Warm-up...", file=sys.stderr)
    with torch.no_grad():
        pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values
        _ = model.generate(pixel_values, max_new_tokens=50)

    # Benchmark runs
    print(f"Running {num_runs} inference passes...", file=sys.stderr)
    times_ms = []
    for i, img in enumerate(test_images):
        start = time.monotonic()
        with torch.no_grad():
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values, max_new_tokens=50)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        elapsed_ms = (time.monotonic() - start) * 1000
        times_ms.append(elapsed_ms)
        print(f"  Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr)

    rss_after_inference = get_rss_mb()

    # Compute stats (simple nearest-rank percentiles, clamped to the last index)
    times_sorted = sorted(times_ms)
    p50_idx = len(times_sorted) // 2
    p95_idx = int(len(times_sorted) * 0.95)

    report = {
        "benchmark": "trocr-baseline",
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "model": model_name,
        "backend": "pytorch",
        "quantization": "float32",
        "num_runs": num_runs,
        "model_size_mb": round(model_size, 1),
        "ram_mb": {
            "before_load": round(rss_before, 1),
            "after_load": round(rss_after_load, 1),
            "after_inference": round(rss_after_inference, 1),
            "model_delta": round(rss_after_load - rss_before, 1),
        },
        "load_time_seconds": round(load_time, 2),
        "inference_ms": {
            "min": round(min(times_ms), 1),
            "max": round(max(times_ms), 1),
            "mean": round(sum(times_ms) / len(times_ms), 1),
            "p50": round(times_sorted[p50_idx], 1),
            "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1),
        },
        "times_ms": [round(t, 1) for t in times_ms],
        "platform": {
            "python": sys.version.split()[0],
            "os": sys.platform,
        },
    }
    return report


def main():
    parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark")
    parser.add_argument("--model", default="microsoft/trocr-base-printed",
                        help="HuggingFace model name")
    parser.add_argument("--runs", type=int, default=10, help="Number of inference runs")
    args = parser.parse_args()

    report = benchmark_trocr(model_name=args.model, num_runs=args.runs)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()