# breakpilot-lehrer/klausur-service/backend/services/trocr_service.py

"""
TrOCR Service — Barrel Re-export

Microsoft's Transformer-based OCR for text recognition.
Split into submodules:
- trocr_models.py  — Dataclasses, cache, model loading, line splitting
- trocr_ocr.py     — Core OCR inference (PyTorch/ONNX routing, enhanced)
- trocr_batch.py   — Batch processing and SSE streaming

PRIVACY (Datenschutz): All processing is performed locally.
"""

# Models, cache, and model loading
from .trocr_models import (
    OCRResult,
    BatchOCRResult,
    _compute_image_hash,
    _cache_get,
    _cache_set,
    get_cache_stats,
    _check_trocr_available,
    get_trocr_model,
    preload_trocr_model,
    get_model_status,
    get_active_backend,
    _split_into_lines,
)

# Core OCR execution
from .trocr_ocr import (
    run_trocr_ocr,
    run_trocr_ocr_enhanced,
    _run_pytorch_ocr,
)

# Batch processing & streaming
from .trocr_batch import (
    run_trocr_batch,
    run_trocr_batch_stream,
    test_trocr_ocr,
)

# Names exported by ``from <this module> import *``. The list is assembled
# group by group, mirroring the import sections above. Underscore-prefixed
# helpers are listed on purpose: being named in ``__all__`` is what makes a
# star-import expose them despite the leading underscore.

# Result dataclasses
__all__ = ["OCRResult", "BatchOCRResult"]

# Image-hash cache helpers
__all__ += [
    "_compute_image_hash",
    "_cache_get",
    "_cache_set",
    "get_cache_stats",
]

# Model availability, loading and status
__all__ += [
    "_check_trocr_available",
    "get_trocr_model",
    "preload_trocr_model",
    "get_model_status",
    "get_active_backend",
    "_split_into_lines",
]

# Single-image OCR entry points
__all__ += [
    "run_trocr_ocr",
    "run_trocr_ocr_enhanced",
    "_run_pytorch_ocr",
]

# Batch processing and streaming
__all__ += [
    "run_trocr_batch",
    "run_trocr_batch_stream",
    "test_trocr_ocr",
]


if __name__ == "__main__":
    # Manual smoke test: run ``test_trocr_ocr`` on one image path taken from
    # the command line.
    #
    # This module uses package-relative imports, so it cannot be executed as a
    # plain script (``python trocr_service.py`` fails with "attempted relative
    # import with no known parent package" before reaching this point). It has
    # to be started with ``python -m <package>.trocr_service``.
    import asyncio
    import sys

    cli_args = sys.argv[1:]
    handwritten = "--handwritten" in cli_args
    # Everything that is not a ``--flag`` counts as positional; only the first
    # positional argument (the image path) is used, unknown flags are ignored.
    positional = [arg for arg in cli_args if not arg.startswith("--")]

    if not positional:
        # ``__spec__`` carries the real dotted module name when started via
        # ``-m``; fall back to the bare file stem if it is unavailable.
        module_name = __spec__.name if __spec__ is not None else "trocr_service"
        print(
            f"Usage: python -m {module_name} <image_path> [--handwritten]",
            file=sys.stderr,
        )
        # A missing argument is a usage error, so signal failure to the caller
        # instead of exiting with status 0.
        sys.exit(2)

    asyncio.run(test_trocr_ocr(positional[0], handwritten=handwritten))