Files
breakpilot-lehrer/klausur-service/backend/services/trocr_ocr.py
Benjamin Admin 34da9f4cda [split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00

310 lines
10 KiB
Python

"""
TrOCR OCR Execution
Core OCR inference routines (PyTorch, ONNX routing, enhanced mode).
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import time
from typing import Any, Callable, Dict, List, Optional, Tuple

from .trocr_models import (
    OCRResult,
    _trocr_backend,
    _compute_image_hash,
    _cache_get,
    _cache_set,
    get_trocr_model,
    _split_into_lines,
)
logger = logging.getLogger(__name__)
def _try_onnx_ocr(
image_data: bytes,
handwritten: bool = False,
split_lines: bool = True,
) -> Optional[Tuple[Optional[str], float]]:
"""
Attempt ONNX inference. Returns the (text, confidence) tuple on
success, or None if ONNX is not available / fails to load.
"""
try:
from .trocr_onnx_service import is_onnx_available, run_trocr_onnx
if not is_onnx_available(handwritten=handwritten):
return None
# run_trocr_onnx is async -- return the coroutine's awaitable result
# The caller (run_trocr_ocr) will await it.
return run_trocr_onnx # sentinel: caller checks callable
except ImportError:
return None
async def _run_pytorch_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR inference through the original PyTorch path.

    Decodes the image, optionally segments it into text lines (TrOCR is a
    single-line recognizer), then runs greedy generation per line on whatever
    device the model already lives on.

    Args:
        image_data: Raw image bytes in any format PIL can decode.
        handwritten: Select the handwritten model variant.
        split_lines: Segment the page into lines before recognition.
        size: Model size ("base" or "large"); forwarded to the model loader.

    Returns:
        Tuple of (extracted_text, confidence); (None, 0.0) when the model is
        unavailable or inference fails.
    """
    processor, model = get_trocr_model(handwritten=handwritten, size=size)
    if processor is None or model is None:
        logger.error("TrOCR PyTorch model not available")
        return None, 0.0
    try:
        import torch
        from PIL import Image

        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        if split_lines:
            # Fall back to whole-page recognition if line detection finds nothing.
            lines = _split_into_lines(image) or [image]
        else:
            lines = [image]

        # The device is invariant across lines -- look it up once, not per line.
        device = next(model.parameters()).device

        all_text: List[str] = []
        confidences: List[float] = []
        for line_image in lines:
            pixel_values = processor(images=line_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            if generated_text.strip():
                all_text.append(generated_text.strip())
                # Heuristic confidence: longer decodes are assumed more reliable.
                confidences.append(0.85 if len(generated_text) > 3 else 0.5)

        text = "\n".join(all_text)
        confidence = sum(confidences) / len(confidences) if confidences else 0.0
        logger.info("TrOCR (PyTorch) extracted %d characters from %d lines", len(text), len(lines))
        return text, confidence
    except Exception as e:
        logger.error("TrOCR PyTorch failed: %s", e)
        import traceback
        logger.error(traceback.format_exc())
        return None, 0.0
async def run_trocr_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR on an image, routing between backends.

    The TROCR_BACKEND environment variable (read at import time into
    ``_trocr_backend``, default "auto") selects the path:
      - "onnx": ONNX only; raises RuntimeError when unavailable.
      - "pytorch": original PyTorch path only.
      - "auto" (or anything else): ONNX first, PyTorch fallback.

    TrOCR is a single-line recognizer, so full pages are either split into
    detected lines first or processed whole (partial results).

    Args:
        image_data: Raw image bytes.
        handwritten: Use the handwritten model (slower, better for handwriting).
        split_lines: Whether to split the image into lines first.
        size: "base" or "large" (only for the handwritten variant).

    Returns:
        Tuple of (extracted_text, confidence).

    Raises:
        RuntimeError: If TROCR_BACKEND=onnx and the ONNX stack is unavailable.
    """
    backend = _trocr_backend

    # PyTorch-only mode never probes ONNX at all.
    if backend == "pytorch":
        return await _run_pytorch_ocr(
            image_data, handwritten=handwritten, split_lines=split_lines, size=size,
        )

    onnx_fn = _try_onnx_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
    onnx_usable = onnx_fn is not None and callable(onnx_fn)

    if backend == "onnx":
        if not onnx_usable:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)

    # Auto mode: prefer ONNX, degrade to PyTorch on any problem.
    if onnx_usable:
        try:
            result = await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)
            if result[0] is not None:
                return result
            logger.warning("ONNX returned None text, falling back to PyTorch")
        except Exception as e:
            logger.warning(f"ONNX inference failed ({e}), falling back to PyTorch")

    return await _run_pytorch_ocr(
        image_data, handwritten=handwritten, split_lines=split_lines, size=size,
    )
def _try_onnx_enhanced(
handwritten: bool = True,
):
"""
Return the ONNX enhanced coroutine function, or None if unavailable.
"""
try:
from .trocr_onnx_service import is_onnx_available, run_trocr_onnx_enhanced
if not is_onnx_available(handwritten=handwritten):
return None
return run_trocr_onnx_enhanced
except ImportError:
return None
def _stable_hash(text: str) -> int:
    """Deterministic string hash; builtin hash() is salted per process and
    previously made the simulated confidences non-reproducible across runs."""
    import zlib
    return zlib.crc32(text.encode("utf-8"))


def _simulated_word_boxes(text: str, confidence: float) -> List[Dict[str, Any]]:
    """Build per-word pseudo-confidences varied deterministically around the
    overall confidence. Bounding boxes are placeholders (no layout detection)."""
    boxes: List[Dict[str, Any]] = []
    for word in text.split():
        word_conf = min(1.0, max(0.0, confidence + (_stable_hash(word) % 20 - 10) / 100))
        boxes.append({
            "text": word,
            "confidence": word_conf,
            "bbox": [0, 0, 0, 0],  # Would need actual bounding box detection
        })
    return boxes


def _simulated_char_confidences(text: str, confidence: float) -> List[float]:
    """Build per-character pseudo-confidences varied deterministically around
    the overall confidence."""
    return [
        min(1.0, max(0.0, confidence + (_stable_hash(char) % 15 - 7) / 100))
        for char in text
    ]


async def run_trocr_ocr_enhanced(
    image_data: bytes,
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True
) -> OCRResult:
    """
    Enhanced TrOCR OCR with caching and detailed results.

    Routes between ONNX and PyTorch backends based on the TROCR_BACKEND
    environment variable (default: "auto"):
      - "onnx": ONNX only; raises RuntimeError when unavailable.
      - "auto": ONNX first, PyTorch fallback on failure or empty text.
      - anything else (e.g. "pytorch"): PyTorch path below.

    Word boxes and character confidences are *simulated* around the overall
    confidence using a stable hash, so repeated runs on the same input yield
    identical (and cacheable) values.

    Args:
        image_data: Raw image bytes
        handwritten: Use handwritten model
        split_lines: Whether to split image into lines first
        use_cache: Whether to use caching

    Returns:
        OCRResult with detailed information

    Raises:
        RuntimeError: If TROCR_BACKEND=onnx and the ONNX stack is unavailable.
    """
    backend = _trocr_backend

    # --- ONNX-only mode ---
    if backend == "onnx":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is None:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(
            image_data, handwritten=handwritten,
            split_lines=split_lines, use_cache=use_cache,
        )

    # --- Auto mode: try ONNX first ---
    if backend == "auto":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is not None:
            try:
                result = await onnx_fn(
                    image_data, handwritten=handwritten,
                    split_lines=split_lines, use_cache=use_cache,
                )
                if result.text:
                    return result
                logger.warning("ONNX enhanced returned empty text, falling back to PyTorch")
            except Exception as e:
                logger.warning(f"ONNX enhanced failed ({e}), falling back to PyTorch")

    # --- PyTorch path (backend == "pytorch" or auto fallback) ---
    start_time = time.time()

    # Serve from cache when possible (keyed by image content hash).
    image_hash = _compute_image_hash(image_data)
    if use_cache:
        cached = _cache_get(image_hash)
        if cached:
            return OCRResult(
                text=cached["text"],
                confidence=cached["confidence"],
                processing_time_ms=0,
                model=cached["model"],
                has_lora_adapter=cached.get("has_lora_adapter", False),
                char_confidences=cached.get("char_confidences", []),
                word_boxes=cached.get("word_boxes", []),
                from_cache=True,
                image_hash=image_hash
            )

    # Run OCR via PyTorch
    text, confidence = await _run_pytorch_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
    processing_time_ms = int((time.time() - start_time) * 1000)

    word_boxes = _simulated_word_boxes(text, confidence) if text else []
    char_confidences = _simulated_char_confidences(text, confidence) if text else []

    result = OCRResult(
        text=text or "",
        confidence=confidence,
        processing_time_ms=processing_time_ms,
        model="trocr-base-handwritten" if handwritten else "trocr-base-printed",
        has_lora_adapter=False,  # NOTE(review): actual adapter status not checked here — TODO confirm
        char_confidences=char_confidences,
        word_boxes=word_boxes,
        from_cache=False,
        image_hash=image_hash
    )

    # Cache only successful (non-empty) results.
    if use_cache and text:
        _cache_set(image_hash, {
            "text": result.text,
            "confidence": result.confidence,
            "model": result.model,
            "has_lora_adapter": result.has_lora_adapter,
            "char_confidences": result.char_confidences,
            "word_boxes": result.word_boxes
        })
    return result