"""
TrOCR OCR Execution

Core OCR inference routines (PyTorch, ONNX routing, enhanced mode).

PRIVACY: All processing happens locally.
"""

import io
import logging
import time
import zlib
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

from .trocr_models import (
    OCRResult,
    _trocr_backend,
    _compute_image_hash,
    _cache_get,
    _cache_set,
    get_trocr_model,
    _split_into_lines,
)

logger = logging.getLogger(__name__)

# Shared by both entry points when the ONNX backend is forced but cannot be used.
_ONNX_UNAVAILABLE_MSG = (
    "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
    "Ensure onnxruntime + optimum are installed and ONNX model files exist."
)


def _try_onnx_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
) -> Optional[Callable[..., Awaitable[Tuple[Optional[str], float]]]]:
    """
    Look up the ONNX inference coroutine function without running it.

    No inference happens here: the caller is expected to call and await the
    returned function itself.

    Args:
        image_data: Unused; kept so existing call sites stay valid.
        handwritten: Model variant whose ONNX availability is checked.
        split_lines: Unused; kept so existing call sites stay valid.

    Returns:
        ``run_trocr_onnx`` when the ONNX service module imports and reports
        the requested variant as available, otherwise ``None``.
    """
    try:
        from .trocr_onnx_service import is_onnx_available, run_trocr_onnx
    except ImportError:
        return None

    if not is_onnx_available(handwritten=handwritten):
        return None
    return run_trocr_onnx


async def _run_pytorch_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run OCR through the PyTorch TrOCR model.

    Args:
        image_data: Raw image bytes.
        handwritten: Select the handwritten model variant.
        split_lines: Split the image into line crops before recognition;
            when no lines are detected the whole image is used instead.
        size: Model size forwarded to ``get_trocr_model``.

    Returns:
        ``(text, confidence)``. ``text`` joins the non-empty recognised lines
        with newlines; ``confidence`` is the mean of a per-line heuristic
        (0.85 for output longer than 3 characters, else 0.5), or 0.0 when
        nothing was recognised. Returns ``(None, 0.0)`` when the model is
        unavailable or any exception occurs during inference.
    """
    processor, model = get_trocr_model(handwritten=handwritten, size=size)

    if processor is None or model is None:
        logger.error("TrOCR PyTorch model not available")
        return None, 0.0

    try:
        import torch
        from PIL import Image

        image = Image.open(io.BytesIO(image_data)).convert("RGB")

        lines = _split_into_lines(image) if split_lines else None
        if not lines:
            lines = [image]

        # The model's device does not change between lines; look it up once.
        device = next(model.parameters()).device

        all_text = []
        confidences = []

        for line_image in lines:
            pixel_values = processor(images=line_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)

            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)

            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            stripped = generated_text.strip()

            if stripped:
                all_text.append(stripped)
                # Heuristic only: no real model probabilities are read here.
                confidences.append(0.85 if len(generated_text) > 3 else 0.5)

        text = "\n".join(all_text)
        confidence = sum(confidences) / len(confidences) if confidences else 0.0

        logger.info(f"TrOCR (PyTorch) extracted {len(text)} characters from {len(lines)} lines")
        return text, confidence

    except Exception as e:
        # Deliberate best-effort boundary: callers get (None, 0.0), and the
        # traceback is attached to the log record.
        logger.error(f"TrOCR PyTorch failed: {e}", exc_info=True)
        return None, 0.0


async def run_trocr_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR on an image.

    Routes between the ONNX and PyTorch backends according to the configured
    backend value (``_trocr_backend``, TROCR_BACKEND):

    - "onnx"    -- always use ONNX (raises RuntimeError if unavailable)
    - "pytorch" -- always use PyTorch
    - any other value (e.g. "auto") -- try ONNX first, fall back to PyTorch

    TrOCR is optimized for single-line text recognition, so full-page images
    are either split into lines first or processed whole with partial results.

    Args:
        image_data: Raw image bytes.
        handwritten: Use the handwritten model (slower but better for handwriting).
        split_lines: Whether to split the image into lines first.
        size: "base" or "large"; only forwarded to the PyTorch path.

    Returns:
        Tuple of (extracted_text, confidence).

    Raises:
        RuntimeError: The backend is "onnx" but ONNX is unavailable.
    """
    backend = _trocr_backend

    # --- ONNX-only mode ---
    if backend == "onnx":
        onnx_fn = _try_onnx_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
        if not callable(onnx_fn):
            raise RuntimeError(_ONNX_UNAVAILABLE_MSG)
        return await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)

    # --- Auto mode: try ONNX first (skipped entirely for "pytorch") ---
    if backend != "pytorch":
        onnx_fn = _try_onnx_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
        if callable(onnx_fn):
            try:
                result = await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)
                if result[0] is not None:
                    return result
                logger.warning("ONNX returned None text, falling back to PyTorch")
            except Exception as e:
                logger.warning(f"ONNX inference failed ({e}), falling back to PyTorch")

    # --- PyTorch path (explicit "pytorch" or auto fallback) ---
    return await _run_pytorch_ocr(
        image_data, handwritten=handwritten, split_lines=split_lines, size=size,
    )


def _try_onnx_enhanced(
    handwritten: bool = True,
) -> Optional[Callable[..., Awaitable[OCRResult]]]:
    """
    Return the ONNX enhanced coroutine function, or None if unavailable.

    "Unavailable" means either the ONNX service module cannot be imported or
    it reports the requested model variant as not available.
    """
    try:
        from .trocr_onnx_service import is_onnx_available, run_trocr_onnx_enhanced
    except ImportError:
        return None

    if not is_onnx_available(handwritten=handwritten):
        return None
    return run_trocr_onnx_enhanced


def _simulated_confidence(base: float, token: str, modulus: int, offset: int) -> float:
    """Return ``base`` plus a small token-derived offset, clamped to [0.0, 1.0].

    Uses CRC32 rather than the built-in ``hash()``: string hashes are salted
    per interpreter process, which made the simulated values differ between
    runs for the same input.
    """
    jitter = (zlib.crc32(token.encode("utf-8", errors="replace")) % modulus - offset) / 100
    return min(1.0, max(0.0, base + jitter))


async def run_trocr_ocr_enhanced(
    image_data: bytes,
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True
) -> OCRResult:
    """
    Enhanced TrOCR OCR with caching and detailed results.

    Routes between the ONNX and PyTorch backends according to the configured
    backend value (``_trocr_backend``, TROCR_BACKEND):

    - "onnx" -- always use ONNX (raises RuntimeError if unavailable)
    - "auto" -- try ONNX first; fall back to PyTorch on error or empty text
    - any other value (e.g. "pytorch") -- PyTorch only

    Word boxes and per-character confidences on the PyTorch path are
    simulated placeholders derived from the overall confidence; bounding
    boxes are always ``[0, 0, 0, 0]``.

    Args:
        image_data: Raw image bytes.
        handwritten: Use the handwritten model.
        split_lines: Whether to split the image into lines first.
        use_cache: Whether to read from / write to the result cache.

    Returns:
        OCRResult with detailed information.

    Raises:
        RuntimeError: The backend is "onnx" but ONNX is unavailable.
    """
    backend = _trocr_backend

    # --- ONNX-only mode ---
    if backend == "onnx":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is None:
            raise RuntimeError(_ONNX_UNAVAILABLE_MSG)
        return await onnx_fn(
            image_data, handwritten=handwritten,
            split_lines=split_lines, use_cache=use_cache,
        )

    # --- Auto mode: try ONNX first ---
    if backend == "auto":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is not None:
            try:
                result = await onnx_fn(
                    image_data, handwritten=handwritten,
                    split_lines=split_lines, use_cache=use_cache,
                )
                if result.text:
                    return result
                logger.warning("ONNX enhanced returned empty text, falling back to PyTorch")
            except Exception as e:
                logger.warning(f"ONNX enhanced failed ({e}), falling back to PyTorch")

    # --- PyTorch path (backend == "pytorch" or auto fallback) ---
    start_time = time.time()

    # The hash is needed for the result even when caching is disabled.
    image_hash = _compute_image_hash(image_data)

    # NOTE(review): the cache key is the image hash only, so an entry stored
    # for one handwritten/split_lines combination appears to be served for
    # any other -- confirm against the cache helpers before relying on it.
    if use_cache:
        cached = _cache_get(image_hash)
        if cached:
            return OCRResult(
                text=cached["text"],
                confidence=cached["confidence"],
                processing_time_ms=0,
                model=cached["model"],
                has_lora_adapter=cached.get("has_lora_adapter", False),
                char_confidences=cached.get("char_confidences", []),
                word_boxes=cached.get("word_boxes", []),
                from_cache=True,
                image_hash=image_hash
            )

    # Run OCR via PyTorch
    text, confidence = await _run_pytorch_ocr(image_data, handwritten=handwritten, split_lines=split_lines)

    processing_time_ms = int((time.time() - start_time) * 1000)

    word_boxes = []
    char_confidences = []
    if text:
        # Simulated word confidence, varied slightly around the overall value.
        word_boxes = [
            {
                "text": word,
                "confidence": _simulated_confidence(confidence, word, 20, 10),
                "bbox": [0, 0, 0, 0]  # Would need actual bounding box detection
            }
            for word in text.split()
        ]

        # Simulated per-character confidence; computed once per distinct char.
        per_char = {
            char: _simulated_confidence(confidence, char, 15, 7)
            for char in set(text)
        }
        char_confidences = [per_char[char] for char in text]

    result = OCRResult(
        text=text or "",
        confidence=confidence,
        processing_time_ms=processing_time_ms,
        model="trocr-base-handwritten" if handwritten else "trocr-base-printed",
        has_lora_adapter=False,  # Would check actual adapter status
        char_confidences=char_confidences,
        word_boxes=word_boxes,
        from_cache=False,
        image_hash=image_hash
    )

    # Only non-empty results are cached, so failures are retried next time.
    if use_cache and text:
        _cache_set(image_hash, {
            "text": result.text,
            "confidence": result.confidence,
            "model": result.model,
            "has_lora_adapter": result.has_lora_adapter,
            "char_confidences": result.char_confidences,
            "word_boxes": result.word_boxes
        })

    return result