Files
breakpilot-lehrer/klausur-service/backend/services/trocr_ocr.py
Benjamin Admin 34da9f4cda [split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00

310 lines
10 KiB
Python

"""
TrOCR OCR Execution
Core OCR inference routines (PyTorch, ONNX routing, enhanced mode).
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import time
from typing import Any, Callable, Dict, List, Optional, Tuple

from .trocr_models import (
    OCRResult,
    _trocr_backend,
    _compute_image_hash,
    _cache_get,
    _cache_set,
    get_trocr_model,
    _split_into_lines,
)
logger = logging.getLogger(__name__)
def _try_onnx_ocr(
image_data: bytes,
handwritten: bool = False,
split_lines: bool = True,
) -> Optional[Tuple[Optional[str], float]]:
"""
Attempt ONNX inference. Returns the (text, confidence) tuple on
success, or None if ONNX is not available / fails to load.
"""
try:
from .trocr_onnx_service import is_onnx_available, run_trocr_onnx
if not is_onnx_available(handwritten=handwritten):
return None
# run_trocr_onnx is async -- return the coroutine's awaitable result
# The caller (run_trocr_ocr) will await it.
return run_trocr_onnx # sentinel: caller checks callable
except ImportError:
return None
async def _run_pytorch_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR inference through the original PyTorch path.

    Decodes the image, optionally segments it into text lines (TrOCR is a
    single-line recognizer), then runs greedy generation per line on whatever
    device the model already lives on.

    Args:
        image_data: Raw image bytes in any format PIL can decode.
        handwritten: Select the handwritten model variant.
        split_lines: Segment the page into lines before recognition.
        size: Model size ("base" or "large"); forwarded to the model loader.

    Returns:
        Tuple of (extracted_text, confidence); (None, 0.0) when the model is
        unavailable or inference fails.
    """
    processor, model = get_trocr_model(handwritten=handwritten, size=size)
    if processor is None or model is None:
        logger.error("TrOCR PyTorch model not available")
        return None, 0.0
    try:
        import torch
        from PIL import Image

        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        if split_lines:
            # Fall back to whole-page recognition if line detection finds nothing.
            lines = _split_into_lines(image) or [image]
        else:
            lines = [image]

        # The device is invariant across lines -- look it up once, not per line.
        device = next(model.parameters()).device

        all_text: List[str] = []
        confidences: List[float] = []
        for line_image in lines:
            pixel_values = processor(images=line_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            if generated_text.strip():
                all_text.append(generated_text.strip())
                # Heuristic confidence: longer decodes are assumed more reliable.
                confidences.append(0.85 if len(generated_text) > 3 else 0.5)

        text = "\n".join(all_text)
        confidence = sum(confidences) / len(confidences) if confidences else 0.0
        logger.info("TrOCR (PyTorch) extracted %d characters from %d lines", len(text), len(lines))
        return text, confidence
    except Exception as e:
        logger.error("TrOCR PyTorch failed: %s", e)
        import traceback
        logger.error(traceback.format_exc())
        return None, 0.0
async def run_trocr_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR on an image, routing between backends.

    The TROCR_BACKEND environment variable (read at import time into
    ``_trocr_backend``, default "auto") selects the path:
      - "onnx": ONNX only; raises RuntimeError when unavailable.
      - "pytorch": original PyTorch path only.
      - "auto" (or anything else): ONNX first, PyTorch fallback.

    TrOCR is a single-line recognizer, so full pages are either split into
    detected lines first or processed whole (partial results).

    Args:
        image_data: Raw image bytes.
        handwritten: Use the handwritten model (slower, better for handwriting).
        split_lines: Whether to split the image into lines first.
        size: "base" or "large" (only for the handwritten variant).

    Returns:
        Tuple of (extracted_text, confidence).

    Raises:
        RuntimeError: If TROCR_BACKEND=onnx and the ONNX stack is unavailable.
    """
    backend = _trocr_backend

    # PyTorch-only mode never probes ONNX at all.
    if backend == "pytorch":
        return await _run_pytorch_ocr(
            image_data, handwritten=handwritten, split_lines=split_lines, size=size,
        )

    onnx_fn = _try_onnx_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
    onnx_usable = onnx_fn is not None and callable(onnx_fn)

    if backend == "onnx":
        if not onnx_usable:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)

    # Auto mode: prefer ONNX, degrade to PyTorch on any problem.
    if onnx_usable:
        try:
            result = await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)
            if result[0] is not None:
                return result
            logger.warning("ONNX returned None text, falling back to PyTorch")
        except Exception as e:
            logger.warning(f"ONNX inference failed ({e}), falling back to PyTorch")

    return await _run_pytorch_ocr(
        image_data, handwritten=handwritten, split_lines=split_lines, size=size,
    )
def _try_onnx_enhanced(
handwritten: bool = True,
):
"""
Return the ONNX enhanced coroutine function, or None if unavailable.
"""
try:
from .trocr_onnx_service import is_onnx_available, run_trocr_onnx_enhanced
if not is_onnx_available(handwritten=handwritten):
return None
return run_trocr_onnx_enhanced
except ImportError:
return None
def _stable_hash(text: str) -> int:
    """Deterministic string hash; builtin hash() is salted per process and
    previously made the simulated confidences non-reproducible across runs."""
    import zlib
    return zlib.crc32(text.encode("utf-8"))


def _simulated_word_boxes(text: str, confidence: float) -> List[Dict[str, Any]]:
    """Build per-word pseudo-confidences varied deterministically around the
    overall confidence. Bounding boxes are placeholders (no layout detection)."""
    boxes: List[Dict[str, Any]] = []
    for word in text.split():
        word_conf = min(1.0, max(0.0, confidence + (_stable_hash(word) % 20 - 10) / 100))
        boxes.append({
            "text": word,
            "confidence": word_conf,
            "bbox": [0, 0, 0, 0],  # Would need actual bounding box detection
        })
    return boxes


def _simulated_char_confidences(text: str, confidence: float) -> List[float]:
    """Build per-character pseudo-confidences varied deterministically around
    the overall confidence."""
    return [
        min(1.0, max(0.0, confidence + (_stable_hash(char) % 15 - 7) / 100))
        for char in text
    ]


async def run_trocr_ocr_enhanced(
    image_data: bytes,
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True
) -> OCRResult:
    """
    Enhanced TrOCR OCR with caching and detailed results.

    Routes between ONNX and PyTorch backends based on the TROCR_BACKEND
    environment variable (default: "auto"):
      - "onnx": ONNX only; raises RuntimeError when unavailable.
      - "auto": ONNX first, PyTorch fallback on failure or empty text.
      - anything else (e.g. "pytorch"): PyTorch path below.

    Word boxes and character confidences are *simulated* around the overall
    confidence using a stable hash, so repeated runs on the same input yield
    identical (and cacheable) values.

    Args:
        image_data: Raw image bytes
        handwritten: Use handwritten model
        split_lines: Whether to split image into lines first
        use_cache: Whether to use caching

    Returns:
        OCRResult with detailed information

    Raises:
        RuntimeError: If TROCR_BACKEND=onnx and the ONNX stack is unavailable.
    """
    backend = _trocr_backend

    # --- ONNX-only mode ---
    if backend == "onnx":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is None:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(
            image_data, handwritten=handwritten,
            split_lines=split_lines, use_cache=use_cache,
        )

    # --- Auto mode: try ONNX first ---
    if backend == "auto":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is not None:
            try:
                result = await onnx_fn(
                    image_data, handwritten=handwritten,
                    split_lines=split_lines, use_cache=use_cache,
                )
                if result.text:
                    return result
                logger.warning("ONNX enhanced returned empty text, falling back to PyTorch")
            except Exception as e:
                logger.warning(f"ONNX enhanced failed ({e}), falling back to PyTorch")

    # --- PyTorch path (backend == "pytorch" or auto fallback) ---
    start_time = time.time()

    # Serve from cache when possible (keyed by image content hash).
    image_hash = _compute_image_hash(image_data)
    if use_cache:
        cached = _cache_get(image_hash)
        if cached:
            return OCRResult(
                text=cached["text"],
                confidence=cached["confidence"],
                processing_time_ms=0,
                model=cached["model"],
                has_lora_adapter=cached.get("has_lora_adapter", False),
                char_confidences=cached.get("char_confidences", []),
                word_boxes=cached.get("word_boxes", []),
                from_cache=True,
                image_hash=image_hash
            )

    # Run OCR via PyTorch
    text, confidence = await _run_pytorch_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
    processing_time_ms = int((time.time() - start_time) * 1000)

    word_boxes = _simulated_word_boxes(text, confidence) if text else []
    char_confidences = _simulated_char_confidences(text, confidence) if text else []

    result = OCRResult(
        text=text or "",
        confidence=confidence,
        processing_time_ms=processing_time_ms,
        model="trocr-base-handwritten" if handwritten else "trocr-base-printed",
        has_lora_adapter=False,  # NOTE(review): actual adapter status not checked here — TODO confirm
        char_confidences=char_confidences,
        word_boxes=word_boxes,
        from_cache=False,
        image_hash=image_hash
    )

    # Cache only successful (non-empty) results.
    if use_cache and text:
        _cache_set(image_hash, {
            "text": result.text,
            "confidence": result.confidence,
            "model": result.model,
            "has_lora_adapter": result.has_lora_adapter,
            "char_confidences": result.char_confidences,
            "word_boxes": result.word_boxes
        })
    return result