backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
310 lines
10 KiB
Python
310 lines
10 KiB
Python
"""
|
|
TrOCR OCR Execution
|
|
|
|
Core OCR inference routines (PyTorch, ONNX routing, enhanced mode).
|
|
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import io
import logging
import time
import zlib
from typing import Tuple, Optional, List, Dict, Any
|
|
|
|
from .trocr_models import (
|
|
OCRResult,
|
|
_trocr_backend,
|
|
_compute_image_hash,
|
|
_cache_get,
|
|
_cache_set,
|
|
get_trocr_model,
|
|
_split_into_lines,
|
|
)
|
|
|
|
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _try_onnx_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
) -> Optional[Tuple[Optional[str], float]]:
    """
    Probe the ONNX backend for plain OCR.

    NOTE: despite the return annotation, on success this returns the
    *coroutine function* ``run_trocr_onnx`` as a sentinel — the caller
    (``run_trocr_ocr``) checks ``callable(...)`` and awaits it with the
    real arguments. ``None`` means onnxruntime/optimum is not installed
    or no ONNX model exists for the requested variant.
    """
    try:
        from .trocr_onnx_service import is_onnx_available, run_trocr_onnx
        usable = is_onnx_available(handwritten=handwritten)
    except ImportError:
        return None
    return run_trocr_onnx if usable else None
|
|
|
|
|
|
async def _run_pytorch_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Original PyTorch inference path (extracted for routing).

    Args:
        image_data: Raw image bytes (any format PIL can open).
        handwritten: Use the handwritten model variant.
        split_lines: Split the page into text lines before inference
            (TrOCR is a single-line recognizer).
        size: "base" or "large" model size.

    Returns:
        Tuple of (extracted_text, confidence); (None, 0.0) when the
        model is unavailable or inference fails.
    """
    processor, model = get_trocr_model(handwritten=handwritten, size=size)

    if processor is None or model is None:
        logger.error("TrOCR PyTorch model not available")
        return None, 0.0

    try:
        import torch
        from PIL import Image

        # Load image (normalize to RGB regardless of source mode).
        image = Image.open(io.BytesIO(image_data)).convert("RGB")

        # Fall back to the whole page when line detection finds nothing
        # (or was disabled).
        lines = _split_into_lines(image) if split_lines else []
        if not lines:
            lines = [image]

        all_text: List[str] = []
        confidences: List[float] = []

        # Hoisted out of the loop: the model's device is invariant per call.
        device = next(model.parameters()).device

        for line_image in lines:
            pixel_values = processor(images=line_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)

            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)

            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            stripped = generated_text.strip()
            if stripped:
                all_text.append(stripped)
                # Heuristic confidence: generate() returns no scores here,
                # so longer decodes are treated as more reliable.
                confidences.append(0.85 if len(generated_text) > 3 else 0.5)

        text = "\n".join(all_text)
        confidence = sum(confidences) / len(confidences) if confidences else 0.0

        # Lazy %-style args avoid formatting when INFO is disabled.
        logger.info("TrOCR (PyTorch) extracted %d characters from %d lines", len(text), len(lines))
        return text, confidence

    except Exception:
        # logger.exception captures the traceback, replacing the manual
        # traceback.format_exc() round-trip of the original.
        logger.exception("TrOCR PyTorch failed")
        return None, 0.0
|
|
|
|
|
|
async def run_trocr_ocr(
    image_data: bytes,
    handwritten: bool = False,
    split_lines: bool = True,
    size: str = "base",
) -> Tuple[Optional[str], float]:
    """
    Run TrOCR on an image, routing to the configured backend.

    The TROCR_BACKEND environment variable (default "auto") selects:

    - "onnx"    -- ONNX only; raises RuntimeError when unavailable
    - "pytorch" -- PyTorch only (original behaviour)
    - "auto"    -- ONNX when available, PyTorch as fallback

    TrOCR is a single-line recognizer, so full pages are either split
    into detected lines first or processed whole (partial results).

    Args:
        image_data: Raw image bytes
        handwritten: Use handwritten model (slower but better for handwriting)
        split_lines: Whether to split image into lines first
        size: "base" or "large" (only for handwritten variant)

    Returns:
        Tuple of (extracted_text, confidence)

    Raises:
        RuntimeError: When TROCR_BACKEND=onnx but ONNX is unavailable.
    """
    backend = _trocr_backend

    # Resolve the ONNX entry point once; "pytorch" mode never probes it.
    onnx_fn = None
    if backend != "pytorch":
        candidate = _try_onnx_ocr(image_data, handwritten=handwritten, split_lines=split_lines)
        if callable(candidate):
            onnx_fn = candidate

    if backend == "onnx":
        # Hard requirement: no silent fallback in ONNX-only mode.
        if onnx_fn is None:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)

    # Auto mode (or any unrecognized backend value): best-effort ONNX first.
    if onnx_fn is not None:
        try:
            result = await onnx_fn(image_data, handwritten=handwritten, split_lines=split_lines)
        except Exception as e:
            logger.warning(f"ONNX inference failed ({e}), falling back to PyTorch")
        else:
            if result[0] is not None:
                return result
            logger.warning("ONNX returned None text, falling back to PyTorch")

    # PyTorch path: explicit "pytorch" mode, or the auto-mode fallback.
    return await _run_pytorch_ocr(
        image_data,
        handwritten=handwritten,
        split_lines=split_lines,
        size=size,
    )
|
|
|
|
|
|
def _try_onnx_enhanced(
    handwritten: bool = True,
):
    """
    Resolve the ONNX enhanced coroutine function.

    Returns ``run_trocr_onnx_enhanced`` when onnxruntime/optimum is
    importable and an ONNX model exists for the requested variant;
    ``None`` otherwise.
    """
    try:
        from .trocr_onnx_service import is_onnx_available, run_trocr_onnx_enhanced
        usable = is_onnx_available(handwritten=handwritten)
    except ImportError:
        return None
    return run_trocr_onnx_enhanced if usable else None
|
|
|
|
|
|
async def run_trocr_ocr_enhanced(
    image_data: bytes,
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True
) -> OCRResult:
    """
    Enhanced TrOCR OCR with caching and detailed results.

    Routes between ONNX and PyTorch backends based on the TROCR_BACKEND
    environment variable (default: "auto").

    Args:
        image_data: Raw image bytes
        handwritten: Use handwritten model
        split_lines: Whether to split image into lines first
        use_cache: Whether to use caching

    Returns:
        OCRResult with detailed information

    Raises:
        RuntimeError: When TROCR_BACKEND=onnx but ONNX is unavailable.
    """
    backend = _trocr_backend

    # --- ONNX-only mode: no fallback permitted ---
    if backend == "onnx":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is None:
            raise RuntimeError(
                "ONNX backend requested (TROCR_BACKEND=onnx) but unavailable. "
                "Ensure onnxruntime + optimum are installed and ONNX model files exist."
            )
        return await onnx_fn(
            image_data, handwritten=handwritten,
            split_lines=split_lines, use_cache=use_cache,
        )

    # --- Auto mode: try ONNX first, fall through to PyTorch on failure ---
    if backend == "auto":
        onnx_fn = _try_onnx_enhanced(handwritten=handwritten)
        if onnx_fn is not None:
            try:
                result = await onnx_fn(
                    image_data, handwritten=handwritten,
                    split_lines=split_lines, use_cache=use_cache,
                )
                if result.text:
                    return result
                logger.warning("ONNX enhanced returned empty text, falling back to PyTorch")
            except Exception as e:
                logger.warning(f"ONNX enhanced failed ({e}), falling back to PyTorch")

    # --- PyTorch path (backend == "pytorch" or auto fallback) ---
    start_time = time.time()

    # Content-addressed cache: identical bytes -> identical result.
    image_hash = _compute_image_hash(image_data)
    if use_cache:
        cached = _cache_get(image_hash)
        if cached:
            return _result_from_cache(cached, image_hash)

    # Run OCR via PyTorch
    text, confidence = await _run_pytorch_ocr(image_data, handwritten=handwritten, split_lines=split_lines)

    processing_time_ms = int((time.time() - start_time) * 1000)

    result = OCRResult(
        text=text or "",
        confidence=confidence,
        processing_time_ms=processing_time_ms,
        model="trocr-base-handwritten" if handwritten else "trocr-base-printed",
        has_lora_adapter=False,  # Would check actual adapter status
        char_confidences=_simulate_char_confidences(text, confidence),
        word_boxes=_simulate_word_boxes(text, confidence),
        from_cache=False,
        image_hash=image_hash
    )

    # Cache result (only when OCR produced text worth keeping).
    if use_cache and text:
        _cache_set(image_hash, {
            "text": result.text,
            "confidence": result.confidence,
            "model": result.model,
            "has_lora_adapter": result.has_lora_adapter,
            "char_confidences": result.char_confidences,
            "word_boxes": result.word_boxes
        })

    return result


def _result_from_cache(cached: Dict[str, Any], image_hash: str) -> OCRResult:
    """Rehydrate a cached OCR payload into an OCRResult (zero processing time)."""
    return OCRResult(
        text=cached["text"],
        confidence=cached["confidence"],
        processing_time_ms=0,
        model=cached["model"],
        has_lora_adapter=cached.get("has_lora_adapter", False),
        char_confidences=cached.get("char_confidences", []),
        word_boxes=cached.get("word_boxes", []),
        from_cache=True,
        image_hash=image_hash
    )


def _stable_jitter(token: str, span: int) -> int:
    """Deterministic pseudo-random offset in [-(span//2), span - span//2).

    Uses zlib.crc32 instead of hash(): hash() is salted per-process by
    PYTHONHASHSEED, so the original simulated confidences (which end up
    in the cache) were not reproducible across interpreter runs.
    """
    return zlib.crc32(token.encode("utf-8")) % span - span // 2


def _simulate_word_boxes(text: Optional[str], confidence: float) -> List[Dict[str, Any]]:
    """Build per-word entries with simulated confidences clamped to [0, 1].

    bbox is a placeholder; actual bounding-box detection is not implemented.
    """
    if not text:
        return []
    return [
        {
            "text": word,
            # Simulated word confidence, slightly varied around the overall score.
            "confidence": min(1.0, max(0.0, confidence + _stable_jitter(word, 20) / 100)),
            "bbox": [0, 0, 0, 0]  # Would need actual bounding box detection
        }
        for word in text.split()
    ]


def _simulate_char_confidences(text: Optional[str], confidence: float) -> List[float]:
    """Per-character simulated confidences clamped to [0, 1]."""
    if not text:
        return []
    return [
        min(1.0, max(0.0, confidence + _stable_jitter(char, 15) / 100))
        for char in text
    ]
|