[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
160
klausur-service/backend/services/trocr_batch.py
Normal file
160
klausur-service/backend/services/trocr_batch.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
TrOCR Batch Processing & Streaming
|
||||
|
||||
Batch OCR and SSE streaming for multiple images.
|
||||
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
from .trocr_models import OCRResult, BatchOCRResult
|
||||
from .trocr_ocr import run_trocr_ocr_enhanced
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def run_trocr_batch(
    images: List[bytes],
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True,
    progress_callback: Optional[callable] = None
) -> BatchOCRResult:
    """
    Process multiple images in batch.

    Images are processed one after another. Every input image produces
    exactly one entry in the returned results, in input order: either the
    OCR result or, if OCR raised, a placeholder ``OCRResult`` whose text is
    ``"Error: <message>"`` with confidence 0.0 and model ``"error"``.

    Args:
        images: List of image data bytes
        handwritten: Use handwritten model
        split_lines: Whether to split images into lines
        use_cache: Whether to use caching
        progress_callback: Optional callback(current, total) for progress
            updates. It is called synchronously after every image,
            including images that failed. Exceptions raised by the callback
            are logged and ignored so they cannot corrupt the batch result.

    Returns:
        BatchOCRResult with all results, the total wall time in
        milliseconds, and the processed / cached / error counts.
    """
    # Monotonic clock: elapsed time must not be affected by system clock
    # adjustments while the batch is running.
    start_time = time.monotonic()
    total = len(images)
    results = []
    cached_count = 0
    error_count = 0

    for idx, image_data in enumerate(images):
        # Only the OCR call itself is guarded, so a failure elsewhere
        # (e.g. in the progress callback) is never recorded as an OCR error.
        try:
            result = await run_trocr_ocr_enhanced(
                image_data,
                handwritten=handwritten,
                split_lines=split_lines,
                use_cache=use_cache
            )
        except Exception as e:
            logger.error(f"Batch OCR error for image {idx}: {e}")
            error_count += 1
            result = OCRResult(
                text=f"Error: {str(e)}",
                confidence=0.0,
                processing_time_ms=0,
                model="error",
                has_lora_adapter=False
            )
        else:
            if result.from_cache:
                cached_count += 1

        # Exactly one result per input image keeps results aligned with images.
        results.append(result)

        # Report progress for every image, successful or not.
        if progress_callback:
            try:
                progress_callback(idx + 1, total)
            except Exception as e:
                logger.warning(
                    f"Batch OCR progress callback failed for image {idx}: {e}"
                )

    total_time_ms = int((time.monotonic() - start_time) * 1000)

    return BatchOCRResult(
        results=results,
        total_time_ms=total_time_ms,
        processed_count=total,
        cached_count=cached_count,
        error_count=error_count
    )
|
||||
|
||||
|
||||
# Generator for SSE streaming during batch processing
|
||||
async def run_trocr_batch_stream(
    images: List[bytes],
    handwritten: bool = True,
    split_lines: bool = True,
    use_cache: bool = True
):
    """
    Process images and yield progress updates for SSE streaming.

    Images are processed one after another; one event is yielded per image,
    followed by a single final "complete" event.

    Args:
        images: List of image data bytes
        handwritten: Use handwritten model
        split_lines: Whether to split images into lines
        use_cache: Whether to use caching

    Yields:
        dict events, distinguished by their "type" key:

        * "progress": "current", "total", "progress_percent", "elapsed_ms",
          "estimated_remaining_ms" and "result" (text, confidence,
          processing_time_ms, from_cache).
        * "error": the same progress/timing fields as "progress", plus
          "error" (the exception message) instead of "result".
        * "complete": "total_time_ms", "processed_count" (number of input
          images, including failed ones) and "error_count".
    """
    # Monotonic clock: elapsed time must not be affected by system clock
    # adjustments while the stream is running.
    start_time = time.monotonic()
    total = len(images)
    error_count = 0

    for idx, image_data in enumerate(images):
        current = idx + 1
        try:
            result = await run_trocr_ocr_enhanced(
                image_data,
                handwritten=handwritten,
                split_lines=split_lines,
                use_cache=use_cache
            )
            event_type = "progress"
            payload = {
                "result": {
                    "text": result.text,
                    "confidence": result.confidence,
                    "processing_time_ms": result.processing_time_ms,
                    "from_cache": result.from_cache
                }
            }
        except Exception as e:
            logger.error(f"Stream OCR error for image {idx}: {e}")
            error_count += 1
            event_type = "error"
            payload = {"error": str(e)}

        # Timing is computed for failed images too, so that consumers can
        # keep their progress display moving when an image errors out.
        elapsed_ms = int((time.monotonic() - start_time) * 1000)
        avg_time_per_image = elapsed_ms / current
        estimated_remaining = int(avg_time_per_image * (total - current))

        # total is never 0 here: the loop body only runs for non-empty input.
        yield {
            "type": event_type,
            "current": current,
            "total": total,
            "progress_percent": (current / total) * 100,
            "elapsed_ms": elapsed_ms,
            "estimated_remaining_ms": estimated_remaining,
            **payload
        }

    total_time_ms = int((time.monotonic() - start_time) * 1000)
    yield {
        "type": "complete",
        "total_time_ms": total_time_ms,
        "processed_count": total,
        "error_count": error_count
    }
|
||||
|
||||
|
||||
# Test function
|
||||
async def test_trocr_ocr(image_path: str, handwritten: bool = False):
    """
    Run TrOCR once on an image file from disk and print the outcome.

    Manual smoke-test helper: reads the file at ``image_path`` as bytes,
    passes them to ``run_trocr_ocr`` and prints mode, confidence and text.

    Args:
        image_path: Path of the image file to read
        handwritten: Passed through to ``run_trocr_ocr``; also selects the
            "Handwritten" / "Printed" label in the printed report

    Returns:
        Tuple ``(text, confidence)`` as unpacked from ``run_trocr_ocr``
    """
    # Imported at call time rather than at module level.
    from .trocr_ocr import run_trocr_ocr

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    ocr_output = await run_trocr_ocr(raw_bytes, handwritten=handwritten)
    text, confidence = ocr_output

    mode_label = "Handwritten" if handwritten else "Printed"
    print("\n=== TrOCR Test ===")
    print(f"Mode: {mode_label}")
    print(f"Confidence: {confidence:.2f}")
    print(f"Text:\n{text}")

    return text, confidence
|
||||
Reference in New Issue
Block a user