[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions
--- a/klausur-service/backend/services/trocr_batch.py
+++ b/klausur-service/backend/services/trocr_batch.py
@@ -0,0 +1,160 @@
+"""
+TrOCR Batch Processing & Streaming
+
+Batch OCR and SSE streaming for multiple images.
+
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import asyncio
+import logging
+import time
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+
+from .trocr_models import OCRResult, BatchOCRResult
+from .trocr_ocr import run_trocr_ocr_enhanced
+
+logger = logging.getLogger(__name__)
+
+
+async def run_trocr_batch(
+    images: List[bytes],
+    handwritten: bool = True,
+    split_lines: bool = True,
+    use_cache: bool = True,
+    progress_callback: Optional[Callable[[int, int], None]] = None
+) -> BatchOCRResult:
+    """
+    Process multiple images one after another and collect the results.
+
+    Every input image produces exactly one entry in ``results``, in input
+    order. An image whose OCR call raises is represented by a placeholder
+    OCRResult (model="error", confidence 0.0) carrying the error message.
+
+    Args:
+        images: List of image data bytes
+        handwritten: Use handwritten model
+        split_lines: Whether to split images into lines
+        use_cache: Whether to use caching
+        progress_callback: Optional callback(current, total), invoked after
+            every image, including failed ones. Exceptions raised by the
+            callback are logged and otherwise ignored.
+
+    Returns:
+        BatchOCRResult with all results, the elapsed time in milliseconds,
+        and the processed / cached / error counts
+    """
+    # perf_counter is monotonic, so the measured duration cannot be skewed
+    # by wall-clock adjustments while the batch is running.
+    start_time = time.perf_counter()
+    total = len(images)
+    results = []
+    cached_count = 0
+    error_count = 0
+
+    for idx, image_data in enumerate(images):
+        # Keep the try body limited to the OCR call so that only genuine
+        # OCR failures are counted as errors.
+        try:
+            result = await run_trocr_ocr_enhanced(
+                image_data,
+                handwritten=handwritten,
+                split_lines=split_lines,
+                use_cache=use_cache
+            )
+        except Exception as e:
+            logger.error("Batch OCR error for image %d: %s", idx, e)
+            error_count += 1
+            result = OCRResult(
+                text=f"Error: {str(e)}",
+                confidence=0.0,
+                processing_time_ms=0,
+                model="error",
+                has_lora_adapter=False
+            )
+        else:
+            if result.from_cache:
+                cached_count += 1
+
+        results.append(result)
+
+        # Report progress for successes and failures alike, so the caller
+        # always sees the counter reach `total`.
+        if progress_callback:
+            try:
+                progress_callback(idx + 1, total)
+            except Exception as e:
+                # A faulty callback must neither abort the batch nor add a
+                # second result entry for the same image.
+                logger.warning(
+                    "Progress callback failed for image %d: %s", idx, e
+                )
+
+    total_time_ms = int((time.perf_counter() - start_time) * 1000)
+
+    return BatchOCRResult(
+        results=results,
+        total_time_ms=total_time_ms,
+        processed_count=total,
+        cached_count=cached_count,
+        error_count=error_count
+    )
+
+
+# Generator for SSE streaming during batch processing
+async def run_trocr_batch_stream(
+    images: List[bytes],
+    handwritten: bool = True,
+    split_lines: bool = True,
+    use_cache: bool = True
+) -> AsyncIterator[Dict[str, Any]]:
+    """
+    Process images one by one and yield progress updates for SSE streaming.
+
+    Args:
+        images: List of image data bytes
+        handwritten: Use handwritten model
+        split_lines: Whether to split images into lines
+        use_cache: Whether to use caching
+
+    Yields:
+        One dict per image, followed by one final dict:
+        - "progress": current, total, progress_percent, elapsed_ms,
+          estimated_remaining_ms and a nested "result" (text, confidence,
+          processing_time_ms, from_cache)
+        - "error": current, total and the error message, when the OCR call
+          for that image raised
+        - "complete": total_time_ms and processed_count, emitted last
+    """
+    # perf_counter is monotonic, so elapsed/ETA values cannot go backwards
+    # if the system clock is adjusted during the stream.
+    start_time = time.perf_counter()
+    total = len(images)
+
+    for idx, image_data in enumerate(images):
+        # Only the OCR call is guarded; the yield stays outside the try so
+        # that exceptions raised at the yield point are not reported as
+        # OCR errors.
+        try:
+            result = await run_trocr_ocr_enhanced(
+                image_data,
+                handwritten=handwritten,
+                split_lines=split_lines,
+                use_cache=use_cache
+            )
+        except Exception as e:
+            logger.error("Stream OCR error for image %d: %s", idx, e)
+            yield {
+                "type": "error",
+                "current": idx + 1,
+                "total": total,
+                "error": str(e)
+            }
+            continue
+
+        elapsed_ms = int((time.perf_counter() - start_time) * 1000)
+        # Linear extrapolation from the average time per image so far.
+        avg_time_per_image = elapsed_ms / (idx + 1)
+        estimated_remaining = int(avg_time_per_image * (total - idx - 1))
+
+        yield {
+            "type": "progress",
+            "current": idx + 1,
+            "total": total,
+            "progress_percent": ((idx + 1) / total) * 100,
+            "elapsed_ms": elapsed_ms,
+            "estimated_remaining_ms": estimated_remaining,
+            "result": {
+                "text": result.text,
+                "confidence": result.confidence,
+                "processing_time_ms": result.processing_time_ms,
+                "from_cache": result.from_cache
+            }
+        }
+
+    total_time_ms = int((time.perf_counter() - start_time) * 1000)
+    yield {
+        "type": "complete",
+        "total_time_ms": total_time_ms,
+        "processed_count": total
+    }
+
+
+# Test function
+async def test_trocr_ocr(image_path: str, handwritten: bool = False):
+    """
+    Run TrOCR once on a local image file and print a short report.
+
+    Args:
+        image_path: Path of the image file to read
+        handwritten: Forwarded to run_trocr_ocr; also selects the
+            "Handwritten" / "Printed" label in the printed report
+
+    Returns:
+        The (text, confidence) pair obtained from run_trocr_ocr
+    """
+    # Imported here rather than at module level, as in the original code.
+    from .trocr_ocr import run_trocr_ocr
+
+    with open(image_path, "rb") as image_file:
+        raw_bytes = image_file.read()
+
+    recognized, score = await run_trocr_ocr(
+        raw_bytes,
+        handwritten=handwritten,
+    )
+
+    if handwritten:
+        mode_label = "Handwritten"
+    else:
+        mode_label = "Printed"
+
+    print("\n=== TrOCR Test ===")
+    print(f"Mode: {mode_label}")
+    print(f"Confidence: {score:.2f}")
+    print(f"Text:\n{recognized}")
+
+    return recognized, score