[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions
--- a/klausur-service/backend/ocr_labeling_helpers.py
+++ b/klausur-service/backend/ocr_labeling_helpers.py
@@ -0,0 +1,205 @@
+"""
+OCR Labeling - Helper Functions and OCR Wrappers
+
+Extracted from ocr_labeling_api.py to keep files under 500 LOC.
+
+DATENSCHUTZ/PRIVACY:
+- Alle Verarbeitung erfolgt lokal (Mac Mini mit Ollama)
+- Keine Daten werden an externe Server gesendet
+"""
+
+import os
+import hashlib
+
+from ocr_labeling_models import LOCAL_STORAGE_PATH
+
+# Try to import Vision OCR service
+try:
+    import sys
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'backend', 'klausur', 'services'))
+    from vision_ocr_service import get_vision_ocr_service, VisionOCRService
+    VISION_OCR_AVAILABLE = True
+except ImportError:
+    VISION_OCR_AVAILABLE = False
+    print("Warning: Vision OCR service not available")
+
+# Try to import PaddleOCR from hybrid_vocab_extractor
+try:
+    from hybrid_vocab_extractor import run_paddle_ocr
+    PADDLEOCR_AVAILABLE = True
+except ImportError:
+    PADDLEOCR_AVAILABLE = False
+    print("Warning: PaddleOCR not available")
+
+# Try to import TrOCR service
+try:
+    from services.trocr_service import run_trocr_ocr
+    TROCR_AVAILABLE = True
+except ImportError:
+    TROCR_AVAILABLE = False
+    print("Warning: TrOCR service not available")
+
+# Try to import Donut service
+try:
+    from services.donut_ocr_service import run_donut_ocr
+    DONUT_AVAILABLE = True
+except ImportError:
+    DONUT_AVAILABLE = False
+    print("Warning: Donut OCR service not available")
+
+# Try to import MinIO storage
+try:
+    from minio_storage import upload_ocr_image, get_ocr_image, MINIO_BUCKET
+    MINIO_AVAILABLE = True
+except ImportError:
+    MINIO_AVAILABLE = False
+    print("Warning: MinIO storage not available, using local storage")
+
+# Try to import Training Export Service
+try:
+    from training_export_service import (
+        TrainingExportService,
+        TrainingSample,
+        get_training_export_service,
+    )
+    TRAINING_EXPORT_AVAILABLE = True
+except ImportError:
+    TRAINING_EXPORT_AVAILABLE = False
+    print("Warning: Training export service not available")
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
+def compute_image_hash(image_data: bytes) -> str:
+    """Return the hex-encoded SHA-256 digest of the raw image bytes."""
+    hasher = hashlib.sha256()
+    hasher.update(image_data)
+    return hasher.hexdigest()
+
+
+async def run_ocr_on_image(image_data: bytes, filename: str, model: str = "llama3.2-vision:11b") -> tuple:
+    """
+    Dispatch an image to the OCR backend selected by ``model``.
+
+    Recognised model names:
+        - paddleocr: handled by run_paddleocr_wrapper
+        - donut: handled by run_donut_wrapper
+        - trocr: handled by run_trocr_wrapper
+
+    Any other value, including the default llama3.2-vision:11b, is handled
+    by run_vision_ocr_wrapper. The model name itself is not forwarded to
+    that wrapper.
+
+    Returns:
+        The (ocr_text, confidence) tuple produced by the chosen wrapper.
+    """
+    print(f"Running OCR with model: {model}")
+
+    # Backends with a dedicated wrapper; everything else uses the Vision LLM.
+    dedicated_backends = {
+        "paddleocr": run_paddleocr_wrapper,
+        "donut": run_donut_wrapper,
+        "trocr": run_trocr_wrapper,
+    }
+    backend = dedicated_backends.get(model, run_vision_ocr_wrapper)
+    return await backend(image_data, filename)
+
+
+async def run_vision_ocr_wrapper(image_data: bytes, filename: str) -> tuple:
+    """
+    Extract text from an image with the Vision OCR service.
+
+    The service is always asked to treat the image as handwriting.
+
+    Returns:
+        (text, confidence) from the service result, or (None, 0.0) when the
+        service could not be imported, reports itself unavailable, or any
+        exception is raised while calling it.
+    """
+    if not VISION_OCR_AVAILABLE:
+        print("Vision OCR service not available")
+        return None, 0.0
+
+    try:
+        ocr_service = get_vision_ocr_service()
+        if await ocr_service.is_available():
+            extraction = await ocr_service.extract_text(
+                image_data,
+                filename=filename,
+                is_handwriting=True,
+            )
+            return extraction.text, extraction.confidence
+
+        print("Vision OCR service not available (is_available check failed)")
+    except Exception as e:
+        # Best effort: report the failure and hand back the empty result.
+        print(f"Vision OCR failed: {e}")
+
+    return None, 0.0
+
+
+async def run_paddleocr_wrapper(image_data: bytes, filename: str) -> tuple:
+    """
+    Run PaddleOCR (via run_paddle_ocr) on an image.
+
+    Falls back to run_vision_ocr_wrapper when PaddleOCR could not be
+    imported or raises. An empty text result does NOT trigger the fallback;
+    it yields (None, 0.0).
+
+    Returns:
+        (raw_text, confidence), where confidence is the mean of the region
+        confidences, or 0.5 when no regions were returned.
+    """
+    if not PADDLEOCR_AVAILABLE:
+        print("PaddleOCR not available, falling back to Vision OCR")
+        return await run_vision_ocr_wrapper(image_data, filename)
+
+    try:
+        # run_paddle_ocr returns (regions, raw_text)
+        detections, full_text = run_paddle_ocr(image_data)
+
+        if full_text:
+            mean_confidence = (
+                sum(d.confidence for d in detections) / len(detections)
+                if detections
+                else 0.5
+            )
+            return full_text, mean_confidence
+
+        print("PaddleOCR returned empty text")
+        return None, 0.0
+    except Exception as e:
+        print(f"PaddleOCR failed: {e}, falling back to Vision OCR")
+        return await run_vision_ocr_wrapper(image_data, filename)
+
+
+async def run_trocr_wrapper(image_data: bytes, filename: str) -> tuple:
+    """
+    Run TrOCR on an image.
+
+    Returns:
+        The (text, confidence) pair from run_trocr_ocr. When TrOCR could not
+        be imported or raises, the result of run_vision_ocr_wrapper is
+        returned instead.
+    """
+    if TROCR_AVAILABLE:
+        try:
+            recognized, score = await run_trocr_ocr(image_data)
+            return recognized, score
+        except Exception as e:
+            print(f"TrOCR failed: {e}, falling back to Vision OCR")
+    else:
+        print("TrOCR not available, falling back to Vision OCR")
+
+    # Reached only when TrOCR was unavailable or failed.
+    return await run_vision_ocr_wrapper(image_data, filename)
+
+
+async def run_donut_wrapper(image_data: bytes, filename: str) -> tuple:
+    """
+    Run Donut OCR on an image.
+
+    Returns:
+        The (text, confidence) pair from run_donut_ocr. When Donut could not
+        be imported or raises, the result of run_vision_ocr_wrapper is
+        returned instead.
+    """
+    if DONUT_AVAILABLE:
+        try:
+            recognized, score = await run_donut_ocr(image_data)
+            return recognized, score
+        except Exception as e:
+            print(f"Donut OCR failed: {e}, falling back to Vision OCR")
+    else:
+        print("Donut not available, falling back to Vision OCR")
+
+    # Reached only when Donut was unavailable or failed.
+    return await run_vision_ocr_wrapper(image_data, filename)
+
+
+def save_image_locally(session_id: str, item_id: str, image_data: bytes, extension: str = "png") -> str:
+    """
+    Write image bytes to LOCAL_STORAGE_PATH/<session_id>/<item_id>.<extension>.
+
+    The session directory is created when missing. The file is opened in
+    'wb' mode, so an existing file at the target path is overwritten.
+
+    Returns:
+        The path of the written file.
+    """
+    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
+    os.makedirs(target_dir, exist_ok=True)
+
+    target_path = os.path.join(target_dir, f"{item_id}.{extension}")
+    with open(target_path, 'wb') as out:
+        out.write(image_data)
+
+    return target_path
+
+
+def get_image_url(image_path: str) -> str:
+    """
+    Map a stored image path to the URL the frontend should request.
+
+    Paths located under LOCAL_STORAGE_PATH are rewritten to
+    /api/v1/ocr-label/images/<path relative to LOCAL_STORAGE_PATH>.
+    Any other value is returned unchanged (for MinIO images the path is
+    already a URL or key).
+
+    The prefix match must end on a path-separator boundary, so a sibling
+    location that merely shares the prefix (e.g. "/data/ocr-old/x.png" with
+    LOCAL_STORAGE_PATH "/data/ocr") is not mistaken for a local image.
+    """
+    separators = ('/', os.sep)
+
+    if image_path.startswith(LOCAL_STORAGE_PATH):
+        remainder = image_path[len(LOCAL_STORAGE_PATH):]
+        # The match is only genuine when the prefix stops at a directory
+        # boundary: the storage root is empty or already ends with a
+        # separator, the path IS the root, or the next character is one.
+        on_boundary = (
+            not LOCAL_STORAGE_PATH
+            or LOCAL_STORAGE_PATH.endswith(separators)
+            or not remainder
+            or remainder.startswith(separators)
+        )
+        if on_boundary:
+            relative_path = remainder.lstrip('/')
+            return f"/api/v1/ocr-label/images/{relative_path}"
+
+    # For MinIO images, the path is already a URL or key
+    return image_path