[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,205 @@
"""
OCR Labeling - Helper Functions and OCR Wrappers
Extracted from ocr_labeling_api.py to keep files under 500 LOC.
PRIVACY (DATENSCHUTZ):
- All processing happens locally (Mac Mini with Ollama)
- No data is sent to external servers
"""
import os
import hashlib
from ocr_labeling_models import LOCAL_STORAGE_PATH
# Try to import Vision OCR service
try:
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'backend', 'klausur', 'services'))
from vision_ocr_service import get_vision_ocr_service, VisionOCRService
VISION_OCR_AVAILABLE = True
except ImportError:
VISION_OCR_AVAILABLE = False
print("Warning: Vision OCR service not available")
# Try to import PaddleOCR from hybrid_vocab_extractor
try:
from hybrid_vocab_extractor import run_paddle_ocr
PADDLEOCR_AVAILABLE = True
except ImportError:
PADDLEOCR_AVAILABLE = False
print("Warning: PaddleOCR not available")
# Try to import TrOCR service
try:
from services.trocr_service import run_trocr_ocr
TROCR_AVAILABLE = True
except ImportError:
TROCR_AVAILABLE = False
print("Warning: TrOCR service not available")
# Try to import Donut service
try:
from services.donut_ocr_service import run_donut_ocr
DONUT_AVAILABLE = True
except ImportError:
DONUT_AVAILABLE = False
print("Warning: Donut OCR service not available")
# Try to import MinIO storage
try:
from minio_storage import upload_ocr_image, get_ocr_image, MINIO_BUCKET
MINIO_AVAILABLE = True
except ImportError:
MINIO_AVAILABLE = False
print("Warning: MinIO storage not available, using local storage")
# Try to import Training Export Service
try:
from training_export_service import (
TrainingExportService,
TrainingSample,
get_training_export_service,
)
TRAINING_EXPORT_AVAILABLE = True
except ImportError:
TRAINING_EXPORT_AVAILABLE = False
print("Warning: Training export service not available")
# =============================================================================
# Helper Functions
# =============================================================================
def compute_image_hash(image_data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of the raw image bytes."""
    digest = hashlib.sha256()
    digest.update(image_data)
    return digest.hexdigest()
async def run_ocr_on_image(image_data: bytes, filename: str, model: str = "llama3.2-vision:11b") -> tuple:
    """
    Dispatch an image to the OCR backend selected by ``model``.

    Recognised model names:
    - "paddleocr": handled by run_paddleocr_wrapper
    - "donut": handled by run_donut_wrapper
    - "trocr": handled by run_trocr_wrapper
    Any other value, including the default "llama3.2-vision:11b", is
    handled by run_vision_ocr_wrapper.

    Args:
        image_data: Raw bytes of the image to read.
        filename: Name forwarded unchanged to the selected wrapper.
        model: Backend selector, matched exactly against the names above.

    Returns:
        The selected wrapper's result, a tuple of (ocr_text, confidence).
    """
    print(f"Running OCR with model: {model}")

    # Built inside the function so the wrapper names are resolved at call
    # time; they are defined further down in this module.
    dedicated_backends = {
        "paddleocr": run_paddleocr_wrapper,
        "donut": run_donut_wrapper,
        "trocr": run_trocr_wrapper,
    }

    # Anything without a dedicated backend goes to the Vision LLM wrapper.
    backend = dedicated_backends.get(model, run_vision_ocr_wrapper)
    return await backend(image_data, filename)
async def run_vision_ocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run the Vision OCR service on an image, never raising on failure.

    The service is asked to treat the image as handwriting
    (``is_handwriting=True``).

    Returns:
        (text, confidence) from the service result on success, or
        (None, 0.0) when the service could not be imported, reports itself
        unavailable, or raises any exception.
    """
    if not VISION_OCR_AVAILABLE:
        print("Vision OCR service not available")
        return None, 0.0

    try:
        ocr_service = get_vision_ocr_service()
        service_ready = await ocr_service.is_available()
        if service_ready:
            extraction = await ocr_service.extract_text(
                image_data,
                filename=filename,
                is_handwriting=True,
            )
            return extraction.text, extraction.confidence
        print("Vision OCR service not available (is_available check failed)")
    except Exception as exc:
        # Best-effort: any service error is reported and mapped to "no result".
        print(f"Vision OCR failed: {exc}")

    # Shared failure result for both the unavailable and the error path.
    return None, 0.0
async def run_paddleocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run PaddleOCR (via run_paddle_ocr) on an image.

    Falls back to run_vision_ocr_wrapper when PaddleOCR could not be
    imported or raises. An empty PaddleOCR text result does NOT trigger the
    fallback; it yields (None, 0.0).

    Returns:
        (raw_text, confidence), where confidence is the mean of the
        per-region confidences, or 0.5 when no regions were returned.
    """
    if not PADDLEOCR_AVAILABLE:
        print("PaddleOCR not available, falling back to Vision OCR")
        return await run_vision_ocr_wrapper(image_data, filename)

    try:
        # run_paddle_ocr is unpacked as a (regions, raw_text) pair.
        detected_regions, extracted_text = run_paddle_ocr(image_data)

        if not extracted_text:
            print("PaddleOCR returned empty text")
            return None, 0.0

        # Mean region confidence; 0.5 is the fixed stand-in when text was
        # produced but no regions came back.
        mean_confidence = (
            sum(region.confidence for region in detected_regions) / len(detected_regions)
            if detected_regions
            else 0.5
        )
        return extracted_text, mean_confidence
    except Exception as exc:
        print(f"PaddleOCR failed: {exc}, falling back to Vision OCR")
        return await run_vision_ocr_wrapper(image_data, filename)
async def run_trocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run TrOCR on an image, falling back to Vision OCR if it is unusable.

    Returns:
        (text, confidence) from run_trocr_ocr on success; otherwise the
        result of run_vision_ocr_wrapper, used both when TrOCR could not be
        imported and when it raises.
    """
    if TROCR_AVAILABLE:
        try:
            recognized_text, score = await run_trocr_ocr(image_data)
        except Exception as exc:
            print(f"TrOCR failed: {exc}, falling back to Vision OCR")
        else:
            return recognized_text, score
    else:
        print("TrOCR not available, falling back to Vision OCR")

    # Reached only when TrOCR was unavailable or failed.
    return await run_vision_ocr_wrapper(image_data, filename)
async def run_donut_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run Donut OCR on an image, falling back to Vision OCR if it is unusable.

    Returns:
        (text, confidence) from run_donut_ocr on success; otherwise the
        result of run_vision_ocr_wrapper, used both when Donut could not be
        imported and when it raises.
    """
    if DONUT_AVAILABLE:
        try:
            recognized_text, score = await run_donut_ocr(image_data)
        except Exception as exc:
            print(f"Donut OCR failed: {exc}, falling back to Vision OCR")
        else:
            return recognized_text, score
    else:
        print("Donut not available, falling back to Vision OCR")

    # Reached only when Donut was unavailable or failed.
    return await run_vision_ocr_wrapper(image_data, filename)
def save_image_locally(session_id: str, item_id: str, image_data: bytes, extension: str = "png") -> str:
    """
    Write image bytes to ``<LOCAL_STORAGE_PATH>/<session_id>/<item_id>.<extension>``.

    The session directory is created if it does not exist yet; an existing
    file at the target path is overwritten.

    Returns:
        The path of the written file.
    """
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)

    target_path = os.path.join(target_dir, f"{item_id}.{extension}")
    with open(target_path, 'wb') as out_file:
        out_file.write(image_data)

    return target_path
def get_image_url(image_path: str) -> str:
    """
    Map a stored image path to the value handed to the frontend.

    Paths that start with LOCAL_STORAGE_PATH are rewritten to
    ``/api/v1/ocr-label/images/<path relative to the storage root>``.
    Any other path is returned unchanged (the original code comment
    describes these as MinIO URLs or keys).
    """
    if not image_path.startswith(LOCAL_STORAGE_PATH):
        return image_path

    # Drop the storage-root prefix and any leading slashes left behind.
    storage_relative = image_path[len(LOCAL_STORAGE_PATH):].lstrip('/')
    return f"/api/v1/ocr-label/images/{storage_relative}"