[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
205
klausur-service/backend/ocr_labeling_helpers.py
Normal file
205
klausur-service/backend/ocr_labeling_helpers.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
OCR Labeling - Helper Functions and OCR Wrappers

Extracted from ocr_labeling_api.py to keep files under 500 LOC.

PRIVACY (DATENSCHUTZ):
- All processing happens locally (Mac Mini with Ollama)
- No data is sent to external servers
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
from ocr_labeling_models import LOCAL_STORAGE_PATH
|
||||
|
||||
# Try to import Vision OCR service
|
||||
try:
|
||||
import sys
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'backend', 'klausur', 'services'))
|
||||
from vision_ocr_service import get_vision_ocr_service, VisionOCRService
|
||||
VISION_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
VISION_OCR_AVAILABLE = False
|
||||
print("Warning: Vision OCR service not available")
|
||||
|
||||
# Try to import PaddleOCR from hybrid_vocab_extractor
|
||||
try:
|
||||
from hybrid_vocab_extractor import run_paddle_ocr
|
||||
PADDLEOCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PADDLEOCR_AVAILABLE = False
|
||||
print("Warning: PaddleOCR not available")
|
||||
|
||||
# Try to import TrOCR service
|
||||
try:
|
||||
from services.trocr_service import run_trocr_ocr
|
||||
TROCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
TROCR_AVAILABLE = False
|
||||
print("Warning: TrOCR service not available")
|
||||
|
||||
# Try to import Donut service
|
||||
try:
|
||||
from services.donut_ocr_service import run_donut_ocr
|
||||
DONUT_AVAILABLE = True
|
||||
except ImportError:
|
||||
DONUT_AVAILABLE = False
|
||||
print("Warning: Donut OCR service not available")
|
||||
|
||||
# Try to import MinIO storage
|
||||
try:
|
||||
from minio_storage import upload_ocr_image, get_ocr_image, MINIO_BUCKET
|
||||
MINIO_AVAILABLE = True
|
||||
except ImportError:
|
||||
MINIO_AVAILABLE = False
|
||||
print("Warning: MinIO storage not available, using local storage")
|
||||
|
||||
# Try to import Training Export Service
|
||||
try:
|
||||
from training_export_service import (
|
||||
TrainingExportService,
|
||||
TrainingSample,
|
||||
get_training_export_service,
|
||||
)
|
||||
TRAINING_EXPORT_AVAILABLE = True
|
||||
except ImportError:
|
||||
TRAINING_EXPORT_AVAILABLE = False
|
||||
print("Warning: Training export service not available")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def compute_image_hash(image_data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of the raw image bytes."""
    digest = hashlib.sha256()
    digest.update(image_data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
async def run_ocr_on_image(image_data: bytes, filename: str, model: str = "llama3.2-vision:11b") -> tuple:
    """
    Dispatch an image to the OCR backend selected by ``model``.

    Routing:
    - "paddleocr" -> run_paddleocr_wrapper
    - "donut"     -> run_donut_wrapper
    - "trocr"     -> run_trocr_wrapper
    - any other value (including the default "llama3.2-vision:11b")
      -> run_vision_ocr_wrapper

    Only ``image_data`` and ``filename`` are passed on to the chosen wrapper;
    the model name is used here for routing and for the log line only.

    Returns:
        Tuple of (ocr_text, confidence) as produced by the selected wrapper.
    """
    print(f"Running OCR with model: {model}")

    # Dedicated backends, checked in this order; the first matching name wins.
    dedicated_backends = (
        ("paddleocr", run_paddleocr_wrapper),
        ("donut", run_donut_wrapper),
        ("trocr", run_trocr_wrapper),
    )
    for backend_name, backend in dedicated_backends:
        if model == backend_name:
            return await backend(image_data, filename)

    # No dedicated backend matched: use the Vision LLM path.
    return await run_vision_ocr_wrapper(image_data, filename)
|
||||
|
||||
|
||||
async def run_vision_ocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run the Vision LLM OCR service on an image.

    Returns:
        (text, confidence) taken from the service result on success.
        (None, 0.0) when the service module could not be imported, when the
        service's own is_available() check fails, or when any exception is
        raised while talking to the service (the error is printed, not raised).
    """
    if VISION_OCR_AVAILABLE:
        try:
            ocr_service = get_vision_ocr_service()

            # Runtime health check, separate from the import-time flag above.
            reachable = await ocr_service.is_available()
            if reachable:
                extraction = await ocr_service.extract_text(
                    image_data,
                    filename=filename,
                    is_handwriting=True,
                )
                return extraction.text, extraction.confidence

            print("Vision OCR service not available (is_available check failed)")
            return None, 0.0
        except Exception as exc:
            # Best effort: callers receive the "no result" tuple instead of an error.
            print(f"Vision OCR failed: {exc}")
            return None, 0.0

    print("Vision OCR service not available")
    return None, 0.0
|
||||
|
||||
|
||||
async def run_paddleocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run PaddleOCR (via hybrid_vocab_extractor.run_paddle_ocr) on an image.

    Falls back to run_vision_ocr_wrapper when PaddleOCR could not be imported
    or when it raises. An empty text result does NOT trigger the fallback; it
    yields (None, 0.0).

    Returns:
        (raw_text, confidence) where confidence is the mean of the regions'
        ``confidence`` values, or 0.5 when no regions were returned.
    """
    if PADDLEOCR_AVAILABLE:
        try:
            # run_paddle_ocr is called synchronously and yields (regions, raw_text).
            detected_regions, recognized_text = run_paddle_ocr(image_data)

            if recognized_text:
                mean_confidence = (
                    sum(region.confidence for region in detected_regions) / len(detected_regions)
                    if detected_regions
                    else 0.5
                )
                return recognized_text, mean_confidence

            print("PaddleOCR returned empty text")
            return None, 0.0
        except Exception as exc:
            print(f"PaddleOCR failed: {exc}, falling back to Vision OCR")
            return await run_vision_ocr_wrapper(image_data, filename)

    print("PaddleOCR not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
|
||||
|
||||
|
||||
async def run_trocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run the TrOCR service on an image.

    Falls back to run_vision_ocr_wrapper when the TrOCR service could not be
    imported or when the call raises.

    Returns:
        The (text, confidence) pair produced by run_trocr_ocr, or the fallback
        wrapper's result.
    """
    if TROCR_AVAILABLE:
        try:
            # Unpacking keeps the requirement that the service returns exactly two values.
            recognized_text, score = await run_trocr_ocr(image_data)
            return recognized_text, score
        except Exception as exc:
            print(f"TrOCR failed: {exc}, falling back to Vision OCR")
            return await run_vision_ocr_wrapper(image_data, filename)

    print("TrOCR not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
|
||||
|
||||
|
||||
async def run_donut_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run the Donut OCR service on an image.

    Falls back to run_vision_ocr_wrapper when the Donut service could not be
    imported or when the call raises.

    Returns:
        The (text, confidence) pair produced by run_donut_ocr, or the fallback
        wrapper's result.
    """
    if DONUT_AVAILABLE:
        try:
            # Unpacking keeps the requirement that the service returns exactly two values.
            recognized_text, score = await run_donut_ocr(image_data)
            return recognized_text, score
        except Exception as exc:
            print(f"Donut OCR failed: {exc}, falling back to Vision OCR")
            return await run_vision_ocr_wrapper(image_data, filename)

    print("Donut not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
|
||||
|
||||
|
||||
def save_image_locally(session_id: str, item_id: str, image_data: bytes, extension: str = "png") -> str:
    """
    Write image bytes to ``<LOCAL_STORAGE_PATH>/<session_id>/<item_id>.<extension>``.

    The session directory is created if it does not exist yet. An existing
    file with the same name is overwritten.

    Returns:
        The path of the written file.
    """
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)

    target_path = os.path.join(target_dir, f"{item_id}.{extension}")
    with open(target_path, 'wb') as out_file:
        out_file.write(image_data)

    return target_path
|
||||
|
||||
|
||||
def get_image_url(image_path: str) -> str:
    """
    Map a stored image path to the value the frontend should use.

    Paths located inside LOCAL_STORAGE_PATH are turned into
    "/api/v1/ocr-label/images/<path relative to the storage root>".
    Any other value is returned unchanged (for MinIO images the path is
    already a URL or key).

    The storage root has to match on a path-component boundary. A plain
    string-prefix test would also accept sibling locations such as
    "<root>-backup/...", and would then emit a URL containing the leftover
    "-backup/..." fragment.
    """
    # Ignore a trailing slash on the configured root so both spellings behave alike.
    storage_root = LOCAL_STORAGE_PATH.rstrip('/')

    if image_path.startswith(storage_root):
        remainder = image_path[len(storage_root):]
        # Local only if the root is the whole path or is followed by a separator.
        if not remainder or remainder[0] in ('/', os.sep):
            return f"/api/v1/ocr-label/images/{remainder.lstrip('/')}"

    # Not under the local storage root: pass through untouched.
    return image_path
|
||||
Reference in New Issue
Block a user