""" Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision. DATENSCHUTZ/PRIVACY BY DESIGN: - Alle Verarbeitung erfolgt lokal auf dem Mac Mini - Keine Daten verlassen das lokale Netzwerk - Keine Cloud-APIs beteiligt - Perfekt für DSGVO-konforme Schulumgebungen Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung. Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung. """ import os import base64 import httpx import logging import time from typing import Optional from dataclasses import dataclass from llm_gateway.config import get_config logger = logging.getLogger(__name__) @dataclass class VisionOCRResult: """Result from Vision-LLM OCR extraction.""" text: str confidence: float processing_time_ms: int model: str = "llama3.2-vision:11b" device: str = "local-ollama" # OCR System Prompt für optimale Handschrifterkennung HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR). AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich. WICHTIGE REGELN: 1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu 2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile) 3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden 4. Ignoriere Linien, Kästchen und andere Formatierungen 5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht 6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.) AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare.""" # Alternative Prompt für gedruckten Text PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild. Behalte die Struktur bei (Absätze, Listen, etc.). Gib nur den extrahierten Text zurück, ohne Kommentare.""" class VisionOCRService: """ OCR Service mit Llama 3.2 Vision über Ollama. Läuft komplett lokal auf dem Mac Mini - keine Cloud-Verbindung nötig. Ideal für datenschutzkonforme Klausurkorrektur in Schulen. Usage: service = VisionOCRService() if await service.is_available(): result = await service.extract_text(image_bytes) print(result.text) """ def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None): """ Initialize Vision OCR Service. Args: ollama_url: Ollama API URL (default: from config) model: Vision model to use (default: llama3.2-vision:11b) """ config = get_config() self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434") self.model = model or config.vision_model self._client: Optional[httpx.AsyncClient] = None async def _get_client(self) -> httpx.AsyncClient: """Get or create HTTP client.""" if self._client is None or self._client.is_closed: self._client = httpx.AsyncClient( timeout=300.0 # 5 min timeout für große Bilder ) return self._client async def close(self): """Close the HTTP client.""" if self._client and not self._client.is_closed: await self._client.aclose() async def is_available(self) -> bool: """Check if Ollama with vision model is available.""" try: client = await self._get_client() # Check Ollama health response = await client.get( f"{self.ollama_url}/api/tags", timeout=5.0 ) if response.status_code != 200: return False # Check if vision model is installed data = response.json() models = [m.get("name", "") for m in data.get("models", [])] # Check for any vision model has_vision = any( "vision" in m.lower() or "llava" in m.lower() for m in models ) if not has_vision: logger.warning(f"No vision model found. Available: {models}") return False return True except Exception as e: logger.warning(f"Vision OCR service not available: {e}") return False async def get_status(self) -> dict: """Get service status.""" try: client = await self._get_client() response = await client.get(f"{self.ollama_url}/api/tags") if response.status_code == 200: data = response.json() models = data.get("models", []) vision_models = [ m for m in models if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower() ] return { "status": "available", "ollama_url": self.ollama_url, "configured_model": self.model, "vision_models": [m.get("name") for m in vision_models], "total_models": len(models) } else: return { "status": "unavailable", "error": f"HTTP {response.status_code}" } except Exception as e: return { "status": "unavailable", "error": str(e) } async def extract_text( self, image_data: bytes, filename: str = "image.png", is_handwriting: bool = True ) -> VisionOCRResult: """ Extract text from an image using Vision LLM. Args: image_data: Raw image bytes (PNG, JPG, etc.) filename: Original filename (for logging) is_handwriting: True for handwriting, False for printed text Returns: VisionOCRResult with extracted text """ start_time = time.time() try: client = await self._get_client() # Encode image as base64 image_base64 = base64.b64encode(image_data).decode("utf-8") # Select appropriate prompt prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT # Ollama Vision API request payload = { "model": self.model, "messages": [ { "role": "user", "content": prompt, "images": [image_base64] } ], "stream": False, "options": { "temperature": 0.1, # Low temperature for consistent OCR "num_predict": 2048, # Max tokens for extracted text } } logger.info(f"Sending image to Vision OCR: {filename} ({len(image_data)} bytes)") response = await client.post( f"{self.ollama_url}/api/chat", json=payload, timeout=180.0 # 3 min timeout ) response.raise_for_status() data = response.json() extracted_text = data.get("message", {}).get("content", "") processing_time_ms = int((time.time() - start_time) * 1000) # Estimate confidence based on response quality confidence = self._estimate_confidence(extracted_text) logger.info( f"Vision OCR completed for {filename}: " f"{len(extracted_text)} chars in {processing_time_ms}ms" ) return VisionOCRResult( text=extracted_text.strip(), confidence=confidence, processing_time_ms=processing_time_ms, model=self.model, device="local-ollama" ) except httpx.TimeoutException: logger.error(f"Vision OCR timed out for {filename}") raise except Exception as e: logger.error(f"Vision OCR failed for {filename}: {e}") raise def _estimate_confidence(self, text: str) -> float: """ Estimate OCR confidence based on text quality. This is a heuristic - real confidence would need model output. """ if not text: return 0.0 # Count uncertain markers uncertain_markers = text.count("[unleserlich]") + text.count("[?]") # Count reasonable text vs markers text_length = len(text.replace("[unleserlich]", "").replace("[?]", "")) if text_length == 0: return 0.1 # Base confidence confidence = 0.85 # Reduce for uncertain markers confidence -= min(uncertain_markers * 0.05, 0.3) # Very short text might be incomplete if text_length < 20: confidence -= 0.1 return max(confidence, 0.1) # Singleton instance _vision_ocr_service: Optional[VisionOCRService] = None def get_vision_ocr_service() -> VisionOCRService: """Get the Vision OCR service singleton.""" global _vision_ocr_service if _vision_ocr_service is None: _vision_ocr_service = VisionOCRService() return _vision_ocr_service async def extract_handwriting( image_data: bytes, filename: str = "image.png" ) -> VisionOCRResult: """ Convenience function to extract handwriting from an image. Uses Llama 3.2 Vision locally via Ollama. All processing happens on the local Mac Mini - DSGVO-konform. Args: image_data: Raw image bytes filename: Original filename Returns: VisionOCRResult with extracted text """ service = get_vision_ocr_service() return await service.extract_text(image_data, filename, is_handwriting=True)