A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
310 lines
9.6 KiB
Python
"""
|
|
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
|
|
|
|
DATENSCHUTZ/PRIVACY BY DESIGN:
|
|
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
|
|
- Keine Daten verlassen das lokale Netzwerk
|
|
- Keine Cloud-APIs beteiligt
|
|
- Perfekt für DSGVO-konforme Schulumgebungen
|
|
|
|
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
|
|
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
|
|
"""
|
|
import os
|
|
import base64
|
|
import httpx
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
from dataclasses import dataclass
|
|
|
|
from llm_gateway.config import get_config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class VisionOCRResult:
    """Result from Vision-LLM OCR extraction."""
    # Transcribed text exactly as returned by the vision model (stripped).
    text: str
    # Heuristic confidence in [0.0, 1.0]; see VisionOCRService._estimate_confidence.
    confidence: float
    # Wall-clock processing time for the extraction, in milliseconds.
    processing_time_ms: int
    # Vision model that produced the result.
    model: str = "llama3.2-vision:11b"
    # Execution device label; always local Ollama — no cloud involved.
    device: str = "local-ollama"
|
|
|
|
|
|
# OCR system prompt tuned for handwriting recognition.
# NOTE: the prompt text itself is intentionally German — it instructs the
# model for German school exams and must not be translated.
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).

AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.

WICHTIGE REGELN:
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
4. Ignoriere Linien, Kästchen und andere Formatierungen
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)

AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""

# Alternative prompt for printed (non-handwritten) text.
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
Behalte die Struktur bei (Absätze, Listen, etc.).
Gib nur den extrahierten Text zurück, ohne Kommentare."""
|
|
|
|
|
|
class VisionOCRService:
    """
    OCR service using Llama 3.2 Vision via Ollama.

    Runs entirely locally on the Mac Mini — no cloud connection required.
    Suitable for privacy-compliant (DSGVO/GDPR) exam grading in schools.

    Usage:
        service = VisionOCRService()

        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize Vision OCR Service.

        Args:
            ollama_url: Ollama API URL (default: from config)
            model: Vision model to use (default: llama3.2-vision:11b)
        """
        config = get_config()
        # Fall back to the local default endpoint when no Ollama config exists.
        self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434")
        self.model = model or config.vision_model
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or lazily (re)create the shared HTTP client."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=300.0  # 5 min timeout for large images
            )
        return self._client

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    @staticmethod
    def _is_vision_model(name: str) -> bool:
        """Heuristic: treat model names mentioning 'vision' or 'llava' as vision-capable."""
        lowered = name.lower()
        return "vision" in lowered or "llava" in lowered

    async def is_available(self) -> bool:
        """Check if Ollama with a vision model is available.

        Returns False (never raises) on connection errors, non-200 responses,
        or when no vision-capable model is installed.
        """
        try:
            client = await self._get_client()

            # Check Ollama health via the model listing endpoint.
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )

            if response.status_code != 200:
                return False

            # Check if a vision model is installed.
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]

            if not any(self._is_vision_model(m) for m in models):
                logger.warning(f"No vision model found. Available: {models}")
                return False

            return True

        except Exception as e:
            # Availability probe must never raise — report unavailable instead.
            logger.warning(f"Vision OCR service not available: {e}")
            return False

    async def get_status(self) -> dict:
        """Get service status.

        Returns:
            dict with "status" ("available"/"unavailable") plus either model
            details or an "error" description. Never raises.
        """
        try:
            client = await self._get_client()
            # Short timeout: a status probe should fail fast, same as is_available().
            response = await client.get(f"{self.ollama_url}/api/tags", timeout=5.0)

            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if self._is_vision_model(m.get("name", ""))
                ]

                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }

        except Exception as e:
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the Vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            filename: Original filename (for logging)
            is_handwriting: True for handwriting, False for printed text

        Returns:
            VisionOCRResult with extracted text

        Raises:
            httpx.TimeoutException: when the model does not answer in time
            httpx.HTTPStatusError: on non-2xx responses from Ollama
        """
        start_time = time.time()

        try:
            client = await self._get_client()

            # Ollama expects images as base64 strings in the message payload.
            image_base64 = base64.b64encode(image_data).decode("utf-8")

            # Select the prompt matching the content type.
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT

            # Ollama Vision API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,   # Low temperature for consistent OCR
                    "num_predict": 2048,  # Max tokens for extracted text
                }
            }

            logger.info(f"Sending image to Vision OCR: {filename} ({len(image_data)} bytes)")

            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min timeout
            )
            response.raise_for_status()

            data = response.json()

            extracted_text = data.get("message", {}).get("content", "")

            processing_time_ms = int((time.time() - start_time) * 1000)

            # Estimate confidence based on response quality (heuristic only).
            confidence = self._estimate_confidence(extracted_text)

            logger.info(
                f"Vision OCR completed for {filename}: "
                f"{len(extracted_text)} chars in {processing_time_ms}ms"
            )

            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )

        except httpx.TimeoutException:
            logger.error(f"Vision OCR timed out for {filename}")
            raise
        except Exception as e:
            logger.error(f"Vision OCR failed for {filename}: {e}")
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic — a real confidence would need model output.
        Returns a value in [0.0, 0.85].
        """
        if not text:
            return 0.0

        # Count uncertainty markers the prompt asks the model to emit.
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")

        # Length of actual text once markers are stripped.
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))

        if text_length == 0:
            return 0.1

        # Base confidence
        confidence = 0.85

        # Reduce for uncertainty markers (capped at -0.3).
        confidence -= min(uncertain_markers * 0.05, 0.3)

        # Very short text might be incomplete.
        if text_length < 20:
            confidence -= 0.1

        return max(confidence, 0.1)
|
|
|
|
|
|
# Module-level singleton, created lazily on first access.
_vision_ocr_service: Optional[VisionOCRService] = None


def get_vision_ocr_service() -> VisionOCRService:
    """Return the process-wide Vision OCR service, creating it on first use."""
    global _vision_ocr_service
    if _vision_ocr_service is not None:
        return _vision_ocr_service
    _vision_ocr_service = VisionOCRService()
    return _vision_ocr_service
|
|
|
|
|
|
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Convenience wrapper: extract handwriting from an image.

    Uses Llama 3.2 Vision locally via Ollama; all processing stays on the
    local Mac Mini (DSGVO/GDPR-compliant).

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        VisionOCRResult with extracted text
    """
    return await get_vision_ocr_service().extract_text(
        image_data, filename, is_handwriting=True
    )
|