fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
309
backend/klausur/services/vision_ocr_service.py
Normal file
309
backend/klausur/services/vision_ocr_service.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""
|
||||
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
|
||||
|
||||
DATENSCHUTZ/PRIVACY BY DESIGN:
|
||||
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
|
||||
- Keine Daten verlassen das lokale Netzwerk
|
||||
- Keine Cloud-APIs beteiligt
|
||||
- Perfekt für DSGVO-konforme Schulumgebungen
|
||||
|
||||
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
|
||||
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
|
||||
"""
|
||||
import os
|
||||
import base64
|
||||
import httpx
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from llm_gateway.config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VisionOCRResult:
    """Result from Vision-LLM OCR extraction."""
    # Transcribed text returned by the vision model.
    text: str
    # Heuristic confidence score (not a true model probability) in [0.0, 1.0].
    confidence: float
    # Wall-clock processing time for the extraction, in milliseconds.
    processing_time_ms: int
    # Model identifier that produced the result.
    model: str = "llama3.2-vision:11b"
    # Execution backend; always a local Ollama instance (privacy by design).
    device: str = "local-ollama"
||||
|
||||
|
||||
# System prompt for optimal handwriting recognition (German, since the
# service targets German school exams; the prompt text is part of runtime
# behavior and must not be translated).
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).

AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.

WICHTIGE REGELN:
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
4. Ignoriere Linien, Kästchen und andere Formatierungen
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)

AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""

# Alternative prompt for printed (non-handwritten) text.
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
Behalte die Struktur bei (Absätze, Listen, etc.).
Gib nur den extrahierten Text zurück, ohne Kommentare."""
||||
|
||||
|
||||
class VisionOCRService:
    """
    OCR service using Llama 3.2 Vision via Ollama.

    Runs entirely locally (on the Mac Mini) -- no cloud connection needed.
    Suitable for privacy-compliant (GDPR) exam correction in schools.

    Usage:
        service = VisionOCRService()

        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize Vision OCR Service.

        Args:
            ollama_url: Ollama API URL (default: from config; falls back to
                http://localhost:11434 when no Ollama section is configured)
            model: Vision model to use (default: the configured vision model,
                typically llama3.2-vision:11b)
        """
        config = get_config()
        self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434")
        self.model = model or config.vision_model
        # Lazily created shared HTTP client; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or (re)create the shared HTTP client."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=300.0  # 5 min default timeout for large images
            )
        return self._client

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Check if Ollama is reachable and a vision-capable model is installed."""
        try:
            client = await self._get_client()

            # Check Ollama health via the model listing endpoint.
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )

            if response.status_code != 200:
                return False

            # Check if a vision model is installed.
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]

            # Any model whose name mentions "vision" or "llava" counts,
            # not only the configured self.model.
            has_vision = any(
                "vision" in m.lower() or "llava" in m.lower()
                for m in models
            )

            if not has_vision:
                logger.warning("No vision model found. Available: %s", models)
                return False

            return True

        except Exception as e:
            # Broad catch is deliberate: an availability probe must never raise.
            logger.warning("Vision OCR service not available: %s", e)
            return False

    async def get_status(self) -> dict:
        """Get service status as a plain dict (safe to serialize to JSON)."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.ollama_url}/api/tags")

            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower()
                ]

                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }

        except Exception as e:
            # Status queries report errors instead of raising.
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the Vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            filename: Original filename (used for logging only)
            is_handwriting: True for handwriting, False for printed text

        Returns:
            VisionOCRResult with extracted text

        Raises:
            httpx.TimeoutException: if the model does not answer in time
            httpx.HTTPStatusError: on a non-2xx response from Ollama
        """
        start_time = time.time()

        try:
            client = await self._get_client()

            # Ollama expects images base64-encoded in the message payload.
            image_base64 = base64.b64encode(image_data).decode("utf-8")

            # Select the prompt matching the expected text type.
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT

            # Ollama Vision API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,  # Low temperature for consistent OCR
                    "num_predict": 2048,  # Max tokens for extracted text
                }
            }

            # FIX: log messages previously contained the literal placeholder
            # "(unknown)" instead of the filename the docstring promises.
            logger.info("Sending image to Vision OCR: %s (%d bytes)", filename, len(image_data))

            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min timeout
            )
            response.raise_for_status()

            data = response.json()

            extracted_text = data.get("message", {}).get("content", "")

            processing_time_ms = int((time.time() - start_time) * 1000)

            # Estimate confidence based on response quality (heuristic).
            confidence = self._estimate_confidence(extracted_text)

            logger.info(
                "Vision OCR completed for %s: %d chars in %dms",
                filename, len(extracted_text), processing_time_ms
            )

            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )

        except httpx.TimeoutException:
            logger.error("Vision OCR timed out for %s", filename)
            raise
        except Exception as e:
            logger.error("Vision OCR failed for %s: %s", filename, e)
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic - real confidence would need model output.
        Returns 0.0 for empty output, 0.1 when the output consists only of
        uncertainty markers, otherwise a base of 0.85 reduced by 0.05 per
        marker (capped at -0.3) and by 0.1 for very short text; never
        below 0.1.
        """
        if not text:
            return 0.0

        # Count uncertainty markers the OCR prompt asks the model to emit.
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")

        # Length of the actual text, excluding the markers themselves.
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))

        if text_length == 0:
            return 0.1

        # Base confidence
        confidence = 0.85

        # Reduce for uncertainty markers (at most -0.3 in total).
        confidence -= min(uncertain_markers * 0.05, 0.3)

        # Very short text might be incomplete.
        if text_length < 20:
            confidence -= 0.1

        return max(confidence, 0.1)
||||
|
||||
|
||||
# Process-wide singleton instance, created lazily on first access.
_vision_ocr_service: Optional[VisionOCRService] = None


def get_vision_ocr_service() -> VisionOCRService:
    """Return the Vision OCR service singleton, creating it on first use."""
    global _vision_ocr_service
    if _vision_ocr_service is not None:
        return _vision_ocr_service
    _vision_ocr_service = VisionOCRService()
    return _vision_ocr_service
||||
|
||||
|
||||
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Convenience wrapper: extract handwritten text from a single image.

    Delegates to the shared VisionOCRService singleton, which talks to
    Llama 3.2 Vision through a local Ollama instance -- all processing
    stays on the local machine (privacy/GDPR-compliant by design).

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        VisionOCRResult with extracted text
    """
    return await get_vision_ocr_service().extract_text(
        image_data, filename, is_handwriting=True
    )
||||
Reference in New Issue
Block a user