This repository was archived on 2026-02-15. You can view and clone its files, but you cannot open issues, create pull requests, or push commits.
Files
breakpilot-pwa/backend/klausur/services/vision_ocr_service.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

310 lines
9.6 KiB
Python

"""
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
DATENSCHUTZ/PRIVACY BY DESIGN:
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
- Keine Daten verlassen das lokale Netzwerk
- Keine Cloud-APIs beteiligt
- Perfekt für DSGVO-konforme Schulumgebungen
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
"""
import os
import base64
import httpx
import logging
import time
from typing import Optional
from dataclasses import dataclass
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class VisionOCRResult:
    """Result from Vision-LLM OCR extraction."""

    text: str                   # transcribed text as returned by the model (stripped)
    confidence: float           # heuristic confidence; 0.0 for empty output (see _estimate_confidence)
    processing_time_ms: int     # wall-clock duration of the extraction request
    model: str = "llama3.2-vision:11b"  # Ollama model tag that produced the result
    device: str = "local-ollama"        # marker that processing stayed on the local host
# OCR system prompt tuned for handwriting recognition.
# NOTE: the prompt body is runtime model input and intentionally German
# (matches the target documents); do not translate it.
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).
AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.
WICHTIGE REGELN:
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
4. Ignoriere Linien, Kästchen und andere Formatierungen
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)
AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""
# Alternative prompt for printed (non-handwritten) text.
# NOTE: runtime model input, intentionally German; do not translate.
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
Behalte die Struktur bei (Absätze, Listen, etc.).
Gib nur den extrahierten Text zurück, ohne Kommentare."""
class VisionOCRService:
    """
    OCR service using Llama 3.2 Vision via Ollama.

    Runs entirely locally on the Mac Mini -- no cloud connection needed.
    Intended for privacy-compliant (GDPR/DSGVO) exam correction in schools.

    Usage:
        service = VisionOCRService()
        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the Vision OCR service.

        Args:
            ollama_url: Ollama API URL (default: from config)
            model: Vision model to use (default: llama3.2-vision:11b)
        """
        config = get_config()
        # Fall back to the standard local Ollama endpoint when no config entry exists.
        self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434")
        self.model = model or config.vision_model
        # HTTP client is created lazily and reused across requests.
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or (re)create the shared async HTTP client."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=300.0  # 5 min timeout for large images
            )
        return self._client

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Check whether Ollama is reachable and a vision-capable model is installed."""
        try:
            client = await self._get_client()
            # Check Ollama health via the model-list endpoint.
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )
            if response.status_code != 200:
                return False
            # Check whether ANY vision-capable model is installed (by tag name).
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]
            has_vision = any(
                "vision" in m.lower() or "llava" in m.lower()
                for m in models
            )
            if not has_vision:
                logger.warning("No vision model found. Available: %s", models)
                return False
            return True
        except Exception as e:
            # Best-effort availability probe: any failure means "not available".
            logger.warning("Vision OCR service not available: %s", e)
            return False

    async def get_status(self) -> dict:
        """Return a status dict: availability, configured model, installed vision models."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.ollama_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower()
                ]
                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }
        except Exception as e:
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the Vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            filename: Original filename (for logging)
            is_handwriting: True for handwriting, False for printed text

        Returns:
            VisionOCRResult with extracted text

        Raises:
            httpx.TimeoutException: if the model does not answer in time.
            httpx.HTTPStatusError: on non-2xx responses from Ollama.
        """
        start_time = time.time()
        try:
            client = await self._get_client()
            # Ollama expects images as base64-encoded strings.
            image_base64 = base64.b64encode(image_data).decode("utf-8")
            # Select the prompt matching the document type.
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT
            # Ollama Vision API request.
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,   # low temperature for consistent OCR
                    "num_predict": 2048,  # max tokens for extracted text
                }
            }
            # FIX: log messages previously contained the literal "(unknown)"
            # instead of interpolating the `filename` argument.
            logger.info("Sending image to Vision OCR: %s (%d bytes)", filename, len(image_data))
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min timeout per request
            )
            response.raise_for_status()
            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")
            processing_time_ms = int((time.time() - start_time) * 1000)
            # Heuristic confidence estimate (Ollama does not report one).
            confidence = self._estimate_confidence(extracted_text)
            logger.info(
                "Vision OCR completed for %s: %d chars in %dms",
                filename, len(extracted_text), processing_time_ms
            )
            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )
        except httpx.TimeoutException:
            logger.error("Vision OCR timed out for %s", filename)
            raise
        except Exception as e:
            logger.error("Vision OCR failed for %s: %s", filename, e)
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic -- real confidence would need model output.
        Returns 0.0 for empty text, otherwise a value clamped to [0.1, 0.85].
        """
        if not text:
            return 0.0
        # Count the uncertainty markers the prompt asks the model to emit.
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")
        # Length of actual transcribed text, with markers removed.
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))
        if text_length == 0:
            return 0.1
        # Base confidence.
        confidence = 0.85
        # Penalize uncertainty markers, capped at -0.3 total.
        confidence -= min(uncertain_markers * 0.05, 0.3)
        # Very short text might be incomplete.
        if text_length < 20:
            confidence -= 0.1
        return max(confidence, 0.1)
# Module-level singleton so all callers share one service (and its HTTP client).
_vision_ocr_service: Optional[VisionOCRService] = None


def get_vision_ocr_service() -> VisionOCRService:
    """Return the process-wide Vision OCR service, creating it on first use."""
    global _vision_ocr_service
    if _vision_ocr_service is not None:
        return _vision_ocr_service
    _vision_ocr_service = VisionOCRService()
    return _vision_ocr_service
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Convenience wrapper: run handwriting OCR on a single image.

    Delegates to the shared VisionOCRService singleton, which uses
    Llama 3.2 Vision via a local Ollama instance -- all processing stays
    on the local Mac Mini (DSGVO-konform).

    Args:
        image_data: Raw image bytes.
        filename: Original filename (used for logging).

    Returns:
        VisionOCRResult with the extracted text.
    """
    service = get_vision_ocr_service()
    return await service.extract_text(
        image_data, filename, is_handwriting=True
    )