"""
OCR Pipeline Auto-Mode Helpers.

VLM shear detection, SSE event formatting, and request models.

License: Apache 2.0
PRIVACY: All processing happens locally.
"""

import json
import logging
import math
import os
import re
from typing import Any, Dict, Optional

from pydantic import BaseModel

logger = logging.getLogger(__name__)

# Value of the "method" key in every shear result. It is a fixed tag and does
# not follow the OLLAMA_HTR_MODEL override.
_VLM_METHOD = "vlm_qwen2.5vl"

# Shear estimates are clamped to [-_MAX_SHEAR_DEGREES, +_MAX_SHEAR_DEGREES].
_MAX_SHEAR_DEGREES = 3.0

# First flat (non-nested) {...} object in the model reply; the reply may carry
# surrounding prose or code fences.
_JSON_OBJECT_RE = re.compile(r'\{[^}]+\}')


class RunAutoRequest(BaseModel):
    """Request body describing how an automatic pipeline run is configured."""

    from_step: int = 1  # 1=deskew, 2=dewarp, 3=columns, 4=rows, 5=words, 6=llm-review
    ocr_engine: str = "auto"  # "auto" | "rapid" | "tesseract"
    pronunciation: str = "british"
    skip_llm_review: bool = False
    dewarp_method: str = "ensemble"  # "ensemble" | "vlm" | "cv"


async def auto_sse_event(step: str, status: str, data: Dict[str, Any]) -> str:
    """Format a single SSE event line.

    Args:
        step: Value stored under the "step" key of the payload.
        status: Value stored under the "status" key of the payload.
        data: Extra keys merged into the payload; a "step" or "status" key
            in here overrides the positional values.

    Returns:
        The JSON-encoded payload as ``data: <json>`` followed by a blank line.
    """
    payload = {"step": step, "status": status, **data}
    return f"data: {json.dumps(payload)}\n\n"


def _vlm_result(shear_degrees: float, confidence: float) -> Dict[str, Any]:
    """Build the result dict returned by the VLM shear detector."""
    return {
        "method": _VLM_METHOD,
        "shear_degrees": shear_degrees,
        "confidence": confidence,
    }


def _parse_shear_response(text: str) -> Optional[Dict[str, Any]]:
    """Extract a clamped shear estimate from the raw model reply.

    Returns:
        The result dict, or None when the reply contains no ``{...}`` object.

    Raises:
        ValueError: If the object is not valid JSON, a value cannot be
            converted to float, or a value is NaN.
        TypeError: If *text* or one of the values has an unsupported type.
    """
    match = _JSON_OBJECT_RE.search(text)
    if match is None:
        return None

    data = json.loads(match.group(0))
    shear = float(data.get("shear_degrees", 0.0))
    conf = float(data.get("confidence", 0.0))

    # json.loads accepts the non-standard NaN token, and min()/max() clamping
    # would turn NaN into the upper bounds (3.0 degrees at confidence 1.0).
    # Reject it so a garbage reply is never reported as a confident result.
    if math.isnan(shear) or math.isnan(conf):
        raise ValueError(f"non-numeric value in VLM reply: {match.group(0)}")

    # Clamp to reasonable range
    shear = max(-_MAX_SHEAR_DEGREES, min(_MAX_SHEAR_DEGREES, shear))
    conf = max(0.0, min(1.0, conf))
    return _vlm_result(round(shear, 3), round(conf, 2))


async def detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Ask a vision-language model to estimate the vertical shear of a scanned page.

    The image is sent to the Ollama ``/api/generate`` endpoint together with a
    prompt asking whether the column/table borders are tilted and, if so, by
    how many degrees. The base URL comes from ``OLLAMA_BASE_URL`` and the model
    from ``OLLAMA_HTR_MODEL`` (default ``qwen2.5vl:32b``).

    Args:
        image_bytes: Encoded image data; it is sent base64-encoded.

    Returns:
        A dict with "method", "shear_degrees" (clamped to +/-3.0, rounded to
        3 decimals) and "confidence" (clamped to 0.0-1.0, rounded to 2
        decimals). Shear and confidence are both 0.0 if the request fails or
        the reply cannot be parsed.
    """
    # Imported lazily, as in the original code; an ImportError here is not
    # caught and propagates to the caller.
    import httpx
    import base64

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )

    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    # Best-effort boundary: any network, HTTP or parsing problem is logged and
    # mapped to the zero-confidence fallback instead of failing the pipeline.
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")

        # Parse JSON from response (may have surrounding text)
        result = _parse_shear_response(text)
        if result is not None:
            return result
    except Exception as e:
        logger.warning("VLM dewarp failed: %s", e)

    return _vlm_result(0.0, 0.0)