[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
klausur-service/backend/ocr_pipeline_auto_helpers.py (new file, 84 lines)
@@ -0,0 +1,84 @@
"""
OCR Pipeline Auto-Mode Helpers.

VLM shear detection, SSE event formatting, and request models.

License: Apache 2.0
PRIVACY: All processing happens locally.
"""

import json
import logging
import os
import re
from typing import Any, Dict

from pydantic import BaseModel

logger = logging.getLogger(__name__)


class RunAutoRequest(BaseModel):
    from_step: int = 1  # 1=deskew, 2=dewarp, 3=columns, 4=rows, 5=words, 6=llm-review
    ocr_engine: str = "auto"  # "auto" | "rapid" | "tesseract"
    pronunciation: str = "british"
    skip_llm_review: bool = False
    dewarp_method: str = "ensemble"  # "ensemble" | "vlm" | "cv"
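

# Illustrative sketch, not part of this commit: the string fields above accept
# any value. A Literal-typed subclass (hypothetical name) would make Pydantic
# reject unknown engines/methods at parse time instead of mid-pipeline.
from typing import Literal


class StrictRunAutoRequest(RunAutoRequest):
    ocr_engine: Literal["auto", "rapid", "tesseract"] = "auto"
    dewarp_method: Literal["ensemble", "vlm", "cv"] = "ensemble"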


async def auto_sse_event(step: str, status: str, data: Dict[str, Any]) -> str:
    """Format a single SSE event line."""
    payload = {"step": step, "status": status, **data}
    return f"data: {json.dumps(payload)}\n\n"
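

# Illustrative sketch, not part of this commit: how a route could stream these
# events. FastAPI is assumed (the service exposes routes elsewhere); the
# router, path, and step payloads here are made up.
from fastapi import APIRouter
from fastapi.responses import StreamingResponse

sketch_router = APIRouter()


@sketch_router.post("/ocr/run-auto-sketch")
async def run_auto_sketch(req: RunAutoRequest):
    async def stream():
        yield await auto_sse_event("deskew", "started", {"from_step": req.from_step})
        yield await auto_sse_event("deskew", "done", {"angle_degrees": 0.4})

    # "text/event-stream" lets an EventSource client consume each "data:" line.
    return StreamingResponse(stream(), media_type="text/event-stream")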


async def detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Ask qwen2.5vl:32b to estimate the vertical shear angle of a scanned page.

    The VLM is shown the image and asked: are the column/table borders tilted?
    If yes, by how many degrees? Returns a dict with shear_degrees and confidence.
    Confidence is 0.0 if Ollama is unavailable or parsing fails.
    """
    import httpx
    import base64

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )

    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")

        # Parse JSON from response (may have surrounding text)
        match = re.search(r'\{[^}]+\}', text)
        if match:
            data = json.loads(match.group(0))
            shear = float(data.get("shear_degrees", 0.0))
            conf = float(data.get("confidence", 0.0))
            # Clamp to reasonable range
            shear = max(-3.0, min(3.0, shear))
            conf = max(0.0, min(1.0, conf))
            return {"method": "vlm_qwen2.5vl", "shear_degrees": round(shear, 3), "confidence": round(conf, 2)}
    except Exception as e:
        logger.warning(f"VLM dewarp failed: {e}")

    return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}