Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: Sends scan image + OCR word
coordinates + document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -0,0 +1,261 @@
"""
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
The LLM can read degraded text using context understanding and visual inspection,
while OCR coordinates provide structural hints (where text is, column positions).
Uses Ollama API (same pattern as handwriting_htr_api.py).
"""
import base64
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import cv2
import httpx
import numpy as np
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
# Ollama endpoint and vision model name; both overridable via environment.
# Default URL targets the Docker-host gateway (host.docker.internal).
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
# Document category → prompt context
# Maps a document-category key to German prompt fragments: "label" is the
# human-readable document description, "columns" hints at the expected column
# structure. "buchseite" serves as the fallback entry in _build_prompt.
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
def _group_words_into_lines(
words: List[Dict], y_tolerance: float = 15.0,
) -> List[List[Dict]]:
"""Group OCR words into lines by Y-proximity."""
if not words:
return []
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
lines: List[List[Dict]] = [[sorted_w[0]]]
for w in sorted_w[1:]:
last_line = lines[-1]
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
if abs(w["top"] - avg_y) <= y_tolerance:
last_line.append(w)
else:
lines.append([w])
# Sort words within each line by X
for line in lines:
line.sort(key=lambda w: w.get("left", 0))
return lines
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Render OCR words, grouped into visual lines, as prompt-ready text.

    Each output line lists one text line's words with their x-coordinates;
    words below 50 OCR confidence are flagged with " (?)" so the LLM
    re-reads them visually.

    Args:
        words: OCR word dicts ("text", "left", "top", "conf"; missing
            keys default to empty/0).
        img_h: Image height in pixels (kept for interface stability;
            currently unused by the rendering).

    Returns:
        Newline-joined description, e.g. 'Zeile 1 (y~42): x=10 "cat"'.
        Empty string when there are no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Low-confidence marker steers the LLM's attention.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # Use .get consistently with the grouping helper: direct indexing
        # previously raised KeyError for words missing "top".
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Assemble the German Vision-LLM instruction prompt.

    Combines the category-specific document description, the positioned
    OCR context, and the task instructions (correct misread words, merge
    continuation rows, answer as a bare JSON array).
    """
    # Unknown categories fall back to the generic "buchseite" entry.
    info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"])
    parts = [
        f"Du siehst eine eingescannte {info['label']}.",
        info["columns"],
        "Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.",
        "Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:",
        ocr_context,
        f"Bildgroesse: {img_w} x {img_h} Pixel.",
        "AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.",
        "- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst",
        "- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,",
        "gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)",
        "- Behalte die Reihenfolge bei",
        "Antworte NUR mit einem JSON-Array, keine Erklaerungen:",
        "[",
        '{"row": 1, "english": "...", "german": "...", "example": "..."},',
        '{"row": 2, "english": "...", "german": "...", "example": "..."}',
        "]",
    ]
    return "\n".join(parts)
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
"""Parse the LLM JSON response, handling markdown code blocks."""
text = response_text.strip()
# Strip markdown code block if present
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```\s*$", "", text)
# Try to find JSON array
match = re.search(r"\[[\s\S]*\]", text)
if not match:
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
return None
try:
data = json.loads(match.group())
if not isinstance(data, list):
return None
return data
except json.JSONDecodeError as e:
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
return None
def _vocab_rows_to_words(
    rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
    """Convert LLM vocab rows back into positioned word dicts.

    Synthesizes left/top/width/height for every non-empty cell from an
    assumed 3-column layout so the existing grid builder can consume the
    LLM output exactly like ordinary OCR words.
    """
    # Fractional x-spans of the three vocab columns (EN / DE / example).
    col_spans = ((0.02, 0.28), (0.30, 0.55), (0.57, 0.98))
    field_names = ("english", "german", "example")
    col_suffixes = ("en", "de", "example")

    # Estimated word height and vertical step between synthetic rows.
    median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
    y_step = max(median_h + 5, img_h // max(len(rows), 1))

    words: List[Dict] = []
    for idx, row in enumerate(rows):
        y = int(idx * y_step + 20)
        row_num = row.get("row", idx + 1)
        for field, (x_lo, x_hi), suffix in zip(field_names, col_spans, col_suffixes):
            cell_text = (row.get(field) or "").strip()
            if not cell_text:
                continue
            words.append({
                "text": cell_text,
                "left": int(x_lo * img_w),
                "top": y,
                "width": int((x_hi - x_lo) * img_w),
                "height": median_h,
                "conf": 95,  # LLM-corrected → high confidence
                "_source": "vision_llm",
                "_row": row_num,
                "_col_type": f"column_{suffix}",
            })
    logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words")
    return words
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on any error (image encoding,
        transport/HTTP failure, empty or unparseable LLM response).
    """
    img_h, img_w = img_bgr.shape[:2]

    # Build the positioned-OCR context and the full instruction prompt.
    ocr_context = _build_ocr_context(ocr_words, img_h)
    prompt = _build_prompt(ocr_context, document_category, img_w, img_h)

    # Encode image as base64 PNG. cv2.imencode returns (success, buffer);
    # the success flag was previously ignored, which would have base64'd an
    # invalid buffer on failure — fall back to the original OCR instead.
    ok, img_encoded = cv2.imencode(".png", img_bgr)
    if not ok:
        logger.error("vision_fuse_ocr: PNG encoding failed")
        return ocr_words
    img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")

    # Call Qwen2.5-VL via Ollama (non-streaming; low temperature for
    # reproducible table output).
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": OLLAMA_HTR_MODEL,
                    "prompt": prompt,
                    "images": [img_b64],
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            response_text = data.get("response", "").strip()
    except Exception as e:
        # Broad catch is deliberate: any transport/HTTP/JSON failure must
        # degrade to the original OCR rather than break the pipeline.
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original

    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words

    # Parse JSON response
    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words

    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )
    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, img_w, img_h)