Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
New module vision_ocr_fusion.py: Sends scan image + OCR word coordinates + document type to Qwen2.5-VL 32B. The LLM reads the image visually while using OCR positions as structural hints. Key features: - Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.) - OCR words grouped into lines with x/y coordinates in prompt - Low-confidence words marked with (?) for LLM attention - Continuation row merging instructions in prompt - JSON response parsing with markdown code block handling - Fallback to original OCR on any error Frontend (admin-lehrer Grid Review): - "Vision-LLM" checkbox toggle - "Typ" dropdown (Vokabelseite, Woerterbuch, etc.) - Steps 1-3 defaults set to inactive Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
262 lines
8.7 KiB
Python
262 lines
8.7 KiB
Python
"""
|
|
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
|
|
|
|
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
|
|
The LLM can read degraded text using context understanding and visual inspection,
|
|
while OCR coordinates provide structural hints (where text is, column positions).
|
|
|
|
Uses Ollama API (same pattern as handwriting_htr_api.py).
|
|
"""
|
|
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import httpx
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Ollama endpoint and model are configurable via environment variables; the
# defaults match a Docker container talking to Ollama running on the host.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

# Document category → prompt context
# Maps a document-type key to German prompt fragments: "label" names the
# document type, "columns" hints at its expected column structure.
# Unknown categories fall back to "buchseite" (see _build_prompt).
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
|
|
|
|
|
|
def _group_words_into_lines(
|
|
words: List[Dict], y_tolerance: float = 15.0,
|
|
) -> List[List[Dict]]:
|
|
"""Group OCR words into lines by Y-proximity."""
|
|
if not words:
|
|
return []
|
|
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
|
|
lines: List[List[Dict]] = [[sorted_w[0]]]
|
|
for w in sorted_w[1:]:
|
|
last_line = lines[-1]
|
|
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
|
|
if abs(w["top"] - avg_y) <= y_tolerance:
|
|
last_line.append(w)
|
|
else:
|
|
lines.append([w])
|
|
# Sort words within each line by X
|
|
for line in lines:
|
|
line.sort(key=lambda w: w.get("left", 0))
|
|
return lines
|
|
|
|
|
|
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Build a text description of OCR words with positions for the prompt.

    Each OCR line becomes one "Zeile N (y~Y): x=.. "word", ..." row; words
    with confidence below 50 are tagged " (?)" so the LLM pays extra
    attention to them.

    Args:
        words: OCR word dicts with "text", "left", "top", "conf".
        img_h: Image height in pixels.  Currently unused; kept so the
            signature stays stable for callers (vision_fuse_ocr passes it).

    Returns:
        Newline-joined line descriptions; empty string for no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Low-confidence marker steers the LLM toward re-reading the word.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # Fix: .get with default, consistent with the grouping helper —
        # the original indexed w["top"] and could raise KeyError.
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
|
|
|
|
|
|
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Build the Vision-LLM prompt with OCR context and document type.

    Args:
        ocr_context: Per-line word/position summary from _build_ocr_context.
        category: Key into CATEGORY_PROMPTS selecting the document type.
        img_w: Image width in pixels, quoted in the prompt.
        img_h: Image height in pixels, quoted in the prompt.

    Returns:
        A German-language instruction prompt that asks the model to answer
        with ONLY a JSON array of rows (row/english/german/example).
    """
    # Unknown categories fall back to the generic "buchseite" context.
    cat_info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"])

    # NOTE: doubled braces ({{ }}) render as literal braces in the JSON example.
    return f"""Du siehst eine eingescannte {cat_info['label']}.
{cat_info['columns']}

Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.
Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:

{ocr_context}

Bildgroesse: {img_w} x {img_h} Pixel.

AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.
- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst
- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,
gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)
- Behalte die Reihenfolge bei

Antworte NUR mit einem JSON-Array, keine Erklaerungen:
[
{{"row": 1, "english": "...", "german": "...", "example": "..."}},
{{"row": 2, "english": "...", "german": "...", "example": "..."}}
]"""
|
|
|
|
|
|
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
|
|
"""Parse the LLM JSON response, handling markdown code blocks."""
|
|
text = response_text.strip()
|
|
|
|
# Strip markdown code block if present
|
|
if text.startswith("```"):
|
|
text = re.sub(r"^```(?:json)?\s*", "", text)
|
|
text = re.sub(r"\s*```\s*$", "", text)
|
|
|
|
# Try to find JSON array
|
|
match = re.search(r"\[[\s\S]*\]", text)
|
|
if not match:
|
|
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
|
|
return None
|
|
|
|
try:
|
|
data = json.loads(match.group())
|
|
if not isinstance(data, list):
|
|
return None
|
|
return data
|
|
except json.JSONDecodeError as e:
|
|
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
|
|
return None
|
|
|
|
|
|
def _vocab_rows_to_words(
    rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
    """Convert LLM vocab rows back to word dicts for grid building.

    Distributes words across estimated column positions so the
    existing grid builder can process them normally.

    Args:
        rows: Parsed LLM rows with "row"/"english"/"german"/"example" fields.
        img_w: Source image width in pixels (column X estimates).
        img_h: Source image height in pixels (row Y spacing).

    Returns:
        Word dicts shaped like OCR output ("text", "left", "top", "width",
        "height", "conf"), tagged "_source": "vision_llm" plus "_row" and
        "_col_type" for downstream tracing.
    """
    words: List[Dict] = []
    # Estimated 3-column vocab layout as fractions of the image width.
    col_positions = [
        (0.02, 0.28),  # EN: 2%-28% of width
        (0.30, 0.55),  # DE: 30%-55%
        (0.57, 0.98),  # Example: 57%-98%
    ]

    # Synthetic word height and vertical spacing so rows keep their order
    # when sorted by Y later.
    median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
    y_step = max(median_h + 5, img_h // max(len(rows), 1))

    for i, row in enumerate(rows):
        y = int(i * y_step + 20)
        row_num = row.get("row", i + 1)

        for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([
            ("english", col_positions[0]),
            ("german", col_positions[1]),
            ("example", col_positions[2]),
        ]):
            # Fix: str() guards against the LLM returning a number/boolean
            # for a field — the original crashed on non-str via .strip().
            text = str(row.get(field) or "").strip()
            if not text:
                continue
            x = int(x_start_pct * img_w)
            w = int((x_end_pct - x_start_pct) * img_w)
            words.append({
                "text": text,
                "left": x,
                "top": y,
                "width": w,
                "height": median_h,
                "conf": 95,  # LLM-corrected → high confidence
                "_source": "vision_llm",
                "_row": row_num,
                "_col_type": f"column_{['en', 'de', 'example'][col_idx]}",
            })

    # Lazy %-style args avoid f-string work when INFO is disabled.
    logger.info(
        "vision_fuse_ocr: converted %d LLM rows → %d words", len(rows), len(words)
    )
    return words
|
|
|
|
|
|
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on error.
    """
    height, width = img_bgr.shape[:2]

    # Assemble the prompt from the positional OCR summary + document type.
    llm_prompt = _build_prompt(
        _build_ocr_context(ocr_words, height), document_category, width, height,
    )

    # PNG-encode the scan and wrap it as base64 for the Ollama API.
    _ok, png_buf = cv2.imencode(".png", img_bgr)
    image_b64 = base64.b64encode(png_buf.tobytes()).decode("utf-8")

    payload = {
        "model": OLLAMA_HTR_MODEL,
        "prompt": llm_prompt,
        "images": [image_b64],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 4096},
    }

    # Call Qwen2.5-VL via Ollama; any failure falls back to the raw OCR.
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate", json=payload,
            )
            resp.raise_for_status()
            response_text = resp.json().get("response", "").strip()
    except Exception as e:
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original

    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words

    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words

    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )

    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, width, height)
|