Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: Sends scan image + OCR word
coordinates + document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -28,10 +28,14 @@ export function useGridEditor(sessionId: string | null) {
const [ipaMode, setIpaMode] = useState<IpaMode>('auto')
const [syllableMode, setSyllableMode] = useState<SyllableMode>('auto')
// OCR Quality Steps (A/B testing toggles)
const [ocrEnhance, setOcrEnhance] = useState(true)
const [ocrMaxCols, setOcrMaxCols] = useState(0) // 0 = unlimited (admin pipeline default)
const [ocrMinConf, setOcrMinConf] = useState(0) // 0 = auto from quality score
// OCR Quality Steps (A/B testing toggles — defaults off for now)
const [ocrEnhance, setOcrEnhance] = useState(false)
const [ocrMaxCols, setOcrMaxCols] = useState(0)
const [ocrMinConf, setOcrMinConf] = useState(0)
// Vision-LLM Fusion (Step 4)
const [visionFusion, setVisionFusion] = useState(false)
const [documentCategory, setDocumentCategory] = useState('vokabelseite')
// Undo/redo stacks store serialized zone arrays
const undoStack = useRef<string[]>([])
@@ -92,6 +96,8 @@ export function useGridEditor(sessionId: string | null) {
params.set('enhance', String(ocrEnhance))
if (ocrMaxCols > 0) params.set('max_cols', String(ocrMaxCols))
if (ocrMinConf > 0) params.set('min_conf', String(ocrMinConf))
params.set('vision_fusion', String(visionFusion))
if (documentCategory) params.set('doc_category', documentCategory)
const res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/rerun-ocr-and-build-grid?${params}`,
{ method: 'POST' },
@@ -110,7 +116,7 @@ export function useGridEditor(sessionId: string | null) {
} finally {
setLoading(false)
}
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf])
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf, visionFusion, documentCategory])
const loadGrid = useCallback(async () => {
if (!sessionId) return
@@ -1030,6 +1036,10 @@ export function useGridEditor(sessionId: string | null) {
setOcrMaxCols,
ocrMinConf,
setOcrMinConf,
visionFusion,
setVisionFusion,
documentCategory,
setDocumentCategory,
rerunOcr,
}
}

View File

@@ -67,6 +67,10 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
setOcrMaxCols,
ocrMinConf,
setOcrMinConf,
visionFusion,
setVisionFusion,
documentCategory,
setDocumentCategory,
rerunOcr,
} = useGridEditor(sessionId)
@@ -291,6 +295,22 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
</select>
</label>
<span className="text-gray-400 dark:text-gray-500">|</span>
<label className="flex items-center gap-1 cursor-pointer" title="Step 4: Vision-LLM Fusion — Qwen2.5-VL korrigiert OCR anhand des Bildes">
<input type="checkbox" checked={visionFusion} onChange={(e) => setVisionFusion(e.target.checked)} className="rounded w-3 h-3 accent-orange-500" />
<span className={`${visionFusion ? 'text-orange-500 dark:text-orange-400 font-medium' : 'text-gray-500 dark:text-gray-400'}`}>Vision-LLM</span>
</label>
<label className="flex items-center gap-1" title="Dokumenttyp fuer Vision-LLM Prompt">
<span className="text-gray-500 dark:text-gray-400">Typ:</span>
<select value={documentCategory} onChange={(e) => setDocumentCategory(e.target.value)} className="px-1 py-0.5 text-xs rounded border border-gray-200 dark:border-gray-600 bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-300">
<option value="vokabelseite">Vokabelseite</option>
<option value="woerterbuch">Woerterbuch</option>
<option value="arbeitsblatt">Arbeitsblatt</option>
<option value="buchseite">Buchseite</option>
<option value="sonstiges">Sonstiges</option>
</select>
</label>
<div className="ml-auto flex items-center gap-2">
<button
onClick={() => {

View File

@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
"""Re-run OCR with quality settings, then rebuild the grid.
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
"word_count": len(merged_words),
"raw_paddle_words": rapid_words,
}
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
vision_applied = False
if vision_fusion:
try:
from vision_ocr_fusion import vision_fuse_ocr
category = doc_category or session.get("document_category") or "vokabelseite"
logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")
merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
vision_applied = True
# Rebuild storage from fused words
cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
"width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
for w in merged_words]
word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
"word_boxes": cells_for_storage}]
word_result["word_count"] = len(merged_words)
word_result["ocr_engine"] = "vision_fusion"
except Exception as e:
logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
await update_session_db(session_id, word_result=word_result)
# Reload session with updated word_result
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
"merged_words": len(merged_words),
"min_conf_used": actual_min_conf,
"enhance_applied": enhance and is_degraded,
"vision_fusion_applied": vision_applied,
"document_category": doc_category or session.get("document_category", ""),
"ocr_duration_seconds": round(ocr_duration, 1),
}

View File

@@ -0,0 +1,261 @@
"""
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
The LLM can read degraded text using context understanding and visual inspection,
while OCR coordinates provide structural hints (where text is, column positions).
Uses Ollama API (same pattern as handwriting_htr_api.py).
"""
import base64
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import cv2
import httpx
import numpy as np
logger = logging.getLogger(__name__)
# Ollama endpoint; host.docker.internal reaches the host daemon from inside a container.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
# Vision model to call — shares the HTR env var; defaults to Qwen2.5-VL 32B.
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

# Document category → prompt context
# Per-category German text fragments injected into the Vision-LLM prompt:
# "label" names the document type, "columns" describes the expected layout.
# NOTE(review): the frontend also offers "sonstiges", which has no entry here;
# _build_prompt falls back to "buchseite" for unknown keys — confirm intended.
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
def _group_words_into_lines(
words: List[Dict], y_tolerance: float = 15.0,
) -> List[List[Dict]]:
"""Group OCR words into lines by Y-proximity."""
if not words:
return []
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
lines: List[List[Dict]] = [[sorted_w[0]]]
for w in sorted_w[1:]:
last_line = lines[-1]
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
if abs(w["top"] - avg_y) <= y_tolerance:
last_line.append(w)
else:
lines.append([w])
# Sort words within each line by X
for line in lines:
line.sort(key=lambda w: w.get("left", 0))
return lines
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Render OCR words as a positional text description for the prompt.

    Groups words into visual lines, then emits one
    ``Zeile N (y~Y): x=… "text"`` entry per line. Words with confidence
    below 50 are tagged ``(?)`` so the LLM re-reads them visually.

    Args:
        words: OCR word dicts (``text``, ``left``, ``top``, ``conf``).
        img_h: Image height in pixels (kept for interface stability;
            currently unused in this function).

    Returns:
        Newline-joined line descriptions; empty string when no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Mark low-confidence words so the LLM pays extra attention.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # .get keeps this consistent with the rest of the module and avoids
        # a KeyError for words lacking a "top" coordinate.
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Assemble the German instruction prompt for the Vision-LLM.

    Combines the document-type description from CATEGORY_PROMPTS with the
    positional OCR context and the image dimensions. Unknown categories
    fall back to the generic "buchseite" entry.
    """
    fallback = CATEGORY_PROMPTS["buchseite"]
    cat_info = CATEGORY_PROMPTS.get(category, fallback)
    label = cat_info["label"]
    columns = cat_info["columns"]
    return f"""Du siehst eine eingescannte {label}.
{columns}
Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.
Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:
{ocr_context}
Bildgroesse: {img_w} x {img_h} Pixel.
AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.
- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst
- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,
  gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)
- Behalte die Reihenfolge bei
Antworte NUR mit einem JSON-Array, keine Erklaerungen:
[
  {{"row": 1, "english": "...", "german": "...", "example": "..."}},
  {{"row": 2, "english": "...", "german": "...", "example": "..."}}
]"""
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
"""Parse the LLM JSON response, handling markdown code blocks."""
text = response_text.strip()
# Strip markdown code block if present
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```\s*$", "", text)
# Try to find JSON array
match = re.search(r"\[[\s\S]*\]", text)
if not match:
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
return None
try:
data = json.loads(match.group())
if not isinstance(data, list):
return None
return data
except json.JSONDecodeError as e:
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
return None
def _vocab_rows_to_words(
rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
"""Convert LLM vocab rows back to word dicts for grid building.
Distributes words across estimated column positions so the
existing grid builder can process them normally.
"""
words = []
# Estimate column positions (3-column vocab layout)
col_positions = [
(0.02, 0.28), # EN: 2%-28% of width
(0.30, 0.55), # DE: 30%-55%
(0.57, 0.98), # Example: 57%-98%
]
median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
y_step = max(median_h + 5, img_h // max(len(rows), 1))
for i, row in enumerate(rows):
y = int(i * y_step + 20)
row_num = row.get("row", i + 1)
for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([
("english", col_positions[0]),
("german", col_positions[1]),
("example", col_positions[2]),
]):
text = (row.get(field) or "").strip()
if not text:
continue
x = int(x_start_pct * img_w)
w = int((x_end_pct - x_start_pct) * img_w)
words.append({
"text": text,
"left": x,
"top": y,
"width": w,
"height": median_h,
"conf": 95, # LLM-corrected → high confidence
"_source": "vision_llm",
"_row": row_num,
"_col_type": f"column_{['en', 'de', 'example'][col_idx]}",
})
logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words")
return words
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on error.
    """
    img_h, img_w = img_bgr.shape[:2]
    # Describe the OCR words (positions + low-confidence markers) so the
    # LLM can use them as structural hints while reading the image.
    ocr_context = _build_ocr_context(ocr_words, img_h)
    prompt = _build_prompt(ocr_context, document_category, img_w, img_h)
    # Encode image as base64 PNG for the Ollama API. imencode signals
    # failure via its first return value — previously discarded, which
    # would have sent an invalid payload instead of falling back.
    ok, img_encoded = cv2.imencode(".png", img_bgr)
    if not ok:
        logger.error("vision_fuse_ocr: PNG encoding failed")
        return ocr_words  # Fallback to original
    img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
    # Call Qwen2.5-VL via Ollama (generous timeout for a 32B vision model).
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": OLLAMA_HTR_MODEL,
                    "prompt": prompt,
                    "images": [img_b64],
                    "stream": False,
                    # Low temperature: faithful transcription, not creativity.
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            response_text = data.get("response", "").strip()
    except Exception as e:
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original
    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words
    # Parse JSON response; None/empty → keep original OCR words.
    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words
    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )
    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, img_w, img_h)