Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: sends the scan image, the OCR word
coordinates, and the document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using the OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

To activate: check "Vision-LLM", select a document type, then click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
"""Re-run OCR with quality settings, then rebuild the grid.
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
"word_count": len(merged_words),
"raw_paddle_words": rapid_words,
}
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
# Best-effort post-processing step: on any failure we keep the original
# OCR result (merged_words / word_result are left untouched by the except).
vision_applied = False  # reported back to the caller as "vision_fusion_applied"
if vision_fusion:
    try:
        # Imported lazily so the endpoint works even if the optional
        # vision_ocr_fusion module (and its model backend) is unavailable.
        from vision_ocr_fusion import vision_fuse_ocr

        # Category priority: explicit query param > value stored on the
        # session > "vokabelseite" as the default prompt context.
        category = doc_category or session.get("document_category") or "vokabelseite"
        logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")

        # NOTE(review): assumes vision_fuse_ocr returns the same word-dict
        # shape (text/left/top/width/height/conf) it receives — confirm
        # against vision_ocr_fusion.py.
        merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
        vision_applied = True

        # Rebuild storage from fused words
        cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
                              "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
                             for w in merged_words]
        # All fused words are collapsed into a single cell whose text is the
        # space-joined page content; per-word geometry lives in word_boxes.
        word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                 "word_boxes": cells_for_storage}]
        word_result["word_count"] = len(merged_words)
        # Mark the result so downstream consumers can tell fused output
        # apart from plain OCR engine output.
        word_result["ocr_engine"] = "vision_fusion"
    except Exception as e:
        # Deliberate broad catch: fusion is an enhancement, never a hard
        # dependency — log and fall back to the unmodified OCR result.
        logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
await update_session_db(session_id, word_result=word_result)
# Reload session with updated word_result
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
"merged_words": len(merged_words),
"min_conf_used": actual_min_conf,
"enhance_applied": enhance and is_degraded,
"vision_fusion_applied": vision_applied,
"document_category": doc_category or session.get("document_category", ""),
"ocr_duration_seconds": round(ocr_duration, 1),
}