Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: sends the scan image, the OCR word
coordinates, and the document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using the OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

To activate: check "Vision-LLM", select a document type, then click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
"""Re-run OCR with quality settings, then rebuild the grid.
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
"word_count": len(merged_words),
"raw_paddle_words": rapid_words,
}
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
# Best-effort post-processing step: on any failure we keep the original
# OCR result (merged_words / word_result are left untouched by the except).
vision_applied = False  # reported back to the caller as "vision_fusion_applied"
if vision_fusion:
    try:
        # Imported lazily so the endpoint works even if the optional
        # vision_ocr_fusion module (and its model backend) is unavailable.
        from vision_ocr_fusion import vision_fuse_ocr

        # Category priority: explicit query param > value stored on the
        # session > "vokabelseite" as the default prompt context.
        category = doc_category or session.get("document_category") or "vokabelseite"
        logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")

        # NOTE(review): assumes vision_fuse_ocr returns the same word-dict
        # shape (text/left/top/width/height/conf) it receives — confirm
        # against vision_ocr_fusion.py.
        merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
        vision_applied = True

        # Rebuild storage from fused words
        cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
                              "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
                             for w in merged_words]
        # All fused words are collapsed into a single cell whose text is the
        # space-joined page content; per-word geometry lives in word_boxes.
        word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                 "word_boxes": cells_for_storage}]
        word_result["word_count"] = len(merged_words)
        # Mark the result so downstream consumers can tell fused output
        # apart from plain OCR engine output.
        word_result["ocr_engine"] = "vision_fusion"
    except Exception as e:
        # Deliberate broad catch: fusion is an enhancement, never a hard
        # dependency — log and fall back to the unmodified OCR result.
        logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
await update_session_db(session_id, word_result=word_result)
# Reload session with updated word_result
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
"merged_words": len(merged_words),
"min_conf_used": actual_min_conf,
"enhance_applied": enhance and is_degraded,
"vision_fusion_applied": vision_applied,
"document_category": doc_category or session.get("document_category", ""),
"ocr_duration_seconds": round(ocr_duration, 1),
}