Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
New module vision_ocr_fusion.py: Sends scan image + OCR word coordinates + document type to Qwen2.5-VL 32B. The LLM reads the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
|
||||
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
|
||||
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
|
||||
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
|
||||
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
|
||||
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
|
||||
):
|
||||
"""Re-run OCR with quality settings, then rebuild the grid.
|
||||
|
||||
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
|
||||
"word_count": len(merged_words),
|
||||
"raw_paddle_words": rapid_words,
|
||||
}
|
||||
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
|
||||
vision_applied = False
|
||||
if vision_fusion:
|
||||
try:
|
||||
from vision_ocr_fusion import vision_fuse_ocr
|
||||
category = doc_category or session.get("document_category") or "vokabelseite"
|
||||
logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")
|
||||
merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
|
||||
vision_applied = True
|
||||
# Rebuild storage from fused words
|
||||
cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
|
||||
"width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
|
||||
for w in merged_words]
|
||||
word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
|
||||
"word_boxes": cells_for_storage}]
|
||||
word_result["word_count"] = len(merged_words)
|
||||
word_result["ocr_engine"] = "vision_fusion"
|
||||
except Exception as e:
|
||||
logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
|
||||
|
||||
await update_session_db(session_id, word_result=word_result)
|
||||
|
||||
# Reload session with updated word_result
|
||||
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
|
||||
"merged_words": len(merged_words),
|
||||
"min_conf_used": actual_min_conf,
|
||||
"enhance_applied": enhance and is_degraded,
|
||||
"vision_fusion_applied": vision_applied,
|
||||
"document_category": doc_category or session.get("document_category", ""),
|
||||
"ocr_duration_seconds": round(ocr_duration, 1),
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user