Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: Sends scan image + OCR word
coordinates + document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -28,10 +28,14 @@ export function useGridEditor(sessionId: string | null) {
const [ipaMode, setIpaMode] = useState<IpaMode>('auto')
const [syllableMode, setSyllableMode] = useState<SyllableMode>('auto')
// OCR Quality Steps (A/B testing toggles)
const [ocrEnhance, setOcrEnhance] = useState(true)
const [ocrMaxCols, setOcrMaxCols] = useState(0) // 0 = unlimited (admin pipeline default)
const [ocrMinConf, setOcrMinConf] = useState(0) // 0 = auto from quality score
// OCR Quality Steps (A/B testing toggles — defaults off for now)
const [ocrEnhance, setOcrEnhance] = useState(false)
const [ocrMaxCols, setOcrMaxCols] = useState(0)
const [ocrMinConf, setOcrMinConf] = useState(0)
// Vision-LLM Fusion (Step 4)
const [visionFusion, setVisionFusion] = useState(false)
const [documentCategory, setDocumentCategory] = useState('vokabelseite')
// Undo/redo stacks store serialized zone arrays
const undoStack = useRef<string[]>([])
@@ -92,6 +96,8 @@ export function useGridEditor(sessionId: string | null) {
params.set('enhance', String(ocrEnhance))
if (ocrMaxCols > 0) params.set('max_cols', String(ocrMaxCols))
if (ocrMinConf > 0) params.set('min_conf', String(ocrMinConf))
params.set('vision_fusion', String(visionFusion))
if (documentCategory) params.set('doc_category', documentCategory)
const res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/rerun-ocr-and-build-grid?${params}`,
{ method: 'POST' },
@@ -110,7 +116,7 @@ export function useGridEditor(sessionId: string | null) {
} finally {
setLoading(false)
}
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf])
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf, visionFusion, documentCategory])
const loadGrid = useCallback(async () => {
if (!sessionId) return
@@ -1030,6 +1036,10 @@ export function useGridEditor(sessionId: string | null) {
setOcrMaxCols,
ocrMinConf,
setOcrMinConf,
visionFusion,
setVisionFusion,
documentCategory,
setDocumentCategory,
rerunOcr,
}
}

View File

@@ -67,6 +67,10 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
setOcrMaxCols,
ocrMinConf,
setOcrMinConf,
visionFusion,
setVisionFusion,
documentCategory,
setDocumentCategory,
rerunOcr,
} = useGridEditor(sessionId)
@@ -291,6 +295,22 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
</select>
</label>
<span className="text-gray-400 dark:text-gray-500">|</span>
<label className="flex items-center gap-1 cursor-pointer" title="Step 4: Vision-LLM Fusion — Qwen2.5-VL korrigiert OCR anhand des Bildes">
<input type="checkbox" checked={visionFusion} onChange={(e) => setVisionFusion(e.target.checked)} className="rounded w-3 h-3 accent-orange-500" />
<span className={`${visionFusion ? 'text-orange-500 dark:text-orange-400 font-medium' : 'text-gray-500 dark:text-gray-400'}`}>Vision-LLM</span>
</label>
<label className="flex items-center gap-1" title="Dokumenttyp fuer Vision-LLM Prompt">
<span className="text-gray-500 dark:text-gray-400">Typ:</span>
<select value={documentCategory} onChange={(e) => setDocumentCategory(e.target.value)} className="px-1 py-0.5 text-xs rounded border border-gray-200 dark:border-gray-600 bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-300">
<option value="vokabelseite">Vokabelseite</option>
<option value="woerterbuch">Woerterbuch</option>
<option value="arbeitsblatt">Arbeitsblatt</option>
<option value="buchseite">Buchseite</option>
<option value="sonstiges">Sonstiges</option>
</select>
</label>
<div className="ml-auto flex items-center gap-2">
<button
onClick={() => {

View File

@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
"""Re-run OCR with quality settings, then rebuild the grid.
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
"word_count": len(merged_words),
"raw_paddle_words": rapid_words,
}
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
vision_applied = False
if vision_fusion:
try:
from vision_ocr_fusion import vision_fuse_ocr
category = doc_category or session.get("document_category") or "vokabelseite"
logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")
merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
vision_applied = True
# Rebuild storage from fused words
cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
"width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
for w in merged_words]
word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
"word_boxes": cells_for_storage}]
word_result["word_count"] = len(merged_words)
word_result["ocr_engine"] = "vision_fusion"
except Exception as e:
logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
await update_session_db(session_id, word_result=word_result)
# Reload session with updated word_result
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
"merged_words": len(merged_words),
"min_conf_used": actual_min_conf,
"enhance_applied": enhance and is_degraded,
"vision_fusion_applied": vision_applied,
"document_category": doc_category or session.get("document_category", ""),
"ocr_duration_seconds": round(ocr_duration, 1),
}

View File

@@ -0,0 +1,261 @@
"""
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
The LLM can read degraded text using context understanding and visual inspection,
while OCR coordinates provide structural hints (where text is, column positions).
Uses Ollama API (same pattern as handwriting_htr_api.py).
"""
import base64
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import cv2
import httpx
import numpy as np
logger = logging.getLogger(__name__)
# Ollama endpoint; host.docker.internal reaches the host daemon from inside a container.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
# Vision model to call — shares the HTR env var; defaults to Qwen2.5-VL 32B.
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

# Document category → prompt context
# Per-category German text fragments injected into the Vision-LLM prompt:
# "label" names the document type, "columns" describes the expected layout.
# NOTE(review): the frontend also offers "sonstiges", which has no entry here;
# _build_prompt falls back to "buchseite" for unknown keys — confirm intended.
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
def _group_words_into_lines(
words: List[Dict], y_tolerance: float = 15.0,
) -> List[List[Dict]]:
"""Group OCR words into lines by Y-proximity."""
if not words:
return []
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
lines: List[List[Dict]] = [[sorted_w[0]]]
for w in sorted_w[1:]:
last_line = lines[-1]
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
if abs(w["top"] - avg_y) <= y_tolerance:
last_line.append(w)
else:
lines.append([w])
# Sort words within each line by X
for line in lines:
line.sort(key=lambda w: w.get("left", 0))
return lines
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Render OCR words as a positional text description for the prompt.

    Groups words into visual lines, then emits one
    ``Zeile N (y~Y): x=… "text"`` entry per line. Words with confidence
    below 50 are tagged ``(?)`` so the LLM re-reads them visually.

    Args:
        words: OCR word dicts (``text``, ``left``, ``top``, ``conf``).
        img_h: Image height in pixels (kept for interface stability;
            currently unused in this function).

    Returns:
        Newline-joined line descriptions; empty string when no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Mark low-confidence words so the LLM pays extra attention.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # .get keeps this consistent with the rest of the module and avoids
        # a KeyError for words lacking a "top" coordinate.
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Assemble the German instruction prompt for the Vision-LLM.

    Combines the document-type description from CATEGORY_PROMPTS with the
    positional OCR context and the image dimensions. Unknown categories
    fall back to the generic "buchseite" entry.
    """
    fallback = CATEGORY_PROMPTS["buchseite"]
    cat_info = CATEGORY_PROMPTS.get(category, fallback)
    label = cat_info["label"]
    columns = cat_info["columns"]
    return f"""Du siehst eine eingescannte {label}.
{columns}
Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.
Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:
{ocr_context}
Bildgroesse: {img_w} x {img_h} Pixel.
AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.
- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst
- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,
  gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)
- Behalte die Reihenfolge bei
Antworte NUR mit einem JSON-Array, keine Erklaerungen:
[
  {{"row": 1, "english": "...", "german": "...", "example": "..."}},
  {{"row": 2, "english": "...", "german": "...", "example": "..."}}
]"""
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
"""Parse the LLM JSON response, handling markdown code blocks."""
text = response_text.strip()
# Strip markdown code block if present
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```\s*$", "", text)
# Try to find JSON array
match = re.search(r"\[[\s\S]*\]", text)
if not match:
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
return None
try:
data = json.loads(match.group())
if not isinstance(data, list):
return None
return data
except json.JSONDecodeError as e:
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
return None
def _vocab_rows_to_words(
rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
"""Convert LLM vocab rows back to word dicts for grid building.
Distributes words across estimated column positions so the
existing grid builder can process them normally.
"""
words = []
# Estimate column positions (3-column vocab layout)
col_positions = [
(0.02, 0.28), # EN: 2%-28% of width
(0.30, 0.55), # DE: 30%-55%
(0.57, 0.98), # Example: 57%-98%
]
median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
y_step = max(median_h + 5, img_h // max(len(rows), 1))
for i, row in enumerate(rows):
y = int(i * y_step + 20)
row_num = row.get("row", i + 1)
for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([
("english", col_positions[0]),
("german", col_positions[1]),
("example", col_positions[2]),
]):
text = (row.get(field) or "").strip()
if not text:
continue
x = int(x_start_pct * img_w)
w = int((x_end_pct - x_start_pct) * img_w)
words.append({
"text": text,
"left": x,
"top": y,
"width": w,
"height": median_h,
"conf": 95, # LLM-corrected → high confidence
"_source": "vision_llm",
"_row": row_num,
"_col_type": f"column_{['en', 'de', 'example'][col_idx]}",
})
logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words")
return words
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on error.
    """
    img_h, img_w = img_bgr.shape[:2]
    # Describe the OCR words (positions + low-confidence markers) so the
    # LLM can use them as structural hints while reading the image.
    ocr_context = _build_ocr_context(ocr_words, img_h)
    prompt = _build_prompt(ocr_context, document_category, img_w, img_h)
    # Encode image as base64 PNG for the Ollama API. imencode signals
    # failure via its first return value — previously discarded, which
    # would have sent an invalid payload instead of falling back.
    ok, img_encoded = cv2.imencode(".png", img_bgr)
    if not ok:
        logger.error("vision_fuse_ocr: PNG encoding failed")
        return ocr_words  # Fallback to original
    img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
    # Call Qwen2.5-VL via Ollama (generous timeout for a 32B vision model).
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": OLLAMA_HTR_MODEL,
                    "prompt": prompt,
                    "images": [img_b64],
                    "stream": False,
                    # Low temperature: faithful transcription, not creativity.
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            response_text = data.get("response", "").strip()
    except Exception as e:
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original
    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words
    # Parse JSON response; None/empty → keep original OCR words.
    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words
    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )
    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, img_w, img_h)