Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
New module vision_ocr_fusion.py: Sends scan image + OCR word coordinates + document type to Qwen2.5-VL 32B. The LLM reads the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation-row merging instructions in prompt
- JSON response parsing with markdown code-block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -28,10 +28,14 @@ export function useGridEditor(sessionId: string | null) {
|
||||
const [ipaMode, setIpaMode] = useState<IpaMode>('auto')
|
||||
const [syllableMode, setSyllableMode] = useState<SyllableMode>('auto')
|
||||
|
||||
// OCR Quality Steps (A/B testing toggles)
|
||||
const [ocrEnhance, setOcrEnhance] = useState(true)
|
||||
const [ocrMaxCols, setOcrMaxCols] = useState(0) // 0 = unlimited (admin pipeline default)
|
||||
const [ocrMinConf, setOcrMinConf] = useState(0) // 0 = auto from quality score
|
||||
// OCR Quality Steps (A/B testing toggles — defaults off for now)
|
||||
const [ocrEnhance, setOcrEnhance] = useState(false)
|
||||
const [ocrMaxCols, setOcrMaxCols] = useState(0)
|
||||
const [ocrMinConf, setOcrMinConf] = useState(0)
|
||||
|
||||
// Vision-LLM Fusion (Step 4)
|
||||
const [visionFusion, setVisionFusion] = useState(false)
|
||||
const [documentCategory, setDocumentCategory] = useState('vokabelseite')
|
||||
|
||||
// Undo/redo stacks store serialized zone arrays
|
||||
const undoStack = useRef<string[]>([])
|
||||
@@ -92,6 +96,8 @@ export function useGridEditor(sessionId: string | null) {
|
||||
params.set('enhance', String(ocrEnhance))
|
||||
if (ocrMaxCols > 0) params.set('max_cols', String(ocrMaxCols))
|
||||
if (ocrMinConf > 0) params.set('min_conf', String(ocrMinConf))
|
||||
params.set('vision_fusion', String(visionFusion))
|
||||
if (documentCategory) params.set('doc_category', documentCategory)
|
||||
const res = await fetch(
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/rerun-ocr-and-build-grid?${params}`,
|
||||
{ method: 'POST' },
|
||||
@@ -110,7 +116,7 @@ export function useGridEditor(sessionId: string | null) {
|
||||
} finally {
|
||||
setLoading(false)
|
||||
}
|
||||
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf])
|
||||
}, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf, visionFusion, documentCategory])
|
||||
|
||||
const loadGrid = useCallback(async () => {
|
||||
if (!sessionId) return
|
||||
@@ -1030,6 +1036,10 @@ export function useGridEditor(sessionId: string | null) {
|
||||
setOcrMaxCols,
|
||||
ocrMinConf,
|
||||
setOcrMinConf,
|
||||
visionFusion,
|
||||
setVisionFusion,
|
||||
documentCategory,
|
||||
setDocumentCategory,
|
||||
rerunOcr,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,6 +67,10 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
|
||||
setOcrMaxCols,
|
||||
ocrMinConf,
|
||||
setOcrMinConf,
|
||||
visionFusion,
|
||||
setVisionFusion,
|
||||
documentCategory,
|
||||
setDocumentCategory,
|
||||
rerunOcr,
|
||||
} = useGridEditor(sessionId)
|
||||
|
||||
@@ -291,6 +295,22 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<span className="text-gray-400 dark:text-gray-500">|</span>
|
||||
<label className="flex items-center gap-1 cursor-pointer" title="Step 4: Vision-LLM Fusion — Qwen2.5-VL korrigiert OCR anhand des Bildes">
|
||||
<input type="checkbox" checked={visionFusion} onChange={(e) => setVisionFusion(e.target.checked)} className="rounded w-3 h-3 accent-orange-500" />
|
||||
<span className={`${visionFusion ? 'text-orange-500 dark:text-orange-400 font-medium' : 'text-gray-500 dark:text-gray-400'}`}>Vision-LLM</span>
|
||||
</label>
|
||||
<label className="flex items-center gap-1" title="Dokumenttyp fuer Vision-LLM Prompt">
|
||||
<span className="text-gray-500 dark:text-gray-400">Typ:</span>
|
||||
<select value={documentCategory} onChange={(e) => setDocumentCategory(e.target.value)} className="px-1 py-0.5 text-xs rounded border border-gray-200 dark:border-gray-600 bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-300">
|
||||
<option value="vokabelseite">Vokabelseite</option>
|
||||
<option value="woerterbuch">Woerterbuch</option>
|
||||
<option value="arbeitsblatt">Arbeitsblatt</option>
|
||||
<option value="buchseite">Buchseite</option>
|
||||
<option value="sonstiges">Sonstiges</option>
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<div className="ml-auto flex items-center gap-2">
|
||||
<button
|
||||
onClick={() => {
|
||||
|
||||
@@ -111,6 +111,8 @@ async def rerun_ocr_and_build_grid(
|
||||
enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
|
||||
max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
|
||||
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
|
||||
vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
|
||||
doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
|
||||
):
|
||||
"""Re-run OCR with quality settings, then rebuild the grid.
|
||||
|
||||
@@ -212,6 +214,26 @@ async def rerun_ocr_and_build_grid(
|
||||
"word_count": len(merged_words),
|
||||
"raw_paddle_words": rapid_words,
|
||||
}
|
||||
# 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
|
||||
vision_applied = False
|
||||
if vision_fusion:
|
||||
try:
|
||||
from vision_ocr_fusion import vision_fuse_ocr
|
||||
category = doc_category or session.get("document_category") or "vokabelseite"
|
||||
logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")
|
||||
merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
|
||||
vision_applied = True
|
||||
# Rebuild storage from fused words
|
||||
cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
|
||||
"width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
|
||||
for w in merged_words]
|
||||
word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
|
||||
"word_boxes": cells_for_storage}]
|
||||
word_result["word_count"] = len(merged_words)
|
||||
word_result["ocr_engine"] = "vision_fusion"
|
||||
except Exception as e:
|
||||
logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
|
||||
|
||||
await update_session_db(session_id, word_result=word_result)
|
||||
|
||||
# Reload session with updated word_result
|
||||
@@ -249,6 +271,8 @@ async def rerun_ocr_and_build_grid(
|
||||
"merged_words": len(merged_words),
|
||||
"min_conf_used": actual_min_conf,
|
||||
"enhance_applied": enhance and is_degraded,
|
||||
"vision_fusion_applied": vision_applied,
|
||||
"document_category": doc_category or session.get("document_category", ""),
|
||||
"ocr_duration_seconds": round(ocr_duration, 1),
|
||||
}
|
||||
|
||||
|
||||
261
klausur-service/backend/vision_ocr_fusion.py
Normal file
261
klausur-service/backend/vision_ocr_fusion.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
|
||||
|
||||
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
|
||||
The LLM can read degraded text using context understanding and visual inspection,
|
||||
while OCR coordinates provide structural hints (where text is, column positions).
|
||||
|
||||
Uses Ollama API (same pattern as handwriting_htr_api.py).
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import cv2
|
||||
import httpx
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
||||
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
|
||||
|
||||
# Document category → prompt context
|
||||
# Maps a document-category key (from the frontend "Typ" dropdown) to the
# German prompt fragments injected into the Vision-LLM prompt:
#   label   — what kind of page the model is told it is looking at
#   columns — hint about the expected column/entry structure
# NOTE(review): the UI also offers "sonstiges", which has no entry here;
# _build_prompt falls back to the "buchseite" entry for unknown keys.
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
|
||||
|
||||
|
||||
def _group_words_into_lines(
|
||||
words: List[Dict], y_tolerance: float = 15.0,
|
||||
) -> List[List[Dict]]:
|
||||
"""Group OCR words into lines by Y-proximity."""
|
||||
if not words:
|
||||
return []
|
||||
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
|
||||
lines: List[List[Dict]] = [[sorted_w[0]]]
|
||||
for w in sorted_w[1:]:
|
||||
last_line = lines[-1]
|
||||
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
|
||||
if abs(w["top"] - avg_y) <= y_tolerance:
|
||||
last_line.append(w)
|
||||
else:
|
||||
lines.append([w])
|
||||
# Sort words within each line by X
|
||||
for line in lines:
|
||||
line.sort(key=lambda w: w.get("left", 0))
|
||||
return lines
|
||||
|
||||
|
||||
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Render OCR words as a positional text listing for the LLM prompt.

    Each detected line becomes one row of the form
    ``Zeile N (y~<avg>): x=<left> "<text>" [(?)]`` where "(?)" flags
    words with confidence below 50 so the model inspects them closely.

    Note: ``img_h`` is currently unused; kept for interface stability.
    """
    rendered = []
    for idx, line in enumerate(_group_words_into_lines(words), start=1):
        descs = ", ".join(
            f'x={w.get("left", 0)} "{w.get("text", "").strip()}"'
            + (" (?)" if w.get("conf", 0) < 50 else "")
            for w in line
        )
        avg_y = int(sum(w["top"] for w in line) / len(line))
        rendered.append(f"Zeile {idx} (y~{avg_y}): {descs}")
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Build the Vision-LLM prompt with OCR context and document type.

    Args:
        ocr_context: Positional word listing from _build_ocr_context.
        category: Document-category key; unknown keys fall back to "buchseite".
        img_w: Image width in pixels (stated so the model can relate x/y hints).
        img_h: Image height in pixels.

    Returns:
        German instruction prompt asking the model to answer ONLY with a
        JSON array of {row, english, german, example} objects.
    """
    # Unknown or empty categories get the generic book-page hints.
    cat_info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"])

    # NOTE: doubled braces {{ }} in the JSON example below are literal
    # braces inside this f-string, not placeholders.
    return f"""Du siehst eine eingescannte {cat_info['label']}.
{cat_info['columns']}

Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.
Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:

{ocr_context}

Bildgroesse: {img_w} x {img_h} Pixel.

AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.
- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst
- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,
  gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)
- Behalte die Reihenfolge bei

Antworte NUR mit einem JSON-Array, keine Erklaerungen:
[
  {{"row": 1, "english": "...", "german": "...", "example": "..."}},
  {{"row": 2, "english": "...", "german": "...", "example": "..."}}
]"""
|
||||
|
||||
|
||||
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
|
||||
"""Parse the LLM JSON response, handling markdown code blocks."""
|
||||
text = response_text.strip()
|
||||
|
||||
# Strip markdown code block if present
|
||||
if text.startswith("```"):
|
||||
text = re.sub(r"^```(?:json)?\s*", "", text)
|
||||
text = re.sub(r"\s*```\s*$", "", text)
|
||||
|
||||
# Try to find JSON array
|
||||
match = re.search(r"\[[\s\S]*\]", text)
|
||||
if not match:
|
||||
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
|
||||
return None
|
||||
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
if not isinstance(data, list):
|
||||
return None
|
||||
return data
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def _vocab_rows_to_words(
    rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
    """Convert LLM vocab rows back to word dicts for grid building.

    The LLM returns logical rows ({row, english, german, example}) without
    coordinates, so synthetic positions are fabricated: three fixed column
    bands across the width and evenly spaced rows down the height, letting
    the existing grid builder process the result like normal OCR output.

    Args:
        rows: Parsed LLM rows; field values may be missing, None, or
            non-strings (JSON allows numbers/booleans).
        img_w: Source image width in pixels.
        img_h: Source image height in pixels.

    Returns:
        Word dicts (text/left/top/width/height/conf) plus _source/_row/
        _col_type metadata, row-major order, empty fields skipped.
    """
    words: List[Dict] = []
    # Estimated x-bands for a 3-column vocab layout (fractions of width).
    col_positions = [
        (0.02, 0.28),  # EN: 2%-28% of width
        (0.30, 0.55),  # DE: 30%-55%
        (0.57, 0.98),  # Example: 57%-98%
    ]

    # Synthetic geometry: assume ~3 text lines per vocab row, min 15 px tall.
    median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
    y_step = max(median_h + 5, img_h // max(len(rows), 1))

    for i, row in enumerate(rows):
        y = int(i * y_step + 20)
        row_num = row.get("row", i + 1)

        for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([
            ("english", col_positions[0]),
            ("german", col_positions[1]),
            ("example", col_positions[2]),
        ]):
            # Fix: coerce with str() — the LLM may return a JSON number or
            # boolean for a field, on which .strip() would raise
            # AttributeError and abort the whole conversion.
            text = str(row.get(field) or "").strip()
            if not text:
                continue
            x = int(x_start_pct * img_w)
            w = int((x_end_pct - x_start_pct) * img_w)
            words.append({
                "text": text,
                "left": x,
                "top": y,
                "width": w,
                "height": median_h,
                "conf": 95,  # LLM-corrected → high confidence
                "_source": "vision_llm",
                "_row": row_num,
                "_col_type": f"column_{['en', 'de', 'example'][col_idx]}",
            })

    logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words")
    return words
|
||||
|
||||
|
||||
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on error — any network, HTTP, or
        parse failure is logged and never propagated to the caller.
    """
    img_h, img_w = img_bgr.shape[:2]

    # Build OCR context string (word positions + low-confidence markers).
    ocr_context = _build_ocr_context(ocr_words, img_h)

    # Build the category-aware German instruction prompt.
    prompt = _build_prompt(ocr_context, document_category, img_w, img_h)

    # Encode image as base64 PNG (lossless — degraded scans should not be
    # further degraded by JPEG artifacts before the model sees them).
    _, img_encoded = cv2.imencode(".png", img_bgr)
    img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")

    # Call Qwen2.5-VL via Ollama. Broad except is deliberate: this step is
    # best-effort and must never break the OCR pipeline.
    # NOTE(review): 120 s timeout — confirm it suffices for the 32B model
    # on full-page scans.
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": OLLAMA_HTR_MODEL,
                    "prompt": prompt,
                    "images": [img_b64],
                    # Non-streaming: we need the full JSON answer in one piece.
                    "stream": False,
                    # Low temperature for deterministic transcription;
                    # num_predict caps the reply length.
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            response_text = data.get("response", "").strip()
    except Exception as e:
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original

    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words

    # Parse JSON response (handles markdown fences); None/empty → fallback.
    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words

    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )

    # Convert logical rows back to positioned word dicts for grid building.
    return _vocab_rows_to_words(rows, img_w, img_h)
|
||||
Reference in New Issue
Block a user