Add Vision-LLM OCR Fusion (Step 4) for degraded scans
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s

New module vision_ocr_fusion.py: Sends scan image + OCR word
coordinates + document type to Qwen2.5-VL 32B. The LLM reads
the image visually while using OCR positions as structural hints.

Key features:
- Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.)
- OCR words grouped into lines with x/y coordinates in prompt
- Low-confidence words marked with (?) for LLM attention
- Continuation row merging instructions in prompt
- JSON response parsing with markdown code block handling
- Fallback to original OCR on any error

Frontend (admin-lehrer Grid Review):
- "Vision-LLM" checkbox toggle
- "Typ" dropdown (Vokabelseite, Woerterbuch, etc.)
- Steps 1-3 defaults set to inactive

Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 00:24:22 +02:00
parent 00eb9f26f6
commit 2f8270f77b
4 changed files with 320 additions and 5 deletions

View File

@@ -0,0 +1,261 @@
"""
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
The LLM can read degraded text using context understanding and visual inspection,
while OCR coordinates provide structural hints (where text is, column positions).
Uses Ollama API (same pattern as handwriting_htr_api.py).
"""
import base64
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import cv2
import httpx
import numpy as np
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
# Ollama endpoint and vision model name; both overridable via environment.
# Default URL targets the Docker-host gateway (host.docker.internal).
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
# Document category → prompt context
# Maps a document-category key to German prompt fragments: "label" is the
# human-readable document description, "columns" hints at the expected column
# structure. "buchseite" serves as the fallback entry in _build_prompt.
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
def _group_words_into_lines(
words: List[Dict], y_tolerance: float = 15.0,
) -> List[List[Dict]]:
"""Group OCR words into lines by Y-proximity."""
if not words:
return []
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
lines: List[List[Dict]] = [[sorted_w[0]]]
for w in sorted_w[1:]:
last_line = lines[-1]
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
if abs(w["top"] - avg_y) <= y_tolerance:
last_line.append(w)
else:
lines.append([w])
# Sort words within each line by X
for line in lines:
line.sort(key=lambda w: w.get("left", 0))
return lines
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Render OCR words, grouped into visual lines, as prompt-ready text.

    Each output line lists one text line's words with their x-coordinates;
    words below 50 OCR confidence are flagged with " (?)" so the LLM
    re-reads them visually.

    Args:
        words: OCR word dicts ("text", "left", "top", "conf"; missing
            keys default to empty/0).
        img_h: Image height in pixels (kept for interface stability;
            currently unused by the rendering).

    Returns:
        Newline-joined description, e.g. 'Zeile 1 (y~42): x=10 "cat"'.
        Empty string when there are no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Low-confidence marker steers the LLM's attention.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # Use .get consistently with the grouping helper: direct indexing
        # previously raised KeyError for words missing "top".
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Assemble the German Vision-LLM instruction prompt.

    Combines the category-specific document description, the positioned
    OCR context, and the task instructions (correct misread words, merge
    continuation rows, answer as a bare JSON array).
    """
    # Unknown categories fall back to the generic "buchseite" entry.
    info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"])
    parts = [
        f"Du siehst eine eingescannte {info['label']}.",
        info["columns"],
        "Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.",
        "Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:",
        ocr_context,
        f"Bildgroesse: {img_w} x {img_h} Pixel.",
        "AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.",
        "- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst",
        "- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,",
        "gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)",
        "- Behalte die Reihenfolge bei",
        "Antworte NUR mit einem JSON-Array, keine Erklaerungen:",
        "[",
        '{"row": 1, "english": "...", "german": "...", "example": "..."},',
        '{"row": 2, "english": "...", "german": "...", "example": "..."}',
        "]",
    ]
    return "\n".join(parts)
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
"""Parse the LLM JSON response, handling markdown code blocks."""
text = response_text.strip()
# Strip markdown code block if present
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```\s*$", "", text)
# Try to find JSON array
match = re.search(r"\[[\s\S]*\]", text)
if not match:
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
return None
try:
data = json.loads(match.group())
if not isinstance(data, list):
return None
return data
except json.JSONDecodeError as e:
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
return None
def _vocab_rows_to_words(
    rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
    """Convert LLM vocab rows back into positioned word dicts.

    Synthesizes left/top/width/height for every non-empty cell from an
    assumed 3-column layout so the existing grid builder can consume the
    LLM output exactly like ordinary OCR words.
    """
    # Fractional x-spans of the three vocab columns (EN / DE / example).
    col_spans = ((0.02, 0.28), (0.30, 0.55), (0.57, 0.98))
    field_names = ("english", "german", "example")
    col_suffixes = ("en", "de", "example")

    # Estimated word height and vertical step between synthetic rows.
    median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
    y_step = max(median_h + 5, img_h // max(len(rows), 1))

    words: List[Dict] = []
    for idx, row in enumerate(rows):
        y = int(idx * y_step + 20)
        row_num = row.get("row", idx + 1)
        for field, (x_lo, x_hi), suffix in zip(field_names, col_spans, col_suffixes):
            cell_text = (row.get(field) or "").strip()
            if not cell_text:
                continue
            words.append({
                "text": cell_text,
                "left": int(x_lo * img_w),
                "top": y,
                "width": int((x_hi - x_lo) * img_w),
                "height": median_h,
                "conf": 95,  # LLM-corrected → high confidence
                "_source": "vision_llm",
                "_row": row_num,
                "_col_type": f"column_{suffix}",
            })
    logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words")
    return words
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on any error (image encoding,
        transport/HTTP failure, empty or unparseable LLM response).
    """
    img_h, img_w = img_bgr.shape[:2]

    # Build the positioned-OCR context and the full instruction prompt.
    ocr_context = _build_ocr_context(ocr_words, img_h)
    prompt = _build_prompt(ocr_context, document_category, img_w, img_h)

    # Encode image as base64 PNG. cv2.imencode returns (success, buffer);
    # the success flag was previously ignored, which would have base64'd an
    # invalid buffer on failure — fall back to the original OCR instead.
    ok, img_encoded = cv2.imencode(".png", img_bgr)
    if not ok:
        logger.error("vision_fuse_ocr: PNG encoding failed")
        return ocr_words
    img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")

    # Call Qwen2.5-VL via Ollama (non-streaming; low temperature for
    # reproducible table output).
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate",
                json={
                    "model": OLLAMA_HTR_MODEL,
                    "prompt": prompt,
                    "images": [img_b64],
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            response_text = data.get("response", "").strip()
    except Exception as e:
        # Broad catch is deliberate: any transport/HTTP/JSON failure must
        # degrade to the original OCR rather than break the pipeline.
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original

    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words

    # Parse JSON response
    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words

    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )
    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, img_w, img_h)