Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m43s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 27s
New module vision_ocr_fusion.py: Sends scan image + OCR word coordinates + document type to Qwen2.5-VL 32B. The LLM reads the image visually while using OCR positions as structural hints. Key features: - Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.) - OCR words grouped into lines with x/y coordinates in prompt - Low-confidence words marked with (?) for LLM attention - Continuation row merging instructions in prompt - JSON response parsing with markdown code block handling - Fallback to original OCR on any error Frontend (admin-lehrer Grid Review): - "Vision-LLM" checkbox toggle - "Typ" dropdown (Vokabelseite, Woerterbuch, etc.) - Steps 1-3 defaults set to inactive Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
262 lines
8.7 KiB
Python
262 lines
8.7 KiB
Python
"""
|
|
Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading.
|
|
|
|
Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL.
|
|
The LLM can read degraded text using context understanding and visual inspection,
|
|
while OCR coordinates provide structural hints (where text is, column positions).
|
|
|
|
Uses Ollama API (same pattern as handwriting_htr_api.py).
|
|
"""
|
|
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import httpx
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Ollama endpoint and model are configurable via environment variables; the
# defaults match a Docker container talking to Ollama running on the host.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

# Document category → prompt context
# Maps a document-type key to German prompt fragments: "label" names the
# document type, "columns" hints at its expected column structure.
# Unknown categories fall back to "buchseite" (see _build_prompt).
CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = {
    "vokabelseite": {
        "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)",
        "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.",
    },
    "woerterbuch": {
        "label": "Woerterbuchseite",
        "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.",
    },
    "arbeitsblatt": {
        "label": "Arbeitsblatt",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
    "buchseite": {
        "label": "Schulbuchseite",
        "columns": "Erkenne die Spaltenstruktur aus dem Layout.",
    },
}
|
|
|
|
|
|
def _group_words_into_lines(
|
|
words: List[Dict], y_tolerance: float = 15.0,
|
|
) -> List[List[Dict]]:
|
|
"""Group OCR words into lines by Y-proximity."""
|
|
if not words:
|
|
return []
|
|
sorted_w = sorted(words, key=lambda w: w.get("top", 0))
|
|
lines: List[List[Dict]] = [[sorted_w[0]]]
|
|
for w in sorted_w[1:]:
|
|
last_line = lines[-1]
|
|
avg_y = sum(ww["top"] for ww in last_line) / len(last_line)
|
|
if abs(w["top"] - avg_y) <= y_tolerance:
|
|
last_line.append(w)
|
|
else:
|
|
lines.append([w])
|
|
# Sort words within each line by X
|
|
for line in lines:
|
|
line.sort(key=lambda w: w.get("left", 0))
|
|
return lines
|
|
|
|
|
|
def _build_ocr_context(words: List[Dict], img_h: int) -> str:
    """Build a text description of OCR words with positions for the prompt.

    Each OCR line becomes one "Zeile N (y~Y): x=.. "word", ..." row; words
    with confidence below 50 are tagged " (?)" so the LLM pays extra
    attention to them.

    Args:
        words: OCR word dicts with "text", "left", "top", "conf".
        img_h: Image height in pixels.  Currently unused; kept so the
            signature stays stable for callers (vision_fuse_ocr passes it).

    Returns:
        Newline-joined line descriptions; empty string for no words.
    """
    lines = _group_words_into_lines(words)
    context_parts = []
    for i, line in enumerate(lines):
        word_descs = []
        for w in line:
            text = w.get("text", "").strip()
            x = w.get("left", 0)
            conf = w.get("conf", 0)
            # Low-confidence marker steers the LLM toward re-reading the word.
            marker = " (?)" if conf < 50 else ""
            word_descs.append(f'x={x} "{text}"{marker}')
        # Fix: .get with default, consistent with the grouping helper —
        # the original indexed w["top"] and could raise KeyError.
        avg_y = int(sum(w.get("top", 0) for w in line) / len(line))
        context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}")
    return "\n".join(context_parts)
|
|
|
|
|
|
def _build_prompt(
    ocr_context: str, category: str, img_w: int, img_h: int,
) -> str:
    """Build the Vision-LLM prompt with OCR context and document type.

    Args:
        ocr_context: Per-line word/position summary from _build_ocr_context.
        category: Key into CATEGORY_PROMPTS selecting the document type.
        img_w: Image width in pixels, quoted in the prompt.
        img_h: Image height in pixels, quoted in the prompt.

    Returns:
        A German-language instruction prompt that asks the model to answer
        with ONLY a JSON array of rows (row/english/german/example).
    """
    # Unknown categories fall back to the generic "buchseite" context.
    cat_info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"])

    # NOTE: doubled braces ({{ }}) render as literal braces in the JSON example.
    return f"""Du siehst eine eingescannte {cat_info['label']}.
{cat_info['columns']}

Die OCR-Software hat folgende Woerter an diesen Positionen erkannt.
Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch:

{ocr_context}

Bildgroesse: {img_w} x {img_h} Pixel.

AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle.
- Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst
- Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist,
gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht)
- Behalte die Reihenfolge bei

Antworte NUR mit einem JSON-Array, keine Erklaerungen:
[
{{"row": 1, "english": "...", "german": "...", "example": "..."}},
{{"row": 2, "english": "...", "german": "...", "example": "..."}}
]"""
|
|
|
|
|
|
def _parse_llm_response(response_text: str) -> Optional[List[Dict]]:
|
|
"""Parse the LLM JSON response, handling markdown code blocks."""
|
|
text = response_text.strip()
|
|
|
|
# Strip markdown code block if present
|
|
if text.startswith("```"):
|
|
text = re.sub(r"^```(?:json)?\s*", "", text)
|
|
text = re.sub(r"\s*```\s*$", "", text)
|
|
|
|
# Try to find JSON array
|
|
match = re.search(r"\[[\s\S]*\]", text)
|
|
if not match:
|
|
logger.warning("vision_fuse_ocr: no JSON array found in LLM response")
|
|
return None
|
|
|
|
try:
|
|
data = json.loads(match.group())
|
|
if not isinstance(data, list):
|
|
return None
|
|
return data
|
|
except json.JSONDecodeError as e:
|
|
logger.warning(f"vision_fuse_ocr: JSON parse error: {e}")
|
|
return None
|
|
|
|
|
|
def _vocab_rows_to_words(
    rows: List[Dict], img_w: int, img_h: int,
) -> List[Dict]:
    """Convert LLM vocab rows back to word dicts for grid building.

    Distributes words across estimated column positions so the
    existing grid builder can process them normally.

    Args:
        rows: Parsed LLM rows with "row"/"english"/"german"/"example" fields.
        img_w: Source image width in pixels (column X estimates).
        img_h: Source image height in pixels (row Y spacing).

    Returns:
        Word dicts shaped like OCR output ("text", "left", "top", "width",
        "height", "conf"), tagged "_source": "vision_llm" plus "_row" and
        "_col_type" for downstream tracing.
    """
    words: List[Dict] = []
    # Estimated 3-column vocab layout as fractions of the image width.
    col_positions = [
        (0.02, 0.28),  # EN: 2%-28% of width
        (0.30, 0.55),  # DE: 30%-55%
        (0.57, 0.98),  # Example: 57%-98%
    ]

    # Synthetic word height and vertical spacing so rows keep their order
    # when sorted by Y later.
    median_h = max(15, img_h // (len(rows) * 3)) if rows else 20
    y_step = max(median_h + 5, img_h // max(len(rows), 1))

    for i, row in enumerate(rows):
        y = int(i * y_step + 20)
        row_num = row.get("row", i + 1)

        for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([
            ("english", col_positions[0]),
            ("german", col_positions[1]),
            ("example", col_positions[2]),
        ]):
            # Fix: str() guards against the LLM returning a number/boolean
            # for a field — the original crashed on non-str via .strip().
            text = str(row.get(field) or "").strip()
            if not text:
                continue
            x = int(x_start_pct * img_w)
            w = int((x_end_pct - x_start_pct) * img_w)
            words.append({
                "text": text,
                "left": x,
                "top": y,
                "width": w,
                "height": median_h,
                "conf": 95,  # LLM-corrected → high confidence
                "_source": "vision_llm",
                "_row": row_num,
                "_col_type": f"column_{['en', 'de', 'example'][col_idx]}",
            })

    # Lazy %-style args avoid f-string work when INFO is disabled.
    logger.info(
        "vision_fuse_ocr: converted %d LLM rows → %d words", len(rows), len(words)
    )
    return words
|
|
|
|
|
|
async def vision_fuse_ocr(
    img_bgr: np.ndarray,
    ocr_words: List[Dict],
    document_category: str = "vokabelseite",
) -> List[Dict]:
    """Fuse traditional OCR results with Vision-LLM reading.

    Sends the image + OCR word positions to Qwen2.5-VL which can:
    - Read degraded text that traditional OCR cannot
    - Use document context (knows what a vocab table looks like)
    - Merge continuation rows (understands table structure)

    Args:
        img_bgr: The cropped/dewarped scan image (BGR)
        ocr_words: Traditional OCR word list with positions
        document_category: Type of document being scanned

    Returns:
        Corrected word list in same format as input, ready for grid building.
        Falls back to original ocr_words on error.
    """
    height, width = img_bgr.shape[:2]

    # Assemble the prompt from the positional OCR summary + document type.
    llm_prompt = _build_prompt(
        _build_ocr_context(ocr_words, height), document_category, width, height,
    )

    # PNG-encode the scan and wrap it as base64 for the Ollama API.
    _ok, png_buf = cv2.imencode(".png", img_bgr)
    image_b64 = base64.b64encode(png_buf.tobytes()).decode("utf-8")

    payload = {
        "model": OLLAMA_HTR_MODEL,
        "prompt": llm_prompt,
        "images": [image_b64],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 4096},
    }

    # Call Qwen2.5-VL via Ollama; any failure falls back to the raw OCR.
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{OLLAMA_BASE_URL}/api/generate", json=payload,
            )
            resp.raise_for_status()
            response_text = resp.json().get("response", "").strip()
    except Exception as e:
        logger.error(f"vision_fuse_ocr: Ollama call failed: {e}")
        return ocr_words  # Fallback to original

    if not response_text:
        logger.warning("vision_fuse_ocr: empty LLM response")
        return ocr_words

    rows = _parse_llm_response(response_text)
    if not rows:
        logger.warning(
            "vision_fuse_ocr: could not parse LLM response, "
            "first 200 chars: %s", response_text[:200],
        )
        return ocr_words

    logger.info(
        f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows "
        f"(from {len(ocr_words)} OCR words)"
    )

    # Convert back to word format for grid building
    return _vocab_rows_to_words(rows, width, height)
|