""" Vision-LLM OCR Fusion — Combines traditional OCR positions with Vision-LLM reading. Sends the scan image + OCR word coordinates + document type to Qwen2.5-VL. The LLM can read degraded text using context understanding and visual inspection, while OCR coordinates provide structural hints (where text is, column positions). Uses Ollama API (same pattern as handwriting_htr_api.py). """ import base64 import json import logging import os import re from typing import Any, Dict, List, Optional import cv2 import httpx import numpy as np logger = logging.getLogger(__name__) OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") VISION_FUSION_MODEL = os.getenv("VISION_FUSION_MODEL", "llama3.2-vision:11b") # Document category → prompt context CATEGORY_PROMPTS: Dict[str, Dict[str, str]] = { "vokabelseite": { "label": "Vokabelseite eines Schulbuchs (Englisch-Deutsch)", "columns": "Die Tabelle hat typischerweise 3 Spalten: Englisch, Deutsch, Beispielsatz.", }, "woerterbuch": { "label": "Woerterbuchseite", "columns": "Die Eintraege haben: Stichwort, Lautschrift, Uebersetzung(en), Beispielsaetze.", }, "arbeitsblatt": { "label": "Arbeitsblatt", "columns": "Erkenne die Spaltenstruktur aus dem Layout.", }, "buchseite": { "label": "Schulbuchseite", "columns": "Erkenne die Spaltenstruktur aus dem Layout.", }, } def _group_words_into_lines( words: List[Dict], y_tolerance: float = 15.0, ) -> List[List[Dict]]: """Group OCR words into lines by Y-proximity.""" if not words: return [] sorted_w = sorted(words, key=lambda w: w.get("top", 0)) lines: List[List[Dict]] = [[sorted_w[0]]] for w in sorted_w[1:]: last_line = lines[-1] avg_y = sum(ww["top"] for ww in last_line) / len(last_line) if abs(w["top"] - avg_y) <= y_tolerance: last_line.append(w) else: lines.append([w]) # Sort words within each line by X for line in lines: line.sort(key=lambda w: w.get("left", 0)) return lines def _build_ocr_context(words: List[Dict], img_h: int) -> str: """Build a text description of OCR words with positions for the prompt.""" lines = _group_words_into_lines(words) context_parts = [] for i, line in enumerate(lines): word_descs = [] for w in line: text = w.get("text", "").strip() x = w.get("left", 0) conf = w.get("conf", 0) marker = " (?)" if conf < 50 else "" word_descs.append(f'x={x} "{text}"{marker}') avg_y = int(sum(w["top"] for w in line) / len(line)) context_parts.append(f"Zeile {i+1} (y~{avg_y}): {', '.join(word_descs)}") return "\n".join(context_parts) def _build_prompt( ocr_context: str, category: str, img_w: int, img_h: int, ) -> str: """Build the Vision-LLM prompt with OCR context and document type.""" cat_info = CATEGORY_PROMPTS.get(category, CATEGORY_PROMPTS["buchseite"]) return f"""Du siehst eine eingescannte {cat_info['label']}. {cat_info['columns']} Die OCR-Software hat folgende Woerter an diesen Positionen erkannt. Woerter mit (?) haben niedrige Erkennungssicherheit und sind wahrscheinlich falsch: {ocr_context} Bildgroesse: {img_w} x {img_h} Pixel. AUFGABE: Schau dir das Bild genau an und erstelle die korrekte Tabelle. - Korrigiere falsch erkannte Woerter anhand dessen was du im Bild siehst - Fasse Fortsetzungszeilen zusammen (wenn eine Spalte in der naechsten Zeile leer ist, gehoert der Text zur Zeile darueber — der Autor hat nur einen Zeilenumbruch innerhalb der Zelle gemacht) - Behalte die Reihenfolge bei Antworte NUR mit einem JSON-Array, keine Erklaerungen: [ {{"row": 1, "english": "...", "german": "...", "example": "..."}}, {{"row": 2, "english": "...", "german": "...", "example": "..."}} ]""" def _parse_llm_response(response_text: str) -> Optional[List[Dict]]: """Parse the LLM JSON response, handling markdown code blocks.""" text = response_text.strip() # Strip markdown code block if present if text.startswith("```"): text = re.sub(r"^```(?:json)?\s*", "", text) text = re.sub(r"\s*```\s*$", "", text) # Try to find JSON array match = re.search(r"\[[\s\S]*\]", text) if not match: logger.warning("vision_fuse_ocr: no JSON array found in LLM response") return None try: data = json.loads(match.group()) if not isinstance(data, list): return None return data except json.JSONDecodeError as e: logger.warning(f"vision_fuse_ocr: JSON parse error: {e}") return None def _vocab_rows_to_words( rows: List[Dict], img_w: int, img_h: int, ) -> List[Dict]: """Convert LLM vocab rows back to word dicts for grid building. Distributes words across estimated column positions so the existing grid builder can process them normally. """ words = [] # Estimate column positions (3-column vocab layout) col_positions = [ (0.02, 0.28), # EN: 2%-28% of width (0.30, 0.55), # DE: 30%-55% (0.57, 0.98), # Example: 57%-98% ] median_h = max(15, img_h // (len(rows) * 3)) if rows else 20 y_step = max(median_h + 5, img_h // max(len(rows), 1)) for i, row in enumerate(rows): y = int(i * y_step + 20) row_num = row.get("row", i + 1) for col_idx, (field, (x_start_pct, x_end_pct)) in enumerate([ ("english", col_positions[0]), ("german", col_positions[1]), ("example", col_positions[2]), ]): text = (row.get(field) or "").strip() if not text: continue x = int(x_start_pct * img_w) w = int((x_end_pct - x_start_pct) * img_w) words.append({ "text": text, "left": x, "top": y, "width": w, "height": median_h, "conf": 95, # LLM-corrected → high confidence "_source": "vision_llm", "_row": row_num, "_col_type": f"column_{['en', 'de', 'example'][col_idx]}", }) logger.info(f"vision_fuse_ocr: converted {len(rows)} LLM rows → {len(words)} words") return words async def vision_fuse_ocr( img_bgr: np.ndarray, ocr_words: List[Dict], document_category: str = "vokabelseite", ) -> List[Dict]: """Fuse traditional OCR results with Vision-LLM reading. Sends the image + OCR word positions to Qwen2.5-VL which can: - Read degraded text that traditional OCR cannot - Use document context (knows what a vocab table looks like) - Merge continuation rows (understands table structure) Args: img_bgr: The cropped/dewarped scan image (BGR) ocr_words: Traditional OCR word list with positions document_category: Type of document being scanned Returns: Corrected word list in same format as input, ready for grid building. Falls back to original ocr_words on error. """ img_h, img_w = img_bgr.shape[:2] # Build OCR context string ocr_context = _build_ocr_context(ocr_words, img_h) # Build prompt prompt = _build_prompt(ocr_context, document_category, img_w, img_h) # Encode image as base64 _, img_encoded = cv2.imencode(".png", img_bgr) img_b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8") # Call Qwen2.5-VL via Ollama try: async with httpx.AsyncClient(timeout=120.0) as client: resp = await client.post( f"{OLLAMA_BASE_URL}/api/generate", json={ "model": VISION_FUSION_MODEL, "prompt": prompt, "images": [img_b64], "stream": False, "options": {"temperature": 0.1, "num_predict": 4096}, }, ) resp.raise_for_status() data = resp.json() response_text = data.get("response", "").strip() except Exception as e: logger.error(f"vision_fuse_ocr: Ollama call failed: {e}") return ocr_words # Fallback to original if not response_text: logger.warning("vision_fuse_ocr: empty LLM response") return ocr_words # Parse JSON response rows = _parse_llm_response(response_text) if not rows: logger.warning( "vision_fuse_ocr: could not parse LLM response, " "first 200 chars: %s", response_text[:200], ) return ocr_words logger.info( f"vision_fuse_ocr: LLM returned {len(rows)} vocab rows " f"(from {len(ocr_words)} OCR words)" ) # Convert back to word format for grid building return _vocab_rows_to_words(rows, img_w, img_h)