"""Vocabulary extraction from images using a Vision LLM or hybrid OCR+LLM.

Contains:
- VOCAB_EXTRACTION_PROMPT: prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): demo data for testing
- parse_vocabulary_json(): tolerant JSON parsing with layered fallbacks
"""

import base64
import json
import logging
import os
import re
import uuid
from typing import List, Optional

import httpx

from vocab_worksheet_models import VocabularyEntry

logger = logging.getLogger(__name__)

# Ollama Configuration
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")

# Timeout (seconds) for the quick "is Ollama reachable" probe.
OLLAMA_HEALTH_TIMEOUT = 10.0
# Timeout (seconds) for one vision request: 5 minutes per page.
VISION_REQUEST_TIMEOUT = 300.0

# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

# Prompt sent verbatim to the vision model. The text, including its
# whitespace, is part of runtime behavior; change it only deliberately.
VOCAB_EXTRACTION_PROMPT = (
    'Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch. '
    'AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format: '
    '{ "vocabulary": [ { "english": "to improve", "german": "verbessern", '
    '"example": "I want to improve my English." } ] } '
    'REGELN: '
    '1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz '
    '2. Behalte die exakte Schreibweise bei '
    '3. Bei fehlenden Beispielsaetzen: "example": null '
    '4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern '
    '5. Gib NUR valides JSON zurueck, keine Erklaerungen '
    '6. \n'
    'Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type" '
    'Beispiel-Output: '
    '{ "vocabulary": [ '
    '{"english": "achievement", "german": "Leistung, Errungenschaft", '
    '"example": "Her achievements were impressive.", "word_type": "n"}, '
    '{"english": "to achieve", "german": "erreichen, erzielen", '
    '"example": "She achieved her goals.", "word_type": "v"} '
    '] }'
)


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False,  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """Extract vocabulary from an image via the Vision LLM or hybrid OCR+LLM.

    The Vision LLM is the default path. When ``use_hybrid`` is True the
    hybrid extractor is tried first; if it is unavailable, fails, reports
    an error, or yields nothing, the Vision LLM is used instead.

    Args:
        image_data: Image bytes.
        filename: Original filename, used for logging only.
        page_number: 0-indexed page number; reported 1-indexed in
            ``source_page`` and in error messages.
        use_hybrid: If True, try PaddleOCR + LLM before the Vision LLM.

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message).
        error_message is an empty string on success. This function does
        not raise for extraction failures; it reports them in the tuple.
    """
    if use_hybrid:
        hybrid_result = await _try_hybrid_extraction(image_data, filename, page_number)
        if hybrid_result is not None:
            vocabulary, confidence = hybrid_result
            return vocabulary, confidence, ""

    return await _extract_with_vision_llm(image_data, filename, page_number)


async def _try_hybrid_extraction(
    image_data: bytes,
    filename: str,
    page_number: int,
) -> Optional[tuple[List[VocabularyEntry], float]]:
    """Run the optional hybrid extractor; return None to request the fallback."""
    try:
        # Imported lazily so a missing/broken OCR stack only disables this path.
        from hybrid_vocab_extractor import extract_vocabulary_hybrid

        logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
        vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

        if error:
            logger.warning(f"Hybrid extraction had issues: {error}")
            return None
        if not vocab_dicts:
            return None

        # Convert dicts to VocabularyEntry objects, dropping incomplete rows.
        vocabulary = [
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=v.get("english", ""),
                german=v.get("german", ""),
                example_sentence=v.get("example"),
                source_page=page_number + 1,
            )
            for v in vocab_dicts
            if v.get("english") and v.get("german")
        ]
        logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
        return vocabulary, confidence
    except ImportError as e:
        logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
    except Exception as e:
        # Best-effort path: any failure here must not abort the extraction.
        logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
        logger.debug("Hybrid extraction traceback:", exc_info=True)
    return None


async def _extract_with_vision_llm(
    image_data: bytes,
    filename: str,
    page_number: int,
) -> tuple[List[VocabularyEntry], float, str]:
    """Send the image to the Ollama vision model and parse its JSON answer."""
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # Probe Ollama first so an unreachable server fails in seconds
        # instead of after the long per-page request timeout.
        async with httpx.AsyncClient(timeout=OLLAMA_HEALTH_TIMEOUT) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
            if health_response.status_code != 200:
                logger.error(f"Ollama not available at {OLLAMA_URL}")
                return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64],
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            },
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Vision models can be slow, hence the generous single timeout.
        async with httpx.AsyncClient(timeout=VISION_REQUEST_TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            response.raise_for_status()
            data = response.json()

        # Tolerate a missing or null "message"/"content" in the response.
        message = data.get("message") or {}
        extracted_text = message.get("content") or ""
        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        vocabulary = parse_vocabulary_json(extracted_text)
        for entry in vocabulary:
            entry.source_page = page_number + 1

        # Heuristic confidence: the model reports none of its own.
        confidence = 0.85 if vocabulary else 0.1

        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
        return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        # Boundary handler: callers expect an error tuple, never an exception.
        logger.exception(f"Vocabulary extraction failed for {filename}: {e}")
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"


def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return demo vocabulary for testing when Vision LLM is not available."""
    demo_entries = [
        {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
        {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
        {"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
        {"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
        {"english": "success", "german": "Erfolg", "example": "The project was a success."},
        {"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
        {"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
        {"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
    ]
    return [
        VocabularyEntry(
            id=str(uuid.uuid4()),
            english=e["english"],
            german=e["german"],
            example_sentence=e.get("example"),
        )
        for e in demo_entries
    ]


# Control characters that are never legal in JSON text (tab, LF, CR are kept).
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
# A comma directly before a closing brace/bracket.
_TRAILING_COMMA_RE = re.compile(r',(\s*[}\]])')
# A bare identifier used as an object key.
_UNQUOTED_KEY_RE = re.compile(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:')
# One {"english": "...", "german": "...", "example": "..."|null} object.
_ENTRY_RE = re.compile(
    r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,'
    r'\s*"german"\s*:\s*"([^"]*?)"'
    r'(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?',
    re.IGNORECASE | re.DOTALL,
)


def _as_text(value: object) -> str:
    """Return ``value`` stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _optional_text(value: object) -> Optional[str]:
    """Return ``value`` unchanged if it is a string, otherwise None."""
    return value if isinstance(value, str) else None


def _load_lenient_json(json_str: str) -> Optional[dict]:
    """Parse ``json_str`` as a JSON object, repairing common LLM mistakes.

    Attempts, in order: a strict parse; a parse with illegal control
    characters removed and raw newlines/tabs allowed inside strings; the
    same after dropping trailing commas; the same after additionally
    quoting bare keys. Returns None if every attempt fails.
    """

    def attempts():
        # Generated lazily so the common case costs a single strict parse.
        yield json_str, True
        stripped = _CONTROL_CHARS_RE.sub("", json_str)
        yield stripped, False
        no_trailing = _TRAILING_COMMA_RE.sub(r"\1", stripped)
        yield no_trailing, False
        # Last resort: this rewrite can also touch ", word:" sequences
        # inside string values, so it runs only after the safer repairs.
        yield _UNQUOTED_KEY_RE.sub(r'\1"\2":', no_trailing), False

    for candidate, strict in attempts():
        try:
            parsed = json.loads(candidate, strict=strict)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _entries_from_regex(text: str) -> List[VocabularyEntry]:
    """Pull english/german/example triples straight out of unparseable text."""
    entries = []
    for english, german, example in _ENTRY_RE.findall(text):
        english = english.strip()
        german = german.strip()
        if not (english and german):
            continue
        entries.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example.strip() if example else None,
            )
        )
    return entries


def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries from an LLM response.

    The JSON object is located between the first ``{`` and the last ``}``
    of ``text`` (the model may add prose around it). If it cannot be
    parsed even after repairs, entries are extracted with a regex.

    Entries without both an English and a German string are skipped, as
    are entries whose English text exceeds 100 characters or whose German
    text exceeds 200 characters (treated as likely hallucinations). A
    malformed entry never invalidates its neighbours.

    Args:
        text: Raw model output.

    Returns:
        The parsed entries; an empty list if nothing usable was found.
        Never raises: unexpected errors are logged and yield ``[]``.
    """
    try:
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = _load_lenient_json(text[start:end])

        if data is None:
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = _entries_from_regex(text)
            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        raw_entries = data.get("vocabulary")
        if not isinstance(raw_entries, list):
            logger.warning("Response JSON has no 'vocabulary' list")
            return []

        vocabulary = []
        for entry in raw_entries:
            if not isinstance(entry, dict):
                continue

            # null / non-string fields become "" and are filtered below.
            english = _as_text(entry.get("english"))
            german = _as_text(entry.get("german"))

            # Skip entries that look like hallucinations (very long text).
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue
            if not english or not german:
                continue

            # VocabularyEntry is defined elsewhere; optional fields are
            # limited to str-or-None so an odd value cannot fail the
            # constructor and discard the whole page.
            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    example_sentence=_optional_text(entry.get("example")),
                    word_type=_optional_text(entry.get("word_type")),
                )
            )

        return vocabulary

    except Exception as e:
        # Boundary handler: a bad model response must not crash the caller.
        logger.exception(f"Failed to parse vocabulary JSON: {e}")
        return []