"""
Hybrid OCR + LLM Vocabulary Extractor

Split into:
- hybrid_vocab_ocr.py: PaddleOCR integration, parsing, row/column detection
- hybrid_vocab_extractor.py (this file): LLM structuring, public API,
  barrel re-exports

All symbols re-exported for backward compatibility.
"""

import os
import json
import logging
import re
from typing import List, Dict, Any, Tuple

import httpx

# Re-export everything from ocr module for backward compatibility.
# Some of these names are not used in this file; they are imported only so
# that existing `from hybrid_vocab_extractor import ...` callers keep working.
from hybrid_vocab_ocr import (  # noqa: F401
    OCRRegion,
    get_paddle_ocr,
    preprocess_image,
    run_paddle_ocr,
    group_regions_by_rows,
    detect_columns,
    format_ocr_for_llm,
)

logger = logging.getLogger(__name__)

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
LLM_MODEL = os.getenv("LLM_MODEL", "qwen2.5:14b")

# =============================================================================
# LLM structuring
# =============================================================================

# Prompt template sent to the LLM. It is filled via str.format(), which is why
# the literal braces of the JSON example are doubled.
STRUCTURE_PROMPT = """Du erhaeltst OCR-Output einer Vokabelliste aus einem englischen Schulbuch.
Die Zeilen sind Tab-separiert und enthalten typischerweise:
- 2 Spalten: Englisch | Deutsch
- 3 Spalten: Englisch | Deutsch | Beispielsatz

OCR-Text:
{ocr_text}

AUFGABE: Strukturiere die Vokabeln als JSON-Array.

AUSGABE-FORMAT (nur JSON, keine Erklaerungen):
{{
  "vocabulary": [
    {{"english": "to improve", "german": "verbessern", "example": "I want to improve my English."}},
    {{"english": "achievement", "german": "Leistung", "example": null}}
  ]
}}

REGELN:
1. Erkenne das Spalten-Layout aus den Tab-Trennungen
2. Korrigiere offensichtliche OCR-Fehler kontextuell (z.B. "vereessern" -> "verbessern", "0" -> "o")
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ueberspringe Ueberschriften, Seitenzahlen, Kapitelnummern
5. Behalte Wortarten bei wenn vorhanden (n, v, adj am Ende des englischen Worts)
6. Gib NUR valides JSON zurueck"""

# Matches adjacent `"english": "...", "german": "..."` pairs in (possibly
# broken) JSON text. Compiled once because it is reused on every fallback.
_VOCAB_PAIR_RE = re.compile(
    r'"english"\s*:\s*"([^"]+)"\s*,\s*"german"\s*:\s*"([^"]+)"'
)


async def structure_vocabulary_with_llm(ocr_text: str) -> List[Dict[str, Any]]:
    """Ask the Ollama LLM to turn tab-separated OCR text into vocabulary entries.

    Args:
        ocr_text: OCR output inserted into ``STRUCTURE_PROMPT``.

    Returns:
        The entries produced by ``parse_llm_vocabulary_json``. An empty list
        is returned on timeout, HTTP error status, or any other failure;
        this function never raises.
    """
    prompt = STRUCTURE_PROMPT.format(ocr_text=ocr_text)
    payload = {
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 4096},
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            response.raise_for_status()

        data = response.json()
        # Coalesce a null "message"/"content" to an empty string so that a
        # degenerate response yields "no JSON found" instead of an exception.
        content = (data.get("message") or {}).get("content") or ""
        logger.info("Ollama LLM response received: %d chars", len(content))

        return parse_llm_vocabulary_json(content)

    except httpx.TimeoutException:
        logger.error("Ollama LLM request timed out")
        return []
    except httpx.HTTPStatusError as e:
        logger.error("Ollama LLM HTTP error: %s", e)
        return []
    except Exception as e:
        # Deliberate best-effort boundary: callers fall back to other
        # extraction strategies when this returns an empty list.
        logger.error("LLM structuring failed: %s", e)
        return []


def _clean_field(value: Any) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def parse_llm_vocabulary_json(text: str) -> List[Dict[str, Any]]:
    """Parse the LLM output into a list of validated vocabulary entries.

    The span from the first ``{`` to the last ``}`` is decoded as JSON and
    its ``"vocabulary"`` list is read. Each entry is validated on its own, so
    one malformed entry (not a dict, null or non-string field) is skipped
    rather than discarding the whole response.

    Args:
        text: Raw LLM response text.

    Returns:
        Dicts with the keys ``"english"``, ``"german"`` and ``"example"``.
        Only entries whose English and German fields are non-empty strings
        are kept. If the JSON cannot be decoded, the result of
        ``extract_vocabulary_regex`` is returned. An empty list is returned
        when no JSON object or no vocabulary list is found.
    """
    if not isinstance(text, str):
        logger.warning("No JSON found in LLM response")
        return []

    start = text.find("{")
    end = text.rfind("}") + 1  # rfind() == -1 becomes 0 here
    if start == -1 or end == 0:
        logger.warning("No JSON found in LLM response")
        return []

    try:
        data = json.loads(text[start:end])
    except json.JSONDecodeError as e:
        logger.error("JSON parse error: %s", e)
        return extract_vocabulary_regex(text)

    vocabulary = data.get("vocabulary") if isinstance(data, dict) else None
    if not isinstance(vocabulary, list):
        return []

    valid_entries = []
    for entry in vocabulary:
        if not isinstance(entry, dict):
            continue
        english = _clean_field(entry.get("english"))
        german = _clean_field(entry.get("german"))
        if english and german:
            valid_entries.append({
                "english": english,
                "german": german,
                "example": entry.get("example"),
            })

    return valid_entries


def extract_vocabulary_regex(text: str) -> List[Dict[str, Any]]:
    """Fallback: pull english/german pairs out of the text with a regex.

    Used when the LLM output is not decodable JSON. Examples are not
    recovered, so ``"example"`` is always ``None``.

    Args:
        text: Raw LLM response text.

    Returns:
        Dicts with the keys ``"english"``, ``"german"`` and ``"example"``;
        pairs where either side is blank after stripping are dropped.
    """
    vocabulary = []
    for english, german in _VOCAB_PAIR_RE.findall(text):
        english = english.strip()
        german = german.strip()
        if english and german:
            vocabulary.append({
                "english": english,
                "german": german,
                "example": None,
            })

    logger.info("Regex fallback extracted %d entries", len(vocabulary))
    return vocabulary


# =============================================================================
# Public API
# =============================================================================

async def extract_vocabulary_hybrid(
    image_bytes: bytes,
    page_number: int = 0
) -> Tuple[List[Dict[str, Any]], float, str]:
    """Hybrid extraction: PaddleOCR followed by LLM structuring.

    The OCR regions are formatted and sent to the LLM. If the LLM yields no
    entries, ``extract_from_rows_directly`` is tried on the OCR regions.

    Args:
        image_bytes: Encoded image passed to ``run_paddle_ocr``.
        page_number: Zero-based page index; shown one-based in messages.

    Returns:
        A tuple ``(vocabulary, confidence, error)``. ``confidence`` is the
        mean ``confidence`` of all OCR regions. On success ``error`` is an
        empty string; on failure ``vocabulary`` is empty, ``confidence`` is
        ``0.0`` and ``error`` holds a message. This function never raises.
    """
    try:
        logger.info("Starting hybrid extraction for page %d", page_number + 1)

        # NOTE(review): run_paddle_ocr is called synchronously inside this
        # coroutine; if it is CPU-heavy it will block the event loop while it
        # runs. Confirm and consider offloading to a worker thread.
        regions, _raw_text = run_paddle_ocr(image_bytes)

        if not regions:
            return [], 0.0, f"Seite {page_number + 1}: Kein Text erkannt (OCR)"

        formatted_text = format_ocr_for_llm(regions)
        logger.info("Formatted OCR text: %d chars", len(formatted_text))

        vocabulary = await structure_vocabulary_with_llm(formatted_text)

        if not vocabulary:
            vocabulary = extract_from_rows_directly(regions)

        if not vocabulary:
            return [], 0.0, f"Seite {page_number + 1}: Keine Vokabeln erkannt"

        # regions is known to be non-empty here (checked above).
        avg_confidence = sum(r.confidence for r in regions) / len(regions)
        logger.info(
            "Hybrid extraction completed: %d entries, %.2f confidence",
            len(vocabulary),
            avg_confidence,
        )

        return vocabulary, avg_confidence, ""

    except Exception as e:
        # logger.exception logs the message together with the traceback.
        logger.exception("Hybrid extraction failed: %s", e)
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"


def extract_from_rows_directly(regions: List[OCRRegion]) -> List[Dict[str, Any]]:
    """Direct fallback: build vocabulary entries from OCR rows without the LLM.

    Regions are grouped with ``group_regions_by_rows``. In each row with at
    least two regions, the first is taken as English, the second as German
    and an optional third as the example.

    Args:
        regions: OCR regions as returned by ``run_paddle_ocr``.

    Returns:
        Dicts with the keys ``"english"``, ``"german"`` and ``"example"``.
        Rows whose English or German text is shorter than two characters
        are skipped.
    """
    vocabulary = []

    for row in group_regions_by_rows(regions):
        if len(row) < 2:
            continue

        english = row[0].text.strip()
        german = row[1].text.strip()
        example = row[2].text.strip() if len(row) >= 3 else None

        # Single-character cells are treated as noise, not vocabulary.
        if len(english) > 1 and len(german) > 1:
            vocabulary.append({
                "english": english,
                "german": german,
                "example": example,
            })

    logger.info("Direct row extraction: %d entries", len(vocabulary))
    return vocabulary


# =============================================================================
# Test/Debug
# =============================================================================

async def test_hybrid_extraction(image_path: str):
    """Development helper: run the hybrid extraction on one image file.

    Prints the confidence, the error message and the first ten entries.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The extracted vocabulary list.
    """
    with open(image_path, "rb") as f:
        image_bytes = f.read()

    vocab, confidence, error = await extract_vocabulary_hybrid(image_bytes)

    print("\n=== Hybrid OCR Test ===")
    print(f"Confidence: {confidence:.2f}")
    print(f"Error: {error or 'None'}")
    print(f"Vocabulary ({len(vocab)} entries):")
    for v in vocab[:10]:
        print(f"  - {v['english']} = {v['german']}")

    return vocab


if __name__ == "__main__":
    import asyncio
    import sys

    if len(sys.argv) > 1:
        asyncio.run(test_hybrid_extraction(sys.argv[1]))
    else:
        print("Usage: python hybrid_vocab_extractor.py IMAGE_PATH")