""" CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming. Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like _entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import json import logging import os import re import time from typing import Dict, List, Tuple import httpx logger = logging.getLogger(__name__) _OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b") _REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20")) logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE) REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm" # Regex: entry contains IPA phonetic brackets like "dance [da:ns]" _HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]') # Regex: digit adjacent to a letter -- OCR digit<->letter confusion _OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])') def _entry_needs_review(entry: Dict) -> bool: """Check if an entry should be sent for review. Sends all non-empty entries that don't have IPA phonetic transcriptions. """ en = entry.get("english", "") or "" de = entry.get("german", "") or "" if not en.strip() and not de.strip(): return False if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): return False return True def _build_llm_prompt(table_lines: List[Dict]) -> str: """Build the LLM correction prompt for a batch of entries.""" return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. NUR diese Korrekturen sind erlaubt: - Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball" - Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old" - Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin" - Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See" - Ziffer 6 statt G oder g: "6eld" -> "Geld" - Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help" ABSOLUT VERBOTEN -- aendere NIEMALS: - Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst - Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN - Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst - Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest - Eigennamen: Ben, London, China, Africa, Shakespeare usw. - Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. - Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren - Beispielsaetze in der ex-Spalte -- NIEMALS aendern Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). /no_think Eingabe: {json.dumps(table_lines, ensure_ascii=False, indent=2)}""" def _is_spurious_change(old_val: str, new_val: str) -> bool: """Detect LLM changes that are likely wrong and should be discarded. Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are legitimate OCR corrections. Everything else is rejected. """ if not old_val or not new_val: return False if old_val.lower() == new_val.lower(): return True old_words = old_val.split() new_words = new_val.split() if abs(len(old_words) - len(new_words)) > 1: return True _OCR_CHAR_MAP = { '0': set('oOgG'), '1': set('lLiI'), '5': set('sS'), '6': set('gG'), '8': set('bB'), '|': set('lLiI1'), 'l': set('iI|1'), } has_valid_fix = False if len(old_val) == len(new_val): for oc, nc in zip(old_val, new_val): if oc != nc: if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: has_valid_fix = True elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: has_valid_fix = True else: _OCR_SUSPICIOUS_RE = re.compile(r'[|01568]') if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): has_valid_fix = True if not has_valid_fix: return True return False def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" changes = [] entries_out = [] for i, orig in enumerate(originals): if i < len(corrected): c = corrected[i] entry = dict(orig) for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: new_val = c.get(key, "").strip() old_val = (orig.get(field_name, "") or "").strip() if new_val and new_val != old_val: if _is_spurious_change(old_val, new_val): continue changes.append({ "row_index": orig.get("row_index", i), "field": field_name, "old": old_val, "new": new_val, }) entry[field_name] = new_val entry["llm_corrected"] = True entries_out.append(entry) else: entries_out.append(dict(orig)) return changes, entries_out def _sanitize_for_json(text: str) -> str: """Remove or escape control characters that break JSON parsing.""" return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text) def _parse_llm_json_array(text: str) -> List[Dict]: """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags).""" text = re.sub(r'.*?', '', text, flags=re.DOTALL) text = re.sub(r'```json\s*', '', text) text = re.sub(r'```\s*', '', text) text = _sanitize_for_json(text) match = re.search(r'\[.*\]', text, re.DOTALL) if match: try: return json.loads(match.group()) except (ValueError, json.JSONDecodeError) as e: logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200]) else: logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200]) return [] async def llm_review_entries( entries: List[Dict], model: str = None, ) -> Dict: """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).""" from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: return spell_review_entries_sync(entries) if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") model = model or OLLAMA_REVIEW_MODEL reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)] if not reviewable: return { "entries_original": entries, "entries_corrected": [dict(e) for e in entries], "changes": [], "skipped_count": len(entries), "model_used": model, "duration_ms": 0, } review_entries = [e for _, e in reviewable] table_lines = [ {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} for e in review_entries ] logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)", len(review_entries), len(entries), model, len(entries) - len(reviewable)) prompt = _build_llm_prompt(table_lines) t0 = time.time() async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post( f"{_OLLAMA_URL}/api/chat", json={ "model": model, "messages": [{"role": "user", "content": prompt}], "stream": False, "think": False, "options": {"temperature": 0.1, "num_predict": 8192}, }, ) resp.raise_for_status() content = resp.json().get("message", {}).get("content", "") duration_ms = int((time.time() - t0) * 1000) logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content)) corrected = _parse_llm_json_array(content) changes, corrected_entries = _diff_batch(review_entries, corrected) all_corrected = [dict(e) for e in entries] for batch_idx, (orig_idx, _) in enumerate(reviewable): if batch_idx < len(corrected_entries): all_corrected[orig_idx] = corrected_entries[batch_idx] return { "entries_original": entries, "entries_corrected": all_corrected, "changes": changes, "skipped_count": len(entries) - len(reviewable), "model_used": model, "duration_ms": duration_ms, } async def llm_review_entries_streaming( entries: List[Dict], model: str = None, batch_size: int = _REVIEW_BATCH_SIZE, ): """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE. Phase 0 (always): Run _fix_character_confusion and emit any changes. """ from cv_ocr_engines import _fix_character_confusion from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE _CONF_FIELDS = ('english', 'german', 'example') originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries] _fix_character_confusion(entries) char_changes = [ {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')} for i in range(len(entries)) for f in _CONF_FIELDS if originals[i][f] != entries[i].get(f, '') ] if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: _meta_sent = False async for event in spell_review_entries_streaming(entries, batch_size): yield event if not _meta_sent and event.get('type') == 'meta' and char_changes: _meta_sent = True yield { 'type': 'batch', 'changes': char_changes, 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), 'progress': {'current': 0, 'total': len(entries)}, } return if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") # LLM path if char_changes: yield { 'type': 'batch', 'changes': char_changes, 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), 'progress': {'current': 0, 'total': len(entries)}, } model = model or OLLAMA_REVIEW_MODEL reviewable = [] skipped_indices = [] for i, e in enumerate(entries): if _entry_needs_review(e): reviewable.append((i, e)) else: skipped_indices.append(i) total_to_review = len(reviewable) yield { "type": "meta", "total_entries": len(entries), "to_review": total_to_review, "skipped": len(skipped_indices), "model": model, "batch_size": batch_size, } all_changes = [] all_corrected = [dict(e) for e in entries] total_duration_ms = 0 reviewed_count = 0 for batch_start in range(0, total_to_review, batch_size): batch_items = reviewable[batch_start:batch_start + batch_size] batch_entries = [e for _, e in batch_items] table_lines = [ {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} for e in batch_entries ] prompt = _build_llm_prompt(table_lines) logger.info("LLM review streaming: batch %d -- sending %d entries to %s", batch_start // batch_size, len(batch_entries), model) t0 = time.time() async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post( f"{_OLLAMA_URL}/api/chat", json={ "model": model, "messages": [{"role": "user", "content": prompt}], "stream": False, "think": False, "options": {"temperature": 0.1, "num_predict": 8192}, }, ) resp.raise_for_status() content = resp.json().get("message", {}).get("content", "") batch_ms = int((time.time() - t0) * 1000) total_duration_ms += batch_ms corrected = _parse_llm_json_array(content) batch_changes, batch_corrected = _diff_batch(batch_entries, corrected) for batch_idx, (orig_idx, _) in enumerate(batch_items): if batch_idx < len(batch_corrected): all_corrected[orig_idx] = batch_corrected[batch_idx] all_changes.extend(batch_changes) reviewed_count += len(batch_items) yield { "type": "batch", "batch_index": batch_start // batch_size, "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items], "changes": batch_changes, "duration_ms": batch_ms, "progress": {"current": reviewed_count, "total": total_to_review}, } yield { "type": "complete", "changes": all_changes, "model_used": model, "duration_ms": total_duration_ms, "total_entries": len(entries), "reviewed": total_to_review, "skipped": len(skipped_indices), "corrections_found": len(all_changes), "entries_corrected": all_corrected, }