""" Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import json import logging import os import re import time from typing import Any, Dict, List, Optional, Tuple import numpy as np from cv_vocab_types import ( CV_PIPELINE_AVAILABLE, PageRegion, PipelineResult, VocabRow, ) from cv_preprocessing import ( deskew_image, dewarp_image, render_image_high_res, render_pdf_high_res, ) from cv_layout import ( analyze_layout, create_layout_image, create_ocr_image, ) from cv_ocr_engines import ( _fix_character_confusion, _group_words_into_lines, ) logger = logging.getLogger(__name__) try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] try: import pytesseract from PIL import Image except ImportError: pytesseract = None # type: ignore[assignment] Image = None # type: ignore[assignment,misc] # ============================================================================= # Stage 6: Multi-Pass OCR # ============================================================================= def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str, psm: int, fallback_psm: Optional[int] = None, min_confidence: float = 40.0) -> List[Dict[str, Any]]: """Run Tesseract OCR on a specific region with given PSM. Args: ocr_img: Binarized full-page image. region: Region to crop and OCR. lang: Tesseract language string. psm: Page Segmentation Mode. fallback_psm: If confidence too low, retry with this PSM per line. min_confidence: Minimum average confidence before fallback. Returns: List of word dicts with text, position, confidence. 
""" # Crop region crop = ocr_img[region.y:region.y + region.height, region.x:region.x + region.width] if crop.size == 0: return [] # Convert to PIL for pytesseract pil_img = Image.fromarray(crop) # Run Tesseract with specified PSM config = f'--psm {psm} --oem 3' try: data = pytesseract.image_to_data(pil_img, lang=lang, config=config, output_type=pytesseract.Output.DICT) except Exception as e: logger.warning(f"Tesseract failed for region {region.type}: {e}") return [] words = [] for i in range(len(data['text'])): text = data['text'][i].strip() conf = int(data['conf'][i]) if not text or conf < 10: continue words.append({ 'text': text, 'left': data['left'][i] + region.x, # Absolute coords 'top': data['top'][i] + region.y, 'width': data['width'][i], 'height': data['height'][i], 'conf': conf, 'region_type': region.type, }) # Check average confidence if words and fallback_psm is not None: avg_conf = sum(w['conf'] for w in words) / len(words) if avg_conf < min_confidence: logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, " f"trying fallback PSM {fallback_psm}") words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm) return words def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion, lang: str, psm: int) -> List[Dict[str, Any]]: """OCR a region line by line (fallback for low-confidence regions). Splits the region into horizontal strips based on text density, then OCRs each strip individually with the given PSM. 
""" crop = ocr_img[region.y:region.y + region.height, region.x:region.x + region.width] if crop.size == 0: return [] # Find text lines via horizontal projection inv = cv2.bitwise_not(crop) h_proj = np.sum(inv, axis=1) threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0 # Find line boundaries lines = [] in_text = False line_start = 0 for y in range(len(h_proj)): if h_proj[y] > threshold and not in_text: line_start = y in_text = True elif h_proj[y] <= threshold and in_text: if y - line_start > 5: # Minimum line height lines.append((line_start, y)) in_text = False if in_text and len(h_proj) - line_start > 5: lines.append((line_start, len(h_proj))) all_words = [] config = f'--psm {psm} --oem 3' for line_y_start, line_y_end in lines: # Add small padding pad = 3 y1 = max(0, line_y_start - pad) y2 = min(crop.shape[0], line_y_end + pad) line_crop = crop[y1:y2, :] if line_crop.size == 0: continue pil_img = Image.fromarray(line_crop) try: data = pytesseract.image_to_data(pil_img, lang=lang, config=config, output_type=pytesseract.Output.DICT) except Exception: continue for i in range(len(data['text'])): text = data['text'][i].strip() conf = int(data['conf'][i]) if not text or conf < 10: continue all_words.append({ 'text': text, 'left': data['left'][i] + region.x, 'top': data['top'][i] + region.y + y1, 'width': data['width'][i], 'height': data['height'][i], 'conf': conf, 'region_type': region.type, }) return all_words def run_multi_pass_ocr(ocr_img: np.ndarray, regions: List[PageRegion], lang: str = "eng+deu") -> Dict[str, List[Dict]]: """Run OCR on each detected region with optimized settings. Args: ocr_img: Binarized full-page image. regions: Detected page regions. lang: Default language. Returns: Dict mapping region type to list of word dicts. 
""" results: Dict[str, List[Dict]] = {} _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} for region in regions: if region.type in _ocr_skip: continue # Skip non-content regions if region.type == 'column_en': words = ocr_region(ocr_img, region, lang='eng', psm=4) elif region.type == 'column_de': words = ocr_region(ocr_img, region, lang='deu', psm=4) elif region.type == 'column_example': words = ocr_region(ocr_img, region, lang=lang, psm=6, fallback_psm=7, min_confidence=40.0) else: words = ocr_region(ocr_img, region, lang=lang, psm=6) results[region.type] = words logger.info(f"OCR {region.type}: {len(words)} words") return results # ============================================================================= # Stage 7: Line Alignment → Vocabulary Entries # ============================================================================= def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], regions: List[PageRegion], y_tolerance_px: int = 25) -> List[VocabRow]: """Align OCR results from different columns into vocabulary rows. Uses Y-coordinate matching to pair English words, German translations, and example sentences that appear on the same line. Args: ocr_results: Dict mapping region type to word lists. regions: Detected regions (for reference). y_tolerance_px: Max Y-distance to consider words on the same row. Returns: List of VocabRow objects. """ # If no vocabulary columns detected (e.g. 
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects.
    """
    # Nothing to align when neither vocabulary column was detected
    # (e.g. a plain text page).
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def center_y(words: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in words) / len(words)

    def joined(words: List[Dict]) -> str:
        return ' '.join(w['text'] for w in words)

    def mean_conf(words: List[Dict]) -> float:
        return sum(w['conf'] for w in words) / len(words) if words else 0

    def nearest(candidates: List[List[Dict]], y: float) -> Tuple[str, float]:
        # Closest line within tolerance, or ("", 0.0) when none qualifies.
        best_idx = -1
        best_dist = float('inf')
        for idx, cand in enumerate(candidates):
            dist = abs(center_y(cand) - y)
            if dist < y_tolerance_px and dist < best_dist:
                best_dist = dist
                best_idx = idx
        if best_idx < 0:
            return "", 0.0
        return joined(candidates[best_idx]), mean_conf(candidates[best_idx])

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []
    for en_line in en_lines:
        en_y = center_y(en_line)
        en_text = joined(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        de_text, de_conf = nearest(de_lines, en_y)
        ex_text, ex_conf = nearest(ex_lines, en_y)

        # Average confidence over the columns that actually matched
        conf_sum = mean_conf(en_line)
        conf_n = 1
        if de_conf > 0:
            conf_sum += de_conf
            conf_n += 1
        if ex_conf > 0:
            conf_sum += ex_conf
            conf_n += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=conf_sum / conf_n,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in the example column: an example line with
    # no matching EN/DE row is appended to the nearest preceding entry.
    matched_ex_ys = {row.y_position for row in vocab_rows if row.example}
    for ex_line in ex_lines:
        ex_y = center_y(ex_line)
        if any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys):
            continue  # already matched to a row
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row
        if best_row:
            continuation = joined(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)
    return vocab_rows
Replace text if LLM provides a confident correction logger.info(f"LLM post-correction skipped (not yet implemented)") return vocab_rows # ============================================================================= # Orchestrator # ============================================================================= async def run_cv_pipeline( pdf_data: Optional[bytes] = None, image_data: Optional[bytes] = None, page_number: int = 0, zoom: float = 3.0, enable_dewarp: bool = True, enable_llm_correction: bool = False, lang: str = "eng+deu", ) -> PipelineResult: """Run the complete CV document reconstruction pipeline. Args: pdf_data: Raw PDF bytes (mutually exclusive with image_data). image_data: Raw image bytes (mutually exclusive with pdf_data). page_number: 0-indexed page number (for PDF). zoom: PDF rendering zoom factor. enable_dewarp: Whether to run dewarp stage. enable_llm_correction: Whether to run LLM post-correction. lang: Tesseract language string. Returns: PipelineResult with vocabulary and timing info. 
""" if not CV_PIPELINE_AVAILABLE: return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)") result = PipelineResult() total_start = time.time() try: # Stage 1: Render t = time.time() if pdf_data: img = render_pdf_high_res(pdf_data, page_number, zoom) elif image_data: img = render_image_high_res(image_data) else: return PipelineResult(error="No input data (pdf_data or image_data required)") result.stages['render'] = round(time.time() - t, 2) result.image_width = img.shape[1] result.image_height = img.shape[0] logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s") # Stage 2: Deskew t = time.time() img, angle = deskew_image(img) result.stages['deskew'] = round(time.time() - t, 2) logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s") # Stage 3: Dewarp if enable_dewarp: t = time.time() img, _dewarp_info = dewarp_image(img) result.stages['dewarp'] = round(time.time() - t, 2) # Stage 4: Dual image preparation t = time.time() ocr_img = create_ocr_image(img) layout_img = create_layout_image(img) result.stages['image_prep'] = round(time.time() - t, 2) # Stage 5: Layout analysis t = time.time() regions = analyze_layout(layout_img, ocr_img) result.stages['layout'] = round(time.time() - t, 2) result.columns_detected = len([r for r in regions if r.type.startswith('column')]) logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s") # Stage 6: Multi-pass OCR t = time.time() ocr_results = run_multi_pass_ocr(ocr_img, regions, lang) result.stages['ocr'] = round(time.time() - t, 2) total_words = sum(len(w) for w in ocr_results.values()) result.word_count = total_words logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s") # Stage 7: Line alignment t = time.time() vocab_rows = match_lines_to_vocab(ocr_results, regions) result.stages['alignment'] = round(time.time() - t, 2) # Stage 8: Optional LLM correction if enable_llm_correction: 
t = time.time() vocab_rows = await llm_post_correct(img, vocab_rows) result.stages['llm_correction'] = round(time.time() - t, 2) # Convert to output format result.vocabulary = [ { "english": row.english, "german": row.german, "example": row.example, "confidence": round(row.confidence, 1), } for row in vocab_rows if row.english or row.german # Skip empty rows ] result.duration_seconds = round(time.time() - total_start, 2) logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s") except Exception as e: logger.error(f"CV Pipeline error: {e}") import traceback logger.debug(traceback.format_exc()) result.error = str(e) result.duration_seconds = round(time.time() - total_start, 2) return result # --------------------------------------------------------------------------- # LLM-based OCR Correction (Step 6) # --------------------------------------------------------------------------- import httpx import os import json as _json import re as _re _OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b") _REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20")) logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE) # Regex: entry contains IPA phonetic brackets like "dance [dɑːns]" _HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]') # Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion. # Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B) # when they appear inside or next to a word character. _OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])') def _entry_needs_review(entry: Dict) -> bool: """Check if an entry should be sent to the LLM for review. Sends all non-empty entries that don't have IPA phonetic transcriptions. The LLM prompt and _is_spurious_change() guard against unwanted changes. 
""" en = entry.get("english", "") or "" de = entry.get("german", "") or "" # Skip completely empty entries if not en.strip() and not de.strip(): return False # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): return False return True def _build_llm_prompt(table_lines: List[Dict]) -> str: """Build the LLM correction prompt for a batch of entries.""" return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. NUR diese Korrekturen sind erlaubt: - Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball" - Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old" - Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin" - Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See" - Ziffer 6 statt G oder g: "6eld" → "Geld" - Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help" ABSOLUT VERBOTEN — aendere NIEMALS: - Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst - Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN - Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst - Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest - Eigennamen: Ben, London, China, Africa, Shakespeare usw. - Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. - Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren - Beispielsaetze in der ex-Spalte — NIEMALS aendern Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). 
/no_think Eingabe: {_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" def _is_spurious_change(old_val: str, new_val: str) -> bool: """Detect LLM changes that are likely wrong and should be discarded. Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are legitimate OCR corrections. Everything else is rejected. Filters out: - Case-only changes - Changes that don't contain any digit→letter fix - Completely different words (LLM translating or hallucinating) - Additions or removals of whole words (count changed) """ if not old_val or not new_val: return False # Case-only change — never a real OCR error if old_val.lower() == new_val.lower(): return True # If the word count changed significantly, the LLM rewrote rather than fixed old_words = old_val.split() new_words = new_val.split() if abs(len(old_words) - len(new_words)) > 1: return True # Core rule: a legitimate correction replaces a digit with the corresponding # letter. If the change doesn't include such a substitution, reject it. # Build a set of (old_char, new_char) pairs that differ between old and new. # Use character-level diff heuristic: if lengths are close, zip and compare. # Map of characters that OCR commonly misreads → set of correct replacements _OCR_CHAR_MAP = { # Digits mistaken for letters '0': set('oOgG'), '1': set('lLiI'), '5': set('sS'), '6': set('gG'), '8': set('bB'), # Non-letter symbols mistaken for letters '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1 'l': set('iI|1'), # lowercase l → capital I (and reverse) } has_valid_fix = False if len(old_val) == len(new_val): for oc, nc in zip(old_val, new_val): if oc != nc: if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: has_valid_fix = True elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: # Reverse check (e.g. 
l→I where new is the "correct" char) has_valid_fix = True else: # Length changed by 1: accept if old had a suspicious char sequence _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]') if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): has_valid_fix = True if not has_valid_fix: return True # Reject — looks like translation or hallucination return False def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" changes = [] entries_out = [] for i, orig in enumerate(originals): if i < len(corrected): c = corrected[i] entry = dict(orig) for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: new_val = c.get(key, "").strip() old_val = (orig.get(field_name, "") or "").strip() if new_val and new_val != old_val: # Filter spurious LLM changes if _is_spurious_change(old_val, new_val): continue changes.append({ "row_index": orig.get("row_index", i), "field": field_name, "old": old_val, "new": new_val, }) entry[field_name] = new_val entry["llm_corrected"] = True entries_out.append(entry) else: entries_out.append(dict(orig)) return changes, entries_out # ─── Spell-Checker OCR Review (Rule-Based, no LLM) ──────────────────────────── REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm" try: from spellchecker import SpellChecker as _SpellChecker _en_spell = _SpellChecker(language='en', distance=1) _de_spell = _SpellChecker(language='de', distance=1) _SPELL_AVAILABLE = True logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE) except ImportError: _SPELL_AVAILABLE = False logger.warning("pyspellchecker not installed — falling back to LLM review") # ─── Page-Ref Normalization ─────────────────────────────────────────────────── # Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60" _PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE) def 
def _normalize_page_ref(text: str) -> str:
    """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
    if not text:
        return text
    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)


# Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')


def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    w = word.lower()
    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))


def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 4 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all
        # lowercase letters → replace with the most likely letter.
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                candidate = _SPELL_SUBS[first][0] + rest
                if not candidate[0].isdigit():
                    return candidate

    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u).
    # Try single-char umlaut substitutions and check against dictionary.
    if len(token) >= 3 and token.isalpha() and field == "german":
        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                        'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes)
    # e.g. "beautful" → "beautiful"
    if not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction

    return None


def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field.

    Returns (fixed_text, was_changed).
    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
    dictionary selection.
    """
    if not text:
        return text, False

    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # If no suspicious chars AND no alpha chars that could be misspelled, skip
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False

    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    # NOTE(review): the remainder of this function was corrupted in the source
    # (an angle-bracketed regex swallowed everything up to the next function's
    # signature). Reconstructed from the surrounding comments and helpers —
    # confirm against version control.
    fixed = _re.sub(r'(?<!\S)\|(?=[.,])', '1', text)
    changed = fixed != text

    # Fix each whitespace-delimited token via _spell_fix_token; whitespace-based
    # tokenization keeps digit-led tokens like "8en" intact for substitution.
    pieces: List[str] = []
    for word in fixed.split(' '):
        repl = _spell_fix_token(word, field=field) if word else None
        if repl is not None and repl != word:
            pieces.append(repl)
            changed = True
        else:
            pieces.append(word)
    return ' '.join(pieces), changed
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.

    FIX: the ``def`` line was lost to source corruption; the signature is
    reconstructed from the call site in spell_review_entries_streaming
    (takes the entry list, returns the review-result dict).
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
    for i, entry in enumerate(entries):
        e = dict(entry)

        # Page-ref normalization (always, regardless of review status)
        old_ref = (e.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": "source_page",
                    "old": old_ref,
                    "new": new_ref,
                })
                e["source_page"] = new_ref
                e["llm_corrected"] = True

        if not _entry_needs_review(e):
            all_corrected.append(e)
            continue

        for field_name in ("english", "german", "example"):
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
            # example field is mixed-language — try German first (for umlauts)
            lang = "german" if field_name in ("german", "example") else "english"
            new_val, was_changed = _spell_fix_field(old_val, field=lang)
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                e[field_name] = new_val
                e["llm_corrected"] = True
        all_corrected.append(e)

    duration_ms = int((time.time() - t0) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        # NOTE(review): skipped_count is reported as 0 even though
        # _entry_needs_review may skip entries — confirm intended semantics.
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }


async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review."""
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    changes = result["changes"]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
        "changes": changes,
        "duration_ms": result["duration_ms"],
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": changes,
        "model_used": "spell-checker",
        "duration_ms": result["duration_ms"],
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(changes),
        "entries_corrected": result["entries_corrected"],
    }

# ─── End Spell-Checker ────────────────────────────────────────────────────────


async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL
    # Filter: only entries that need review
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""),
         "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]
    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)
    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)
    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }
logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content)) logger.debug("LLM review raw response (first 500): %.500s", content) corrected = _parse_llm_json_array(content) logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected)) changes, corrected_entries = _diff_batch(review_entries, corrected) # Merge corrected entries back into the full list all_corrected = [dict(e) for e in entries] for batch_idx, (orig_idx, _) in enumerate(reviewable): if batch_idx < len(corrected_entries): all_corrected[orig_idx] = corrected_entries[batch_idx] return { "entries_original": entries, "entries_corrected": all_corrected, "changes": changes, "skipped_count": len(entries) - len(reviewable), "model_used": model, "duration_ms": duration_ms, } async def llm_review_entries_streaming( entries: List[Dict], model: str = None, batch_size: int = _REVIEW_BATCH_SIZE, ): """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE. Phase 0 (always): Run _fix_character_confusion and emit any changes so they are visible in the UI — this is the only place the fix now runs (removed from Step 1 of build_vocab_pipeline_streaming). """ # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) 
--- _CONF_FIELDS = ('english', 'german', 'example') originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries] _fix_character_confusion(entries) # modifies in-place, returns same list char_changes = [ {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')} for i in range(len(entries)) for f in _CONF_FIELDS if originals[i][f] != entries[i].get(f, '') ] if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: # Inject char_changes as a batch right after the meta event from the spell checker _meta_sent = False async for event in spell_review_entries_streaming(entries, batch_size): yield event if not _meta_sent and event.get('type') == 'meta' and char_changes: _meta_sent = True yield { 'type': 'batch', 'changes': char_changes, 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), 'progress': {'current': 0, 'total': len(entries)}, } return if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") # LLM path: emit char_changes first (before meta) so they appear in the UI if char_changes: yield { 'type': 'batch', 'changes': char_changes, 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), 'progress': {'current': 0, 'total': len(entries)}, } model = model or OLLAMA_REVIEW_MODEL # Separate reviewable from skipped entries reviewable = [] skipped_indices = [] for i, e in enumerate(entries): if _entry_needs_review(e): reviewable.append((i, e)) else: skipped_indices.append(i) total_to_review = len(reviewable) # meta event yield { "type": "meta", "total_entries": len(entries), "to_review": total_to_review, "skipped": len(skipped_indices), "model": model, "batch_size": batch_size, } all_changes = [] all_corrected = [dict(e) for e in entries] total_duration_ms = 0 reviewed_count = 0 # Process in batches for batch_start in range(0, total_to_review, batch_size): batch_items = reviewable[batch_start:batch_start + batch_size] batch_entries = [e 
for _, e in batch_items] table_lines = [ {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} for e in batch_entries ] prompt = _build_llm_prompt(table_lines) logger.info("LLM review streaming: batch %d — sending %d entries to %s", batch_start // batch_size, len(batch_entries), model) t0 = time.time() async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post( f"{_OLLAMA_URL}/api/chat", json={ "model": model, "messages": [{"role": "user", "content": prompt}], "stream": False, "think": False, # qwen3: disable chain-of-thought "options": {"temperature": 0.1, "num_predict": 8192}, }, ) resp.raise_for_status() content = resp.json().get("message", {}).get("content", "") batch_ms = int((time.time() - t0) * 1000) total_duration_ms += batch_ms logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content)) logger.debug("LLM review streaming raw (first 500): %.500s", content) corrected = _parse_llm_json_array(content) logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected)) batch_changes, batch_corrected = _diff_batch(batch_entries, corrected) # Merge back for batch_idx, (orig_idx, _) in enumerate(batch_items): if batch_idx < len(batch_corrected): all_corrected[orig_idx] = batch_corrected[batch_idx] all_changes.extend(batch_changes) reviewed_count += len(batch_items) # Yield batch result yield { "type": "batch", "batch_index": batch_start // batch_size, "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items], "changes": batch_changes, "duration_ms": batch_ms, "progress": {"current": reviewed_count, "total": total_to_review}, } # Complete event yield { "type": "complete", "changes": all_changes, "model_used": model, "duration_ms": total_duration_ms, "total_entries": len(entries), "reviewed": total_to_review, "skipped": len(skipped_indices), "corrections_found": len(all_changes), "entries_corrected": all_corrected, } def 
_sanitize_for_json(text: str) -> str: """Remove or escape control characters that break JSON parsing. Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid JSON whitespace. Removes all other ASCII control characters (0x00-0x1f) that are only valid inside JSON strings when properly escaped. """ # Replace literal control chars (except \\t \\n \\r) with a space return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text) def _parse_llm_json_array(text: str) -> List[Dict]: """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags).""" # Strip qwen3 ... blocks (present even with think=False on some builds) text = _re.sub(r'.*?', '', text, flags=_re.DOTALL) # Strip markdown code fences text = _re.sub(r'```json\s*', '', text) text = _re.sub(r'```\s*', '', text) # Sanitize control characters before JSON parsing text = _sanitize_for_json(text) # Find first [ ... last ] match = _re.search(r'\[.*\]', text, _re.DOTALL) if match: try: return _json.loads(match.group()) except (ValueError, _json.JSONDecodeError) as e: logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200]) else: logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200]) return []