breakpilot-lehrer/klausur-service/backend/cv_review.py

"""
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import (
    CV_PIPELINE_AVAILABLE,
    PageRegion,
    PipelineResult,
    VocabRow,
)
from cv_preprocessing import (
    deskew_image,
    dewarp_image,
    render_image_high_res,
    render_pdf_high_res,
)
from cv_layout import (
    analyze_layout,
    create_layout_image,
    create_ocr_image,
)
from cv_ocr_engines import (
    _fix_character_confusion,
    _group_words_into_lines,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]


# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================

def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence.
    """
    # Crop region
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Convert to PIL for pytesseract
    pil_img = Image.fromarray(crop)

    # Run Tesseract with specified PSM
    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + region.x,  # Absolute coords
            'top': data['top'][i] + region.y,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })

    # Check average confidence
    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words


def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                              lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Splits the region into horizontal strips based on text density,
    then OCRs each strip individually with the given PSM.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Find text lines via horizontal projection
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0

    # Find line boundaries
    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:  # Minimum line height
                lines.append((line_start, y))
            in_text = False
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))

    all_words = []
    config = f'--psm {psm} --oem 3'

    for line_y_start, line_y_end in lines:
        # Add small padding
        pad = 3
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]

        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            continue

        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            conf = int(data['conf'][i])
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                'left': data['left'][i] + region.x,
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })

    return all_words


def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Args:
        ocr_img: Binarized full-page image.
        regions: Detected page regions.
        lang: Default language.

    Returns:
        Dict mapping region type to list of word dicts.
    """
    results: Dict[str, List[Dict]] = {}

    _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    for region in regions:
        if region.type in _ocr_skip:
            continue  # Skip non-content regions

        if region.type == 'column_en':
            words = ocr_region(ocr_img, region, lang='eng', psm=4)
        elif region.type == 'column_de':
            words = ocr_region(ocr_img, region, lang='deu', psm=4)
        elif region.type == 'column_example':
            words = ocr_region(ocr_img, region, lang=lang, psm=6,
                              fallback_psm=7, min_confidence=40.0)
        else:
            words = ocr_region(ocr_img, region, lang=lang, psm=6)

        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results


# =============================================================================
# Stage 7: Line Alignment → Vocabulary Entries
# =============================================================================

def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                          regions: List[PageRegion],
                          y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest previous vocab row
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows


# =============================================================================
# Stage 8: Optional LLM Post-Correction
# =============================================================================

async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Default: disabled. Enable per parameter.

    Args:
        img: Original BGR image.
        vocab_rows: Current vocabulary rows.
        confidence_threshold: Rows below this get LLM correction.
        enabled: Whether to actually run LLM correction.

    Returns:
        Corrected vocabulary rows.
    """
    if not enabled:
        return vocab_rows

    # TODO: Implement Qwen-VL correction for low-confidence entries
    # For each row with confidence < threshold:
    #   1. Crop the relevant region from img
    #   2. Send crop + OCR text to Qwen-VL
    #   3. Replace text if LLM provides a confident correction
    logger.info(f"LLM post-correction skipped (not yet implemented)")
    return vocab_rows


# =============================================================================
# Orchestrator
# =============================================================================

async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result


# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------

import httpx
import os
import json as _json
import re as _re

_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')

# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
# when they appear inside or next to a word character.
_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')


def _entry_needs_review(entry: Dict) -> bool:
    """Check if an entry should be sent to the LLM for review.

    Sends all non-empty entries that don't have IPA phonetic transcriptions.
    The LLM prompt and _is_spurious_change() guard against unwanted changes.
    """
    en = entry.get("english", "") or ""
    de = entry.get("german", "") or ""

    # Skip completely empty entries
    if not en.strip() and not de.strip():
        return False
    # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them
    if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
        return False
    return True


def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Build the LLM correction prompt for a batch of entries."""
    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
- Ziffer 6 statt G oder g: "6eld" → "Geld"
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"

ABSOLUT VERBOTEN — aendere NIEMALS:
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte — NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""


def _is_spurious_change(old_val: str, new_val: str) -> bool:
    """Detect LLM changes that are likely wrong and should be discarded.

    Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
    legitimate OCR corrections. Everything else is rejected.

    Filters out:
    - Case-only changes
    - Changes that don't contain any digit→letter fix
    - Completely different words (LLM translating or hallucinating)
    - Additions or removals of whole words (count changed)
    """
    if not old_val or not new_val:
        return False

    # Case-only change — never a real OCR error
    if old_val.lower() == new_val.lower():
        return True

    # If the word count changed significantly, the LLM rewrote rather than fixed
    old_words = old_val.split()
    new_words = new_val.split()
    if abs(len(old_words) - len(new_words)) > 1:
        return True

    # Core rule: a legitimate correction replaces a digit with the corresponding
    # letter. If the change doesn't include such a substitution, reject it.
    # Build a set of (old_char, new_char) pairs that differ between old and new.
    # Use character-level diff heuristic: if lengths are close, zip and compare.
    # Map of characters that OCR commonly misreads → set of correct replacements
    _OCR_CHAR_MAP = {
        # Digits mistaken for letters
        '0': set('oOgG'),
        '1': set('lLiI'),
        '5': set('sS'),
        '6': set('gG'),
        '8': set('bB'),
        # Non-letter symbols mistaken for letters
        '|': set('lLiI1'),  # pipe → lowercase l, capital I, or digit 1
        'l': set('iI|1'),   # lowercase l → capital I (and reverse)
    }
    has_valid_fix = False
    if len(old_val) == len(new_val):
        for oc, nc in zip(old_val, new_val):
            if oc != nc:
                if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
                    has_valid_fix = True
                elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
                    # Reverse check (e.g. l→I where new is the "correct" char)
                    has_valid_fix = True
    else:
        # Length changed by 1: accept if old had a suspicious char sequence
        _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
            has_valid_fix = True

    if not has_valid_fix:
        return True  # Reject — looks like translation or hallucination

    return False


def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
    changes = []
    entries_out = []
    for i, orig in enumerate(originals):
        if i < len(corrected):
            c = corrected[i]
            entry = dict(orig)
            for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
                new_val = c.get(key, "").strip()
                old_val = (orig.get(field_name, "") or "").strip()
                if new_val and new_val != old_val:
                    # Filter spurious LLM changes
                    if _is_spurious_change(old_val, new_val):
                        continue
                    changes.append({
                        "row_index": orig.get("row_index", i),
                        "field": field_name,
                        "old": old_val,
                        "new": new_val,
                    })
                    entry[field_name] = new_val
                    entry["llm_corrected"] = True
            entries_out.append(entry)
        else:
            entries_out.append(dict(orig))
    return changes, entries_out


# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")   # "spell" (default) | "llm"

try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# ─── Page-Ref Normalization ───────────────────────────────────────────────────
# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)


def _normalize_page_ref(text: str) -> str:
    """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
    if not text:
        return text
    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)


# Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')


def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    w = word.lower()
    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))


def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 3 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution (existing logic)
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all lowercase letters
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                candidate = _SPELL_SUBS[first][0] + rest
                if not candidate[0].isdigit():
                    return candidate

    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
    #    Try single-char umlaut substitutions and check against dictionary.
    if len(token) >= 3 and token.isalpha() and field == "german":
        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                         'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes)
    #    e.g. "beautful" → "beautiful"
    if not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction
    return None


def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
    dictionary selection.
    """
    if not text:
        return text, False
    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # If no suspicious chars AND no alpha chars that could be misspelled, skip
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
    changed = fixed != text
    # Tokenize and fix word by word
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token, field=field)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        parts.append(sep)
        pos = m.end()
    if pos < len(fixed):
        parts.append(fixed[pos:])
    return ''.join(parts), changed


def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
    for i, entry in enumerate(entries):
        e = dict(entry)
        # Page-ref normalization (always, regardless of review status)
        old_ref = (e.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": "source_page",
                    "old": old_ref,
                    "new": new_ref,
                })
                e["source_page"] = new_ref
                e["llm_corrected"] = True
        if not _entry_needs_review(e):
            all_corrected.append(e)
            continue
        for field_name in ("english", "german", "example"):
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
            # example field is mixed-language — try German first (for umlauts)
            lang = "german" if field_name in ("german", "example") else "english"
            new_val, was_changed = _spell_fix_field(old_val, field=lang)
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                e[field_name] = new_val
                e["llm_corrected"] = True
        all_corrected.append(e)
    duration_ms = int((time.time() - t0) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }


async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review."""
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    changes = result["changes"]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
        "changes": changes,
        "duration_ms": result["duration_ms"],
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": changes,
        "model_used": "spell-checker",
        "duration_ms": result["duration_ms"],
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(changes),
        "entries_corrected": result["entries_corrected"],
    }

# ─── End Spell-Checker ────────────────────────────────────────────────────────


async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,   # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }


async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,   # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }


def _sanitize_for_json(text: str) -> str:
    """Remove or escape control characters that break JSON parsing.

    Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
    JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
    that are only valid inside JSON strings when properly escaped.
    """
    # Replace literal control chars (except \\t \\n \\r) with a space
    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)


def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
    # Strip qwen3 <think>...</think> blocks (present even with think=False on some builds)
    text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
    # Strip markdown code fences
    text = _re.sub(r'```json\s*', '', text)
    text = _re.sub(r'```\s*', '', text)
    # Sanitize control characters before JSON parsing
    text = _sanitize_for_json(text)
    # Find first [ ... last ]
    match = _re.search(r'\[.*\]', text, _re.DOTALL)
    if match:
        try:
            return _json.loads(match.group())
        except (ValueError, _json.JSONDecodeError) as e:
            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
    else:
        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
    return []