Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable.
Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletion


@@ -0,0 +1,388 @@
"""
CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.
Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""
import json
import logging
import os
import re
import time
from typing import Dict, List, Optional, Tuple
import httpx
logger = logging.getLogger(__name__)
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"
# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')
# Regex: digit adjacent to a letter -- OCR digit<->letter confusion
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
def _entry_needs_review(entry: Dict) -> bool:
"""Check if an entry should be sent for review.
Sends all non-empty entries that don't have IPA phonetic transcriptions.
"""
en = entry.get("english", "") or ""
de = entry.get("german", "") or ""
if not en.strip() and not de.strip():
return False
if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
return False
return True
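# Illustrative gating examples (hypothetical entries, not pipeline data):
#   _entry_needs_review({"english": "dance [dɑːns]", "german": "tanzen"})  -> False (has IPA)
#   _entry_needs_review({"english": "8all", "german": "Ball"})             -> True
#   _entry_needs_review({"english": "", "german": ""})                     -> False (empty)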
def _build_llm_prompt(table_lines: List[Dict]) -> str:
"""Build the LLM correction prompt for a batch of entries."""
return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"
ABSOLUT VERBOTEN -- aendere NIEMALS:
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern
Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
/no_think
Eingabe:
{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
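# Expected response shape (illustrative): keys mirror the input "row"/"en"/"de"/"ex"
# fields that _diff_batch reads back, plus the "corrected" flag the prompt requests
# (the flag itself is not evaluated by _diff_batch):
#   [
#     {"row": 0, "en": "Ball", "de": "Ball", "ex": "", "corrected": true},
#     {"row": 1, "en": "house", "de": "Haus", "ex": "", "corrected": false}
#   ]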
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
legitimate OCR corrections. Everything else is rejected.
"""
if not old_val or not new_val:
return False
if old_val.lower() == new_val.lower():
return True
old_words = old_val.split()
new_words = new_val.split()
if abs(len(old_words) - len(new_words)) > 1:
return True
_OCR_CHAR_MAP = {
'0': set('oOgG'),
'1': set('lLiI'),
'5': set('sS'),
'6': set('gG'),
'8': set('bB'),
'|': set('lLiI1'),
'l': set('iI|1'),
}
has_valid_fix = False
if len(old_val) == len(new_val):
for oc, nc in zip(old_val, new_val):
if oc != nc:
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
has_valid_fix = True
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
has_valid_fix = True
else:
_OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
has_valid_fix = True
if not has_valid_fix:
return True
return False
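# Illustrative outcomes (hypothetical values):
#   _is_spurious_change("8all", "Ball")   -> False  (8->B is a plausible OCR fix, kept)
#   _is_spurious_change("house", "Haus")  -> True   (a translation, not an OCR fix)
#   _is_spurious_change("Ball", "ball")   -> True   (case-only change, discarded)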
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = []
entries_out = []
for i, orig in enumerate(originals):
if i < len(corrected):
c = corrected[i]
entry = dict(orig)
for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
                # the LLM may omit keys or return non-strings; coerce defensively
                new_val = str(c.get(key) or "").strip()
old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val:
if _is_spurious_change(old_val, new_val):
continue
changes.append({
"row_index": orig.get("row_index", i),
"field": field_name,
"old": old_val,
"new": new_val,
})
entry[field_name] = new_val
entry["llm_corrected"] = True
entries_out.append(entry)
else:
entries_out.append(dict(orig))
return changes, entries_out
def _sanitize_for_json(text: str) -> str:
"""Remove or escape control characters that break JSON parsing."""
return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
def _parse_llm_json_array(text: str) -> List[Dict]:
"""Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
text = re.sub(r'```json\s*', '', text)
text = re.sub(r'```\s*', '', text)
text = _sanitize_for_json(text)
match = re.search(r'\[.*\]', text, re.DOTALL)
if match:
try:
return json.loads(match.group())
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group())
else:
logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
return []
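# Example (hedged, made-up response text): a typical qwen3 reply survives the
# cleanup steps above:
#   _parse_llm_json_array('<think>...</think>```json\n[{"row": 0, "en": "Ball"}]\n```')
#   -> [{"row": 0, "en": "Ball"}]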
async def llm_review_entries(
entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
return spell_review_entries_sync(entries)
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
model = model or OLLAMA_REVIEW_MODEL
reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
if not reviewable:
return {
"entries_original": entries,
"entries_corrected": [dict(e) for e in entries],
"changes": [],
"skipped_count": len(entries),
"model_used": model,
"duration_ms": 0,
}
review_entries = [e for _, e in reviewable]
table_lines = [
{"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
for e in review_entries
]
logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
len(review_entries), len(entries), model, len(entries) - len(reviewable))
prompt = _build_llm_prompt(table_lines)
t0 = time.time()
async with httpx.AsyncClient(timeout=300.0) as client:
resp = await client.post(
f"{_OLLAMA_URL}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"think": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
)
resp.raise_for_status()
content = resp.json().get("message", {}).get("content", "")
duration_ms = int((time.time() - t0) * 1000)
logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
corrected = _parse_llm_json_array(content)
changes, corrected_entries = _diff_batch(review_entries, corrected)
all_corrected = [dict(e) for e in entries]
for batch_idx, (orig_idx, _) in enumerate(reviewable):
if batch_idx < len(corrected_entries):
all_corrected[orig_idx] = corrected_entries[batch_idx]
return {
"entries_original": entries,
"entries_corrected": all_corrected,
"changes": changes,
"skipped_count": len(entries) - len(reviewable),
"model_used": model,
"duration_ms": duration_ms,
}
async def llm_review_entries_streaming(
entries: List[Dict],
    model: Optional[str] = None,
batch_size: int = _REVIEW_BATCH_SIZE,
):
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
Phase 0 (always): Run _fix_character_confusion and emit any changes.
"""
from cv_ocr_engines import _fix_character_confusion
from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE
_CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
_fix_character_confusion(entries)
char_changes = [
{'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
for i in range(len(entries))
for f in _CONF_FIELDS
if originals[i][f] != entries[i].get(f, '')
]
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
_meta_sent = False
async for event in spell_review_entries_streaming(entries, batch_size):
yield event
if not _meta_sent and event.get('type') == 'meta' and char_changes:
_meta_sent = True
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
return
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
# LLM path
if char_changes:
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
model = model or OLLAMA_REVIEW_MODEL
reviewable = []
skipped_indices = []
for i, e in enumerate(entries):
if _entry_needs_review(e):
reviewable.append((i, e))
else:
skipped_indices.append(i)
total_to_review = len(reviewable)
yield {
"type": "meta",
"total_entries": len(entries),
"to_review": total_to_review,
"skipped": len(skipped_indices),
"model": model,
"batch_size": batch_size,
}
all_changes = []
all_corrected = [dict(e) for e in entries]
total_duration_ms = 0
reviewed_count = 0
for batch_start in range(0, total_to_review, batch_size):
batch_items = reviewable[batch_start:batch_start + batch_size]
batch_entries = [e for _, e in batch_items]
table_lines = [
{"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
for e in batch_entries
]
prompt = _build_llm_prompt(table_lines)
logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
batch_start // batch_size, len(batch_entries), model)
t0 = time.time()
async with httpx.AsyncClient(timeout=300.0) as client:
resp = await client.post(
f"{_OLLAMA_URL}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"think": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
)
resp.raise_for_status()
content = resp.json().get("message", {}).get("content", "")
batch_ms = int((time.time() - t0) * 1000)
total_duration_ms += batch_ms
corrected = _parse_llm_json_array(content)
batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)
for batch_idx, (orig_idx, _) in enumerate(batch_items):
if batch_idx < len(batch_corrected):
all_corrected[orig_idx] = batch_corrected[batch_idx]
all_changes.extend(batch_changes)
reviewed_count += len(batch_items)
yield {
"type": "batch",
"batch_index": batch_start // batch_size,
"entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
"changes": batch_changes,
"duration_ms": batch_ms,
"progress": {"current": reviewed_count, "total": total_to_review},
}
yield {
"type": "complete",
"changes": all_changes,
"model_used": model,
"duration_ms": total_duration_ms,
"total_entries": len(entries),
"reviewed": total_to_review,
"skipped": len(skipped_indices),
"corrections_found": len(all_changes),
"entries_corrected": all_corrected,
}
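if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test (a sketch, not shipped tooling). With the default
    # REVIEW_ENGINE=spell this exercises the spell path; set REVIEW_ENGINE=llm
    # and run a local Ollama instance to exercise the LLM path. Values are made up.
    import asyncio

    async def _demo() -> None:
        entries = [{"row_index": 0, "english": "8all", "german": "Ball", "example": ""}]
        result = await llm_review_entries(entries)
        print(result["model_used"], result["changes"])

    asyncio.run(_demo())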


@@ -0,0 +1,430 @@
"""
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""
import logging
import time
from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import (
CV_PIPELINE_AVAILABLE,
PageRegion,
PipelineResult,
VocabRow,
)
from cv_preprocessing import (
deskew_image,
dewarp_image,
render_image_high_res,
render_pdf_high_res,
)
from cv_layout import (
analyze_layout,
create_layout_image,
create_ocr_image,
)
from cv_ocr_engines import (
_group_words_into_lines,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
psm: int, fallback_psm: Optional[int] = None,
min_confidence: float = 40.0) -> List[Dict[str, Any]]:
"""Run Tesseract OCR on a specific region with given PSM.
Args:
ocr_img: Binarized full-page image.
region: Region to crop and OCR.
lang: Tesseract language string.
psm: Page Segmentation Mode.
fallback_psm: If confidence too low, retry with this PSM per line.
min_confidence: Minimum average confidence before fallback.
Returns:
List of word dicts with text, position, confidence.
"""
crop = ocr_img[region.y:region.y + region.height,
region.x:region.x + region.width]
if crop.size == 0:
return []
pil_img = Image.fromarray(crop)
config = f'--psm {psm} --oem 3'
try:
data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
output_type=pytesseract.Output.DICT)
except Exception as e:
logger.warning(f"Tesseract failed for region {region.type}: {e}")
return []
words = []
for i in range(len(data['text'])):
text = data['text'][i].strip()
conf = int(data['conf'][i])
if not text or conf < 10:
continue
words.append({
'text': text,
'left': data['left'][i] + region.x,
'top': data['top'][i] + region.y,
'width': data['width'][i],
'height': data['height'][i],
'conf': conf,
'region_type': region.type,
})
if words and fallback_psm is not None:
avg_conf = sum(w['conf'] for w in words) / len(words)
if avg_conf < min_confidence:
logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
f"trying fallback PSM {fallback_psm}")
words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)
return words
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
lang: str, psm: int) -> List[Dict[str, Any]]:
"""OCR a region line by line (fallback for low-confidence regions)."""
crop = ocr_img[region.y:region.y + region.height,
region.x:region.x + region.width]
if crop.size == 0:
return []
inv = cv2.bitwise_not(crop)
h_proj = np.sum(inv, axis=1)
threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0
lines = []
in_text = False
line_start = 0
for y in range(len(h_proj)):
if h_proj[y] > threshold and not in_text:
line_start = y
in_text = True
elif h_proj[y] <= threshold and in_text:
if y - line_start > 5:
lines.append((line_start, y))
in_text = False
if in_text and len(h_proj) - line_start > 5:
lines.append((line_start, len(h_proj)))
all_words = []
config = f'--psm {psm} --oem 3'
for line_y_start, line_y_end in lines:
pad = 3
y1 = max(0, line_y_start - pad)
y2 = min(crop.shape[0], line_y_end + pad)
line_crop = crop[y1:y2, :]
if line_crop.size == 0:
continue
pil_img = Image.fromarray(line_crop)
try:
data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
output_type=pytesseract.Output.DICT)
except Exception:
continue
for i in range(len(data['text'])):
text = data['text'][i].strip()
conf = int(data['conf'][i])
if not text or conf < 10:
continue
all_words.append({
'text': text,
'left': data['left'][i] + region.x,
'top': data['top'][i] + region.y + y1,
'width': data['width'][i],
'height': data['height'][i],
'conf': conf,
'region_type': region.type,
})
return all_words
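# Tesseract PSM reference for the per-region settings below:
#   PSM 4 = single column of variable-size text, PSM 6 = single uniform block
#   of text, PSM 7 = single text line (used as the low-confidence fallback).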
def run_multi_pass_ocr(ocr_img: np.ndarray,
regions: List[PageRegion],
lang: str = "eng+deu") -> Dict[str, List[Dict]]:
"""Run OCR on each detected region with optimized settings."""
results: Dict[str, List[Dict]] = {}
_ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
for region in regions:
if region.type in _ocr_skip:
continue
if region.type == 'column_en':
words = ocr_region(ocr_img, region, lang='eng', psm=4)
elif region.type == 'column_de':
words = ocr_region(ocr_img, region, lang='deu', psm=4)
elif region.type == 'column_example':
words = ocr_region(ocr_img, region, lang=lang, psm=6,
fallback_psm=7, min_confidence=40.0)
else:
words = ocr_region(ocr_img, region, lang=lang, psm=6)
results[region.type] = words
logger.info(f"OCR {region.type}: {len(words)} words")
return results
# =============================================================================
# Stage 7: Line Alignment -> Vocabulary Entries
# =============================================================================
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
regions: List[PageRegion],
y_tolerance_px: int = 25) -> List[VocabRow]:
"""Align OCR results from different columns into vocabulary rows."""
if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
return []
en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)
def line_y_center(line: List[Dict]) -> float:
return sum(w['top'] + w['height'] / 2 for w in line) / len(line)
def line_text(line: List[Dict]) -> str:
return ' '.join(w['text'] for w in line)
def line_confidence(line: List[Dict]) -> float:
return sum(w['conf'] for w in line) / len(line) if line else 0
vocab_rows: List[VocabRow] = []
for en_line in en_lines:
en_y = line_y_center(en_line)
en_text = line_text(en_line)
en_conf = line_confidence(en_line)
if len(en_text.strip()) < 2:
continue
de_text = ""
de_conf = 0.0
best_de_dist = float('inf')
best_de_idx = -1
for idx, de_line in enumerate(de_lines):
dist = abs(line_y_center(de_line) - en_y)
if dist < y_tolerance_px and dist < best_de_dist:
best_de_dist = dist
best_de_idx = idx
if best_de_idx >= 0:
de_text = line_text(de_lines[best_de_idx])
de_conf = line_confidence(de_lines[best_de_idx])
ex_text = ""
ex_conf = 0.0
best_ex_dist = float('inf')
best_ex_idx = -1
for idx, ex_line in enumerate(ex_lines):
dist = abs(line_y_center(ex_line) - en_y)
if dist < y_tolerance_px and dist < best_ex_dist:
best_ex_dist = dist
best_ex_idx = idx
if best_ex_idx >= 0:
ex_text = line_text(ex_lines[best_ex_idx])
ex_conf = line_confidence(ex_lines[best_ex_idx])
avg_conf = en_conf
conf_count = 1
if de_conf > 0:
avg_conf += de_conf
conf_count += 1
if ex_conf > 0:
avg_conf += ex_conf
conf_count += 1
vocab_rows.append(VocabRow(
english=en_text.strip(),
german=de_text.strip(),
example=ex_text.strip(),
confidence=avg_conf / conf_count,
y_position=int(en_y),
))
# Handle multi-line wrapping in example column
matched_ex_ys = set()
for row in vocab_rows:
if row.example:
matched_ex_ys.add(row.y_position)
for ex_line in ex_lines:
ex_y = line_y_center(ex_line)
already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
if already_matched:
continue
best_row = None
best_dist = float('inf')
for row in vocab_rows:
dist = ex_y - row.y_position
if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
best_dist = dist
best_row = row
if best_row:
continuation = line_text(ex_line).strip()
if continuation:
best_row.example = (best_row.example + " " + continuation).strip()
vocab_rows.sort(key=lambda r: r.y_position)
return vocab_rows
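# Illustrative alignment (hypothetical coordinates): an EN line at y=120 and a
# DE line at y=118 fall within y_tolerance_px and merge into one VocabRow; an
# example-column line at y=150 with no EN partner within tolerance is treated
# as a wrapped continuation and appended to the nearest row above it.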
# =============================================================================
# Stage 8: Optional LLM Post-Correction
# =============================================================================
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
confidence_threshold: float = 50.0,
enabled: bool = False) -> List[VocabRow]:
"""Optionally send low-confidence regions to Qwen-VL for correction."""
if not enabled:
return vocab_rows
logger.info(f"LLM post-correction skipped (not yet implemented)")
return vocab_rows
# =============================================================================
# Orchestrator
# =============================================================================
async def run_cv_pipeline(
pdf_data: Optional[bytes] = None,
image_data: Optional[bytes] = None,
page_number: int = 0,
zoom: float = 3.0,
enable_dewarp: bool = True,
enable_llm_correction: bool = False,
lang: str = "eng+deu",
) -> PipelineResult:
"""Run the complete CV document reconstruction pipeline."""
if not CV_PIPELINE_AVAILABLE:
return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")
result = PipelineResult()
total_start = time.time()
try:
# Stage 1: Render
t = time.time()
if pdf_data:
img = render_pdf_high_res(pdf_data, page_number, zoom)
elif image_data:
img = render_image_high_res(image_data)
else:
return PipelineResult(error="No input data (pdf_data or image_data required)")
result.stages['render'] = round(time.time() - t, 2)
result.image_width = img.shape[1]
result.image_height = img.shape[0]
logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")
# Stage 2: Deskew
t = time.time()
img, angle = deskew_image(img)
result.stages['deskew'] = round(time.time() - t, 2)
logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")
# Stage 3: Dewarp
if enable_dewarp:
t = time.time()
img, _dewarp_info = dewarp_image(img)
result.stages['dewarp'] = round(time.time() - t, 2)
# Stage 4: Dual image preparation
t = time.time()
ocr_img = create_ocr_image(img)
layout_img = create_layout_image(img)
result.stages['image_prep'] = round(time.time() - t, 2)
# Stage 5: Layout analysis
t = time.time()
regions = analyze_layout(layout_img, ocr_img)
result.stages['layout'] = round(time.time() - t, 2)
result.columns_detected = len([r for r in regions if r.type.startswith('column')])
logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")
# Stage 6: Multi-pass OCR
t = time.time()
ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
result.stages['ocr'] = round(time.time() - t, 2)
total_words = sum(len(w) for w in ocr_results.values())
result.word_count = total_words
logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")
# Stage 7: Line alignment
t = time.time()
vocab_rows = match_lines_to_vocab(ocr_results, regions)
result.stages['alignment'] = round(time.time() - t, 2)
# Stage 8: Optional LLM correction
if enable_llm_correction:
t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows, enabled=True)
result.stages['llm_correction'] = round(time.time() - t, 2)
# Convert to output format
result.vocabulary = [
{
"english": row.english,
"german": row.german,
"example": row.example,
"confidence": round(row.confidence, 1),
}
for row in vocab_rows
if row.english or row.german
]
result.duration_seconds = round(time.time() - total_start, 2)
logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")
except Exception as e:
logger.error(f"CV Pipeline error: {e}")
import traceback
logger.debug(traceback.format_exc())
result.error = str(e)
result.duration_seconds = round(time.time() - total_start, 2)
return result
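if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test (a sketch under assumptions): pass a local PDF path on
    # the command line; requires the CV pipeline dependencies (OpenCV, Tesseract)
    # to be installed, otherwise a PipelineResult with an error is printed.
    import asyncio
    import sys

    async def _demo() -> None:
        with open(sys.argv[1], "rb") as fh:
            res = await run_cv_pipeline(pdf_data=fh.read(), page_number=0)
        print(res.error or f"{len(res.vocabulary)} entries in {res.duration_seconds}s")

    asyncio.run(_demo())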


@@ -0,0 +1,46 @@
"""
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
Re-export facade -- all logic lives in the sub-modules:
cv_review_pipeline Stages 6-8: OCR, line alignment, orchestrator
cv_review_spell Rule-based spell-checker OCR correction
cv_review_llm LLM-based OCR correction, prompt building, streaming
License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""
# Re-export everything for backward compatibility
from cv_review_pipeline import ( # noqa: F401
ocr_region,
run_multi_pass_ocr,
match_lines_to_vocab,
llm_post_correct,
run_cv_pipeline,
)
from cv_review_spell import ( # noqa: F401
_SPELL_AVAILABLE,
_spell_dict_knows,
_spell_fix_field,
_spell_fix_token,
_try_split_merged_word,
_normalize_page_ref,
spell_review_entries_sync,
spell_review_entries_streaming,
)
from cv_review_llm import ( # noqa: F401
OLLAMA_REVIEW_MODEL,
REVIEW_ENGINE,
_REVIEW_BATCH_SIZE,
_build_llm_prompt,
_diff_batch,
_entry_needs_review,
_is_spurious_change,
_parse_llm_json_array,
_sanitize_for_json,
llm_review_entries,
llm_review_entries_streaming,
)
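# Illustrative usage for existing callers (the facade module name is assumed
# here; import from whatever file hosts this facade in the repository):
#   from cv_review import run_cv_pipeline, llm_review_entries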


@@ -0,0 +1,315 @@
"""
CV Review Spell — Rule-based OCR spell correction (no LLM).
Provides dictionary-backed digit-to-letter substitution, umlaut correction,
general spell correction, merged-word splitting, and page-ref normalization.
License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""
import logging
import re
import time
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
try:
from spellchecker import SpellChecker as _SpellChecker
_en_spell = _SpellChecker(language='en', distance=1)
_de_spell = _SpellChecker(language='de', distance=1)
_SPELL_AVAILABLE = True
logger.info("pyspellchecker loaded (EN+DE)")
except ImportError:
_SPELL_AVAILABLE = False
_en_spell = None # type: ignore[assignment]
_de_spell = None # type: ignore[assignment]
logger.warning("pyspellchecker not installed")
# ---- Page-Ref Normalization ----
# Normalizes OCR variants like "p-60", "p 61", "p60" -> "p.60"
_PAGE_REF_RE = re.compile(r'\bp[\s\-]?(\d+)', re.IGNORECASE)
def _normalize_page_ref(text: str) -> str:
"""Normalize page references: 'p-60' / 'p 61' / 'p60' -> 'p.60'."""
if not text:
return text
return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
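# Illustrative normalizations:
#   _normalize_page_ref("p-60")      -> "p.60"
#   _normalize_page_ref("see p 61")  -> "see p.61"
#   _normalize_page_ref("p60, p61")  -> "p.60, p.61"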
# Suspicious OCR chars -> ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
'0': ['O', 'o'],
'1': ['l', 'I'],
'5': ['S', 's'],
'6': ['G', 'g'],
'8': ['B', 'b'],
'|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = re.compile(r'([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]*)')
def _spell_dict_knows(word: str) -> bool:
"""True if word is known in EN or DE dictionary."""
if not _SPELL_AVAILABLE:
return False
w = word.lower()
return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
def _try_split_merged_word(token: str) -> Optional[str]:
"""Try to split a merged word like 'atmyschool' into 'at my school'.
Uses dynamic programming to find the shortest sequence of dictionary
words that covers the entire token. Only returns a result when the
split produces at least 2 words and ALL parts are known dictionary words.
Preserves original capitalisation by mapping back to the input string.
"""
if not _SPELL_AVAILABLE or len(token) < 4:
return None
lower = token.lower()
n = len(lower)
# dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
dp: list = [None] * (n + 1)
dp[0] = ([], 0)
for i in range(1, n + 1):
for j in range(max(0, i - 20), i):
if dp[j] is None:
continue
candidate = lower[j:i]
word_len = i - j
if word_len == 1 and candidate not in ('a', 'i'):
continue
if _spell_dict_knows(candidate):
prev_words, prev_sq = dp[j]
new_words = prev_words + [word_len]
new_sq = prev_sq + word_len * word_len
new_key = (-len(new_words), new_sq)
if dp[i] is None:
dp[i] = (new_words, new_sq)
else:
old_key = (-len(dp[i][0]), dp[i][1])
if new_key >= old_key:
dp[i] = (new_words, new_sq)
if dp[n] is None or len(dp[n][0]) < 2:
return None
result = []
pos = 0
for wlen in dp[n][0]:
result.append(token[pos:pos + wlen])
pos += wlen
logger.debug("Split merged word: %r -> %r", token, " ".join(result))
return " ".join(result)
def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
"""Return corrected form of token, or None if no fix needed/possible.
*field* is 'english' or 'german' -- used to pick the right dictionary.
"""
has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
# 1. Already known word -> no fix needed
if _spell_dict_knows(token):
return None
# 2. Digit/pipe substitution
if has_suspicious:
if token == '|':
return 'I'
for i, ch in enumerate(token):
if ch not in _SPELL_SUBS:
continue
for replacement in _SPELL_SUBS[ch]:
candidate = token[:i] + replacement + token[i + 1:]
if _spell_dict_knows(candidate):
return candidate
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                # every _SPELL_SUBS list begins with a letter, so no digit check is needed
                return _SPELL_SUBS[first][0] + rest
# 3. OCR umlaut confusion
if len(token) >= 3 and token.isalpha() and field == "german":
_UMLAUT_SUBS = {'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc'}
for i, ch in enumerate(token):
if ch in _UMLAUT_SUBS:
candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
if _spell_dict_knows(candidate):
return candidate
# 4. General spell correction for unknown words (no digits/pipes)
if not has_suspicious and len(token) >= 3 and token.isalpha():
spell = _en_spell if field == "english" else _de_spell if field == "german" else None
if spell is not None:
correction = spell.correction(token.lower())
if correction and correction != token.lower():
if token[0].isupper():
correction = correction[0].upper() + correction[1:]
if _spell_dict_knows(correction):
return correction
# 5. Merged-word split
if len(token) >= 4 and token.isalpha():
split = _try_split_merged_word(token)
if split:
return split
return None
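# Illustrative token fixes (dictionary-dependent, hence hedged):
#   _spell_fix_token("0ld", field="english")  -> "Old"    (0 -> O)
#   _spell_fix_token("5tadt", field="german") -> "Stadt"  (5 -> S)
#   _spell_fix_token("Ball", field="german")  -> None     (already a known word)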
def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
"""Apply OCR corrections to a text field. Returns (fixed_text, was_changed)."""
if not text:
return text, False
has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
if not has_suspicious and not any(c.isalpha() for c in text):
return text, False
# Pattern: | immediately before . or , -> numbered list prefix
fixed = re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
changed = fixed != text
    # Tokenize and fix word by word
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        if m.start() > pos:
            # preserve separators before the first word token (e.g. a leading "(")
            parts.append(fixed[pos:m.start()])
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token, field=field)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        parts.append(sep)
        pos = m.end()
    if pos < len(fixed):
        parts.append(fixed[pos:])
    return ''.join(parts), changed
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
"""Rule-based OCR correction: spell-checker + structural heuristics.
Deterministic -- never translates, never touches IPA, never hallucinates.
    Uses SmartSpellChecker when available for language-aware corrections with
    context-based disambiguation (a/I), multi-digit substitution, and a
    cross-language guard; otherwise it falls back to this module's legacy rules.
"""
from cv_review_llm import _entry_needs_review
t0 = time.time()
changes: List[Dict] = []
all_corrected: List[Dict] = []
# Use SmartSpellChecker if available
_smart = None
try:
from smart_spell import SmartSpellChecker
_smart = SmartSpellChecker()
logger.debug("spell_review: using SmartSpellChecker")
except Exception:
logger.debug("spell_review: SmartSpellChecker not available, using legacy")
_LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
for i, entry in enumerate(entries):
e = dict(entry)
# Page-ref normalization
old_ref = (e.get("source_page") or "").strip()
if old_ref:
new_ref = _normalize_page_ref(old_ref)
if new_ref != old_ref:
changes.append({
"row_index": e.get("row_index", i),
"field": "source_page",
"old": old_ref,
"new": new_ref,
})
e["source_page"] = new_ref
e["llm_corrected"] = True
if not _entry_needs_review(e):
all_corrected.append(e)
continue
for field_name in ("english", "german", "example"):
old_val = (e.get(field_name) or "").strip()
if not old_val:
continue
if _smart:
lang_code = _LANG_MAP.get(field_name, "en")
result = _smart.correct_text(old_val, lang=lang_code)
new_val = result.corrected
was_changed = result.changed
else:
lang = "german" if field_name in ("german", "example") else "english"
new_val, was_changed = _spell_fix_field(old_val, field=lang)
if was_changed and new_val != old_val:
changes.append({
"row_index": e.get("row_index", i),
"field": field_name,
"old": old_val,
"new": new_val,
})
e[field_name] = new_val
e["llm_corrected"] = True
all_corrected.append(e)
duration_ms = int((time.time() - t0) * 1000)
model_name = "smart-spell-checker" if _smart else "spell-checker"
return {
"entries_original": entries,
"entries_corrected": all_corrected,
"changes": changes,
"skipped_count": 0,
"model_used": model_name,
"duration_ms": duration_ms,
}
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
"""Async generator yielding SSE-compatible events for spell-checker review."""
total = len(entries)
yield {
"type": "meta",
"total_entries": total,
"to_review": total,
"skipped": 0,
"model": "spell-checker",
"batch_size": batch_size,
}
result = spell_review_entries_sync(entries)
changes = result["changes"]
yield {
"type": "batch",
"batch_index": 0,
"entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
"changes": changes,
"duration_ms": result["duration_ms"],
"progress": {"current": total, "total": total},
}
yield {
"type": "complete",
"changes": changes,
"model_used": "spell-checker",
"duration_ms": result["duration_ms"],
"total_entries": total,
"reviewed": total,
"skipped": 0,
"corrections_found": len(changes),
"entries_corrected": result["entries_corrected"],
}
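if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test (a sketch): stream review events for two made-up entries.
    # Requires this module's dependencies (pyspellchecker for actual fixes).
    import asyncio
    import json as _json

    async def _demo() -> None:
        entries = [
            {"row_index": 0, "english": "8all", "german": "Ball", "example": ""},
            {"row_index": 1, "english": "house", "german": "Haus", "source_page": "p-60"},
        ]
        async for event in spell_review_entries_streaming(entries):
            print(_json.dumps(event, ensure_ascii=False))

    asyncio.run(_demo())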