From 9a5a35bff199cf45a6cbf229bdb7ea352cdc088a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 8 Mar 2026 23:46:47 +0100 Subject: [PATCH] =?UTF-8?q?refactor:=20cv=5Fvocab=5Fpipeline.py=20in=206?= =?UTF-8?q?=20Module=20aufteilen=20(8163=20=E2=86=92=206=20+=20Fassade)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module: - cv_vocab_types.py (156 Z.): Dataklassen, Konstanten, IPA, Feature-Flags - cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp - cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation - cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning - cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung - cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_cell_grid.py | 1510 ++++ klausur-service/backend/cv_layout.py | 3036 +++++++ klausur-service/backend/cv_ocr_engines.py | 1282 +++ klausur-service/backend/cv_preprocessing.py | 1166 +++ klausur-service/backend/cv_review.py | 1184 +++ klausur-service/backend/cv_vocab_pipeline.py | 8178 +----------------- klausur-service/backend/cv_vocab_types.py | 156 + 7 files changed, 8359 insertions(+), 8153 deletions(-) create mode 100644 klausur-service/backend/cv_cell_grid.py create mode 100644 klausur-service/backend/cv_layout.py create mode 100644 klausur-service/backend/cv_ocr_engines.py create mode 100644 klausur-service/backend/cv_preprocessing.py create mode 100644 klausur-service/backend/cv_review.py create mode 100644 klausur-service/backend/cv_vocab_types.py diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py new file mode 100644 index 0000000..6e55509 --- /dev/null +++ 
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, Generator, List, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion, RowGeometry
# FIX: `ocr_region` (full-page/crop Tesseract OCR) and `RAPIDOCR_AVAILABLE`
# are used throughout this module (Tesseract branches, engine resolution in
# build_cell_grid_v2 / _streaming / legacy build_cell_grid) but were not
# imported after the monolith split → NameError at runtime.
# NOTE(review): confirm both names are exported by cv_ocr_engines (they
# lived next to the other OCR helpers in the original cv_vocab_pipeline.py).
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _attach_example_sentences,
    _clean_cell_text,
    _clean_cell_text_lite,
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment,misc]


# ---------------------------------------------------------------------------

def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column×row intersection.

    No padding beyond cell boundaries → no neighbour bleeding.

    Args:
        row_idx / col_idx: Grid coordinates used for the cell_id.
        row: Row geometry (y/height) of the cell.
        col: Column region (x/width/type) of the cell.
        ocr_img: Binarized grayscale page (Tesseract input, density check).
        img_bgr: BGR page for Rapid/TrOCR/LightOn engines (may be None).
        img_w / img_h: Full-page dimensions for bbox_pct and clamping.
        engine_name: One of 'tesseract', 'rapid', 'trocr-*', 'lighton'.
        lang: Default Tesseract language string.
        lang_map: col_type → Tesseract language override.

    Returns:
        Cell dict with text, confidence, bbox_px/bbox_pct and engine tag;
        empty text for zero-size or pixel-empty crops.
    """
    # Display bbox: exact column × row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            # <180 on the 0-255 grayscale counts as "ink"; below 0.5% dark
            # pixels the cell cannot plausibly contain text.
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        # Cell crops typically have height 35-55px but width >300px.
        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
        # For very short heights (< 80px), force 3× upscale for better OCR
        # of small characters like periods, ellipsis, and phonetic symbols.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3× upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words (OCR noise)
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Crop height as Y-tolerance keeps all words of the cell on one line.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine
    return result
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
#
# Broad columns (>= threshold): Full-page Tesseract word assignment.
#   Better for multi-word content (sentences, IPA brackets, punctuation).
#   Examples: EN vocabulary, DE translation, example sentences.
#
# Narrow columns (< threshold): Isolated cell-crop OCR.
#   Prevents neighbour bleeding from adjacent broad columns.
#   Examples: page_ref, marker, numbering columns.
#
# 15% was empirically validated across vocab table scans with 3-5 columns.
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
# The 15% boundary cleanly separates the two groups.
_NARROW_COL_THRESHOLD_PCT = 15.0

# FIX: build_cell_grid_v2's Phase 1 filters on `_MIN_WORD_CONF`, but after the
# monolith split the constant only existed as a *local* inside
# _ocr_cell_crop / _ocr_single_cell → NameError at module scope. Define it
# module-level (same value, 30) so the broad-column filter works again.
_MIN_WORD_CONF = 30


def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.

    Strategy:
      - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
        words (from row.words). Handles IPA brackets, punctuation, sentence
        continuity correctly.
      - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
        neighbour bleeding from adjacent broad columns.

    Returns:
        (cells, columns_meta) — cells sorted by (row_index, col_index);
        all-empty rows removed. Both lists empty when no usable rows/columns.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    y_tol = max(15, row.height)
                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0

                # Apply noise filter
                text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract is CPU-bound per process call → more workers pay off;
        # heavier engines (TrOCR/LightOn/Rapid) get fewer to limit memory.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    # Bold detection disabled: cell-level stroke-width analysis cannot
    # distinguish bold from non-bold when cells contain mixed formatting
    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
    # TODO: word-level bold detection would require per-word bounding boxes.

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"
    else:
        if ocr_engine == "rapid":
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        # covers "auto" and any unknown engine string as well
        engine_name = "tesseract"

    # Keep only real content rows that actually carry detected words.
    rows = [r for r in row_geometries
            if r.row_type == 'content' and r.word_count > 0]
    if not rows:
        return

    excluded_types = {'column_ignore', 'header', 'footer', 'margin_top',
                      'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in excluded_types]
    if not cols:
        return

    rows = [r for r in rows if not _is_artifact_row(r)]
    if not rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
    rows.sort(key=lambda r: r.y)
    headers = [r for r in row_geometries if r.row_type == 'header']
    footers = [r for r in row_geometries if r.row_type == 'footer']
    top_bound = (max(r.y + r.height for r in headers)
                 if headers else rows[0].y)
    bottom_bound = (min(r.y for r in footers)
                    if footers else rows[-1].y + rows[-1].height)

    _heal_row_gaps(rows, top_bound=top_bound, bottom_bound=bottom_bound)

    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': i, 'type': c.type, 'x': c.x, 'width': c.width}
        for i, c in enumerate(cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for ri, row in enumerate(rows):
        for ci, col in enumerate(cols):
            cell = _ocr_cell_crop(
                ri, ci, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells


# ---------------------------------------------------------------------------
# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
# ---------------------------------------------------------------------------

def _compute_cell_padding(col_width: int, img_w: int) -> int:
    """Adaptive padding for OCR crops based on column width.

    Narrow columns (page_ref, marker) need more surrounding context so
    Tesseract can segment characters correctly. Wide columns keep the
    minimal 4 px padding to avoid pulling in neighbours.
    """
    # Unknown page width → treat the column as wide (minimal padding).
    pct = (col_width / img_w * 100) if img_w > 0 else 100
    if pct >= 15:
        return 4
    if pct >= 10:
        return 8
    if pct >= 5:
        return max(12, col_width // 4)
    return max(20, col_width // 2)
+ """ + col_pct = col_width / img_w * 100 if img_w > 0 else 100 + if col_pct < 5: + return max(20, col_width // 2) + if col_pct < 10: + return max(12, col_width // 4) + if col_pct < 15: + return 8 + return 4 + + +def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150, + max_scale: int = 3) -> np.ndarray: + """Upscale tiny crops so Tesseract gets enough pixel data. + + If either dimension is below *min_dim*, the crop is bicubic-upscaled + so the smallest dimension reaches *min_dim* (capped at *max_scale* ×). + """ + h, w = crop.shape[:2] + if h >= min_dim and w >= min_dim: + return crop + scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1))) + if scale <= 1.0: + return crop + new_w = int(w * scale) + new_h = int(h * scale) + return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + + +def _select_psm_for_column(col_type: str, col_width: int, + row_height: int) -> int: + """Choose the best Tesseract PSM for a given column geometry. + + - page_ref columns are almost always single short tokens → PSM 8 + - Very narrow or short cells → PSM 7 (single text line) + - Everything else → PSM 6 (uniform block) + """ + if col_type in ('page_ref', 'marker'): + return 8 # single word + if col_width < 100 or row_height < 30: + return 7 # single line + return 6 # uniform block + + +def _ocr_single_cell( + row_idx: int, + col_idx: int, + row: RowGeometry, + col: PageRegion, + ocr_img: np.ndarray, + img_bgr: Optional[np.ndarray], + img_w: int, + img_h: int, + use_rapid: bool, + engine_name: str, + lang: str, + lang_map: Dict[str, str], + preassigned_words: Optional[List[Dict]] = None, +) -> Dict[str, Any]: + """Populate a single cell (column x row intersection) via word lookup.""" + # Display bbox: exact column × row intersection (no padding) + disp_x = col.x + disp_y = row.y + disp_w = col.width + disp_h = row.height + + # OCR crop: adaptive padding — narrow columns get more context + pad = _compute_cell_padding(col.width, img_w) + cell_x = 
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Primary source is *preassigned_words* (full-page Tesseract words already
    assigned to this cell); three OCR fallbacks run for still-empty cells:
    padded cell-crop OCR, PSM-7 re-OCR, and a RapidOCR row-strip pass for
    narrow columns.

    FIX: the bbox_pct divisions are now guarded with `if img_w/img_h else 0`
    (consistent with _ocr_cell_crop / build_cell_grid_v2) — the unguarded
    version raised ZeroDivisionError for degenerate page dimensions even
    though `is_narrow` below already anticipates img_w == 0.
    """
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    # mirrors _NARROW_COL_THRESHOLD_PCT (15% of page width)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2) if img_w else 0,
                'y': round(row.y / img_h * 100, 2) if img_h else 0,
                'w': round(col.width / img_w * 100, 2) if img_w else 0,
                'h': round(row.height / img_h * 100, 2) if img_h else 0,
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                # Build a temporary full-size image with the upscaled crop
                # placed at origin so ocr_region can crop it cleanly.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # No upscaling needed, use adaptive PSM
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            # Apply same confidence filter to fallback words
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # If a narrow cell is still empty, OCR the entire row strip with
    # RapidOCR (which handles small text better) and assign words by
    # X-position overlap with this column.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            # Filter to words overlapping this column's X-range
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': used_engine,
    }
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only single-character
    detections. A real content row always has at least one token with 2+ characters.
    """
    if not row.word_count:
        return True
    # Real content ⇔ at least one stripped token of length 2+.
    return not any(len(w.get('text', '').strip()) > 1 for w in row.words)


def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row always extends to top_bound; the last row to bottom_bound.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) spans before any mutation so midpoints are
    # computed from the original geometry.
    spans = [(r.y, r.y + r.height) for r in rows]
    last = n - 1

    for i, row in enumerate(rows):
        top, bottom = spans[i]

        # New top: midpoint of the gap to the previous row (or top_bound).
        if i == 0:
            healed_top = top_bound
        else:
            prev_bottom = spans[i - 1][1]
            delta = top - prev_bottom
            healed_top = (prev_bottom + delta // 2) if delta > 1 else top

        # New bottom: midpoint of the gap to the next row (or bottom_bound).
        if i == last:
            healed_bottom = bottom_bound
        else:
            next_top = spans[i + 1][0]
            delta = next_top - bottom
            healed_bottom = (bottom + delta // 2) if delta > 1 else bottom

        row.y = healed_top
        row.height = max(5, healed_bottom - healed_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except column_ignore)
    is intersected with every content row to produce numbered cells.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # NOTE(review): `RAPIDOCR_AVAILABLE`, `ocr_region` and `_ocr_single_cell`
    # are referenced below but do not appear in this module's visible import
    # block — confirm they are defined/imported elsewhere in cv_cell_grid.py,
    # otherwise these code paths raise NameError at runtime (refactor split).

    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # >0.5% dark pixels means the crop has ink worth re-OCRing.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Drop low-confidence words before assigning them to cells.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # A word belongs to a cell when its vertical center lies within
            # 0.8 × cell height of the cell center (tolerates slight skew).
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
+ """ + # Resolve engine choice (same as build_cell_grid) + use_rapid = False + if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): + engine_name = ocr_engine + elif ocr_engine == "auto": + use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None + engine_name = "rapid" if use_rapid else "tesseract" + elif ocr_engine == "rapid": + if not RAPIDOCR_AVAILABLE: + logger.warning("RapidOCR requested but not available, falling back to Tesseract") + else: + use_rapid = True + engine_name = "rapid" if use_rapid else "tesseract" + else: + engine_name = "tesseract" + + content_rows = [r for r in row_geometries if r.row_type == 'content'] + if not content_rows: + return + + # Filter phantom rows: rows with no Tesseract words assigned are + # inter-line whitespace gaps that would produce garbage OCR. + before = len(content_rows) + content_rows = [r for r in content_rows if r.word_count > 0] + skipped = before - len(content_rows) + if skipped > 0: + logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)") + if not content_rows: + return + + _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} + relevant_cols = [c for c in column_regions if c.type not in _skip_types] + if not relevant_cols: + return + + # Filter artifact rows + heal gaps (same logic as build_cell_grid) + before_art = len(content_rows) + content_rows = [r for r in content_rows if not _is_artifact_row(r)] + artifact_skipped = before_art - len(content_rows) + if artifact_skipped > 0: + logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows") + if not content_rows: + return + _heal_row_gaps( + content_rows, + top_bound=min(c.y for c in relevant_cols), + bottom_bound=max(c.y + c.height for c in relevant_cols), + ) + + relevant_cols.sort(key=lambda c: c.x) + + columns_meta = [ + { + 'index': col_idx, + 'type': col.type, + 'x': col.x, + 'width': col.width, + } + for col_idx, col in 
enumerate(relevant_cols) + ] + + lang_map = { + 'column_en': 'eng', + 'column_de': 'deu', + 'column_example': 'eng+deu', + } + + total_cells = len(content_rows) * len(relevant_cols) + + for row_idx, row in enumerate(content_rows): + # Pre-assign each word to exactly one column (nearest center) + col_words = _assign_row_words_to_columns(row, relevant_cols) + for col_idx, col in enumerate(relevant_cols): + cell = _ocr_single_cell( + row_idx, col_idx, row, col, + ocr_img, img_bgr, img_w, img_h, + use_rapid, engine_name, lang, lang_map, + preassigned_words=col_words[col_idx], + ) + yield cell, columns_meta, total_cells + + +def _cells_to_vocab_entries( + cells: List[Dict[str, Any]], + columns_meta: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Map generic cells to vocab entries with english/german/example fields. + + Groups cells by row_index, maps col_type → field name, and produces + one entry per row (only rows with at least one non-empty field). + """ + # Determine image dimensions from first cell (for row-level bbox) + col_type_to_field = { + 'column_en': 'english', + 'column_de': 'german', + 'column_example': 'example', + 'page_ref': 'source_page', + 'column_marker': 'marker', + } + bbox_key_map = { + 'column_en': 'bbox_en', + 'column_de': 'bbox_de', + 'column_example': 'bbox_ex', + 'page_ref': 'bbox_ref', + 'column_marker': 'bbox_marker', + } + + # Group cells by row_index + rows: Dict[int, List[Dict]] = {} + for cell in cells: + ri = cell['row_index'] + rows.setdefault(ri, []).append(cell) + + entries: List[Dict[str, Any]] = [] + for row_idx in sorted(rows.keys()): + row_cells = rows[row_idx] + entry: Dict[str, Any] = { + 'row_index': row_idx, + 'english': '', + 'german': '', + 'example': '', + 'source_page': '', + 'marker': '', + 'confidence': 0.0, + 'bbox': None, + 'bbox_en': None, + 'bbox_de': None, + 'bbox_ex': None, + 'bbox_ref': None, + 'bbox_marker': None, + 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '', + } + + 
confidences = [] + for cell in row_cells: + col_type = cell['col_type'] + field = col_type_to_field.get(col_type) + if field: + entry[field] = cell['text'] + bbox_field = bbox_key_map.get(col_type) + if bbox_field: + entry[bbox_field] = cell['bbox_pct'] + if cell['confidence'] > 0: + confidences.append(cell['confidence']) + + # Compute row-level bbox as union of all cell bboxes + all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')] + if all_bboxes: + min_x = min(b['x'] for b in all_bboxes) + min_y = min(b['y'] for b in all_bboxes) + max_x2 = max(b['x'] + b['w'] for b in all_bboxes) + max_y2 = max(b['y'] + b['h'] for b in all_bboxes) + entry['bbox'] = { + 'x': round(min_x, 2), + 'y': round(min_y, 2), + 'w': round(max_x2 - min_x, 2), + 'h': round(max_y2 - min_y, 2), + } + + entry['confidence'] = round( + sum(confidences) / len(confidences), 1 + ) if confidences else 0.0 + + # Only include if at least one mapped field has text + has_content = any( + entry.get(f) + for f in col_type_to_field.values() + ) + if has_content: + entries.append(entry) + + return entries + + +# Regex: line starts with phonetic bracket content only (no real word before it) +_PHONETIC_ONLY_RE = re.compile( + r'''^\s*[\[\('"]*[^\]]*[\])\s]*$''' +) + + +def _is_phonetic_only_text(text: str) -> bool: + """Check if text consists only of phonetic transcription. 
+ + Phonetic-only patterns: + ['mani serva] → True + [dɑːns] → True + ["a:mand] → True + almond ['a:mand] → False (has real word before bracket) + Mandel → False + """ + t = text.strip() + if not t: + return False + # Must contain at least one bracket + if '[' not in t and ']' not in t: + return False + # Remove all bracket content and surrounding punctuation/whitespace + without_brackets = re.sub(r"\[.*?\]", '', t) + without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets) + # If nothing meaningful remains, it's phonetic-only + alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets)) + return len(alpha_remaining) < 2 + + +def _merge_phonetic_continuation_rows( + entries: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Merge rows that contain only phonetic transcription into previous entry. + + In dictionary pages, phonetic transcription sometimes wraps to the next + row. E.g.: + Row 28: EN="it's a money-saver" DE="es spart Kosten" + Row 29: EN="['mani serva]" DE="" + + Row 29 is phonetic-only → merge into row 28's EN field. 
+ """ + if len(entries) < 2: + return entries + + merged: List[Dict[str, Any]] = [] + for entry in entries: + en = (entry.get('english') or '').strip() + de = (entry.get('german') or '').strip() + ex = (entry.get('example') or '').strip() + + # Check if this entry is phonetic-only (EN has only phonetics, DE empty) + if merged and _is_phonetic_only_text(en) and not de: + prev = merged[-1] + prev_en = (prev.get('english') or '').strip() + # Append phonetic to previous entry's EN + if prev_en: + prev['english'] = prev_en + ' ' + en + else: + prev['english'] = en + # If there was an example, append to previous too + if ex: + prev_ex = (prev.get('example') or '').strip() + prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex + logger.debug( + f"Merged phonetic row {entry.get('row_index')} " + f"into previous entry: {prev['english']!r}" + ) + continue + + merged.append(entry) + + return merged + + +def _merge_continuation_rows( + entries: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Merge multi-line vocabulary entries where text wraps to the next row. + + A row is a continuation of the previous entry when: + - EN has text, but DE is empty + - EN starts with a lowercase letter (not a new vocab entry) + - Previous entry's EN does NOT end with a sentence terminator (.!?) + - The continuation text has fewer than 4 words (not an example sentence) + - The row was not already merged as phonetic + + Example: + Row 5: EN="to put up" DE="aufstellen" + Row 6: EN="with sth." DE="" + → Merged: EN="to put up with sth." 
DE="aufstellen" + """ + if len(entries) < 2: + return entries + + merged: List[Dict[str, Any]] = [] + for entry in entries: + en = (entry.get('english') or '').strip() + de = (entry.get('german') or '').strip() + + if merged and en and not de: + # Check: not phonetic (already handled) + if _is_phonetic_only_text(en): + merged.append(entry) + continue + + # Check: starts with lowercase + first_alpha = next((c for c in en if c.isalpha()), '') + starts_lower = first_alpha and first_alpha.islower() + + # Check: fewer than 4 words (not an example sentence) + word_count = len(en.split()) + is_short = word_count < 4 + + # Check: previous entry doesn't end with sentence terminator + prev = merged[-1] + prev_en = (prev.get('english') or '').strip() + prev_ends_sentence = prev_en and prev_en[-1] in '.!?' + + if starts_lower and is_short and not prev_ends_sentence: + # Merge into previous entry + prev['english'] = (prev_en + ' ' + en).strip() + # Merge example if present + ex = (entry.get('example') or '').strip() + if ex: + prev_ex = (prev.get('example') or '').strip() + prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex + logger.debug( + f"Merged continuation row {entry.get('row_index')} " + f"into previous entry: {prev['english']!r}" + ) + continue + + merged.append(entry) + + return merged + + +def build_word_grid( + ocr_img: np.ndarray, + column_regions: List[PageRegion], + row_geometries: List[RowGeometry], + img_w: int, + img_h: int, + lang: str = "eng+deu", + ocr_engine: str = "auto", + img_bgr: Optional[np.ndarray] = None, + pronunciation: str = "british", +) -> List[Dict[str, Any]]: + """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing. + + Wrapper around build_cell_grid() that adds vocabulary-specific logic: + - Maps cells to english/german/example entries + - Applies character confusion fixes, IPA lookup, comma splitting, etc. + - Falls back to returning raw cells if no vocab columns detected. 
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    # Vocab mapping: cells → entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
    # llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE → examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    # NOTE(review): this log line prints len(entries) twice ("X entries from
    # N raw → X after post-processing"); the leading count was presumably
    # meant to be the raw count — consider rewording.
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw → {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries

diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
new file mode 100644
index 0000000..47713a1
--- /dev/null
+++ b/klausur-service/backend/cv_layout.py
@@ -0,0 +1,3036 @@
"""
Document type detection, layout analysis, column/row geometry, and classification.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import (
    ColumnGeometry,
    DocumentTypeResult,
    ENGLISH_FUNCTION_WORDS,
    GERMAN_FUNCTION_WORDS,
    PageRegion,
    RowGeometry,
)

logger = logging.getLogger(__name__)

# cv2 / pytesseract are optional at import time; callers must handle None.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
+ """ + if ocr_img is None or ocr_img.size == 0: + return DocumentTypeResult( + doc_type='full_text', confidence=0.5, pipeline='full_page', + skip_steps=['columns', 'rows'], + features={'error': 'empty image'}, + ) + + h, w = ocr_img.shape[:2] + + # --- 1. Vertical projection profile → detect column gaps --- + # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile. + # Invert: dark pixels on white background → high values = text. + vert_proj = np.sum(ocr_img < 128, axis=0).astype(float) + + # Smooth the profile to avoid noise spikes + kernel_size = max(3, w // 100) + if kernel_size % 2 == 0: + kernel_size += 1 + vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same') + + # Find significant vertical gaps (columns of near-zero text density) + # A gap must be at least 1% of image width and have < 5% of max density + max_density = max(vert_smooth.max(), 1) + gap_threshold = max_density * 0.05 + min_gap_width = max(5, w // 100) + + in_gap = False + gap_count = 0 + gap_start = 0 + vert_gaps = [] + + for x in range(w): + if vert_smooth[x] < gap_threshold: + if not in_gap: + in_gap = True + gap_start = x + else: + if in_gap: + gap_width = x - gap_start + if gap_width >= min_gap_width: + gap_count += 1 + vert_gaps.append((gap_start, x, gap_width)) + in_gap = False + + # Filter out margin gaps (within 10% of image edges) + margin_threshold = w * 0.10 + internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold] + internal_gap_count = len(internal_gaps) + + # --- 2. 
Horizontal projection profile → detect row gaps --- + horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float) + h_kernel = max(3, h // 200) + if h_kernel % 2 == 0: + h_kernel += 1 + horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same') + + h_max = max(horiz_smooth.max(), 1) + h_gap_threshold = h_max * 0.05 + min_row_gap = max(3, h // 200) + + row_gap_count = 0 + in_gap = False + for y in range(h): + if horiz_smooth[y] < h_gap_threshold: + if not in_gap: + in_gap = True + gap_start = y + else: + if in_gap: + if y - gap_start >= min_row_gap: + row_gap_count += 1 + in_gap = False + + # --- 3. Text density distribution (4×4 grid) --- + grid_rows, grid_cols = 4, 4 + cell_h, cell_w = h // grid_rows, w // grid_cols + densities = [] + for gr in range(grid_rows): + for gc in range(grid_cols): + cell = ocr_img[gr * cell_h:(gr + 1) * cell_h, + gc * cell_w:(gc + 1) * cell_w] + if cell.size > 0: + d = float(np.count_nonzero(cell < 128)) / cell.size + densities.append(d) + + density_std = float(np.std(densities)) if densities else 0 + density_mean = float(np.mean(densities)) if densities else 0 + + features = { + 'vertical_gaps': gap_count, + 'internal_vertical_gaps': internal_gap_count, + 'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]], + 'row_gaps': row_gap_count, + 'density_mean': round(density_mean, 4), + 'density_std': round(density_std, 4), + 'image_size': (w, h), + } + + # --- 4. Decision tree --- + # Use internal_gap_count (excludes margin gaps) for column detection. 
+ if internal_gap_count >= 2 and row_gap_count >= 5: + # Multiple internal vertical gaps + many row gaps → table + confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005) + return DocumentTypeResult( + doc_type='vocab_table', + confidence=round(confidence, 2), + pipeline='cell_first', + skip_steps=[], + features=features, + ) + elif internal_gap_count >= 1 and row_gap_count >= 3: + # Some internal structure, likely a table + confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01) + return DocumentTypeResult( + doc_type='generic_table', + confidence=round(confidence, 2), + pipeline='cell_first', + skip_steps=[], + features=features, + ) + elif internal_gap_count == 0: + # No internal column gaps → full text (regardless of density) + confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15) + return DocumentTypeResult( + doc_type='full_text', + confidence=round(confidence, 2), + pipeline='full_page', + skip_steps=['columns', 'rows'], + features=features, + ) + else: + # Ambiguous — default to vocab_table (most common use case) + return DocumentTypeResult( + doc_type='vocab_table', + confidence=0.5, + pipeline='cell_first', + skip_steps=[], + features=features, + ) + + +# ============================================================================= +# Stage 4: Dual Image Preparation +# ============================================================================= + +def create_ocr_image(img: np.ndarray) -> np.ndarray: + """Create a binarized image optimized for Tesseract OCR. + + Steps: Grayscale → Background normalization → Adaptive threshold → Denoise. + + Args: + img: BGR image. + + Returns: + Binary image (white text on black background inverted to black on white). 
+ """ + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + # Background normalization: divide by blurred version + bg = cv2.GaussianBlur(gray, (51, 51), 0) + normalized = cv2.divide(gray, bg, scale=255) + + # Adaptive binarization + binary = cv2.adaptiveThreshold( + normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, 31, 10 + ) + + # Light denoise + denoised = cv2.medianBlur(binary, 3) + + return denoised + + +def create_layout_image(img: np.ndarray) -> np.ndarray: + """Create a CLAHE-enhanced grayscale image for layout analysis. + + Args: + img: BGR image. + + Returns: + Enhanced grayscale image. + """ + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) + enhanced = clahe.apply(gray) + return enhanced + + +# ============================================================================= +# Stage 5: Layout Analysis (Projection Profiles) +# ============================================================================= + +def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray: + """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask.""" + out = mask.copy() + n = len(out) + i = 0 + while i < n: + if out[i]: + start = i + while i < n and out[i]: + i += 1 + if (i - start) < min_width: + out[start:i] = False + else: + i += 1 + return out + + +def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]: + """Find the bounding box of actual text content (excluding page margins). + + Scan artefacts (thin black lines at page edges) are filtered out by + discarding contiguous projection runs narrower than 1 % of the image + dimension (min 5 px). + + Returns: + Tuple of (left_x, right_x, top_y, bottom_y). 
+ """ + h, w = inv.shape[:2] + threshold = 0.005 + + # --- Horizontal projection for top/bottom --- + h_proj = np.sum(inv, axis=1).astype(float) / (w * 255) + h_mask = h_proj > threshold + min_h_run = max(5, h // 100) + h_mask = _filter_narrow_runs(h_mask, min_h_run) + + top_y = 0 + for y in range(h): + if h_mask[y]: + top_y = max(0, y - 5) + break + + bottom_y = h + for y in range(h - 1, 0, -1): + if h_mask[y]: + bottom_y = min(h, y + 5) + break + + # --- Vertical projection for left/right margins --- + v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float) + v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj + v_mask = v_proj_norm > threshold + min_v_run = max(5, w // 100) + v_mask = _filter_narrow_runs(v_mask, min_v_run) + + left_x = 0 + for x in range(w): + if v_mask[x]: + left_x = max(0, x - 2) + break + + right_x = w + for x in range(w - 1, 0, -1): + if v_mask[x]: + right_x = min(w, x + 2) + break + + return left_x, right_x, top_y, bottom_y + + +def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]: + """Detect columns, header, and footer using projection profiles. + + Uses content-bounds detection to exclude page margins before searching + for column separators within the actual text area. + + Args: + layout_img: CLAHE-enhanced grayscale image. + ocr_img: Binarized image for text density analysis. + + Returns: + List of PageRegion objects describing detected regions. 
+ """ + h, w = ocr_img.shape[:2] + + # Invert: black text on white → white text on black for projection + inv = cv2.bitwise_not(ocr_img) + + # --- Find actual content bounds (exclude page margins) --- + left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) + content_w = right_x - left_x + content_h = bottom_y - top_y + + logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), " + f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image") + + if content_w < w * 0.3 or content_h < h * 0.3: + # Fallback if detection seems wrong + left_x, right_x = 0, w + top_y, bottom_y = 0, h + content_w, content_h = w, h + + # --- Vertical projection within content area to find column separators --- + content_strip = inv[top_y:bottom_y, left_x:right_x] + v_proj = np.sum(content_strip, axis=0).astype(float) + v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj + + # Smooth the projection profile + kernel_size = max(5, content_w // 50) + if kernel_size % 2 == 0: + kernel_size += 1 + v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') + + # Debug: log projection profile statistics + p_mean = float(np.mean(v_proj_smooth)) + p_median = float(np.median(v_proj_smooth)) + p_min = float(np.min(v_proj_smooth)) + p_max = float(np.max(v_proj_smooth)) + logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, " + f"mean={p_mean:.4f}, median={p_median:.4f}") + + # Find valleys using multiple threshold strategies + # Strategy 1: relative to median (catches clear separators) + # Strategy 2: local minima approach (catches subtle gaps) + threshold = max(p_median * 0.3, p_mean * 0.2) + logger.info(f"Layout: valley threshold={threshold:.4f}") + + in_valley = v_proj_smooth < threshold + + # Find contiguous valley regions + all_valleys = [] + start = None + for x in range(len(v_proj_smooth)): + if in_valley[x] and start is None: + start = x + elif not in_valley[x] and start is not None: + valley_width = x 
- start + valley_depth = float(np.min(v_proj_smooth[start:x])) + # Valley must be at least 3px wide + if valley_width >= 3: + all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth)) + start = None + + logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — " + f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}") + + # Filter: valleys must be inside the content area (not at edges) + inner_margin = int(content_w * 0.08) + valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin] + + # If no valleys found with strict threshold, try local minima approach + if len(valleys) < 2: + logger.info("Layout: trying local minima approach for column detection") + # Divide content into 20 segments, find the 2 lowest + seg_count = 20 + seg_width = content_w // seg_count + seg_scores = [] + for i in range(seg_count): + sx = i * seg_width + ex = min((i + 1) * seg_width, content_w) + seg_mean = float(np.mean(v_proj_smooth[sx:ex])) + seg_scores.append((i, sx, ex, seg_mean)) + + seg_scores.sort(key=lambda s: s[3]) + logger.info(f"Layout: segment scores (lowest 5): " + f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}") + + # Find two lowest non-adjacent segments that create reasonable columns + candidate_valleys = [] + for seg_idx, sx, ex, seg_mean in seg_scores: + # Must not be at the edges + if seg_idx <= 1 or seg_idx >= seg_count - 2: + continue + # Must be significantly lower than overall mean + if seg_mean < p_mean * 0.6: + center = (sx + ex) // 2 + candidate_valleys.append((sx, ex, center, ex - sx, seg_mean)) + + if len(candidate_valleys) >= 2: + # Pick the best pair: non-adjacent, creating reasonable column widths + candidate_valleys.sort(key=lambda v: v[2]) + best_pair = None + best_score = float('inf') + for i in range(len(candidate_valleys)): + for j in range(i + 1, len(candidate_valleys)): + c1 = candidate_valleys[i][2] + c2 = candidate_valleys[j][2] + # 
Must be at least 20% apart + if (c2 - c1) < content_w * 0.2: + continue + col1 = c1 + col2 = c2 - c1 + col3 = content_w - c2 + # Each column at least 15% + if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12: + continue + parts = sorted([col1, col2, col3]) + score = parts[2] - parts[0] + if score < best_score: + best_score = score + best_pair = (candidate_valleys[i], candidate_valleys[j]) + + if best_pair: + valleys = list(best_pair) + logger.info(f"Layout: local minima found 2 valleys: " + f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") + + logger.info(f"Layout: final {len(valleys)} valleys: " + f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") + + regions = [] + + if len(valleys) >= 2: + # 3-column layout detected + valleys.sort(key=lambda v: v[2]) + + if len(valleys) == 2: + sep1_center = valleys[0][2] + sep2_center = valleys[1][2] + else: + # Pick the two valleys that best divide into 3 parts + # Prefer wider valleys (more likely true separators) + best_pair = None + best_score = float('inf') + for i in range(len(valleys)): + for j in range(i + 1, len(valleys)): + c1, c2 = valleys[i][2], valleys[j][2] + # Each column should be at least 15% of content width + col1 = c1 + col2 = c2 - c1 + col3 = content_w - c2 + if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15: + continue + # Score: lower is better (more even distribution) + parts = sorted([col1, col2, col3]) + score = parts[2] - parts[0] + # Bonus for wider valleys (subtract valley width) + score -= (valleys[i][3] + valleys[j][3]) * 0.5 + if score < best_score: + best_score = score + best_pair = (c1, c2) + if best_pair: + sep1_center, sep2_center = best_pair + else: + sep1_center = valleys[0][2] + sep2_center = valleys[1][2] + + # Convert from content-relative to absolute coordinates + abs_sep1 = sep1_center + left_x + abs_sep2 = sep2_center + left_x + + logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} " + 
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Emit three regions: EN word column, DE translation column, example column.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    # Summary logging only — which region types made it into the result.
    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions


# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---

# NOTE(review): ColumnGeometry is referenced below (annotations and constructors)
# but is not among this module's visible imports — only PageRegion and
# RowGeometry are imported from cv_vocab_types. Confirm ColumnGeometry is
# imported or defined earlier in this file, otherwise the module fails at
# import time when this def is evaluated.
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    """
    # Cluster tolerance: word left-edges within this many pixels belong to the
    # same alignment cluster.
    tolerance = max(10, int(content_w * 0.01))
    # NOTE(review): indexes sorted_pairs[0] below — assumes left_edges is
    # non-empty (the visible caller only calls this after detecting >= 5 words).
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Single left-to-right sweep: group consecutive edges within `tolerance`.
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    # A cluster qualifies as "primary" if its words span >= 30% of the content
    # height; "secondary" needs only 15% coverage but at least 5 words.
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    # id()-based set: cluster dicts are not hashable, so membership is tracked
    # by object identity.
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Merge clusters closer than ~6% of the content width into one column start.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            # Fold s into the previous cluster: count-weighted mean position,
            # union of edge extents.
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Column starts: cluster min_edge converted to absolute x, padded left by a
    # small margin so the first characters are not clipped.
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )


def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column total is treated as the true column
    start. Any words to the left of that bin form a sub-column, provided they
    number >= 2 and < 35 % of total.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words, excluding header/footer
        # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
        min_top_rel = (header_y - top_y) if header_y is not None else None
        max_top_rel = (footer_y - top_y) if footer_y is not None else None

        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30
                     and (min_top_rel is None or w['top'] >= min_top_rel)
                     and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        sorted_edges = sorted(w['left'] for w in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for i in range(1, len(sorted_edges)):
            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
                cur.append(sorted_edges[i])
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [sorted_edges[i]]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        total = len(confident)
        min_count = max(3, int(total * _min_col_start_ratio))
        col_start_bin = None
        for b in bins:
            if b[1] >= min_count:
                col_start_bin = b
                break

        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body = [w for w in sub_words
                    if (min_top_rel is None or w['top'] >= min_top_rel)
                    and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # Convert the split position from relative to absolute coordinates.
        # Split midway between the rightmost sub-column word and the main
        # column's start bin.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_x = geo.x
        sub_width = split_abs - geo.x
        main_x = split_abs
        main_width = (geo.x + geo.width) - split_abs

        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )

        result.append(sub_geo)
        result.append(main_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
    words inside each broad column, finds the largest horizontal gap, and splits
    the column at that gap.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns).
    """
    result: List[ColumnGeometry] = []

    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")

    for geo in geometries:
        # Narrow or sparsely populated columns pass through untouched.
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Build word-coverage array (per pixel within column)
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)

        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid noise
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1]
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Find INTERNAL gaps where coverage < 0.5
        # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
        low_mask = coverage < 0.5
        all_gaps = []
        _gs = None
        for px in range(len(low_mask)):
            if low_mask[px]:
                if _gs is None:
                    _gs = px
            else:
                if _gs is not None:
                    all_gaps.append((_gs, px, px - _gs))
                    _gs = None
        if _gs is not None:
            # Gap running to the right edge of the column.
            all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))

        # Filter: only internal gaps (not touching column edges)
        _edge_margin = 10  # pixels from edge to ignore
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
                    f"{[g for g in all_gaps if g[2] >= 5]}, "
                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
                    f"best={best_gap}")

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Split words by midpoint relative to gap
        left_words = []
        right_words = []
        for wd in geo.words:
            wl = wd['left'] - col_left_rel
            mid = wl + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.

    Each column spans from its start to the next column's start (the last one
    runs to *right_x*). Words are re-assigned to columns by their content-ROI
    relative left edge.
    """
    geometries = []
    for i, (start_x, count) in enumerate(col_starts):
        if i + 1 < len(col_starts):
            col_width = col_starts[i + 1][0] - start_x
        else:
            col_width = right_x - start_x

        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Sanity fallback: if detected content covers < 30% of either dimension,
    # the bounds are likely wrong — use the full image instead.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    # NOTE(review): pytesseract is not among this module's visible imports —
    # confirm it is imported earlier in the file.
    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident, non-empty words. Coordinates are relative to the
    # content ROI (i.e. offset by left_x / top_y from the full image).
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        # Tesseract reports conf as str or int; non-numeric values become -1.
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    # Boundaries alternate: [seg_start, gap_start, gap_end, ..., content_h],
    # so consecutive index pairs (0,1), (2,3), ... delimit content segments.
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        n_words_in_col = sum(1 for w in word_dicts
                             if col_left_rel <= w['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [w for w in word_dicts
                       if col_left_rel <= w['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. This expands them toward
    the neighbouring column, but never past 40% of the gap or past the
    nearest word in the neighbour.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.
+ """ + _NARROW_THRESHOLD_PCT = 10.0 + _MIN_WORD_MARGIN = 4 + + if len(geometries) < 2: + return geometries + + logger.info("ExpandNarrowCols: input %d cols: %s", + len(geometries), + [(i, g.x, g.width, round(g.width / content_w * 100, 1)) + for i, g in enumerate(geometries)]) + + for i, g in enumerate(geometries): + col_pct = g.width / content_w * 100 if content_w > 0 else 100 + if col_pct >= _NARROW_THRESHOLD_PCT: + continue + + expanded = False + orig_pct = col_pct + + # --- try expanding to the LEFT --- + if i > 0: + left_nb = geometries[i - 1] + # Gap can be 0 if sub-column split created adjacent columns. + # In that case, look at where the neighbor's rightmost words + # actually are — there may be unused space we can claim. + nb_words_right = [wd['left'] + wd.get('width', 0) + for wd in left_nb.words] + if nb_words_right: + rightmost_word_abs = left_x + max(nb_words_right) + safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN + else: + # No words in neighbor → we can take up to neighbor's start + safe_left_abs = left_nb.x + _MIN_WORD_MARGIN + + if safe_left_abs < g.x: + g.width += (g.x - safe_left_abs) + g.x = safe_left_abs + expanded = True + + # --- try expanding to the RIGHT --- + if i + 1 < len(geometries): + right_nb = geometries[i + 1] + nb_words_left = [wd['left'] for wd in right_nb.words] + if nb_words_left: + leftmost_word_abs = left_x + min(nb_words_left) + safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN + else: + safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN + + cur_right = g.x + g.width + if safe_right_abs > cur_right: + g.width = safe_right_abs - g.x + expanded = True + + if expanded: + col_left_rel = g.x - left_x + col_right_rel = col_left_rel + g.width + g.words = [wd for wd in word_dicts + if col_left_rel <= wd['left'] < col_right_rel] + g.word_count = len(g.words) + g.width_ratio = g.width / content_w if content_w > 0 else 0.0 + logger.info( + "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d", + i, 
orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count) + + # --- Shrink overlapping neighbors to match new boundaries --- + # Left neighbor: its right edge must not exceed our new left edge + if i > 0: + left_nb = geometries[i - 1] + nb_right = left_nb.x + left_nb.width + if nb_right > g.x: + left_nb.width = g.x - left_nb.x + if left_nb.width < 0: + left_nb.width = 0 + left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0 + # Re-assign words + nb_left_rel = left_nb.x - left_x + nb_right_rel = nb_left_rel + left_nb.width + left_nb.words = [wd for wd in word_dicts + if nb_left_rel <= wd['left'] < nb_right_rel] + left_nb.word_count = len(left_nb.words) + + # Right neighbor: its left edge must not be before our new right edge + if i + 1 < len(geometries): + right_nb = geometries[i + 1] + my_right = g.x + g.width + if right_nb.x < my_right: + old_right_edge = right_nb.x + right_nb.width + right_nb.x = my_right + right_nb.width = old_right_edge - right_nb.x + if right_nb.width < 0: + right_nb.width = 0 + right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0 + # Re-assign words + nb_left_rel = right_nb.x - left_x + nb_right_rel = nb_left_rel + right_nb.width + right_nb.words = [wd for wd in word_dicts + if nb_left_rel <= wd['left'] < nb_right_rel] + right_nb.word_count = len(right_nb.words) + + return geometries + + +# ============================================================================= +# Row Geometry Detection (horizontal whitespace-gap analysis) +# ============================================================================= + +def detect_row_geometry( + inv: np.ndarray, + word_dicts: List[Dict], + left_x: int, right_x: int, + top_y: int, bottom_y: int, +) -> List['RowGeometry']: + """Detect row geometry using horizontal whitespace-gap analysis. + + Mirrors the vertical gap approach used for columns, but operates on + horizontal projection profiles to find gaps between text lines. 
    Also classifies header/footer rows based on gap size.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    # Per-scanline ink sum, normalized to [0, 1] by the max possible ink.
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # box kernel must be odd for a symmetric smooth
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Gap threshold is relative to the typical (median) non-zero ink density.
    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            # Keep the word-free part of the gap (above or below the words),
            # if it is still tall enough to count as a gap.
            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row (a word belongs to the row containing
        # its vertical center).
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    1. Group words into line clusters (by Y proximity).
    2. For each cluster compute center_y (median of word vertical centers)
       and letter_height (median of word heights).
    3. Compute the pitch (distance between consecutive centers).
    4. Detect section breaks where the gap is >1.8× the median pitch
       (headings, sub-headings, paragraph breaks).
    5. Within each section, use the local pitch to place row boundaries
       at the midpoints between consecutive centers.
    6. Validate that ≥85% of words land in a grid row; otherwise fall back.

    Header/footer rows from the gap-based detection are preserved.
+ """ + content_rows = [r for r in rows if r.row_type == 'content'] + non_content = [r for r in rows if r.row_type != 'content'] + + if len(content_rows) < 5: + return rows + + # --- Step A: Group ALL words into line clusters --- + # Collect words that belong to content rows (deduplicated) + content_words: List[Dict] = [] + seen_keys: set = set() + for r in content_rows: + for w in r.words: + key = (w['left'], w['top'], w['width'], w['height']) + if key not in seen_keys: + seen_keys.add(key) + content_words.append(w) + + if len(content_words) < 5: + return rows + + # Compute median word height (excluding outliers like tall brackets/IPA) + word_heights = sorted(w['height'] for w in content_words) + median_wh = word_heights[len(word_heights) // 2] + + # Compute median gap-based row height — this is the actual line height + # as detected by the horizontal projection. We use 40% of this as + # grouping tolerance. This is much more reliable than using word height + # alone, because words on the same line can have very different heights + # (e.g. lowercase vs uppercase, brackets, phonetic symbols). + gap_row_heights = sorted(r.height for r in content_rows) + median_row_h = gap_row_heights[len(gap_row_heights) // 2] + + # Tolerance: 40% of row height. Words on the same line should have + # centers within this range. Even if a word's bbox is taller/shorter, + # its center should stay within half a row height of the line center. 
+ y_tol = max(10, int(median_row_h * 0.4)) + + # Sort by center_y, then group by proximity + words_by_center = sorted(content_words, + key=lambda w: (w['top'] + w['height'] / 2, w['left'])) + line_clusters: List[List[Dict]] = [] + current_line: List[Dict] = [words_by_center[0]] + current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2 + + for w in words_by_center[1:]: + w_center = w['top'] + w['height'] / 2 + if abs(w_center - current_center) <= y_tol: + current_line.append(w) + else: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) + current_line = [w] + current_center = w_center + + if current_line: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) + + if len(line_clusters) < 3: + return rows + + # --- Step B: Compute center_y per cluster --- + # center_y = median of (word_top + word_height/2) across all words in cluster + # letter_h = median of word heights, but excluding outlier-height words + # (>2× median) so that tall brackets/IPA don't skew the height + cluster_info: List[Dict] = [] + for cl_words in line_clusters: + centers = [w['top'] + w['height'] / 2 for w in cl_words] + # Filter outlier heights for letter_h computation + normal_heights = [w['height'] for w in cl_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in cl_words] + center_y = float(np.median(centers)) + letter_h = float(np.median(normal_heights)) + cluster_info.append({ + 'center_y_rel': center_y, # relative to content ROI + 'center_y_abs': center_y + top_y, # absolute + 'letter_h': letter_h, + 'words': cl_words, + }) + + cluster_info.sort(key=lambda c: c['center_y_rel']) + + # --- Step B2: Merge clusters that are too close together --- + # Even with center-based grouping, some edge cases can produce + # spurious clusters. Merge any pair whose centers are closer + # than 30% of the row height (they're definitely the same text line). 
+ merge_threshold = max(8, median_row_h * 0.3) + merged: List[Dict] = [cluster_info[0]] + for cl in cluster_info[1:]: + prev = merged[-1] + if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold: + # Merge: combine words, recompute center + combined_words = prev['words'] + cl['words'] + centers = [w['top'] + w['height'] / 2 for w in combined_words] + normal_heights = [w['height'] for w in combined_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in combined_words] + prev['center_y_rel'] = float(np.median(centers)) + prev['center_y_abs'] = prev['center_y_rel'] + top_y + prev['letter_h'] = float(np.median(normal_heights)) + prev['words'] = combined_words + else: + merged.append(cl) + + cluster_info = merged + + if len(cluster_info) < 3: + return rows + + # --- Step C: Compute pitches and detect section breaks --- + pitches: List[float] = [] + for i in range(1, len(cluster_info)): + pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] + pitches.append(pitch) + + if not pitches: + return rows + + median_pitch = float(np.median(pitches)) + if median_pitch <= 5: + return rows + + # A section break is where the gap between line centers is much larger + # than the normal pitch (sub-headings, section titles, etc.) 
+ BREAK_FACTOR = 1.8 + + # --- Step D: Build sections (groups of consecutive lines with normal spacing) --- + sections: List[List[Dict]] = [] + current_section: List[Dict] = [cluster_info[0]] + + for i in range(1, len(cluster_info)): + gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] + if gap > median_pitch * BREAK_FACTOR: + sections.append(current_section) + current_section = [cluster_info[i]] + else: + current_section.append(cluster_info[i]) + + if current_section: + sections.append(current_section) + + # --- Step E: Build row boundaries per section --- + grid_rows: List[RowGeometry] = [] + + for section in sections: + if not section: + continue + + if len(section) == 1: + # Single-line section (likely a heading) + cl = section[0] + half_h = max(cl['letter_h'], median_pitch * 0.4) + row_top = cl['center_y_abs'] - half_h + row_bot = cl['center_y_abs'] + half_h + grid_rows.append(RowGeometry( + index=0, + x=left_x, + y=round(row_top), + width=content_w, + height=round(row_bot - row_top), + word_count=len(cl['words']), + words=cl['words'], + row_type='content', + gap_before=0, + )) + continue + + # Compute local pitch for this section + local_pitches = [] + for i in range(1, len(section)): + local_pitches.append( + section[i]['center_y_rel'] - section[i - 1]['center_y_rel'] + ) + local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch + + # Row boundaries are placed at midpoints between consecutive centers. 
+ # First row: top = center - local_pitch/2 + # Last row: bottom = center + local_pitch/2 + for i, cl in enumerate(section): + if i == 0: + row_top = cl['center_y_abs'] - local_pitch / 2 + else: + # Midpoint between this center and previous center + prev_center = section[i - 1]['center_y_abs'] + row_top = (prev_center + cl['center_y_abs']) / 2 + + if i == len(section) - 1: + row_bot = cl['center_y_abs'] + local_pitch / 2 + else: + next_center = section[i + 1]['center_y_abs'] + row_bot = (cl['center_y_abs'] + next_center) / 2 + + # Clamp to reasonable bounds + row_top = max(top_y, row_top) + row_bot = min(top_y + content_h, row_bot) + + if row_bot - row_top < 5: + continue + + grid_rows.append(RowGeometry( + index=0, + x=left_x, + y=round(row_top), + width=content_w, + height=round(row_bot - row_top), + word_count=len(cl['words']), + words=cl['words'], + row_type='content', + gap_before=0, + )) + + if not grid_rows: + return rows + + # --- Step F: Re-assign words to grid rows --- + # Words may have shifted slightly; assign each word to the row whose + # center is closest to the word's vertical center. 
+ for gr in grid_rows: + gr.words = [] + + for w in content_words: + w_center = w['top'] + top_y + w['height'] / 2 + best_row = None + best_dist = float('inf') + for gr in grid_rows: + row_center = gr.y + gr.height / 2 + dist = abs(w_center - row_center) + if dist < best_dist: + best_dist = dist + best_row = gr + if best_row is not None and best_dist < median_pitch: + best_row.words.append(w) + + for gr in grid_rows: + gr.word_count = len(gr.words) + + # --- Step G: Validate --- + words_placed = sum(gr.word_count for gr in grid_rows) + if len(content_words) > 0: + match_ratio = words_placed / len(content_words) + if match_ratio < 0.85: + logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} " + f"of words, keeping gap-based rows") + return rows + + # Remove empty grid rows (no words assigned) + grid_rows = [gr for gr in grid_rows if gr.word_count > 0] + + # --- Step H: Merge header/footer + re-index --- + result = list(non_content) + grid_rows + result.sort(key=lambda r: r.y) + for i, r in enumerate(result): + r.index = i + + row_heights = [gr.height for gr in grid_rows] + min_h = min(row_heights) if row_heights else 0 + max_h = max(row_heights) if row_heights else 0 + logger.info(f"RowGrid: word-center grid applied " + f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, " + f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " + f"{len(sections)} sections, " + f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], " + f"was {len(content_rows)} gap-based rows)") + + return result + + +def _build_rows_from_word_grouping( + word_dicts: List[Dict], + left_x: int, right_x: int, + top_y: int, bottom_y: int, + content_w: int, content_h: int, +) -> List['RowGeometry']: + """Fallback: build rows by grouping words by Y position. + + Uses _group_words_into_lines() with a generous tolerance. + No header/footer detection in fallback mode. 
+ """ + if not word_dicts: + return [] + + y_tolerance = max(20, content_h // 100) + lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance) + + rows = [] + for idx, line_words in enumerate(lines): + if not line_words: + continue + min_top = min(w['top'] for w in line_words) + max_bottom = max(w['top'] + w['height'] for w in line_words) + row_height = max_bottom - min_top + + rows.append(RowGeometry( + index=idx, + x=left_x, + y=top_y + min_top, + width=content_w, + height=row_height, + word_count=len(line_words), + words=line_words, + row_type='content', + gap_before=0, + )) + + logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping") + return rows + + +# --- Phase B: Content-Based Classification --- + +def _score_language(words: List[Dict]) -> Dict[str, float]: + """Score the language of a column's words. + + Analyzes function words, umlauts, and capitalization patterns + to determine whether text is English or German. + + Args: + words: List of word dicts with 'text' and 'conf' keys. + + Returns: + Dict with 'eng' and 'deu' scores (0.0-1.0). 
+ """ + if not words: + return {'eng': 0.0, 'deu': 0.0} + + # Only consider words with decent confidence + good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] + if not good_words: + return {'eng': 0.0, 'deu': 0.0} + + total = len(good_words) + en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) + de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) + + # Check for umlauts (strong German signal) + raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] + umlaut_count = sum(1 for t in raw_texts + for c in t if c in 'äöüÄÖÜß') + + # German capitalization: nouns are capitalized mid-sentence + # Count words that start with uppercase but aren't at position 0 + cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) + + en_score = en_hits / total if total > 0 else 0.0 + de_score = de_hits / total if total > 0 else 0.0 + + # Boost German score for umlauts + if umlaut_count > 0: + de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) + + # Boost German score for high capitalization ratio (typical for German nouns) + if total > 5: + cap_ratio = cap_words / total + if cap_ratio > 0.3: + de_score = min(1.0, de_score + 0.1) + + return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} + + +def _score_role(geom: ColumnGeometry) -> Dict[str, float]: + """Score the role of a column based on its geometry and content patterns. + + Args: + geom: ColumnGeometry with words and dimensions. + + Returns: + Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. 
+ """ + scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} + + if not geom.words: + return scores + + texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] + if not texts: + return scores + + avg_word_len = sum(len(t) for t in texts) / len(texts) + has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) + digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) + digit_ratio = digit_words / len(texts) if texts else 0.0 + + # Reference: narrow + mostly numbers/page references + if geom.width_ratio < 0.12: + scores['reference'] = 0.5 + if digit_ratio > 0.4: + scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) + + # Marker: narrow + few short entries + if geom.width_ratio < 0.06 and geom.word_count <= 15: + scores['marker'] = 0.7 + if avg_word_len < 4: + scores['marker'] = 0.9 + # Very narrow non-edge column → strong marker regardless of word count + if geom.width_ratio < 0.04 and geom.index > 0: + scores['marker'] = max(scores['marker'], 0.9) + + # Sentence: longer words + punctuation present + if geom.width_ratio > 0.15 and has_punctuation > 2: + scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) + if avg_word_len > 4: + scores['sentence'] = min(1.0, scores['sentence'] + 0.2) + + # Vocabulary: medium width + medium word length + if 0.10 < geom.width_ratio < 0.45: + scores['vocabulary'] = 0.4 + if 3 < avg_word_len < 8: + scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) + + return {k: round(v, 3) for k, v in scores.items()} + + +def _build_margin_regions( + all_regions: List[PageRegion], + left_x: int, + right_x: int, + img_w: int, + top_y: int, + content_h: int, +) -> List[PageRegion]: + """Create margin_left / margin_right PageRegions from content bounds. + + Margins represent the space between the image edge and the first/last + content column. They are used downstream for faithful page + reconstruction but are skipped during OCR. 
+ """ + margins: List[PageRegion] = [] + # Minimum gap (px) to create a margin region + _min_gap = 5 + + if left_x > _min_gap: + margins.append(PageRegion( + type='margin_left', x=0, y=top_y, + width=left_x, height=content_h, + classification_confidence=1.0, + classification_method='content_bounds', + )) + + # Right margin: from end of last content column to image edge + non_margin = [r for r in all_regions + if r.type not in ('margin_left', 'margin_right', 'header', 'footer', + 'margin_top', 'margin_bottom')] + if non_margin: + last_col_end = max(r.x + r.width for r in non_margin) + else: + last_col_end = right_x + if img_w - last_col_end > _min_gap: + margins.append(PageRegion( + type='margin_right', x=last_col_end, y=top_y, + width=img_w - last_col_end, height=content_h, + classification_confidence=1.0, + classification_method='content_bounds', + )) + + if margins: + logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} " + f"(left_x={left_x}, right_x={right_x}, img_w={img_w})") + + return margins + + +def positional_column_regions( + geometries: List[ColumnGeometry], + content_w: int, + content_h: int, + left_x: int, +) -> List[PageRegion]: + """Classify columns by position only (no language scoring). + + Structural columns (page_ref, column_marker) are identified by geometry. + Remaining content columns are labelled left→right as column_en, column_de, + column_example. The names are purely positional – no language analysis. 
+ """ + structural: List[PageRegion] = [] + content_cols: List[ColumnGeometry] = [] + + for g in geometries: + rel_x = g.x - left_x + # page_ref: narrow column in the leftmost 20% region + if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: + structural.append(PageRegion( + type='page_ref', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # column_marker: very narrow, few words + elif g.width_ratio < 0.06 and g.word_count <= 15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # empty or near-empty narrow column → treat as margin/structural + elif g.word_count <= 2 and g.width_ratio < 0.15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.85, + classification_method='positional', + )) + else: + content_cols.append(g) + + # Single content column → plain text page + if len(content_cols) == 1: + g = content_cols[0] + return structural + [PageRegion( + type='column_text', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.9, + classification_method='positional', + )] + + # No content columns + if not content_cols: + return structural + + # Sort content columns left→right and assign positional labels + content_cols.sort(key=lambda g: g.x) + + # With exactly 2 content columns: if the left one is very wide (>35%), + # it likely contains EN+DE combined, so the right one is examples. 
+ if (len(content_cols) == 2 + and content_cols[0].width_ratio > 0.35 + and content_cols[1].width_ratio > 0.20): + labels = ['column_en', 'column_example'] + else: + labels = ['column_en', 'column_de', 'column_example'] + + regions = list(structural) + for i, g in enumerate(content_cols): + label = labels[i] if i < len(labels) else 'column_example' + regions.append(PageRegion( + type=label, x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + + logger.info(f"PositionalColumns: {len(structural)} structural, " + f"{len(content_cols)} content → " + f"{[r.type for r in regions]}") + return regions + + +def classify_column_types(geometries: List[ColumnGeometry], + content_w: int, + top_y: int, + img_w: int, + img_h: int, + bottom_y: int, + left_x: int = 0, + right_x: int = 0, + inv: Optional[np.ndarray] = None) -> List[PageRegion]: + """Classify column types using a 3-level fallback chain. + + Level 1: Content-based (language + role scoring) + Level 2: Position + language (old rules enhanced with language detection) + Level 3: Pure position (exact old code, no regression) + + Args: + geometries: List of ColumnGeometry from Phase A. + content_w: Total content width. + top_y: Top Y of content area. + img_w: Full image width. + img_h: Full image height. + bottom_y: Bottom Y of content area. + left_x: Left content bound (from _find_content_bounds). + right_x: Right content bound (from _find_content_bounds). + + Returns: + List of PageRegion with types, confidence, and method. 
+ """ + content_h = bottom_y - top_y + + def _with_margins(result: List[PageRegion]) -> List[PageRegion]: + """Append margin_left / margin_right regions to *result*.""" + margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h) + return result + margins + + # Special case: single column → plain text page + if len(geometries) == 1: + geom = geometries[0] + return _with_margins([PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=geom.height, + classification_confidence=0.9, + classification_method='content', + )]) + + # --- Pre-filter: first/last columns with very few words → column_ignore --- + # Sub-columns from _detect_sub_columns() are exempt: they intentionally + # have few words (page refs, markers) and should not be discarded. + ignore_regions = [] + active_geometries = [] + for idx, g in enumerate(geometries): + if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column: + ignore_regions.append(PageRegion( + type='column_ignore', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='content', + )) + logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)") + else: + active_geometries.append(g) + + # Re-index active geometries for classification + for new_idx, g in enumerate(active_geometries): + g.index = new_idx + geometries = active_geometries + + # Handle edge case: all columns ignored or only 1 left + if len(geometries) == 0: + return _with_margins(ignore_regions) + if len(geometries) == 1: + geom = geometries[0] + ignore_regions.append(PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=geom.height, + classification_confidence=0.9, + classification_method='content', + )) + return _with_margins(ignore_regions) + + # --- Score all columns --- + lang_scores = [_score_language(g.words) for g in geometries] + role_scores = [_score_role(g) 
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.
    Returns None if language signals are too weak (the caller then falls
    through to the position-based Levels 2/3).
    """
    regions = []
    assigned = set()

    # Step 1: Assign structural roles first (reference, marker)
    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column on the right side → marker, not page_ref
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.8,
                classification_method='content',
            ))
            assigned.add(i)

    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]

    if len(remaining) < 2:
        # Not enough columns for EN/DE pair
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions

    # Check if we have enough language signal
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak — fall back to positional: left=EN, right=DE
            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
            en_conf = 0.4
            de_conf = 0.4

            regions.append(PageRegion(
                type='column_en', x=best_en[1].x, y=best_en[1].y,
                width=best_en[1].width, height=content_h,
                classification_confidence=en_conf,
                classification_method='content',
            ))
            assigned.add(best_en[0])

            regions.append(PageRegion(
                type='column_de', x=best_de[1].x, y=best_de[1].y,
                width=best_de[1].width, height=content_h,
                classification_confidence=de_conf,
                classification_method='content',
            ))
            assigned.add(best_de[0])

            # Assign remaining as example
            for i, geom, ls, rs in remaining:
                if i not in assigned:
                    regions.append(PageRegion(
                        type='column_example', x=geom.x, y=geom.y,
                        width=geom.width, height=content_h,
                        classification_confidence=0.4,
                        classification_method='content',
                    ))
            regions.sort(key=lambda r: r.x)
            return regions

    if not en_candidates or not de_candidates:
        # Language signals too weak for content-based classification
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # — it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    # (both branches currently emit 'column_example'; they differ only in
    # the confidence value attached)
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Applies the legacy positional heuristics, but consults the language
    scores when assigning the EN/DE pair and swaps the two columns when
    the scores clearly contradict the positional default (left=EN).
    """
    def make_region(rtype: str, geom, conf: float) -> PageRegion:
        # All Level-2 regions share the same method tag and content height.
        return PageRegion(
            type=rtype, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        )

    regions: List[PageRegion] = []
    pending = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: leftmost narrow column without a strong language signal → page_ref.
    head = geometries[0]
    head_ls = lang_scores[0]
    if (head.width_ratio < 0.12 and head.x < left_20_threshold
            and not (head_ls['eng'] > 0.3 or head_ls['deu'] > 0.3)):
        regions.append(make_region('page_ref', head, 0.8))
        pending.remove(0)

    # Rule 2: narrow columns with few words → marker.
    for idx in pending[:]:
        if geometries[idx].width_ratio < 0.06 and geometries[idx].word_count <= 15:
            regions.append(make_region('column_marker', geometries[idx], 0.7))
            pending.remove(idx)

    # Rule 3: with 3+ columns left, the rightmost is the example column.
    if len(pending) >= 3:
        rightmost = pending.pop()
        regions.append(make_region('column_example', geometries[rightmost], 0.7))

    # Rule 4: first two remaining → EN/DE; swap when language scores disagree.
    if len(pending) >= 2:
        en_idx, de_idx = pending[0], pending[1]
        conf = 0.7
        ls_first, ls_second = lang_scores[en_idx], lang_scores[de_idx]
        if ls_first['deu'] > ls_first['eng'] and ls_second['eng'] > ls_second['deu']:
            en_idx, de_idx = de_idx, en_idx
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")
        regions.append(make_region('column_en', geometries[en_idx], conf))
        regions.append(make_region('column_de', geometries[de_idx], conf))
        pending = pending[2:]
    elif len(pending) == 1:
        regions.append(make_region('column_en', geometries[pending[0]], 0.5))
        pending = []

    # Anything still untyped → example column.
    for idx in pending:
        regions.append(make_region('column_example', geometries[idx], 0.5))

    regions.sort(key=lambda r: r.x)
    return regions
+ """ + regions = [] + untyped = list(range(len(geometries))) + first_x = geometries[0].x if geometries else 0 + left_20_threshold = first_x + content_w * 0.20 + + # Rule 1: Leftmost narrow column → page_ref (only if in left 20%) + g0 = geometries[0] + if g0.width_ratio < 0.12 and g0.x < left_20_threshold: + regions.append(PageRegion( + type='page_ref', x=g0.x, y=g0.y, + width=g0.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(0) + + # Rule 2: Narrow + few words → marker + for i in list(untyped): + geom = geometries[i] + if geom.width_ratio < 0.06 and geom.word_count <= 15: + regions.append(PageRegion( + type='column_marker', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(i) + + # Rule 3: Rightmost remaining → example (if 3+) + if len(untyped) >= 3: + last_idx = untyped[-1] + geom = geometries[last_idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(last_idx) + + # Rule 4: First remaining → EN, second → DE + if len(untyped) >= 2: + en_idx = untyped[0] + de_idx = untyped[1] + regions.append(PageRegion( + type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, + width=geometries[en_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + regions.append(PageRegion( + type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, + width=geometries[de_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped = untyped[2:] + elif len(untyped) == 1: + idx = untyped[0] + geom = geometries[idx] + regions.append(PageRegion( + type='column_en', x=geom.x, y=geom.y, + width=geom.width, 
height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped = [] + + for idx in untyped: + geom = geometries[idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + + regions.sort(key=lambda r: r.x) + return regions + + +def _detect_header_footer_gaps( + inv: np.ndarray, + img_w: int, + img_h: int, +) -> Tuple[Optional[int], Optional[int]]: + """Detect header/footer boundaries via horizontal projection gap analysis. + + Scans the full-page inverted image for large horizontal gaps in the top/bottom + 20% that separate header/footer content from the main body. + + Returns: + (header_y, footer_y) — absolute y-coordinates. + header_y = bottom edge of header region (None if no header detected). + footer_y = top edge of footer region (None if no footer detected). + """ + HEADER_FOOTER_ZONE = 0.20 + GAP_MULTIPLIER = 2.0 + + # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding + actual_h = min(inv.shape[0], img_h) + roi = inv[:actual_h, :] + h_proj = np.sum(roi, axis=1).astype(float) + proj_w = roi.shape[1] + h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj + + # Step 2: Smoothing + kernel_size = max(3, actual_h // 200) + if kernel_size % 2 == 0: + kernel_size += 1 + h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') + + # Step 3: Gap threshold + positive = h_smooth[h_smooth > 0] + median_density = float(np.median(positive)) if len(positive) > 0 else 0.01 + gap_threshold = max(median_density * 0.15, 0.003) + + in_gap = h_smooth < gap_threshold + MIN_GAP_HEIGHT = max(3, actual_h // 500) + + # Step 4: Collect contiguous gaps + raw_gaps: List[Tuple[int, int]] = [] + gap_start: Optional[int] = None + for y in range(len(in_gap)): + if in_gap[y]: + if gap_start is None: + gap_start = y + else: + if gap_start is not 
None: + gap_height = y - gap_start + if gap_height >= MIN_GAP_HEIGHT: + raw_gaps.append((gap_start, y)) + gap_start = None + if gap_start is not None: + gap_height = len(in_gap) - gap_start + if gap_height >= MIN_GAP_HEIGHT: + raw_gaps.append((gap_start, len(in_gap))) + + if not raw_gaps: + return None, None + + # Step 5: Compute median gap size and large-gap threshold + gap_sizes = [g[1] - g[0] for g in raw_gaps] + median_gap = float(np.median(gap_sizes)) + large_gap_threshold = median_gap * GAP_MULTIPLIER + + # Step 6: Find largest qualifying gap in header / footer zones + # A separator gap must have content on BOTH sides — edge-touching gaps + # (e.g. dewarp padding at bottom) are not valid separators. + EDGE_MARGIN = max(5, actual_h // 400) + header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) + footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) + + header_y: Optional[int] = None + footer_y: Optional[int] = None + + best_header_size = 0 + for gs, ge in raw_gaps: + if gs <= EDGE_MARGIN: + continue # skip gaps touching the top edge + gap_mid = (gs + ge) / 2 + gap_size = ge - gs + if gap_mid < header_zone_limit and gap_size > large_gap_threshold: + if gap_size > best_header_size: + best_header_size = gap_size + header_y = ge # bottom edge of gap + + best_footer_size = 0 + for gs, ge in raw_gaps: + if ge >= actual_h - EDGE_MARGIN: + continue # skip gaps touching the bottom edge + gap_mid = (gs + ge) / 2 + gap_size = ge - gs + if gap_mid > footer_zone_start and gap_size > large_gap_threshold: + if gap_size > best_footer_size: + best_footer_size = gap_size + footer_y = gs # top edge of gap + + if header_y is not None: + logger.info(f"HeaderFooterGaps: header boundary at y={header_y} " + f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)") + if footer_y is not None: + logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} " + f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)") + + return header_y, footer_y + + +def 
_region_has_content(inv: np.ndarray, y_start: int, y_end: int, + min_density: float = 0.005) -> bool: + """Check whether a horizontal strip contains meaningful ink. + + Args: + inv: Inverted binarized image (white-on-black). + y_start: Top of the region (inclusive). + y_end: Bottom of the region (exclusive). + min_density: Fraction of white pixels required to count as content. + + Returns: + True if the region contains text/graphics, False if empty margin. + """ + if y_start >= y_end: + return False + strip = inv[y_start:y_end, :] + density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255) + return density > min_density + + +def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, + img_w: int, img_h: int, + inv: Optional[np.ndarray] = None) -> None: + """Add header/footer/margin regions in-place. + + Uses gap-based detection when *inv* is provided, otherwise falls back + to simple top_y/bottom_y bounds. + + Region types depend on whether there is actual content (text/graphics): + - 'header' / 'footer' — region contains text (e.g. 
title, page number) + - 'margin_top' / 'margin_bottom' — region is empty page margin + """ + header_y: Optional[int] = None + footer_y: Optional[int] = None + + if inv is not None: + header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h) + + # --- Top region --- + top_boundary = header_y if header_y is not None and header_y > 10 else ( + top_y if top_y > 10 else None + ) + if top_boundary is not None: + has_content = inv is not None and _region_has_content(inv, 0, top_boundary) + rtype = 'header' if has_content else 'margin_top' + regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)) + logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px " + f"(has_content={has_content})") + + # --- Bottom region --- + bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else ( + bottom_y if bottom_y < img_h - 10 else None + ) + if bottom_boundary is not None: + has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h) + rtype = 'footer' if has_content else 'margin_bottom' + regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w, + height=img_h - bottom_boundary)) + logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} " + f"height={img_h - bottom_boundary}px (has_content={has_content})") + + +# --- Main Entry Point --- + +def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: + """Detect columns using two-phase approach: geometry then content classification. + + Phase A: detect_column_geometry() — clustering word positions into columns. + Phase B: classify_column_types() — content-based type assignment with fallback. + + Falls back to projection-based analyze_layout() if geometry detection fails. + + Args: + ocr_img: Binarized grayscale image for layout analysis. + dewarped_bgr: Original BGR image (for Tesseract word detection). 
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect page columns in two phases: word geometry, then positional typing.

    Phase A clusters OCR word positions into column geometries via
    detect_column_geometry(); Phase B assigns region types positionally via
    positional_column_regions(). When geometry detection fails, the function
    falls back to the projection-profile based analyze_layout().

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    img_h, img_w = ocr_img.shape[:2]

    # Phase A: cluster words into column geometries.
    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
    if geometry is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = geometry
    content_w = right_x - left_x

    # Find header/footer boundaries first, so sub-column clustering ignores them.
    if _inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(_inv, img_w, img_h)
    else:
        header_y, footer_y = None, None

    # Separate narrow sub-columns (e.g. page references) before classification,
    # then split broad columns that still contain EN+DE mixed via coverage gaps.
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: positional classification (no language scoring).
    regions = positional_column_regions(geometries, content_w, bottom_y - top_y, left_x)

    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns a list of line strings (one per visual line in the cell).
    """
    if not words:
        return []

    rendered = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        rendered.append(' '.join(entry['text'] for entry in line_words))
    return rendered
['Fuß-', 'boden'] → ['Fußboden'] + ['some text-', 'thing here'] → ['something here'] + """ + if len(lines) <= 1: + return lines + + result = [] + i = 0 + while i < len(lines): + line = lines[i] + # If line ends with '-' and there's a next line, rejoin + if i + 1 < len(lines) and line.rstrip().endswith('-'): + stripped = line.rstrip() + # Get the word fragment before hyphen (last word) + prefix = stripped[:-1] # remove trailing hyphen + next_line = lines[i + 1] + # Join: last word of this line + first word of next line + prefix_words = prefix.rsplit(' ', 1) + next_words = next_line.split(' ', 1) + if len(prefix_words) > 1: + joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] + else: + joined = prefix_words[0] + next_words[0] + remainder = next_words[1] if len(next_words) > 1 else '' + if remainder: + result.append(joined + ' ' + remainder) + else: + result.append(joined) + i += 2 + else: + result.append(line) + i += 1 + return result + + +def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: + """Join OCR words into text in correct reading order, preserving line breaks. + + Groups words into visual lines by Y-tolerance, sorts each line by X, + rejoins hyphenated words, then joins lines with newlines. 
+ """ + lines = _words_to_reading_order_lines(words, y_tolerance_px) + lines = _rejoin_hyphenated(lines) + return '\n'.join(lines) + + +# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) --- + +_rapid_engine = None +RAPIDOCR_AVAILABLE = False + +try: + from rapidocr import RapidOCR as _RapidOCRClass + from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType + RAPIDOCR_AVAILABLE = True + logger.info("RapidOCR available — can be used as alternative to Tesseract") +except ImportError: + logger.info("RapidOCR not installed — using Tesseract only") + + +def _get_rapid_engine(): + """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.""" + global _rapid_engine + if _rapid_engine is None: + _rapid_engine = _RapidOCRClass(params={ + # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß) + "Rec.lang_type": _LangRec.LATIN, + "Rec.model_type": _ModelType.SERVER, + "Rec.ocr_version": _OCRVersion.PPOCRV5, + # Tighter detection boxes to reduce word merging + "Det.unclip_ratio": 1.3, + # Lower threshold to detect small chars (periods, ellipsis, phonetics) + "Det.box_thresh": 0.4, + # Silence verbose logging + "Global.log_level": "critical", + }) + logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)") + return _rapid_engine + + +def ocr_region_rapid( + img_bgr: np.ndarray, + region: PageRegion, +) -> List[Dict[str, Any]]: + """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format. + + Args: + img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray). + region: Region to crop and OCR. + + Returns: + List of word dicts with text, left, top, width, height, conf, region_type. 
+ """ + engine = _get_rapid_engine() + + # Crop region from BGR image + crop = img_bgr[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + result = engine(crop) + + if result is None or result.boxes is None or result.txts is None: + return [] + + words = [] + boxes = result.boxes # shape (N, 4, 2) — 4 corner points per text line + txts = result.txts # tuple of strings + scores = result.scores # tuple of floats + + for i, (box, txt, score) in enumerate(zip(boxes, txts, scores)): + if not txt or not txt.strip(): + continue + + # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left) + xs = [p[0] for p in box] + ys = [p[1] for p in box] + left = int(min(xs)) + top = int(min(ys)) + w = int(max(xs) - left) + h = int(max(ys) - top) + + words.append({ + 'text': txt.strip(), + 'left': left + region.x, # Absolute coords + 'top': top + region.y, + 'width': w, + 'height': h, + 'conf': int(score * 100), # 0-100 like Tesseract + 'region_type': region.type, + }) + + return words + + +def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]: + """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid). + + Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation. + Bboxes are approximated from equal line-height distribution within the region. + Falls back to Tesseract if TrOCR is not available. 
+ """ + from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available + + if not _check_trocr_available(): + logger.warning("TrOCR not available, falling back to Tesseract") + if region.height > 0 and region.width > 0: + ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None + if ocr_img_crop is not None: + return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) + return [] + + crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] + if crop.size == 0: + return [] + + try: + import torch + from PIL import Image as _PILImage + + processor, model = get_trocr_model(handwritten=handwritten) + if processor is None or model is None: + logger.warning("TrOCR model not loaded, falling back to Tesseract") + ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) + return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) + + pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) + lines = _split_into_lines(pil_crop) + if not lines: + lines = [pil_crop] + + device = next(model.parameters()).device + all_text = [] + confidences = [] + for line_img in lines: + pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device) + with torch.no_grad(): + generated_ids = model.generate(pixel_values, max_length=128) + text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + if text_line: + all_text.append(text_line) + confidences.append(0.85 if len(text_line) > 3 else 0.5) + + if not all_text: + return [] + + avg_conf = int(sum(confidences) / len(confidences) * 100) + line_h = region.height // max(len(all_text), 1) + words = [] + for i, line in enumerate(all_text): + words.append({ + "text": line, + "left": region.x, + "top": region.y + i * line_h, + "width": region.width, + "height": line_h, + "conf": avg_conf, + "region_type": region.type, + }) + return words + + except Exception as e: + 
def ocr_region_lighton(img_bgr: np.ndarray, region: "PageRegion") -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.

    Returns:
        One word dict per recognized text line; bboxes are approximated by
        slicing the region into equal-height horizontal bands. [] on empty
        crops, empty model output, or failure.
    """
    # Imported lazily so the module loads even when the service is absent.
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # (Removed an unused function-local `import io`; the module-level
        # `io` import remains available.)
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # Approximate per-line bboxes by slicing the region into equal bands.
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed confidence — no per-line score is read from the model
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
      "y you" → "you")
    """
    for entry in entries:
        en = entry.get('english', '') or ''
        de = entry.get('german', '') or ''
        ex = entry.get('example', '') or ''

        # Apply general rules to all fields
        for pattern, replacement in _CHAR_CONFUSION_RULES:
            en = pattern.sub(replacement, en)
            de = pattern.sub(replacement, de)
            ex = pattern.sub(replacement, ex)

        # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I".
        # NOTE(review): _DE_INDICATORS_FOR_EN_I is defined in the (truncated)
        # constant span above — presumably a frozenset of German first-person
        # pronouns; confirm once the span is restored.
        de_lower_words = set(de.lower().replace(',', ' ').split())
        if de_lower_words & _DE_INDICATORS_FOR_EN_I:
            # Any remaining "1" in EN that looks like "I"
            en = re.sub(r'\b1\b(?![\d.,])', 'I', en)

        # Fix "y " artifact before repeated word: "y you" → "you".
        # NOTE(review): the regex drops "y " before ANY lowercase letter, not
        # only before a repeated word — confirm this aggressiveness is intended.
        en = re.sub(r'\by\s+([a-z])', r'\1', en)
        ex = re.sub(r'\by\s+([a-z])', r'\1', ex)

        entry['english'] = en.strip()
        entry['german'] = de.strip()
        entry['example'] = ex.strip()

    return entries


# --- B. Comma-Separated Word Form Splitting ---

def _is_singular_plural_pair(parts: List[str]) -> bool:
    """Detect if comma-separated parts are singular/plural forms of the same word.

    E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
    "break, broke, broken" → False (different verb forms, OK to split).

    Heuristic: exactly 2 parts that share a common prefix of >= 50% length
    (this also covers plural suffixes such as +s, +es, +en), OR the parts
    differ only by an added umlaut (a→ä, o→ö, u→ü).
    """
    if len(parts) != 2:
        return False

    a, b = parts[0].lower().strip(), parts[1].lower().strip()
    if not a or not b:
        return False

    # Common prefix heuristic: if words share >= 50% of the shorter word,
    # they are likely forms of the same word (Maus/Mäuse, child/children).
    min_len = min(len(a), len(b))
    common = 0
    for ca, cb in zip(a, b):
        if ca == cb:
            common += 1
        else:
            break
    if common >= max(2, min_len * 0.5):
        return True

    # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
    umlaut_map = str.maketrans('aou', 'äöü')
    if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
        return True

    return False


def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
    because those are forms of the same vocabulary entry.

    Only splits when both EN and DE have the same number of comma-parts
    (more than one) and every part is short (≤ 3 words — word forms, not
    sentences). Two-part entries are split as well, unless they look like
    singular/plural forms of the same word (see _is_singular_plural_pair).
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Only split if we have multiple parts and counts match
        should_split = False
        if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
            # All parts must be short (word forms, not sentences)
            if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
                # Do NOT split singular/plural pairs (2 parts that are
                # forms of the same word)
                if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
                    should_split = False
                else:
                    should_split = True

        if not should_split:
            result.append(entry)
            continue

        # Split into individual entries
        for k in range(len(en_parts)):
            sub = dict(entry)  # shallow copy
            sub['english'] = en_parts[k].strip()
            sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
            sub['example'] = ''  # examples get attached later
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result


def _split_by_comma(text: str) -> List[str]:
    """Split text by commas, but not inside brackets [...] or parens (...)."""
    if ',' not in text:
        return [text]

    parts = []
    depth_bracket = 0
    depth_paren = 0
    current = []

    for ch in text:
        if ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket = max(0, depth_bracket - 1)
        elif ch == '(':
            depth_paren += 1
        elif ch == ')':
            depth_paren = max(0, depth_paren - 1)
        elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
            parts.append(''.join(current).strip())
            current = []
            continue
        current.append(ch)

    if current:
        parts.append(''.join(current).strip())

    # Filter empty parts
    return [p for p in parts if p]


# --- C. Example Sentence Attachment ---

def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
    """Find the vocab entry whose English word(s) best match the example sentence.

    Returns index into vocab_entries, or -1 if no match found.
    Uses word stem overlap: "a broken arm" matches "broken" or "break".
    Direct word matches score 10 each; prefix-stem matches score 5 each.
    """
    if not vocab_entries or not example_text:
        return -1

    example_lower = example_text.lower()
    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))

    best_idx = -1
    best_score = 0

    for i, entry in enumerate(vocab_entries):
        en = (entry.get('english', '') or '').lower()
        if not en:
            continue

        # Extract vocab words (split on space, comma, newline)
        vocab_words = set(re.findall(r'[a-zäöüß]+', en))

        # Score: how many vocab words appear in the example?
        # Also check if example words share a common stem (first 4 chars)
        direct_matches = vocab_words & example_words
        score = len(direct_matches) * 10

        # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
        if score == 0:
            for vw in vocab_words:
                if len(vw) < 3:
                    continue
                stem = vw[:4] if len(vw) >= 4 else vw[:3]
                for ew in example_words:
                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
                        score += 5
                        break

        if score > best_score:
            best_score = score
            best_idx = i

    return best_idx if best_score > 0 else -1


def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
    Row 1: break, broke, broken / brechen, brach, gebrochen
    Row 2: a broken arm (no DE → example for "broken")
    Row 3: a broken plate (no DE → example for "broken")
    Row 4: egg / Ei (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (semantic matching via word overlap)
    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE)
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        # NOTE(review): ex is computed but never used in this function.
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not real translation.
        # "Ei" (2 chars) is a valid German word, so threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if
        # the EN text looks like a sentence (>= 4 words, or contains
        # typical sentence punctuation). Short EN text (1-3 words) is
        # more likely a vocab entry whose DE was missed by OCR.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        # Also requires at least one preceding vocab entry to attach to.
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find best matching vocab entry
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to last entry
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach examples to their matched vocab entries
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries


# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30


def _ipa_from_britfone(word: str) -> Optional[str]:
    """British IPA from the Britfone dictionary (MIT).

    Returns None when the dictionary is unavailable or the word is unknown.
    Expects an already-lowercased word.
    """
    if _britfone_dict:
        return _britfone_dict.get(word)
    return None


def _ipa_from_cmu(word: str) -> Optional[str]:
    """American IPA via eng_to_ipa/CMU (MIT).

    eng_to_ipa marks unknown words with '*', which is treated as a miss.
    Returns None when the converter is unavailable or the word is unknown.
    """
    if _ipa_convert_american:
        result = _ipa_convert_american(word)
        if result and '*' not in result:
            return result
    return None


def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found in any available source.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    # Preferred source first, the other as fallback. Any unknown
    # `pronunciation` value behaves like 'british' (Britfone preferred),
    # which matches the previous "try any available source" order.
    if pronunciation == 'american':
        sources = (_ipa_from_cmu, _ipa_from_britfone)
    else:
        sources = (_ipa_from_britfone, _ipa_from_cmu)

    for source in sources:
        ipa = source(word_lower)
        if ipa:
            return ipa
    return None


def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary.
    Mutates `entries` in place and returns the same list.
    """
    if not IPA_AVAILABLE:
        return entries

    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    # german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    # example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-check: skip entries with no opening bracket at all.
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            replaced_count += 1
            entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries


# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info: cross (with), complain (about/of), agree (on/with)
    NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Since we only process the English field, we only need to recognize
    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False

    # Split on / for patterns like (about/of), (on/with)
    tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
    if not tokens:
        return False

    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Only English-field text is passed here; legitimate parenthetical content
    such as grammar particles "(with)" and already-correct IPA is preserved.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    # Second pass: strip remaining orphan brackets that are garbled IPA.
    # These have no word before them (the main regex requires \b word \s* bracket).
    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
    # Kept: English grammar parens "(about/of)" and correct IPA "[dˈɑːns]".
    # (German parens like "(sich beschweren)" never reach this function —
    # only the English field is processed; see _fix_phonetic_brackets.)
    def _strip_orphan_bracket(m):
        content = m.group(1).strip()
        # Keep grammar info: (with), (about/of)
        if _is_grammar_bracket_content(content):
            return m.group(0)
        # Keep correct IPA (contains Unicode IPA characters)
        if any(ch in _IPA_CHARS for ch in content):
            return m.group(0)
        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
        return ''

    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()

    return text


def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word in a row to exactly one column.

    Uses a two-pass strategy:
    1. Overlap: assign each word to the column with the largest horizontal
       overlap (robust for narrow columns like page_ref).
    2. Fallback: midpoint-range containment of the word center, then nearest
       column center as a last resort.

    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.

    Args:
        row: Row with words (relative coordinates).
        columns: Sorted list of columns (absolute coordinates).

    Returns:
        Dict mapping col_index → list of words assigned to that column.
    """
    result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}

    if not row.words or not columns:
        return result

    left_x = row.x  # content ROI left (absolute)

    # Build non-overlapping column assignment ranges using midpoints.
    # For adjacent columns, the boundary is the midpoint between them.
    # This prevents words near column borders from being assigned to
    # the wrong column (e.g. "We" at the start of an example sentence
    # being stolen by the preceding DE column).
    n = len(columns)
    col_ranges_rel = []  # (assign_left, assign_right) per column
    for ci, col in enumerate(columns):
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width

        # Left boundary: midpoint to previous column, or 0
        if ci == 0:
            assign_left = 0
        else:
            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
            assign_left = (prev_right + col_left_rel) / 2

        # Right boundary: midpoint to next column, or infinity (row width)
        if ci == n - 1:
            assign_right = row.width + 100  # generous for last column
        else:
            next_left = columns[ci + 1].x - left_x
            assign_right = (col_right_rel + next_left) / 2

        col_ranges_rel.append((assign_left, assign_right))

    for w in row.words:
        w_left = w['left']
        w_right = w_left + w['width']
        w_center_x = w_left + w['width'] / 2

        # Primary: overlap-based matching — assign to column with most overlap.
        # This is more robust than center-based for narrow columns (page_ref)
        # where the last character's center may fall into the next column.
        best_col = -1
        best_overlap = 0
        for ci, col in enumerate(columns):
            col_left_rel = col.x - left_x
            col_right_rel = col_left_rel + col.width
            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
            if overlap > best_overlap:
                best_overlap = overlap
                best_col = ci

        if best_col >= 0 and best_overlap > 0:
            result[best_col].append(w)
        else:
            # Fallback: center-based range matching
            assigned = False
            for ci, (al, ar) in enumerate(col_ranges_rel):
                if al <= w_center_x < ar:
                    result[ci].append(w)
                    assigned = True
                    break

            if not assigned:
                # Last resort: nearest column center
                best_col = 0
                col_left_0 = columns[0].x - left_x
                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
                for ci in range(1, n):
                    col_left = columns[ci].x - left_x
                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
                    if dist < best_dist:
                        best_dist = dist
                        best_col = ci
                result[best_col].append(w)

    return result


# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')

# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
# Dictionary of legitimate short words; everything is lowercase, length 1-3.
# Built from space-separated word lists and split once at import time.
_COMMON_SHORT_WORDS: set = set(
    (
        # EN 1-2 letter
        "a i am an as at be by do go he if in is it me my no of oh ok on "
        "or so to up us we "
        # EN 3 letter
        "ace act add age ago aid aim air all and ant any ape arc are ark "
        "arm art ask ate axe bad bag ban bar bat bay bed bee bet big bin "
        "bit bow box boy bud bug bun bus but buy cab can cap car cat cop "
        "cow cry cub cup cut dad dam day den dew did die dig dim dip dog "
        "dot dry due dug dye ear eat eel egg elm end era eve ewe eye fan "
        "far fat fax fed fee few fig fin fir fit fix fly foe fog for fox "
        "fry fun fur gag gap gas get god got gum gun gut guy gym had ham "
        "has hat hay hen her hid him hip his hit hog hop hot how hue hug "
        "hum hut ice icy ill imp ink inn ion its ivy jam jar jaw jay jet "
        "jig job jog joy jug key kid kin kit lab lad lag lap law lay led "
        "leg let lid lie lip lit log lot low mad man map mat maw may men "
        "met mid mix mob mog mom mop mow mrs mud mug mum nag nap net new "
        "nod nor not now nun nut oak oar oat odd off oft oil old one opt "
        "orb ore our out owe owl own pad pal pan pat paw pay pea peg pen "
        "per pet pie pig pin pit ply pod pop pot pro pry pub pug pun pup "
        "put rag ram ran rap rat raw ray red ref rib rid rig rim rip rob "
        "rod roe rot row rub rug rum run rut rye sac sad sag sap sat saw "
        "say sea set sew she shy sin sip sir sis sit six ski sky sly sob "
        "sod son sop sot sow soy spa spy sty sub sue sum sun sup tab tad "
        "tag tan tap tar tax tea ten the tie tin tip toe ton too top tow "
        "toy try tub tug two urn use van vat vet via vie vim vow wag war "
        "was wax way web wed wet who why wig win wit woe wok won woo wow "
        "yam yap yaw yea yes yet yew you zap zip zoo "
        # DE 2-3 letter
        "ab da du ei er es ja ob um zu "
        "als alt auf aus bei bin bis das dem den der des die dir ehe ein "
        "eng gar gib gut hat her ich ihm ihr ins ist mal man mir mit nah "
        "neu nie nur nun ort rad rat rot ruf ruh sei sie tag tal tat tee "
        "tor tun tut uns vom von vor war was weg wem wen wer wie wir wut "
        "zum zur"
    ).split()
)

# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
+_KNOWN_ABBREVIATIONS: set = { + # EN dictionary meta-words + 'sth', 'sb', 'smth', 'smb', 'sbd', + # EN general + 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', + 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', + # EN references / textbook + 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', + 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', + 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', + 'ans', 'wb', 'tb', 'vocab', + # EN parts of speech / grammar + 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', + 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', + 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', + 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', + 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', + 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', + 'syn', 'ant', 'opp', 'var', 'orig', + # EN titles + 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', + # EN pronunciation + 'br', 'am', 'brit', 'amer', + # EN units + 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', + # DE general + 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', + 'bes', 'insb', 'insbes', 'bspw', 'ca', + 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', + 'inkl', 'exkl', 'zzgl', 'abzgl', + # DE references + 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', + 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', + 's', 'sp', 'zit', 'zs', 'vlg', + # DE grammar + 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj', + 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', + 'trennb', 'untrennb', 'ugs', 'geh', 'pej', + # DE regional + 'nordd', 'österr', 'schweiz', + # Linguistic + 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', + 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', + 'count', 'uncount', 'indef', 'def', 'poss', 'demon', +} + + +def _is_noise_tail_token(token: str) -> bool: + """Check if a token at the END of cell text is trailing OCR noise. 
+ + Trailing fragments are very common OCR artifacts from image edges, + borders, and neighbouring cells. This is more aggressive than a + general word filter: any short token that isn't in the dictionary + of common EN/DE words is considered noise. + + Examples of noise: "Es)", "3", "ee", "B" + Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]" + """ + t = token.strip() + if not t: + return True + + # Keep ellipsis + if t in ('...', '…'): + return False + + # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc. + if t.startswith('[') or t.startswith('["') or t.startswith("['"): + return False + if t.endswith(']'): + return False + + # Pure non-alpha → noise ("3", ")", "|") + alpha_chars = _RE_ALPHA.findall(t) + if not alpha_chars: + return True + + # Extract only alpha characters for dictionary lookup + cleaned = ''.join(alpha_chars) + + # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep + if cleaned.lower() in _KNOWN_ABBREVIATIONS: + return False + + # Strip normal trailing punctuation before checking for internal noise. + stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" + t_check = stripped_punct if stripped_punct else t + + # Check for legitimate punctuation patterns vs. real noise. + # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir", + # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen" + # Noise: "3d", "B|", "x7" + # Strategy: strip common dictionary punctuation (parens, hyphens, slashes), + # THEN check if residual contains only alpha characters. + t_inner = t_check + # Remove all parentheses, hyphens, slashes, and dots — these are normal + # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)", + # "(zer)brechen", "wir/uns", "e.g." + t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner) + # Now check: does the inner form still have non-alpha noise? 
+ inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) + has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False + + # Long alpha words (4+ chars) without internal noise are likely real + if len(cleaned) >= 4 and not has_internal_noise: + return False + + # Short words: check dictionary (uses only alpha chars) + if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise: + return False + + # Default: short or suspicious → noise + return True + + +def _is_garbage_text(text: str) -> bool: + """Check if entire cell text is OCR garbage from image areas. + + Garbage text = no recognizable dictionary word. Catches + "(ci]oeu", "uanoaain." etc. + """ + words = _RE_REAL_WORD.findall(text) + if not words: + # Check if any token is a known abbreviation (e.g. "e.g.") + alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() + if alpha_only in _KNOWN_ABBREVIATIONS: + return False + return True + + for w in words: + wl = w.lower() + # Known short word or abbreviation → not garbage + if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: + return False + # Long word (>= 4 chars): check vowel/consonant ratio. + # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" + # or "cioeu" has unusual ratios (too many or too few vowels). + if len(wl) >= 4: + vowels = sum(1 for c in wl if c in 'aeiouäöü') + ratio = vowels / len(wl) + if 0.15 <= ratio <= 0.65: + return False # plausible vowel ratio → real word + + return True + + +def _clean_cell_text(text: str) -> str: + """Remove OCR noise from cell text. Generic filters: + + 1. If the entire text has no real alphabetic word (>= 2 letters), clear. + 2. If the entire text is garbage (no dictionary word), clear. + 3. Strip trailing noise tokens from the end of the text. + """ + stripped = text.strip() + if not stripped: + return '' + + # --- Filter 1: No real word at all --- + if not _RE_REAL_WORD.search(stripped): + # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." 
+ alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' + + # --- Filter 2: Entire text is garbage --- + if _is_garbage_text(stripped): + return '' + + # --- Filter 3: Strip trailing noise tokens --- + tokens = stripped.split() + while tokens and _is_noise_tail_token(tokens[-1]): + tokens.pop() + if not tokens: + return '' + + return ' '.join(tokens) + + +def _clean_cell_text_lite(text: str) -> str: + """Simplified noise filter for cell-first OCR (isolated cell crops). + + Since each cell is OCR'd in isolation (no neighbour content visible), + trailing-noise stripping is unnecessary. Only 2 filters remain: + + 1. No real alphabetic word (>= 2 letters) and not a known abbreviation → empty. + 2. Entire text is garbage (no dictionary word) → empty. + """ + stripped = text.strip() + if not stripped: + return '' + + # --- Filter 1: No real word at all --- + if not _RE_REAL_WORD.search(stripped): + alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' + + # --- Filter 2: Entire text is garbage --- + if _is_garbage_text(stripped): + return '' + + return stripped + + +# --------------------------------------------------------------------------- +# Bold detection via stroke-width analysis (relative / page-level) +# --------------------------------------------------------------------------- + +def _measure_stroke_width(gray_crop: np.ndarray) -> float: + """Measure mean stroke width in a binarised cell crop. + + Returns a DPI-normalised value (mean stroke width as % of crop height), + or 0.0 if measurement is not possible. 
+ """ + if gray_crop is None or gray_crop.size == 0: + return 0.0 + h, w = gray_crop.shape[:2] + if h < 10 or w < 10: + return 0.0 + + # Binarise: text = white (255), background = black (0) + _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) + if cv2.countNonZero(bw) < 20: + return 0.0 + + # Distance transform: value at each white pixel = distance to nearest black + dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3) + + # Skeleton via morphological thinning + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + thin = bw.copy() + for _ in range(max(1, min(h, w) // 6)): + eroded = cv2.erode(thin, kernel) + if cv2.countNonZero(eroded) < 5: + break + thin = eroded + + skeleton_pts = thin > 0 + if not np.any(skeleton_pts): + return 0.0 + mean_stroke = float(np.mean(dist[skeleton_pts])) + return mean_stroke / max(h, 1) * 100 # normalised: % of cell height + + +def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray], + img_w: int, img_h: int) -> None: + """Two-pass bold detection: measure all cells, then compare against median. + + Cells with stroke width > 1.4× the page median are marked as bold. + This adapts automatically to font, DPI and scan quality. + Modifies cells in-place (sets 'is_bold' key). 
+    """
+    if ocr_img is None:
+        return
+
+    # Pass 1: measure stroke width for every cell with text
+    metrics: List[float] = []        # only successful measurements (sw > 0)
+    cell_strokes: List[float] = []   # one entry per cell, parallel to `cells`
+    for cell in cells:
+        sw = 0.0
+        if cell.get('text', '').strip():
+            bp = cell['bbox_px']
+            # Clamp the crop to image bounds before measuring.
+            y1 = max(0, bp['y'])
+            y2 = min(img_h, bp['y'] + bp['h'])
+            x1 = max(0, bp['x'])
+            x2 = min(img_w, bp['x'] + bp['w'])
+            if y2 > y1 and x2 > x1:
+                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
+        cell_strokes.append(sw)
+        if sw > 0:
+            metrics.append(sw)
+
+    if len(metrics) < 3:
+        # Too few cells to compare — leave all as non-bold
+        # NOTE(review): on the early returns no cell gets an 'is_bold' key at
+        # all; downstream code presumably uses cell.get('is_bold') — confirm.
+        return
+
+    median_sw = float(np.median(metrics))
+    if median_sw <= 0:
+        return
+
+    # Pass 2: cells significantly above median → bold
+    for cell, sw in zip(cells, cell_strokes):
+        cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
+
+
+# ---------------------------------------------------------------------------
diff --git a/klausur-service/backend/cv_preprocessing.py b/klausur-service/backend/cv_preprocessing.py
new file mode 100644
index 0000000..133d47f
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing.py
@@ -0,0 +1,1166 @@
+"""
+Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+
+License: Apache 2.0 (commercially usable)
+PRIVACY: All processing happens locally.
+"""
+
+import logging
+import time
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Guarded imports — mirror cv_vocab_types guards
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
+    """Render a PDF page to a high-resolution numpy array (BGR).
+
+    Args:
+        pdf_data: Raw PDF bytes.
+        page_number: 0-indexed page number.
+        zoom: Zoom factor applied to the 72-DPI PDF base (3.0 → 216 DPI).
+
+    Returns:
+        numpy array in BGR format.
+
+    Raises:
+        ValueError: If ``page_number`` is beyond the document's page count.
+    """
+    import fitz  # PyMuPDF — imported lazily so the module loads without it
+
+    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
+    # NOTE(review): pdf_doc is not closed if the checks/rendering below raise —
+    # consider try/finally (or `with fitz.open(...)`) in a follow-up.
+    if page_number >= pdf_doc.page_count:
+        raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")
+
+    page = pdf_doc[page_number]
+    mat = fitz.Matrix(zoom, zoom)
+    pix = page.get_pixmap(matrix=mat)
+
+    # Convert to numpy BGR — pix.n is the channel count of the pixmap.
+    img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
+    if pix.n == 4:  # RGBA
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
+    elif pix.n == 3:  # RGB
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
+    else:  # Grayscale
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
+
+    pdf_doc.close()
+    return img_bgr
+
+
+def render_image_high_res(image_data: bytes) -> np.ndarray:
+    """Load an image (PNG/JPEG) into a numpy array (BGR).
+
+    Args:
+        image_data: Raw image bytes.
+
+    Returns:
+        numpy array in BGR format.
+
+    Raises:
+        ValueError: If the bytes cannot be decoded as an image.
+    """
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise ValueError("Could not decode image data")
+    return img_bgr
+
+
+# =============================================================================
+# Stage 1b: Orientation Detection (0°/90°/180°/270°)
+# =============================================================================
+
+def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
+    """Detect page orientation via Tesseract OSD and rotate if needed.
+
+    Handles upside-down scans (180°) common with book scanners where
+    every other page is flipped due to the scanner hinge.
+
+    Returns:
+        (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
+    """
+    # pytesseract is an optional dependency (guarded import at module top).
+    if pytesseract is None:
+        return img_bgr, 0
+
+    try:
+        # Tesseract OSD needs a grayscale or RGB image
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+        pil_img = Image.fromarray(gray)
+
+        osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT)
+        rotate = osd.get("rotate", 0)
+        confidence = osd.get("orientation_conf", 0.0)
+
+        logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")
+
+        # Low-confidence OSD results are ignored rather than risking a wrong flip.
+        if rotate == 0 or confidence < 1.0:
+            return img_bgr, 0
+
+        # Apply rotation
+        # NOTE(review): Tesseract's OSD `rotate` value is commonly documented as
+        # the *clockwise* rotation needed to upright the text, which would map
+        # rotate==90 → ROTATE_90_CLOCKWISE — the opposite of the mapping below.
+        # 180° (the stated main use case) is unaffected either way; verify the
+        # 90°/270° branches against a rotated test scan before relying on them.
+        if rotate == 180:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
+        elif rotate == 90:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE)
+        elif rotate == 270:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE)
+        else:
+            return img_bgr, 0
+
+        logger.info(f"OSD: rotated {rotate}° to fix orientation")
+        return corrected, rotate
+
+    except Exception as e:
+        # Best-effort: OSD failure must never break the pipeline.
+        logger.warning(f"OSD orientation detection failed: {e}")
+        return img_bgr, 0
+
+
+# =============================================================================
+# Stage 2: Deskew (Rotation Correction)
+# =============================================================================
+
+def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
+    """Correct rotation using Hough Line detection.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Tuple of (corrected image, detected angle in degrees).
+    """
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    # Binarize for line detection
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Detect lines
+    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
+                            minLineLength=img.shape[1] // 4, maxLineGap=20)
+
+    if lines is None or len(lines) < 3:
+        return img, 0.0
+
+    # Compute angles of near-horizontal lines
+    angles = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+        if abs(angle) < 15:  # Only near-horizontal
+            angles.append(angle)
+
+    if not angles:
+        return img, 0.0
+
+    # Median is robust against the occasional diagonal stroke detected as a line.
+    median_angle = float(np.median(angles))
+
+    # Limit correction to ±5°
+    if abs(median_angle) > 5.0:
+        median_angle = 5.0 * np.sign(median_angle)
+
+    # Below 0.1° the correction is not worth the interpolation blur.
+    if abs(median_angle) < 0.1:
+        return img, 0.0
+
+    # Rotate
+    h, w = img.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+
+    logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
+    return corrected, median_angle
+
+
+def deskew_image_by_word_alignment(
+    image_data: bytes,
+    lang: str = "eng+deu",
+    downscale_factor: float = 0.5,
+) -> Tuple[bytes, float]:
+    """Correct rotation by fitting a line through left-most word starts per text line.
+
+    More robust than Hough-based deskew for vocabulary worksheets where text lines
+    have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
+    copy to find word positions, computes the dominant left-edge column, fits a
+    line through those points and rotates the full-resolution image.
+
+    Args:
+        image_data: Raw image bytes (PNG/JPEG).
+        lang: Tesseract language string for the quick pass.
+        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
+            Must be > 0 (used as a divisor when scaling back up).
+
+    Returns:
+        Tuple of (rotated image as PNG bytes, detected angle in degrees).
+    """
+    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
+        return image_data, 0.0
+
+    # 1. Decode image
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img is None:
+        logger.warning("deskew_by_word_alignment: could not decode image")
+        return image_data, 0.0
+
+    orig_h, orig_w = img.shape[:2]
+
+    # 2. Downscale for fast Tesseract pass
+    small_w = int(orig_w * downscale_factor)
+    small_h = int(orig_h * downscale_factor)
+    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
+
+    # 3. Quick Tesseract — word-level positions
+    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
+    try:
+        data = pytesseract.image_to_data(
+            pil_small, lang=lang, config="--psm 6 --oem 3",
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception as e:
+        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
+        return image_data, 0.0
+
+    # 4. Per text-line, find the left-most word start
+    # Group by (block_num, par_num, line_num)
+    line_groups: Dict[tuple, list] = defaultdict(list)
+    for i in range(len(data["text"])):
+        text = (data["text"][i] or "").strip()
+        conf = int(data["conf"][i])
+        if not text or conf < 20:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        line_groups[key].append(i)
+
+    # Too few lines → angle estimate would be unreliable; leave image untouched.
+    if len(line_groups) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
+        return image_data, 0.0
+
+    # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
+    # Scale back to original resolution
+    scale = 1.0 / downscale_factor
+    points = []  # list of (x, y) in original-image coords
+    for key, indices in line_groups.items():
+        best_idx = min(indices, key=lambda i: data["left"][i])
+        lx = data["left"][best_idx] * scale
+        top = data["top"][best_idx] * scale
+        h = data["height"][best_idx] * scale
+        cy = top + h / 2.0
+        points.append((lx, cy))
+
+    # 5. Find dominant left-edge column + compute angle
+    # Only lines starting near the page's dominant left margin participate;
+    # indented lines (e.g. wrapped continuations) would bias the fit.
+    xs = np.array([p[0] for p in points])
+    ys = np.array([p[1] for p in points])
+    median_x = float(np.median(xs))
+    tolerance = orig_w * 0.03  # 3% of image width
+
+    mask = np.abs(xs - median_x) <= tolerance
+    filtered_xs = xs[mask]
+    filtered_ys = ys[mask]
+
+    if len(filtered_xs) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
+        return image_data, 0.0
+
+    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
+    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
+    slope = coeffs[0]  # dx/dy
+    angle_rad = np.arctan(slope)
+    angle_deg = float(np.degrees(angle_rad))
+
+    # Clamp to ±5°
+    angle_deg = max(-5.0, min(5.0, angle_deg))
+
+    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
+                f"(total lines: {len(line_groups)})")
+
+    if abs(angle_deg) < 0.05:
+        return image_data, 0.0
+
+    # 6. Rotate full-res image
+    center = (orig_w // 2, orig_h // 2)
+    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
+                             flags=cv2.INTER_LINEAR,
+                             borderMode=cv2.BORDER_REPLICATE)
+
+    # Encode back to PNG
+    success, png_buf = cv2.imencode(".png", rotated)
+    if not success:
+        logger.warning("deskew_by_word_alignment: PNG encoding failed")
+        return image_data, 0.0
+
+    return png_buf.tobytes(), angle_deg
+
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+    """Score a projection profile by the L2-norm of its first derivative.
+
+    Higher score = sharper transitions between text-lines and gaps,
+    i.e. better row/column alignment.
+    """
+    diff = np.diff(profile)
+    return float(np.sum(diff * diff))
+
+
+def deskew_image_iterative(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+    coarse_step: float = 0.1,
+    fine_range: float = 0.15,
+    fine_step: float = 0.02,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Iterative deskew using vertical-edge projection optimisation.
+
+    The key insight: at the correct rotation angle, vertical features
+    (word left-edges, column borders) become truly vertical, producing
+    the sharpest peaks in the vertical projection of vertical edges.
+
+    Method:
+    1. Detect vertical edges via Sobel-X on the central crop.
+    2. Coarse sweep: rotate edge image, compute vertical projection
+       gradient score. The angle where vertical edges align best wins.
+    3. Fine sweep: refine around the coarse winner.
+
+    Args:
+        img: BGR image (full resolution).
+        coarse_range: half-range in degrees for the coarse sweep.
+        coarse_step: step size in degrees for the coarse sweep.
+        fine_range: half-range around the coarse winner for the fine sweep.
+        fine_step: step size in degrees for the fine sweep.
+
+    Returns:
+        (rotated_bgr, angle_degrees, debug_dict)
+    """
+    h, w = img.shape[:2]
+    debug: Dict[str, Any] = {}
+
+    # --- Grayscale + vertical edge detection ---
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Central crop (15%-85% height, 10%-90% width) to avoid page margins
+    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
+    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
+    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
+
+    # Sobel-X → absolute vertical edges
+    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
+    edges = np.abs(sobel_x)
+    # Normalise to 0-255 for consistent scoring
+    edge_max = edges.max()
+    if edge_max > 0:
+        edges = (edges / edge_max * 255).astype(np.uint8)
+    else:
+        # Completely flat crop (blank page) — nothing to align on.
+        return img, 0.0, {"error": "no edges detected"}
+
+    crop_h, crop_w = edges.shape[:2]
+    crop_center = (crop_w // 2, crop_h // 2)
+
+    # Trim margin after rotation to avoid border artifacts
+    trim_y = max(4, int(crop_h * 0.03))
+    trim_x = max(4, int(crop_w * 0.03))
+
+    def _sweep_edges(angles: np.ndarray) -> list:
+        """Score each angle by vertical projection gradient of vertical edges."""
+        results = []
+        for angle in angles:
+            if abs(angle) < 1e-6:
+                # Skip the warp for the zero angle — identical to input.
+                rotated = edges
+            else:
+                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
+                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_REPLICATE)
+            # Trim borders to avoid edge artifacts
+            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
+            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
+            score = _projection_gradient_score(v_profile)
+            results.append((float(angle), score))
+        return results
+
+    # --- Phase 1: coarse sweep ---
+    # The +coarse_step*0.5 nudge makes arange include the upper endpoint.
+    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
+    coarse_results = _sweep_edges(coarse_angles)
+    best_coarse = max(coarse_results, key=lambda x: x[1])
+    best_coarse_angle, best_coarse_score = best_coarse
+
+    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
+    debug["coarse_best_score"] = round(best_coarse_score, 1)
+    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
+
+    # --- Phase 2: fine sweep around coarse winner ---
+    fine_lo = best_coarse_angle - fine_range
+    fine_hi = best_coarse_angle + fine_range
+    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
+    fine_results = _sweep_edges(fine_angles)
+    best_fine = max(fine_results, key=lambda x: x[1])
+    best_fine_angle, best_fine_score = best_fine
+
+    debug["fine_best_angle"] = round(best_fine_angle, 2)
+    debug["fine_best_score"] = round(best_fine_score, 1)
+    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
+
+    final_angle = best_fine_angle
+
+    # Clamp to ±5°
+    final_angle = max(-5.0, min(5.0, final_angle))
+
+    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")
+
+    if abs(final_angle) < 0.05:
+        return img, 0.0, debug
+
+    # --- Rotate full-res image ---
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h),
+                             flags=cv2.INTER_LINEAR,
+                             borderMode=cv2.BORDER_REPLICATE)
+
+    return rotated, final_angle, debug
+
+
+def _measure_textline_slope(img: np.ndarray) -> float:
+    """Measure residual text-line slope via Tesseract word-position regression.
+
+    Groups Tesseract words by (block, par, line), fits a linear regression
+    per line (y = slope * x + b), and returns the trimmed-mean slope in
+    degrees. Positive = text rises to the right, negative = falls.
+
+    This is the most direct measurement of remaining rotation after deskew.
+    """
+    # Aliased to avoid shadowing if a local `math` ever appears in this scope.
+    import math as _math
+
+    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
+        return 0.0
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    data = pytesseract.image_to_data(
+        Image.fromarray(gray),
+        output_type=pytesseract.Output.DICT,
+        config="--psm 6",
+    )
+
+    # Group word centres by text line
+    lines: Dict[tuple, list] = {}
+    for i in range(len(data["text"])):
+        txt = (data["text"][i] or "").strip()
+        # Short or low-confidence words give unreliable centre positions.
+        if len(txt) < 2 or int(data["conf"][i]) < 30:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        cx = data["left"][i] + data["width"][i] / 2.0
+        cy = data["top"][i] + data["height"][i] / 2.0
+        lines.setdefault(key, []).append((cx, cy))
+
+    # Per-line linear regression → slope angle
+    slopes: list = []
+    for pts in lines.values():
+        if len(pts) < 3:
+            continue
+        pts.sort(key=lambda p: p[0])
+        xs = np.array([p[0] for p in pts], dtype=np.float64)
+        ys = np.array([p[1] for p in pts], dtype=np.float64)
+        if xs[-1] - xs[0] < w * 0.15:
+            continue  # skip short lines
+        # Least-squares fit of y = slope*x + b over the word centres.
+        A = np.vstack([xs, np.ones_like(xs)]).T
+        result = np.linalg.lstsq(A, ys, rcond=None)
+        slope = result[0][0]
+        slopes.append(_math.degrees(_math.atan(slope)))
+
+    if len(slopes) < 3:
+        return 0.0
+
+    # Trimmed mean (drop 10% extremes on each side)
+    slopes.sort()
+    trim = max(1, len(slopes) // 10)
+    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
+    if not trimmed:
+        return 0.0
+
+    return sum(trimmed) / len(trimmed)
+
+
+def deskew_two_pass(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Two-pass deskew: iterative projection + word-alignment residual check.
+
+    Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
+    Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
+    to detect and fix residual skew that the projection method missed.
+
+    The two corrections are summed. If the residual from Pass 2 is below
+    0.3° it is ignored (already good enough).
+
+    Returns:
+        (corrected_bgr, total_angle_degrees, debug_dict)
+    """
+    debug: Dict[str, Any] = {}
+
+    # --- Pass 1: iterative projection ---
+    corrected, angle1, dbg1 = deskew_image_iterative(
+        img.copy(), coarse_range=coarse_range,
+    )
+    debug["pass1_angle"] = round(angle1, 3)
+    debug["pass1_method"] = "iterative"
+    debug["pass1_debug"] = dbg1
+
+    # --- Pass 2: word-alignment residual check on corrected image ---
+    angle2 = 0.0
+    try:
+        # Encode the corrected image to PNG bytes for word-alignment
+        # (that function takes bytes, not an ndarray).
+        ok, buf = cv2.imencode(".png", corrected)
+        if ok:
+            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
+            if abs(angle2) >= 0.3:
+                # Significant residual — decode and use the second correction
+                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
+                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
+                if corrected2 is not None:
+                    corrected = corrected2
+                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
+                                f"(total={angle1 + angle2:.2f}°)")
+                else:
+                    angle2 = 0.0
+            else:
+                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
+                angle2 = 0.0
+    except Exception as e:
+        # Best-effort: pass 2 must never undo pass 1.
+        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
+        angle2 = 0.0
+
+    # --- Pass 3: Tesseract text-line regression residual check ---
+    # The most reliable final check: measure actual text-line slopes
+    # using Tesseract word positions and linear regression per line.
+    angle3 = 0.0
+    try:
+        residual = _measure_textline_slope(corrected)
+        debug["pass3_raw"] = round(residual, 3)
+        # Same 0.3° significance threshold as pass 2.
+        if abs(residual) >= 0.3:
+            h3, w3 = corrected.shape[:2]
+            center3 = (w3 // 2, h3 // 2)
+            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
+            corrected = cv2.warpAffine(
+                corrected, M3, (w3, h3),
+                flags=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_REPLICATE,
+            )
+            angle3 = residual
+            logger.info(
+                "deskew_two_pass: pass3 text-line residual=%.2f° applied",
+                residual,
+            )
+        else:
+            logger.info(
+                "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
+                residual,
+            )
+    except Exception as e:
+        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
+
+    # Report the summed correction of all three passes.
+    total_angle = angle1 + angle2 + angle3
+    debug["pass2_angle"] = round(angle2, 3)
+    debug["pass2_method"] = "word_alignment"
+    debug["pass3_angle"] = round(angle3, 3)
+    debug["pass3_method"] = "textline_regression"
+    debug["total_angle"] = round(total_angle, 3)
+
+    logger.info(
+        "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
+        angle1, angle2, angle3, total_angle,
+    )
+
+    return corrected, total_angle, debug
+
+
+# =============================================================================
+# Stage 3: Dewarp (Book Curvature Correction)
+# =============================================================================
+
+def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
+    """Detect the vertical shear angle of the page.
+
+    After deskew (horizontal lines aligned), vertical features like column
+    edges may still be tilted. This measures that tilt by tracking the
+    strongest vertical edge across horizontal strips.
+
+    The result is a shear angle in degrees: the angular difference between
+    true vertical and the detected column edge.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    h, w = img.shape[:2]
+    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Vertical Sobel to find vertical edges
+    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+    # NOTE(review): float64 magnitudes above 255 wrap (modulo) when cast to
+    # uint8 — cv2.convertScaleAbs or prior normalisation would be safer;
+    # verify whether the wrap-around affects the Otsu threshold below.
+    abs_sobel = np.abs(sobel_x).astype(np.uint8)
+
+    # Binarize with Otsu
+    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    # Track the strongest left-margin edge in 20 horizontal strips.
+    num_strips = 20
+    strip_h = h // num_strips
+    edge_positions = []  # (y_center, x_position)
+
+    for i in range(num_strips):
+        y_start = i * strip_h
+        y_end = min((i + 1) * strip_h, h)
+        strip = binary[y_start:y_end, :]
+
+        # Project vertically (sum along y-axis)
+        projection = np.sum(strip, axis=0).astype(np.float64)
+        if projection.max() == 0:
+            continue
+
+        # Find the strongest vertical edge in left 40% of image
+        search_w = int(w * 0.4)
+        left_proj = projection[:search_w]
+        if left_proj.max() == 0:
+            continue
+
+        # Smooth and find peak
+        kernel_size = max(3, w // 100)
+        if kernel_size % 2 == 0:
+            kernel_size += 1
+        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
+        x_pos = float(np.argmax(smoothed))
+        y_center = (y_start + y_end) / 2.0
+        edge_positions.append((y_center, x_pos))
+
+    # Need a majority of strips to contribute before trusting the fit.
+    if len(edge_positions) < 8:
+        return result
+
+    ys = np.array([p[0] for p in edge_positions])
+    xs = np.array([p[1] for p in edge_positions])
+
+    # Remove outliers (> 2 std from median)
+    median_x = np.median(xs)
+    std_x = max(np.std(xs), 1.0)
+    mask = np.abs(xs - median_x) < 2 * std_x
+    ys = ys[mask]
+    xs = xs[mask]
+
+    if len(ys) < 6:
+        return result
+
+    # Fit straight line: x = slope * y + intercept
+    # The slope tells us the tilt of the vertical edge
+    straight_coeffs = np.polyfit(ys, xs, 1)
+    slope = straight_coeffs[0]  # dx/dy in pixels
+    fitted = np.polyval(straight_coeffs, ys)
+    residuals = xs - fitted
+    rmse = float(np.sqrt(np.mean(residuals ** 2)))
+
+    # Convert slope to angle: arctan(dx/dy) in degrees
+    import math
+    shear_degrees = math.degrees(math.atan(slope))
+
+    # Confidence grows with point count, shrinks with fit error (RMSE).
+    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(float(confidence), 2)
+
+    return result
+
+
+def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear angle by maximising variance of horizontal text-line projections.
+
+    Principle: horizontal text lines produce a row-projection profile with sharp
+    peaks (high variance) when the image is correctly aligned. Any residual shear
+    smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
+    corrected projection has the highest variance.
+
+    Works best on pages with clear horizontal banding (vocabulary tables, prose).
+    Complements _detect_shear_angle() which needs strong vertical edges.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    import math
+    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Otsu binarisation
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Work at half resolution for speed
+    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
+    sh, sw = small.shape
+
+    # 2-pass angle sweep for 10x better precision:
+    # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
+    # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)
+
+    def _sweep_variance(angles_list):
+        # Applies a SHEAR (not rotation, despite the `rotated` name) for
+        # each candidate angle and scores the row-projection variance.
+        results = []
+        for angle_deg in angles_list:
+            if abs(angle_deg) < 0.001:
+                rotated = small
+            else:
+                shear_tan = math.tan(math.radians(angle_deg))
+                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
+                rotated = cv2.warpAffine(small, M, (sw, sh),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_CONSTANT)
+            profile = np.sum(rotated, axis=1).astype(float)
+            results.append((angle_deg, float(np.var(profile))))
+        return results
+
+    # Pass 1: coarse
+    coarse_angles = [a * 0.5 for a in range(-6, 7)]  # 13 values
+    coarse_results = _sweep_variance(coarse_angles)
+    coarse_best = max(coarse_results, key=lambda x: x[1])
+
+    # Pass 2: fine around coarse best
+    fine_center = coarse_best[0]
+    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]  # 21 values
+    fine_results = _sweep_variance(fine_angles)
+    fine_best = max(fine_results, key=lambda x: x[1])
+
+    best_angle = fine_best[0]
+    best_variance = fine_best[1]
+    variances = coarse_results + fine_results
+
+    # Confidence: how much sharper is the best angle vs. the mean?
+    all_mean = sum(v for _, v in variances) / len(variances)
+    if all_mean > 0 and best_variance > all_mean:
+        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
+    else:
+        confidence = 0.0
+
+    result["shear_degrees"] = round(best_angle, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear using Hough transform on printed table / ruled lines.
+
+    Vocabulary worksheets have near-horizontal printed table borders. After
+    deskew these should be exactly horizontal; any residual tilt equals the
+    vertical shear angle (with inverted sign).
+
+    The sign convention: a horizontal line tilting +α degrees (left end lower)
+    means the page has vertical shear of -α degrees (left column edge drifts
+    to the left going downward).
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+
+    # Thresholds scale with page width so the detector is DPI-independent.
+    min_len = int(w * 0.15)
+    lines = cv2.HoughLinesP(
+        edges, rho=1, theta=np.pi / 360,
+        threshold=int(w * 0.08),
+        minLineLength=min_len,
+        maxLineGap=20,
+    )
+
+    if lines is None or len(lines) < 3:
+        return result
+
+    # Keep only near-horizontal segments; weight each by its length.
+    horizontal_angles: List[Tuple[float, float]] = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        if x1 == x2:
+            continue
+        angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
+        if abs(angle) <= 5.0:
+            length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
+            horizontal_angles.append((angle, length))
+
+    if len(horizontal_angles) < 3:
+        return result
+
+    # Weighted median
+    # Sort angles, accumulate the length weights, and pick the angle where
+    # the cumulative weight crosses half the total — robust to short noise
+    # segments that a plain median over segments would overweight.
+    angles_arr = np.array([a for a, _ in horizontal_angles])
+    weights_arr = np.array([l for _, l in horizontal_angles])
+    sorted_idx = np.argsort(angles_arr)
+    s_angles = angles_arr[sorted_idx]
+    s_weights = weights_arr[sorted_idx]
+    cum = np.cumsum(s_weights)
+    mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
+    median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
+
+    # Confidence = fraction of segments within 1° of the weighted median.
+    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
+    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
+
+    # Sign inversion: horizontal line tilt is complementary to vertical shear
+    shear_degrees = -median_angle
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear by measuring text-line straightness (Method D).
+
+    Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
+    bounding boxes, groups them into vertical columns by X-proximity,
+    and measures how the left-edge X position drifts with Y (vertical
+    position). The drift dx/dy is the tangent of the shear angle.
+
+    This directly measures vertical shear (column tilt) rather than
+    horizontal text-line slope, which is already corrected by deskew.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    import math
+    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    # Downscale 50% for speed
+    scale = 0.5
+    small = cv2.resize(img, (int(w * scale), int(h * scale)),
+                       interpolation=cv2.INTER_AREA)
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
+    pil_img = Image.fromarray(gray)
+
+    try:
+        data = pytesseract.image_to_data(
+            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception:
+        # Best-effort detector: a Tesseract failure just means zero confidence.
+        return result
+
+    # Collect word left-edges (x) and vertical centres (y)
+    words = []
+    for i in range(len(data['text'])):
+        text = data['text'][i].strip()
+        conf = int(data['conf'][i])
+        if not text or conf < 20 or len(text) < 2:
+            continue
+        left_x = float(data['left'][i])
+        cy = data['top'][i] + data['height'][i] / 2.0
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))
+
+    if len(words) < 15:
+        return result
+
+    # --- Group words into vertical columns by left-edge X proximity ---
+    # Sort by x, then cluster words whose left-edges are within x_tol
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"
+
+    # (lambda parameter `w` shadows the image-width local — intentional scope,
+    # only inside the key function)
+    words_by_x = sorted(words, key=lambda w: w[0])
+    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]
+
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            # Update running x as median of cluster
+            # (exponential moving average, 80/20 — tracks slow drift)
+            cur_x = cur_x * 0.8 + lx * 0.2
+        else:
+            # Close the current cluster; keep it only if it has enough members.
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)
+
+    if len(columns) < 2:
+        return result
+
+    # --- For each column, measure X-drift as a function of Y ---
+    # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        if y_range < h * scale * 0.3:
+            continue  # column must span at least 30% of image height
+        # Linear regression: x = a*y + b
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])  # dx/dy
+
+    if len(drifts) < 2:
+        return result
+
+    # Median dx/dy → shear angle
+    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))
+
+    # Confidence from column count + drift consistency
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f°, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
+    return result
+
+
+def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
+    """Check whether the dewarp correction actually improved alignment.
+
+    Compares horizontal projection variance before and after correction.
+    Higher variance means sharper text-line peaks, which indicates better
+    horizontal alignment.
+
+    Returns True if the correction improved the image, False if it should
+    be discarded.
+    """
+    def _h_proj_variance(img: np.ndarray) -> float:
+        # Row-projection variance at half resolution (speed) on the
+        # Otsu-binarised image — same metric used by the projection detector.
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        _, binary = cv2.threshold(gray, 0, 255,
+                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
+                           interpolation=cv2.INTER_AREA)
+        profile = np.sum(small, axis=1).astype(float)
+        return float(np.var(profile))
+
+    var_before = _h_proj_variance(original)
+    var_after = _h_proj_variance(corrected)
+
+    # Correction must improve variance (even by a tiny margin)
+    return var_after > var_before
+
+
+def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Apply a vertical shear correction to an image.
+
+    Shifts each row horizontally proportional to its distance from the
+    vertical center. This corrects the tilt of vertical features (columns)
+    without affecting horizontal alignment (text lines).
+
+    Args:
+        img: BGR image.
+        shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.
+
+    Returns:
+        Corrected image.
+    """
+    import math
+    h, w = img.shape[:2]
+    shear_tan = math.tan(math.radians(shear_degrees))
+
+    # Affine matrix: shift x by shear_tan * (y - h/2)
+    # [1  shear_tan  -h/2*shear_tan]
+    # [0  1           0            ]
+    # The -h/2 offset centres the shear so the middle row stays fixed.
+    M = np.float32([
+        [1, shear_tan, -h / 2.0 * shear_tan],
+        [0, 1, 0],
+    ])
+
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+    return corrected
+
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+    """Combine multiple shear detections into a single weighted estimate (v2).
+
+    Ensemble v2 changes vs v1:
+    - Minimum confidence threshold is ``_MIN_CONF`` (0.35; see the inline
+      comment below for the history of this value)
+    - text_lines method gets 1.5× weight boost (most reliable detector)
+    - Outlier filter at 1° from weighted mean
+
+    Returns:
+        (shear_degrees, ensemble_confidence, methods_used_str)
+    """
+    # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
+    # that individual methods detect with moderate confidence.
+    _MIN_CONF = 0.35
+
+    # text_lines gets a weight boost as the most content-aware method
+    _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+    accepted = []
+    for d in detections:
+        if d["confidence"] < _MIN_CONF:
+            continue
+        boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+        effective_conf = d["confidence"] * boost
+        accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+    if not accepted:
+        return 0.0, 0.0, "none"
+
+    # Single detector above threshold — use it directly (cap boosted conf at 1).
+    if len(accepted) == 1:
+        deg, conf, method = accepted[0]
+        return deg, min(conf, 1.0), method
+
+    # First pass: weighted mean
+    total_w = sum(c for _, c, _ in accepted)
+    w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+    # Outlier filter: keep results within 1° of weighted mean
+    filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+    if not filtered:
+        filtered = accepted  # fallback: keep all
+
+    # Second pass: weighted mean on filtered results
+    total_w2 = sum(c for _, c, _ in filtered)
+    final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+    # Ensemble confidence: average of individual confidences, boosted when
+    # methods agree (all within 0.5° of each other)
+    avg_conf = total_w2 / len(filtered)
+    spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+    agreement_bonus = 0.15 if spread < 0.5 else 0.0
+    ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+    methods_str = "+".join(m for _, _, m in filtered)
+    return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
+def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
+
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    After deskew aligns horizontal text lines, vertical features (column
    edges) may still be tilted. This detects the tilt angle using an ensemble
    of four complementary methods and applies an affine shear correction.

    Methods (all run in ~150ms total):
        A. _detect_shear_angle()         — vertical edge profile (~50ms)
        B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
        C. _detect_shear_by_hough()      — Hough lines on table borders (~20ms)
        D. _detect_shear_by_text_lines() — text-line straightness (~50ms)

    Quality gate: after correction, horizontal projection variance is compared
    before vs after. If correction worsened alignment, it is discarded.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).
        dewarp_info keys: method, shear_degrees, confidence, detections.
    """
    # Shared "leave image untouched" result; its 'detections' slot is
    # filled in on every early-exit path below.
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    t0 = time.time()

    if use_ensemble:
        det_a = _detect_shear_angle(img)
        det_b = _detect_shear_by_projection(img)
        det_c = _detect_shear_by_hough(img)
        det_d = _detect_shear_by_text_lines(img)
        detections = [det_a, det_b, det_c, det_d]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        # Single-method mode: method A only, no fusion.
        det_a = _detect_shear_angle(img)
        detections = [det_a]
        shear_deg = det_a["shear_degrees"]
        confidence = det_a["confidence"]
        method = det_a["method"]

    duration = time.time() - t0

    # The len(detections) guards keep this log line valid in
    # single-method mode, where only detection A exists.
    logger.info(
        "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        detections[0]["shear_degrees"], detections[0]["confidence"],
        detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
        detections[1]["confidence"] if len(detections) > 1 else 0.0,
        detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
        detections[2]["confidence"] if len(detections) > 2 else 0.0,
        detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
        detections[3]["confidence"] if len(detections) > 3 else 0.0,
    )

    # Always include individual detections (even when no correction applied)
    _all_detections = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
    # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
    # catch moderate-confidence detections from multiple agreeing methods).
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    # Apply correction (negate the detected shear to straighten)
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate: verify the correction actually improved alignment.
    # For small corrections (< 0.5°), the projection variance change can be
    # negligible, so we skip the quality gate — the cost of a tiny wrong
    # correction is much less than the cost of leaving 0.4° uncorrected
    # (which shifts content ~25px at image edges on tall scans).
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
        return img, no_correction

    info = {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": _all_detections,
    }

    return corrected, info
+ shear_degrees: Shear angle in degrees to correct. + + Returns: + Corrected image. + """ + if abs(shear_degrees) < 0.001: + return img + return _apply_shear(img, -shear_degrees) + diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py new file mode 100644 index 0000000..b3e0bc6 --- /dev/null +++ b/klausur-service/backend/cv_review.py @@ -0,0 +1,1184 @@ +""" +Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration. + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import json +import logging +import os +import re +import time +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np + +from cv_vocab_types import ( + CV_PIPELINE_AVAILABLE, + PageRegion, + PipelineResult, + VocabRow, +) +from cv_preprocessing import ( + deskew_image, + dewarp_image, + render_image_high_res, + render_pdf_high_res, +) +from cv_layout import ( + analyze_layout, + create_layout_image, + create_ocr_image, +) +from cv_ocr_engines import ( + _fix_character_confusion, +) + +logger = logging.getLogger(__name__) + +try: + import cv2 +except ImportError: + cv2 = None # type: ignore[assignment] + +try: + import pytesseract + from PIL import Image +except ImportError: + pytesseract = None # type: ignore[assignment] + Image = None # type: ignore[assignment,misc] + + +# ============================================================================= +# Stage 6: Multi-Pass OCR +# ============================================================================= + +def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str, + psm: int, fallback_psm: Optional[int] = None, + min_confidence: float = 40.0) -> List[Dict[str, Any]]: + """Run Tesseract OCR on a specific region with given PSM. + + Args: + ocr_img: Binarized full-page image. + region: Region to crop and OCR. + lang: Tesseract language string. + psm: Page Segmentation Mode. 
+ fallback_psm: If confidence too low, retry with this PSM per line. + min_confidence: Minimum average confidence before fallback. + + Returns: + List of word dicts with text, position, confidence. + """ + # Crop region + crop = ocr_img[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + # Convert to PIL for pytesseract + pil_img = Image.fromarray(crop) + + # Run Tesseract with specified PSM + config = f'--psm {psm} --oem 3' + try: + data = pytesseract.image_to_data(pil_img, lang=lang, config=config, + output_type=pytesseract.Output.DICT) + except Exception as e: + logger.warning(f"Tesseract failed for region {region.type}: {e}") + return [] + + words = [] + for i in range(len(data['text'])): + text = data['text'][i].strip() + conf = int(data['conf'][i]) + if not text or conf < 10: + continue + words.append({ + 'text': text, + 'left': data['left'][i] + region.x, # Absolute coords + 'top': data['top'][i] + region.y, + 'width': data['width'][i], + 'height': data['height'][i], + 'conf': conf, + 'region_type': region.type, + }) + + # Check average confidence + if words and fallback_psm is not None: + avg_conf = sum(w['conf'] for w in words) / len(words) + if avg_conf < min_confidence: + logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, " + f"trying fallback PSM {fallback_psm}") + words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm) + + return words + + +def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion, + lang: str, psm: int) -> List[Dict[str, Any]]: + """OCR a region line by line (fallback for low-confidence regions). + + Splits the region into horizontal strips based on text density, + then OCRs each strip individually with the given PSM. 
+ """ + crop = ocr_img[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + # Find text lines via horizontal projection + inv = cv2.bitwise_not(crop) + h_proj = np.sum(inv, axis=1) + threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0 + + # Find line boundaries + lines = [] + in_text = False + line_start = 0 + for y in range(len(h_proj)): + if h_proj[y] > threshold and not in_text: + line_start = y + in_text = True + elif h_proj[y] <= threshold and in_text: + if y - line_start > 5: # Minimum line height + lines.append((line_start, y)) + in_text = False + if in_text and len(h_proj) - line_start > 5: + lines.append((line_start, len(h_proj))) + + all_words = [] + config = f'--psm {psm} --oem 3' + + for line_y_start, line_y_end in lines: + # Add small padding + pad = 3 + y1 = max(0, line_y_start - pad) + y2 = min(crop.shape[0], line_y_end + pad) + line_crop = crop[y1:y2, :] + + if line_crop.size == 0: + continue + + pil_img = Image.fromarray(line_crop) + try: + data = pytesseract.image_to_data(pil_img, lang=lang, config=config, + output_type=pytesseract.Output.DICT) + except Exception: + continue + + for i in range(len(data['text'])): + text = data['text'][i].strip() + conf = int(data['conf'][i]) + if not text or conf < 10: + continue + all_words.append({ + 'text': text, + 'left': data['left'][i] + region.x, + 'top': data['top'][i] + region.y + y1, + 'width': data['width'][i], + 'height': data['height'][i], + 'conf': conf, + 'region_type': region.type, + }) + + return all_words + + +def run_multi_pass_ocr(ocr_img: np.ndarray, + regions: List[PageRegion], + lang: str = "eng+deu") -> Dict[str, List[Dict]]: + """Run OCR on each detected region with optimized settings. + + Args: + ocr_img: Binarized full-page image. + regions: Detected page regions. + lang: Default language. + + Returns: + Dict mapping region type to list of word dicts. 
+ """ + results: Dict[str, List[Dict]] = {} + + _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} + for region in regions: + if region.type in _ocr_skip: + continue # Skip non-content regions + + if region.type == 'column_en': + words = ocr_region(ocr_img, region, lang='eng', psm=4) + elif region.type == 'column_de': + words = ocr_region(ocr_img, region, lang='deu', psm=4) + elif region.type == 'column_example': + words = ocr_region(ocr_img, region, lang=lang, psm=6, + fallback_psm=7, min_confidence=40.0) + else: + words = ocr_region(ocr_img, region, lang=lang, psm=6) + + results[region.type] = words + logger.info(f"OCR {region.type}: {len(words)} words") + + return results + + +# ============================================================================= +# Stage 7: Line Alignment → Vocabulary Entries +# ============================================================================= + +def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: + """Group words by Y position into lines, sorted by X within each line.""" + if not words: + return [] + + sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) + lines: List[List[Dict]] = [] + current_line: List[Dict] = [sorted_words[0]] + current_y = sorted_words[0]['top'] + + for word in sorted_words[1:]: + if abs(word['top'] - current_y) <= y_tolerance_px: + current_line.append(word) + else: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + current_line = [word] + current_y = word['top'] + + if current_line: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + + return lines + + +def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], + regions: List[PageRegion], + y_tolerance_px: int = 25) -> List[VocabRow]: + """Align OCR results from different columns into vocabulary rows. 
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line. The English column
    is the primary reference: each EN line becomes one row; the nearest
    DE/example line within *y_tolerance_px* is attached to it.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects, sorted by page Y position.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        # Mean vertical center of all words in the line.
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line: nearest line center within tolerance.
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line (same nearest-within-tolerance rule).
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Average confidence only over the columns that actually matched.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest previous vocab row. Continuation lines sit BELOW
        # their row (hence 0 < dist) and may be up to 3x tolerance away.
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
regions to Qwen-VL for correction. + + Default: disabled. Enable per parameter. + + Args: + img: Original BGR image. + vocab_rows: Current vocabulary rows. + confidence_threshold: Rows below this get LLM correction. + enabled: Whether to actually run LLM correction. + + Returns: + Corrected vocabulary rows. + """ + if not enabled: + return vocab_rows + + # TODO: Implement Qwen-VL correction for low-confidence entries + # For each row with confidence < threshold: + # 1. Crop the relevant region from img + # 2. Send crop + OCR text to Qwen-VL + # 3. Replace text if LLM provides a confident correction + logger.info(f"LLM post-correction skipped (not yet implemented)") + return vocab_rows + + +# ============================================================================= +# Orchestrator +# ============================================================================= + +async def run_cv_pipeline( + pdf_data: Optional[bytes] = None, + image_data: Optional[bytes] = None, + page_number: int = 0, + zoom: float = 3.0, + enable_dewarp: bool = True, + enable_llm_correction: bool = False, + lang: str = "eng+deu", +) -> PipelineResult: + """Run the complete CV document reconstruction pipeline. + + Args: + pdf_data: Raw PDF bytes (mutually exclusive with image_data). + image_data: Raw image bytes (mutually exclusive with pdf_data). + page_number: 0-indexed page number (for PDF). + zoom: PDF rendering zoom factor. + enable_dewarp: Whether to run dewarp stage. + enable_llm_correction: Whether to run LLM post-correction. + lang: Tesseract language string. + + Returns: + PipelineResult with vocabulary and timing info. 
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render → deskew → dewarp (optional) → dual image prep →
    layout analysis → multi-pass OCR → line alignment → optional LLM
    correction. Per-stage durations are recorded in result.stages.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info. Errors are
        captured into result.error rather than raised.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp (shear correction; skipped when disabled)
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation (binarized OCR image + layout image)
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Errors are reported via result.error; the partial timing info is kept.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result


# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------

import httpx
import os
import json as _json
import re as _re

# Ollama endpoint and review model are environment-configurable.
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')
+_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])') + + +def _entry_needs_review(entry: Dict) -> bool: + """Check if an entry should be sent to the LLM for review. + + Sends all non-empty entries that don't have IPA phonetic transcriptions. + The LLM prompt and _is_spurious_change() guard against unwanted changes. + """ + en = entry.get("english", "") or "" + de = entry.get("german", "") or "" + + # Skip completely empty entries + if not en.strip() and not de.strip(): + return False + # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them + if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): + return False + return True + + +def _build_llm_prompt(table_lines: List[Dict]) -> str: + """Build the LLM correction prompt for a batch of entries.""" + return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). + +DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. + +NUR diese Korrekturen sind erlaubt: +- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball" +- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old" +- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin" +- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See" +- Ziffer 6 statt G oder g: "6eld" → "Geld" +- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help" + +ABSOLUT VERBOTEN — aendere NIEMALS: +- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst +- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN +- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst +- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest +- Eigennamen: Ben, London, China, Africa, Shakespeare usw. +- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. 
+- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren +- Beispielsaetze in der ex-Spalte — NIEMALS aendern + +Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. + +Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. +Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). + +/no_think + +Eingabe: +{_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" + + +def _is_spurious_change(old_val: str, new_val: str) -> bool: + """Detect LLM changes that are likely wrong and should be discarded. + + Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are + legitimate OCR corrections. Everything else is rejected. + + Filters out: + - Case-only changes + - Changes that don't contain any digit→letter fix + - Completely different words (LLM translating or hallucinating) + - Additions or removals of whole words (count changed) + """ + if not old_val or not new_val: + return False + + # Case-only change — never a real OCR error + if old_val.lower() == new_val.lower(): + return True + + # If the word count changed significantly, the LLM rewrote rather than fixed + old_words = old_val.split() + new_words = new_val.split() + if abs(len(old_words) - len(new_words)) > 1: + return True + + # Core rule: a legitimate correction replaces a digit with the corresponding + # letter. If the change doesn't include such a substitution, reject it. + # Build a set of (old_char, new_char) pairs that differ between old and new. + # Use character-level diff heuristic: if lengths are close, zip and compare. 
+ # Map of characters that OCR commonly misreads → set of correct replacements + _OCR_CHAR_MAP = { + # Digits mistaken for letters + '0': set('oOgG'), + '1': set('lLiI'), + '5': set('sS'), + '6': set('gG'), + '8': set('bB'), + # Non-letter symbols mistaken for letters + '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1 + 'l': set('iI|1'), # lowercase l → capital I (and reverse) + } + has_valid_fix = False + if len(old_val) == len(new_val): + for oc, nc in zip(old_val, new_val): + if oc != nc: + if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: + has_valid_fix = True + elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: + # Reverse check (e.g. l→I where new is the "correct" char) + has_valid_fix = True + else: + # Length changed by 1: accept if old had a suspicious char sequence + _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]') + if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): + has_valid_fix = True + + if not has_valid_fix: + return True # Reject — looks like translation or hallucination + + return False + + +def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: + """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" + changes = [] + entries_out = [] + for i, orig in enumerate(originals): + if i < len(corrected): + c = corrected[i] + entry = dict(orig) + for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: + new_val = c.get(key, "").strip() + old_val = (orig.get(field_name, "") or "").strip() + if new_val and new_val != old_val: + # Filter spurious LLM changes + if _is_spurious_change(old_val, new_val): + continue + changes.append({ + "row_index": orig.get("row_index", i), + "field": field_name, + "old": old_val, + "new": new_val, + }) + entry[field_name] = new_val + entry["llm_corrected"] = True + entries_out.append(entry) + else: + entries_out.append(dict(orig)) + return changes, entries_out + + +# ─── 
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# ─── Page-Ref Normalization ───────────────────────────────────────────────────
# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)


def _normalize_page_ref(text: str) -> str:
    """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
    if not text:
        return text
    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)


# Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')


def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    w = word.lower()
    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))


def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 3 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution (existing logic)
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all lowercase letters
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                candidate = _SPELL_SUBS[first][0] + rest
                if not candidate[0].isdigit():
                    return candidate

    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
    #    Try single-char umlaut substitutions and check against dictionary.
    if len(token) >= 3 and token.isalpha() and field == "german":
        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                        'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes)
    #    e.g. "beautful" → "beautiful"
    if not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction
    return None


def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
    dictionary selection.
    """
    if not text:
        return text, False
    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # If no suspicious chars AND no alpha chars that could be misspelled, skip
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\S)\|(?=[.,])', '1', text)
    # NOTE(review): the remainder of this function was lost to angle-bracket
    # stripping in the patch text. Reconstructed from _SPELL_TOKEN_RE and the
    # _spell_fix_token contract — confirm against the original source file.
    changed = fixed != text
    parts: List[str] = []
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        token, sep = m.group(1), m.group(2)
        repl = _spell_fix_token(token, field=field)
        if repl is not None and repl != token:
            changed = True
            parts.append(repl)
        else:
            parts.append(token)
        parts.append(sep)
    return ''.join(parts), changed


# NOTE(review): this def line was also eaten by the tag-stripping damage; the
# signature below is inferred from the call sites in llm_review_entries /
# spell_review_entries_streaming and the returned dict shape.
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
    for i, entry in enumerate(entries):
        e = dict(entry)
        # Page-ref normalization (always, regardless of review status)
        old_ref = (e.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": "source_page",
                    "old": old_ref,
                    "new": new_ref,
                })
                e["source_page"] = new_ref
                e["llm_corrected"] = True
        if not _entry_needs_review(e):
            all_corrected.append(e)
            continue
        for field_name in ("english", "german", "example"):
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
            # example field is mixed-language — try German first (for umlauts)
            lang = "german" if field_name in ("german", "example") else "english"
            new_val, was_changed = _spell_fix_field(old_val, field=lang)
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                e[field_name] = new_val
                e["llm_corrected"] = True
        all_corrected.append(e)
    duration_ms = int((time.time() - t0) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }


async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review."""
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    changes = result["changes"]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
        "changes": changes,
        "duration_ms": result["duration_ms"],
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": changes,
        "model_used": "spell-checker",
        "duration_ms": result["duration_ms"],
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(changes),
        "entries_corrected": result["entries_corrected"],
    }

# ─── End Spell-Checker ────────────────────────────────────────────────────────


async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """OCR error correction.

    Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).
    """
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...",
                len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }


async def llm_review_entries_streaming(
    entries: List[Dict],
    model: Optional[str] = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }


def _sanitize_for_json(text: str) -> str:
    """Remove or escape control characters that break JSON parsing.

    Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
    JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
    that are only valid inside JSON strings when properly escaped.
    """
    # Replace literal control chars (except \t \n \r) with a space
    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)


def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
    # Strip qwen3 <think> ... </think> blocks (present even with think=False on some builds)
    # NOTE(review): this regex was garbled by tag-stripping in the patch text;
    # reconstructed from the surrounding comment — confirm against the original.
    text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
    # Strip markdown code fences
    text = _re.sub(r'```json\s*', '', text)
    text = _re.sub(r'```\s*', '', text)
    # Sanitize control characters before JSON parsing
    text = _sanitize_for_json(text)
    # Find first [ ... last ]
    match = _re.search(r'\[.*\]', text, _re.DOTALL)
    if match:
        try:
            return _json.loads(match.group())
        except (ValueError, _json.JSONDecodeError) as e:
            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
    else:
        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
    return []
-Uses classical Computer Vision techniques for high-quality OCR: -- High-resolution PDF rendering (432 DPI) -- Deskew (rotation correction via Hough Lines) -- Dewarp (book curvature correction) — pass-through initially -- Dual image preparation (binarized for OCR, CLAHE for layout) -- Projection-profile layout analysis (column/row detection) -- Multi-pass Tesseract OCR with region-specific PSM settings -- Y-coordinate line alignment for vocabulary matching -- Optional LLM post-correction for low-confidence regions +Re-export facade — all logic lives in the sub-modules: + + cv_vocab_types Dataklassen, Konstanten, IPA, Feature-Flags + cv_preprocessing Bild-I/O, Orientierung, Deskew, Dewarp + cv_layout Dokumenttyp, Spalten, Zeilen, Klassifikation + cv_ocr_engines OCR-Engines, Vocab-Postprocessing, Text-Cleaning + cv_cell_grid Cell-Grid (v2 + Legacy), Vocab-Konvertierung + cv_review LLM/Spell Review, Pipeline-Orchestrierung Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. 
""" -import io -import logging -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field -from typing import Any, Dict, Generator, List, Optional, Tuple - -import numpy as np - -logger = logging.getLogger(__name__) - -# --- Availability Guards --- - -try: - import cv2 - CV2_AVAILABLE = True -except ImportError: - cv2 = None - CV2_AVAILABLE = False - logger.warning("OpenCV not available — CV pipeline disabled") - -try: - import pytesseract - from PIL import Image - TESSERACT_AVAILABLE = True -except ImportError: - pytesseract = None - Image = None - TESSERACT_AVAILABLE = False - logger.warning("pytesseract/Pillow not available — CV pipeline disabled") - -CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE - -# --- IPA Dictionary --- - -import json -import os -import re - -IPA_AVAILABLE = False -_ipa_convert_american = None -_britfone_dict: Dict[str, str] = {} - -try: - import eng_to_ipa as _eng_to_ipa - _ipa_convert_american = _eng_to_ipa.convert - IPA_AVAILABLE = True - logger.info("eng_to_ipa available — American IPA lookup enabled") -except ImportError: - logger.info("eng_to_ipa not installed — American IPA disabled") - -# Load Britfone dictionary (MIT license, ~15k British English IPA entries) -_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json') -if os.path.exists(_britfone_path): - try: - with open(_britfone_path, 'r', encoding='utf-8') as f: - _britfone_dict = json.load(f) - IPA_AVAILABLE = True - logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries") - except Exception as e: - logger.warning(f"Failed to load Britfone: {e}") -else: - logger.info("Britfone not found — British IPA disabled") - -# --- Language Detection Constants --- - -GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht', - 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird', - 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 
'noch', 'aber', 'hat', 'nur', - 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben', - 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'} - -ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of', - 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from', - 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', - 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he', - 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'} - - -# --- Data Classes --- - -@dataclass -class PageRegion: - """A detected region on the page.""" - type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom' - x: int - y: int - width: int - height: int - classification_confidence: float = 1.0 # 0.0-1.0 - classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback' - - -@dataclass -class ColumnGeometry: - """Geometrisch erkannte Spalte vor Typ-Klassifikation.""" - index: int # 0-basiert, links->rechts - x: int - y: int - width: int - height: int - word_count: int - words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) 
- width_ratio: float # width / content_width (0.0-1.0) - is_sub_column: bool = False # True if created by _detect_sub_columns() split - - -@dataclass -class RowGeometry: - """Geometrisch erkannte Zeile mit Kopf-/Fusszeilen-Klassifikation.""" - index: int # 0-basiert, oben→unten - x: int # absolute left (= content left_x) - y: int # absolute y start - width: int # content width - height: int # Zeilenhoehe in px - word_count: int - words: List[Dict] - row_type: str = 'content' # 'content' | 'header' | 'footer' - gap_before: int = 0 # Gap in px ueber dieser Zeile - - -@dataclass -class VocabRow: - """A single vocabulary entry assembled from multi-column OCR.""" - english: str = "" - german: str = "" - example: str = "" - source_page: str = "" - confidence: float = 0.0 - y_position: int = 0 - - -@dataclass -class PipelineResult: - """Complete result of the CV pipeline.""" - vocabulary: List[Dict[str, Any]] = field(default_factory=list) - word_count: int = 0 - columns_detected: int = 0 - duration_seconds: float = 0.0 - stages: Dict[str, float] = field(default_factory=dict) - error: Optional[str] = None - image_width: int = 0 - image_height: int = 0 - - -@dataclass -class DocumentTypeResult: - """Result of automatic document type detection.""" - doc_type: str # 'vocab_table' | 'full_text' | 'generic_table' - confidence: float # 0.0-1.0 - pipeline: str # 'cell_first' | 'full_page' - skip_steps: List[str] = field(default_factory=list) # e.g. ['columns', 'rows'] - features: Dict[str, Any] = field(default_factory=dict) # debug info - - -# ============================================================================= -# Stage 1: High-Resolution PDF Rendering -# ============================================================================= - -def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray: - """Render a PDF page to a high-resolution numpy array (BGR). - - Args: - pdf_data: Raw PDF bytes. - page_number: 0-indexed page number. 
- zoom: Zoom factor (3.0 = 432 DPI). - - Returns: - numpy array in BGR format. - """ - import fitz # PyMuPDF - - pdf_doc = fitz.open(stream=pdf_data, filetype="pdf") - if page_number >= pdf_doc.page_count: - raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)") - - page = pdf_doc[page_number] - mat = fitz.Matrix(zoom, zoom) - pix = page.get_pixmap(matrix=mat) - - # Convert to numpy BGR - img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n) - if pix.n == 4: # RGBA - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR) - elif pix.n == 3: # RGB - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR) - else: # Grayscale - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR) - - pdf_doc.close() - return img_bgr - - -def render_image_high_res(image_data: bytes) -> np.ndarray: - """Load an image (PNG/JPEG) into a numpy array (BGR). - - Args: - image_data: Raw image bytes. - - Returns: - numpy array in BGR format. - """ - img_array = np.frombuffer(image_data, dtype=np.uint8) - img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - if img_bgr is None: - raise ValueError("Could not decode image data") - return img_bgr - - -# ============================================================================= -# Stage 1b: Orientation Detection (0°/90°/180°/270°) -# ============================================================================= - -def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]: - """Detect page orientation via Tesseract OSD and rotate if needed. - - Handles upside-down scans (180°) common with book scanners where - every other page is flipped due to the scanner hinge. - - Returns: - (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270. 
- """ - if pytesseract is None: - return img_bgr, 0 - - try: - # Tesseract OSD needs a grayscale or RGB image - gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - pil_img = Image.fromarray(gray) - - osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT) - rotate = osd.get("rotate", 0) - confidence = osd.get("orientation_conf", 0.0) - - logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}") - - if rotate == 0 or confidence < 1.0: - return img_bgr, 0 - - # Apply rotation - if rotate == 180: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_180) - elif rotate == 90: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE) - elif rotate == 270: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE) - else: - return img_bgr, 0 - - logger.info(f"OSD: rotated {rotate}° to fix orientation") - return corrected, rotate - - except Exception as e: - logger.warning(f"OSD orientation detection failed: {e}") - return img_bgr, 0 - - -# ============================================================================= -# Stage 2: Deskew (Rotation Correction) -# ============================================================================= - -def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]: - """Correct rotation using Hough Line detection. - - Args: - img: BGR image. - - Returns: - Tuple of (corrected image, detected angle in degrees). 
- """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # Binarize for line detection - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - - # Detect lines - lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100, - minLineLength=img.shape[1] // 4, maxLineGap=20) - - if lines is None or len(lines) < 3: - return img, 0.0 - - # Compute angles of near-horizontal lines - angles = [] - for line in lines: - x1, y1, x2, y2 = line[0] - angle = np.degrees(np.arctan2(y2 - y1, x2 - x1)) - if abs(angle) < 15: # Only near-horizontal - angles.append(angle) - - if not angles: - return img, 0.0 - - median_angle = float(np.median(angles)) - - # Limit correction to ±5° - if abs(median_angle) > 5.0: - median_angle = 5.0 * np.sign(median_angle) - - if abs(median_angle) < 0.1: - return img, 0.0 - - # Rotate - h, w = img.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, median_angle, 1.0) - corrected = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - logger.info(f"Deskew: corrected {median_angle:.2f}° rotation") - return corrected, median_angle - - -def deskew_image_by_word_alignment( - image_data: bytes, - lang: str = "eng+deu", - downscale_factor: float = 0.5, -) -> Tuple[bytes, float]: - """Correct rotation by fitting a line through left-most word starts per text line. - - More robust than Hough-based deskew for vocabulary worksheets where text lines - have consistent left-alignment. Runs a quick Tesseract pass on a downscaled - copy to find word positions, computes the dominant left-edge column, fits a - line through those points and rotates the full-resolution image. - - Args: - image_data: Raw image bytes (PNG/JPEG). - lang: Tesseract language string for the quick pass. - downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%). - - Returns: - Tuple of (rotated image as PNG bytes, detected angle in degrees). 
- """ - if not CV2_AVAILABLE or not TESSERACT_AVAILABLE: - return image_data, 0.0 - - # 1. Decode image - img_array = np.frombuffer(image_data, dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - if img is None: - logger.warning("deskew_by_word_alignment: could not decode image") - return image_data, 0.0 - - orig_h, orig_w = img.shape[:2] - - # 2. Downscale for fast Tesseract pass - small_w = int(orig_w * downscale_factor) - small_h = int(orig_h * downscale_factor) - small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA) - - # 3. Quick Tesseract — word-level positions - pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB)) - try: - data = pytesseract.image_to_data( - pil_small, lang=lang, config="--psm 6 --oem 3", - output_type=pytesseract.Output.DICT, - ) - except Exception as e: - logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}") - return image_data, 0.0 - - # 4. Per text-line, find the left-most word start - # Group by (block_num, par_num, line_num) - from collections import defaultdict - line_groups: Dict[tuple, list] = defaultdict(list) - for i in range(len(data["text"])): - text = (data["text"][i] or "").strip() - conf = int(data["conf"][i]) - if not text or conf < 20: - continue - key = (data["block_num"][i], data["par_num"][i], data["line_num"][i]) - line_groups[key].append(i) - - if len(line_groups) < 5: - logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping") - return image_data, 0.0 - - # For each line, pick the word with smallest 'left' → compute (left_x, center_y) - # Scale back to original resolution - scale = 1.0 / downscale_factor - points = [] # list of (x, y) in original-image coords - for key, indices in line_groups.items(): - best_idx = min(indices, key=lambda i: data["left"][i]) - lx = data["left"][best_idx] * scale - top = data["top"][best_idx] * scale - h = data["height"][best_idx] * scale - cy = top + h / 2.0 - points.append((lx, cy)) - - # 5. 
Find dominant left-edge column + compute angle - xs = np.array([p[0] for p in points]) - ys = np.array([p[1] for p in points]) - median_x = float(np.median(xs)) - tolerance = orig_w * 0.03 # 3% of image width - - mask = np.abs(xs - median_x) <= tolerance - filtered_xs = xs[mask] - filtered_ys = ys[mask] - - if len(filtered_xs) < 5: - logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping") - return image_data, 0.0 - - # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a) - coeffs = np.polyfit(filtered_ys, filtered_xs, 1) - slope = coeffs[0] # dx/dy - angle_rad = np.arctan(slope) - angle_deg = float(np.degrees(angle_rad)) - - # Clamp to ±5° - angle_deg = max(-5.0, min(5.0, angle_deg)) - - logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points " - f"(total lines: {len(line_groups)})") - - if abs(angle_deg) < 0.05: - return image_data, 0.0 - - # 6. Rotate full-res image - center = (orig_w // 2, orig_h // 2) - M = cv2.getRotationMatrix2D(center, angle_deg, 1.0) - rotated = cv2.warpAffine(img, M, (orig_w, orig_h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - # Encode back to PNG - success, png_buf = cv2.imencode(".png", rotated) - if not success: - logger.warning("deskew_by_word_alignment: PNG encoding failed") - return image_data, 0.0 - - return png_buf.tobytes(), angle_deg - - -def _projection_gradient_score(profile: np.ndarray) -> float: - """Score a projection profile by the L2-norm of its first derivative. - - Higher score = sharper transitions between text-lines and gaps, - i.e. better row/column alignment. - """ - diff = np.diff(profile) - return float(np.sum(diff * diff)) - - -def deskew_image_iterative( - img: np.ndarray, - coarse_range: float = 5.0, - coarse_step: float = 0.1, - fine_range: float = 0.15, - fine_step: float = 0.02, -) -> Tuple[np.ndarray, float, Dict[str, Any]]: - """Iterative deskew using vertical-edge projection optimisation. 
- - The key insight: at the correct rotation angle, vertical features - (word left-edges, column borders) become truly vertical, producing - the sharpest peaks in the vertical projection of vertical edges. - - Method: - 1. Detect vertical edges via Sobel-X on the central crop. - 2. Coarse sweep: rotate edge image, compute vertical projection - gradient score. The angle where vertical edges align best wins. - 3. Fine sweep: refine around the coarse winner. - - Args: - img: BGR image (full resolution). - coarse_range: half-range in degrees for the coarse sweep. - coarse_step: step size in degrees for the coarse sweep. - fine_range: half-range around the coarse winner for the fine sweep. - fine_step: step size in degrees for the fine sweep. - - Returns: - (rotated_bgr, angle_degrees, debug_dict) - """ - h, w = img.shape[:2] - debug: Dict[str, Any] = {} - - # --- Grayscale + vertical edge detection --- - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Central crop (15%-85% height, 10%-90% width) to avoid page margins - y_lo, y_hi = int(h * 0.15), int(h * 0.85) - x_lo, x_hi = int(w * 0.10), int(w * 0.90) - gray_crop = gray[y_lo:y_hi, x_lo:x_hi] - - # Sobel-X → absolute vertical edges - sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3) - edges = np.abs(sobel_x) - # Normalise to 0-255 for consistent scoring - edge_max = edges.max() - if edge_max > 0: - edges = (edges / edge_max * 255).astype(np.uint8) - else: - return img, 0.0, {"error": "no edges detected"} - - crop_h, crop_w = edges.shape[:2] - crop_center = (crop_w // 2, crop_h // 2) - - # Trim margin after rotation to avoid border artifacts - trim_y = max(4, int(crop_h * 0.03)) - trim_x = max(4, int(crop_w * 0.03)) - - def _sweep_edges(angles: np.ndarray) -> list: - """Score each angle by vertical projection gradient of vertical edges.""" - results = [] - for angle in angles: - if abs(angle) < 1e-6: - rotated = edges - else: - M = cv2.getRotationMatrix2D(crop_center, angle, 1.0) - rotated = 
cv2.warpAffine(edges, M, (crop_w, crop_h), - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_REPLICATE) - # Trim borders to avoid edge artifacts - trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x] - v_profile = np.sum(trimmed, axis=0, dtype=np.float64) - score = _projection_gradient_score(v_profile) - results.append((float(angle), score)) - return results - - # --- Phase 1: coarse sweep --- - coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step) - coarse_results = _sweep_edges(coarse_angles) - best_coarse = max(coarse_results, key=lambda x: x[1]) - best_coarse_angle, best_coarse_score = best_coarse - - debug["coarse_best_angle"] = round(best_coarse_angle, 2) - debug["coarse_best_score"] = round(best_coarse_score, 1) - debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results] - - # --- Phase 2: fine sweep around coarse winner --- - fine_lo = best_coarse_angle - fine_range - fine_hi = best_coarse_angle + fine_range - fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step) - fine_results = _sweep_edges(fine_angles) - best_fine = max(fine_results, key=lambda x: x[1]) - best_fine_angle, best_fine_score = best_fine - - debug["fine_best_angle"] = round(best_fine_angle, 2) - debug["fine_best_score"] = round(best_fine_score, 1) - debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results] - - final_angle = best_fine_angle - - # Clamp to ±5° - final_angle = max(-5.0, min(5.0, final_angle)) - - logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°") - - if abs(final_angle) < 0.05: - return img, 0.0, debug - - # --- Rotate full-res image --- - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, final_angle, 1.0) - rotated = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - return rotated, final_angle, debug - - -def _measure_textline_slope(img: np.ndarray) -> float: - 
"""Measure residual text-line slope via Tesseract word-position regression. - - Groups Tesseract words by (block, par, line), fits a linear regression - per line (y = slope * x + b), and returns the trimmed-mean slope in - degrees. Positive = text rises to the right, negative = falls. - - This is the most direct measurement of remaining rotation after deskew. - """ - import math as _math - - if not TESSERACT_AVAILABLE or not CV2_AVAILABLE: - return 0.0 - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - data = pytesseract.image_to_data( - Image.fromarray(gray), - output_type=pytesseract.Output.DICT, - config="--psm 6", - ) - - # Group word centres by text line - lines: Dict[tuple, list] = {} - for i in range(len(data["text"])): - txt = (data["text"][i] or "").strip() - if len(txt) < 2 or int(data["conf"][i]) < 30: - continue - key = (data["block_num"][i], data["par_num"][i], data["line_num"][i]) - cx = data["left"][i] + data["width"][i] / 2.0 - cy = data["top"][i] + data["height"][i] / 2.0 - lines.setdefault(key, []).append((cx, cy)) - - # Per-line linear regression → slope angle - slopes: list = [] - for pts in lines.values(): - if len(pts) < 3: - continue - pts.sort(key=lambda p: p[0]) - xs = np.array([p[0] for p in pts], dtype=np.float64) - ys = np.array([p[1] for p in pts], dtype=np.float64) - if xs[-1] - xs[0] < w * 0.15: - continue # skip short lines - A = np.vstack([xs, np.ones_like(xs)]).T - result = np.linalg.lstsq(A, ys, rcond=None) - slope = result[0][0] - slopes.append(_math.degrees(_math.atan(slope))) - - if len(slopes) < 3: - return 0.0 - - # Trimmed mean (drop 10% extremes on each side) - slopes.sort() - trim = max(1, len(slopes) // 10) - trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes - if not trimmed: - return 0.0 - - return sum(trimmed) / len(trimmed) - - -def deskew_two_pass( - img: np.ndarray, - coarse_range: float = 5.0, -) -> Tuple[np.ndarray, float, Dict[str, Any]]: - """Two-pass deskew: iterative 
projection + word-alignment residual check. - - Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range). - Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image - to detect and fix residual skew that the projection method missed. - - The two corrections are summed. If the residual from Pass 2 is below - 0.3° it is ignored (already good enough). - - Returns: - (corrected_bgr, total_angle_degrees, debug_dict) - """ - debug: Dict[str, Any] = {} - - # --- Pass 1: iterative projection --- - corrected, angle1, dbg1 = deskew_image_iterative( - img.copy(), coarse_range=coarse_range, - ) - debug["pass1_angle"] = round(angle1, 3) - debug["pass1_method"] = "iterative" - debug["pass1_debug"] = dbg1 - - # --- Pass 2: word-alignment residual check on corrected image --- - angle2 = 0.0 - try: - # Encode the corrected image to PNG bytes for word-alignment - ok, buf = cv2.imencode(".png", corrected) - if ok: - corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes()) - if abs(angle2) >= 0.3: - # Significant residual — decode and use the second correction - arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8) - corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR) - if corrected2 is not None: - corrected = corrected2 - logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied " - f"(total={angle1 + angle2:.2f}°)") - else: - angle2 = 0.0 - else: - logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped") - angle2 = 0.0 - except Exception as e: - logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}") - angle2 = 0.0 - - # --- Pass 3: Tesseract text-line regression residual check --- - # The most reliable final check: measure actual text-line slopes - # using Tesseract word positions and linear regression per line. 
- angle3 = 0.0 - try: - residual = _measure_textline_slope(corrected) - debug["pass3_raw"] = round(residual, 3) - if abs(residual) >= 0.3: - h3, w3 = corrected.shape[:2] - center3 = (w3 // 2, h3 // 2) - M3 = cv2.getRotationMatrix2D(center3, residual, 1.0) - corrected = cv2.warpAffine( - corrected, M3, (w3, h3), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE, - ) - angle3 = residual - logger.info( - "deskew_two_pass: pass3 text-line residual=%.2f° applied", - residual, - ) - else: - logger.info( - "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped", - residual, - ) - except Exception as e: - logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e) - - total_angle = angle1 + angle2 + angle3 - debug["pass2_angle"] = round(angle2, 3) - debug["pass2_method"] = "word_alignment" - debug["pass3_angle"] = round(angle3, 3) - debug["pass3_method"] = "textline_regression" - debug["total_angle"] = round(total_angle, 3) - - logger.info( - "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°", - angle1, angle2, angle3, total_angle, - ) - - return corrected, total_angle, debug - - -# ============================================================================= -# Stage 3: Dewarp (Book Curvature Correction) -# ============================================================================= - -def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]: - """Detect the vertical shear angle of the page. - - After deskew (horizontal lines aligned), vertical features like column - edges may still be tilted. This measures that tilt by tracking the - strongest vertical edge across horizontal strips. - - The result is a shear angle in degrees: the angular difference between - true vertical and the detected column edge. - - Returns: - Dict with keys: method, shear_degrees, confidence. 
- """ - h, w = img.shape[:2] - result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0} - - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Vertical Sobel to find vertical edges - sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3) - abs_sobel = np.abs(sobel_x).astype(np.uint8) - - # Binarize with Otsu - _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - num_strips = 20 - strip_h = h // num_strips - edge_positions = [] # (y_center, x_position) - - for i in range(num_strips): - y_start = i * strip_h - y_end = min((i + 1) * strip_h, h) - strip = binary[y_start:y_end, :] - - # Project vertically (sum along y-axis) - projection = np.sum(strip, axis=0).astype(np.float64) - if projection.max() == 0: - continue - - # Find the strongest vertical edge in left 40% of image - search_w = int(w * 0.4) - left_proj = projection[:search_w] - if left_proj.max() == 0: - continue - - # Smooth and find peak - kernel_size = max(3, w // 100) - if kernel_size % 2 == 0: - kernel_size += 1 - smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten() - x_pos = float(np.argmax(smoothed)) - y_center = (y_start + y_end) / 2.0 - edge_positions.append((y_center, x_pos)) - - if len(edge_positions) < 8: - return result - - ys = np.array([p[0] for p in edge_positions]) - xs = np.array([p[1] for p in edge_positions]) - - # Remove outliers (> 2 std from median) - median_x = np.median(xs) - std_x = max(np.std(xs), 1.0) - mask = np.abs(xs - median_x) < 2 * std_x - ys = ys[mask] - xs = xs[mask] - - if len(ys) < 6: - return result - - # Fit straight line: x = slope * y + intercept - # The slope tells us the tilt of the vertical edge - straight_coeffs = np.polyfit(ys, xs, 1) - slope = straight_coeffs[0] # dx/dy in pixels - fitted = np.polyval(straight_coeffs, ys) - residuals = xs - fitted - rmse = float(np.sqrt(np.mean(residuals ** 2))) - - # Convert slope to angle: arctan(dx/dy) in degrees - import math - shear_degrees 
= math.degrees(math.atan(slope)) - - confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0) - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(float(confidence), 2) - - return result - - -def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]: - """Detect shear angle by maximising variance of horizontal text-line projections. - - Principle: horizontal text lines produce a row-projection profile with sharp - peaks (high variance) when the image is correctly aligned. Any residual shear - smears the peaks and reduces variance. We sweep ±3° and pick the angle whose - corrected projection has the highest variance. - - Works best on pages with clear horizontal banding (vocabulary tables, prose). - Complements _detect_shear_angle() which needs strong vertical edges. - - Returns: - Dict with keys: method, shear_degrees, confidence. - """ - import math - result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Otsu binarisation - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - - # Work at half resolution for speed - small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA) - sh, sw = small.shape - - # 2-pass angle sweep for 10x better precision: - # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values) - # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values) - - def _sweep_variance(angles_list): - results = [] - for angle_deg in angles_list: - if abs(angle_deg) < 0.001: - rotated = small - else: - shear_tan = math.tan(math.radians(angle_deg)) - M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]]) - rotated = cv2.warpAffine(small, M, (sw, sh), - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_CONSTANT) - profile = np.sum(rotated, axis=1).astype(float) - results.append((angle_deg, float(np.var(profile)))) - return results - - # Pass 1: coarse - 
coarse_angles = [a * 0.5 for a in range(-6, 7)] # 13 values - coarse_results = _sweep_variance(coarse_angles) - coarse_best = max(coarse_results, key=lambda x: x[1]) - - # Pass 2: fine around coarse best - fine_center = coarse_best[0] - fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)] # 21 values - fine_results = _sweep_variance(fine_angles) - fine_best = max(fine_results, key=lambda x: x[1]) - - best_angle = fine_best[0] - best_variance = fine_best[1] - variances = coarse_results + fine_results - - # Confidence: how much sharper is the best angle vs. the mean? - all_mean = sum(v for _, v in variances) / len(variances) - if all_mean > 0 and best_variance > all_mean: - confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6) - else: - confidence = 0.0 - - result["shear_degrees"] = round(best_angle, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - return result - - -def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]: - """Detect shear using Hough transform on printed table / ruled lines. - - Vocabulary worksheets have near-horizontal printed table borders. After - deskew these should be exactly horizontal; any residual tilt equals the - vertical shear angle (with inverted sign). - - The sign convention: a horizontal line tilting +α degrees (left end lower) - means the page has vertical shear of -α degrees (left column edge drifts - to the left going downward). - - Returns: - Dict with keys: method, shear_degrees, confidence. 
- """ - result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - edges = cv2.Canny(gray, 50, 150, apertureSize=3) - - min_len = int(w * 0.15) - lines = cv2.HoughLinesP( - edges, rho=1, theta=np.pi / 360, - threshold=int(w * 0.08), - minLineLength=min_len, - maxLineGap=20, - ) - - if lines is None or len(lines) < 3: - return result - - horizontal_angles: List[Tuple[float, float]] = [] - for line in lines: - x1, y1, x2, y2 = line[0] - if x1 == x2: - continue - angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1))) - if abs(angle) <= 5.0: - length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)) - horizontal_angles.append((angle, length)) - - if len(horizontal_angles) < 3: - return result - - # Weighted median - angles_arr = np.array([a for a, _ in horizontal_angles]) - weights_arr = np.array([l for _, l in horizontal_angles]) - sorted_idx = np.argsort(angles_arr) - s_angles = angles_arr[sorted_idx] - s_weights = weights_arr[sorted_idx] - cum = np.cumsum(s_weights) - mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0)) - median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)]) - - agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0) - confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85 - - # Sign inversion: horizontal line tilt is complementary to vertical shear - shear_degrees = -median_angle - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - return result - - -def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]: - """Detect shear by measuring text-line straightness (Method D). - - Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word - bounding boxes, groups them into vertical columns by X-proximity, - and measures how the left-edge X position drifts with Y (vertical - position). The drift dx/dy is the tangent of the shear angle. 
- - This directly measures vertical shear (column tilt) rather than - horizontal text-line slope, which is already corrected by deskew. - - Returns: - Dict with keys: method, shear_degrees, confidence. - """ - import math - result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - # Downscale 50% for speed - scale = 0.5 - small = cv2.resize(img, (int(w * scale), int(h * scale)), - interpolation=cv2.INTER_AREA) - gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY) - pil_img = Image.fromarray(gray) - - try: - data = pytesseract.image_to_data( - pil_img, lang='eng+deu', config='--psm 11 --oem 3', - output_type=pytesseract.Output.DICT, - ) - except Exception: - return result - - # Collect word left-edges (x) and vertical centres (y) - words = [] - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 20 or len(text) < 2: - continue - left_x = float(data['left'][i]) - cy = data['top'][i] + data['height'][i] / 2.0 - word_w = float(data['width'][i]) - words.append((left_x, cy, word_w)) - - if len(words) < 15: - return result - - # --- Group words into vertical columns by left-edge X proximity --- - # Sort by x, then cluster words whose left-edges are within x_tol - avg_w = sum(ww for _, _, ww in words) / len(words) - x_tol = max(avg_w * 0.4, 8) # tolerance for "same column" - - words_by_x = sorted(words, key=lambda w: w[0]) - columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...] 
- cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])] - cur_x = words_by_x[0][0] - - for lx, cy, _ in words_by_x[1:]: - if abs(lx - cur_x) <= x_tol: - cur_col.append((lx, cy)) - # Update running x as median of cluster - cur_x = cur_x * 0.8 + lx * 0.2 - else: - if len(cur_col) >= 5: - columns.append(cur_col) - cur_col = [(lx, cy)] - cur_x = lx - if len(cur_col) >= 5: - columns.append(cur_col) - - if len(columns) < 2: - return result - - # --- For each column, measure X-drift as a function of Y --- - # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle) - drifts = [] - for col in columns: - ys = np.array([p[1] for p in col]) - xs = np.array([p[0] for p in col]) - y_range = ys.max() - ys.min() - if y_range < h * scale * 0.3: - continue # column must span at least 30% of image height - # Linear regression: x = a*y + b - coeffs = np.polyfit(ys, xs, 1) - drifts.append(coeffs[0]) # dx/dy - - if len(drifts) < 2: - return result - - # Median dx/dy → shear angle - # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right - median_drift = float(np.median(drifts)) - shear_degrees = math.degrees(math.atan(median_drift)) - - # Confidence from column count + drift consistency - drift_std = float(np.std(drifts)) - consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance - count_factor = min(1.0, len(drifts) / 4.0) - confidence = count_factor * 0.5 + consistency * 0.5 - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, " - "shear=%.3f°, conf=%.2f", - len(columns), len(drifts), median_drift, - shear_degrees, confidence) - return result - - -def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool: - """Check whether the dewarp correction actually improved alignment. - - Compares horizontal projection variance before and after correction. 
- Higher variance means sharper text-line peaks, which indicates better - horizontal alignment. - - Returns True if the correction improved the image, False if it should - be discarded. - """ - def _h_proj_variance(img: np.ndarray) -> float: - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, - cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2), - interpolation=cv2.INTER_AREA) - profile = np.sum(small, axis=1).astype(float) - return float(np.var(profile)) - - var_before = _h_proj_variance(original) - var_after = _h_proj_variance(corrected) - - # Correction must improve variance (even by a tiny margin) - return var_after > var_before - - -def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray: - """Apply a vertical shear correction to an image. - - Shifts each row horizontally proportional to its distance from the - vertical center. This corrects the tilt of vertical features (columns) - without affecting horizontal alignment (text lines). - - Args: - img: BGR image. - shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left. - - Returns: - Corrected image. - """ - import math - h, w = img.shape[:2] - shear_tan = math.tan(math.radians(shear_degrees)) - - # Affine matrix: shift x by shear_tan * (y - h/2) - # [1 shear_tan -h/2*shear_tan] - # [0 1 0 ] - M = np.float32([ - [1, shear_tan, -h / 2.0 * shear_tan], - [0, 1, 0], - ]) - - corrected = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - return corrected - - -def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]: - """Combine multiple shear detections into a single weighted estimate (v2). 
- - Ensemble v2 changes vs v1: - - Minimum confidence raised to 0.5 (was 0.3) - - text_lines method gets 1.5× weight boost (most reliable detector) - - Outlier filter at 1° from weighted mean - - Returns: - (shear_degrees, ensemble_confidence, methods_used_str) - """ - # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear - # that individual methods detect with moderate confidence. - _MIN_CONF = 0.35 - - # text_lines gets a weight boost as the most content-aware method - _METHOD_WEIGHT_BOOST = {"text_lines": 1.5} - - accepted = [] - for d in detections: - if d["confidence"] < _MIN_CONF: - continue - boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0) - effective_conf = d["confidence"] * boost - accepted.append((d["shear_degrees"], effective_conf, d["method"])) - - if not accepted: - return 0.0, 0.0, "none" - - if len(accepted) == 1: - deg, conf, method = accepted[0] - return deg, min(conf, 1.0), method - - # First pass: weighted mean - total_w = sum(c for _, c, _ in accepted) - w_mean = sum(d * c for d, c, _ in accepted) / total_w - - # Outlier filter: keep results within 1° of weighted mean - filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0] - if not filtered: - filtered = accepted # fallback: keep all - - # Second pass: weighted mean on filtered results - total_w2 = sum(c for _, c, _ in filtered) - final_deg = sum(d * c for d, c, _ in filtered) / total_w2 - - # Ensemble confidence: average of individual confidences, boosted when - # methods agree (all within 0.5° of each other) - avg_conf = total_w2 / len(filtered) - spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered) - agreement_bonus = 0.15 if spread < 0.5 else 0.0 - ensemble_conf = min(1.0, avg_conf + agreement_bonus) - - methods_str = "+".join(m for _, _, m in filtered) - return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str - - -def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]: - 
"""Correct vertical shear after deskew (v2 with quality gate). - - After deskew aligns horizontal text lines, vertical features (column - edges) may still be tilted. This detects the tilt angle using an ensemble - of four complementary methods and applies an affine shear correction. - - Methods (all run in ~150ms total): - A. _detect_shear_angle() — vertical edge profile (~50ms) - B. _detect_shear_by_projection() — horizontal text-line variance (~30ms) - C. _detect_shear_by_hough() — Hough lines on table borders (~20ms) - D. _detect_shear_by_text_lines() — text-line straightness (~50ms) - - Quality gate: after correction, horizontal projection variance is compared - before vs after. If correction worsened alignment, it is discarded. - - Args: - img: BGR image (already deskewed). - use_ensemble: If False, fall back to single-method behaviour (method A only). - - Returns: - Tuple of (corrected_image, dewarp_info). - dewarp_info keys: method, shear_degrees, confidence, detections. - """ - no_correction = { - "method": "none", - "shear_degrees": 0.0, - "confidence": 0.0, - "detections": [], - } - - if not CV2_AVAILABLE: - return img, no_correction - - t0 = time.time() - - if use_ensemble: - det_a = _detect_shear_angle(img) - det_b = _detect_shear_by_projection(img) - det_c = _detect_shear_by_hough(img) - det_d = _detect_shear_by_text_lines(img) - detections = [det_a, det_b, det_c, det_d] - shear_deg, confidence, method = _ensemble_shear(detections) - else: - det_a = _detect_shear_angle(img) - detections = [det_a] - shear_deg = det_a["shear_degrees"] - confidence = det_a["confidence"] - method = det_a["method"] - - duration = time.time() - t0 - - logger.info( - "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | " - "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f", - shear_deg, confidence, method, duration, - detections[0]["shear_degrees"], detections[0]["confidence"], - detections[1]["shear_degrees"] if len(detections) > 1 else 0.0, - 
detections[1]["confidence"] if len(detections) > 1 else 0.0, - detections[2]["shear_degrees"] if len(detections) > 2 else 0.0, - detections[2]["confidence"] if len(detections) > 2 else 0.0, - detections[3]["shear_degrees"] if len(detections) > 3 else 0.0, - detections[3]["confidence"] if len(detections) > 3 else 0.0, - ) - - # Always include individual detections (even when no correction applied) - _all_detections = [ - {"method": d["method"], "shear_degrees": d["shear_degrees"], - "confidence": d["confidence"]} - for d in detections - ] - - # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR. - # For ensemble confidence, require at least 0.4 (lowered from 0.5 to - # catch moderate-confidence detections from multiple agreeing methods). - if abs(shear_deg) < 0.08 or confidence < 0.4: - no_correction["detections"] = _all_detections - return img, no_correction - - # Apply correction (negate the detected shear to straighten) - corrected = _apply_shear(img, -shear_deg) - - # Quality gate: verify the correction actually improved alignment. - # For small corrections (< 0.5°), the projection variance change can be - # negligible, so we skip the quality gate — the cost of a tiny wrong - # correction is much less than the cost of leaving 0.4° uncorrected - # (which shifts content ~25px at image edges on tall scans). - if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected): - logger.info("dewarp: quality gate REJECTED correction (%.3f°) — " - "projection variance did not improve", shear_deg) - no_correction["detections"] = _all_detections - return img, no_correction - - info = { - "method": method, - "shear_degrees": shear_deg, - "confidence": confidence, - "detections": _all_detections, - } - - return corrected, info - - -def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray: - """Apply shear correction with a manual angle. - - Args: - img: BGR image (deskewed, before dewarp). 
- shear_degrees: Shear angle in degrees to correct. - - Returns: - Corrected image. - """ - if abs(shear_degrees) < 0.001: - return img - return _apply_shear(img, -shear_degrees) - - -# ============================================================================= -# Document Type Detection -# ============================================================================= - -def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult: - """Detect whether the page is a vocab table, generic table, or full text. - - Uses projection profiles and text density analysis — no OCR required. - Runs in < 2 seconds. - - Args: - ocr_img: Binarized grayscale image (for projection profiles). - img_bgr: BGR color image. - - Returns: - DocumentTypeResult with doc_type, confidence, pipeline, skip_steps. - """ - if ocr_img is None or ocr_img.size == 0: - return DocumentTypeResult( - doc_type='full_text', confidence=0.5, pipeline='full_page', - skip_steps=['columns', 'rows'], - features={'error': 'empty image'}, - ) - - h, w = ocr_img.shape[:2] - - # --- 1. Vertical projection profile → detect column gaps --- - # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile. - # Invert: dark pixels on white background → high values = text. 
- vert_proj = np.sum(ocr_img < 128, axis=0).astype(float) - - # Smooth the profile to avoid noise spikes - kernel_size = max(3, w // 100) - if kernel_size % 2 == 0: - kernel_size += 1 - vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same') - - # Find significant vertical gaps (columns of near-zero text density) - # A gap must be at least 1% of image width and have < 5% of max density - max_density = max(vert_smooth.max(), 1) - gap_threshold = max_density * 0.05 - min_gap_width = max(5, w // 100) - - in_gap = False - gap_count = 0 - gap_start = 0 - vert_gaps = [] - - for x in range(w): - if vert_smooth[x] < gap_threshold: - if not in_gap: - in_gap = True - gap_start = x - else: - if in_gap: - gap_width = x - gap_start - if gap_width >= min_gap_width: - gap_count += 1 - vert_gaps.append((gap_start, x, gap_width)) - in_gap = False - - # Filter out margin gaps (within 10% of image edges) - margin_threshold = w * 0.10 - internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold] - internal_gap_count = len(internal_gaps) - - # --- 2. Horizontal projection profile → detect row gaps --- - horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float) - h_kernel = max(3, h // 200) - if h_kernel % 2 == 0: - h_kernel += 1 - horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same') - - h_max = max(horiz_smooth.max(), 1) - h_gap_threshold = h_max * 0.05 - min_row_gap = max(3, h // 200) - - row_gap_count = 0 - in_gap = False - for y in range(h): - if horiz_smooth[y] < h_gap_threshold: - if not in_gap: - in_gap = True - gap_start = y - else: - if in_gap: - if y - gap_start >= min_row_gap: - row_gap_count += 1 - in_gap = False - - # --- 3. 
Text density distribution (4×4 grid) --- - grid_rows, grid_cols = 4, 4 - cell_h, cell_w = h // grid_rows, w // grid_cols - densities = [] - for gr in range(grid_rows): - for gc in range(grid_cols): - cell = ocr_img[gr * cell_h:(gr + 1) * cell_h, - gc * cell_w:(gc + 1) * cell_w] - if cell.size > 0: - d = float(np.count_nonzero(cell < 128)) / cell.size - densities.append(d) - - density_std = float(np.std(densities)) if densities else 0 - density_mean = float(np.mean(densities)) if densities else 0 - - features = { - 'vertical_gaps': gap_count, - 'internal_vertical_gaps': internal_gap_count, - 'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]], - 'row_gaps': row_gap_count, - 'density_mean': round(density_mean, 4), - 'density_std': round(density_std, 4), - 'image_size': (w, h), - } - - # --- 4. Decision tree --- - # Use internal_gap_count (excludes margin gaps) for column detection. - if internal_gap_count >= 2 and row_gap_count >= 5: - # Multiple internal vertical gaps + many row gaps → table - confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005) - return DocumentTypeResult( - doc_type='vocab_table', - confidence=round(confidence, 2), - pipeline='cell_first', - skip_steps=[], - features=features, - ) - elif internal_gap_count >= 1 and row_gap_count >= 3: - # Some internal structure, likely a table - confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01) - return DocumentTypeResult( - doc_type='generic_table', - confidence=round(confidence, 2), - pipeline='cell_first', - skip_steps=[], - features=features, - ) - elif internal_gap_count == 0: - # No internal column gaps → full text (regardless of density) - confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15) - return DocumentTypeResult( - doc_type='full_text', - confidence=round(confidence, 2), - pipeline='full_page', - skip_steps=['columns', 'rows'], - features=features, - ) - else: - # Ambiguous — default to vocab_table (most common use 
case) - return DocumentTypeResult( - doc_type='vocab_table', - confidence=0.5, - pipeline='cell_first', - skip_steps=[], - features=features, - ) - - -# ============================================================================= -# Stage 4: Dual Image Preparation -# ============================================================================= - -def create_ocr_image(img: np.ndarray) -> np.ndarray: - """Create a binarized image optimized for Tesseract OCR. - - Steps: Grayscale → Background normalization → Adaptive threshold → Denoise. - - Args: - img: BGR image. - - Returns: - Binary image (white text on black background inverted to black on white). - """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Background normalization: divide by blurred version - bg = cv2.GaussianBlur(gray, (51, 51), 0) - normalized = cv2.divide(gray, bg, scale=255) - - # Adaptive binarization - binary = cv2.adaptiveThreshold( - normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 31, 10 - ) - - # Light denoise - denoised = cv2.medianBlur(binary, 3) - - return denoised - - -def create_layout_image(img: np.ndarray) -> np.ndarray: - """Create a CLAHE-enhanced grayscale image for layout analysis. - - Args: - img: BGR image. - - Returns: - Enhanced grayscale image. 
- """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) - enhanced = clahe.apply(gray) - return enhanced - - -# ============================================================================= -# Stage 5: Layout Analysis (Projection Profiles) -# ============================================================================= - -def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray: - """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask.""" - out = mask.copy() - n = len(out) - i = 0 - while i < n: - if out[i]: - start = i - while i < n and out[i]: - i += 1 - if (i - start) < min_width: - out[start:i] = False - else: - i += 1 - return out - - -def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]: - """Find the bounding box of actual text content (excluding page margins). - - Scan artefacts (thin black lines at page edges) are filtered out by - discarding contiguous projection runs narrower than 1 % of the image - dimension (min 5 px). - - Returns: - Tuple of (left_x, right_x, top_y, bottom_y). 
- """ - h, w = inv.shape[:2] - threshold = 0.005 - - # --- Horizontal projection for top/bottom --- - h_proj = np.sum(inv, axis=1).astype(float) / (w * 255) - h_mask = h_proj > threshold - min_h_run = max(5, h // 100) - h_mask = _filter_narrow_runs(h_mask, min_h_run) - - top_y = 0 - for y in range(h): - if h_mask[y]: - top_y = max(0, y - 5) - break - - bottom_y = h - for y in range(h - 1, 0, -1): - if h_mask[y]: - bottom_y = min(h, y + 5) - break - - # --- Vertical projection for left/right margins --- - v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float) - v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj - v_mask = v_proj_norm > threshold - min_v_run = max(5, w // 100) - v_mask = _filter_narrow_runs(v_mask, min_v_run) - - left_x = 0 - for x in range(w): - if v_mask[x]: - left_x = max(0, x - 2) - break - - right_x = w - for x in range(w - 1, 0, -1): - if v_mask[x]: - right_x = min(w, x + 2) - break - - return left_x, right_x, top_y, bottom_y - - -def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]: - """Detect columns, header, and footer using projection profiles. - - Uses content-bounds detection to exclude page margins before searching - for column separators within the actual text area. - - Args: - layout_img: CLAHE-enhanced grayscale image. - ocr_img: Binarized image for text density analysis. - - Returns: - List of PageRegion objects describing detected regions. 
- """ - h, w = ocr_img.shape[:2] - - # Invert: black text on white → white text on black for projection - inv = cv2.bitwise_not(ocr_img) - - # --- Find actual content bounds (exclude page margins) --- - left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) - content_w = right_x - left_x - content_h = bottom_y - top_y - - logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), " - f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image") - - if content_w < w * 0.3 or content_h < h * 0.3: - # Fallback if detection seems wrong - left_x, right_x = 0, w - top_y, bottom_y = 0, h - content_w, content_h = w, h - - # --- Vertical projection within content area to find column separators --- - content_strip = inv[top_y:bottom_y, left_x:right_x] - v_proj = np.sum(content_strip, axis=0).astype(float) - v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj - - # Smooth the projection profile - kernel_size = max(5, content_w // 50) - if kernel_size % 2 == 0: - kernel_size += 1 - v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # Debug: log projection profile statistics - p_mean = float(np.mean(v_proj_smooth)) - p_median = float(np.median(v_proj_smooth)) - p_min = float(np.min(v_proj_smooth)) - p_max = float(np.max(v_proj_smooth)) - logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, " - f"mean={p_mean:.4f}, median={p_median:.4f}") - - # Find valleys using multiple threshold strategies - # Strategy 1: relative to median (catches clear separators) - # Strategy 2: local minima approach (catches subtle gaps) - threshold = max(p_median * 0.3, p_mean * 0.2) - logger.info(f"Layout: valley threshold={threshold:.4f}") - - in_valley = v_proj_smooth < threshold - - # Find contiguous valley regions - all_valleys = [] - start = None - for x in range(len(v_proj_smooth)): - if in_valley[x] and start is None: - start = x - elif not in_valley[x] and start is not None: - valley_width = x 
- start - valley_depth = float(np.min(v_proj_smooth[start:x])) - # Valley must be at least 3px wide - if valley_width >= 3: - all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth)) - start = None - - logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — " - f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}") - - # Filter: valleys must be inside the content area (not at edges) - inner_margin = int(content_w * 0.08) - valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin] - - # If no valleys found with strict threshold, try local minima approach - if len(valleys) < 2: - logger.info("Layout: trying local minima approach for column detection") - # Divide content into 20 segments, find the 2 lowest - seg_count = 20 - seg_width = content_w // seg_count - seg_scores = [] - for i in range(seg_count): - sx = i * seg_width - ex = min((i + 1) * seg_width, content_w) - seg_mean = float(np.mean(v_proj_smooth[sx:ex])) - seg_scores.append((i, sx, ex, seg_mean)) - - seg_scores.sort(key=lambda s: s[3]) - logger.info(f"Layout: segment scores (lowest 5): " - f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}") - - # Find two lowest non-adjacent segments that create reasonable columns - candidate_valleys = [] - for seg_idx, sx, ex, seg_mean in seg_scores: - # Must not be at the edges - if seg_idx <= 1 or seg_idx >= seg_count - 2: - continue - # Must be significantly lower than overall mean - if seg_mean < p_mean * 0.6: - center = (sx + ex) // 2 - candidate_valleys.append((sx, ex, center, ex - sx, seg_mean)) - - if len(candidate_valleys) >= 2: - # Pick the best pair: non-adjacent, creating reasonable column widths - candidate_valleys.sort(key=lambda v: v[2]) - best_pair = None - best_score = float('inf') - for i in range(len(candidate_valleys)): - for j in range(i + 1, len(candidate_valleys)): - c1 = candidate_valleys[i][2] - c2 = candidate_valleys[j][2] - # 
Must be at least 20% apart - if (c2 - c1) < content_w * 0.2: - continue - col1 = c1 - col2 = c2 - c1 - col3 = content_w - c2 - # Each column at least 15% - if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12: - continue - parts = sorted([col1, col2, col3]) - score = parts[2] - parts[0] - if score < best_score: - best_score = score - best_pair = (candidate_valleys[i], candidate_valleys[j]) - - if best_pair: - valleys = list(best_pair) - logger.info(f"Layout: local minima found 2 valleys: " - f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") - - logger.info(f"Layout: final {len(valleys)} valleys: " - f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") - - regions = [] - - if len(valleys) >= 2: - # 3-column layout detected - valleys.sort(key=lambda v: v[2]) - - if len(valleys) == 2: - sep1_center = valleys[0][2] - sep2_center = valleys[1][2] - else: - # Pick the two valleys that best divide into 3 parts - # Prefer wider valleys (more likely true separators) - best_pair = None - best_score = float('inf') - for i in range(len(valleys)): - for j in range(i + 1, len(valleys)): - c1, c2 = valleys[i][2], valleys[j][2] - # Each column should be at least 15% of content width - col1 = c1 - col2 = c2 - c1 - col3 = content_w - c2 - if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15: - continue - # Score: lower is better (more even distribution) - parts = sorted([col1, col2, col3]) - score = parts[2] - parts[0] - # Bonus for wider valleys (subtract valley width) - score -= (valleys[i][3] + valleys[j][3]) * 0.5 - if score < best_score: - best_score = score - best_pair = (c1, c2) - if best_pair: - sep1_center, sep2_center = best_pair - else: - sep1_center = valleys[0][2] - sep2_center = valleys[1][2] - - # Convert from content-relative to absolute coordinates - abs_sep1 = sep1_center + left_x - abs_sep2 = sep2_center + left_x - - logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} " - 
f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})") - - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=abs_sep1, height=content_h - )) - regions.append(PageRegion( - type='column_de', x=abs_sep1, y=top_y, - width=abs_sep2 - abs_sep1, height=content_h - )) - regions.append(PageRegion( - type='column_example', x=abs_sep2, y=top_y, - width=w - abs_sep2, height=content_h - )) - - elif len(valleys) == 1: - # 2-column layout - abs_sep = valleys[0][2] + left_x - - logger.info(f"Layout: 2 columns at separator x={abs_sep}") - - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=abs_sep, height=content_h - )) - regions.append(PageRegion( - type='column_de', x=abs_sep, y=top_y, - width=w - abs_sep, height=content_h - )) - - else: - # No columns detected — run full-page OCR as single column - logger.warning("Layout: no column separators found, using full page") - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=w, height=content_h - )) - - # Add header/footer info (gap-based detection with fallback) - _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv) - - top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none') - bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none') - col_count = len([r for r in regions if r.type.startswith('column')]) - logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}") - - return regions - - -# ============================================================================= -# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection) -# ============================================================================= - -# --- Phase A: Geometry Detection --- - -def _detect_columns_by_clustering( - word_dicts: List[Dict], - left_edges: List[int], - edge_word_indices: List[int], - content_w: int, - content_h: int, - left_x: int, - right_x: int, - top_y: int, - bottom_y: int, 
- inv: Optional[np.ndarray] = None, -) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]: - """Fallback: detect columns by clustering left-aligned word positions. - - Used when the primary gap-based algorithm finds fewer than 2 gaps. - """ - tolerance = max(10, int(content_w * 0.01)) - sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0]) - - clusters = [] - cluster_widxs = [] - cur_edges = [sorted_pairs[0][0]] - cur_widxs = [sorted_pairs[0][1]] - for edge, widx in sorted_pairs[1:]: - if edge - cur_edges[-1] <= tolerance: - cur_edges.append(edge) - cur_widxs.append(widx) - else: - clusters.append(cur_edges) - cluster_widxs.append(cur_widxs) - cur_edges = [edge] - cur_widxs = [widx] - clusters.append(cur_edges) - cluster_widxs.append(cur_widxs) - - MIN_Y_COVERAGE_PRIMARY = 0.30 - MIN_Y_COVERAGE_SECONDARY = 0.15 - MIN_WORDS_SECONDARY = 5 - - cluster_infos = [] - for c_edges, c_widxs in zip(clusters, cluster_widxs): - if len(c_edges) < 2: - continue - y_positions = [word_dicts[idx]['top'] for idx in c_widxs] - y_span = max(y_positions) - min(y_positions) - y_coverage = y_span / content_h if content_h > 0 else 0.0 - cluster_infos.append({ - 'mean_x': int(np.mean(c_edges)), - 'count': len(c_edges), - 'min_edge': min(c_edges), - 'max_edge': max(c_edges), - 'y_min': min(y_positions), - 'y_max': max(y_positions), - 'y_coverage': y_coverage, - }) - - primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY] - primary_set = set(id(c) for c in primary) - secondary = [c for c in cluster_infos - if id(c) not in primary_set - and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY - and c['count'] >= MIN_WORDS_SECONDARY] - significant = sorted(primary + secondary, key=lambda c: c['mean_x']) - - if len(significant) < 3: - logger.info("ColumnGeometry clustering fallback: < 3 significant clusters") - return None - - merge_distance = max(30, int(content_w * 0.06)) - merged = [significant[0].copy()] 
- for s in significant[1:]: - if s['mean_x'] - merged[-1]['mean_x'] < merge_distance: - prev = merged[-1] - total = prev['count'] + s['count'] - avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total - prev['mean_x'] = avg_x - prev['count'] = total - prev['min_edge'] = min(prev['min_edge'], s['min_edge']) - prev['max_edge'] = max(prev['max_edge'], s['max_edge']) - else: - merged.append(s.copy()) - - if len(merged) < 3: - logger.info("ColumnGeometry clustering fallback: < 3 merged clusters") - return None - - logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering") - - margin_px = max(6, int(content_w * 0.003)) - return _build_geometries_from_starts( - [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged], - word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv, - ) - - -def _detect_sub_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int = 0, - top_y: int = 0, - header_y: Optional[int] = None, - footer_y: Optional[int] = None, - _edge_tolerance: int = 8, - _min_col_start_ratio: float = 0.10, -) -> List[ColumnGeometry]: - """Split columns that contain internal sub-columns based on left-edge alignment. - - For each column, clusters word left-edges into alignment bins (within - ``_edge_tolerance`` px). The leftmost bin whose word count reaches - ``_min_col_start_ratio`` of the column total is treated as the true column - start. Any words to the left of that bin form a sub-column, provided they - number >= 2 and < 35 % of total. - - Word ``left`` values are relative to the content ROI (offset by *left_x*), - while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x* - bridges the two coordinate systems. - - If *header_y* / *footer_y* are provided (absolute y-coordinates), words - in header/footer regions are excluded from alignment clustering to avoid - polluting the bins with page numbers or chapter titles. 
Word ``top`` - values are relative to *top_y*. - - Returns a new list of ColumnGeometry — potentially longer than the input. - """ - if content_w <= 0: - return geometries - - result: List[ColumnGeometry] = [] - for geo in geometries: - # Only consider wide-enough columns with enough words - if geo.width_ratio < 0.15 or geo.word_count < 5: - result.append(geo) - continue - - # Collect left-edges of confident words, excluding header/footer - # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y) - min_top_rel = (header_y - top_y) if header_y is not None else None - max_top_rel = (footer_y - top_y) if footer_y is not None else None - - confident = [w for w in geo.words - if w.get('conf', 0) >= 30 - and (min_top_rel is None or w['top'] >= min_top_rel) - and (max_top_rel is None or w['top'] <= max_top_rel)] - if len(confident) < 3: - result.append(geo) - continue - - # --- Cluster left-edges into alignment bins --- - sorted_edges = sorted(w['left'] for w in confident) - bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge) - cur = [sorted_edges[0]] - for i in range(1, len(sorted_edges)): - if sorted_edges[i] - cur[-1] <= _edge_tolerance: - cur.append(sorted_edges[i]) - else: - bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) - cur = [sorted_edges[i]] - bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) - - # --- Find the leftmost bin qualifying as a real column start --- - total = len(confident) - min_count = max(3, int(total * _min_col_start_ratio)) - col_start_bin = None - for b in bins: - if b[1] >= min_count: - col_start_bin = b - break - - if col_start_bin is None: - result.append(geo) - continue - - # Words to the left of the column-start bin are sub-column candidates - split_threshold = col_start_bin[2] - _edge_tolerance - sub_words = [w for w in geo.words if w['left'] < split_threshold] - main_words = [w for w in geo.words if w['left'] >= split_threshold] - - # Count 
only body words (excluding header/footer) for the threshold check - # so that header/footer words don't artificially trigger a split. - sub_body = [w for w in sub_words - if (min_top_rel is None or w['top'] >= min_top_rel) - and (max_top_rel is None or w['top'] <= max_top_rel)] - if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35: - result.append(geo) - continue - - # --- Build two sub-column geometries --- - # Word 'left' values are relative to left_x; geo.x is absolute. - # Convert the split position from relative to absolute coordinates. - max_sub_left = max(w['left'] for w in sub_words) - split_rel = (max_sub_left + col_start_bin[2]) // 2 - split_abs = split_rel + left_x - - sub_x = geo.x - sub_width = split_abs - geo.x - main_x = split_abs - main_width = (geo.x + geo.width) - split_abs - - if sub_width <= 0 or main_width <= 0: - result.append(geo) - continue - - sub_geo = ColumnGeometry( - index=0, - x=sub_x, - y=geo.y, - width=sub_width, - height=geo.height, - word_count=len(sub_words), - words=sub_words, - width_ratio=sub_width / content_w if content_w > 0 else 0.0, - is_sub_column=True, - ) - main_geo = ColumnGeometry( - index=0, - x=main_x, - y=geo.y, - width=main_width, - height=geo.height, - word_count=len(main_words), - words=main_words, - width_ratio=main_width / content_w if content_w > 0 else 0.0, - is_sub_column=True, - ) - - result.append(sub_geo) - result.append(main_geo) - - logger.info( - f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} " - f"(rel={split_rel}), sub={len(sub_words)} words, " - f"main={len(main_words)} words, " - f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})" - ) - - # Re-index by left-to-right order - result.sort(key=lambda g: g.x) - for i, g in enumerate(result): - g.index = i - - return result - - -def _split_broad_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int = 0, - _broad_threshold: float = 0.35, - _min_gap_px: int = 15, - _min_words_per_split: int = 
5, -) -> List[ColumnGeometry]: - """Split overly broad columns that contain two language blocks (EN+DE). - - Uses word-coverage gap analysis: builds a per-pixel coverage array from the - words inside each broad column, finds the largest horizontal gap, and splits - the column at that gap. - - Args: - geometries: Column geometries from _detect_sub_columns. - content_w: Width of the content area in pixels. - left_x: Left edge of content ROI in absolute image coordinates. - _broad_threshold: Minimum width_ratio to consider a column "broad". - _min_gap_px: Minimum gap width (pixels) to trigger a split. - _min_words_per_split: Both halves must have at least this many words. - - Returns: - Updated list of ColumnGeometry (possibly with more columns). - """ - result: List[ColumnGeometry] = [] - - logger.info(f"SplitBroadCols: input {len(geometries)} cols: " - f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}") - - for geo in geometries: - if geo.width_ratio <= _broad_threshold or len(geo.words) < 10: - result.append(geo) - continue - - # Build word-coverage array (per pixel within column) - col_left_rel = geo.x - left_x # column left in content-relative coords - coverage = np.zeros(geo.width, dtype=np.float32) - - for wd in geo.words: - # wd['left'] is relative to left_x (content ROI) - wl = wd['left'] - col_left_rel - wr = wl + wd.get('width', 0) - wl = max(0, int(wl)) - wr = min(geo.width, int(wr)) - if wr > wl: - coverage[wl:wr] += 1.0 - - # Light smoothing (kernel=3px) to avoid noise - if len(coverage) > 3: - kernel = np.ones(3, dtype=np.float32) / 3.0 - coverage = np.convolve(coverage, kernel, mode='same') - - # Normalise to [0, 1] - cmax = coverage.max() - if cmax > 0: - coverage /= cmax - - # Find INTERNAL gaps where coverage < 0.5 - # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins. 
- low_mask = coverage < 0.5 - all_gaps = [] - _gs = None - for px in range(len(low_mask)): - if low_mask[px]: - if _gs is None: - _gs = px - else: - if _gs is not None: - all_gaps.append((_gs, px, px - _gs)) - _gs = None - if _gs is not None: - all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs)) - - # Filter: only internal gaps (not touching column edges) - _edge_margin = 10 # pixels from edge to ignore - internal_gaps = [g for g in all_gaps - if g[0] > _edge_margin and g[1] < geo.width - _edge_margin] - best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None - - logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): " - f"{[g for g in all_gaps if g[2] >= 5]}, " - f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, " - f"best={best_gap}") - - if best_gap is None or best_gap[2] < _min_gap_px: - result.append(geo) - continue - - gap_center = (best_gap[0] + best_gap[1]) // 2 - - # Split words by midpoint relative to gap - left_words = [] - right_words = [] - for wd in geo.words: - wl = wd['left'] - col_left_rel - mid = wl + wd.get('width', 0) / 2.0 - if mid < gap_center: - left_words.append(wd) - else: - right_words.append(wd) - - if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split: - result.append(geo) - continue - - # Build two new ColumnGeometry objects - split_x_abs = geo.x + gap_center - left_w = gap_center - right_w = geo.width - gap_center - - left_geo = ColumnGeometry( - index=0, - x=geo.x, - y=geo.y, - width=left_w, - height=geo.height, - word_count=len(left_words), - words=left_words, - width_ratio=left_w / content_w if content_w else 0, - is_sub_column=True, - ) - right_geo = ColumnGeometry( - index=0, - x=split_x_abs, - y=geo.y, - width=right_w, - height=geo.height, - word_count=len(right_words), - words=right_words, - width_ratio=right_w / content_w if content_w else 0, - is_sub_column=True, - ) - - logger.info( - f"SplitBroadCols: col {geo.index} SPLIT at 
gap_center={gap_center} " - f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), " - f"left={len(left_words)} words (w={left_w}), " - f"right={len(right_words)} words (w={right_w})" - ) - - result.append(left_geo) - result.append(right_geo) - - # Re-index left-to-right - result.sort(key=lambda g: g.x) - for i, g in enumerate(result): - g.index = i - - return result - - -def _build_geometries_from_starts( - col_starts: List[Tuple[int, int]], - word_dicts: List[Dict], - left_x: int, - right_x: int, - top_y: int, - bottom_y: int, - content_w: int, - content_h: int, - inv: Optional[np.ndarray] = None, -) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]: - """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.""" - geometries = [] - for i, (start_x, count) in enumerate(col_starts): - if i + 1 < len(col_starts): - col_width = col_starts[i + 1][0] - start_x - else: - col_width = right_x - start_x - - col_left_rel = start_x - left_x - col_right_rel = col_left_rel + col_width - col_words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - - geometries.append(ColumnGeometry( - index=i, - x=start_x, - y=top_y, - width=col_width, - height=content_h, - word_count=len(col_words), - words=col_words, - width_ratio=col_width / content_w if content_w > 0 else 0.0, - )) - - logger.info(f"ColumnGeometry: {len(geometries)} columns: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - - -def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]: - """Detect column geometry using whitespace-gap analysis with word validation. - - Phase A of the two-phase column detection. Uses vertical projection - profiles to find whitespace gaps between columns, then validates that - no gap cuts through a word bounding box. 
- - Falls back to clustering-based detection if fewer than 2 gaps are found. - - Args: - ocr_img: Binarized grayscale image for layout analysis. - dewarped_bgr: Original BGR image (for Tesseract word detection). - - Returns: - Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - or None if detection fails entirely. - """ - h, w = ocr_img.shape[:2] - - # --- Step 1: Find content bounds --- - inv = cv2.bitwise_not(ocr_img) - left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) - content_w = right_x - left_x - content_h = bottom_y - top_y - - if content_w < w * 0.3 or content_h < h * 0.3: - left_x, right_x = 0, w - top_y, bottom_y = 0, h - content_w, content_h = w, h - - logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), " - f"y=[{top_y}..{bottom_y}] ({content_h}px)") - - # --- Step 2: Get word bounding boxes from Tesseract --- - # Crop from left_x to full image width (not right_x) so words at the right - # edge of the last column are included even if they extend past the detected - # content boundary (right_x). 
- content_roi = dewarped_bgr[top_y:bottom_y, left_x:w] - pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB)) - - try: - data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT) - except Exception as e: - logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}") - return None - - word_dicts = [] - left_edges = [] - edge_word_indices = [] - n_words = len(data['text']) - for i in range(n_words): - conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 - text = str(data['text'][i]).strip() - if conf < 30 or not text: - continue - lx = int(data['left'][i]) - ty = int(data['top'][i]) - bw = int(data['width'][i]) - bh = int(data['height'][i]) - left_edges.append(lx) - edge_word_indices.append(len(word_dicts)) - word_dicts.append({ - 'text': text, 'conf': conf, - 'left': lx, 'top': ty, 'width': bw, 'height': bh, - }) - - if len(left_edges) < 5: - logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected") - return None - - logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") - - # --- Step 2b: Segment by sub-headers --- - # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width - # text bands that pollute the vertical projection. We detect large - # horizontal gaps (= whitespace rows separating sections) and use only - # the tallest content segment for the projection. This makes column - # detection immune to sub-headers, illustrations, and section dividers. 
- content_strip = inv[top_y:bottom_y, left_x:right_x] - h_proj_row = np.sum(content_strip, axis=1).astype(float) - h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row - - # Find horizontal gaps (near-empty rows) - H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty" - h_in_gap = h_proj_row_norm < H_GAP_THRESH - H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px - - h_gaps: List[Tuple[int, int]] = [] - h_gap_start = None - for y_idx in range(len(h_in_gap)): - if h_in_gap[y_idx]: - if h_gap_start is None: - h_gap_start = y_idx - else: - if h_gap_start is not None: - if y_idx - h_gap_start >= H_MIN_GAP: - h_gaps.append((h_gap_start, y_idx)) - h_gap_start = None - if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP: - h_gaps.append((h_gap_start, len(h_in_gap))) - - # Identify "large" gaps (significantly bigger than median) that indicate - # section boundaries (sub-headers, chapter titles). - if len(h_gaps) >= 3: - gap_sizes = sorted(g[1] - g[0] for g in h_gaps) - median_gap_h = gap_sizes[len(gap_sizes) // 2] - large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3) - large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh] - else: - large_gaps = h_gaps - - # Build content segments between large gaps and pick the tallest - seg_boundaries = [0] - for gs, ge in large_gaps: - seg_boundaries.append(gs) - seg_boundaries.append(ge) - seg_boundaries.append(content_h) - - segments = [] - for i in range(0, len(seg_boundaries) - 1, 2): - seg_top = seg_boundaries[i] - seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h - seg_height = seg_bot - seg_top - if seg_height > 20: # ignore tiny fragments - segments.append((seg_top, seg_bot, seg_height)) - - if segments: - segments.sort(key=lambda s: s[2], reverse=True) - best_seg = segments[0] - proj_strip = content_strip[best_seg[0]:best_seg[1], :] - effective_h = best_seg[2] - if len(segments) > 1: - 
logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} " - f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} " - f"({effective_h}px, {effective_h*100/content_h:.0f}%)") - else: - proj_strip = content_strip - effective_h = content_h - - # --- Step 3: Vertical projection profile --- - v_proj = np.sum(proj_strip, axis=0).astype(float) - v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj - - # Smooth the projection to avoid noise-induced micro-gaps - kernel_size = max(5, content_w // 80) - if kernel_size % 2 == 0: - kernel_size += 1 # keep odd for symmetry - v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # --- Step 4: Find whitespace gaps --- - # Threshold: areas with very little ink density are gaps - median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01 - gap_threshold = max(median_density * 0.15, 0.005) - - in_gap = v_smooth < gap_threshold - MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width - - # Collect contiguous gap regions - raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI - gap_start = None - for x in range(len(in_gap)): - if in_gap[x]: - if gap_start is None: - gap_start = x - else: - if gap_start is not None: - gap_width = x - gap_start - if gap_width >= MIN_GAP_WIDTH: - raw_gaps.append((gap_start, x)) - gap_start = None - # Handle gap at the right edge - if gap_start is not None: - gap_width = len(in_gap) - gap_start - if gap_width >= MIN_GAP_WIDTH: - raw_gaps.append((gap_start, len(in_gap))) - - logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, " - f"min_width={MIN_GAP_WIDTH}px): " - f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}") - - # --- Step 5: Validate gaps against word bounding boxes --- - # When using a segment for projection, only validate against words - # inside that segment — words from sub-headers or other 
sections - # would incorrectly overlap with real column gaps. - if segments and len(segments) > 1: - seg_top_abs = best_seg[0] # relative to content strip - seg_bot_abs = best_seg[1] - segment_words = [wd for wd in word_dicts - if wd['top'] >= seg_top_abs - and wd['top'] + wd['height'] <= seg_bot_abs] - logger.info(f"ColumnGeometry: filtering words to segment: " - f"{len(segment_words)}/{len(word_dicts)} words") - else: - segment_words = word_dicts - - validated_gaps = [] - for gap_start_rel, gap_end_rel in raw_gaps: - # Check if any word overlaps with this gap region - overlapping = False - for wd in segment_words: - word_left = wd['left'] - word_right = wd['left'] + wd['width'] - if word_left < gap_end_rel and word_right > gap_start_rel: - overlapping = True - break - - if not overlapping: - validated_gaps.append((gap_start_rel, gap_end_rel)) - else: - # Try to shift the gap to avoid the overlapping word(s) - # Find the tightest word boundaries within the gap region - min_word_left = content_w - max_word_right = 0 - for wd in segment_words: - word_left = wd['left'] - word_right = wd['left'] + wd['width'] - if word_left < gap_end_rel and word_right > gap_start_rel: - min_word_left = min(min_word_left, word_left) - max_word_right = max(max_word_right, word_right) - - # Try gap before the overlapping words - if min_word_left - gap_start_rel >= MIN_GAP_WIDTH: - validated_gaps.append((gap_start_rel, min_word_left)) - logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}") - # Try gap after the overlapping words - elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH: - validated_gaps.append((max_word_right, gap_end_rel)) - logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}") - else: - logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"discarded (word overlap, no room to shift)") - - logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: " - f"{[(g[0]+left_x, 
g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}") - - # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) --- - # When pixel-based projection fails (e.g. due to illustrations or colored - # bands), use word bounding boxes to find clear vertical gaps. This is - # immune to decorative graphics that Tesseract doesn't recognise as words. - if len(validated_gaps) < 2: - logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps") - word_coverage = np.zeros(content_w, dtype=np.int32) - for wd in segment_words: - wl = max(0, wd['left']) - wr = min(wd['left'] + wd['width'], content_w) - if wr > wl: - word_coverage[wl:wr] += 1 - - # Smooth slightly to bridge tiny 1-2px noise gaps between words - wc_kernel = max(3, content_w // 300) - if wc_kernel % 2 == 0: - wc_kernel += 1 - wc_smooth = np.convolve(word_coverage.astype(float), - np.ones(wc_kernel) / wc_kernel, mode='same') - - wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage - WC_MIN_GAP = max(4, content_w // 300) - - wc_gaps: List[Tuple[int, int]] = [] - wc_gap_start = None - for x in range(len(wc_in_gap)): - if wc_in_gap[x]: - if wc_gap_start is None: - wc_gap_start = x - else: - if wc_gap_start is not None: - if x - wc_gap_start >= WC_MIN_GAP: - wc_gaps.append((wc_gap_start, x)) - wc_gap_start = None - if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP: - wc_gaps.append((wc_gap_start, len(wc_in_gap))) - - logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found " - f"(min_width={WC_MIN_GAP}px): " - f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}") - - if len(wc_gaps) >= 2: - validated_gaps = wc_gaps - - # --- Step 6: Fallback to clustering if too few gaps --- - if len(validated_gaps) < 2: - logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering") - return _detect_columns_by_clustering( - word_dicts, left_edges, edge_word_indices, - content_w, content_h, left_x, right_x, top_y, bottom_y, inv, - ) - - # --- Step 
7: Derive column boundaries from gaps --- - # Sort gaps by position - validated_gaps.sort(key=lambda g: g[0]) - - # Identify margin gaps (first and last) vs interior gaps - # A margin gap touches the edge of the content area (within 2% tolerance) - edge_tolerance = max(10, int(content_w * 0.02)) - - is_left_margin = validated_gaps[0][0] <= edge_tolerance - is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance - - # Interior gaps define column boundaries - # Column starts at the end of a gap, ends at the start of the next gap - col_starts = [] - - if is_left_margin: - # First column starts after the left margin gap - first_gap_end = validated_gaps[0][1] - interior_gaps = validated_gaps[1:] - else: - # No left margin gap — first column starts at content left edge - first_gap_end = 0 - interior_gaps = validated_gaps[:] - - if is_right_margin: - # Last gap is right margin — don't use it as column start - interior_gaps_for_boundaries = interior_gaps[:-1] - right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start - else: - interior_gaps_for_boundaries = interior_gaps - right_boundary = content_w - - # First column - col_starts.append(left_x + first_gap_end) - - # Columns between interior gaps - for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries: - col_starts.append(left_x + gap_end_rel) - - # Count words per column region (for logging) - col_start_counts = [] - for i, start_x in enumerate(col_starts): - if i + 1 < len(col_starts): - next_start = col_starts[i + 1] - else: - # Rightmost column always extends to full image width (w). - # The page margin contains only white space — extending the OCR - # crop to the image edge is safe and prevents text near the right - # border from being cut off. 
- next_start = w - - col_left_rel = start_x - left_x - col_right_rel = next_start - left_x - n_words_in_col = sum(1 for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel) - col_start_counts.append((start_x, n_words_in_col)) - - logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps " - f"(left_margin={is_left_margin}, right_margin={is_right_margin}): " - f"{col_start_counts}") - - # --- Step 8: Build ColumnGeometry objects --- - # Determine right edge for each column - all_boundaries = [] - for i, start_x in enumerate(col_starts): - if i + 1 < len(col_starts): - end_x = col_starts[i + 1] - else: - # Rightmost column always extends to full image width (w). - end_x = w - all_boundaries.append((start_x, end_x)) - - geometries = [] - for i, (start_x, end_x) in enumerate(all_boundaries): - col_width = end_x - start_x - col_left_rel = start_x - left_x - col_right_rel = col_left_rel + col_width - col_words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - - geometries.append(ColumnGeometry( - index=i, - x=start_x, - y=top_y, - width=col_width, - height=content_h, - word_count=len(col_words), - words=col_words, - width_ratio=col_width / content_w if content_w > 0 else 0.0, - )) - - logger.info(f"ColumnGeometry: {len(geometries)} columns: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - - # --- Step 9: Filter phantom narrow columns --- - # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow - # columns (< 3% of content width) with zero or no words. These are not - # real columns — remove them and close the gap between neighbors. 
- min_real_col_w = max(20, int(content_w * 0.03)) - filtered_geoms = [g for g in geometries - if not (g.word_count < 3 and g.width < min_real_col_w)] - if len(filtered_geoms) < len(geometries): - n_removed = len(geometries) - len(filtered_geoms) - logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) " - f"(width < {min_real_col_w}px and words < 3)") - # Extend each remaining column to close gaps with its right neighbor - for i, g in enumerate(filtered_geoms): - if i + 1 < len(filtered_geoms): - g.width = filtered_geoms[i + 1].x - g.x - else: - g.width = w - g.x - g.index = i - col_left_rel = g.x - left_x - col_right_rel = col_left_rel + g.width - g.words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - g.word_count = len(g.words) - geometries = filtered_geoms - logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - - return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - - -def expand_narrow_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int, - word_dicts: List[Dict], -) -> List[ColumnGeometry]: - """Expand narrow columns into adjacent whitespace gaps. - - Narrow columns (marker, page_ref, < 10% content width) often lose - content at image edges due to residual shear. This expands them toward - the neighbouring column, but never past 40% of the gap or past the - nearest word in the neighbour. - - Must be called AFTER _detect_sub_columns() so that sub-column splits - (which create the narrowest columns) have already happened. 
- """ - _NARROW_THRESHOLD_PCT = 10.0 - _MIN_WORD_MARGIN = 4 - - if len(geometries) < 2: - return geometries - - logger.info("ExpandNarrowCols: input %d cols: %s", - len(geometries), - [(i, g.x, g.width, round(g.width / content_w * 100, 1)) - for i, g in enumerate(geometries)]) - - for i, g in enumerate(geometries): - col_pct = g.width / content_w * 100 if content_w > 0 else 100 - if col_pct >= _NARROW_THRESHOLD_PCT: - continue - - expanded = False - orig_pct = col_pct - - # --- try expanding to the LEFT --- - if i > 0: - left_nb = geometries[i - 1] - # Gap can be 0 if sub-column split created adjacent columns. - # In that case, look at where the neighbor's rightmost words - # actually are — there may be unused space we can claim. - nb_words_right = [wd['left'] + wd.get('width', 0) - for wd in left_nb.words] - if nb_words_right: - rightmost_word_abs = left_x + max(nb_words_right) - safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN - else: - # No words in neighbor → we can take up to neighbor's start - safe_left_abs = left_nb.x + _MIN_WORD_MARGIN - - if safe_left_abs < g.x: - g.width += (g.x - safe_left_abs) - g.x = safe_left_abs - expanded = True - - # --- try expanding to the RIGHT --- - if i + 1 < len(geometries): - right_nb = geometries[i + 1] - nb_words_left = [wd['left'] for wd in right_nb.words] - if nb_words_left: - leftmost_word_abs = left_x + min(nb_words_left) - safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN - else: - safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN - - cur_right = g.x + g.width - if safe_right_abs > cur_right: - g.width = safe_right_abs - g.x - expanded = True - - if expanded: - col_left_rel = g.x - left_x - col_right_rel = col_left_rel + g.width - g.words = [wd for wd in word_dicts - if col_left_rel <= wd['left'] < col_right_rel] - g.word_count = len(g.words) - g.width_ratio = g.width / content_w if content_w > 0 else 0.0 - logger.info( - "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d", - i, 
orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count) - - # --- Shrink overlapping neighbors to match new boundaries --- - # Left neighbor: its right edge must not exceed our new left edge - if i > 0: - left_nb = geometries[i - 1] - nb_right = left_nb.x + left_nb.width - if nb_right > g.x: - left_nb.width = g.x - left_nb.x - if left_nb.width < 0: - left_nb.width = 0 - left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0 - # Re-assign words - nb_left_rel = left_nb.x - left_x - nb_right_rel = nb_left_rel + left_nb.width - left_nb.words = [wd for wd in word_dicts - if nb_left_rel <= wd['left'] < nb_right_rel] - left_nb.word_count = len(left_nb.words) - - # Right neighbor: its left edge must not be before our new right edge - if i + 1 < len(geometries): - right_nb = geometries[i + 1] - my_right = g.x + g.width - if right_nb.x < my_right: - old_right_edge = right_nb.x + right_nb.width - right_nb.x = my_right - right_nb.width = old_right_edge - right_nb.x - if right_nb.width < 0: - right_nb.width = 0 - right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0 - # Re-assign words - nb_left_rel = right_nb.x - left_x - nb_right_rel = nb_left_rel + right_nb.width - right_nb.words = [wd for wd in word_dicts - if nb_left_rel <= wd['left'] < nb_right_rel] - right_nb.word_count = len(right_nb.words) - - return geometries - - -# ============================================================================= -# Row Geometry Detection (horizontal whitespace-gap analysis) -# ============================================================================= - -def detect_row_geometry( - inv: np.ndarray, - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, bottom_y: int, -) -> List['RowGeometry']: - """Detect row geometry using horizontal whitespace-gap analysis. - - Mirrors the vertical gap approach used for columns, but operates on - horizontal projection profiles to find gaps between text lines. 
- Also classifies header/footer rows based on gap size. - - Args: - inv: Inverted binarized image (white text on black bg, full page). - word_dicts: Word bounding boxes from Tesseract (relative to content ROI). - left_x, right_x: Absolute X bounds of the content area. - top_y, bottom_y: Absolute Y bounds of the content area. - - Returns: - List of RowGeometry objects sorted top to bottom. - """ - content_w = right_x - left_x - content_h = bottom_y - top_y - - if content_h < 10 or content_w < 10: - logger.warning("detect_row_geometry: content area too small") - return [] - - # --- Step 1: Horizontal projection profile (text-only, images masked out) --- - content_strip = inv[top_y:bottom_y, left_x:right_x] - - # Build a word-coverage mask so that image regions (high ink density but no - # Tesseract words) are ignored. Only pixels within/near word bounding boxes - # contribute to the projection. This prevents large illustrations from - # merging multiple vocabulary rows into one. - WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words - word_mask = np.zeros((content_h, content_w), dtype=np.uint8) - for wd in word_dicts: - y1 = max(0, wd['top'] - WORD_PAD_Y) - y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y) - x1 = max(0, wd['left']) - x2 = min(content_w, wd['left'] + wd['width']) - word_mask[y1:y2, x1:x2] = 255 - - masked_strip = cv2.bitwise_and(content_strip, word_mask) - h_proj = np.sum(masked_strip, axis=1).astype(float) - h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj - - # --- Step 2: Smoothing + threshold --- - kernel_size = max(3, content_h // 200) - if kernel_size % 2 == 0: - kernel_size += 1 - h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01 - gap_threshold = max(median_density * 0.15, 0.003) - - in_gap = h_smooth < gap_threshold - MIN_GAP_HEIGHT = max(3, content_h // 500) - 
- # --- Step 3: Collect contiguous gap regions --- - raw_gaps = [] # (start_y_rel, end_y_rel) relative to content ROI - gap_start = None - for y in range(len(in_gap)): - if in_gap[y]: - if gap_start is None: - gap_start = y - else: - if gap_start is not None: - gap_height = y - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, y)) - gap_start = None - if gap_start is not None: - gap_height = len(in_gap) - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, len(in_gap))) - - logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, " - f"min_height={MIN_GAP_HEIGHT}px)") - - # --- Step 4: Validate gaps against word bounding boxes --- - validated_gaps = [] - for gap_start_rel, gap_end_rel in raw_gaps: - overlapping = False - for wd in word_dicts: - word_top = wd['top'] - word_bottom = wd['top'] + wd['height'] - if word_top < gap_end_rel and word_bottom > gap_start_rel: - overlapping = True - break - - if not overlapping: - validated_gaps.append((gap_start_rel, gap_end_rel)) - else: - # Try to shift the gap to avoid overlapping words - min_word_top = content_h - max_word_bottom = 0 - for wd in word_dicts: - word_top = wd['top'] - word_bottom = wd['top'] + wd['height'] - if word_top < gap_end_rel and word_bottom > gap_start_rel: - min_word_top = min(min_word_top, word_top) - max_word_bottom = max(max_word_bottom, word_bottom) - - if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT: - validated_gaps.append((gap_start_rel, min_word_top)) - elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT: - validated_gaps.append((max_word_bottom, gap_end_rel)) - else: - logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"discarded (word overlap, no room to shift)") - - logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation") - - # --- Fallback if too few gaps --- - if len(validated_gaps) < 2: - logger.info("RowGeometry: < 2 gaps found, falling back to word grouping") - 
return _build_rows_from_word_grouping( - word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, - ) - - validated_gaps.sort(key=lambda g: g[0]) - - # --- Step 5: Header/footer detection via gap size --- - HEADER_FOOTER_ZONE = 0.15 - GAP_MULTIPLIER = 2.0 - - gap_sizes = [g[1] - g[0] for g in validated_gaps] - median_gap = float(np.median(gap_sizes)) if gap_sizes else 0 - large_gap_threshold = median_gap * GAP_MULTIPLIER - - header_boundary_rel = None # y below which is header - footer_boundary_rel = None # y above which is footer - - header_zone_limit = int(content_h * HEADER_FOOTER_ZONE) - footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE)) - - # Find largest gap in header zone - best_header_gap = None - for gs, ge in validated_gaps: - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid < header_zone_limit and gap_size > large_gap_threshold: - if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]): - best_header_gap = (gs, ge) - - if best_header_gap is not None: - header_boundary_rel = best_header_gap[1] - logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} " - f"(gap={best_header_gap[1] - best_header_gap[0]}px, " - f"median_gap={median_gap:.0f}px)") - - # Find largest gap in footer zone - best_footer_gap = None - for gs, ge in validated_gaps: - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid > footer_zone_start and gap_size > large_gap_threshold: - if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]): - best_footer_gap = (gs, ge) - - if best_footer_gap is not None: - footer_boundary_rel = best_footer_gap[0] - logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} " - f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)") - - # --- Step 6: Build RowGeometry objects from gaps --- - # Rows are the spans between gaps - row_boundaries = [] # (start_y_rel, end_y_rel) - - # Top of content to first gap - if validated_gaps[0][0] > 
MIN_GAP_HEIGHT: - row_boundaries.append((0, validated_gaps[0][0])) - - # Between gaps - for i in range(len(validated_gaps) - 1): - row_start = validated_gaps[i][1] - row_end = validated_gaps[i + 1][0] - if row_end - row_start > 0: - row_boundaries.append((row_start, row_end)) - - # Last gap to bottom of content - if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT: - row_boundaries.append((validated_gaps[-1][1], content_h)) - - rows = [] - for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries): - # Determine row type - row_mid = (row_start_rel + row_end_rel) / 2 - if header_boundary_rel is not None and row_mid < header_boundary_rel: - row_type = 'header' - elif footer_boundary_rel is not None and row_mid > footer_boundary_rel: - row_type = 'footer' - else: - row_type = 'content' - - # Collect words in this row - row_words = [w for w in word_dicts - if w['top'] + w['height'] / 2 >= row_start_rel - and w['top'] + w['height'] / 2 < row_end_rel] - - # Gap before this row - gap_before = 0 - if idx == 0 and validated_gaps[0][0] > 0: - gap_before = validated_gaps[0][0] - elif idx > 0: - # Find the gap just before this row boundary - for gs, ge in validated_gaps: - if ge == row_start_rel: - gap_before = ge - gs - break - - rows.append(RowGeometry( - index=idx, - x=left_x, - y=top_y + row_start_rel, - width=content_w, - height=row_end_rel - row_start_rel, - word_count=len(row_words), - words=row_words, - row_type=row_type, - gap_before=gap_before, - )) - - # --- Step 7: Word-center grid regularization --- - # Derive precise row boundaries from word vertical centers. Detects - # section breaks (headings, paragraphs) and builds per-section grids. 
- rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y, - content_w, content_h, inv) - - type_counts = {} - for r in rows: - type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1 - logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}") - - return rows - - -def _regularize_row_grid( - rows: List['RowGeometry'], - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, - content_w: int, content_h: int, - inv: np.ndarray, -) -> List['RowGeometry']: - """Rebuild row boundaries from word center-lines with section-break awareness. - - Instead of overlaying a rigid grid, this derives row positions bottom-up - from the words themselves: - - 1. Group words into line clusters (by Y proximity). - 2. For each cluster compute center_y (median of word vertical centers) - and letter_height (median of word heights). - 3. Compute the pitch (distance between consecutive centers). - 4. Detect section breaks where the gap is >1.8× the median pitch - (headings, sub-headings, paragraph breaks). - 5. Within each section, use the local pitch to place row boundaries - at the midpoints between consecutive centers. - 6. Validate that ≥85% of words land in a grid row; otherwise fall back. - - Header/footer rows from the gap-based detection are preserved. 
- """ - content_rows = [r for r in rows if r.row_type == 'content'] - non_content = [r for r in rows if r.row_type != 'content'] - - if len(content_rows) < 5: - return rows - - # --- Step A: Group ALL words into line clusters --- - # Collect words that belong to content rows (deduplicated) - content_words: List[Dict] = [] - seen_keys: set = set() - for r in content_rows: - for w in r.words: - key = (w['left'], w['top'], w['width'], w['height']) - if key not in seen_keys: - seen_keys.add(key) - content_words.append(w) - - if len(content_words) < 5: - return rows - - # Compute median word height (excluding outliers like tall brackets/IPA) - word_heights = sorted(w['height'] for w in content_words) - median_wh = word_heights[len(word_heights) // 2] - - # Compute median gap-based row height — this is the actual line height - # as detected by the horizontal projection. We use 40% of this as - # grouping tolerance. This is much more reliable than using word height - # alone, because words on the same line can have very different heights - # (e.g. lowercase vs uppercase, brackets, phonetic symbols). - gap_row_heights = sorted(r.height for r in content_rows) - median_row_h = gap_row_heights[len(gap_row_heights) // 2] - - # Tolerance: 40% of row height. Words on the same line should have - # centers within this range. Even if a word's bbox is taller/shorter, - # its center should stay within half a row height of the line center. 
- y_tol = max(10, int(median_row_h * 0.4)) - - # Sort by center_y, then group by proximity - words_by_center = sorted(content_words, - key=lambda w: (w['top'] + w['height'] / 2, w['left'])) - line_clusters: List[List[Dict]] = [] - current_line: List[Dict] = [words_by_center[0]] - current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2 - - for w in words_by_center[1:]: - w_center = w['top'] + w['height'] / 2 - if abs(w_center - current_center) <= y_tol: - current_line.append(w) - else: - current_line.sort(key=lambda w: w['left']) - line_clusters.append(current_line) - current_line = [w] - current_center = w_center - - if current_line: - current_line.sort(key=lambda w: w['left']) - line_clusters.append(current_line) - - if len(line_clusters) < 3: - return rows - - # --- Step B: Compute center_y per cluster --- - # center_y = median of (word_top + word_height/2) across all words in cluster - # letter_h = median of word heights, but excluding outlier-height words - # (>2× median) so that tall brackets/IPA don't skew the height - cluster_info: List[Dict] = [] - for cl_words in line_clusters: - centers = [w['top'] + w['height'] / 2 for w in cl_words] - # Filter outlier heights for letter_h computation - normal_heights = [w['height'] for w in cl_words - if w['height'] <= median_wh * 2.0] - if not normal_heights: - normal_heights = [w['height'] for w in cl_words] - center_y = float(np.median(centers)) - letter_h = float(np.median(normal_heights)) - cluster_info.append({ - 'center_y_rel': center_y, # relative to content ROI - 'center_y_abs': center_y + top_y, # absolute - 'letter_h': letter_h, - 'words': cl_words, - }) - - cluster_info.sort(key=lambda c: c['center_y_rel']) - - # --- Step B2: Merge clusters that are too close together --- - # Even with center-based grouping, some edge cases can produce - # spurious clusters. Merge any pair whose centers are closer - # than 30% of the row height (they're definitely the same text line). 
- merge_threshold = max(8, median_row_h * 0.3) - merged: List[Dict] = [cluster_info[0]] - for cl in cluster_info[1:]: - prev = merged[-1] - if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold: - # Merge: combine words, recompute center - combined_words = prev['words'] + cl['words'] - centers = [w['top'] + w['height'] / 2 for w in combined_words] - normal_heights = [w['height'] for w in combined_words - if w['height'] <= median_wh * 2.0] - if not normal_heights: - normal_heights = [w['height'] for w in combined_words] - prev['center_y_rel'] = float(np.median(centers)) - prev['center_y_abs'] = prev['center_y_rel'] + top_y - prev['letter_h'] = float(np.median(normal_heights)) - prev['words'] = combined_words - else: - merged.append(cl) - - cluster_info = merged - - if len(cluster_info) < 3: - return rows - - # --- Step C: Compute pitches and detect section breaks --- - pitches: List[float] = [] - for i in range(1, len(cluster_info)): - pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] - pitches.append(pitch) - - if not pitches: - return rows - - median_pitch = float(np.median(pitches)) - if median_pitch <= 5: - return rows - - # A section break is where the gap between line centers is much larger - # than the normal pitch (sub-headings, section titles, etc.) 
- BREAK_FACTOR = 1.8 - - # --- Step D: Build sections (groups of consecutive lines with normal spacing) --- - sections: List[List[Dict]] = [] - current_section: List[Dict] = [cluster_info[0]] - - for i in range(1, len(cluster_info)): - gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] - if gap > median_pitch * BREAK_FACTOR: - sections.append(current_section) - current_section = [cluster_info[i]] - else: - current_section.append(cluster_info[i]) - - if current_section: - sections.append(current_section) - - # --- Step E: Build row boundaries per section --- - grid_rows: List[RowGeometry] = [] - - for section in sections: - if not section: - continue - - if len(section) == 1: - # Single-line section (likely a heading) - cl = section[0] - half_h = max(cl['letter_h'], median_pitch * 0.4) - row_top = cl['center_y_abs'] - half_h - row_bot = cl['center_y_abs'] + half_h - grid_rows.append(RowGeometry( - index=0, - x=left_x, - y=round(row_top), - width=content_w, - height=round(row_bot - row_top), - word_count=len(cl['words']), - words=cl['words'], - row_type='content', - gap_before=0, - )) - continue - - # Compute local pitch for this section - local_pitches = [] - for i in range(1, len(section)): - local_pitches.append( - section[i]['center_y_rel'] - section[i - 1]['center_y_rel'] - ) - local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch - - # Row boundaries are placed at midpoints between consecutive centers. 
- # First row: top = center - local_pitch/2 - # Last row: bottom = center + local_pitch/2 - for i, cl in enumerate(section): - if i == 0: - row_top = cl['center_y_abs'] - local_pitch / 2 - else: - # Midpoint between this center and previous center - prev_center = section[i - 1]['center_y_abs'] - row_top = (prev_center + cl['center_y_abs']) / 2 - - if i == len(section) - 1: - row_bot = cl['center_y_abs'] + local_pitch / 2 - else: - next_center = section[i + 1]['center_y_abs'] - row_bot = (cl['center_y_abs'] + next_center) / 2 - - # Clamp to reasonable bounds - row_top = max(top_y, row_top) - row_bot = min(top_y + content_h, row_bot) - - if row_bot - row_top < 5: - continue - - grid_rows.append(RowGeometry( - index=0, - x=left_x, - y=round(row_top), - width=content_w, - height=round(row_bot - row_top), - word_count=len(cl['words']), - words=cl['words'], - row_type='content', - gap_before=0, - )) - - if not grid_rows: - return rows - - # --- Step F: Re-assign words to grid rows --- - # Words may have shifted slightly; assign each word to the row whose - # center is closest to the word's vertical center. 
- for gr in grid_rows: - gr.words = [] - - for w in content_words: - w_center = w['top'] + top_y + w['height'] / 2 - best_row = None - best_dist = float('inf') - for gr in grid_rows: - row_center = gr.y + gr.height / 2 - dist = abs(w_center - row_center) - if dist < best_dist: - best_dist = dist - best_row = gr - if best_row is not None and best_dist < median_pitch: - best_row.words.append(w) - - for gr in grid_rows: - gr.word_count = len(gr.words) - - # --- Step G: Validate --- - words_placed = sum(gr.word_count for gr in grid_rows) - if len(content_words) > 0: - match_ratio = words_placed / len(content_words) - if match_ratio < 0.85: - logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} " - f"of words, keeping gap-based rows") - return rows - - # Remove empty grid rows (no words assigned) - grid_rows = [gr for gr in grid_rows if gr.word_count > 0] - - # --- Step H: Merge header/footer + re-index --- - result = list(non_content) + grid_rows - result.sort(key=lambda r: r.y) - for i, r in enumerate(result): - r.index = i - - row_heights = [gr.height for gr in grid_rows] - min_h = min(row_heights) if row_heights else 0 - max_h = max(row_heights) if row_heights else 0 - logger.info(f"RowGrid: word-center grid applied " - f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, " - f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " - f"{len(sections)} sections, " - f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], " - f"was {len(content_rows)} gap-based rows)") - - return result - - -def _build_rows_from_word_grouping( - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, bottom_y: int, - content_w: int, content_h: int, -) -> List['RowGeometry']: - """Fallback: build rows by grouping words by Y position. - - Uses _group_words_into_lines() with a generous tolerance. - No header/footer detection in fallback mode. 
- """ - if not word_dicts: - return [] - - y_tolerance = max(20, content_h // 100) - lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance) - - rows = [] - for idx, line_words in enumerate(lines): - if not line_words: - continue - min_top = min(w['top'] for w in line_words) - max_bottom = max(w['top'] + w['height'] for w in line_words) - row_height = max_bottom - min_top - - rows.append(RowGeometry( - index=idx, - x=left_x, - y=top_y + min_top, - width=content_w, - height=row_height, - word_count=len(line_words), - words=line_words, - row_type='content', - gap_before=0, - )) - - logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping") - return rows - - -# --- Phase B: Content-Based Classification --- - -def _score_language(words: List[Dict]) -> Dict[str, float]: - """Score the language of a column's words. - - Analyzes function words, umlauts, and capitalization patterns - to determine whether text is English or German. - - Args: - words: List of word dicts with 'text' and 'conf' keys. - - Returns: - Dict with 'eng' and 'deu' scores (0.0-1.0). 
- """ - if not words: - return {'eng': 0.0, 'deu': 0.0} - - # Only consider words with decent confidence - good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] - if not good_words: - return {'eng': 0.0, 'deu': 0.0} - - total = len(good_words) - en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) - de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) - - # Check for umlauts (strong German signal) - raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] - umlaut_count = sum(1 for t in raw_texts - for c in t if c in 'äöüÄÖÜß') - - # German capitalization: nouns are capitalized mid-sentence - # Count words that start with uppercase but aren't at position 0 - cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) - - en_score = en_hits / total if total > 0 else 0.0 - de_score = de_hits / total if total > 0 else 0.0 - - # Boost German score for umlauts - if umlaut_count > 0: - de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) - - # Boost German score for high capitalization ratio (typical for German nouns) - if total > 5: - cap_ratio = cap_words / total - if cap_ratio > 0.3: - de_score = min(1.0, de_score + 0.1) - - return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} - - -def _score_role(geom: ColumnGeometry) -> Dict[str, float]: - """Score the role of a column based on its geometry and content patterns. - - Args: - geom: ColumnGeometry with words and dimensions. - - Returns: - Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. 
- """ - scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} - - if not geom.words: - return scores - - texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] - if not texts: - return scores - - avg_word_len = sum(len(t) for t in texts) / len(texts) - has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) - digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) - digit_ratio = digit_words / len(texts) if texts else 0.0 - - # Reference: narrow + mostly numbers/page references - if geom.width_ratio < 0.12: - scores['reference'] = 0.5 - if digit_ratio > 0.4: - scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) - - # Marker: narrow + few short entries - if geom.width_ratio < 0.06 and geom.word_count <= 15: - scores['marker'] = 0.7 - if avg_word_len < 4: - scores['marker'] = 0.9 - # Very narrow non-edge column → strong marker regardless of word count - if geom.width_ratio < 0.04 and geom.index > 0: - scores['marker'] = max(scores['marker'], 0.9) - - # Sentence: longer words + punctuation present - if geom.width_ratio > 0.15 and has_punctuation > 2: - scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) - if avg_word_len > 4: - scores['sentence'] = min(1.0, scores['sentence'] + 0.2) - - # Vocabulary: medium width + medium word length - if 0.10 < geom.width_ratio < 0.45: - scores['vocabulary'] = 0.4 - if 3 < avg_word_len < 8: - scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) - - return {k: round(v, 3) for k, v in scores.items()} - - -def _build_margin_regions( - all_regions: List[PageRegion], - left_x: int, - right_x: int, - img_w: int, - top_y: int, - content_h: int, -) -> List[PageRegion]: - """Create margin_left / margin_right PageRegions from content bounds. - - Margins represent the space between the image edge and the first/last - content column. They are used downstream for faithful page - reconstruction but are skipped during OCR. 
- """ - margins: List[PageRegion] = [] - # Minimum gap (px) to create a margin region - _min_gap = 5 - - if left_x > _min_gap: - margins.append(PageRegion( - type='margin_left', x=0, y=top_y, - width=left_x, height=content_h, - classification_confidence=1.0, - classification_method='content_bounds', - )) - - # Right margin: from end of last content column to image edge - non_margin = [r for r in all_regions - if r.type not in ('margin_left', 'margin_right', 'header', 'footer', - 'margin_top', 'margin_bottom')] - if non_margin: - last_col_end = max(r.x + r.width for r in non_margin) - else: - last_col_end = right_x - if img_w - last_col_end > _min_gap: - margins.append(PageRegion( - type='margin_right', x=last_col_end, y=top_y, - width=img_w - last_col_end, height=content_h, - classification_confidence=1.0, - classification_method='content_bounds', - )) - - if margins: - logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} " - f"(left_x={left_x}, right_x={right_x}, img_w={img_w})") - - return margins - - -def positional_column_regions( - geometries: List[ColumnGeometry], - content_w: int, - content_h: int, - left_x: int, -) -> List[PageRegion]: - """Classify columns by position only (no language scoring). - - Structural columns (page_ref, column_marker) are identified by geometry. - Remaining content columns are labelled left→right as column_en, column_de, - column_example. The names are purely positional – no language analysis. 
- """ - structural: List[PageRegion] = [] - content_cols: List[ColumnGeometry] = [] - - for g in geometries: - rel_x = g.x - left_x - # page_ref: narrow column in the leftmost 20% region - if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: - structural.append(PageRegion( - type='page_ref', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - # column_marker: very narrow, few words - elif g.width_ratio < 0.06 and g.word_count <= 15: - structural.append(PageRegion( - type='column_marker', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - # empty or near-empty narrow column → treat as margin/structural - elif g.word_count <= 2 and g.width_ratio < 0.15: - structural.append(PageRegion( - type='column_marker', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.85, - classification_method='positional', - )) - else: - content_cols.append(g) - - # Single content column → plain text page - if len(content_cols) == 1: - g = content_cols[0] - return structural + [PageRegion( - type='column_text', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.9, - classification_method='positional', - )] - - # No content columns - if not content_cols: - return structural - - # Sort content columns left→right and assign positional labels - content_cols.sort(key=lambda g: g.x) - - # With exactly 2 content columns: if the left one is very wide (>35%), - # it likely contains EN+DE combined, so the right one is examples. 
- if (len(content_cols) == 2 - and content_cols[0].width_ratio > 0.35 - and content_cols[1].width_ratio > 0.20): - labels = ['column_en', 'column_example'] - else: - labels = ['column_en', 'column_de', 'column_example'] - - regions = list(structural) - for i, g in enumerate(content_cols): - label = labels[i] if i < len(labels) else 'column_example' - regions.append(PageRegion( - type=label, x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - - logger.info(f"PositionalColumns: {len(structural)} structural, " - f"{len(content_cols)} content → " - f"{[r.type for r in regions]}") - return regions - - -def classify_column_types(geometries: List[ColumnGeometry], - content_w: int, - top_y: int, - img_w: int, - img_h: int, - bottom_y: int, - left_x: int = 0, - right_x: int = 0, - inv: Optional[np.ndarray] = None) -> List[PageRegion]: - """Classify column types using a 3-level fallback chain. - - Level 1: Content-based (language + role scoring) - Level 2: Position + language (old rules enhanced with language detection) - Level 3: Pure position (exact old code, no regression) - - Args: - geometries: List of ColumnGeometry from Phase A. - content_w: Total content width. - top_y: Top Y of content area. - img_w: Full image width. - img_h: Full image height. - bottom_y: Bottom Y of content area. - left_x: Left content bound (from _find_content_bounds). - right_x: Right content bound (from _find_content_bounds). - - Returns: - List of PageRegion with types, confidence, and method. 
- """ - content_h = bottom_y - top_y - - def _with_margins(result: List[PageRegion]) -> List[PageRegion]: - """Append margin_left / margin_right regions to *result*.""" - margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h) - return result + margins - - # Special case: single column → plain text page - if len(geometries) == 1: - geom = geometries[0] - return _with_margins([PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=geom.height, - classification_confidence=0.9, - classification_method='content', - )]) - - # --- Pre-filter: first/last columns with very few words → column_ignore --- - # Sub-columns from _detect_sub_columns() are exempt: they intentionally - # have few words (page refs, markers) and should not be discarded. - ignore_regions = [] - active_geometries = [] - for idx, g in enumerate(geometries): - if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column: - ignore_regions.append(PageRegion( - type='column_ignore', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='content', - )) - logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)") - else: - active_geometries.append(g) - - # Re-index active geometries for classification - for new_idx, g in enumerate(active_geometries): - g.index = new_idx - geometries = active_geometries - - # Handle edge case: all columns ignored or only 1 left - if len(geometries) == 0: - return _with_margins(ignore_regions) - if len(geometries) == 1: - geom = geometries[0] - ignore_regions.append(PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=geom.height, - classification_confidence=0.9, - classification_method='content', - )) - return _with_margins(ignore_regions) - - # --- Score all columns --- - lang_scores = [_score_language(g.words) for g in geometries] - role_scores = [_score_role(g) 
for g in geometries] - - logger.info(f"ClassifyColumns: language scores: " - f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}") - logger.info(f"ClassifyColumns: role scores: " - f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}") - - # --- Level 1: Content-based classification --- - regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h) - if regions is not None: - logger.info("ClassifyColumns: Level 1 (content-based) succeeded") - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - # --- Level 2: Position + language enhanced --- - regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h) - if regions is not None: - logger.info("ClassifyColumns: Level 2 (position+language) succeeded") - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - # --- Level 3: Pure position fallback (old code, no regression) --- - logger.info("ClassifyColumns: Level 3 (position fallback)") - regions = _classify_by_position_fallback(geometries, content_w, content_h) - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - -def _classify_by_content(geometries: List[ColumnGeometry], - lang_scores: List[Dict[str, float]], - role_scores: List[Dict[str, float]], - content_w: int, - content_h: int) -> Optional[List[PageRegion]]: - """Level 1: Classify columns purely by content analysis. - - Requires clear language signals to distinguish EN/DE columns. - Returns None if language signals are too weak. 
- """ - regions = [] - assigned = set() - - # Step 1: Assign structural roles first (reference, marker) - # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref - left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0 - - for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)): - is_left_side = geom.x < left_20_threshold - has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3 - if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language: - regions.append(PageRegion( - type='page_ref', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=rs['reference'], - classification_method='content', - )) - assigned.add(i) - elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06: - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=rs['marker'], - classification_method='content', - )) - assigned.add(i) - elif geom.width_ratio < 0.05 and not is_left_side: - # Narrow column on the right side → marker, not page_ref - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.8, - classification_method='content', - )) - assigned.add(i) - - # Step 2: Among remaining columns, find EN and DE by language scores - remaining = [(i, geometries[i], lang_scores[i], role_scores[i]) - for i in range(len(geometries)) if i not in assigned] - - if len(remaining) < 2: - # Not enough columns for EN/DE pair - if len(remaining) == 1: - i, geom, ls, rs = remaining[0] - regions.append(PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.6, - classification_method='content', - )) - regions.sort(key=lambda r: r.x) - return regions - - # Check if we have enough language signal - en_candidates = [(i, g, ls) for i, g, ls, rs in 
remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05] - de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05] - - # Position tiebreaker: when language signals are weak, use left=EN, right=DE - if (not en_candidates or not de_candidates) and len(remaining) >= 2: - max_eng = max(ls['eng'] for _, _, ls, _ in remaining) - max_deu = max(ls['deu'] for _, _, ls, _ in remaining) - if max_eng < 0.15 and max_deu < 0.15: - # Both signals weak — fall back to positional: left=EN, right=DE - sorted_remaining = sorted(remaining, key=lambda x: x[1].x) - best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2]) - best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2]) - logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE") - en_conf = 0.4 - de_conf = 0.4 - - regions.append(PageRegion( - type='column_en', x=best_en[1].x, y=best_en[1].y, - width=best_en[1].width, height=content_h, - classification_confidence=en_conf, - classification_method='content', - )) - assigned.add(best_en[0]) - - regions.append(PageRegion( - type='column_de', x=best_de[1].x, y=best_de[1].y, - width=best_de[1].width, height=content_h, - classification_confidence=de_conf, - classification_method='content', - )) - assigned.add(best_de[0]) - - # Assign remaining as example - for i, geom, ls, rs in remaining: - if i not in assigned: - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.4, - classification_method='content', - )) - regions.sort(key=lambda r: r.x) - return regions - - if not en_candidates or not de_candidates: - # Language signals too weak for content-based classification - logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split") - return None - - # Pick the best EN and DE candidates - best_en = max(en_candidates, key=lambda x: x[2]['eng']) - 
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # — it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            # No sentence signal either — still labelled example, lower confidence.
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Uses the old positional heuristics but confirms EN/DE assignment
    with language scores (swapping if needed).
    """
    regions = []
    untyped = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language)
    g0 = geometries[0]
    ls0 = lang_scores[0]
    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=0.8,
            classification_method='position_enhanced',
        ))
        untyped.remove(0)

    # Rule 2: Narrow columns with few words → marker
    # (iterate over a copy so untyped.remove() is safe)
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.7,
                classification_method='position_enhanced',
            ))
            untyped.remove(i)

    # Rule 3: Rightmost remaining → column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.7,
            classification_method='position_enhanced',
        ))
        untyped.remove(last_idx)

    # Rule 4: First two remaining → EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a = untyped[0]
        idx_b = untyped[1]
        ls_a = lang_scores[idx_a]
        ls_b = lang_scores[idx_b]

        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7

        # Swap if language signals clearly indicate the opposite
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))
        untyped = []

    # Remaining → example
    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Guarantees no regression from the previous behavior.
- """ - regions = [] - untyped = list(range(len(geometries))) - first_x = geometries[0].x if geometries else 0 - left_20_threshold = first_x + content_w * 0.20 - - # Rule 1: Leftmost narrow column → page_ref (only if in left 20%) - g0 = geometries[0] - if g0.width_ratio < 0.12 and g0.x < left_20_threshold: - regions.append(PageRegion( - type='page_ref', x=g0.x, y=g0.y, - width=g0.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(0) - - # Rule 2: Narrow + few words → marker - for i in list(untyped): - geom = geometries[i] - if geom.width_ratio < 0.06 and geom.word_count <= 15: - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(i) - - # Rule 3: Rightmost remaining → example (if 3+) - if len(untyped) >= 3: - last_idx = untyped[-1] - geom = geometries[last_idx] - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(last_idx) - - # Rule 4: First remaining → EN, second → DE - if len(untyped) >= 2: - en_idx = untyped[0] - de_idx = untyped[1] - regions.append(PageRegion( - type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, - width=geometries[en_idx].width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - regions.append(PageRegion( - type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, - width=geometries[de_idx].width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped = untyped[2:] - elif len(untyped) == 1: - idx = untyped[0] - geom = geometries[idx] - regions.append(PageRegion( - type='column_en', x=geom.x, y=geom.y, - width=geom.width, 
height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped = [] - - for idx in untyped: - geom = geometries[idx] - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - - regions.sort(key=lambda r: r.x) - return regions - - -def _detect_header_footer_gaps( - inv: np.ndarray, - img_w: int, - img_h: int, -) -> Tuple[Optional[int], Optional[int]]: - """Detect header/footer boundaries via horizontal projection gap analysis. - - Scans the full-page inverted image for large horizontal gaps in the top/bottom - 20% that separate header/footer content from the main body. - - Returns: - (header_y, footer_y) — absolute y-coordinates. - header_y = bottom edge of header region (None if no header detected). - footer_y = top edge of footer region (None if no footer detected). - """ - HEADER_FOOTER_ZONE = 0.20 - GAP_MULTIPLIER = 2.0 - - # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding - actual_h = min(inv.shape[0], img_h) - roi = inv[:actual_h, :] - h_proj = np.sum(roi, axis=1).astype(float) - proj_w = roi.shape[1] - h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj - - # Step 2: Smoothing - kernel_size = max(3, actual_h // 200) - if kernel_size % 2 == 0: - kernel_size += 1 - h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # Step 3: Gap threshold - positive = h_smooth[h_smooth > 0] - median_density = float(np.median(positive)) if len(positive) > 0 else 0.01 - gap_threshold = max(median_density * 0.15, 0.003) - - in_gap = h_smooth < gap_threshold - MIN_GAP_HEIGHT = max(3, actual_h // 500) - - # Step 4: Collect contiguous gaps - raw_gaps: List[Tuple[int, int]] = [] - gap_start: Optional[int] = None - for y in range(len(in_gap)): - if in_gap[y]: - if gap_start is None: - gap_start = y - else: - if gap_start is not 
None: - gap_height = y - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, y)) - gap_start = None - if gap_start is not None: - gap_height = len(in_gap) - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, len(in_gap))) - - if not raw_gaps: - return None, None - - # Step 5: Compute median gap size and large-gap threshold - gap_sizes = [g[1] - g[0] for g in raw_gaps] - median_gap = float(np.median(gap_sizes)) - large_gap_threshold = median_gap * GAP_MULTIPLIER - - # Step 6: Find largest qualifying gap in header / footer zones - # A separator gap must have content on BOTH sides — edge-touching gaps - # (e.g. dewarp padding at bottom) are not valid separators. - EDGE_MARGIN = max(5, actual_h // 400) - header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) - footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) - - header_y: Optional[int] = None - footer_y: Optional[int] = None - - best_header_size = 0 - for gs, ge in raw_gaps: - if gs <= EDGE_MARGIN: - continue # skip gaps touching the top edge - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid < header_zone_limit and gap_size > large_gap_threshold: - if gap_size > best_header_size: - best_header_size = gap_size - header_y = ge # bottom edge of gap - - best_footer_size = 0 - for gs, ge in raw_gaps: - if ge >= actual_h - EDGE_MARGIN: - continue # skip gaps touching the bottom edge - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid > footer_zone_start and gap_size > large_gap_threshold: - if gap_size > best_footer_size: - best_footer_size = gap_size - footer_y = gs # top edge of gap - - if header_y is not None: - logger.info(f"HeaderFooterGaps: header boundary at y={header_y} " - f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)") - if footer_y is not None: - logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} " - f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)") - - return header_y, footer_y - - -def 
_region_has_content(inv: np.ndarray, y_start: int, y_end: int, - min_density: float = 0.005) -> bool: - """Check whether a horizontal strip contains meaningful ink. - - Args: - inv: Inverted binarized image (white-on-black). - y_start: Top of the region (inclusive). - y_end: Bottom of the region (exclusive). - min_density: Fraction of white pixels required to count as content. - - Returns: - True if the region contains text/graphics, False if empty margin. - """ - if y_start >= y_end: - return False - strip = inv[y_start:y_end, :] - density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255) - return density > min_density - - -def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, - img_w: int, img_h: int, - inv: Optional[np.ndarray] = None) -> None: - """Add header/footer/margin regions in-place. - - Uses gap-based detection when *inv* is provided, otherwise falls back - to simple top_y/bottom_y bounds. - - Region types depend on whether there is actual content (text/graphics): - - 'header' / 'footer' — region contains text (e.g. 
title, page number) - - 'margin_top' / 'margin_bottom' — region is empty page margin - """ - header_y: Optional[int] = None - footer_y: Optional[int] = None - - if inv is not None: - header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h) - - # --- Top region --- - top_boundary = header_y if header_y is not None and header_y > 10 else ( - top_y if top_y > 10 else None - ) - if top_boundary is not None: - has_content = inv is not None and _region_has_content(inv, 0, top_boundary) - rtype = 'header' if has_content else 'margin_top' - regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)) - logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px " - f"(has_content={has_content})") - - # --- Bottom region --- - bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else ( - bottom_y if bottom_y < img_h - 10 else None - ) - if bottom_boundary is not None: - has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h) - rtype = 'footer' if has_content else 'margin_bottom' - regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w, - height=img_h - bottom_boundary)) - logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} " - f"height={img_h - bottom_boundary}px (has_content={has_content})") - - -# --- Main Entry Point --- - -def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: - """Detect columns using two-phase approach: geometry then content classification. - - Phase A: detect_column_geometry() — clustering word positions into columns. - Phase B: classify_column_types() — content-based type assignment with fallback. - - Falls back to projection-based analyze_layout() if geometry detection fails. - - Args: - ocr_img: Binarized grayscale image for layout analysis. - dewarped_bgr: Original BGR image (for Tesseract word detection). 
    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    h, w = ocr_img.shape[:2]

    # Phase A: Geometry detection
    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Fallback to projection-based layout
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Detect header/footer early so sub-column clustering ignores them
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Split sub-columns (e.g. page references) before classification
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Split broad columns that contain EN+DE mixed via word-coverage gaps
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: Positional classification (no language scoring)
    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions


# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================

def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
"""Group OCR words into visual lines in reading order. - - Returns a list of line strings (one per visual line in the cell). - """ - if not words: - return [] - - lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px) - return [' '.join(w['text'] for w in line) for line in lines] - - -def _rejoin_hyphenated(lines: List[str]) -> List[str]: - """Rejoin words split by line-break hyphenation. - - E.g. ['Fuß-', 'boden'] → ['Fußboden'] - ['some text-', 'thing here'] → ['something here'] - """ - if len(lines) <= 1: - return lines - - result = [] - i = 0 - while i < len(lines): - line = lines[i] - # If line ends with '-' and there's a next line, rejoin - if i + 1 < len(lines) and line.rstrip().endswith('-'): - stripped = line.rstrip() - # Get the word fragment before hyphen (last word) - prefix = stripped[:-1] # remove trailing hyphen - next_line = lines[i + 1] - # Join: last word of this line + first word of next line - prefix_words = prefix.rsplit(' ', 1) - next_words = next_line.split(' ', 1) - if len(prefix_words) > 1: - joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] - else: - joined = prefix_words[0] + next_words[0] - remainder = next_words[1] if len(next_words) > 1 else '' - if remainder: - result.append(joined + ' ' + remainder) - else: - result.append(joined) - i += 2 - else: - result.append(line) - i += 1 - return result - - -def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: - """Join OCR words into text in correct reading order, preserving line breaks. - - Groups words into visual lines by Y-tolerance, sorts each line by X, - rejoins hyphenated words, then joins lines with newlines. 
- """ - lines = _words_to_reading_order_lines(words, y_tolerance_px) - lines = _rejoin_hyphenated(lines) - return '\n'.join(lines) - - -# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) --- - -_rapid_engine = None -RAPIDOCR_AVAILABLE = False - -try: - from rapidocr import RapidOCR as _RapidOCRClass - from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType - RAPIDOCR_AVAILABLE = True - logger.info("RapidOCR available — can be used as alternative to Tesseract") -except ImportError: - logger.info("RapidOCR not installed — using Tesseract only") - - -def _get_rapid_engine(): - """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.""" - global _rapid_engine - if _rapid_engine is None: - _rapid_engine = _RapidOCRClass(params={ - # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß) - "Rec.lang_type": _LangRec.LATIN, - "Rec.model_type": _ModelType.SERVER, - "Rec.ocr_version": _OCRVersion.PPOCRV5, - # Tighter detection boxes to reduce word merging - "Det.unclip_ratio": 1.3, - # Lower threshold to detect small chars (periods, ellipsis, phonetics) - "Det.box_thresh": 0.4, - # Silence verbose logging - "Global.log_level": "critical", - }) - logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)") - return _rapid_engine - - -def ocr_region_rapid( - img_bgr: np.ndarray, - region: PageRegion, -) -> List[Dict[str, Any]]: - """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format. - - Args: - img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray). - region: Region to crop and OCR. - - Returns: - List of word dicts with text, left, top, width, height, conf, region_type. 
- """ - engine = _get_rapid_engine() - - # Crop region from BGR image - crop = img_bgr[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - result = engine(crop) - - if result is None or result.boxes is None or result.txts is None: - return [] - - words = [] - boxes = result.boxes # shape (N, 4, 2) — 4 corner points per text line - txts = result.txts # tuple of strings - scores = result.scores # tuple of floats - - for i, (box, txt, score) in enumerate(zip(boxes, txts, scores)): - if not txt or not txt.strip(): - continue - - # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left) - xs = [p[0] for p in box] - ys = [p[1] for p in box] - left = int(min(xs)) - top = int(min(ys)) - w = int(max(xs) - left) - h = int(max(ys) - top) - - words.append({ - 'text': txt.strip(), - 'left': left + region.x, # Absolute coords - 'top': top + region.y, - 'width': w, - 'height': h, - 'conf': int(score * 100), # 0-100 like Tesseract - 'region_type': region.type, - }) - - return words - - -def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]: - """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid). - - Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation. - Bboxes are approximated from equal line-height distribution within the region. - Falls back to Tesseract if TrOCR is not available. 
- """ - from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available - - if not _check_trocr_available(): - logger.warning("TrOCR not available, falling back to Tesseract") - if region.height > 0 and region.width > 0: - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None - if ocr_img_crop is not None: - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - return [] - - crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] - if crop.size == 0: - return [] - - try: - import torch - from PIL import Image as _PILImage - - processor, model = get_trocr_model(handwritten=handwritten) - if processor is None or model is None: - logger.warning("TrOCR model not loaded, falling back to Tesseract") - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - - pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) - lines = _split_into_lines(pil_crop) - if not lines: - lines = [pil_crop] - - device = next(model.parameters()).device - all_text = [] - confidences = [] - for line_img in lines: - pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device) - with torch.no_grad(): - generated_ids = model.generate(pixel_values, max_length=128) - text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() - if text_line: - all_text.append(text_line) - confidences.append(0.85 if len(text_line) > 3 else 0.5) - - if not all_text: - return [] - - avg_conf = int(sum(confidences) / len(confidences) * 100) - line_h = region.height // max(len(all_text), 1) - words = [] - for i, line in enumerate(all_text): - words.append({ - "text": line, - "left": region.x, - "top": region.y + i * line_h, - "width": region.width, - "height": line_h, - "conf": avg_conf, - "region_type": region.type, - }) - return words - - except Exception as e: - 
logger.error(f"ocr_region_trocr failed: {e}") - return [] - - -def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]: - """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid). - - Falls back to RapidOCR or Tesseract if LightOnOCR is not available. - """ - from services.lighton_ocr_service import get_lighton_model, _check_lighton_available - - if not _check_lighton_available(): - logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract") - if RAPIDOCR_AVAILABLE and img_bgr is not None: - return ocr_region_rapid(img_bgr, region) - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else [] - - crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] - if crop.size == 0: - return [] - - try: - import io - import torch - from PIL import Image as _PILImage - - processor, model = get_lighton_model() - if processor is None or model is None: - logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract") - if RAPIDOCR_AVAILABLE and img_bgr is not None: - return ocr_region_rapid(img_bgr, region) - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - - pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) - conversation = [{"role": "user", "content": [{"type": "image"}]}] - inputs = processor.apply_chat_template( - conversation, images=[pil_crop], - add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - - with torch.no_grad(): - output_ids = model.generate(**inputs, max_new_tokens=1024) - - text = processor.decode(output_ids[0], skip_special_tokens=True).strip() - if not text: - return [] - - lines = [l.strip() for l in text.split("\n") if l.strip()] - line_h = region.height // max(len(lines), 1) - 
words = [] - for i, line in enumerate(lines): - words.append({ - "text": line, - "left": region.x, - "top": region.y + i * line_h, - "width": region.width, - "height": line_h, - "conf": 85, - "region_type": region.type, - }) - return words - - except Exception as e: - logger.error(f"ocr_region_lighton failed: {e}") - return [] - - -# ============================================================================= -# Post-Processing: Deterministic Quality Fixes -# ============================================================================= - -# --- A. Character Confusion Fix (I/1/l) --- - -# Common OCR confusion pairs in vocabulary context -_CHAR_CONFUSION_RULES = [ - # "1" at word start followed by lowercase → likely "I" or "l" - # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3") - (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant - # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number) - (re.compile(r'(? List[Dict[str, Any]]: - """Fix common OCR character confusions using context. - - Deterministic rules: - - "1" at word start → "I" or "l" based on context - - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I" - - "y " artifact at word boundaries → remove (e.g. 
"y you" → "you") - """ - for entry in entries: - en = entry.get('english', '') or '' - de = entry.get('german', '') or '' - ex = entry.get('example', '') or '' - - # Apply general rules to all fields - for pattern, replacement in _CHAR_CONFUSION_RULES: - en = pattern.sub(replacement, en) - de = pattern.sub(replacement, de) - ex = pattern.sub(replacement, ex) - - # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I" - de_lower_words = set(de.lower().replace(',', ' ').split()) - if de_lower_words & _DE_INDICATORS_FOR_EN_I: - # Any remaining "1" in EN that looks like "I" - en = re.sub(r'\b1\b(?![\d.,])', 'I', en) - - # Fix "y " artifact before repeated word: "y you" → "you" - en = re.sub(r'\by\s+([a-z])', r'\1', en) - ex = re.sub(r'\by\s+([a-z])', r'\1', ex) - - entry['english'] = en.strip() - entry['german'] = de.strip() - entry['example'] = ex.strip() - - return entries - - -# --- B. Comma-Separated Word Form Splitting --- - -def _is_singular_plural_pair(parts: List[str]) -> bool: - """Detect if comma-separated parts are singular/plural forms of the same word. - - E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split). - "break, broke, broken" → False (different verb forms, OK to split). - - Heuristic: exactly 2 parts that share a common prefix of >= 50% length, - OR one part is a known plural suffix of the other (e.g. +s, +es, +en). - """ - if len(parts) != 2: - return False - - a, b = parts[0].lower().strip(), parts[1].lower().strip() - if not a or not b: - return False - - # Common prefix heuristic: if words share >= 50% of the shorter word, - # they are likely forms of the same word (Maus/Mäuse, child/children). 
- min_len = min(len(a), len(b)) - common = 0 - for ca, cb in zip(a, b): - if ca == cb: - common += 1 - else: - break - if common >= max(2, min_len * 0.5): - return True - - # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü) - umlaut_map = str.maketrans('aou', 'äöü') - if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a: - return True - - return False - - -def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Split entries with comma-separated word forms into individual entries. - - E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen" - → 3 entries: break/brechen, broke/brach, broken/gebrochen - - Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse" - because those are forms of the same vocabulary entry. - - Only splits when both EN and DE have the same number of comma-parts, - parts are short (word forms, not sentences), and at least 3 parts - (to avoid splitting pairs that likely belong together). - """ - result: List[Dict[str, Any]] = [] - - for entry in entries: - en = (entry.get('english', '') or '').strip() - de = (entry.get('german', '') or '').strip() - - # Split by comma (but not inside brackets or parentheses) - en_parts = _split_by_comma(en) - de_parts = _split_by_comma(de) - - # Only split if we have multiple parts and counts match - should_split = False - if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts): - # All parts must be short (word forms, not sentences) - if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts): - # Do NOT split singular/plural pairs (2 parts that are - # forms of the same word) - if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts): - should_split = False - else: - should_split = True - - if not should_split: - result.append(entry) - continue - - # Split into individual entries - for k in range(len(en_parts)): - sub = dict(entry) # shallow copy - sub['english'] = 
en_parts[k].strip() - sub['german'] = de_parts[k].strip() if k < len(de_parts) else '' - sub['example'] = '' # examples get attached later - sub['split_from_comma'] = True - result.append(sub) - - # Re-number - for i, e in enumerate(result): - e['row_index'] = i - - return result - - -def _split_by_comma(text: str) -> List[str]: - """Split text by commas, but not inside brackets [...] or parens (...).""" - if ',' not in text: - return [text] - - parts = [] - depth_bracket = 0 - depth_paren = 0 - current = [] - - for ch in text: - if ch == '[': - depth_bracket += 1 - elif ch == ']': - depth_bracket = max(0, depth_bracket - 1) - elif ch == '(': - depth_paren += 1 - elif ch == ')': - depth_paren = max(0, depth_paren - 1) - elif ch == ',' and depth_bracket == 0 and depth_paren == 0: - parts.append(''.join(current).strip()) - current = [] - continue - current.append(ch) - - if current: - parts.append(''.join(current).strip()) - - # Filter empty parts - return [p for p in parts if p] - - -# --- C. Example Sentence Attachment --- - -def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int: - """Find the vocab entry whose English word(s) best match the example sentence. - - Returns index into vocab_entries, or -1 if no match found. - Uses word stem overlap: "a broken arm" matches "broken" or "break". - """ - if not vocab_entries or not example_text: - return -1 - - example_lower = example_text.lower() - example_words = set(re.findall(r'[a-zäöüß]+', example_lower)) - - best_idx = -1 - best_score = 0 - - for i, entry in enumerate(vocab_entries): - en = (entry.get('english', '') or '').lower() - if not en: - continue - - # Extract vocab words (split on space, comma, newline) - vocab_words = set(re.findall(r'[a-zäöüß]+', en)) - - # Score: how many vocab words appear in the example? 
- # Also check if example words share a common stem (first 4 chars) - direct_matches = vocab_words & example_words - score = len(direct_matches) * 10 - - # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre" - if score == 0: - for vw in vocab_words: - if len(vw) < 3: - continue - stem = vw[:4] if len(vw) >= 4 else vw[:3] - for ew in example_words: - if len(ew) >= len(stem) and ew[:len(stem)] == stem: - score += 5 - break - - if score > best_score: - best_score = score - best_idx = i - - return best_idx if best_score > 0 else -1 - - -def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Attach rows with EN text but no DE translation as examples to matching vocab entries. - - Vocabulary worksheets often have: - Row 1: break, broke, broken / brechen, brach, gebrochen - Row 2: a broken arm (no DE → example for "broken") - Row 3: a broken plate (no DE → example for "broken") - Row 4: egg / Ei (has DE → new vocab entry) - - Rules (deterministic, generic): - - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars) - - Find the best matching vocab entry by checking which entry's English words - appear in the example sentence (semantic matching via word overlap) - - Fall back to the nearest preceding entry if no word match found - - Multiple examples get joined with " | " - """ - if not entries: - return entries - - # Separate into vocab entries (have DE) and example candidates (no DE) - vocab_entries: List[Dict[str, Any]] = [] - examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts - - for entry in entries: - en = (entry.get('english', '') or '').strip() - de = (entry.get('german', '') or '').strip() - ex = (entry.get('example', '') or '').strip() - - # Treat single-char DE as OCR noise, not real translation. - # "Ei" (2 chars) is a valid German word, so threshold is 1. 
- has_de = len(de) > 1 - has_en = bool(en) - - # Heuristic: a row without DE is an "example sentence" only if - # the EN text looks like a sentence (>= 4 words, or contains - # typical sentence punctuation). Short EN text (1-3 words) is - # more likely a vocab entry whose DE was missed by OCR. - _looks_like_sentence = ( - len(en.split()) >= 4 - or en.rstrip().endswith(('.', '!', '?')) - ) - is_example_candidate = ( - has_en and not has_de and _looks_like_sentence and vocab_entries - ) - - if is_example_candidate: - # This is an example sentence — find best matching vocab entry - example_text = en - - match_idx = _find_best_vocab_match(en, vocab_entries) - if match_idx < 0: - # No word match → fall back to last entry - match_idx = len(vocab_entries) - 1 - - if match_idx not in examples_for: - examples_for[match_idx] = [] - examples_for[match_idx].append(example_text) - else: - vocab_entries.append(entry) - - # Attach examples to their matched vocab entries - for idx, example_list in examples_for.items(): - if 0 <= idx < len(vocab_entries): - entry = vocab_entries[idx] - existing_ex = (entry.get('example', '') or '').strip() - new_examples = ' | '.join(example_list) - entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples - - # Re-number - for i, e in enumerate(vocab_entries): - e['row_index'] = i - - return vocab_entries - - -# --- D. Phonetic Bracket IPA Replacement --- - -# Pattern: word followed by any bracket type containing phonetic content. -# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc. -# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs. -# This intentionally matches mixed brackets (e.g. {content]) because -# Tesseract frequently misrecognizes bracket characters. 
-_PHONETIC_BRACKET_RE = re.compile( - r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]' +from cv_vocab_types import * # noqa: F401,F403 +from cv_preprocessing import * # noqa: F401,F403 +from cv_layout import * # noqa: F401,F403 +from cv_ocr_engines import * # noqa: F401,F403 +from cv_cell_grid import * # noqa: F401,F403 +from cv_review import * # noqa: F401,F403 + +# Private names used by consumers — not covered by wildcard re-exports. +from cv_preprocessing import _apply_shear # noqa: F401 +from cv_layout import ( # noqa: F401 + _detect_header_footer_gaps, + _detect_sub_columns, + _split_broad_columns, ) - -# Unicode IPA characters — used to distinguish correct IPA (from dictionary -# lookup) from garbled OCR content when stripping orphan brackets. -_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ') - -# Minimum word confidence for full-page Tesseract results (0-100). -# Words below this threshold are OCR noise (scanner shadows, borders). -_MIN_WORD_CONF = 30 - - -def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]: - """Look up IPA for a word using the selected pronunciation dictionary. - - Args: - word: English word to look up. - pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT). - - Returns: - IPA string or None if not found. 
- """ - word_lower = word.lower().strip() - if not word_lower: - return None - - if pronunciation == 'british' and _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - # Fallback to American if not in Britfone - if _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - return None - - if pronunciation == 'american' and _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - # Fallback to Britfone if not in CMU - if _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - return None - - # Try any available source - if _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - if _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - - return None - - -def _fix_phonetic_brackets( - entries: List[Dict[str, Any]], - pronunciation: str = 'british', -) -> List[Dict[str, Any]]: - """Replace OCR'd phonetic transcriptions with dictionary IPA. - - Detects patterns like "dance [du:ns]" and replaces with correct IPA: - - British: "dance [dˈɑːns]" (Britfone, MIT) - - American: "dance [dæns]" (eng_to_ipa/CMU, MIT) - - Only replaces if the word before brackets is found in the dictionary. - """ - if not IPA_AVAILABLE: - return entries - - # IPA phonetics only appear in the ENGLISH field of vocab tables. - # German and example fields contain meaningful parenthetical content: - # german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)" - # example: "(sich beschweren)", "(brauchen)", "(jammern)" - # These must NEVER be processed as phonetic transcriptions. 
- replaced_count = 0 - for entry in entries: - text = entry.get('english', '') or '' - if not any(ch in text for ch in '[{('): - continue - new_text = _replace_phonetics_in_text(text, pronunciation) - if new_text != text: - logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'") - replaced_count += 1 - entry['english'] = new_text - - if replaced_count: - logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries") - return entries - - -# Grammar particles that appear in brackets after English words: -# cross (with), complain (about/of), agree (on/with), look (sth) up -# These must NOT be replaced with IPA. Only used for the English field -# (German/example fields are never processed for IPA replacement). -_GRAMMAR_BRACKET_WORDS = frozenset({ - # English prepositions/particles commonly in vocab tables - 'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by', - 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through', - # English grammar abbreviations used in vocab tables - 'sth', 'sb', 'adj', 'adv', -}) - - -def _is_grammar_bracket_content(content: str) -> bool: - """Return True if bracket content is grammar info in the ENGLISH field. - - Grammar info: cross (with), complain (about/of), agree (on/with) - NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test] - - Since we only process the English field, we only need to recognize - English grammar particles. Everything else is (garbled) IPA. - """ - if not content: - return False - - # Split on / for patterns like (about/of), (on/with) - tokens = [t.strip().lower() for t in content.split('/') if t.strip()] - if not tokens: - return False - - # ALL tokens must be known grammar words - return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens) - - -def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: - """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA. 
- - Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno]. - We match any bracket type and replace with dictionary IPA if found. - Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved. - """ - if not IPA_AVAILABLE: - return text - - def replacer(match): - word = match.group(1) - bracket_content = match.group(2).strip() - full_match = match.group(0) - - # Skip if bracket content looks like regular text (multiple words) - if len(bracket_content.split()) > 3: - return full_match - - # Look up IPA for the word before brackets - ipa = _lookup_ipa(word, pronunciation) - - if ipa: - # Word has IPA → bracket content is phonetic (garbled or correct). - # Exception: grammar particles like cross (with) — keep those. - if _is_grammar_bracket_content(bracket_content): - return full_match - logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'") - return f"{word} [{ipa}]" - - # No IPA for this word — keep as-is - return full_match - - text = _PHONETIC_BRACKET_RE.sub(replacer, text) - - # Second pass: strip remaining orphan brackets that are garbled IPA. - # These have no word before them (the main regex requires \b word \s* bracket). - # Examples: "[mais]", "{'mani setva]", trailing "(kros]" - # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" - def _strip_orphan_bracket(m): - content = m.group(1).strip() - # Keep grammar info: (sich beschweren), (about/of) - if _is_grammar_bracket_content(content): - return m.group(0) - # Keep correct IPA (contains Unicode IPA characters) - if any(ch in _IPA_CHARS for ch in content): - return m.group(0) - logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") - return '' - - text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) - text = text.strip() - - return text - - -def _assign_row_words_to_columns( - row: RowGeometry, - columns: List[PageRegion], -) -> Dict[int, List[Dict]]: - """Assign each word in a row to exactly one column. 
- - Uses a two-pass strategy: - 1. Containment: if a word's center falls within a column's horizontal - bounds (with padding), assign it to that column. - 2. Nearest center: for words not contained by any column, fall back to - nearest column center distance. - - This prevents long sentences in wide columns (e.g. example) from having - their rightmost words stolen by an adjacent column. - - Args: - row: Row with words (relative coordinates). - columns: Sorted list of columns (absolute coordinates). - - Returns: - Dict mapping col_index → list of words assigned to that column. - """ - result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))} - - if not row.words or not columns: - return result - - left_x = row.x # content ROI left (absolute) - - # Build non-overlapping column assignment ranges using midpoints. - # For adjacent columns, the boundary is the midpoint between them. - # This prevents words near column borders from being assigned to - # the wrong column (e.g. "We" at the start of an example sentence - # being stolen by the preceding DE column). - n = len(columns) - col_ranges_rel = [] # (assign_left, assign_right) per column - for ci, col in enumerate(columns): - col_left_rel = col.x - left_x - col_right_rel = col_left_rel + col.width - - # Left boundary: midpoint to previous column, or 0 - if ci == 0: - assign_left = 0 - else: - prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width - assign_left = (prev_right + col_left_rel) / 2 - - # Right boundary: midpoint to next column, or infinity (row width) - if ci == n - 1: - assign_right = row.width + 100 # generous for last column - else: - next_left = columns[ci + 1].x - left_x - assign_right = (col_right_rel + next_left) / 2 - - col_ranges_rel.append((assign_left, assign_right)) - - for w in row.words: - w_left = w['left'] - w_right = w_left + w['width'] - w_center_x = w_left + w['width'] / 2 - - # Primary: overlap-based matching — assign to column with most overlap. 
- # This is more robust than center-based for narrow columns (page_ref) - # where the last character's center may fall into the next column. - best_col = -1 - best_overlap = 0 - for ci, col in enumerate(columns): - col_left_rel = col.x - left_x - col_right_rel = col_left_rel + col.width - overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel)) - if overlap > best_overlap: - best_overlap = overlap - best_col = ci - - if best_col >= 0 and best_overlap > 0: - result[best_col].append(w) - else: - # Fallback: center-based range matching - assigned = False - for ci, (al, ar) in enumerate(col_ranges_rel): - if al <= w_center_x < ar: - result[ci].append(w) - assigned = True - break - - if not assigned: - # Last resort: nearest column center - best_col = 0 - col_left_0 = columns[0].x - left_x - best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2)) - for ci in range(1, n): - col_left = columns[ci].x - left_x - dist = abs(w_center_x - (col_left + columns[ci].width / 2)) - if dist < best_dist: - best_dist = dist - best_col = ci - result[best_col].append(w) - - return result - - -# Regex: at least 2 consecutive letters (Latin + umlauts + accents) -_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}') -_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]') - -# Common short EN/DE words (2-3 chars). Tokens at the end of a cell -# that do NOT appear here are treated as trailing OCR noise. 
-_COMMON_SHORT_WORDS: set = { - # EN 1-2 letter - 'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', - 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on', - 'or', 'so', 'to', 'up', 'us', 'we', - # EN 3 letter - 'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all', - 'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art', - 'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay', - 'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy', - 'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap', - 'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad', - 'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip', - 'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel', - 'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far', - 'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit', - 'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur', - 'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut', - 'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her', - 'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how', - 'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink', - 'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet', - 'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit', - 'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let', - 'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man', - 'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob', - 'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag', - 'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut', - 'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one', - 'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad', - 'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per', - 'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot', - 'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram', - 'ran', 
'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid', - 'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub', - 'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap', - 'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin', - 'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob', - 'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty', - 'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan', - 'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip', - 'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug', - 'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim', - 'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet', - 'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo', - 'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you', - 'zap', 'zip', 'zoo', - # DE 2-3 letter - 'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu', - 'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem', - 'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar', - 'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist', - 'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun', - 'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag', - 'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von', - 'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir', - 'wut', 'zum', 'zur', -} - -# Known abbreviations found in EN/DE textbooks and dictionaries. -# Stored WITHOUT trailing period (the noise filter strips periods). -# These rescue tokens like "sth." / "sb." / "usw." from being deleted. 
-_KNOWN_ABBREVIATIONS: set = { - # EN dictionary meta-words - 'sth', 'sb', 'smth', 'smb', 'sbd', - # EN general - 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', - 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', - # EN references / textbook - 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', - 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', - 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', - 'ans', 'wb', 'tb', 'vocab', - # EN parts of speech / grammar - 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', - 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', - 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', - 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', - 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', - 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', - 'syn', 'ant', 'opp', 'var', 'orig', - # EN titles - 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', - # EN pronunciation - 'br', 'am', 'brit', 'amer', - # EN units - 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', - # DE general - 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', - 'bes', 'insb', 'insbes', 'bspw', 'ca', - 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', - 'inkl', 'exkl', 'zzgl', 'abzgl', - # DE references - 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', - 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', - 's', 'sp', 'zit', 'zs', 'vlg', - # DE grammar - 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj', - 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', - 'trennb', 'untrennb', 'ugs', 'geh', 'pej', - # DE regional - 'nordd', 'österr', 'schweiz', - # Linguistic - 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', - 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', - 'count', 'uncount', 'indef', 'def', 'poss', 'demon', -} - - -def _is_noise_tail_token(token: str) -> bool: - """Check if a token at the END of cell text is trailing OCR noise. 
- - Trailing fragments are very common OCR artifacts from image edges, - borders, and neighbouring cells. This is more aggressive than a - general word filter: any short token that isn't in the dictionary - of common EN/DE words is considered noise. - - Examples of noise: "Es)", "3", "ee", "B" - Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]" - """ - t = token.strip() - if not t: - return True - - # Keep ellipsis - if t in ('...', '…'): - return False - - # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc. - if t.startswith('[') or t.startswith('["') or t.startswith("['"): - return False - if t.endswith(']'): - return False - - # Pure non-alpha → noise ("3", ")", "|") - alpha_chars = _RE_ALPHA.findall(t) - if not alpha_chars: - return True - - # Extract only alpha characters for dictionary lookup - cleaned = ''.join(alpha_chars) - - # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep - if cleaned.lower() in _KNOWN_ABBREVIATIONS: - return False - - # Strip normal trailing punctuation before checking for internal noise. - stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" - t_check = stripped_punct if stripped_punct else t - - # Check for legitimate punctuation patterns vs. real noise. - # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir", - # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen" - # Noise: "3d", "B|", "x7" - # Strategy: strip common dictionary punctuation (parens, hyphens, slashes), - # THEN check if residual contains only alpha characters. - t_inner = t_check - # Remove all parentheses, hyphens, slashes, and dots — these are normal - # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)", - # "(zer)brechen", "wir/uns", "e.g." - t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner) - # Now check: does the inner form still have non-alpha noise? 
- inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) - has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False - - # Long alpha words (4+ chars) without internal noise are likely real - if len(cleaned) >= 4 and not has_internal_noise: - return False - - # Short words: check dictionary (uses only alpha chars) - if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise: - return False - - # Default: short or suspicious → noise - return True - - -def _is_garbage_text(text: str) -> bool: - """Check if entire cell text is OCR garbage from image areas. - - Garbage text = no recognizable dictionary word. Catches - "(ci]oeu", "uanoaain." etc. - """ - words = _RE_REAL_WORD.findall(text) - if not words: - # Check if any token is a known abbreviation (e.g. "e.g.") - alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() - if alpha_only in _KNOWN_ABBREVIATIONS: - return False - return True - - for w in words: - wl = w.lower() - # Known short word or abbreviation → not garbage - if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: - return False - # Long word (>= 4 chars): check vowel/consonant ratio. - # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" - # or "cioeu" has unusual ratios (too many or too few vowels). - if len(wl) >= 4: - vowels = sum(1 for c in wl if c in 'aeiouäöü') - ratio = vowels / len(wl) - if 0.15 <= ratio <= 0.65: - return False # plausible vowel ratio → real word - - return True - - -def _clean_cell_text(text: str) -> str: - """Remove OCR noise from cell text. Generic filters: - - 1. If the entire text has no real alphabetic word (>= 2 letters), clear. - 2. If the entire text is garbage (no dictionary word), clear. - 3. Strip trailing noise tokens from the end of the text. - """ - stripped = text.strip() - if not stripped: - return '' - - # --- Filter 1: No real word at all --- - if not _RE_REAL_WORD.search(stripped): - # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." 
- alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() - if alpha_only not in _KNOWN_ABBREVIATIONS: - return '' - - # --- Filter 2: Entire text is garbage --- - if _is_garbage_text(stripped): - return '' - - # --- Filter 3: Strip trailing noise tokens --- - tokens = stripped.split() - while tokens and _is_noise_tail_token(tokens[-1]): - tokens.pop() - if not tokens: - return '' - - return ' '.join(tokens) - - -def _clean_cell_text_lite(text: str) -> str: - """Simplified noise filter for cell-first OCR (isolated cell crops). - - Since each cell is OCR'd in isolation (no neighbour content visible), - trailing-noise stripping is unnecessary. Only 2 filters remain: - - 1. No real alphabetic word (>= 2 letters) and not a known abbreviation → empty. - 2. Entire text is garbage (no dictionary word) → empty. - """ - stripped = text.strip() - if not stripped: - return '' - - # --- Filter 1: No real word at all --- - if not _RE_REAL_WORD.search(stripped): - alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() - if alpha_only not in _KNOWN_ABBREVIATIONS: - return '' - - # --- Filter 2: Entire text is garbage --- - if _is_garbage_text(stripped): - return '' - - return stripped - - -# --------------------------------------------------------------------------- -# Bold detection via stroke-width analysis (relative / page-level) -# --------------------------------------------------------------------------- - -def _measure_stroke_width(gray_crop: np.ndarray) -> float: - """Measure mean stroke width in a binarised cell crop. - - Returns a DPI-normalised value (mean stroke width as % of crop height), - or 0.0 if measurement is not possible. 
- """ - if gray_crop is None or gray_crop.size == 0: - return 0.0 - h, w = gray_crop.shape[:2] - if h < 10 or w < 10: - return 0.0 - - # Binarise: text = white (255), background = black (0) - _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) - if cv2.countNonZero(bw) < 20: - return 0.0 - - # Distance transform: value at each white pixel = distance to nearest black - dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3) - - # Skeleton via morphological thinning - kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) - thin = bw.copy() - for _ in range(max(1, min(h, w) // 6)): - eroded = cv2.erode(thin, kernel) - if cv2.countNonZero(eroded) < 5: - break - thin = eroded - - skeleton_pts = thin > 0 - if not np.any(skeleton_pts): - return 0.0 - mean_stroke = float(np.mean(dist[skeleton_pts])) - return mean_stroke / max(h, 1) * 100 # normalised: % of cell height - - -def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray], - img_w: int, img_h: int) -> None: - """Two-pass bold detection: measure all cells, then compare against median. - - Cells with stroke width > 1.4× the page median are marked as bold. - This adapts automatically to font, DPI and scan quality. - Modifies cells in-place (sets 'is_bold' key). 
- """ - if ocr_img is None: - return - - # Pass 1: measure stroke width for every cell with text - metrics: List[float] = [] - cell_strokes: List[float] = [] - for cell in cells: - sw = 0.0 - if cell.get('text', '').strip(): - bp = cell['bbox_px'] - y1 = max(0, bp['y']) - y2 = min(img_h, bp['y'] + bp['h']) - x1 = max(0, bp['x']) - x2 = min(img_w, bp['x'] + bp['w']) - if y2 > y1 and x2 > x1: - sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2]) - cell_strokes.append(sw) - if sw > 0: - metrics.append(sw) - - if len(metrics) < 3: - # Too few cells to compare — leave all as non-bold - return - - median_sw = float(np.median(metrics)) - if median_sw <= 0: - return - - # Pass 2: cells significantly above median → bold - for cell, sw in zip(cells, cell_strokes): - cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4 - - -# --------------------------------------------------------------------------- -# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation -# --------------------------------------------------------------------------- - -def _ocr_cell_crop( - row_idx: int, - col_idx: int, - row: RowGeometry, - col: PageRegion, - ocr_img: np.ndarray, - img_bgr: Optional[np.ndarray], - img_w: int, - img_h: int, - engine_name: str, - lang: str, - lang_map: Dict[str, str], -) -> Dict[str, Any]: - """OCR a single cell by cropping the exact column×row intersection. - - No padding beyond cell boundaries → no neighbour bleeding. - """ - # Display bbox: exact column × row intersection - disp_x = col.x - disp_y = row.y - disp_w = col.width - disp_h = row.height - - # Crop boundaries: add small internal padding (3px each side) to avoid - # clipping characters near column/row edges (e.g. parentheses, descenders). - # Stays within image bounds but may extend slightly beyond strict cell. - # 3px is small enough to avoid neighbour content at typical scan DPI (200-300). 
- _PAD = 3 - cx = max(0, disp_x - _PAD) - cy = max(0, disp_y - _PAD) - cx2 = min(img_w, disp_x + disp_w + _PAD) - cy2 = min(img_h, disp_y + disp_h + _PAD) - cw = cx2 - cx - ch = cy2 - cy - - empty_cell = { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': '', - 'confidence': 0.0, - 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h}, - 'bbox_pct': { - 'x': round(disp_x / img_w * 100, 2) if img_w else 0, - 'y': round(disp_y / img_h * 100, 2) if img_h else 0, - 'w': round(disp_w / img_w * 100, 2) if img_w else 0, - 'h': round(disp_h / img_h * 100, 2) if img_h else 0, - }, - 'ocr_engine': 'cell_crop_v2', - 'is_bold': False, - } - - if cw <= 0 or ch <= 0: - logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch) - return empty_cell - - # --- Pixel-density check: skip truly empty cells --- - if ocr_img is not None: - crop = ocr_img[cy:cy + ch, cx:cx + cw] - if crop.size > 0: - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - if dark_ratio < 0.005: - logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)", - row_idx, col_idx, dark_ratio, cw, ch) - return empty_cell - - # --- Prepare crop for OCR --- - cell_lang = lang_map.get(col.type, lang) - psm = _select_psm_for_column(col.type, col.width, row.height) - text = '' - avg_conf = 0.0 - used_engine = 'cell_crop_v2' - - if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None: - cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) - words = ocr_region_trocr(img_bgr, cell_region, - handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) - words = ocr_region_lighton(img_bgr, cell_region) - elif engine_name == "rapid" and img_bgr is not None: - # Upscale small BGR crops for RapidOCR. 
- # Cell crops typically have height 35-55px but width >300px. - # _ensure_minimum_crop_size only scales when EITHER dim < min_dim, - # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78). - # For very short heights (< 80px), force 3× upscale for better OCR - # of small characters like periods, ellipsis, and phonetic symbols. - bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw] - if bgr_crop.size == 0: - words = [] - else: - crop_h, crop_w = bgr_crop.shape[:2] - if crop_h < 80: - # Force 3× upscale for short rows — small chars need more pixels - scale = 3.0 - bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale, - interpolation=cv2.INTER_CUBIC) - else: - bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3) - up_h, up_w = bgr_up.shape[:2] - scale_x = up_w / max(crop_w, 1) - scale_y = up_h / max(crop_h, 1) - was_scaled = (up_w != crop_w or up_h != crop_h) - logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)", - row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y) - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - words = ocr_region_rapid(bgr_up, tmp_region) - # Remap positions back to original image coords - if words and was_scaled: - for w in words: - w['left'] = int(w['left'] / scale_x) + cx - w['top'] = int(w['top'] / scale_y) + cy - w['width'] = int(w['width'] / scale_x) - w['height'] = int(w['height'] / scale_y) - elif words: - for w in words: - w['left'] += cx - w['top'] += cy - else: - # Tesseract: upscale tiny crops for better recognition - if ocr_img is not None: - crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] - upscaled = _ensure_minimum_crop_size(crop_slice) - up_h, up_w = upscaled.shape[:2] - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm) - # Remap word positions back to original image coordinates - if words and (up_w != cw or up_h != ch): - sx = cw / max(up_w, 1) - sy = ch / 
max(up_h, 1) - for w in words: - w['left'] = int(w['left'] * sx) + cx - w['top'] = int(w['top'] * sy) + cy - w['width'] = int(w['width'] * sx) - w['height'] = int(w['height'] * sy) - elif words: - for w in words: - w['left'] += cx - w['top'] += cy - else: - words = [] - - # Filter low-confidence words - _MIN_WORD_CONF = 30 - if words: - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - y_tol = max(15, ch) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s", - row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name) - else: - logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)", - row_idx, col_idx, cw, ch, psm, engine_name) - - # --- PSM 7 fallback for still-empty Tesseract cells --- - if not text.strip() and engine_name == "tesseract" and ocr_img is not None: - crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] - upscaled = _ensure_minimum_crop_size(crop_slice) - up_h, up_w = upscaled.shape[:2] - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7) - if psm7_words: - psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if psm7_words: - p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) - if p7_text.strip(): - text = p7_text - avg_conf = round( - sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 - ) - used_engine = 'cell_crop_v2_psm7' - - # --- Noise filter --- - if text.strip(): - pre_filter = text - text = _clean_cell_text_lite(text) - if not text: - logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r", - row_idx, col_idx, pre_filter) - avg_conf = 0.0 - - result = dict(empty_cell) - result['text'] = text - result['confidence'] = avg_conf - 
result['ocr_engine'] = used_engine - return result - - -# Threshold: columns narrower than this (% of image width) use single-cell -# crop OCR instead of full-page word assignment. -# -# Broad columns (>= threshold): Full-page Tesseract word assignment. -# Better for multi-word content (sentences, IPA brackets, punctuation). -# Examples: EN vocabulary, DE translation, example sentences. -# -# Narrow columns (< threshold): Isolated cell-crop OCR. -# Prevents neighbour bleeding from adjacent broad columns. -# Examples: page_ref, marker, numbering columns. -# -# 15% was empirically validated across vocab table scans with 3-5 columns. -# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width. -# The 15% boundary cleanly separates the two groups. -_NARROW_COL_THRESHOLD_PCT = 15.0 - - -def build_cell_grid_v2( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones. - - Drop-in replacement for build_cell_grid() — same signature & return type. - - Strategy: - - Broad columns (>15% image width): Use pre-assigned full-page Tesseract - words (from row.words). Handles IPA brackets, punctuation, sentence - continuity correctly. - - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent - neighbour bleeding from adjacent broad columns. 
- """ - engine_name = "tesseract" - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE: - engine_name = "rapid" - - logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)") - - # Filter to content rows only - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows found") - return [], [] - - # Filter phantom rows (word_count=0) and artifact rows - before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows with words found") - return [], [] - - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows") - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows after artifact filtering") - return [], [] - - # Filter columns - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', - 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - logger.warning("build_cell_grid_v2: no usable columns found") - return [], [] - - # Heal row gaps — use header/footer boundaries - content_rows.sort(key=lambda r: r.y) - header_rows = [r for r in row_geometries if r.row_type == 'header'] - footer_rows = [r for r in row_geometries if r.row_type == 'footer'] - if header_rows: - top_bound = max(r.y + r.height for r in header_rows) - else: - top_bound = content_rows[0].y - if footer_rows: - bottom_bound = min(r.y for r 
in footer_rows) - else: - bottom_bound = content_rows[-1].y + content_rows[-1].height - - _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width} - for ci, c in enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - # --- Classify columns as broad vs narrow --- - narrow_col_indices = set() - for ci, col in enumerate(relevant_cols): - col_pct = (col.width / img_w * 100) if img_w > 0 else 0 - if col_pct < _NARROW_COL_THRESHOLD_PCT: - narrow_col_indices.add(ci) - - broad_col_count = len(relevant_cols) - len(narrow_col_indices) - logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), " - f"{len(narrow_col_indices)} narrow columns (cell-crop)") - - # --- Phase 1: Broad columns via full-page word assignment --- - cells: List[Dict[str, Any]] = [] - - for row_idx, row in enumerate(content_rows): - # Assign full-page words to columns for this row - col_words = _assign_row_words_to_columns(row, relevant_cols) - - for col_idx, col in enumerate(relevant_cols): - if col_idx not in narrow_col_indices: - # BROAD column: use pre-assigned full-page words - words = col_words.get(col_idx, []) - # Filter low-confidence words - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - y_tol = max(15, row.height) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - else: - text = '' - avg_conf = 0.0 - - # Apply noise filter - text = _clean_cell_text(text) - - cell = { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': text, - 'confidence': avg_conf, - 'bbox_px': { - 'x': col.x, 'y': row.y, - 'w': col.width, 'h': row.height, - }, - 'bbox_pct': { - 'x': round(col.x / img_w * 
100, 2) if img_w else 0, - 'y': round(row.y / img_h * 100, 2) if img_h else 0, - 'w': round(col.width / img_w * 100, 2) if img_w else 0, - 'h': round(row.height / img_h * 100, 2) if img_h else 0, - }, - 'ocr_engine': 'word_lookup', - 'is_bold': False, - } - cells.append(cell) - - # --- Phase 2: Narrow columns via cell-crop OCR (parallel) --- - narrow_tasks = [] - for row_idx, row in enumerate(content_rows): - for col_idx, col in enumerate(relevant_cols): - if col_idx in narrow_col_indices: - narrow_tasks.append((row_idx, col_idx, row, col)) - - if narrow_tasks: - max_workers = 4 if engine_name == "tesseract" else 2 - with ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = { - pool.submit( - _ocr_cell_crop, - ri, ci, row, col, - ocr_img, img_bgr, img_w, img_h, - engine_name, lang, lang_map, - ): (ri, ci) - for ri, ci, row, col in narrow_tasks - } - for future in as_completed(futures): - try: - cell = future.result() - cells.append(cell) - except Exception as e: - ri, ci = futures[future] - logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}") - - # Sort cells by (row_index, col_index) - cells.sort(key=lambda c: (c['row_index'], c['col_index'])) - - # Remove all-empty rows - rows_with_text: set = set() - for cell in cells: - if cell['text'].strip(): - rows_with_text.add(cell['row_index']) - before_filter = len(cells) - cells = [c for c in cells if c['row_index'] in rows_with_text] - empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) - if empty_rows_removed > 0: - logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows") - - # Bold detection disabled: cell-level stroke-width analysis cannot - # distinguish bold from non-bold when cells contain mixed formatting - # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics). - # TODO: word-level bold detection would require per-word bounding boxes. 
- - logger.info(f"build_cell_grid_v2: {len(cells)} cells from " - f"{len(content_rows)} rows × {len(relevant_cols)} columns, " - f"engine={engine_name} (hybrid)") - - return cells, columns_meta - - -def build_cell_grid_v2_streaming( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]: - """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd. - - Yields: - (cell_dict, columns_meta, total_cells) - """ - # Resolve engine — default to Tesseract for cell-first OCR. - # Tesseract excels at isolated text crops (binarized, upscaled). - # RapidOCR is optimized for full-page scene-text and produces artifacts - # on small cell crops (extra chars, missing punctuation, garbled IPA). - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - engine_name = "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - return - - content_rows = [r for r in content_rows if r.word_count > 0] - if not content_rows: - return - - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', - 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - return - - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - if not content_rows: - return - - # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2) - 
content_rows.sort(key=lambda r: r.y) - header_rows = [r for r in row_geometries if r.row_type == 'header'] - footer_rows = [r for r in row_geometries if r.row_type == 'footer'] - if header_rows: - top_bound = max(r.y + r.height for r in header_rows) - else: - top_bound = content_rows[0].y - if footer_rows: - bottom_bound = min(r.y for r in footer_rows) - else: - bottom_bound = content_rows[-1].y + content_rows[-1].height - - _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width} - for ci, c in enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - total_cells = len(content_rows) * len(relevant_cols) - - for row_idx, row in enumerate(content_rows): - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_cell_crop( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - engine_name, lang, lang_map, - ) - yield cell, columns_meta, total_cells - - -# --------------------------------------------------------------------------- -# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid) -# --------------------------------------------------------------------------- - -def _compute_cell_padding(col_width: int, img_w: int) -> int: - """Adaptive padding for OCR crops based on column width. - - Narrow columns (page_ref, marker) need more surrounding context so - Tesseract can segment characters correctly. Wide columns keep the - minimal 4 px padding to avoid pulling in neighbours. 
- """ - col_pct = col_width / img_w * 100 if img_w > 0 else 100 - if col_pct < 5: - return max(20, col_width // 2) - if col_pct < 10: - return max(12, col_width // 4) - if col_pct < 15: - return 8 - return 4 - - -def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150, - max_scale: int = 3) -> np.ndarray: - """Upscale tiny crops so Tesseract gets enough pixel data. - - If either dimension is below *min_dim*, the crop is bicubic-upscaled - so the smallest dimension reaches *min_dim* (capped at *max_scale* ×). - """ - h, w = crop.shape[:2] - if h >= min_dim and w >= min_dim: - return crop - scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1))) - if scale <= 1.0: - return crop - new_w = int(w * scale) - new_h = int(h * scale) - return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) - - -def _select_psm_for_column(col_type: str, col_width: int, - row_height: int) -> int: - """Choose the best Tesseract PSM for a given column geometry. - - - page_ref columns are almost always single short tokens → PSM 8 - - Very narrow or short cells → PSM 7 (single text line) - - Everything else → PSM 6 (uniform block) - """ - if col_type in ('page_ref', 'marker'): - return 8 # single word - if col_width < 100 or row_height < 30: - return 7 # single line - return 6 # uniform block - - -def _ocr_single_cell( - row_idx: int, - col_idx: int, - row: RowGeometry, - col: PageRegion, - ocr_img: np.ndarray, - img_bgr: Optional[np.ndarray], - img_w: int, - img_h: int, - use_rapid: bool, - engine_name: str, - lang: str, - lang_map: Dict[str, str], - preassigned_words: Optional[List[Dict]] = None, -) -> Dict[str, Any]: - """Populate a single cell (column x row intersection) via word lookup.""" - # Display bbox: exact column × row intersection (no padding) - disp_x = col.x - disp_y = row.y - disp_w = col.width - disp_h = row.height - - # OCR crop: adaptive padding — narrow columns get more context - pad = _compute_cell_padding(col.width, img_w) - cell_x = 
max(0, col.x - pad) - cell_y = max(0, row.y - pad) - cell_w = min(col.width + 2 * pad, img_w - cell_x) - cell_h = min(row.height + 2 * pad, img_h - cell_y) - is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False - - if disp_w <= 0 or disp_h <= 0: - return { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': '', - 'confidence': 0.0, - 'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height}, - 'bbox_pct': { - 'x': round(col.x / img_w * 100, 2), - 'y': round(row.y / img_h * 100, 2), - 'w': round(col.width / img_w * 100, 2), - 'h': round(row.height / img_h * 100, 2), - }, - 'ocr_engine': 'word_lookup', - } - - # --- PRIMARY: Word-lookup from full-page Tesseract --- - words = preassigned_words if preassigned_words is not None else [] - used_engine = 'word_lookup' - - # Filter low-confidence words (OCR noise from images/artifacts). - # Tesseract gives low confidence to misread image edges, borders, - # and other non-text elements. - _MIN_WORD_CONF = 30 - if words: - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - # Use row height as Y-tolerance so all words within a single row - # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse" - # across two lines due to slight vertical offset). - y_tol = max(15, row.height) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - else: - text = '' - avg_conf = 0.0 - - # --- FALLBACK: Cell-OCR for empty cells --- - # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). - # Re-run OCR on the cell crop to catch what word-lookup missed. - # To avoid wasting time on truly empty cells, check pixel density first: - # only run Tesseract if the cell crop contains enough dark pixels to - # plausibly contain text. 
- _run_fallback = False - if not text.strip() and cell_w > 0 and cell_h > 0: - if ocr_img is not None: - crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] - if crop.size > 0: - # Threshold: pixels darker than 180 (on 0-255 grayscale). - # Use 0.5% to catch even small text like "Ei" (2 chars) - # in an otherwise empty cell. - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - _run_fallback = dark_ratio > 0.005 - if _run_fallback: - # For narrow columns, upscale the crop before OCR - if is_narrow and ocr_img is not None: - _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] - _upscaled = _ensure_minimum_crop_size(_crop_slice) - if _upscaled is not _crop_slice: - # Build a temporary full-size image with the upscaled crop - # placed at origin so ocr_region can crop it cleanly. - _up_h, _up_w = _upscaled.shape[:2] - _tmp_region = PageRegion( - type=col.type, x=0, y=0, width=_up_w, height=_up_h, - ) - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(_upscaled, _tmp_region, - lang=cell_lang, psm=_cell_psm) - # Remap word positions back to original image coordinates - _sx = cell_w / max(_up_w, 1) - _sy = cell_h / max(_up_h, 1) - for _fw in (fallback_words or []): - _fw['left'] = int(_fw['left'] * _sx) + cell_x - _fw['top'] = int(_fw['top'] * _sy) + cell_y - _fw['width'] = int(_fw['width'] * _sx) - _fw['height'] = int(_fw['height'] * _sy) - else: - # No upscaling needed, use adaptive PSM - cell_region = PageRegion( - type=col.type, x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, - lang=cell_lang, psm=_cell_psm) - else: - cell_region = PageRegion( - type=col.type, - x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - if engine_name in ("trocr-printed", 
"trocr-handwritten") and img_bgr is not None: - fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - fallback_words = ocr_region_lighton(img_bgr, cell_region) - elif use_rapid and img_bgr is not None: - fallback_words = ocr_region_rapid(img_bgr, cell_region) - else: - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, - lang=cell_lang, psm=_cell_psm) - - if fallback_words: - # Apply same confidence filter to fallback words - fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if fallback_words: - fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words) - fb_y_tol = max(10, int(fb_avg_h * 0.5)) - fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol) - if fb_text.strip(): - text = fb_text - avg_conf = round( - sum(w['conf'] for w in fallback_words) / len(fallback_words), 1 - ) - used_engine = 'cell_ocr_fallback' - - # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells --- - if not text.strip() and _run_fallback and not use_rapid: - _fb_region = PageRegion( - type=col.type, x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - cell_lang = lang_map.get(col.type, lang) - psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7) - if psm7_words: - psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if psm7_words: - p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) - if p7_text.strip(): - text = p7_text - avg_conf = round( - sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 - ) - used_engine = 'cell_ocr_psm7' - - # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns --- - # If a narrow cell is still empty, OCR the entire row strip with - # RapidOCR (which handles small text better) and assign 
words by - # X-position overlap with this column. - if not text.strip() and is_narrow and img_bgr is not None: - row_region = PageRegion( - type='_row_strip', x=0, y=row.y, - width=img_w, height=row.height, - ) - strip_words = ocr_region_rapid(img_bgr, row_region) - if strip_words: - # Filter to words overlapping this column's X-range - col_left = col.x - col_right = col.x + col.width - col_words = [] - for sw in strip_words: - sw_left = sw.get('left', 0) - sw_right = sw_left + sw.get('width', 0) - overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left)) - if overlap > sw.get('width', 1) * 0.3: - col_words.append(sw) - if col_words: - col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if col_words: - rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height) - if rs_text.strip(): - text = rs_text - avg_conf = round( - sum(w['conf'] for w in col_words) / len(col_words), 1 - ) - used_engine = 'row_strip_rapid' - - # --- NOISE FILTER: clear cells that contain only OCR artifacts --- - if text.strip(): - text = _clean_cell_text(text) - if not text: - avg_conf = 0.0 - - return { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': text, - 'confidence': avg_conf, - 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h}, - 'bbox_pct': { - 'x': round(disp_x / img_w * 100, 2), - 'y': round(disp_y / img_h * 100, 2), - 'w': round(disp_w / img_w * 100, 2), - 'h': round(disp_h / img_h * 100, 2), - }, - 'ocr_engine': used_engine, - } - - -def _is_artifact_row(row: RowGeometry) -> bool: - """Return True if this row contains only scan artifacts, not real text. - - Artifact rows (scanner shadows, noise) typically produce only single-character - detections. A real content row always has at least one token with 2+ characters. 
- """ - if row.word_count == 0: - return True - texts = [w.get('text', '').strip() for w in row.words] - return all(len(t) <= 1 for t in texts) - - -def _heal_row_gaps( - rows: List[RowGeometry], - top_bound: int, - bottom_bound: int, -) -> None: - """Expand row y/height to fill vertical gaps caused by removed adjacent rows. - - After filtering out empty or artifact rows, remaining content rows may have - gaps between them where the removed rows used to be. This function mutates - each row to extend upward/downward to the midpoint of such gaps so that - OCR crops cover the full available content area. - - The first row always extends to top_bound; the last row to bottom_bound. - """ - if not rows: - return - rows.sort(key=lambda r: r.y) - n = len(rows) - orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation - - for i, row in enumerate(rows): - # New top: midpoint between previous row's bottom and this row's top - if i == 0: - new_top = top_bound - else: - prev_bot = orig[i - 1][1] - my_top = orig[i][0] - gap = my_top - prev_bot - new_top = prev_bot + gap // 2 if gap > 1 else my_top - - # New bottom: midpoint between this row's bottom and next row's top - if i == n - 1: - new_bottom = bottom_bound - else: - my_bot = orig[i][1] - next_top = orig[i + 1][0] - gap = next_top - my_bot - new_bottom = my_bot + gap // 2 if gap > 1 else my_bot - - row.y = new_top - row.height = max(5, new_bottom - new_top) - - logger.debug( - f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] " - f"(bounds: top={top_bound}, bottom={bottom_bound})" - ) - - -def build_cell_grid( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Generic Cell-Grid: Columns × Rows → cells with OCR text. 
- - This is the layout-agnostic foundation. Every column (except column_ignore) - is intersected with every content row to produce numbered cells. - - Args: - ocr_img: Binarized full-page image (for Tesseract). - column_regions: Classified columns from Step 3 (PageRegion list). - row_geometries: Rows from Step 4 (RowGeometry list). - img_w: Image width in pixels. - img_h: Image height in pixels. - lang: Default Tesseract language. - ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'. - img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR). - - Returns: - (cells, columns_meta) where cells is a list of cell dicts and - columns_meta describes the columns used. - """ - # Resolve engine choice - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - logger.info(f"build_cell_grid: using OCR engine '{engine_name}'") - - # Filter to content rows only (skip header/footer) - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - logger.warning("build_cell_grid: no content rows found") - return [], [] - - # Filter phantom rows: rows with no Tesseract words assigned are - # inter-line whitespace gaps that would produce garbage OCR. 
- before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - logger.warning("build_cell_grid: no content rows with words found") - return [], [] - - # Use columns only — skip ignore, header, footer, page_ref - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - logger.warning("build_cell_grid: no usable columns found") - return [], [] - - # Filter artifact rows: rows whose detected words are all single characters - # are caused by scanner shadows or noise, not real text. - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)") - if not content_rows: - logger.warning("build_cell_grid: no content rows after artifact filtering") - return [], [] - - # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows - # to fill the space so OCR crops are not artificially narrow. 
- _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) - - # Sort columns left-to-right - relevant_cols.sort(key=lambda c: c.x) - - # Build columns_meta - columns_meta = [ - { - 'index': col_idx, - 'type': col.type, - 'x': col.x, - 'width': col.width, - } - for col_idx, col in enumerate(relevant_cols) - ] - - # Choose OCR language per column type (Tesseract only) - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - cells: List[Dict[str, Any]] = [] - - for row_idx, row in enumerate(content_rows): - # Pre-assign each word to exactly one column (nearest center) - col_words = _assign_row_words_to_columns(row, relevant_cols) - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_single_cell( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - use_rapid, engine_name, lang, lang_map, - preassigned_words=col_words[col_idx], - ) - cells.append(cell) - - # --- BATCH FALLBACK: re-OCR empty cells by column strip --- - # Collect cells that are still empty but have visible pixels. - # Instead of calling Tesseract once per cell (expensive), crop an entire - # column strip and run OCR once, then assign words to cells by Y position. 
- empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices] - for ci, cell in enumerate(cells): - if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7': - bpx = cell['bbox_px'] - x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h'] - if w > 0 and h > 0 and ocr_img is not None: - crop = ocr_img[y:y + h, x:x + w] - if crop.size > 0: - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - if dark_ratio > 0.005: - empty_by_col.setdefault(cell['col_index'], []).append(ci) - - for col_idx, cell_indices in empty_by_col.items(): - if len(cell_indices) < 3: - continue # Not worth batching for < 3 cells - - # Find the column strip bounding box (union of all empty cell bboxes) - min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices) - max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices) - col_x = cells[cell_indices[0]]['bbox_px']['x'] - col_w = cells[cell_indices[0]]['bbox_px']['w'] - - strip_region = PageRegion( - type=relevant_cols[col_idx].type, - x=col_x, y=min_y, - width=col_w, height=max_y_h - min_y, - ) - strip_lang = lang_map.get(relevant_cols[col_idx].type, lang) - - if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None: - strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - strip_words = ocr_region_lighton(img_bgr, strip_region) - elif use_rapid and img_bgr is not None: - strip_words = ocr_region_rapid(img_bgr, strip_region) - else: - strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6) - - if not strip_words: - continue - - strip_words = [w for w in strip_words if w.get('conf', 0) >= 30] - if not strip_words: - continue - - # Assign words to cells by Y overlap - for ci in cell_indices: - cell_y = cells[ci]['bbox_px']['y'] - cell_h = cells[ci]['bbox_px']['h'] - cell_mid_y = cell_y + cell_h / 2 - - matched_words = [ - w for w 
in strip_words - if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8 - ] - if matched_words: - matched_words.sort(key=lambda w: w['left']) - batch_text = ' '.join(w['text'] for w in matched_words) - batch_text = _clean_cell_text(batch_text) - if batch_text.strip(): - cells[ci]['text'] = batch_text - cells[ci]['confidence'] = round( - sum(w['conf'] for w in matched_words) / len(matched_words), 1 - ) - cells[ci]['ocr_engine'] = 'batch_column_ocr' - - batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip()) - if batch_filled > 0: - logger.info( - f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} " - f"empty cells in column {col_idx}" - ) - - # Post-OCR: remove rows where ALL cells are empty (inter-row gaps - # that had stray Tesseract artifacts giving word_count > 0). - rows_with_text: set = set() - for cell in cells: - if cell['text'].strip(): - rows_with_text.add(cell['row_index']) - before_filter = len(cells) - cells = [c for c in cells if c['row_index'] in rows_with_text] - empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) - if empty_rows_removed > 0: - logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR") - - logger.info(f"build_cell_grid: {len(cells)} cells from " - f"{len(content_rows)} rows × {len(relevant_cols)} columns, " - f"engine={engine_name}") - - return cells, columns_meta - - -def build_cell_grid_streaming( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]: - """Like build_cell_grid(), but yields each cell as it is OCR'd. - - Yields: - (cell_dict, columns_meta, total_cells) for each cell. 
- """ - # Resolve engine choice (same as build_cell_grid) - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - return - - # Filter phantom rows: rows with no Tesseract words assigned are - # inter-line whitespace gaps that would produce garbage OCR. - before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - return - - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - return - - # Filter artifact rows + heal gaps (same logic as build_cell_grid) - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows") - if not content_rows: - return - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - { - 'index': col_idx, - 'type': col.type, - 'x': col.x, - 'width': col.width, - } - for col_idx, col in 
enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - total_cells = len(content_rows) * len(relevant_cols) - - for row_idx, row in enumerate(content_rows): - # Pre-assign each word to exactly one column (nearest center) - col_words = _assign_row_words_to_columns(row, relevant_cols) - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_single_cell( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - use_rapid, engine_name, lang, lang_map, - preassigned_words=col_words[col_idx], - ) - yield cell, columns_meta, total_cells - - -def _cells_to_vocab_entries( - cells: List[Dict[str, Any]], - columns_meta: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Map generic cells to vocab entries with english/german/example fields. - - Groups cells by row_index, maps col_type → field name, and produces - one entry per row (only rows with at least one non-empty field). - """ - # Determine image dimensions from first cell (for row-level bbox) - col_type_to_field = { - 'column_en': 'english', - 'column_de': 'german', - 'column_example': 'example', - 'page_ref': 'source_page', - 'column_marker': 'marker', - } - bbox_key_map = { - 'column_en': 'bbox_en', - 'column_de': 'bbox_de', - 'column_example': 'bbox_ex', - 'page_ref': 'bbox_ref', - 'column_marker': 'bbox_marker', - } - - # Group cells by row_index - rows: Dict[int, List[Dict]] = {} - for cell in cells: - ri = cell['row_index'] - rows.setdefault(ri, []).append(cell) - - entries: List[Dict[str, Any]] = [] - for row_idx in sorted(rows.keys()): - row_cells = rows[row_idx] - entry: Dict[str, Any] = { - 'row_index': row_idx, - 'english': '', - 'german': '', - 'example': '', - 'source_page': '', - 'marker': '', - 'confidence': 0.0, - 'bbox': None, - 'bbox_en': None, - 'bbox_de': None, - 'bbox_ex': None, - 'bbox_ref': None, - 'bbox_marker': None, - 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '', - } - - 
confidences = [] - for cell in row_cells: - col_type = cell['col_type'] - field = col_type_to_field.get(col_type) - if field: - entry[field] = cell['text'] - bbox_field = bbox_key_map.get(col_type) - if bbox_field: - entry[bbox_field] = cell['bbox_pct'] - if cell['confidence'] > 0: - confidences.append(cell['confidence']) - - # Compute row-level bbox as union of all cell bboxes - all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')] - if all_bboxes: - min_x = min(b['x'] for b in all_bboxes) - min_y = min(b['y'] for b in all_bboxes) - max_x2 = max(b['x'] + b['w'] for b in all_bboxes) - max_y2 = max(b['y'] + b['h'] for b in all_bboxes) - entry['bbox'] = { - 'x': round(min_x, 2), - 'y': round(min_y, 2), - 'w': round(max_x2 - min_x, 2), - 'h': round(max_y2 - min_y, 2), - } - - entry['confidence'] = round( - sum(confidences) / len(confidences), 1 - ) if confidences else 0.0 - - # Only include if at least one mapped field has text - has_content = any( - entry.get(f) - for f in col_type_to_field.values() - ) - if has_content: - entries.append(entry) - - return entries - - -# Regex: line starts with phonetic bracket content only (no real word before it) -_PHONETIC_ONLY_RE = re.compile( - r'''^\s*[\[\('"]*[^\]]*[\])\s]*$''' +from cv_ocr_engines import ( # noqa: F401 + _fix_character_confusion, + _fix_phonetic_brackets, ) - - -def _is_phonetic_only_text(text: str) -> bool: - """Check if text consists only of phonetic transcription. 
- - Phonetic-only patterns: - ['mani serva] → True - [dɑːns] → True - ["a:mand] → True - almond ['a:mand] → False (has real word before bracket) - Mandel → False - """ - t = text.strip() - if not t: - return False - # Must contain at least one bracket - if '[' not in t and ']' not in t: - return False - # Remove all bracket content and surrounding punctuation/whitespace - without_brackets = re.sub(r"\[.*?\]", '', t) - without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets) - # If nothing meaningful remains, it's phonetic-only - alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets)) - return len(alpha_remaining) < 2 - - -def _merge_phonetic_continuation_rows( - entries: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Merge rows that contain only phonetic transcription into previous entry. - - In dictionary pages, phonetic transcription sometimes wraps to the next - row. E.g.: - Row 28: EN="it's a money-saver" DE="es spart Kosten" - Row 29: EN="['mani serva]" DE="" - - Row 29 is phonetic-only → merge into row 28's EN field. 
- """ - if len(entries) < 2: - return entries - - merged: List[Dict[str, Any]] = [] - for entry in entries: - en = (entry.get('english') or '').strip() - de = (entry.get('german') or '').strip() - ex = (entry.get('example') or '').strip() - - # Check if this entry is phonetic-only (EN has only phonetics, DE empty) - if merged and _is_phonetic_only_text(en) and not de: - prev = merged[-1] - prev_en = (prev.get('english') or '').strip() - # Append phonetic to previous entry's EN - if prev_en: - prev['english'] = prev_en + ' ' + en - else: - prev['english'] = en - # If there was an example, append to previous too - if ex: - prev_ex = (prev.get('example') or '').strip() - prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex - logger.debug( - f"Merged phonetic row {entry.get('row_index')} " - f"into previous entry: {prev['english']!r}" - ) - continue - - merged.append(entry) - - return merged - - -def _merge_continuation_rows( - entries: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Merge multi-line vocabulary entries where text wraps to the next row. - - A row is a continuation of the previous entry when: - - EN has text, but DE is empty - - EN starts with a lowercase letter (not a new vocab entry) - - Previous entry's EN does NOT end with a sentence terminator (.!?) - - The continuation text has fewer than 4 words (not an example sentence) - - The row was not already merged as phonetic - - Example: - Row 5: EN="to put up" DE="aufstellen" - Row 6: EN="with sth." DE="" - → Merged: EN="to put up with sth." 
DE="aufstellen" - """ - if len(entries) < 2: - return entries - - merged: List[Dict[str, Any]] = [] - for entry in entries: - en = (entry.get('english') or '').strip() - de = (entry.get('german') or '').strip() - - if merged and en and not de: - # Check: not phonetic (already handled) - if _is_phonetic_only_text(en): - merged.append(entry) - continue - - # Check: starts with lowercase - first_alpha = next((c for c in en if c.isalpha()), '') - starts_lower = first_alpha and first_alpha.islower() - - # Check: fewer than 4 words (not an example sentence) - word_count = len(en.split()) - is_short = word_count < 4 - - # Check: previous entry doesn't end with sentence terminator - prev = merged[-1] - prev_en = (prev.get('english') or '').strip() - prev_ends_sentence = prev_en and prev_en[-1] in '.!?' - - if starts_lower and is_short and not prev_ends_sentence: - # Merge into previous entry - prev['english'] = (prev_en + ' ' + en).strip() - # Merge example if present - ex = (entry.get('example') or '').strip() - if ex: - prev_ex = (prev.get('example') or '').strip() - prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex - logger.debug( - f"Merged continuation row {entry.get('row_index')} " - f"into previous entry: {prev['english']!r}" - ) - continue - - merged.append(entry) - - return merged - - -def build_word_grid( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, - pronunciation: str = "british", -) -> List[Dict[str, Any]]: - """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing. - - Wrapper around build_cell_grid() that adds vocabulary-specific logic: - - Maps cells to english/german/example entries - - Applies character confusion fixes, IPA lookup, comma splitting, etc. - - Falls back to returning raw cells if no vocab columns detected. 
- - Args: - ocr_img: Binarized full-page image (for Tesseract). - column_regions: Classified columns from Step 3. - row_geometries: Rows from Step 4. - img_w, img_h: Image dimensions. - lang: Default Tesseract language. - ocr_engine: 'tesseract', 'rapid', or 'auto'. - img_bgr: BGR color image (required for RapidOCR). - pronunciation: 'british' or 'american' for IPA lookup. - - Returns: - List of entry dicts with english/german/example text and bbox info (percent). - """ - cells, columns_meta = build_cell_grid( - ocr_img, column_regions, row_geometries, img_w, img_h, - lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr, - ) - - if not cells: - return [] - - # Check if vocab layout is present - col_types = {c['type'] for c in columns_meta} - if not (col_types & {'column_en', 'column_de'}): - logger.info("build_word_grid: no vocab columns — returning raw cells") - return cells - - # Vocab mapping: cells → entries - entries = _cells_to_vocab_entries(cells, columns_meta) - - # --- Post-processing pipeline (deterministic, no LLM) --- - n_raw = len(entries) - - # 0a. Merge phonetic-only continuation rows into previous entry - entries = _merge_phonetic_continuation_rows(entries) - - # 0b. Merge multi-line continuation rows (lowercase EN, empty DE) - entries = _merge_continuation_rows(entries) - - # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in - # llm_review_entries_streaming so changes are visible to the user in Step 6. - - # 2. Replace OCR'd phonetics with dictionary IPA - entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - - # 3. Split comma-separated word forms (break, broke, broken → 3 entries) - entries = _split_comma_entries(entries) - - # 4. 
Attach example sentences (rows without DE → examples for preceding entry) - entries = _attach_example_sentences(entries) - - engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown' - logger.info(f"build_word_grid: {len(entries)} entries from " - f"{n_raw} raw → {len(entries)} after post-processing " - f"(engine={engine_name})") - - return entries - - -# ============================================================================= -# Stage 6: Multi-Pass OCR -# ============================================================================= - -def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str, - psm: int, fallback_psm: Optional[int] = None, - min_confidence: float = 40.0) -> List[Dict[str, Any]]: - """Run Tesseract OCR on a specific region with given PSM. - - Args: - ocr_img: Binarized full-page image. - region: Region to crop and OCR. - lang: Tesseract language string. - psm: Page Segmentation Mode. - fallback_psm: If confidence too low, retry with this PSM per line. - min_confidence: Minimum average confidence before fallback. - - Returns: - List of word dicts with text, position, confidence. 
- """ - # Crop region - crop = ocr_img[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - # Convert to PIL for pytesseract - pil_img = Image.fromarray(crop) - - # Run Tesseract with specified PSM - config = f'--psm {psm} --oem 3' - try: - data = pytesseract.image_to_data(pil_img, lang=lang, config=config, - output_type=pytesseract.Output.DICT) - except Exception as e: - logger.warning(f"Tesseract failed for region {region.type}: {e}") - return [] - - words = [] - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 10: - continue - words.append({ - 'text': text, - 'left': data['left'][i] + region.x, # Absolute coords - 'top': data['top'][i] + region.y, - 'width': data['width'][i], - 'height': data['height'][i], - 'conf': conf, - 'region_type': region.type, - }) - - # Check average confidence - if words and fallback_psm is not None: - avg_conf = sum(w['conf'] for w in words) / len(words) - if avg_conf < min_confidence: - logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, " - f"trying fallback PSM {fallback_psm}") - words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm) - - return words - - -def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion, - lang: str, psm: int) -> List[Dict[str, Any]]: - """OCR a region line by line (fallback for low-confidence regions). - - Splits the region into horizontal strips based on text density, - then OCRs each strip individually with the given PSM. 
- """ - crop = ocr_img[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - # Find text lines via horizontal projection - inv = cv2.bitwise_not(crop) - h_proj = np.sum(inv, axis=1) - threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0 - - # Find line boundaries - lines = [] - in_text = False - line_start = 0 - for y in range(len(h_proj)): - if h_proj[y] > threshold and not in_text: - line_start = y - in_text = True - elif h_proj[y] <= threshold and in_text: - if y - line_start > 5: # Minimum line height - lines.append((line_start, y)) - in_text = False - if in_text and len(h_proj) - line_start > 5: - lines.append((line_start, len(h_proj))) - - all_words = [] - config = f'--psm {psm} --oem 3' - - for line_y_start, line_y_end in lines: - # Add small padding - pad = 3 - y1 = max(0, line_y_start - pad) - y2 = min(crop.shape[0], line_y_end + pad) - line_crop = crop[y1:y2, :] - - if line_crop.size == 0: - continue - - pil_img = Image.fromarray(line_crop) - try: - data = pytesseract.image_to_data(pil_img, lang=lang, config=config, - output_type=pytesseract.Output.DICT) - except Exception: - continue - - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 10: - continue - all_words.append({ - 'text': text, - 'left': data['left'][i] + region.x, - 'top': data['top'][i] + region.y + y1, - 'width': data['width'][i], - 'height': data['height'][i], - 'conf': conf, - 'region_type': region.type, - }) - - return all_words - - -def run_multi_pass_ocr(ocr_img: np.ndarray, - regions: List[PageRegion], - lang: str = "eng+deu") -> Dict[str, List[Dict]]: - """Run OCR on each detected region with optimized settings. - - Args: - ocr_img: Binarized full-page image. - regions: Detected page regions. - lang: Default language. - - Returns: - Dict mapping region type to list of word dicts. 
- """ - results: Dict[str, List[Dict]] = {} - - _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - for region in regions: - if region.type in _ocr_skip: - continue # Skip non-content regions - - if region.type == 'column_en': - words = ocr_region(ocr_img, region, lang='eng', psm=4) - elif region.type == 'column_de': - words = ocr_region(ocr_img, region, lang='deu', psm=4) - elif region.type == 'column_example': - words = ocr_region(ocr_img, region, lang=lang, psm=6, - fallback_psm=7, min_confidence=40.0) - else: - words = ocr_region(ocr_img, region, lang=lang, psm=6) - - results[region.type] = words - logger.info(f"OCR {region.type}: {len(words)} words") - - return results - - -# ============================================================================= -# Stage 7: Line Alignment → Vocabulary Entries -# ============================================================================= - -def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: - """Group words by Y position into lines, sorted by X within each line.""" - if not words: - return [] - - sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) - lines: List[List[Dict]] = [] - current_line: List[Dict] = [sorted_words[0]] - current_y = sorted_words[0]['top'] - - for word in sorted_words[1:]: - if abs(word['top'] - current_y) <= y_tolerance_px: - current_line.append(word) - else: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - current_line = [word] - current_y = word['top'] - - if current_line: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - - return lines - - -def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], - regions: List[PageRegion], - y_tolerance_px: int = 25) -> List[VocabRow]: - """Align OCR results from different columns into vocabulary rows. 
- - Uses Y-coordinate matching to pair English words, German translations, - and example sentences that appear on the same line. - - Args: - ocr_results: Dict mapping region type to word lists. - regions: Detected regions (for reference). - y_tolerance_px: Max Y-distance to consider words on the same row. - - Returns: - List of VocabRow objects. - """ - # If no vocabulary columns detected (e.g. plain text page), return empty - if 'column_en' not in ocr_results and 'column_de' not in ocr_results: - logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty") - return [] - - # Group words into lines per column - en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px) - de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px) - ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px) - - def line_y_center(line: List[Dict]) -> float: - return sum(w['top'] + w['height'] / 2 for w in line) / len(line) - - def line_text(line: List[Dict]) -> str: - return ' '.join(w['text'] for w in line) - - def line_confidence(line: List[Dict]) -> float: - return sum(w['conf'] for w in line) / len(line) if line else 0 - - # Build EN entries as the primary reference - vocab_rows: List[VocabRow] = [] - - for en_line in en_lines: - en_y = line_y_center(en_line) - en_text = line_text(en_line) - en_conf = line_confidence(en_line) - - # Skip very short or likely header content - if len(en_text.strip()) < 2: - continue - - # Find matching DE line - de_text = "" - de_conf = 0.0 - best_de_dist = float('inf') - best_de_idx = -1 - for idx, de_line in enumerate(de_lines): - dist = abs(line_y_center(de_line) - en_y) - if dist < y_tolerance_px and dist < best_de_dist: - best_de_dist = dist - best_de_idx = idx - - if best_de_idx >= 0: - de_text = line_text(de_lines[best_de_idx]) - de_conf = line_confidence(de_lines[best_de_idx]) - - # Find matching example line - ex_text = "" - 
ex_conf = 0.0 - best_ex_dist = float('inf') - best_ex_idx = -1 - for idx, ex_line in enumerate(ex_lines): - dist = abs(line_y_center(ex_line) - en_y) - if dist < y_tolerance_px and dist < best_ex_dist: - best_ex_dist = dist - best_ex_idx = idx - - if best_ex_idx >= 0: - ex_text = line_text(ex_lines[best_ex_idx]) - ex_conf = line_confidence(ex_lines[best_ex_idx]) - - avg_conf = en_conf - conf_count = 1 - if de_conf > 0: - avg_conf += de_conf - conf_count += 1 - if ex_conf > 0: - avg_conf += ex_conf - conf_count += 1 - - vocab_rows.append(VocabRow( - english=en_text.strip(), - german=de_text.strip(), - example=ex_text.strip(), - confidence=avg_conf / conf_count, - y_position=int(en_y), - )) - - # Handle multi-line wrapping in example column: - # If an example line has no matching EN/DE, append to previous entry - matched_ex_ys = set() - for row in vocab_rows: - if row.example: - matched_ex_ys.add(row.y_position) - - for ex_line in ex_lines: - ex_y = line_y_center(ex_line) - # Check if already matched - already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys) - if already_matched: - continue - - # Find nearest previous vocab row - best_row = None - best_dist = float('inf') - for row in vocab_rows: - dist = ex_y - row.y_position - if 0 < dist < y_tolerance_px * 3 and dist < best_dist: - best_dist = dist - best_row = row - - if best_row: - continuation = line_text(ex_line).strip() - if continuation: - best_row.example = (best_row.example + " " + continuation).strip() - - # Sort by Y position - vocab_rows.sort(key=lambda r: r.y_position) - - return vocab_rows - - -# ============================================================================= -# Stage 8: Optional LLM Post-Correction -# ============================================================================= - -async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow], - confidence_threshold: float = 50.0, - enabled: bool = False) -> List[VocabRow]: - """Optionally send low-confidence 
regions to Qwen-VL for correction. - - Default: disabled. Enable per parameter. - - Args: - img: Original BGR image. - vocab_rows: Current vocabulary rows. - confidence_threshold: Rows below this get LLM correction. - enabled: Whether to actually run LLM correction. - - Returns: - Corrected vocabulary rows. - """ - if not enabled: - return vocab_rows - - # TODO: Implement Qwen-VL correction for low-confidence entries - # For each row with confidence < threshold: - # 1. Crop the relevant region from img - # 2. Send crop + OCR text to Qwen-VL - # 3. Replace text if LLM provides a confident correction - logger.info(f"LLM post-correction skipped (not yet implemented)") - return vocab_rows - - -# ============================================================================= -# Orchestrator -# ============================================================================= - -async def run_cv_pipeline( - pdf_data: Optional[bytes] = None, - image_data: Optional[bytes] = None, - page_number: int = 0, - zoom: float = 3.0, - enable_dewarp: bool = True, - enable_llm_correction: bool = False, - lang: str = "eng+deu", -) -> PipelineResult: - """Run the complete CV document reconstruction pipeline. - - Args: - pdf_data: Raw PDF bytes (mutually exclusive with image_data). - image_data: Raw image bytes (mutually exclusive with pdf_data). - page_number: 0-indexed page number (for PDF). - zoom: PDF rendering zoom factor. - enable_dewarp: Whether to run dewarp stage. - enable_llm_correction: Whether to run LLM post-correction. - lang: Tesseract language string. - - Returns: - PipelineResult with vocabulary and timing info. 
- """ - if not CV_PIPELINE_AVAILABLE: - return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)") - - result = PipelineResult() - total_start = time.time() - - try: - # Stage 1: Render - t = time.time() - if pdf_data: - img = render_pdf_high_res(pdf_data, page_number, zoom) - elif image_data: - img = render_image_high_res(image_data) - else: - return PipelineResult(error="No input data (pdf_data or image_data required)") - result.stages['render'] = round(time.time() - t, 2) - result.image_width = img.shape[1] - result.image_height = img.shape[0] - logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s") - - # Stage 2: Deskew - t = time.time() - img, angle = deskew_image(img) - result.stages['deskew'] = round(time.time() - t, 2) - logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s") - - # Stage 3: Dewarp - if enable_dewarp: - t = time.time() - img, _dewarp_info = dewarp_image(img) - result.stages['dewarp'] = round(time.time() - t, 2) - - # Stage 4: Dual image preparation - t = time.time() - ocr_img = create_ocr_image(img) - layout_img = create_layout_image(img) - result.stages['image_prep'] = round(time.time() - t, 2) - - # Stage 5: Layout analysis - t = time.time() - regions = analyze_layout(layout_img, ocr_img) - result.stages['layout'] = round(time.time() - t, 2) - result.columns_detected = len([r for r in regions if r.type.startswith('column')]) - logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s") - - # Stage 6: Multi-pass OCR - t = time.time() - ocr_results = run_multi_pass_ocr(ocr_img, regions, lang) - result.stages['ocr'] = round(time.time() - t, 2) - total_words = sum(len(w) for w in ocr_results.values()) - result.word_count = total_words - logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s") - - # Stage 7: Line alignment - t = time.time() - vocab_rows = match_lines_to_vocab(ocr_results, regions) - 
result.stages['alignment'] = round(time.time() - t, 2) - - # Stage 8: Optional LLM correction - if enable_llm_correction: - t = time.time() - vocab_rows = await llm_post_correct(img, vocab_rows) - result.stages['llm_correction'] = round(time.time() - t, 2) - - # Convert to output format - result.vocabulary = [ - { - "english": row.english, - "german": row.german, - "example": row.example, - "confidence": round(row.confidence, 1), - } - for row in vocab_rows - if row.english or row.german # Skip empty rows - ] - - result.duration_seconds = round(time.time() - total_start, 2) - logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s") - - except Exception as e: - logger.error(f"CV Pipeline error: {e}") - import traceback - logger.debug(traceback.format_exc()) - result.error = str(e) - result.duration_seconds = round(time.time() - total_start, 2) - - return result - - -# --------------------------------------------------------------------------- -# LLM-based OCR Correction (Step 6) -# --------------------------------------------------------------------------- - -import httpx -import os -import json as _json -import re as _re - -_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") -OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b") -_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20")) -logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE) - -# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]" -_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]') - -# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion. -# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B) -# when they appear inside or next to a word character. 
-_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])') - - -def _entry_needs_review(entry: Dict) -> bool: - """Check if an entry should be sent to the LLM for review. - - Sends all non-empty entries that don't have IPA phonetic transcriptions. - The LLM prompt and _is_spurious_change() guard against unwanted changes. - """ - en = entry.get("english", "") or "" - de = entry.get("german", "") or "" - - # Skip completely empty entries - if not en.strip() and not de.strip(): - return False - # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them - if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): - return False - return True - - -def _build_llm_prompt(table_lines: List[Dict]) -> str: - """Build the LLM correction prompt for a batch of entries.""" - return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). - -DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. - -NUR diese Korrekturen sind erlaubt: -- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball" -- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old" -- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin" -- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See" -- Ziffer 6 statt G oder g: "6eld" → "Geld" -- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help" - -ABSOLUT VERBOTEN — aendere NIEMALS: -- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst -- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN -- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst -- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest -- Eigennamen: Ben, London, China, Africa, Shakespeare usw. -- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. 
-- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren -- Beispielsaetze in der ex-Spalte — NIEMALS aendern - -Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. - -Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. -Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). - -/no_think - -Eingabe: -{_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" - - -def _is_spurious_change(old_val: str, new_val: str) -> bool: - """Detect LLM changes that are likely wrong and should be discarded. - - Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are - legitimate OCR corrections. Everything else is rejected. - - Filters out: - - Case-only changes - - Changes that don't contain any digit→letter fix - - Completely different words (LLM translating or hallucinating) - - Additions or removals of whole words (count changed) - """ - if not old_val or not new_val: - return False - - # Case-only change — never a real OCR error - if old_val.lower() == new_val.lower(): - return True - - # If the word count changed significantly, the LLM rewrote rather than fixed - old_words = old_val.split() - new_words = new_val.split() - if abs(len(old_words) - len(new_words)) > 1: - return True - - # Core rule: a legitimate correction replaces a digit with the corresponding - # letter. If the change doesn't include such a substitution, reject it. - # Build a set of (old_char, new_char) pairs that differ between old and new. - # Use character-level diff heuristic: if lengths are close, zip and compare. 
- # Map of characters that OCR commonly misreads → set of correct replacements - _OCR_CHAR_MAP = { - # Digits mistaken for letters - '0': set('oOgG'), - '1': set('lLiI'), - '5': set('sS'), - '6': set('gG'), - '8': set('bB'), - # Non-letter symbols mistaken for letters - '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1 - 'l': set('iI|1'), # lowercase l → capital I (and reverse) - } - has_valid_fix = False - if len(old_val) == len(new_val): - for oc, nc in zip(old_val, new_val): - if oc != nc: - if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: - has_valid_fix = True - elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: - # Reverse check (e.g. l→I where new is the "correct" char) - has_valid_fix = True - else: - # Length changed by 1: accept if old had a suspicious char sequence - _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]') - if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): - has_valid_fix = True - - if not has_valid_fix: - return True # Reject — looks like translation or hallucination - - return False - - -def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: - """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" - changes = [] - entries_out = [] - for i, orig in enumerate(originals): - if i < len(corrected): - c = corrected[i] - entry = dict(orig) - for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: - new_val = c.get(key, "").strip() - old_val = (orig.get(field_name, "") or "").strip() - if new_val and new_val != old_val: - # Filter spurious LLM changes - if _is_spurious_change(old_val, new_val): - continue - changes.append({ - "row_index": orig.get("row_index", i), - "field": field_name, - "old": old_val, - "new": new_val, - }) - entry[field_name] = new_val - entry["llm_corrected"] = True - entries_out.append(entry) - else: - entries_out.append(dict(orig)) - return changes, entries_out - - -# ─── 
Spell-Checker OCR Review (Rule-Based, no LLM) ──────────────────────────── - -REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm" - -try: - from spellchecker import SpellChecker as _SpellChecker - _en_spell = _SpellChecker(language='en', distance=1) - _de_spell = _SpellChecker(language='de', distance=1) - _SPELL_AVAILABLE = True - logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE) -except ImportError: - _SPELL_AVAILABLE = False - logger.warning("pyspellchecker not installed — falling back to LLM review") - -# ─── Page-Ref Normalization ─────────────────────────────────────────────────── -# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60" -_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE) - - -def _normalize_page_ref(text: str) -> str: - """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'.""" - if not text: - return text - return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text) - - -# Suspicious OCR chars → ordered list of most-likely correct replacements -_SPELL_SUBS: Dict[str, List[str]] = { - '0': ['O', 'o'], - '1': ['l', 'I'], - '5': ['S', 's'], - '6': ['G', 'g'], - '8': ['B', 'b'], - '|': ['I', 'l', '1'], -} -_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys()) - -# Tokenizer: word tokens (letters + pipe) alternating with separators -_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)') - - -def _spell_dict_knows(word: str) -> bool: - """True if word is known in EN or DE dictionary.""" - if not _SPELL_AVAILABLE: - return False - w = word.lower() - return bool(_en_spell.known([w])) or bool(_de_spell.known([w])) - - -def _spell_fix_token(token: str, field: str = "") -> Optional[str]: - """Return corrected form of token, or None if no fix needed/possible. - - *field* is 'english' or 'german' — used to pick the right dictionary - for general spell correction (step 3 below). - """ - has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token) - - # 1. 
Already known word → no fix needed - if _spell_dict_knows(token): - return None - - # 2. Digit/pipe substitution (existing logic) - if has_suspicious: - # Standalone pipe → capital I - if token == '|': - return 'I' - # Dictionary-backed single-char substitution - for i, ch in enumerate(token): - if ch not in _SPELL_SUBS: - continue - for replacement in _SPELL_SUBS[ch]: - candidate = token[:i] + replacement + token[i + 1:] - if _spell_dict_knows(candidate): - return candidate - # Structural rule: suspicious char at position 0 + rest is all lowercase letters - first = token[0] - if first in _SPELL_SUBS and len(token) >= 2: - rest = token[1:] - if rest.isalpha() and rest.islower(): - candidate = _SPELL_SUBS[first][0] + rest - if not candidate[0].isdigit(): - return candidate - - # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u) - # Try single-char umlaut substitutions and check against dictionary. - if len(token) >= 3 and token.isalpha() and field == "german": - _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü', - 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'} - for i, ch in enumerate(token): - if ch in _UMLAUT_SUBS: - candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:] - if _spell_dict_knows(candidate): - return candidate - - # 4. General spell correction for unknown words (no digits/pipes) - # e.g. "beautful" → "beautiful" - if not has_suspicious and len(token) >= 3 and token.isalpha(): - spell = _en_spell if field == "english" else _de_spell if field == "german" else None - if spell is not None: - correction = spell.correction(token.lower()) - if correction and correction != token.lower(): - # Preserve original capitalisation pattern - if token[0].isupper(): - correction = correction[0].upper() + correction[1:] - if _spell_dict_knows(correction): - return correction - return None - - -def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]: - """Apply OCR corrections to a text field. Returns (fixed_text, was_changed). 
- - *field* is 'english' or 'german' — forwarded to _spell_fix_token for - dictionary selection. - """ - if not text: - return text, False - has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS) - # If no suspicious chars AND no alpha chars that could be misspelled, skip - if not has_suspicious and not any(c.isalpha() for c in text): - return text, False - # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ") - fixed = _re.sub(r'(? Dict: - """Rule-based OCR correction: spell-checker + structural heuristics. - - Deterministic — never translates, never touches IPA, never hallucinates. - """ - t0 = time.time() - changes: List[Dict] = [] - all_corrected: List[Dict] = [] - for i, entry in enumerate(entries): - e = dict(entry) - # Page-ref normalization (always, regardless of review status) - old_ref = (e.get("source_page") or "").strip() - if old_ref: - new_ref = _normalize_page_ref(old_ref) - if new_ref != old_ref: - changes.append({ - "row_index": e.get("row_index", i), - "field": "source_page", - "old": old_ref, - "new": new_ref, - }) - e["source_page"] = new_ref - e["llm_corrected"] = True - if not _entry_needs_review(e): - all_corrected.append(e) - continue - for field_name in ("english", "german", "example"): - old_val = (e.get(field_name) or "").strip() - if not old_val: - continue - # example field is mixed-language — try German first (for umlauts) - lang = "german" if field_name in ("german", "example") else "english" - new_val, was_changed = _spell_fix_field(old_val, field=lang) - if was_changed and new_val != old_val: - changes.append({ - "row_index": e.get("row_index", i), - "field": field_name, - "old": old_val, - "new": new_val, - }) - e[field_name] = new_val - e["llm_corrected"] = True - all_corrected.append(e) - duration_ms = int((time.time() - t0) * 1000) - return { - "entries_original": entries, - "entries_corrected": all_corrected, - "changes": changes, - "skipped_count": 0, - "model_used": "spell-checker", - 
"duration_ms": duration_ms, - } - - -async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50): - """Async generator yielding SSE-compatible events for spell-checker review.""" - total = len(entries) - yield { - "type": "meta", - "total_entries": total, - "to_review": total, - "skipped": 0, - "model": "spell-checker", - "batch_size": batch_size, - } - result = spell_review_entries_sync(entries) - changes = result["changes"] - yield { - "type": "batch", - "batch_index": 0, - "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)], - "changes": changes, - "duration_ms": result["duration_ms"], - "progress": {"current": total, "total": total}, - } - yield { - "type": "complete", - "changes": changes, - "model_used": "spell-checker", - "duration_ms": result["duration_ms"], - "total_entries": total, - "reviewed": total, - "skipped": 0, - "corrections_found": len(changes), - "entries_corrected": result["entries_corrected"], - } - -# ─── End Spell-Checker ──────────────────────────────────────────────────────── - - -async def llm_review_entries( - entries: List[Dict], - model: str = None, -) -> Dict: - """OCR error correction. 
Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).""" - if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: - return spell_review_entries_sync(entries) - if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: - logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") - - model = model or OLLAMA_REVIEW_MODEL - - # Filter: only entries that need review - reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)] - - if not reviewable: - return { - "entries_original": entries, - "entries_corrected": [dict(e) for e in entries], - "changes": [], - "skipped_count": len(entries), - "model_used": model, - "duration_ms": 0, - } - - review_entries = [e for _, e in reviewable] - table_lines = [ - {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} - for e in review_entries - ] - - logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)", - len(review_entries), len(entries), model, len(entries) - len(reviewable)) - logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False)) - - prompt = _build_llm_prompt(table_lines) - - t0 = time.time() - async with httpx.AsyncClient(timeout=300.0) as client: - resp = await client.post( - f"{_OLLAMA_URL}/api/chat", - json={ - "model": model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - "think": False, # qwen3: disable chain-of-thought (Ollama >=0.6) - "options": {"temperature": 0.1, "num_predict": 8192}, - }, - ) - resp.raise_for_status() - content = resp.json().get("message", {}).get("content", "") - duration_ms = int((time.time() - t0) * 1000) - - logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content)) - logger.debug("LLM review raw response (first 500): %.500s", content) - - corrected = _parse_llm_json_array(content) - logger.info("LLM review: parsed %d corrected entries, applying diff...", 
len(corrected)) - changes, corrected_entries = _diff_batch(review_entries, corrected) - - # Merge corrected entries back into the full list - all_corrected = [dict(e) for e in entries] - for batch_idx, (orig_idx, _) in enumerate(reviewable): - if batch_idx < len(corrected_entries): - all_corrected[orig_idx] = corrected_entries[batch_idx] - - return { - "entries_original": entries, - "entries_corrected": all_corrected, - "changes": changes, - "skipped_count": len(entries) - len(reviewable), - "model_used": model, - "duration_ms": duration_ms, - } - - -async def llm_review_entries_streaming( - entries: List[Dict], - model: str = None, - batch_size: int = _REVIEW_BATCH_SIZE, -): - """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE. - - Phase 0 (always): Run _fix_character_confusion and emit any changes so they are - visible in the UI — this is the only place the fix now runs (removed from Step 1 - of build_vocab_pipeline_streaming). - """ - # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) 
--- - _CONF_FIELDS = ('english', 'german', 'example') - originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries] - _fix_character_confusion(entries) # modifies in-place, returns same list - char_changes = [ - {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')} - for i in range(len(entries)) - for f in _CONF_FIELDS - if originals[i][f] != entries[i].get(f, '') - ] - - if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: - # Inject char_changes as a batch right after the meta event from the spell checker - _meta_sent = False - async for event in spell_review_entries_streaming(entries, batch_size): - yield event - if not _meta_sent and event.get('type') == 'meta' and char_changes: - _meta_sent = True - yield { - 'type': 'batch', - 'changes': char_changes, - 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), - 'progress': {'current': 0, 'total': len(entries)}, - } - return - - if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: - logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") - - # LLM path: emit char_changes first (before meta) so they appear in the UI - if char_changes: - yield { - 'type': 'batch', - 'changes': char_changes, - 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), - 'progress': {'current': 0, 'total': len(entries)}, - } - - model = model or OLLAMA_REVIEW_MODEL - - # Separate reviewable from skipped entries - reviewable = [] - skipped_indices = [] - for i, e in enumerate(entries): - if _entry_needs_review(e): - reviewable.append((i, e)) - else: - skipped_indices.append(i) - - total_to_review = len(reviewable) - - # meta event - yield { - "type": "meta", - "total_entries": len(entries), - "to_review": total_to_review, - "skipped": len(skipped_indices), - "model": model, - "batch_size": batch_size, - } - - all_changes = [] - all_corrected = [dict(e) for e in entries] - total_duration_ms = 0 - reviewed_count = 0 - - # Process in batches - for 
batch_start in range(0, total_to_review, batch_size): - batch_items = reviewable[batch_start:batch_start + batch_size] - batch_entries = [e for _, e in batch_items] - - table_lines = [ - {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} - for e in batch_entries - ] - - prompt = _build_llm_prompt(table_lines) - - logger.info("LLM review streaming: batch %d — sending %d entries to %s", - batch_start // batch_size, len(batch_entries), model) - - t0 = time.time() - async with httpx.AsyncClient(timeout=300.0) as client: - resp = await client.post( - f"{_OLLAMA_URL}/api/chat", - json={ - "model": model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - "think": False, # qwen3: disable chain-of-thought - "options": {"temperature": 0.1, "num_predict": 8192}, - }, - ) - resp.raise_for_status() - content = resp.json().get("message", {}).get("content", "") - batch_ms = int((time.time() - t0) * 1000) - total_duration_ms += batch_ms - - logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content)) - logger.debug("LLM review streaming raw (first 500): %.500s", content) - - corrected = _parse_llm_json_array(content) - logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected)) - batch_changes, batch_corrected = _diff_batch(batch_entries, corrected) - - # Merge back - for batch_idx, (orig_idx, _) in enumerate(batch_items): - if batch_idx < len(batch_corrected): - all_corrected[orig_idx] = batch_corrected[batch_idx] - - all_changes.extend(batch_changes) - reviewed_count += len(batch_items) - - # Yield batch result - yield { - "type": "batch", - "batch_index": batch_start // batch_size, - "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items], - "changes": batch_changes, - "duration_ms": batch_ms, - "progress": {"current": reviewed_count, "total": total_to_review}, - } - - # Complete event - yield { - "type": "complete", - 
"changes": all_changes, - "model_used": model, - "duration_ms": total_duration_ms, - "total_entries": len(entries), - "reviewed": total_to_review, - "skipped": len(skipped_indices), - "corrections_found": len(all_changes), - "entries_corrected": all_corrected, - } - - -def _sanitize_for_json(text: str) -> str: - """Remove or escape control characters that break JSON parsing. - - Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid - JSON whitespace. Removes all other ASCII control characters (0x00-0x1f) - that are only valid inside JSON strings when properly escaped. - """ - # Replace literal control chars (except \\t \\n \\r) with a space - return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text) - - -def _parse_llm_json_array(text: str) -> List[Dict]: - """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags).""" - # Strip qwen3 ... blocks (present even with think=False on some builds) - text = _re.sub(r'.*?', '', text, flags=_re.DOTALL) - # Strip markdown code fences - text = _re.sub(r'```json\s*', '', text) - text = _re.sub(r'```\s*', '', text) - # Sanitize control characters before JSON parsing - text = _sanitize_for_json(text) - # Find first [ ... last ] - match = _re.search(r'\[.*\]', text, _re.DOTALL) - if match: - try: - return _json.loads(match.group()) - except (ValueError, _json.JSONDecodeError) as e: - logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200]) - else: - logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200]) - return [] +from cv_cell_grid import _cells_to_vocab_entries # noqa: F401 diff --git a/klausur-service/backend/cv_vocab_types.py b/klausur-service/backend/cv_vocab_types.py new file mode 100644 index 0000000..74a6b9c --- /dev/null +++ b/klausur-service/backend/cv_vocab_types.py @@ -0,0 +1,156 @@ +""" +Shared types, constants, and availability guards for the CV vocabulary pipeline. 
+ +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import json +import logging +import os +import re # noqa: F401 — re-exported for downstream modules +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import numpy as np # noqa: F401 + +logger = logging.getLogger(__name__) + +# --- Availability Guards --- + +try: + import cv2 # noqa: F401 + CV2_AVAILABLE = True +except ImportError: + cv2 = None # type: ignore[assignment] + CV2_AVAILABLE = False + logger.warning("OpenCV not available — CV pipeline disabled") + +try: + import pytesseract # noqa: F401 + from PIL import Image # noqa: F401 + TESSERACT_AVAILABLE = True +except ImportError: + pytesseract = None # type: ignore[assignment] + Image = None # type: ignore[assignment,misc] + TESSERACT_AVAILABLE = False + logger.warning("pytesseract/Pillow not available — CV pipeline disabled") + +CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE + +# --- IPA Dictionary --- + +IPA_AVAILABLE = False +_ipa_convert_american = None +_britfone_dict: Dict[str, str] = {} + +try: + import eng_to_ipa as _eng_to_ipa + _ipa_convert_american = _eng_to_ipa.convert + IPA_AVAILABLE = True + logger.info("eng_to_ipa available — American IPA lookup enabled") +except ImportError: + logger.info("eng_to_ipa not installed — American IPA disabled") + +# Load Britfone dictionary (MIT license, ~15k British English IPA entries) +_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json') +if os.path.exists(_britfone_path): + try: + with open(_britfone_path, 'r', encoding='utf-8') as f: + _britfone_dict = json.load(f) + IPA_AVAILABLE = True + logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries") + except Exception as e: + logger.warning(f"Failed to load Britfone: {e}") +else: + logger.info("Britfone not found — British IPA disabled") + +# --- Language Detection Constants --- + +GERMAN_FUNCTION_WORDS = {'der', 
'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht', + 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird', + 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur', + 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben', + 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'} + +ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of', + 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from', + 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', + 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he', + 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'} + + +# --- Data Classes --- + +@dataclass +class PageRegion: + """A detected region on the page.""" + type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom' + x: int + y: int + width: int + height: int + classification_confidence: float = 1.0 # 0.0-1.0 + classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback' + + +@dataclass +class ColumnGeometry: + """Geometrisch erkannte Spalte vor Typ-Klassifikation.""" + index: int # 0-basiert, links->rechts + x: int + y: int + width: int + height: int + word_count: int + words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) 
+ width_ratio: float # width / content_width (0.0-1.0) + is_sub_column: bool = False # True if created by _detect_sub_columns() split + + +@dataclass +class RowGeometry: + """Geometrisch erkannte Zeile mit Kopf-/Fusszeilen-Klassifikation.""" + index: int # 0-basiert, oben→unten + x: int # absolute left (= content left_x) + y: int # absolute y start + width: int # content width + height: int # Zeilenhoehe in px + word_count: int + words: List[Dict] + row_type: str = 'content' # 'content' | 'header' | 'footer' + gap_before: int = 0 # Gap in px ueber dieser Zeile + + +@dataclass +class VocabRow: + """A single vocabulary entry assembled from multi-column OCR.""" + english: str = "" + german: str = "" + example: str = "" + source_page: str = "" + confidence: float = 0.0 + y_position: int = 0 + + +@dataclass +class PipelineResult: + """Complete result of the CV pipeline.""" + vocabulary: List[Dict[str, Any]] = field(default_factory=list) + word_count: int = 0 + columns_detected: int = 0 + duration_seconds: float = 0.0 + stages: Dict[str, float] = field(default_factory=dict) + error: Optional[str] = None + image_width: int = 0 + image_height: int = 0 + + +@dataclass +class DocumentTypeResult: + """Result of automatic document type detection.""" + doc_type: str # 'vocab_table' | 'full_text' | 'generic_table' + confidence: float # 0.0-1.0 + pipeline: str # 'cell_first' | 'full_page' + skip_steps: List[str] = field(default_factory=list) # e.g. ['columns', 'rows'] + features: Dict[str, Any] = field(default_factory=dict) # debug info