Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletions
@@ -0,0 +1,498 @@
+"""
+Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
+Extracted from cv_cell_grid.py.
+Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+from cv_ocr_engines import (
+    RAPIDOCR_AVAILABLE,
+    _assign_row_words_to_columns,
+    _clean_cell_text,
+    _clean_cell_text_lite,
+    _words_to_reading_order_text,
+    _words_to_spaced_text,
+    ocr_region,
+    ocr_region_lighton,
+    ocr_region_rapid,
+    ocr_region_trocr,
+)
+from cv_cell_grid_helpers import (
+    _MIN_WORD_CONF,
+    _ensure_minimum_crop_size,
+    _heal_row_gaps,
+    _is_artifact_row,
+    _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+
+# ---------------------------------------------------------------------------
+# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
+# ---------------------------------------------------------------------------
+
+def _tesseract_words_in_crop(
+    ocr_img: np.ndarray,
+    cx: int,
+    cy: int,
+    cw: int,
+    ch: int,
+    col_type: str,
+    lang: str,
+    psm: int,
+) -> Tuple[List[Dict[str, Any]], int, int]:
+    """Run Tesseract on one cell crop, upscaling tiny crops first.
+
+    Args:
+        ocr_img: Page image the crop is sliced from.
+        cx, cy, cw, ch: Crop rectangle in image pixels.
+        col_type: Column type forwarded to the temporary PageRegion.
+        lang: Tesseract language string.
+        psm: Tesseract page segmentation mode.
+
+    Returns:
+        ``(words, up_w, up_h)``. Word coordinates are relative to the
+        (possibly upscaled) crop of size ``up_w x up_h``; mapping them back
+        to image coordinates is left to the caller.
+    """
+    crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
+    if crop_slice.size == 0:
+        # The slice comes back empty when ocr_img is smaller than the
+        # img_w/img_h used for clamping. Mirror the RapidOCR branch and
+        # report "no words" instead of handing an empty array to the resizer.
+        return [], cw, ch
+    upscaled = _ensure_minimum_crop_size(crop_slice)
+    up_h, up_w = upscaled.shape[:2]
+    tmp_region = PageRegion(type=col_type, x=0, y=0, width=up_w, height=up_h)
+    words = ocr_region(upscaled, tmp_region, lang=lang, psm=psm)
+    return words, up_w, up_h
+
+
+def _remap_crop_words_to_image(
+    words: List[Dict[str, Any]],
+    up_w: int,
+    up_h: int,
+    cx: int,
+    cy: int,
+    cw: int,
+    ch: int,
+) -> None:
+    """Convert word boxes from crop space to absolute image coords, in place.
+
+    When the crop was upscaled (``up_w x up_h`` differs from ``cw x ch``) the
+    boxes are scaled back down before the crop origin ``(cx, cy)`` is added;
+    otherwise only the origin offset is applied.
+    """
+    if up_w != cw or up_h != ch:
+        sx = cw / max(up_w, 1)
+        sy = ch / max(up_h, 1)
+        for w in words:
+            w['left'] = int(w['left'] * sx) + cx
+            w['top'] = int(w['top'] * sy) + cy
+            w['width'] = int(w['width'] * sx)
+            w['height'] = int(w['height'] * sy)
+    else:
+        for w in words:
+            w['left'] += cx
+            w['top'] += cy
+
+
+def _ocr_cell_crop(
+    row_idx: int,
+    col_idx: int,
+    row: RowGeometry,
+    col: PageRegion,
+    ocr_img: np.ndarray,
+    img_bgr: Optional[np.ndarray],
+    img_w: int,
+    img_h: int,
+    engine_name: str,
+    lang: str,
+    lang_map: Dict[str, str],
+) -> Dict[str, Any]:
+    """OCR a single cell by cropping the column x row intersection.
+
+    The reported bounding box is the exact intersection. The OCR crop itself
+    is grown by 3 px on every side (clamped to the image) so glyphs touching
+    the cell border are not clipped.
+
+    Args:
+        row_idx: Row index used for the cell id and ``row_index``.
+        col_idx: Column index used for the cell id and ``col_index``.
+        row: Row geometry; ``y`` and ``height`` are read.
+        col: Column region; ``x``, ``width`` and ``type`` are read.
+        ocr_img: Image used for the dark-pixel check and for Tesseract.
+            May be None, in which case both are skipped.
+        img_bgr: Colour image required by the TrOCR, LightOn and RapidOCR
+            engines. When None those engines fall back to Tesseract.
+        img_w: Image width used for clamping and percentage boxes.
+        img_h: Image height used for clamping and percentage boxes.
+        engine_name: One of "tesseract", "rapid", "lighton",
+            "trocr-printed", "trocr-handwritten".
+        lang: Default Tesseract language string.
+        lang_map: Per-column-type language override.
+
+    Returns:
+        A cell dict with ``cell_id``, ``row_index``, ``col_index``,
+        ``col_type``, ``text``, ``confidence``, ``bbox_px``, ``bbox_pct``,
+        ``ocr_engine`` and ``is_bold``. ``word_boxes`` (absolute image
+        coordinates) is added when the cell ends up with text.
+    """
+    # Display bbox: exact column x row intersection
+    disp_x = col.x
+    disp_y = row.y
+    disp_w = col.width
+    disp_h = row.height
+
+    # Crop boundaries: small padding (3px each side) to avoid clipping
+    # characters near column/row edges (e.g. parentheses, descenders).
+    # Stays within image bounds but may extend slightly beyond the strict cell.
+    _PAD = 3
+    cx = max(0, disp_x - _PAD)
+    cy = max(0, disp_y - _PAD)
+    cx2 = min(img_w, disp_x + disp_w + _PAD)
+    cy2 = min(img_h, disp_y + disp_h + _PAD)
+    cw = cx2 - cx
+    ch = cy2 - cy
+
+    empty_cell = {
+        'cell_id': f"R{row_idx:02d}_C{col_idx}",
+        'row_index': row_idx,
+        'col_index': col_idx,
+        'col_type': col.type,
+        'text': '',
+        'confidence': 0.0,
+        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
+        'bbox_pct': {
+            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
+            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
+            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
+            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
+        },
+        'ocr_engine': 'cell_crop_v2',
+        'is_bold': False,
+    }
+
+    if cw <= 0 or ch <= 0:
+        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
+        return empty_cell
+
+    # --- Pixel-density check: skip truly empty cells ---
+    if ocr_img is not None:
+        crop = ocr_img[cy:cy + ch, cx:cx + cw]
+        if crop.size > 0:
+            # Pixels darker than 180 count as ink; below 0.5% the cell is blank.
+            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
+            if dark_ratio < 0.005:
+                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
+                            row_idx, col_idx, dark_ratio, cw, ch)
+                return empty_cell
+
+    # --- Run the selected engine on the crop ---
+    cell_lang = lang_map.get(col.type, lang)
+    psm = _select_psm_for_column(col.type, col.width, row.height)
+    text = ''
+    avg_conf = 0.0
+    used_engine = 'cell_crop_v2'
+
+    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
+        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
+        words = ocr_region_trocr(img_bgr, cell_region,
+                                 handwritten=(engine_name == "trocr-handwritten"))
+    elif engine_name == "lighton" and img_bgr is not None:
+        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
+        words = ocr_region_lighton(img_bgr, cell_region)
+    elif engine_name == "rapid" and img_bgr is not None:
+        # Upscale small BGR crops for RapidOCR.
+        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
+        if bgr_crop.size == 0:
+            words = []
+        else:
+            crop_h, crop_w = bgr_crop.shape[:2]
+            if crop_h < 80:
+                # Force 3x upscale for short rows — small chars need more pixels
+                scale = 3.0
+                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
+                                    interpolation=cv2.INTER_CUBIC)
+            else:
+                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
+            up_h, up_w = bgr_up.shape[:2]
+            scale_x = up_w / max(crop_w, 1)
+            scale_y = up_h / max(crop_h, 1)
+            was_scaled = (up_w != crop_w or up_h != crop_h)
+            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
+                        row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
+            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
+            words = ocr_region_rapid(bgr_up, tmp_region)
+            # Remap positions back to original image coords
+            if words and was_scaled:
+                for w in words:
+                    w['left'] = int(w['left'] / scale_x) + cx
+                    w['top'] = int(w['top'] / scale_y) + cy
+                    w['width'] = int(w['width'] / scale_x)
+                    w['height'] = int(w['height'] / scale_y)
+            elif words:
+                for w in words:
+                    w['left'] += cx
+                    w['top'] += cy
+    elif ocr_img is not None:
+        # Tesseract (also the fallback when the requested engine has no BGR image)
+        words, up_w, up_h = _tesseract_words_in_crop(
+            ocr_img, cx, cy, cw, ch, col.type, cell_lang, psm,
+        )
+        if words:
+            _remap_crop_words_to_image(words, up_w, up_h, cx, cy, cw, ch)
+    else:
+        words = []
+
+    # Filter low-confidence words
+    if words:
+        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
+
+    if words:
+        y_tol = max(15, ch)
+        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
+                    row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
+    else:
+        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
+                    row_idx, col_idx, cw, ch, psm, engine_name)
+
+    # --- PSM 7 fallback for still-empty Tesseract cells ---
+    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
+        psm7_words, up_w, up_h = _tesseract_words_in_crop(
+            ocr_img, cx, cy, cw, ch, col.type, cell_lang, 7,
+        )
+        if psm7_words:
+            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+        if psm7_words:
+            # The text is assembled from crop-space coordinates here; the
+            # boxes are only remapped once the fallback result is accepted.
+            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
+            if p7_text.strip():
+                text = p7_text
+                avg_conf = round(
+                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
+                )
+                used_engine = 'cell_crop_v2_psm7'
+                _remap_crop_words_to_image(psm7_words, up_w, up_h, cx, cy, cw, ch)
+                words = psm7_words
+
+    # --- Noise filter ---
+    if text.strip():
+        pre_filter = text
+        text = _clean_cell_text_lite(text)
+        if not text:
+            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
+                        row_idx, col_idx, pre_filter)
+            avg_conf = 0.0
+
+    result = dict(empty_cell)
+    result['text'] = text
+    result['confidence'] = avg_conf
+    result['ocr_engine'] = used_engine
+
+    # Store individual word bounding boxes (absolute image coordinates)
+    # for pixel-accurate overlay positioning in the frontend.
+    if words and text.strip():
+        result['word_boxes'] = [
+            {
+                'text': w.get('text', ''),
+                'left': w['left'],
+                'top': w['top'],
+                'width': w['width'],
+                'height': w['height'],
+                'conf': w.get('conf', 0),
+            }
+            for w in words
+            if w.get('text', '').strip()
+        ]
+
+    return result
+
+
+# Width threshold, expressed as a percentage of the image width.
+# build_cell_grid_v2 treats a column as "narrow" when its width share is
+# strictly below this value and OCRs it cell by cell; columns at or above
+# the threshold reuse the pre-assigned full-page words instead.
+_NARROW_COL_THRESHOLD_PCT = 15.0
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2 — hybrid grid builder (current default)
+# ---------------------------------------------------------------------------
+
+def _word_lookup_cell(
+    row_idx: int,
+    col_idx: int,
+    row: RowGeometry,
+    col: PageRegion,
+    words: List[Dict[str, Any]],
+    img_w: int,
+    img_h: int,
+    preserve_spacing: bool,
+) -> Dict[str, Any]:
+    """Build one broad-column cell from pre-assigned full-page words.
+
+    Args:
+        row_idx: Row index used for the cell id and ``row_index``.
+        col_idx: Column index used for the cell id and ``col_index``.
+        row: Row geometry supplying ``y``, ``height`` and ``word_count``.
+        col: Column region supplying ``x``, ``width`` and ``type``.
+        words: Words already assigned to this row/column intersection.
+        img_w: Image width for the percentage bounding box.
+        img_h: Image height for the percentage bounding box.
+        preserve_spacing: True for the single full-width column case; the
+            text is then built with preserved spacing and the noise filter
+            is skipped.
+
+    Returns:
+        The cell dict, with ``word_boxes`` added when the cell has text.
+    """
+    # Filter low-confidence words
+    words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
+
+    if words:
+        y_tol = max(15, row.height)
+        if preserve_spacing:
+            text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
+            logger.info(f"R{row_idx:02d}: {len(words)} words, "
+                        f"text={text!r:.100}")
+        else:
+            text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+    else:
+        text = ''
+        avg_conf = 0.0
+        if preserve_spacing:
+            logger.info(f"R{row_idx:02d}: 0 words (row has "
+                        f"{row.word_count} total, y={row.y}..{row.y+row.height})")
+
+    # Apply noise filter -- but NOT for single-column sub-sessions
+    if not preserve_spacing:
+        text = _clean_cell_text(text)
+
+    cell: Dict[str, Any] = {
+        'cell_id': f"R{row_idx:02d}_C{col_idx}",
+        'row_index': row_idx,
+        'col_index': col_idx,
+        'col_type': col.type,
+        'text': text,
+        'confidence': avg_conf,
+        'bbox_px': {
+            'x': col.x, 'y': row.y,
+            'w': col.width, 'h': row.height,
+        },
+        'bbox_pct': {
+            'x': round(col.x / img_w * 100, 2) if img_w else 0,
+            'y': round(row.y / img_h * 100, 2) if img_h else 0,
+            'w': round(col.width / img_w * 100, 2) if img_w else 0,
+            'h': round(row.height / img_h * 100, 2) if img_h else 0,
+        },
+        'ocr_engine': 'word_lookup',
+        'is_bold': False,
+    }
+    # Store word bounding boxes for pixel-accurate overlay
+    if words and text.strip():
+        cell['word_boxes'] = [
+            {
+                'text': w.get('text', ''),
+                'left': w['left'],
+                'top': w['top'],
+                'width': w['width'],
+                'height': w['height'],
+                'conf': w.get('conf', 0),
+            }
+            for w in words
+            if w.get('text', '').strip()
+        ]
+    return cell
+
+
+def build_cell_grid_v2(
+    ocr_img: np.ndarray,
+    column_regions: List[PageRegion],
+    row_geometries: List[RowGeometry],
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
+    skip_heal_gaps: bool = False,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Hybrid grid: full-page words for broad columns, cell-crop OCR for narrow ones.
+
+    Strategy:
+    - Broad columns (width share >= _NARROW_COL_THRESHOLD_PCT of the image
+      width): reuse the full-page words already attached to each row.
+    - Narrow columns (width share below the threshold): OCR each cell in
+      isolation via _ocr_cell_crop, run in a thread pool, so content from
+      adjacent broad columns does not bleed in.
+
+    Args:
+        ocr_img: Image handed to the cell-crop OCR.
+        column_regions: Detected columns; header/footer/margin/ignore types
+            are dropped.
+        row_geometries: Detected rows. Only rows of type 'content' with at
+            least one word that are not artifact rows become grid rows;
+            'header' and 'footer' rows only provide the healing bounds.
+        img_w: Image width in pixels.
+        img_h: Image height in pixels.
+        lang: Default Tesseract language for column types without an
+            entry in the internal language map.
+        ocr_engine: Requested engine. "trocr-printed", "trocr-handwritten"
+            and "lighton" are used as given, "rapid" only when RapidOCR is
+            available; anything else selects Tesseract.
+        img_bgr: Colour image forwarded to the cell-crop OCR.
+        skip_heal_gaps: When True, rows keep their exact geometry instead of
+            being expanded to fill gaps left by removed rows.
+
+    Returns:
+        ``(cells, columns_meta)``. ``cells`` is sorted by
+        ``(row_index, col_index)`` and contains no rows in which every cell
+        is empty. ``columns_meta`` describes the kept columns in
+        left-to-right order.
+
+    Note:
+        Unless ``skip_heal_gaps`` is set, ``y`` and ``height`` of the
+        surviving RowGeometry objects in ``row_geometries`` are modified in
+        place by the gap healing.
+    """
+    engine_name = "tesseract"
+    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
+        engine_name = ocr_engine
+    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
+        engine_name = "rapid"
+
+    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
+
+    # Filter to content rows only
+    content_rows = [r for r in row_geometries if r.row_type == 'content']
+    if not content_rows:
+        logger.warning("build_cell_grid_v2: no content rows found")
+        return [], []
+
+    # Filter phantom rows (word_count=0) and artifact rows
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        logger.warning("build_cell_grid_v2: no content rows with words found")
+        return [], []
+
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
+    if not content_rows:
+        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
+        return [], []
+
+    # Filter columns
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
+                   'margin_bottom', 'margin_left', 'margin_right'}
+    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
+    if not relevant_cols:
+        logger.warning("build_cell_grid_v2: no usable columns found")
+        return [], []
+
+    # Heal row gaps -- use header/footer boundaries
+    content_rows.sort(key=lambda r: r.y)
+    header_rows = [r for r in row_geometries if r.row_type == 'header']
+    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
+    if header_rows:
+        top_bound = max(r.y + r.height for r in header_rows)
+    else:
+        top_bound = content_rows[0].y
+    if footer_rows:
+        bottom_bound = min(r.y for r in footer_rows)
+    else:
+        bottom_bound = content_rows[-1].y + content_rows[-1].height
+
+    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
+    # positions without expanding to fill gaps from removed rows.
+    if not skip_heal_gaps:
+        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
+
+    relevant_cols.sort(key=lambda c: c.x)
+
+    columns_meta = [
+        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
+        for ci, c in enumerate(relevant_cols)
+    ]
+
+    lang_map = {
+        'column_en': 'eng',
+        'column_de': 'deu',
+        'column_example': 'eng+deu',
+    }
+
+    # --- Classify columns as broad vs narrow ---
+    narrow_col_indices = set()
+    for ci, col in enumerate(relevant_cols):
+        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
+        if col_pct < _NARROW_COL_THRESHOLD_PCT:
+            narrow_col_indices.add(ci)
+
+    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
+    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
+                f"{len(narrow_col_indices)} narrow columns (cell-crop)")
+
+    # Single full-width column (box sub-session): preserve spacing.
+    # Depends only on the column layout, so it is computed once here rather
+    # than once per cell.
+    is_single_full_column = (
+        len(relevant_cols) == 1
+        and img_w > 0
+        and relevant_cols[0].width / img_w > 0.9
+    )
+
+    # --- Phase 1: Broad columns via full-page word assignment ---
+    cells: List[Dict[str, Any]] = []
+
+    for row_idx, row in enumerate(content_rows):
+        # Assign full-page words to columns for this row
+        col_words = _assign_row_words_to_columns(row, relevant_cols)
+
+        for col_idx, col in enumerate(relevant_cols):
+            if col_idx in narrow_col_indices:
+                continue
+            cells.append(_word_lookup_cell(
+                row_idx, col_idx, row, col,
+                col_words.get(col_idx, []),
+                img_w, img_h,
+                is_single_full_column,
+            ))
+
+    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
+    narrow_tasks = [
+        (row_idx, col_idx, row, col)
+        for row_idx, row in enumerate(content_rows)
+        for col_idx, col in enumerate(relevant_cols)
+        if col_idx in narrow_col_indices
+    ]
+
+    if narrow_tasks:
+        max_workers = 4 if engine_name == "tesseract" else 2
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            futures = {
+                pool.submit(
+                    _ocr_cell_crop,
+                    ri, ci, row, col,
+                    ocr_img, img_bgr, img_w, img_h,
+                    engine_name, lang, lang_map,
+                ): (ri, ci)
+                for ri, ci, row, col in narrow_tasks
+            }
+            for future in as_completed(futures):
+                try:
+                    cell = future.result()
+                    cells.append(cell)
+                except Exception as e:
+                    # A failed cell is dropped from the grid; keep the
+                    # traceback so the root cause stays diagnosable.
+                    ri, ci = futures[future]
+                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}",
+                                 exc_info=True)
+
+    # Sort cells by (row_index, col_index)
+    cells.sort(key=lambda c: (c['row_index'], c['col_index']))
+
+    # Remove all-empty rows
+    rows_with_text: set = set()
+    rows_seen: set = set()
+    for cell in cells:
+        rows_seen.add(cell['row_index'])
+        if cell['text'].strip():
+            rows_with_text.add(cell['row_index'])
+    cells = [c for c in cells if c['row_index'] in rows_with_text]
+    # Count rows directly: dividing the dropped-cell count by the column
+    # count is off as soon as a failed narrow cell is missing from a row.
+    empty_rows_removed = len(rows_seen - rows_with_text)
+    if empty_rows_removed > 0:
+        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
+
+    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
+                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
+                f"engine={engine_name} (hybrid)")
+
+    return cells, columns_meta
@@ -0,0 +1,60 @@
+"""
+Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
+
+Re-export hub — all public and private names remain importable from here
+for backward compatibility. The actual implementations live in:
+
+  cv_cell_grid_helpers.py    — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
+  cv_cell_grid_build.py      — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
+  cv_cell_grid_legacy.py     — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
+  cv_cell_grid_streaming.py  — streaming variants (build_cell_grid_v2_streaming, ...)
+  cv_cell_grid_merge.py      — row-merging logic (_merge_wrapped_rows, ...)
+  cv_cell_grid_vocab.py      — vocab extraction (_cells_to_vocab_entries, build_word_grid)
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+# --- Helpers ---
+from cv_cell_grid_helpers import (  # noqa: F401
+    _MIN_WORD_CONF,
+    _compute_cell_padding,
+    _ensure_minimum_crop_size,
+    _heal_row_gaps,
+    _is_artifact_row,
+    _select_psm_for_column,
+)
+
+# --- v2 build (current default) ---
+from cv_cell_grid_build import (  # noqa: F401
+    _NARROW_COL_THRESHOLD_PCT,
+    _ocr_cell_crop,
+    build_cell_grid_v2,
+)
+
+# --- Legacy build (DEPRECATED) ---
+from cv_cell_grid_legacy import (  # noqa: F401
+    _ocr_single_cell,
+    build_cell_grid,
+)
+
+# --- Streaming variants ---
+from cv_cell_grid_streaming import (  # noqa: F401
+    build_cell_grid_streaming,
+    build_cell_grid_v2_streaming,
+)
+
+# --- Row merging ---
+from cv_cell_grid_merge import (  # noqa: F401
+    _PHONETIC_ONLY_RE,
+    _is_phonetic_only_text,
+    _merge_continuation_rows,
+    _merge_phonetic_continuation_rows,
+    _merge_wrapped_rows,
+)
+
+# --- Vocab extraction ---
+from cv_cell_grid_vocab import (  # noqa: F401
+    _cells_to_vocab_entries,
+    build_word_grid,
+)
@@ -0,0 +1,136 @@
+"""
+Shared helpers for cell-grid construction (v2 + legacy).
+
+Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
+cv_cell_grid_legacy.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List
+
+import numpy as np
+
+from cv_vocab_types import RowGeometry
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+# Minimum OCR word confidence: grid builders drop every word whose 'conf'
+# value is below this threshold. Shared by the v2 and legacy builders.
+_MIN_WORD_CONF = 30
+
+
+def _compute_cell_padding(col_width: int, img_w: int) -> int:
+    """Return the OCR crop padding (in pixels) for a column of given width.
+
+    The padding shrinks as the column's share of the image width grows:
+    below 5% it is half the column width (at least 20 px), below 10% a
+    quarter of the column width (at least 12 px), below 15% a fixed 8 px,
+    and 4 px for anything wider. An unknown image width (``img_w <= 0``)
+    is treated like a wide column.
+    """
+    if img_w <= 0:
+        return 4
+
+    width_share = col_width / img_w * 100
+    if width_share >= 15:
+        return 4
+    if width_share >= 10:
+        return 8
+    if width_share >= 5:
+        return max(12, col_width // 4)
+    return max(20, col_width // 2)
+
+
+def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
+                               max_scale: int = 3) -> np.ndarray:
+    """Upscale tiny crops so the OCR engine gets enough pixel data.
+
+    If either dimension is below *min_dim*, the crop is bicubic-upscaled by
+    the factor that brings the smaller dimension up to *min_dim*, capped at
+    *max_scale*. Both axes use the same factor, so the aspect ratio is kept.
+
+    Args:
+        crop: Image array; only the first two axes (height, width) are read.
+        min_dim: Minimum desired size in pixels for both dimensions.
+        max_scale: Upper bound for the upscale factor.
+
+    Returns:
+        The input array itself when no upscaling is needed or possible
+        (already large enough, zero-sized, or ``max_scale <= 1``),
+        otherwise a resized copy.
+    """
+    h, w = crop.shape[:2]
+    if h == 0 or w == 0:
+        # Nothing to scale; a zero-sized target would make cv2.resize fail.
+        return crop
+    if h >= min_dim and w >= min_dim:
+        return crop
+    scale = min(max_scale, max(min_dim / h, min_dim / w))
+    if scale <= 1.0:
+        return crop
+    new_w = int(w * scale)
+    new_h = int(h * scale)
+    return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+
+def _select_psm_for_column(col_type: str, col_width: int,
+                            row_height: int) -> int:
+    """Pick the Tesseract page segmentation mode for one cell.
+
+    - 'page_ref' and 'marker' columns -> PSM 8 (single word)
+    - cells at least 100 px wide and 30 px high -> PSM 6 (uniform block)
+    - every other (narrower or shorter) cell -> PSM 7 (single text line)
+    """
+    single_word_types = ('page_ref', 'marker')
+    if col_type in single_word_types:
+        return 8
+
+    fits_text_block = col_width >= 100 and row_height >= 30
+    return 6 if fits_text_block else 7
+
+
+def _is_artifact_row(row: RowGeometry) -> bool:
+    """Tell whether a row holds nothing but scan artifacts.
+
+    A row counts as an artifact when it has no words at all, or when none
+    of its words is longer than one character after stripping whitespace
+    (the pattern this file attributes to scanner shadows and noise).
+    """
+    if row.word_count == 0:
+        return True
+
+    # Length of the longest stripped token; 0 when the word list is empty.
+    longest_token = max(
+        (len(word.get('text', '').strip()) for word in row.words),
+        default=0,
+    )
+    return longest_token <= 1
+
+
+def _heal_row_gaps(
+    rows: List[RowGeometry],
+    top_bound: int,
+    bottom_bound: int,
+) -> None:
+    """Stretch rows in place so they cover the gaps left by removed rows.
+
+    The list is sorted by ``y`` first. For every pair of vertically adjacent
+    rows separated by more than one pixel, both rows are extended to a
+    shared seam at the (floored) midpoint of the gap; pairs that touch or
+    overlap keep their own edges. The first row always starts at
+    *top_bound* and the last row always ends at *bottom_bound*. No row ends
+    up shorter than 5 px.
+
+    Args:
+        rows: Rows to adjust; sorted and mutated in place.
+        top_bound: New top edge of the first row.
+        bottom_bound: New bottom edge of the last row.
+    """
+    if not rows:
+        return
+    rows.sort(key=lambda r: r.y)
+    n = len(rows)
+
+    # Original (top, bottom) extents, captured before any row is changed.
+    extents = [(r.y, r.y + r.height) for r in rows]
+
+    new_tops = [top_bound]
+    new_bottoms = []
+    for (_, upper_bottom), (lower_top, _) in zip(extents, extents[1:]):
+        gap = lower_top - upper_bottom
+        if gap > 1:
+            seam = upper_bottom + gap // 2
+            new_bottoms.append(seam)
+            new_tops.append(seam)
+        else:
+            new_bottoms.append(upper_bottom)
+            new_tops.append(lower_top)
+    new_bottoms.append(bottom_bound)
+
+    for row, top, bottom in zip(rows, new_tops, new_bottoms):
+        row.y = top
+        row.height = max(5, bottom - top)
+
+    logger.debug(
+        f"_heal_row_gaps: {n} rows -> y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
+        f"(bounds: top={top_bound}, bottom={bottom_bound})"
+    )
@@ -0,0 +1,436 @@
+"""
+Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
+
+Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+from cv_ocr_engines import (
+    RAPIDOCR_AVAILABLE,
+    _assign_row_words_to_columns,
+    _clean_cell_text,
+    _words_to_reading_order_text,
+    ocr_region,
+    ocr_region_lighton,
+    ocr_region_rapid,
+    ocr_region_trocr,
+)
+from cv_cell_grid_helpers import (
+    _MIN_WORD_CONF,
+    _compute_cell_padding,
+    _ensure_minimum_crop_size,
+    _heal_row_gaps,
+    _is_artifact_row,
+    _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
+# ---------------------------------------------------------------------------
+
+def _ocr_single_cell(
+    row_idx: int,
+    col_idx: int,
+    row: RowGeometry,
+    col: PageRegion,
+    ocr_img: np.ndarray,
+    img_bgr: Optional[np.ndarray],
+    img_w: int,
+    img_h: int,
+    use_rapid: bool,
+    engine_name: str,
+    lang: str,
+    lang_map: Dict[str, str],
+    preassigned_words: Optional[List[Dict]] = None,
+) -> Dict[str, Any]:
+    """Populate a single cell (column x row intersection) via word lookup.
+
+    The text is taken from ``preassigned_words`` when any of them pass the
+    ``_MIN_WORD_CONF`` filter. If the cell is still empty and its padded crop
+    contains enough dark pixels, OCR fallbacks run in this order, each one
+    only while the text is still empty:
+
+    1. per-cell OCR (upscaled Tesseract crop for narrow columns, otherwise
+       the engine selected by ``engine_name`` / ``use_rapid``),
+    2. Tesseract with PSM 7, skipped when ``use_rapid`` is set,
+    3. for narrow columns with ``img_bgr`` available, ``ocr_region_rapid``
+       over the whole row strip, keeping words that overlap the column.
+
+    Args:
+        row_idx: Row index, used for ``cell_id`` and ``row_index``.
+        col_idx: Column index, used for ``cell_id`` and ``col_index``.
+        row: Row geometry (``y``/``height`` are read).
+        col: Column region (``x``/``width``/``type`` are read).
+        ocr_img: Image that is cropped for the dark-pixel check and passed to
+            ``ocr_region``; may be None, in which case no fallback runs.
+        img_bgr: Image passed to the TrOCR / LightOn / RapidOCR helpers.
+        img_w: Image width in pixels (used for clamping and percentages).
+        img_h: Image height in pixels (used for clamping and percentages).
+        use_rapid: Prefer ``ocr_region_rapid`` for the per-cell fallback.
+        engine_name: Resolved engine name (e.g. "trocr-printed", "lighton").
+        lang: Default OCR language string.
+        lang_map: Column type -> language override; ``lang`` is the default.
+        preassigned_words: Word dicts already assigned to this cell.
+
+    Returns:
+        Dict with ``cell_id``, ``row_index``, ``col_index``, ``col_type``,
+        ``text``, ``confidence``, ``bbox_px``, ``bbox_pct`` and ``ocr_engine``
+        (one of 'word_lookup', 'cell_ocr_fallback', 'cell_ocr_psm7',
+        'row_strip_rapid').
+    """
+    # Display bbox: exact column x row intersection (no padding)
+    disp_x = col.x
+    disp_y = row.y
+    disp_w = col.width
+    disp_h = row.height
+
+    # OCR crop: adaptive padding -- narrow columns get more context
+    pad = _compute_cell_padding(col.width, img_w)
+    # Clamp the padded crop to the image bounds.
+    cell_x = max(0, col.x - pad)
+    cell_y = max(0, row.y - pad)
+    cell_w = min(col.width + 2 * pad, img_w - cell_x)
+    cell_h = min(row.height + 2 * pad, img_h - cell_y)
+    # "Narrow" means the column is less than 15% of the image width.
+    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False
+
+    # Degenerate geometry: return an empty cell without any OCR.
+    # NOTE(review): bbox_pct divides by img_w / img_h here and in the final
+    # return -- assumes both are > 0; confirm against callers.
+    if disp_w <= 0 or disp_h <= 0:
+        return {
+            'cell_id': f"R{row_idx:02d}_C{col_idx}",
+            'row_index': row_idx,
+            'col_index': col_idx,
+            'col_type': col.type,
+            'text': '',
+            'confidence': 0.0,
+            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
+            'bbox_pct': {
+                'x': round(col.x / img_w * 100, 2),
+                'y': round(row.y / img_h * 100, 2),
+                'w': round(col.width / img_w * 100, 2),
+                'h': round(row.height / img_h * 100, 2),
+            },
+            'ocr_engine': 'word_lookup',
+        }
+
+    # --- PRIMARY: Word-lookup from full-page Tesseract ---
+    words = preassigned_words if preassigned_words is not None else []
+    used_engine = 'word_lookup'
+
+    # Filter low-confidence words
+    if words:
+        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
+
+    if words:
+        # Tolerance of at least the row height, floor 15 px, for line grouping.
+        y_tol = max(15, row.height)
+        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+    else:
+        text = ''
+        avg_conf = 0.0
+
+    # --- FALLBACK: Cell-OCR for empty cells ---
+    # Only run OCR when more than 0.5% of the padded crop is darker than 180,
+    # so blank cells do not trigger an OCR call.
+    _run_fallback = False
+    if not text.strip() and cell_w > 0 and cell_h > 0:
+        if ocr_img is not None:
+            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
+            if crop.size > 0:
+                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
+                _run_fallback = dark_ratio > 0.005
+    if _run_fallback:
+        # For narrow columns, upscale the crop before OCR
+        if is_narrow and ocr_img is not None:
+            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
+            _upscaled = _ensure_minimum_crop_size(_crop_slice)
+            # The helper returning a different object signals that a resized
+            # crop was produced; OCR it standalone with a region at the origin.
+            if _upscaled is not _crop_slice:
+                _up_h, _up_w = _upscaled.shape[:2]
+                _tmp_region = PageRegion(
+                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
+                )
+                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
+                cell_lang = lang_map.get(col.type, lang)
+                fallback_words = ocr_region(_upscaled, _tmp_region,
+                                            lang=cell_lang, psm=_cell_psm)
+                # Remap word positions back to original image coordinates
+                _sx = cell_w / max(_up_w, 1)
+                _sy = cell_h / max(_up_h, 1)
+                for _fw in (fallback_words or []):
+                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
+                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
+                    _fw['width'] = int(_fw['width'] * _sx)
+                    _fw['height'] = int(_fw['height'] * _sy)
+            else:
+                # Crop was returned unchanged: OCR the padded region in place.
+                cell_region = PageRegion(
+                    type=col.type, x=cell_x, y=cell_y,
+                    width=cell_w, height=cell_h,
+                )
+                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
+                cell_lang = lang_map.get(col.type, lang)
+                fallback_words = ocr_region(ocr_img, cell_region,
+                                            lang=cell_lang, psm=_cell_psm)
+        else:
+            # Wide columns: dispatch to the configured engine; engines that
+            # need the BGR image fall through to ocr_region when it is missing.
+            cell_region = PageRegion(
+                type=col.type,
+                x=cell_x, y=cell_y,
+                width=cell_w, height=cell_h,
+            )
+            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
+                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
+            elif engine_name == "lighton" and img_bgr is not None:
+                fallback_words = ocr_region_lighton(img_bgr, cell_region)
+            elif use_rapid and img_bgr is not None:
+                fallback_words = ocr_region_rapid(img_bgr, cell_region)
+            else:
+                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
+                cell_lang = lang_map.get(col.type, lang)
+                fallback_words = ocr_region(ocr_img, cell_region,
+                                            lang=cell_lang, psm=_cell_psm)
+
+        if fallback_words:
+            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+        if fallback_words:
+            # Line-grouping tolerance: half the average word height, min 10 px.
+            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
+            fb_y_tol = max(10, int(fb_avg_h * 0.5))
+            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
+            if fb_text.strip():
+                text = fb_text
+                avg_conf = round(
+                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
+                )
+                used_engine = 'cell_ocr_fallback'
+
+        # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
+        # (_run_fallback is always true here; this block is nested inside it.)
+        if not text.strip() and _run_fallback and not use_rapid:
+            _fb_region = PageRegion(
+                type=col.type, x=cell_x, y=cell_y,
+                width=cell_w, height=cell_h,
+            )
+            cell_lang = lang_map.get(col.type, lang)
+            psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
+            if psm7_words:
+                psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+            if psm7_words:
+                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
+                if p7_text.strip():
+                    text = p7_text
+                    avg_conf = round(
+                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
+                    )
+                    used_engine = 'cell_ocr_psm7'
+
+        # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
+        # NOTE(review): ocr_region_rapid is called here regardless of
+        # use_rapid / RAPIDOCR_AVAILABLE -- presumably the helper copes with
+        # RapidOCR being unavailable; confirm.
+        if not text.strip() and is_narrow and img_bgr is not None:
+            row_region = PageRegion(
+                type='_row_strip', x=0, y=row.y,
+                width=img_w, height=row.height,
+            )
+            strip_words = ocr_region_rapid(img_bgr, row_region)
+            if strip_words:
+                col_left = col.x
+                col_right = col.x + col.width
+                col_words = []
+                for sw in strip_words:
+                    sw_left = sw.get('left', 0)
+                    sw_right = sw_left + sw.get('width', 0)
+                    # Keep words whose horizontal overlap with the column
+                    # exceeds 30% of the word's own width.
+                    overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
+                    if overlap > sw.get('width', 1) * 0.3:
+                        col_words.append(sw)
+                if col_words:
+                    col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+                if col_words:
+                    rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
+                    if rs_text.strip():
+                        text = rs_text
+                        avg_conf = round(
+                            sum(w['conf'] for w in col_words) / len(col_words), 1
+                        )
+                        used_engine = 'row_strip_rapid'
+
+    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
+    if text.strip():
+        text = _clean_cell_text(text)
+        # Cleaning may empty the text; the confidence is then reset as well.
+        if not text:
+            avg_conf = 0.0
+
+    # The reported bbox is the unpadded display box, not the OCR crop.
+    return {
+        'cell_id': f"R{row_idx:02d}_C{col_idx}",
+        'row_index': row_idx,
+        'col_index': col_idx,
+        'col_type': col.type,
+        'text': text,
+        'confidence': avg_conf,
+        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
+        'bbox_pct': {
+            'x': round(disp_x / img_w * 100, 2),
+            'y': round(disp_y / img_h * 100, 2),
+            'w': round(disp_w / img_w * 100, 2),
+            'h': round(disp_h / img_h * 100, 2),
+        },
+        'ocr_engine': used_engine,
+    }
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid — legacy grid builder (DEPRECATED)
+# ---------------------------------------------------------------------------
+
+def build_cell_grid(
+    ocr_img: np.ndarray,
+    column_regions: List[PageRegion],
+    row_geometries: List[RowGeometry],
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Generic Cell-Grid: Columns x Rows -> cells with OCR text.
+
+    DEPRECATED: Use build_cell_grid_v2 instead.
+
+    Steps: resolve the OCR engine, keep only content rows that have words and
+    are not artifact rows, drop non-content column types, heal row gaps,
+    OCR every (row, column) cell, re-OCR columns with three or more empty but
+    visibly inked cells as one strip, and finally drop rows whose cells are
+    all empty.
+
+    Args:
+        ocr_img: Image used for dark-pixel checks and passed to ``ocr_region``.
+        column_regions: Detected column regions; margin/header/footer/ignore
+            types are skipped.
+        row_geometries: Detected rows; only ``row_type == 'content'`` is used.
+            The surviving row objects are mutated by ``_heal_row_gaps``.
+        img_w: Image width in pixels.
+        img_h: Image height in pixels.
+        lang: Default OCR language string.
+        ocr_engine: "auto", "rapid", "trocr-printed", "trocr-handwritten",
+            "lighton"; anything else resolves to Tesseract.
+        img_bgr: Image passed to the non-Tesseract engines.
+
+    Returns:
+        ``(cells, columns_meta)``; both are empty lists when no usable rows or
+        columns remain. ``row_index`` values are not renumbered after empty
+        rows are removed.
+    """
+    # Resolve engine choice
+    use_rapid = False
+    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
+        engine_name = ocr_engine
+    elif ocr_engine == "auto":
+        # "auto" picks RapidOCR only when it is available and a BGR image exists.
+        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "rapid" if use_rapid else "tesseract"
+    elif ocr_engine == "rapid":
+        if not RAPIDOCR_AVAILABLE:
+            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
+        else:
+            use_rapid = True
+        engine_name = "rapid" if use_rapid else "tesseract"
+    else:
+        engine_name = "tesseract"
+
+    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
+
+    # Filter to content rows only (skip header/footer)
+    content_rows = [r for r in row_geometries if r.row_type == 'content']
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows found")
+        return [], []
+
+    # Drop "phantom" rows that carry no words at all.
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows with words found")
+        return [], []
+
+    # Column types that never become grid columns.
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
+    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
+    if not relevant_cols:
+        logger.warning("build_cell_grid: no usable columns found")
+        return [], []
+
+    # Drop rows that _is_artifact_row classifies as noise.
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows after artifact filtering")
+        return [], []
+
+    # Close the gaps left by removed rows; the bounds are the vertical extent
+    # of the relevant columns. This sorts content_rows and mutates the rows.
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
+    # Column indices below follow left-to-right order.
+    relevant_cols.sort(key=lambda c: c.x)
+
+    columns_meta = [
+        {
+            'index': col_idx,
+            'type': col.type,
+            'x': col.x,
+            'width': col.width,
+        }
+        for col_idx, col in enumerate(relevant_cols)
+    ]
+
+    # Per-column-type language overrides; other types use `lang`.
+    lang_map = {
+        'column_en': 'eng',
+        'column_de': 'deu',
+        'column_example': 'eng+deu',
+    }
+
+    cells: List[Dict[str, Any]] = []
+
+    for row_idx, row in enumerate(content_rows):
+        # Distribute the row's words over the columns once per row;
+        # the result is indexed by column position.
+        col_words = _assign_row_words_to_columns(row, relevant_cols)
+        for col_idx, col in enumerate(relevant_cols):
+            cell = _ocr_single_cell(
+                row_idx, col_idx, row, col,
+                ocr_img, img_bgr, img_w, img_h,
+                use_rapid, engine_name, lang, lang_map,
+                preassigned_words=col_words[col_idx],
+            )
+            cells.append(cell)
+
+    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
+    # Collect, per column, the indices of empty cells whose display box still
+    # contains more than 0.5% dark (< 180) pixels.
+    empty_by_col: Dict[int, List[int]] = {}
+    for ci, cell in enumerate(cells):
+        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
+            bpx = cell['bbox_px']
+            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
+            if w > 0 and h > 0 and ocr_img is not None:
+                crop = ocr_img[y:y + h, x:x + w]
+                if crop.size > 0:
+                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
+                    if dark_ratio > 0.005:
+                        empty_by_col.setdefault(cell['col_index'], []).append(ci)
+
+    for col_idx, cell_indices in empty_by_col.items():
+        # A strip re-OCR is only attempted for columns with 3+ such cells.
+        if len(cell_indices) < 3:
+            continue
+
+        # Strip spans from the topmost to the bottommost empty cell of the column.
+        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
+        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
+        col_x = cells[cell_indices[0]]['bbox_px']['x']
+        col_w = cells[cell_indices[0]]['bbox_px']['w']
+
+        strip_region = PageRegion(
+            type=relevant_cols[col_idx].type,
+            x=col_x, y=min_y,
+            width=col_w, height=max_y_h - min_y,
+        )
+        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
+
+        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
+            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
+        elif engine_name == "lighton" and img_bgr is not None:
+            strip_words = ocr_region_lighton(img_bgr, strip_region)
+        elif use_rapid and img_bgr is not None:
+            strip_words = ocr_region_rapid(img_bgr, strip_region)
+        else:
+            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
+
+        if not strip_words:
+            continue
+
+        # NOTE(review): hard-coded threshold 30 here, while _ocr_single_cell
+        # filters with _MIN_WORD_CONF -- confirm whether they should match.
+        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
+        if not strip_words:
+            continue
+
+        for ci in cell_indices:
+            cell_y = cells[ci]['bbox_px']['y']
+            cell_h = cells[ci]['bbox_px']['h']
+            cell_mid_y = cell_y + cell_h / 2
+
+            # A word belongs to the cell when its vertical centre lies within
+            # 0.8 cell heights of the cell's centre (a word can therefore
+            # match more than one neighbouring cell).
+            matched_words = [
+                w for w in strip_words
+                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
+            ]
+            if matched_words:
+                matched_words.sort(key=lambda w: w['left'])
+                batch_text = ' '.join(w['text'] for w in matched_words)
+                batch_text = _clean_cell_text(batch_text)
+                if batch_text.strip():
+                    cells[ci]['text'] = batch_text
+                    cells[ci]['confidence'] = round(
+                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
+                    )
+                    cells[ci]['ocr_engine'] = 'batch_column_ocr'
+
+        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
+        if batch_filled > 0:
+            logger.info(
+                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
+                f"empty cells in column {col_idx}"
+            )
+
+    # Remove all-empty rows
+    rows_with_text: set = set()
+    for cell in cells:
+        if cell['text'].strip():
+            rows_with_text.add(cell['row_index'])
+    before_filter = len(cells)
+    cells = [c for c in cells if c['row_index'] in rows_with_text]
+    # Every row contributes one cell per relevant column, so cells / columns = rows.
+    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
+    if empty_rows_removed > 0:
+        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
+
+    # NOTE(review): len(content_rows) below still counts the rows removed
+    # just above, so the logged row count can exceed the rows in `cells`.
+    logger.info(f"build_cell_grid: {len(cells)} cells from "
+                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
+                f"engine={engine_name}")
+
+    return cells, columns_meta
@@ -0,0 +1,235 @@
+"""
+Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+from cv_ocr_engines import _RE_ALPHA
+
+logger = logging.getLogger(__name__)
+
+# Regex intended for lines that hold only phonetic bracket content (no real
+# word before it). As written it accepts: optional leading whitespace, any run
+# of opening characters ([, (, ', "), any run of characters other than "]",
+# then a trailing run of "]", ")" or whitespace up to the end of the string.
+# NOTE(review): this is permissive (plain text without "]" also matches) and
+# the functions below do their own bracket stripping instead of using it --
+# confirm whether it is still referenced anywhere.
+_PHONETIC_ONLY_RE = re.compile(
+    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+    """Check if text consists only of phonetic transcription.
+
+    Phonetic-only patterns:
+      ['mani serva]   ->  True
+      [dance]         ->  True
+      ["a:mand]       ->  True
+      almond ['a:mand] -> False (has real word before bracket)
+      Mandel           -> False
+    """
+    t = text.strip()
+    if not t:
+        return False
+    # Must contain at least one bracket
+    if '[' not in t and ']' not in t:
+        return False
+    # Remove all bracket content and surrounding punctuation/whitespace
+    without_brackets = re.sub(r"\[.*?\]", '', t)
+    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+    # If nothing meaningful remains, it's phonetic-only
+    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+    return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows that contain only phonetic transcription into previous entry.
+
+    In dictionary pages, phonetic transcription sometimes wraps to the next
+    row.  E.g.:
+      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
+      Row 29: EN="['mani serva]"       DE=""
+
+    Row 29 is phonetic-only -> merge into row 28's EN field.
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
+        if merged and _is_phonetic_only_text(en) and not de:
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            # Append phonetic to previous entry's EN
+            if prev_en:
+                prev['english'] = prev_en + ' ' + en
+            else:
+                prev['english'] = en
+            # If there was an example, append to previous too
+            if ex:
+                prev_ex = (prev.get('example') or '').strip()
+                prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+            logger.debug(
+                f"Merged phonetic row {entry.get('row_index')} "
+                f"into previous entry: {prev['english']!r}"
+            )
+            continue
+
+        merged.append(entry)
+
+    return merged
+
+
+def _merge_wrapped_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows where the primary column (EN) is empty -- cell wrap continuation.
+
+    In textbook vocabulary tables, columns are often narrow, so the author
+    wraps text within a cell. OCR treats each physical line as a separate row.
+    The key indicator: if the EN column is empty but DE/example have text,
+    this row is a continuation of the previous row's cells.
+
+    Example (original textbook has ONE row):
+      Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
+      Row 3: EN=""                DE="(bei)"                        EX="part in the concert."
+      -> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
+
+    Also handles the reverse case: DE empty but EN has text (wrap in EN column).
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        if not merged:
+            merged.append(entry)
+            continue
+
+        prev = merged[-1]
+        prev_en = (prev.get('english') or '').strip()
+        prev_de = (prev.get('german') or '').strip()
+        prev_ex = (prev.get('example') or '').strip()
+
+        # Case 1: EN is empty -> continuation of previous row
+        if not en and (de or ex) and prev_en:
+            if de:
+                if prev_de.endswith(','):
+                    sep = ' '
+                elif prev_de.endswith(('-', '(')):
+                    sep = ''
+                else:
+                    sep = ' '
+                prev['german'] = (prev_de + sep + de).strip()
+            if ex:
+                sep = ' ' if prev_ex else ''
+                prev['example'] = (prev_ex + sep + ex).strip()
+            logger.debug(
+                f"Merged wrapped row {entry.get('row_index')} into previous "
+                f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
+            )
+            continue
+
+        # Case 2: DE is empty, EN has text that looks like continuation
+        if en and not de and prev_de:
+            is_paren = en.startswith('(')
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            if (is_paren or starts_lower) and len(en.split()) < 5:
+                sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
+                prev['english'] = (prev_en + sep + en).strip()
+                if ex:
+                    sep2 = ' ' if prev_ex else ''
+                    prev['example'] = (prev_ex + sep2 + ex).strip()
+                logger.debug(
+                    f"Merged wrapped row {entry.get('row_index')} into previous "
+                    f"(empty DE): EN={prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    if len(merged) < len(entries):
+        logger.info(
+            f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
+            f"continuation rows ({len(entries)} -> {len(merged)})"
+        )
+    return merged
+
+
+def _merge_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge multi-line vocabulary entries where text wraps to the next row.
+
+    A row is a continuation of the previous entry when:
+    - EN has text, but DE is empty
+    - EN starts with a lowercase letter (not a new vocab entry)
+    - Previous entry's EN does NOT end with a sentence terminator (.!?)
+    - The continuation text has fewer than 4 words (not an example sentence)
+    - The row was not already merged as phonetic
+
+    Example:
+      Row 5: EN="to put up"       DE="aufstellen"
+      Row 6: EN="with sth."       DE=""
+      -> Merged: EN="to put up with sth."  DE="aufstellen"
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+
+        if merged and en and not de:
+            # Check: not phonetic (already handled)
+            if _is_phonetic_only_text(en):
+                merged.append(entry)
+                continue
+
+            # Check: starts with lowercase
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            # Check: fewer than 4 words (not an example sentence)
+            word_count = len(en.split())
+            is_short = word_count < 4
+
+            # Check: previous entry doesn't end with sentence terminator
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+            if starts_lower and is_short and not prev_ends_sentence:
+                # Merge into previous entry
+                prev['english'] = (prev_en + ' ' + en).strip()
+                # Merge example if present
+                ex = (entry.get('example') or '').strip()
+                if ex:
+                    prev_ex = (prev.get('example') or '').strip()
+                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+                logger.debug(
+                    f"Merged continuation row {entry.get('row_index')} "
+                    f"into previous entry: {prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    return merged
@@ -0,0 +1,217 @@
+"""
+Streaming variants of cell-grid builders (v2 + legacy).
+
+Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
+useful for progress reporting.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+from cv_ocr_engines import (
+    RAPIDOCR_AVAILABLE,
+    _assign_row_words_to_columns,
+)
+from cv_cell_grid_helpers import (
+    _heal_row_gaps,
+    _is_artifact_row,
+)
+from cv_cell_grid_build import _ocr_cell_crop
+from cv_cell_grid_legacy import _ocr_single_cell
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2_streaming
+# ---------------------------------------------------------------------------
+
+def build_cell_grid_v2_streaming(
+    ocr_img: np.ndarray,
+    column_regions: List[PageRegion],
+    row_geometries: List[RowGeometry],
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
+) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
+    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.
+
+    Yields:
+        (cell_dict, columns_meta, total_cells)
+    """
+    use_rapid = False
+    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
+        engine_name = ocr_engine
+    elif ocr_engine == "auto":
+        engine_name = "tesseract"
+    elif ocr_engine == "rapid":
+        if not RAPIDOCR_AVAILABLE:
+            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
+        else:
+            use_rapid = True
+        engine_name = "rapid" if use_rapid else "tesseract"
+    else:
+        engine_name = "tesseract"
+
+    content_rows = [r for r in row_geometries if r.row_type == 'content']
+    if not content_rows:
+        return
+
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    if not content_rows:
+        return
+
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
+                   'margin_bottom', 'margin_left', 'margin_right'}
+    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
+    if not relevant_cols:
+        return
+
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    if not content_rows:
+        return
+
+    # Use header/footer boundaries for heal_row_gaps
+    content_rows.sort(key=lambda r: r.y)
+    header_rows = [r for r in row_geometries if r.row_type == 'header']
+    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
+    if header_rows:
+        top_bound = max(r.y + r.height for r in header_rows)
+    else:
+        top_bound = content_rows[0].y
+    if footer_rows:
+        bottom_bound = min(r.y for r in footer_rows)
+    else:
+        bottom_bound = content_rows[-1].y + content_rows[-1].height
+
+    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
+
+    relevant_cols.sort(key=lambda c: c.x)
+
+    columns_meta = [
+        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
+        for ci, c in enumerate(relevant_cols)
+    ]
+
+    lang_map = {
+        'column_en': 'eng',
+        'column_de': 'deu',
+        'column_example': 'eng+deu',
+    }
+
+    total_cells = len(content_rows) * len(relevant_cols)
+
+    for row_idx, row in enumerate(content_rows):
+        for col_idx, col in enumerate(relevant_cols):
+            cell = _ocr_cell_crop(
+                row_idx, col_idx, row, col,
+                ocr_img, img_bgr, img_w, img_h,
+                engine_name, lang, lang_map,
+            )
+            yield cell, columns_meta, total_cells
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_streaming — legacy streaming variant
+# ---------------------------------------------------------------------------
+
+def build_cell_grid_streaming(
+    ocr_img: np.ndarray,
+    column_regions: List[PageRegion],
+    row_geometries: List[RowGeometry],
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
+) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
+    """Like build_cell_grid(), but yields each cell as it is OCR'd.
+
+    DEPRECATED: Use build_cell_grid_v2_streaming instead.
+
+    Yields:
+        (cell_dict, columns_meta, total_cells) for each cell.
+    """
+    use_rapid = False
+    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
+        engine_name = ocr_engine
+    elif ocr_engine == "auto":
+        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "rapid" if use_rapid else "tesseract"
+    elif ocr_engine == "rapid":
+        if not RAPIDOCR_AVAILABLE:
+            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
+        else:
+            use_rapid = True
+        engine_name = "rapid" if use_rapid else "tesseract"
+    else:
+        engine_name = "tesseract"
+
+    content_rows = [r for r in row_geometries if r.row_type == 'content']
+    if not content_rows:
+        return
+
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        return
+
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
+    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
+    if not relevant_cols:
+        return
+
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
+    if not content_rows:
+        return
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
+    relevant_cols.sort(key=lambda c: c.x)
+
+    columns_meta = [
+        {
+            'index': col_idx,
+            'type': col.type,
+            'x': col.x,
+            'width': col.width,
+        }
+        for col_idx, col in enumerate(relevant_cols)
+    ]
+
+    lang_map = {
+        'column_en': 'eng',
+        'column_de': 'deu',
+        'column_example': 'eng+deu',
+    }
+
+    total_cells = len(content_rows) * len(relevant_cols)
+
+    for row_idx, row in enumerate(content_rows):
+        col_words = _assign_row_words_to_columns(row, relevant_cols)
+        for col_idx, col in enumerate(relevant_cols):
+            cell = _ocr_single_cell(
+                row_idx, col_idx, row, col,
+                ocr_img, img_bgr, img_w, img_h,
+                use_rapid, engine_name, lang, lang_map,
+                preassigned_words=col_words[col_idx],
+            )
+            yield cell, columns_meta, total_cells
@@ -0,0 +1,200 @@
+"""
+Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List
+
+from cv_ocr_engines import (
+    _attach_example_sentences,
+    _fix_phonetic_brackets,
+    _split_comma_entries,
+)
+from cv_cell_grid_legacy import build_cell_grid
+from cv_cell_grid_merge import (
+    _merge_continuation_rows,
+    _merge_phonetic_continuation_rows,
+    _merge_wrapped_rows,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _cells_to_vocab_entries(
+    cells: List[Dict[str, Any]],
+    columns_meta: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Collapse generic grid cells into one vocabulary entry per row.
+
+    Cells are bucketed by ``row_index``. Within a row, each cell's
+    ``col_type`` selects the text field and the per-column bbox field it
+    fills. A row is emitted only if at least one mapped text field ends up
+    non-empty.
+
+    Args:
+        cells: Cell dicts. Every cell must carry ``row_index``, ``col_type``
+            and ``confidence``; cells with a mapped ``col_type`` must also
+            carry ``text`` and ``bbox_pct``. ``ocr_engine`` is optional.
+        columns_meta: Column metadata. Accepted for interface parity; this
+            function does not read it.
+
+    Returns:
+        Entry dicts ordered by ascending ``row_index``.
+    """
+    # col_type -> (text field, per-column bbox field)
+    slots = {
+        'column_en': ('english', 'bbox_en'),
+        'column_de': ('german', 'bbox_de'),
+        'column_example': ('example', 'bbox_ex'),
+        'page_ref': ('source_page', 'bbox_ref'),
+        'column_marker': ('marker', 'bbox_marker'),
+        'column_text': ('text', 'bbox_text'),  # generic single-column (box sub-sessions)
+    }
+
+    # Bucket the cells by the row they belong to.
+    by_row: Dict[int, List[Dict]] = {}
+    for cell in cells:
+        by_row.setdefault(cell['row_index'], []).append(cell)
+
+    result: List[Dict[str, Any]] = []
+    for row_idx in sorted(by_row):
+        members = by_row[row_idx]
+        entry: Dict[str, Any] = {
+            'row_index': row_idx,
+            'english': '',
+            'german': '',
+            'example': '',
+            'text': '',  # generic single-column (box sub-sessions)
+            'source_page': '',
+            'marker': '',
+            'confidence': 0.0,
+            'bbox': None,
+            'bbox_en': None,
+            'bbox_de': None,
+            'bbox_ex': None,
+            'bbox_ref': None,
+            'bbox_marker': None,
+            'bbox_text': None,
+            # A bucket always holds at least one cell, so [0] is safe.
+            'ocr_engine': members[0].get('ocr_engine', ''),
+        }
+
+        scores = []
+        for cell in members:
+            text_key, box_key = slots.get(cell['col_type'], (None, None))
+            if text_key:
+                entry[text_key] = cell['text']
+            if box_key:
+                entry[box_key] = cell['bbox_pct']
+            score = cell['confidence']
+            if score > 0:
+                scores.append(score)
+
+        # Row-level bbox = union of every cell bbox present in the row.
+        row_boxes = [c['bbox_pct'] for c in members if c.get('bbox_pct')]
+        if row_boxes:
+            left = min(b['x'] for b in row_boxes)
+            top = min(b['y'] for b in row_boxes)
+            right = max(b['x'] + b['w'] for b in row_boxes)
+            bottom = max(b['y'] + b['h'] for b in row_boxes)
+            entry['bbox'] = {
+                'x': round(left, 2),
+                'y': round(top, 2),
+                'w': round(right - left, 2),
+                'h': round(bottom - top, 2),
+            }
+
+        # Mean of the positive cell confidences; 0.0 when there are none.
+        if scores:
+            entry['confidence'] = round(sum(scores) / len(scores), 1)
+        else:
+            entry['confidence'] = 0.0
+
+        # Keep the row only if some mapped text field is non-empty.
+        if any(entry.get(text_key) for text_key, _ in slots.values()):
+            result.append(entry)
+
+    return result
+
+
+def build_word_grid(
+    ocr_img,
+    column_regions,
+    row_geometries,
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr=None,
+    pronunciation: str = "british",
+) -> List[Dict[str, Any]]:
+    """Build the cell grid and turn it into post-processed vocabulary entries.
+
+    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
+    - Maps cells to english/german/example entries.
+    - Merges wrapped / phonetic / multi-line continuation rows.
+    - Replaces OCR'd phonetics with dictionary IPA, splits comma-separated
+      word forms and attaches example sentences.
+    - Returns the raw cells unchanged if no vocab columns are detected.
+
+    Character-confusion fixes are not applied here; per the pipeline note
+    below they run in llm_review_entries_streaming.
+
+    Args:
+        ocr_img: Binarized full-page image (for Tesseract).
+        column_regions: Classified columns from Step 3.
+        row_geometries: Rows from Step 4.
+        img_w, img_h: Image dimensions.
+        lang: Default Tesseract language.
+        ocr_engine: 'tesseract', 'rapid', or 'auto'.
+        img_bgr: BGR color image (required for RapidOCR).
+        pronunciation: 'british' or 'american' for IPA lookup.
+
+    Returns:
+        List of entry dicts with english/german/example text and bbox info
+        (percent). An empty list if the grid has no cells; the raw cell
+        dicts if neither 'column_en' nor 'column_de' is present.
+    """
+    cells, columns_meta = build_cell_grid(
+        ocr_img, column_regions, row_geometries, img_w, img_h,
+        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
+    )
+
+    if not cells:
+        return []
+
+    # Check if vocab layout is present
+    col_types = {c['type'] for c in columns_meta}
+    if not (col_types & {'column_en', 'column_de'}):
+        logger.info("build_word_grid: no vocab columns -- returning raw cells")
+        return cells
+
+    # Vocab mapping: cells -> entries
+    entries = _cells_to_vocab_entries(cells, columns_meta)
+
+    # --- Post-processing pipeline (deterministic, no LLM) ---
+    n_raw = len(entries)
+
+    # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
+    entries = _merge_wrapped_rows(entries)
+
+    # 0a. Merge phonetic-only continuation rows into previous entry
+    entries = _merge_phonetic_continuation_rows(entries)
+
+    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
+    entries = _merge_continuation_rows(entries)
+
+    # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
+    #    llm_review_entries_streaming so changes are visible to the user in Step 6.
+
+    # 2. Replace OCR'd phonetics with dictionary IPA
+    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
+
+    # 3. Split comma-separated word forms (break, broke, broken -> 3 entries)
+    entries = _split_comma_entries(entries)
+
+    # 4. Attach example sentences (rows without DE -> examples for preceding entry)
+    entries = _attach_example_sentences(entries)
+
+    # `cells` is guaranteed non-empty here (early return above).
+    engine_name = cells[0].get('ocr_engine', 'unknown')
+    # Report the raw and final counts once each; the previous message
+    # printed the final count twice.
+    logger.info(
+        "build_word_grid: %d raw entries -> %d after post-processing (engine=%s)",
+        n_raw, len(entries), engine_name,
+    )
+
+    return entries
@@ -0,0 +1,471 @@
+"""
+Embedded box detection and page zone splitting for the CV vocabulary pipeline.
+
+Detects boxes (grammar tips, exercises, etc.) that span the page width and
+interrupt the normal column layout. Splits the page into vertical zones so
+that column detection can run independently per zone.
+
+Two-stage algorithm (both run, results merged):
+  1. Morphological line detection — finds bordered boxes via horizontal lines.
+  2. Background shading detection — finds shaded/colored boxes via median-blur
+     background analysis.  Works for colored (blue, green) and grayscale
+     (gray shading on B/W scans) boxes.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from cv_vocab_types import DetectedBox, PageZone
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "detect_boxes",
+    "split_page_into_zones",
+]
+
+
+# ---------------------------------------------------------------------------
+# Stage 1: Morphological line detection
+# ---------------------------------------------------------------------------
+
+def _detect_boxes_by_lines(
+    gray: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+) -> List[DetectedBox]:
+    """Find boxes defined by pairs of long horizontal border lines.
+
+    The page is binarized and opened with a wide, 1-pixel-high kernel so that
+    only long horizontal runs survive. Rows with enough surviving pixels are
+    grouped into line segments, and segments are paired greedily from top to
+    bottom: each unused segment is matched with the first later unused
+    segment that yields a box height inside the allowed range.
+
+    Args:
+        gray: Grayscale image (full page).
+        content_x, content_w: Horizontal content bounds.
+        content_y, content_h: Vertical content bounds. Only content_h is
+            read (for the maximum box height); content_y is unused here.
+
+    Returns:
+        List of DetectedBox for each detected bordered box. Every box spans
+        the full content width and is given confidence 0.8.
+    """
+    # NOTE(review): `w` is unpacked but never used below.
+    h, w = gray.shape[:2]
+
+    # Binarize: dark pixels → white on black background
+    _, binary = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
+
+    # Horizontal morphology kernel — at least 50% of content width
+    # (and never narrower than 50 px)
+    kernel_w = max(50, content_w // 2)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, 1))
+    lines_img = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
+
+    # Horizontal projection: count line pixels per row, restricted to the
+    # content columns
+    h_proj = np.sum(lines_img[:, content_x:content_x + content_w] > 0, axis=1)
+    line_threshold = content_w * 0.30
+
+    # Group consecutive rows with enough line pixels into line segments.
+    # All image rows are scanned, not only the content rows.
+    line_segments: List[Tuple[int, int]] = []  # (y_start, y_end), y_end exclusive
+    seg_start: Optional[int] = None
+    for y in range(h):
+        if h_proj[y] >= line_threshold:
+            if seg_start is None:
+                seg_start = y
+        else:
+            if seg_start is not None:
+                line_segments.append((seg_start, y))
+                seg_start = None
+    # Close a segment that runs to the bottom edge of the image
+    if seg_start is not None:
+        line_segments.append((seg_start, h))
+
+    # A box needs a top and a bottom line
+    if len(line_segments) < 2:
+        return []
+
+    # Pair lines into boxes: top-line + bottom-line
+    # Minimum box height: 30px.  Maximum: 70% of content height.
+    min_box_h = 30
+    max_box_h = int(content_h * 0.70)
+
+    boxes: List[DetectedBox] = []
+    used = set()  # indices of segments already consumed as a top or bottom line
+    for i, (top_start, top_end) in enumerate(line_segments):
+        if i in used:
+            continue
+        for j in range(i + 1, len(line_segments)):
+            if j in used:
+                continue
+            bot_start, bot_end = line_segments[j]
+            box_y = top_start
+            # Height is measured from the top of the upper line to the
+            # bottom of the lower line, i.e. including both borders
+            box_h = bot_end - top_start
+            if box_h < min_box_h or box_h > max_box_h:
+                continue
+
+            # Estimate border thickness from line segment heights
+            border_top = top_end - top_start
+            border_bot = bot_end - bot_start
+
+            box = DetectedBox(
+                x=content_x,
+                y=box_y,
+                width=content_w,
+                height=box_h,
+                confidence=0.8,
+                border_thickness=max(border_top, border_bot),
+            )
+            boxes.append(box)
+            used.add(i)
+            used.add(j)
+            break  # move to next top-line candidate
+
+    return boxes
+
+
+# ---------------------------------------------------------------------------
+# Stage 2: Background shading detection (color + grayscale)
+# ---------------------------------------------------------------------------
+
+def _detect_boxes_by_shading(
+    img_bgr: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+) -> List[DetectedBox]:
+    """Find boxes with shaded/colored background (no visible border lines).
+
+    Uses heavy median blur to remove text and reveal the underlying background.
+    Then detects rectangular regions where the background differs from white.
+    Works for both colored boxes (blue, green) and grayscale shading (gray on
+    B/W scans).
+
+    Args:
+        img_bgr: BGR color image (full page).
+        content_x, content_w: Horizontal content bounds. Only content_w is
+            read (for the size thresholds); content_x is unused here.
+        content_y, content_h: Vertical content bounds. Only content_h is
+            read (for the maximum box height); content_y is unused here.
+
+    Returns:
+        List of DetectedBox for each detected shaded box, with confidence
+        0.7 for colored regions and 0.6 for gray-shaded ones, and
+        border_thickness 0.
+    """
+    h, w = img_bgr.shape[:2]
+
+    # --- Heavy median blur removes text strokes, keeps background ---
+    blur_size = 31  # large kernel to wipe out text
+    blurred = cv2.medianBlur(img_bgr, blur_size)
+    blur_gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
+    blur_hsv = cv2.cvtColor(blurred, cv2.COLOR_BGR2HSV)
+
+    # Estimate page background from top-left / top-right corners
+    corner_size = max(20, min(h // 10, w // 10))
+    corners = np.concatenate([
+        blur_gray[:corner_size, :corner_size].ravel(),
+        blur_gray[:corner_size, -corner_size:].ravel(),
+    ])
+    page_bg = float(np.median(corners))
+
+    # Two masks: grayscale shading + color saturation
+    # Grayscale: regions noticeably darker than the page background
+    # (threshold is never allowed to drop below 150)
+    shade_thresh = max(page_bg - 30, 150)
+    gray_mask = (blur_gray < shade_thresh).astype(np.uint8) * 255
+
+    # Color: regions with noticeable saturation (blue/green/etc. boxes)
+    sat_mask = (blur_hsv[:, :, 1] > 20).astype(np.uint8) * 255
+
+    combined = cv2.bitwise_or(gray_mask, sat_mask)
+
+    # Morphological cleanup: close gaps, remove small noise
+    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10))
+    combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, kernel_close)
+    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
+    combined = cv2.morphologyEx(combined, cv2.MORPH_OPEN, kernel_open)
+
+    # Contours are searched over the whole image, not only the content area
+    contours, _ = cv2.findContours(combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    # Size thresholds: smaller boxes allowed (e.g. "German leihen" ~30% width)
+    min_area = content_w * 30  # at least 30px tall at full width
+    min_box_h = 25
+    max_box_h = int(content_h * 0.70)
+    min_width_ratio = 0.25  # boxes can be ~25% of content width
+
+    boxes: List[DetectedBox] = []
+    for cnt in contours:
+        area = cv2.contourArea(cnt)
+        if area < min_area:
+            continue
+
+        bx, by, bw, bh = cv2.boundingRect(cnt)
+
+        # Width filter
+        if bw < content_w * min_width_ratio:
+            continue
+
+        # Height filter
+        if bh < min_box_h or bh > max_box_h:
+            continue
+
+        # Rectangularity check: contour area must fill at least 50% of its
+        # bounding rectangle
+        rect_area = bw * bh
+        if rect_area > 0 and area / rect_area < 0.5:
+            continue
+
+        # Verify that the background inside this region is actually shaded
+        roi_gray = blur_gray[by:by + bh, bx:bx + bw]
+        roi_hsv = blur_hsv[by:by + bh, bx:bx + bw]
+        if roi_gray.size == 0:
+            continue
+
+        median_val = float(np.median(roi_gray))
+        median_sat = float(np.median(roi_hsv[:, :, 1]))
+
+        # Must be noticeably different from page background
+        is_shaded = median_val < (page_bg - 15)
+        is_colored = median_sat > 15
+
+        if not is_shaded and not is_colored:
+            continue
+
+        # Colored backgrounds are trusted slightly more than gray shading
+        conf = 0.7 if is_colored else 0.6
+
+        boxes.append(DetectedBox(
+            x=bx,
+            y=by,
+            width=bw,
+            height=bh,
+            confidence=conf,
+            border_thickness=0,
+        ))
+
+    return boxes
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+def _validate_box(
+    box: DetectedBox,
+    gray: np.ndarray,
+    content_w: int,
+    content_h: int,
+    median_row_gap: int,
+) -> bool:
+    """Decide whether a candidate box is a real embedded box.
+
+    A candidate is rejected when it is narrower than 25% of the content
+    width, shorter than 25 px or taller than 70% of the content height,
+    shorter than three median row gaps (when that gap is known), or when
+    fewer than 0.2% of its pixels are darker than 128.
+
+    Args:
+        box: Candidate box (x, y, width, height are read).
+        gray: Grayscale page image used for the ink-density check.
+        content_w, content_h: Content area dimensions.
+        median_row_gap: Median row gap; values <= 0 disable that check.
+
+    Returns:
+        True if the candidate passes every check, False otherwise.
+    """
+    # Width: at least a quarter of the content width
+    if box.width < content_w * 0.25:
+        return False
+
+    # Height: inside [25 px, 70% of content height]
+    if not (25 <= box.height <= content_h * 0.70):
+        return False
+
+    # Anything under three row gaps tall is treated as a table-row separator
+    if median_row_gap > 0 and box.height < median_row_gap * 3:
+        return False
+
+    # Clip the box to the image before sampling pixels
+    img_h, img_w = gray.shape[:2]
+    top, bottom = max(0, box.y), min(img_h, box.y + box.height)
+    left, right = max(0, box.x), min(img_w, box.x + box.width)
+    patch = gray[top:bottom, left:right]
+    if patch.size == 0:
+        return False
+
+    # Ink density: a nearly blank region is not a content box
+    dark_pixels = np.count_nonzero(patch < 128)
+    if dark_pixels / patch.size < 0.002:
+        return False
+
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Public API: detect_boxes
+# ---------------------------------------------------------------------------
+
+def _merge_overlapping_boxes(boxes: List[DetectedBox]) -> List[DetectedBox]:
+    """Deduplicate boxes that overlap heavily.
+
+    Two boxes are treated as duplicates when their intersection covers more
+    than 50% of the smaller box's area. Of such a pair, the box with the
+    strictly higher confidence survives; on a tie the larger box survives.
+
+    Args:
+        boxes: Candidate boxes (x, y, width, height, confidence are read).
+
+    Returns:
+        The surviving boxes ordered by area, largest first. A list with zero
+        or one element is returned as-is.
+    """
+    if len(boxes) <= 1:
+        return boxes
+
+    # Largest first, so the outer box of a pair is never the smaller one
+    ranked = sorted(boxes, key=lambda b: b.width * b.height, reverse=True)
+    dropped = set()
+
+    for i, big in enumerate(ranked):
+        if i in dropped:
+            continue
+        for j in range(i + 1, len(ranked)):
+            if j in dropped:
+                continue
+            small = ranked[j]
+
+            # Extent of the intersection rectangle along each axis
+            overlap_w = min(big.x + big.width, small.x + small.width) - max(big.x, small.x)
+            overlap_h = min(big.y + big.height, small.y + small.height) - max(big.y, small.y)
+            if overlap_w <= 0 or overlap_h <= 0:
+                continue  # disjoint
+
+            smaller_area = min(big.width * big.height, small.width * small.height)
+            if not smaller_area > 0:
+                continue
+            if overlap_w * overlap_h / smaller_area <= 0.50:
+                continue
+
+            # Heavy overlap: the weaker box goes; a tie keeps the larger one
+            if small.confidence > big.confidence:
+                dropped.add(i)
+                break
+            dropped.add(j)
+
+    return [b for idx, b in enumerate(ranked) if idx not in dropped]
+
+
+def detect_boxes(
+    img_bgr: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+    median_row_gap: int = 0,
+) -> List[DetectedBox]:
+    """Detect embedded boxes on a page image.
+
+    Both detectors (border lines and background shading) always run. Their
+    candidates are pooled, deduplicated, validated and returned top to
+    bottom.
+
+    Args:
+        img_bgr: BGR color image (full page or cropped).
+        content_x, content_w: Horizontal content bounds.
+        content_y, content_h: Vertical content bounds.
+        median_row_gap: Median row gap height (for filtering out table separators).
+
+    Returns:
+        List of validated DetectedBox instances, sorted by y position.
+    """
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    bounds = (content_x, content_w, content_y, content_h)
+
+    # Stage 1 works on the grayscale page, stage 2 on the color page
+    line_boxes = _detect_boxes_by_lines(gray, *bounds)
+    shade_boxes = _detect_boxes_by_shading(img_bgr, *bounds)
+
+    logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
+                 len(line_boxes), len(shade_boxes))
+
+    # Pool the candidates, drop duplicates, then keep the plausible ones
+    merged = _merge_overlapping_boxes(line_boxes + shade_boxes)
+    validated = sorted(
+        (b for b in merged if _validate_box(b, gray, content_w, content_h, median_row_gap)),
+        key=lambda b: b.y,
+    )
+
+    if not validated:
+        logger.debug("BoxDetect: no boxes detected")
+        return validated
+
+    logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
+                 len(validated), len(line_boxes), len(shade_boxes), len(merged))
+    return validated
+
+
+# ---------------------------------------------------------------------------
+# Zone Splitting
+# ---------------------------------------------------------------------------
+
+def split_page_into_zones(
+    content_x: int,
+    content_y: int,
+    content_w: int,
+    content_h: int,
+    boxes: List[DetectedBox],
+    min_zone_height: int = 40,
+) -> List[PageZone]:
+    """Cut the content area into alternating content and box zones.
+
+    Each box becomes a 'box' zone. The space above the first box, between
+    boxes and below the last box becomes a 'content' zone, provided it is at
+    least ``min_zone_height`` tall. Without boxes, the whole content area is
+    one 'content' zone.
+
+    Args:
+        content_x, content_y, content_w, content_h: Content area bounds.
+        boxes: Detected boxes, sorted by y position.
+        min_zone_height: Minimum height for a content zone to be kept.
+
+    Returns:
+        List of PageZone, ordered top to bottom, indexed from 0.
+    """
+    def content_zone(idx: int, top: int, height: int) -> PageZone:
+        # Content zones always span the full content width.
+        return PageZone(
+            index=idx,
+            zone_type='content',
+            y=top,
+            height=height,
+            x=content_x,
+            width=content_w,
+        )
+
+    if not boxes:
+        return [content_zone(0, content_y, content_h)]
+
+    zones: List[PageZone] = []
+    top = content_y  # first row not yet assigned to a zone
+
+    for box in boxes:
+        # Space between the previous zone and this box
+        gap = box.y - top
+        if gap >= min_zone_height:
+            zones.append(content_zone(len(zones), top, gap))
+
+        # The box itself keeps its own geometry
+        zones.append(PageZone(
+            index=len(zones),
+            zone_type='box',
+            y=box.y,
+            height=box.height,
+            x=box.x,
+            width=box.width,
+            box=box,
+        ))
+        top = box.y + box.height
+
+    # Whatever is left below the last box
+    tail = (content_y + content_h) - top
+    if tail >= min_zone_height:
+        zones.append(content_zone(len(zones), top, tail))
+
+    logger.info(f"ZoneSplit: {len(zones)} zones from {len(boxes)} box(es): "
+                f"{[z.zone_type for z in zones]}")
+
+    return zones
@@ -0,0 +1,339 @@
+"""
+Box layout classifier — detects internal layout type of embedded boxes.
+
+Classifies each box as: flowing | columnar | bullet_list | header_only
+and provides layout-appropriate grid building.
+
+Used by the Box-Grid-Review step to rebuild box zones with correct structure.
+"""
+
+import logging
+import re
+import statistics
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Bullet / list-item patterns at the start of a line.
+# Every alternative requires a whitespace character right after the marker,
+# so the pattern only matches text in which the marker is followed by more
+# content (e.g. a whole line such as "1. foo"), not a bare marker token.
+_BULLET_RE = re.compile(
+    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
+    r'|^\d{1,2}[.)]\s'     # numbered: "1) " or "1. "
+    r'|^[a-z][.)]\s'       # lettered: "a) " or "a. "
+)
+
+
+def classify_box_layout(
+    words: List[Dict],
+    box_w: int,
+    box_h: int,
+) -> str:
+    """Classify the internal layout of a detected box.
+
+    Checks run in this order: no words / few words / single line ->
+    'header_only'; enough bullet-like lines -> 'bullet_list'; distinct
+    x-clusters -> 'columnar'; otherwise 'flowing'.
+
+    Args:
+        words: OCR word dicts within the box (with top, left, width, height, text)
+        box_w: Box width in pixels
+        box_h: Box height in pixels (currently not read)
+
+    Returns:
+        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
+    """
+    if not words:
+        return "header_only"
+
+    # Group words into lines by y-proximity
+    lines = _group_into_lines(words)
+
+    # Header only: very few words or single line
+    total_words = sum(len(line) for line in lines)
+    if total_words <= 5 or len(lines) <= 1:
+        return "header_only"
+
+    # Bullet list: count lines that start with a bullet / numbering marker.
+    # _BULLET_RE needs whitespace after the marker, so it is matched against
+    # the joined line text. Matching only the first OCR word (which normally
+    # carries no whitespace) left the numbered and lettered alternatives
+    # effectively dead.
+    bare_bullets = ("-", "–", "—", "•", "·", "▪", "▸")
+    bullet_count = 0
+    for line in lines:
+        if not line:
+            continue
+        first_text = line[0].get("text", "")
+        line_text = " ".join(w.get("text", "") for w in line)
+        # The second test covers a first word that IS a bare bullet char
+        if _BULLET_RE.match(line_text) or first_text.strip() in bare_bullets:
+            bullet_count += 1
+    # At least 40% of the lines and at least two of them
+    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
+        return "bullet_list"
+
+    # Columnar: check for multiple distinct x-clusters
+    if len(lines) >= 3 and _has_column_structure(words, box_w):
+        return "columnar"
+
+    # Default: flowing text
+    return "flowing"
+
+
+def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
+    """Bucket words into text lines by vertical proximity.
+
+    Words are visited in (top, left) order. A word joins the current line
+    when its ``top`` lies within the tolerance of the ``top`` of the word
+    that opened that line; otherwise it opens a new line. The tolerance is
+    half the median positive word height (20 is assumed when no word has a
+    positive height), and never less than 5.
+
+    Returns:
+        Lines from top to bottom, each sorted by ``left``.
+    """
+    if not words:
+        return []
+
+    ordered = sorted(words, key=lambda w: (w["top"], w["left"]))
+    positive_heights = [w["height"] for w in ordered if w.get("height", 0) > 0]
+    typical_h = statistics.median(positive_heights) if positive_heights else 20
+    tolerance = max(typical_h * 0.5, 5)
+
+    buckets: List[List[Dict]] = []
+    anchor_top = None  # top of the word that opened the current line
+    for word in ordered:
+        if buckets and abs(word["top"] - anchor_top) <= tolerance:
+            buckets[-1].append(word)
+        else:
+            buckets.append([word])
+            anchor_top = word["top"]
+
+    return [sorted(bucket, key=lambda ww: ww["left"]) for bucket in buckets]
+
+
+def _has_column_structure(words: List[Dict], box_w: int) -> bool:
+    """Check if words have multiple distinct left-edge clusters (columns).
+
+    The left edges of all non-first words of every line are pooled and
+    sorted. The box counts as columnar when two neighbouring edges in that
+    sorted list are more than 15% of the box width apart.
+
+    Args:
+        words: OCR word dicts (``top``, ``left`` and ``height`` are read via
+            the line grouping).
+        box_w: Box width in pixels; non-positive widths yield False.
+
+    Returns:
+        True if at least one such large gap exists. False otherwise, and
+        also when there are fewer than 3 lines or fewer than 4 pooled edges.
+    """
+    if box_w <= 0:
+        return False
+
+    lines = _group_into_lines(words)
+    if len(lines) < 3:
+        return False
+
+    # Left edges of non-first words only: the first word of each line often
+    # aligns regardless of columns.
+    left_edges = sorted(w["left"] for line in lines for w in line[1:])
+    if len(left_edges) < 4:
+        return False
+
+    # A column gap is typically > 15% of box width
+    column_gap_threshold = box_w * 0.15
+    return any(
+        right - left > column_gap_threshold
+        for left, right in zip(left_edges, left_edges[1:])
+    )
+
+
+def build_box_zone_grid(
+    zone_words: List[Dict],
+    box_x: int,
+    box_y: int,
+    box_w: int,
+    box_h: int,
+    zone_index: int,
+    img_w: int,
+    img_h: int,
+    layout_type: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build a grid for a box zone with layout-aware processing.
+
+    If layout_type is None (or empty), auto-detects it.
+    For 'flowing' and 'bullet_list', forces single-column layout.
+    For 'columnar', uses the standard multi-column detection.
+    For 'header_only', creates a single cell.
+
+    Args:
+        zone_words: OCR word dicts inside the box (top, left, height, text).
+        box_x, box_y, box_w, box_h: Box geometry in pixels.
+        zone_index: Index of the zone; used in cell ids and log messages.
+        img_w, img_h: Page image size, used for the percentage coordinates
+            (a zero size yields 0 instead of dividing).
+        layout_type: Optional layout override.
+
+    Returns:
+        Dict with columns, rows, cells and header_rows (the same format as
+        _build_zone_grid) plus 'box_layout_type' and 'box_grid_reviewed'
+        (always False here).
+    """
+    # Function-scope import. NOTE(review): _cluster_rows is imported but not
+    # used in this function.
+    from grid_editor_helpers import _build_zone_grid, _cluster_rows
+
+    # No words: return an empty grid, labelled with the requested layout
+    if not zone_words:
+        return {
+            "columns": [],
+            "rows": [],
+            "cells": [],
+            "header_rows": [],
+            "box_layout_type": layout_type or "header_only",
+            "box_grid_reviewed": False,
+        }
+
+    # Auto-detect layout if not specified
+    if not layout_type:
+        layout_type = classify_box_layout(zone_words, box_w, box_h)
+
+    logger.info(
+        "Box zone %d: layout_type=%s, %d words, %dx%d",
+        zone_index, layout_type, len(zone_words), box_w, box_h,
+    )
+
+    if layout_type == "header_only":
+        # Single cell with all text concatenated (words in top/left order);
+        # the one row covers the whole box and is flagged as header
+        all_text = " ".join(
+            w.get("text", "") for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
+        ).strip()
+        return {
+            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
+                         "x_min_px": box_x, "x_max_px": box_x + box_w,
+                         "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
+                         "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
+                         "bold": False}],
+            "rows": [{"index": 0, "row_index": 0,
+                       "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
+                       "y_min_px": box_y, "y_max_px": box_y + box_h,
+                       "y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
+                       "y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
+                       "is_header": True}],
+            "cells": [{
+                "cell_id": f"Z{zone_index}_R0C0",
+                "row_index": 0,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": all_text,
+                "word_boxes": zone_words,
+            }],
+            "header_rows": [0],
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    if layout_type in ("flowing", "bullet_list"):
+        # Force single column — each logical item becomes one row with one
+        # cell. Items are detected from indentation: continuation lines are
+        # merged into the item they belong to.
+        lines = _group_into_lines(zone_words)
+        column = {
+            "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
+            "x_min_px": box_x, "x_max_px": box_x + box_w,
+            "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
+            "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
+            "bold": False,
+        }
+
+        # --- Detect indentation levels ---
+        # Indent = distance of the line's leftmost word from the box's left edge
+        line_indents = []
+        for line_words in lines:
+            if not line_words:
+                line_indents.append(0)
+                continue
+            min_left = min(w["left"] for w in line_words)
+            line_indents.append(min_left - box_x)
+
+        # Find the minimum indent (= bullet/main level); negative indents
+        # (words left of the box edge) are ignored for this
+        valid_indents = [ind for ind in line_indents if ind >= 0]
+        min_indent = min(valid_indents) if valid_indents else 0
+
+        # Indentation threshold: lines indented > 15px more than minimum
+        # are continuation lines belonging to the previous bullet
+        INDENT_THRESHOLD = 15
+
+        # --- Group lines into logical items (bullet + continuations) ---
+        # Each item is a list of line indices
+        items: List[List[int]] = []
+        for li, indent in enumerate(line_indents):
+            # The very first line can never be a continuation
+            is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
+            if is_continuation:
+                items[-1].append(li)
+            else:
+                items.append([li])
+
+        logger.info(
+            "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
+            zone_index, len(lines), len(items),
+            [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
+        )
+
+        # --- Build rows and cells from grouped items ---
+        rows = []
+        cells = []
+        header_rows = []
+
+        for row_idx, item_line_indices in enumerate(items):
+            # Collect all words from all lines in this item
+            item_words = []
+            item_texts = []
+            for li in item_line_indices:
+                if li < len(lines):
+                    item_words.extend(lines[li])
+                    line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
+                    if line_text:
+                        item_texts.append(line_text)
+
+            # An item without words produces no row; its row_idx is skipped
+            if not item_words:
+                continue
+
+            # Vertical extent of the item = extent of all its words
+            y_min = min(w["top"] for w in item_words)
+            y_max = max(w["top"] + w["height"] for w in item_words)
+            y_center = (y_min + y_max) / 2
+
+            row = {
+                "index": row_idx,
+                "row_index": row_idx,
+                "y_min": y_min,
+                "y_max": y_max,
+                "y_center": y_center,
+                "y_min_px": y_min,
+                "y_max_px": y_max,
+                "y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
+                "y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
+                "is_header": False,
+            }
+            rows.append(row)
+
+            # Join multi-line text with newline for display
+            merged_text = "\n".join(item_texts)
+
+            # Add a bullet marker to multi-line items whose first line has
+            # no marker of its own. The first row (row_idx 0) never gets one.
+            first_text = item_texts[0] if item_texts else ""
+            is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
+            if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
+                # Multi-line item without bullet — add one
+                merged_text = "• " + merged_text
+
+            cell = {
+                "cell_id": f"Z{zone_index}_R{row_idx}C0",
+                "row_index": row_idx,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": merged_text,
+                "word_boxes": item_words,
+            }
+            cells.append(cell)
+
+        # Detect header: with two or more items, the first item is a header
+        # if its text is shorter than 40 characters, all upper-case, or ends
+        # with a colon. Continuation lines of the first item are included in
+        # that text; their presence is not checked separately.
+        if len(items) >= 2:
+            first_item_texts = []
+            for li in items[0]:
+                if li < len(lines):
+                    first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
+            first_text = " ".join(first_item_texts)
+            if (len(first_text) < 40
+                    or first_text.isupper()
+                    or first_text.rstrip().endswith(':')):
+                header_rows = [0]
+
+        return {
+            "columns": [column],
+            "rows": rows,
+            "cells": cells,
+            "header_rows": header_rows,
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    # Columnar (and any other layout_type value): use standard grid builder
+    # with independent column detection
+    result = _build_zone_grid(
+        zone_words, box_x, box_y, box_w, box_h,
+        zone_index, img_w, img_h,
+        global_columns=None,  # detect columns independently
+    )
+
+    # Colspan detection is now handled generically by _detect_colspan_cells
+    # in grid_editor_helpers.py (called inside _build_zone_grid).
+
+    result["box_layout_type"] = layout_type
+    result["box_grid_reviewed"] = False
+    return result
@@ -0,0 +1,312 @@
+"""
+Color detection for OCR word boxes.
+
+Detects the text color of existing OCR words and recovers colored text
+regions (e.g. red markers, blue headings) that standard OCR may have missed.
+
+Standard OCR (Tesseract, PaddleOCR) binarises images before processing,
+destroying all color information.  This module adds it back by sampling
+HSV pixel values at word-box positions and finding colored regions that
+no word-box covers.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# HSV color ranges  (OpenCV:  H 0-180,  S 0-255,  V 0-255)
+# ---------------------------------------------------------------------------
+
+# Per-color HSV bounds as (lower, upper) pairs.  recover_colored_text() passes
+# each pair to cv2.inRange and ORs the results into one mask per color.
+# Every pair uses a lower bound of S=70 / V=50.  Red has two pairs because it
+# occupies both ends of the hue axis.
+# NOTE(review): no pair covers hues 86-99, although _hue_to_color_name()
+# labels that span "blue"; neighbouring colors also share their boundary hue
+# (10, 25, 35, 130, 170) — confirm both are intended.
+_COLOR_RANGES: Dict[str, List[Tuple[np.ndarray, np.ndarray]]] = {
+    "red": [
+        (np.array([0, 70, 50]), np.array([10, 255, 255])),
+        (np.array([170, 70, 50]), np.array([180, 255, 255])),
+    ],
+    "orange": [
+        (np.array([10, 70, 50]), np.array([25, 255, 255])),
+    ],
+    "yellow": [
+        (np.array([25, 70, 50]), np.array([35, 255, 255])),
+    ],
+    "green": [
+        (np.array([35, 70, 50]), np.array([85, 255, 255])),
+    ],
+    "blue": [
+        (np.array([100, 70, 50]), np.array([130, 255, 255])),
+    ],
+    "purple": [
+        (np.array([130, 70, 50]), np.array([170, 255, 255])),
+    ],
+}
+
+# Hex value written into the ``color`` field for each color name.
+# "black" is the fallback for every word that is not classified as colored;
+# "gray" has no entry in _COLOR_RANGES and is not referenced by the
+# functions in this module.
+_COLOR_HEX: Dict[str, str] = {
+    "black": "#000000",
+    "gray": "#6b7280",
+    "red": "#dc2626",
+    "orange": "#ea580c",
+    "yellow": "#ca8a04",
+    "green": "#16a34a",
+    "blue": "#2563eb",
+    "purple": "#9333ea",
+}
+
+
+def _hue_to_color_name(hue: float) -> str:
+    """Translate an OpenCV hue value into one of six color names.
+
+    Red occupies both ends of the hue axis (below 10 and above 170); the
+    other names are found by their exclusive upper hue bound, and anything
+    left over is purple.
+    """
+    if hue < 10 or hue > 170:
+        return "red"
+
+    upper_bounds = ((25, "orange"), (35, "yellow"), (85, "green"), (130, "blue"))
+    for limit, label in upper_bounds:
+        if hue < limit:
+            return label
+    return "purple"
+
+
+# ---------------------------------------------------------------------------
+# 1.  Color annotation for existing word boxes
+# ---------------------------------------------------------------------------
+
+def detect_word_colors(
+    img_bgr: np.ndarray,
+    word_boxes: List[Dict],
+    sat_threshold: int = 55,
+    min_sat_ratio: float = 0.25,
+) -> None:
+    """Annotate each word_box in-place with its detected text color.
+
+    Adds ``color`` (hex string) and ``color_name`` (e.g. 'red', 'black')
+    keys to each dict.  Words that fail any check are marked black.
+
+    Algorithm per word:
+      1. Crop the word region from the image.
+      2. Build a text mask: Otsu-thresholded dark pixels OR pixels whose
+         saturation exceeds ``sat_threshold``.
+      3. Sample background color from the 2px border ring of the crop.
+      4. Remove text pixels that match a tinted background (avoids colored
+         backgrounds like blue boxes leaking into the result).
+      5. Require a minimum median saturation and a minimum ratio of
+         saturated pixels, then name the color from the hue of the
+         saturated pixels.  Red sits on both ends of the hue axis, so a
+         red majority is detected by counting both red bands; otherwise
+         the **median** hue (robust to outliers) decides.
+
+    Args:
+        img_bgr: BGR image the word boxes refer to; ``None`` is a no-op.
+        word_boxes: Dicts with ``left``/``top``/``width``/``height`` in
+            pixels; mutated in place.
+        sat_threshold: Saturation above which a pixel counts as colored.
+        min_sat_ratio: Minimum share of text pixels that must be saturated
+            for the word to be called colored.
+    """
+    if img_bgr is None or not word_boxes:
+        return
+
+    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
+    img_h, img_w = img_bgr.shape[:2]
+
+    colored_count = 0
+
+    for wb in word_boxes:
+        x1 = max(0, int(wb["left"]))
+        y1 = max(0, int(wb["top"]))
+        x2 = min(img_w, int(wb["left"] + wb["width"]))
+        y2 = min(img_h, int(wb["top"] + wb["height"]))
+
+        # Default verdict; only replaced once the word passes every check.
+        wb["color"] = _COLOR_HEX["black"]
+        wb["color_name"] = "black"
+
+        if x2 <= x1 or y2 <= y1:
+            continue  # box is empty or lies outside the image
+
+        crop_hsv = img_hsv[y1:y2, x1:x2]
+        crop_gray = cv2.cvtColor(img_bgr[y1:y2, x1:x2], cv2.COLOR_BGR2GRAY)
+        ch, cw = crop_hsv.shape[:2]
+
+        # --- Text mask: Otsu (adaptive) + high-saturation pixels ---
+        _, dark_mask = cv2.threshold(
+            crop_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
+        )
+        sat_mask = (crop_hsv[:, :, 1] > sat_threshold).astype(np.uint8) * 255
+        text_mask = cv2.bitwise_or(dark_mask, sat_mask)
+
+        text_pixels = crop_hsv[text_mask > 0]
+        if len(text_pixels) < 3:
+            continue
+
+        # --- Background subtraction via border pixels ---
+        # Sample background from the 2px border ring of the crop
+        if ch > 6 and cw > 6:
+            border = 2
+            bg_pixels = np.vstack([
+                crop_hsv[:border, :].reshape(-1, 3),
+                crop_hsv[-border:, :].reshape(-1, 3),
+                crop_hsv[border:-border, :border].reshape(-1, 3),
+                crop_hsv[border:-border, -border:].reshape(-1, 3),
+            ])
+
+            bg_med_h = float(np.median(bg_pixels[:, 0]))
+            bg_med_s = float(np.median(bg_pixels[:, 1]))
+
+            # If background is tinted (S > 15), remove text pixels
+            # with similar hue to avoid false colored detections
+            if bg_med_s > 15:
+                delta = np.abs(text_pixels[:, 0].astype(float) - bg_med_h)
+                # Hue is circular, so take the shorter way round.
+                hue_diff = np.minimum(delta, 180.0 - delta)
+                keep = hue_diff > 20
+                if np.any(keep):
+                    text_pixels = text_pixels[keep]
+
+        if len(text_pixels) < 3:
+            continue
+
+        # --- Classification using MEDIAN (robust to outliers) ---
+        median_sat = float(np.median(text_pixels[:, 1]))
+        sat_count = int(np.sum(text_pixels[:, 1] > sat_threshold))
+        sat_ratio = sat_count / len(text_pixels)
+
+        if median_sat < sat_threshold or sat_ratio < min_sat_ratio:
+            continue
+
+        # Use hue of saturated pixels only for cleaner signal
+        hues = text_pixels[text_pixels[:, 1] > sat_threshold][:, 0]
+        if len(hues) == 0:
+            continue  # reachable only with min_sat_ratio <= 0
+
+        # Red pixels split between the low and the high hue band would pull
+        # a plain median into the middle of the axis (green/blue).  Count
+        # both bands together first; the median handles everything else.
+        red_share = np.count_nonzero((hues < 10) | (hues > 170)) / len(hues)
+        if red_share > 0.5:
+            name = "red"
+        else:
+            name = _hue_to_color_name(float(np.median(hues)))
+
+        # Red requires higher saturation — scanner artifacts on black
+        # text often produce a slight warm tint (hue ~0) with low
+        # saturation that would otherwise be misclassified as red.
+        if name == "red" and median_sat < 90:
+            continue
+
+        wb["color"] = _COLOR_HEX.get(name, _COLOR_HEX["black"])
+        wb["color_name"] = name
+        colored_count += 1
+
+    if colored_count:
+        logger.info("color annotation: %d / %d words are colored",
+                     colored_count, len(word_boxes))
+
+
+# ---------------------------------------------------------------------------
+# 2.  Recover colored text that OCR missed
+# ---------------------------------------------------------------------------
+
+def recover_colored_text(
+    img_bgr: np.ndarray,
+    existing_words: List[Dict],
+    min_area: int = 40,
+    max_regions: int = 60,
+) -> List[Dict]:
+    """Find colored text regions not covered by any existing word box.
+
+    Returns a list of recovered word dicts with ``color``, ``color_name``,
+    and ``recovered=True`` fields.  The ``text`` is set via a lightweight
+    shape heuristic (e.g. ``!`` for tall narrow shapes) or ``?``.
+
+    Args:
+        img_bgr: BGR page image; ``None`` yields an empty list.
+        existing_words: OCR word dicts with ``left``/``top``/``width``/
+            ``height``; their padded boxes are excluded from the search.
+            ``None`` is treated like an empty list.
+        min_area: Minimum contour area (px) for a region to be kept.
+        max_regions: Cap on recovered regions **per color** (largest
+            first), not on the total returned.
+    """
+    if img_bgr is None:
+        return []
+
+    words = existing_words or []
+
+    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
+    ih, iw = img_bgr.shape[:2]
+    max_area = int(ih * iw * 0.005)
+
+    # --- Build occupancy mask from existing words (adaptive padding) ---
+    # Pad word boxes generously to prevent colored-pixel artifacts in
+    # narrow inter-word gaps from being recovered as false characters.
+    heights = [wb["height"] for wb in words if wb.get("height", 0) > 0]
+    median_h = int(np.median(heights)) if heights else 20
+    pad = max(8, int(median_h * 0.35))
+
+    occupied = np.zeros((ih, iw), dtype=np.uint8)
+    for wb in words:
+        x1 = max(0, int(wb["left"]) - pad)
+        y1 = max(0, int(wb["top"]) - pad)
+        # Clamp the far edge to >= 0 as well: a box lying entirely left of /
+        # above the image would otherwise give a negative slice end, which
+        # NumPy reads as "count from the far side".
+        x2 = max(0, min(iw, int(wb["left"] + wb["width"]) + pad))
+        y2 = max(0, min(ih, int(wb["top"] + wb["height"]) + pad))
+        occupied[y1:y2, x1:x2] = 255
+
+    # Loop invariants: the free-space mask and both morphology kernels are
+    # the same for every color.
+    free = cv2.bitwise_not(occupied)
+    # Close with tall kernel to merge ! stroke + dot; open to remove specks.
+    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 8))
+    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+
+    recovered: List[Dict] = []
+    per_color: Dict[str, int] = {}
+
+    for color_name, ranges in _COLOR_RANGES.items():
+        # Create mask for this color
+        mask = np.zeros((ih, iw), dtype=np.uint8)
+        for lower, upper in ranges:
+            mask = cv2.bitwise_or(mask, cv2.inRange(img_hsv, lower, upper))
+
+        # Remove pixels already covered by existing OCR words
+        mask = cv2.bitwise_and(mask, free)
+
+        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel_close)
+        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_open)
+
+        contours, _ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+        )
+
+        candidates = []
+        for cnt in contours:
+            area = cv2.contourArea(cnt)
+            if area < min_area or area > max_area:
+                continue
+            bx, by, bw, bh = cv2.boundingRect(cnt)
+            if bh < 6:
+                continue
+            # Reject regions too wide to be single characters
+            if bw > median_h * 4:
+                continue
+            candidates.append((area, bx, by, bw, bh))
+
+        # Keep largest first, limited count
+        candidates.sort(key=lambda c: c[0], reverse=True)
+        selected = candidates[:max_regions]
+
+        for _area, bx, by, bw, bh in selected:
+            recovered.append({
+                "text": _identify_shape(bw, bh),
+                "left": bx,
+                "top": by,
+                "width": bw,
+                "height": bh,
+                "conf": 45,
+                "color": _COLOR_HEX.get(color_name, "#000000"),
+                "color_name": color_name,
+                "recovered": True,
+            })
+        if selected:
+            per_color[color_name] = len(selected)
+
+    if recovered:
+        logger.info(
+            "color recovery: %d colored regions found (%s)",
+            len(recovered),
+            ", ".join(f"{c}: {n}" for c, n in sorted(per_color.items())),
+        )
+
+    return recovered
+
+
+def _identify_shape(w: int, h: int) -> str:
+    """Guess a single-character marker from a bounding box's proportions.
+
+    Returns ``"!"`` for tall, narrow boxes, ``"•"`` for small, roughly
+    square ones, and ``"?"`` when neither pattern fits.
+    """
+    if h > 0:
+        ratio = w / h
+    else:
+        ratio = 1.0  # degenerate height: treat the box as square
+
+    tall_and_narrow = ratio < 0.55 and h > 10
+    if tall_and_narrow:
+        return "!"
+
+    small_and_squarish = 0.6 < ratio < 1.5 and max(w, h) < 25
+    return "•" if small_and_squarish else "?"
@@ -0,0 +1,413 @@
+"""
+PP-DocLayout ONNX Document Layout Detection.
+
+Uses PP-DocLayout ONNX model to detect document structure regions:
+  table, figure, title, text, list, header, footer, equation, reference, abstract
+
+Fallback: If ONNX model not available, returns empty list (caller should
+fall back to OpenCV-based detection in cv_graphic_detect.py).
+
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "detect_layout_regions",
+    "is_doclayout_available",
+    "get_doclayout_status",
+    "LayoutRegion",
+    "DOCLAYOUT_CLASSES",
+]
+
+# ---------------------------------------------------------------------------
+# Class labels (PP-DocLayout default order)
+# ---------------------------------------------------------------------------
+
+# Position in this list is the class id: _postprocess() uses the model's
+# class index to look up the label here and falls back to "class_<n>" for
+# an index outside the list.
+# NOTE(review): order and length must match the label list of the exported
+# model actually deployed — confirm.
+DOCLAYOUT_CLASSES = [
+    "table", "figure", "title", "text", "list",
+    "header", "footer", "equation", "reference", "abstract",
+]
+
+# ---------------------------------------------------------------------------
+# Data types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LayoutRegion:
+    """A detected document layout region.
+
+    Instances are built by ``_postprocess``; coordinates are in pixels of
+    the original (un-resized) image.
+    """
+    # Top-left corner of the box.
+    x: int
+    y: int
+    # Box size; _postprocess drops boxes narrower or shorter than 5 px.
+    width: int
+    height: int
+    label: str           # table, figure, title, text, list, etc.; "class_<n>" for unknown indices
+    confidence: float    # detection score, rounded to 4 decimals by _postprocess
+    label_index: int     # raw class index
+
+
+# ---------------------------------------------------------------------------
+# ONNX model loading
+# ---------------------------------------------------------------------------
+
+# Candidate model locations, tried in order by _find_model_path().
+# The list is built once when the module is imported, so the environment
+# variable is read at import time; an unset variable yields "" which the
+# lookup skips.
+_MODEL_SEARCH_PATHS = [
+    # 1. Explicit environment variable
+    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
+    # 2. Docker default cache path
+    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
+    # 3. Local dev relative to working directory
+    "models/onnx/pp-doclayout/model.onnx",
+]
+
+# Module-level cache written by _load_onnx_session().
+_onnx_session: Optional[object] = None   # the loaded session, or None
+_model_path: Optional[str] = None        # resolved path of the loaded model
+_load_attempted: bool = False            # True after the first load attempt, successful or not
+_load_error: Optional[str] = None        # reason the load failed, if it did
+
+
+def _find_model_path() -> Optional[str]:
+    """Return the resolved path of the first existing model file, else None.
+
+    Candidates come from ``_MODEL_SEARCH_PATHS`` and are checked in order;
+    empty entries are ignored.
+    """
+    hits = (
+        str(Path(candidate).resolve())
+        for candidate in _MODEL_SEARCH_PATHS
+        if candidate and Path(candidate).is_file()
+    )
+    return next(hits, None)
+
+
+def _load_onnx_session():
+    """Return the cached ONNX Runtime session, creating it on the first call.
+
+    Only the first call does any work; its outcome (a session or ``None``)
+    is what every later call returns.  A failure reason is stored in
+    ``_load_error``; on success the model file is stored in ``_model_path``.
+    """
+    global _onnx_session, _model_path, _load_attempted, _load_error
+
+    if not _load_attempted:
+        _load_attempted = True
+
+        model_file = _find_model_path()
+        if model_file is None:
+            _load_error = "ONNX model not found in any search path"
+            logger.info("PP-DocLayout: %s", _load_error)
+            return None
+
+        try:
+            import onnxruntime as ort  # type: ignore[import-untyped]
+
+            options = ort.SessionOptions()
+            options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+            # CPU only, so the GPU stays free for OCR / LLM work.
+            _onnx_session = ort.InferenceSession(
+                model_file, options, providers=["CPUExecutionProvider"],
+            )
+            _model_path = model_file
+            logger.info("PP-DocLayout: model loaded from %s", model_file)
+        except ImportError:
+            _load_error = "onnxruntime not installed"
+            logger.info("PP-DocLayout: %s", _load_error)
+        except Exception as err:
+            _load_error = str(err)
+            logger.warning("PP-DocLayout: failed to load model from %s: %s", model_file, err)
+
+    return _onnx_session
+
+
+# ---------------------------------------------------------------------------
+# Public helpers
+# ---------------------------------------------------------------------------
+
+
+def is_doclayout_available() -> bool:
+    """Report whether a PP-DocLayout session could be obtained.
+
+    Triggers the lazy model load if it has not been attempted yet.
+    """
+    session = _load_onnx_session()
+    return session is not None
+
+
+def get_doclayout_status() -> Dict:
+    """Collect diagnostic details about the DocLayout backend.
+
+    The lazy model load is attempted first, so the reported state reflects
+    a real load attempt rather than the untouched defaults.
+    """
+    _load_onnx_session()
+
+    status: Dict = {"available": _onnx_session is not None}
+    status["model_path"] = _model_path
+    status["load_error"] = _load_error
+    status["classes"] = DOCLAYOUT_CLASSES
+    status["class_count"] = len(DOCLAYOUT_CLASSES)
+    return status
+
+
+# ---------------------------------------------------------------------------
+# Pre-processing
+# ---------------------------------------------------------------------------
+
+_INPUT_SIZE = 800  # PP-DocLayout expects 800x800
+
+
+def preprocess_image(img_bgr: np.ndarray) -> tuple:
+    """Letterbox and normalize an image into the PP-DocLayout input tensor.
+
+    The image is resized to fit inside ``_INPUT_SIZE`` x ``_INPUT_SIZE``
+    with its aspect ratio kept, centred on a gray (114) canvas, scaled to
+    [0, 1] float32 and laid out as (1, 3, H, W).
+
+    NOTE(review): the tensor keeps the input's BGR channel order and uses
+    plain 0-1 scaling — confirm this matches the exported model.
+
+    Args:
+        img_bgr: Non-empty 3-channel image in OpenCV layout (H, W, 3).
+
+    Returns:
+        ``(input_tensor, scale, pad_x, pad_y)``.  A model-space coordinate
+        maps back to the original image as ``(coord - pad) / scale``; one
+        ``scale`` applies to both axes.
+    """
+    orig_h, orig_w = img_bgr.shape[:2]
+
+    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
+    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
+    # Keep each side within [1, _INPUT_SIZE]: for extreme aspect ratios the
+    # short side would otherwise truncate to 0 px and the resize would fail.
+    new_w = min(_INPUT_SIZE, max(1, int(orig_w * scale)))
+    new_h = min(_INPUT_SIZE, max(1, int(orig_h * scale)))
+
+    import cv2  # local import — cv2 is always available in this service
+    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+
+    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114)
+    pad_x = (_INPUT_SIZE - new_w) // 2
+    pad_y = (_INPUT_SIZE - new_h) // 2
+    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
+    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
+
+    # Normalize to [0, 1] float32
+    blob = padded.astype(np.float32) / 255.0
+
+    # HWC → CHW
+    blob = blob.transpose(2, 0, 1)
+
+    # Add batch dimension → (1, 3, 800, 800)
+    blob = np.expand_dims(blob, axis=0)
+
+    return blob, scale, pad_x, pad_y
+
+
+# ---------------------------------------------------------------------------
+# Non-Maximum Suppression (NMS)
+# ---------------------------------------------------------------------------
+
+
+def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
+    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.
+
+    Returns 0.0 when the boxes do not overlap or the union is not positive.
+    """
+    ax1, ay1, ax2, ay2 = box_a[0], box_a[1], box_a[2], box_a[3]
+    bx1, by1, bx2, by2 = box_b[0], box_b[1], box_b[2], box_b[3]
+
+    overlap_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
+    overlap_h = max(0.0, min(ay2, by2) - max(ay1, by1))
+    inter = overlap_w * overlap_h
+    if inter == 0:
+        return 0.0
+
+    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
+    if union > 0:
+        return inter / union
+    return 0.0
+
+
+def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
+    """Apply greedy Non-Maximum Suppression.
+
+    The highest-scoring box is kept, every remaining box whose IoU with it
+    reaches ``iou_threshold`` is discarded, and the process repeats on what
+    is left.  The IoU of one kept box against all remaining boxes is
+    computed in a single vectorised step.
+
+    Args:
+        boxes: (N, 4) array of [x1, y1, x2, y2].
+        scores: (N,) confidence scores.
+        iou_threshold: Overlap threshold for suppression; a box survives
+            only while its IoU with every kept box is strictly below it.
+
+    Returns:
+        List of kept indices (plain ints), in descending score order.
+    """
+    if len(boxes) == 0:
+        return []
+
+    coords = np.asarray(boxes, dtype=np.float64)
+    x1, y1, x2, y2 = coords[:, 0], coords[:, 1], coords[:, 2], coords[:, 3]
+    areas = (x2 - x1) * (y2 - y1)
+
+    order = np.argsort(scores)[::-1]  # highest score first
+    keep: List[int] = []
+
+    while order.size > 0:
+        best = int(order[0])
+        keep.append(best)
+        rest = order[1:]
+        if rest.size == 0:
+            break
+
+        inter_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
+        inter_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
+        inter = inter_w * inter_h
+        union = areas[best] + areas[rest] - inter
+
+        # IoU stays 0 wherever the union is not positive (degenerate boxes).
+        iou = np.zeros_like(inter)
+        np.divide(inter, union, out=iou, where=union > 0)
+
+        order = rest[iou < iou_threshold]
+
+    return keep
+
+
+# ---------------------------------------------------------------------------
+# Post-processing
+# ---------------------------------------------------------------------------
+
+
+def _postprocess(
+    outputs: list,
+    scale: float,
+    pad_x: int,
+    pad_y: int,
+    orig_w: int,
+    orig_h: int,
+    confidence_threshold: float,
+    max_regions: int,
+) -> List[LayoutRegion]:
+    """Parse ONNX output tensors into LayoutRegion list.
+
+    PP-DocLayout ONNX typically outputs one tensor of shape
+    (1, N, 6) or three tensors (boxes, scores, class_ids).
+    We handle both common formats.
+
+    Pipeline: decode tensors -> confidence filter -> NMS (IoU 0.5, applied
+    across all classes together) -> map boxes back to original image
+    coordinates -> drop boxes under 5 px -> sort by confidence -> truncate.
+
+    Args:
+        outputs: Raw output list from the ONNX session run.
+        scale: Resize factor returned by ``preprocess_image``.
+        pad_x: Horizontal letterbox padding returned by ``preprocess_image``.
+        pad_y: Vertical letterbox padding returned by ``preprocess_image``.
+        orig_w: Width of the original image, used for clamping.
+        orig_h: Height of the original image, used for clamping.
+        confidence_threshold: Minimum score for a detection to be kept.
+        max_regions: Maximum number of regions returned.
+
+    Returns:
+        Regions sorted by confidence descending; an empty list when the
+        output layout is not recognised or nothing passes the filters.
+    """
+    regions: List[LayoutRegion] = []
+
+    # --- Determine output format ---
+    # NOTE(review): the column order assumed below (box first, then score,
+    # then class) is not verified against the deployed model — confirm.
+    if len(outputs) == 1:
+        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
+        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
+        if raw.ndim == 1:
+            # A single detection was squeezed down to 1-D; restore the row axis.
+            raw = raw.reshape(1, -1)
+        if raw.shape[0] == 0:
+            return []
+
+        if raw.shape[1] == 6:
+            # Format: x1, y1, x2, y2, score, class_id
+            all_boxes = raw[:, :4]
+            all_scores = raw[:, 4]
+            all_classes = raw[:, 5].astype(int)
+        elif raw.shape[1] > 6:
+            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
+            # Final score = objectness * best class confidence.
+            all_boxes = raw[:, :4]
+            cls_scores = raw[:, 5:]
+            all_classes = np.argmax(cls_scores, axis=1)
+            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
+        else:
+            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
+            return []
+
+    elif len(outputs) == 3:
+        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
+        all_boxes = np.squeeze(outputs[0])
+        all_scores = np.squeeze(outputs[1])
+        all_classes = np.squeeze(outputs[2]).astype(int)
+        if all_boxes.ndim == 1:
+            # Single detection: squeeze removed the N axis from all three.
+            all_boxes = all_boxes.reshape(1, 4)
+            all_scores = np.array([all_scores])
+            all_classes = np.array([all_classes])
+    else:
+        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
+        return []
+
+    # --- Confidence filter ---
+    mask = all_scores >= confidence_threshold
+    boxes = all_boxes[mask]
+    scores = all_scores[mask]
+    classes = all_classes[mask]
+
+    if len(boxes) == 0:
+        return []
+
+    # --- NMS ---
+    # Runs on all boxes together, so a box can suppress an overlapping box
+    # of a different class.
+    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
+    boxes = boxes[keep_idxs]
+    scores = scores[keep_idxs]
+    classes = classes[keep_idxs]
+
+    # --- Scale boxes back to original image coordinates ---
+    for i in range(len(boxes)):
+        x1, y1, x2, y2 = boxes[i]
+
+        # Remove padding offset
+        x1 = (x1 - pad_x) / scale
+        y1 = (y1 - pad_y) / scale
+        x2 = (x2 - pad_x) / scale
+        y2 = (y2 - pad_y) / scale
+
+        # Clamp to original dimensions
+        x1 = max(0, min(x1, orig_w))
+        y1 = max(0, min(y1, orig_h))
+        x2 = max(0, min(x2, orig_w))
+        y2 = max(0, min(y2, orig_h))
+
+        w = int(round(x2 - x1))
+        h = int(round(y2 - y1))
+        # Discard slivers (including boxes that collapsed during clamping).
+        if w < 5 or h < 5:
+            continue
+
+        cls_idx = int(classes[i])
+        # Unknown class ids get a synthetic "class_<n>" label instead of failing.
+        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"
+
+        regions.append(LayoutRegion(
+            x=int(round(x1)),
+            y=int(round(y1)),
+            width=w,
+            height=h,
+            label=label,
+            confidence=round(float(scores[i]), 4),
+            label_index=cls_idx,
+        ))
+
+    # Sort by confidence descending, limit
+    regions.sort(key=lambda r: r.confidence, reverse=True)
+    return regions[:max_regions]
+
+
+# ---------------------------------------------------------------------------
+# Main detection function
+# ---------------------------------------------------------------------------
+
+
+def detect_layout_regions(
+    img_bgr: np.ndarray,
+    confidence_threshold: float = 0.5,
+    max_regions: int = 50,
+) -> List[LayoutRegion]:
+    """Run PP-DocLayout on an image and return the detected layout regions.
+
+    Args:
+        img_bgr: BGR color image (OpenCV format).
+        confidence_threshold: Detections scoring below this are dropped.
+        max_regions: Upper bound on the number of regions returned.
+
+    Returns:
+        LayoutRegion objects, highest confidence first.  The list is empty
+        when the model is unavailable, the image is missing or empty, or
+        inference fails.
+    """
+    session = _load_onnx_session()
+    if session is None or img_bgr is None or img_bgr.size == 0:
+        return []
+
+    orig_h, orig_w = img_bgr.shape[:2]
+    input_tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)
+
+    try:
+        feed = {session.get_inputs()[0].name: input_tensor}
+        outputs = session.run(None, feed)
+    except Exception as exc:
+        logger.warning("PP-DocLayout inference failed: %s", exc)
+        return []
+
+    regions = _postprocess(
+        outputs,
+        scale=scale,
+        pad_x=pad_x,
+        pad_y=pad_y,
+        orig_w=orig_w,
+        orig_h=orig_h,
+        confidence_threshold=confidence_threshold,
+        max_regions=max_regions,
+    )
+
+    if not regions:
+        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
+        return regions
+
+    # Per-label tally, used only for the summary log line.
+    label_counts: Dict[str, int] = {}
+    for region in regions:
+        label_counts[region.label] = label_counts.get(region.label, 0) + 1
+    summary = ", ".join(f"{k}: {v}" for k, v in sorted(label_counts.items()))
+    logger.info("PP-DocLayout: %d regions (%s)", len(regions), summary)
+
+    return regions
@@ -0,0 +1,422 @@
+"""
+Graphical element detection for OCR pages.
+
+Region-based approach:
+  1. Build a color mask (saturation channel — black text is invisible).
+  2. Dilate heavily to merge nearby colored pixels into regions.
+  3. For each region, check overlap with OCR word boxes:
+       - High word overlap → colored text (skip)
+       - Low word overlap  → colored graphic / image (keep)
+  4. Separately detect large black-ink illustrations via ink mask.
+
+Boxes and text colors are handled by cv_box_detect / cv_color_detect.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["detect_graphic_elements", "GraphicElement"]
+
+
+@dataclass
+class GraphicElement:
+    """A detected non-text graphical element."""
+    # Bounding box in image pixels (top-left corner plus size).
+    x: int
+    y: int
+    width: int
+    height: int
+    # Size measure; the PP-DocLayout branch of detect_graphic_elements()
+    # stores width * height here.
+    area: int
+    shape: str          # image, illustration (PP-DocLayout branch: raw layout label for non-figure/table regions)
+    color_name: str     # dominant color or 'black'
+    color_hex: str      # hex string matching color_name
+    confidence: float
+    # Optional contour data; excluded from repr.  The PP-DocLayout branch
+    # always passes None.
+    contour: Any = field(default=None, repr=False)
+
+
+# ---------------------------------------------------------------------------
+# Color helpers
+# ---------------------------------------------------------------------------
+
+_COLOR_HEX = {
+    "black": "#000000",
+    "gray": "#6b7280",
+    "red": "#dc2626",
+    "orange": "#ea580c",
+    "yellow": "#ca8a04",
+    "green": "#16a34a",
+    "blue": "#2563eb",
+    "purple": "#9333ea",
+}
+
+
+def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
+    """Return (color_name, color_hex) for an HSV region.
+
+    The region counts as black unless at least 15% of its pixels exceed
+    ``sat_threshold`` in saturation and at least three such pixels exist.
+    The color name then comes from the hue of the saturated pixels.  Red
+    sits on both ends of the hue axis, so a red majority is detected by
+    counting both red bands together; otherwise the median hue decides.
+
+    Args:
+        hsv_roi: HSV pixels with the channel as last axis (any leading shape).
+        sat_threshold: Saturation above which a pixel counts as colored.
+    """
+    black = ("black", _COLOR_HEX["black"])
+    if hsv_roi.size == 0:
+        return black
+
+    pixels = hsv_roi.reshape(-1, 3)
+    sat_mask = pixels[:, 1] > sat_threshold
+    sat_ratio = np.sum(sat_mask) / len(pixels)
+    if sat_ratio < 0.15:
+        return black
+
+    hues = pixels[sat_mask][:, 0]
+    if len(hues) < 3:
+        return black
+
+    # Red pixels split between the low and the high hue band would pull a
+    # plain median into the middle of the axis and name the region
+    # green/blue, so a majority across both bands is settled first.
+    red_share = np.count_nonzero((hues < 10) | (hues > 170)) / len(hues)
+    if red_share > 0.5:
+        name = "red"
+    else:
+        med_hue = float(np.median(hues))
+        if med_hue < 10 or med_hue > 170:
+            name = "red"
+        elif med_hue < 25:
+            name = "orange"
+        elif med_hue < 35:
+            name = "yellow"
+        elif med_hue < 85:
+            name = "green"
+        elif med_hue < 130:
+            name = "blue"
+        else:
+            name = "purple"
+
+    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
+
+
+# ---------------------------------------------------------------------------
+# Main detection
+# ---------------------------------------------------------------------------
+
+def detect_graphic_elements(
+    img_bgr: np.ndarray,
+    word_boxes: List[Dict],
+    detected_boxes: Optional[List[Dict]] = None,
+    max_elements: int = 50,
+) -> List[GraphicElement]:
+    """Find non-text graphical regions on the page.
+
+    Region-based: dilate color mask to form regions, then check word
+    overlap to distinguish colored text from colored graphics.
+
+    Args:
+        img_bgr: BGR color image.
+        word_boxes: List of OCR word dicts with left/top/width/height.
+        detected_boxes: Optional list of detected box dicts (x/y/w/h).
+        max_elements: Maximum number of elements to return.
+
+    Returns:
+        List of GraphicElement, sorted by area descending.
+    """
+    if img_bgr is None:
+        return []
+
+    # ------------------------------------------------------------------
+    # Try PP-DocLayout ONNX first if available
+    # ------------------------------------------------------------------
+    import os
+    backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
+    if backend in ("doclayout", "auto"):
+        try:
+            from cv_doclayout_detect import detect_layout_regions, is_doclayout_available
+            if is_doclayout_available():
+                regions = detect_layout_regions(img_bgr)
+                if regions:
+                    _LABEL_TO_COLOR = {
+                        "figure": ("image", "green", _COLOR_HEX.get("green", "#16a34a")),
+                        "table":  ("image", "blue",  _COLOR_HEX.get("blue", "#2563eb")),
+                    }
+                    converted: List[GraphicElement] = []
+                    for r in regions:
+                        shape, color_name, color_hex = _LABEL_TO_COLOR.get(
+                            r.label,
+                            (r.label, "gray", _COLOR_HEX.get("gray", "#6b7280")),
+                        )
+                        converted.append(GraphicElement(
+                            x=r.x,
+                            y=r.y,
+                            width=r.width,
+                            height=r.height,
+                            area=r.width * r.height,
+                            shape=shape,
+                            color_name=color_name,
+                            color_hex=color_hex,
+                            confidence=r.confidence,
+                            contour=None,
+                        ))
+                    converted.sort(key=lambda g: g.area, reverse=True)
+                    result = converted[:max_elements]
+                    if result:
+                        shape_counts: Dict[str, int] = {}
+                        for g in result:
+                            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
+                        logger.info(
+                            "GraphicDetect (PP-DocLayout): %d elements (%s)",
+                            len(result),
+                            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
+                        )
+                    return result
+        except Exception as e:
+            logger.warning("PP-DocLayout failed, falling back to OpenCV: %s", e)
+    # ------------------------------------------------------------------
+    # OpenCV fallback (original logic)
+    # ------------------------------------------------------------------
+
+    h, w = img_bgr.shape[:2]
+
+    logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
+                 w, h, len(word_boxes), len(detected_boxes or []))
+
+    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
+    candidates: List[GraphicElement] = []
+
+    # --- Build word mask (for overlap checking) ---
+    word_mask = np.zeros((h, w), dtype=np.uint8)
+    for wb in word_boxes:
+        x1 = max(0, int(wb.get("left", 0)))
+        y1 = max(0, int(wb.get("top", 0)))
+        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
+        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
+        word_mask[y1:y2, x1:x2] = 255
+
+    # =====================================================================
+    # PASS 1 — COLORED IMAGE REGIONS
+    # =====================================================================
+    # Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
+    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
+    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
+    color_pixels = cv2.bitwise_and(sat_mask, val_mask)
+
+    # Remove tiny speckle
+    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+    color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
+
+    # Count raw colored pixels before dilation (for density check later)
+    color_pixel_raw = color_pixels.copy()
+
+    # Heavy dilation to merge nearby colored elements into regions.
+    # A 25x25 kernel merges elements within ~12px of each other.
+    kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
+    region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
+
+    contours_regions, _ = cv2.findContours(
+        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
+
+    for cnt in contours_regions:
+        bx, by, bw, bh = cv2.boundingRect(cnt)
+
+        # Skip tiny regions
+        if bw < 15 or bh < 15:
+            continue
+
+        # Skip page-spanning regions
+        if bw > w * 0.6 or bh > h * 0.6:
+            logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
+            continue
+
+        bbox_area = bw * bh
+
+        # Check: how much of this region's bounding box overlaps with words?
+        roi_words = word_mask[by:by + bh, bx:bx + bw]
+        word_pixel_count = int(np.sum(roi_words > 0))
+        word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
+
+        # Check: how many OCR word centroids fall inside this region?
+        # Colored text that OCR detected will have multiple centroids inside.
+        # Actual images may have 0-1 spurious OCR artifacts.
+        word_centroid_count = sum(
+            1 for wb in word_boxes
+            if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw
+                and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh)
+        )
+
+        # Check: how many actual colored pixels are in this region?
+        roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
+        color_pixel_count = int(np.sum(roi_color > 0))
+
+        # Color pixel density (before any skip checks so we can log it)
+        density = color_pixel_count / bbox_area if bbox_area > 0 else 0
+
+        # --- Skip heuristics for colored TEXT (not images) ---
+
+        # (a) High word-box pixel overlap → clearly text
+        if word_overlap > 0.40:
+            logger.info(
+                "GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
+                "overlap=%.0f%% centroids=%d",
+                bx, by, bw, bh, word_overlap * 100, word_centroid_count,
+            )
+            continue
+
+        # (b) Multiple OCR words detected inside → colored text
+        #     (images rarely produce 2+ confident word detections)
+        if word_centroid_count >= 2:
+            logger.info(
+                "GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
+                "centroids=%d overlap=%.0f%% density=%.0f%%",
+                bx, by, bw, bh, word_centroid_count,
+                word_overlap * 100, density * 100,
+            )
+            continue
+
+        # (c) Even 1 word + some pixel overlap → likely text
+        if word_centroid_count >= 1 and word_overlap > 0.10:
+            logger.info(
+                "GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
+                "centroids=%d overlap=%.0f%%",
+                bx, by, bw, bh, word_centroid_count, word_overlap * 100,
+            )
+            continue
+
+        # Need a minimum number of colored pixels (not just dilated area)
+        if color_pixel_count < 200:
+            continue
+
+        # (d) Very low density → thin strokes, almost certainly text.
+        # Large regions (photos/illustrations) can have low color density
+        # because most pixels are grayscale ink.  Use a lower threshold
+        # for regions bigger than 100×80 px.
+        _min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
+        if density < _min_density:
+            logger.info(
+                "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
+                "density=%.0f%% (min=%.0f%%, likely colored text)",
+                bx, by, bw, bh, density * 100, _min_density * 100,
+            )
+            continue
+
+        # (e) Moderate density + small height → colored text line
+        if density < 0.35 and bh < h * 0.05:
+            logger.info(
+                "GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
+                "density=%.0f%% height=%.1f%%",
+                bx, by, bw, bh, density * 100, 100.0 * bh / h,
+            )
+            continue
+
+        # Determine dominant color from the actual colored pixels
+        roi_hsv = hsv[by:by + bh, bx:bx + bw]
+        color_px_mask = roi_color > 0
+        if np.sum(color_px_mask) > 0:
+            masked_hsv = roi_hsv[color_px_mask]
+            color_name, color_hex = _dominant_color(masked_hsv)
+        else:
+            color_name, color_hex = "black", _COLOR_HEX["black"]
+
+        # Confidence based on color density and low word overlap
+        conf = min(0.95, 0.5 + density * 0.5)
+
+        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
+                     bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
+        candidates.append(GraphicElement(
+            x=bx, y=by, width=bw, height=bh,
+            area=color_pixel_count,
+            shape="image",
+            color_name=color_name, color_hex=color_hex,
+            confidence=round(conf, 2), contour=cnt,
+        ))
+
+    # =====================================================================
+    # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
+    # =====================================================================
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Exclude words and colored regions already found
+    exclusion = np.zeros((h, w), dtype=np.uint8)
+    word_pad = 5
+    for wb in word_boxes:
+        x1 = max(0, int(wb.get("left", 0)) - word_pad)
+        y1 = max(0, int(wb.get("top", 0)) - word_pad)
+        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
+        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
+        exclusion[y1:y2, x1:x2] = 255
+
+    if detected_boxes:
+        for box in detected_boxes:
+            bbx = int(box.get("x", 0))
+            bby = int(box.get("y", 0))
+            bbw = int(box.get("w", box.get("width", 0)))
+            bbh = int(box.get("h", box.get("height", 0)))
+            inset = 8
+            x1 = max(0, bbx + inset)
+            y1 = max(0, bby + inset)
+            x2 = min(w, bbx + bbw - inset)
+            y2 = min(h, bby + bbh - inset)
+            if x2 > x1 and y2 > y1:
+                exclusion[y1:y2, x1:x2] = 255
+
+    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
+    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
+
+    contours_ink, _ = cv2.findContours(
+        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
+
+    for cnt in contours_ink:
+        area = cv2.contourArea(cnt)
+        bx, by, bw, bh = cv2.boundingRect(cnt)
+
+        if area < 5000 or min(bw, bh) < 40:
+            continue
+        if bw > w * 0.8 or bh > h * 0.8:
+            continue
+
+        logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
+                     bx, by, bw, bh, int(area))
+        candidates.append(GraphicElement(
+            x=bx, y=by, width=bw, height=bh,
+            area=int(area), shape="illustration",
+            color_name="black", color_hex="#000000",
+            confidence=0.5, contour=cnt,
+        ))
+
+    # =====================================================================
+    # Deduplicate and return
+    # =====================================================================
+    candidates.sort(key=lambda g: g.area, reverse=True)
+
+    final: List[GraphicElement] = []
+    for c in candidates:
+        overlap = False
+        for f in final:
+            ix1 = max(c.x, f.x)
+            iy1 = max(c.y, f.y)
+            ix2 = min(c.x + c.width, f.x + f.width)
+            iy2 = min(c.y + c.height, f.y + f.height)
+            if ix2 > ix1 and iy2 > iy1:
+                inter = (ix2 - ix1) * (iy2 - iy1)
+                smaller = min(c.width * c.height, f.width * f.height)
+                if smaller > 0 and inter / smaller > 0.5:
+                    overlap = True
+                    break
+        if not overlap:
+            final.append(c)
+
+    result = final[:max_elements]
+
+    if result:
+        shape_counts: Dict[str, int] = {}
+        for g in result:
+            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
+        logger.info(
+            "GraphicDetect: %d elements found (%s)",
+            len(result),
+            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
+        )
+    else:
+        logger.info("GraphicDetect: no graphic elements found")
+
+    return result
@@ -0,0 +1,231 @@
+"""
+Syllable Core — hyphenator init, word validation, pipe autocorrect.
+
+Extracted from cv_syllable_detect.py for modularity.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Square brackets and IPA/phonetic symbols -- cells whose text matches are skipped as phonetic transcription
+_IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]')
+
+# Common German function words (stored lowercase) that must NOT be merged with adjacent tokens.
+_STOP_WORDS = frozenset([
+    # Articles
+    'der', 'die', 'das', 'dem', 'den', 'des',
+    'ein', 'eine', 'einem', 'einen', 'einer',
+    # Pronouns
+    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
+    # Prepositions
+    'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im',
+    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter',
+    'zwischen', 'ohne', 'gegen',
+    # Conjunctions
+    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
+    # Adverbs
+    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
+    # Verbs
+    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
+    'sein', 'haben',
+    # Other
+    'kein', 'keine', 'keinem', 'keinen', 'keiner',
+])
+
+# Lazily created pyphen hyphenators (German, English); filled in by _get_hyphenators()
+_hyph_de = None
+_hyph_en = None
+
+# Lazily created German SpellChecker; filled in by _get_spellchecker() (used by the pipe autocorrect)
+_spell_de = None
+
+
+def _get_hyphenators():
+    """Return the cached ``(German, English)`` pyphen hyphenators.
+
+    The hyphenators are created on first use and then kept in the module
+    globals ``_hyph_de`` / ``_hyph_en`` for all later calls.
+
+    Returns:
+        A ``(hyph_de, hyph_en)`` tuple, or ``(None, None)`` when ``pyphen``
+        cannot be imported.
+    """
+    global _hyph_de, _hyph_en
+    if _hyph_de is not None:
+        return _hyph_de, _hyph_en
+    try:
+        import pyphen
+    except ImportError:
+        return None, None
+    # Build both hyphenators before publishing either one: if the second
+    # constructor raised after the first global was already set, every
+    # later call would hand out a half-initialised ``(hyph_de, None)`` pair.
+    hyph_de = pyphen.Pyphen(lang='de_DE')
+    hyph_en = pyphen.Pyphen(lang='en_US')
+    _hyph_de, _hyph_en = hyph_de, hyph_en
+    return _hyph_de, _hyph_en
+
+
+def _get_spellchecker():
+    """Return the shared German ``SpellChecker``, creating it on first use.
+
+    The instance is stored in the module global ``_spell_de``.  Returns
+    ``None`` when the ``spellchecker`` package cannot be imported.
+    """
+    global _spell_de
+    if _spell_de is None:
+        try:
+            from spellchecker import SpellChecker
+        except ImportError:
+            return None
+        _spell_de = SpellChecker(language='de')
+    return _spell_de
+
+
+def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
+    """Tell whether pyphen can split *word* with its German or English patterns.
+
+    A word counts as known when inserting ``|`` at the hyphenation points
+    yields at least one ``|``.  Words shorter than two characters are never
+    considered known.  The German hyphenator is consulted first.
+    """
+    if len(word) < 2:
+        return False
+    for hyphenator in (hyph_de, hyph_en):
+        if '|' in hyphenator.inserted(word, hyphen='|'):
+            return True
+    return False
+
+
+def _is_real_word(word: str) -> bool:
+    """Report whether the spellchecker contains *word* (compared in lowercase).
+
+    Always ``False`` when no spellchecker is available.
+    """
+    checker = _get_spellchecker()
+    return checker is not None and word.lower() in checker
+
+
+def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
+    """Hyphenate *word*, preferring the German hyphenator over the English one.
+
+    Returns:
+        The word with ``|`` inserted at the hyphenation points found by the
+        first hyphenator that splits it, or ``None`` when neither does.
+    """
+    for hyphenator in (hyph_de, hyph_en):
+        split = hyphenator.inserted(word, hyphen='|')
+        if '|' in split:
+            return split
+    return None
+
+
+def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
+    """Repair a single word that carries OCR pipe artifacts.
+
+    On dictionary pages the printed syllable divider is often read by OCR
+    as ``|`` and/or as an extra thin character next to it.  Candidates are
+    validated against the ``spellchecker`` word list.
+
+    Steps, in order:
+        1. Remove every ``|``.  Results shorter than three characters are
+           returned as-is (too short to validate), as are results the
+           spellchecker already knows.
+        2. Delete one pipe-like character (``l``, ``I``, ``1``, ``i``,
+           ``t``) at a time, left to right, and return the first variant of
+           at least three characters that the spellchecker knows.
+        3. Ask the spellchecker for a correction of the lowercased word; if
+           it differs, return it with the first letter upper-cased when the
+           input started with an uppercase letter.
+
+    Returns:
+        The repaired word, or ``None`` when nothing above succeeded.
+    """
+    bare = word_with_pipes.replace('|', '')
+    if len(bare) < 3:
+        return bare  # too short to validate (this also covers the empty string)
+
+    # Step 1: already a real word once the pipes are gone
+    if _is_real_word(bare):
+        return bare
+
+    # Step 2: drop one pipe-like character at a time
+    pipe_like = frozenset('lI1it')
+    for pos, ch in enumerate(bare):
+        if ch in pipe_like:
+            shortened = bare[:pos] + bare[pos + 1:]
+            if len(shortened) >= 3 and _is_real_word(shortened):
+                return shortened
+
+    # Step 3: let the spellchecker propose a correction
+    checker = _get_spellchecker()
+    if checker is None:
+        return None
+    lowered = bare.lower()
+    guess = checker.correction(lowered)
+    if not guess or guess == lowered:
+        return None  # could not fix
+    if bare[0].isupper():
+        # Keep the capitalisation of the original first letter
+        guess = guess[0].upper() + guess[1:]
+    return guess
+
+
+def autocorrect_pipe_artifacts(
+    zones_data: List[Dict], session_id: str,
+) -> int:
+    """Strip OCR pipe artifacts and correct garbled words in-place.
+
+    Printed syllable divider lines on dictionary scans are read by OCR
+    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
+    For every cell whose ``col_type`` starts with ``column_`` this function:
+
+    1. Splits each word-box text containing ``|`` into leading punctuation,
+       word core and trailing punctuation.
+    2. Runs ``_autocorrect_piped_word`` on a core that contains ``|``
+       (e.g. ``Zeplpelin`` -> ``Zeppelin``).
+    3. If no correction is available (unknown word, or the spellchecker is
+       not installed), still removes the ``|`` characters, so word-box texts
+       never keep pipes that the cell text has lost.
+    4. Rebuilds the cell text from the word boxes when any box changed, and
+       finally removes any ``|`` left in the cell text.
+
+    Args:
+        zones_data: Zone dicts; each may hold a ``"cells"`` list whose cells
+            carry ``"col_type"``, ``"text"`` and optionally ``"word_boxes"``.
+            Cells and word boxes are modified in place.
+        session_id: Only used in the summary log message.
+
+    Returns:
+        The number of cells modified.
+    """
+    if _get_spellchecker() is None:
+        # Words cannot be validated or corrected without the spellchecker,
+        # but the pipes themselves are still stripped below.
+        logger.warning("spellchecker not available -- pipe autocorrect limited")
+
+    modified = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            ct = cell.get("col_type") or ""
+            if not ct.startswith("column_"):
+                continue
+
+            cell_changed = False
+
+            # --- Fix word boxes ---
+            for wb in cell.get("word_boxes", []):
+                wb_text = wb.get("text") or ""
+                if "|" not in wb_text:
+                    continue
+
+                # Separate leading/trailing punctuation from the word core
+                m = re.match(
+                    r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)'
+                    r'(.*?)'
+                    r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$',
+                    wb_text,
+                )
+                if not m:
+                    continue
+                lead, core, trail = m.group(1), m.group(2), m.group(3)
+
+                if "|" in core:
+                    corrected = _autocorrect_piped_word(core)
+                    if corrected is None:
+                        # No dictionary fix found: fall back to plain pipe
+                        # removal instead of leaving the artifact in place.
+                        corrected = core.replace("|", "")
+                else:
+                    corrected = core
+
+                # Pipes can also sit in the punctuation around the core
+                # (e.g. ``|Zelle`` or ``Zelle|,``); drop those as well.
+                new_text = (
+                    lead.replace("|", "") + corrected + trail.replace("|", "")
+                )
+                if new_text != wb_text:
+                    wb["text"] = new_text
+                    cell_changed = True
+
+            # --- Rebuild cell text from word boxes ---
+            if cell_changed:
+                wbs = cell.get("word_boxes", [])
+                if wbs:
+                    cell["text"] = " ".join(
+                        (wb.get("text") or "") for wb in wbs
+                    )
+                modified += 1
+
+            # --- Fallback: strip residual | from cell text ---
+            text = cell.get("text") or ""
+            if "|" in text:
+                cell["text"] = text.replace("|", "")
+                if not cell_changed:
+                    modified += 1
+
+    if modified:
+        logger.info(
+            "build-grid session %s: autocorrected pipe artifacts in %d cells",
+            session_id, modified,
+        )
+    return modified
@@ -0,0 +1,32 @@
+"""
+Syllable divider insertion for dictionary pages — barrel re-export.
+
+All implementation split into:
+  cv_syllable_core  — hyphenator init, word validation, pipe autocorrect
+  cv_syllable_merge — word gap merging, syllabification, divider insertion
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+# Core: init, validation, autocorrect
+from cv_syllable_core import (  # noqa: F401
+    _IPA_RE,
+    _STOP_WORDS,
+    _get_hyphenators,
+    _get_spellchecker,
+    _is_known_word,
+    _is_real_word,
+    _hyphenate_word,
+    _autocorrect_piped_word,
+    autocorrect_pipe_artifacts,
+)
+
+# Merge: gap merging, syllabify, insert
+from cv_syllable_merge import (  # noqa: F401
+    _try_merge_pipe_gaps,
+    merge_word_gaps_in_zones,
+    _try_merge_word_gaps,
+    _syllabify_text,
+    insert_syllable_dividers,
+)
@@ -0,0 +1,300 @@
+"""
+Syllable Merge — word gap merging, syllabification, divider insertion.
+
+Extracted from cv_syllable_detect.py for modularity.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+from cv_syllable_core import (
+    _get_hyphenators,
+    _hyphenate_word,
+    _IPA_RE,
+    _STOP_WORDS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
+    """Rejoin word fragments that OCR split at a printed syllable pipe.
+
+    The text is split on single spaces and walked left to right; each token
+    is either glued onto the previously emitted token or emitted on its own.
+    Because the comparison always uses the last emitted token, merges can
+    chain: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
+
+    Two neighbours are glued together only when all of these hold:
+    - the left token consists of letters only (a word start without any
+      attached punctuation); the right token may carry punctuation, which
+      stays attached to the merged word ("Ka" + "fer," -> "Kafer,")
+    - both tokens contain at least one letter
+    - neither letter core is a common German function word (``_STOP_WORDS``)
+    - the shorter letter core has at most 3 letters
+    - the two cores together have at least 4 letters
+    - pyphen finds a hyphenation point in the concatenated cores
+
+    Returns the text re-joined with single spaces.
+    """
+    tokens = text.split(' ')
+    if len(tokens) < 2:
+        return text
+
+    non_alpha = r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]'
+    merged = tokens[:1]
+    for token in tokens[1:]:
+        head = merged[-1]
+
+        # Letter-only cores used for the guards and the pyphen lookup
+        head_alpha = re.sub(non_alpha, '', head)
+        token_alpha = re.sub(non_alpha, '', token)
+
+        eligible = (
+            head == head_alpha  # left token is pure alpha (word start)
+            and head_alpha and token_alpha
+            and head_alpha.lower() not in _STOP_WORDS
+            and token_alpha.lower() not in _STOP_WORDS
+            and min(len(head_alpha), len(token_alpha)) <= 3
+            and len(head_alpha) + len(token_alpha) >= 4
+        )
+
+        if eligible and '-' in hyph_de.inserted(head_alpha + token_alpha, hyphen='-'):
+            # pyphen splits the joined word -- collapse the space
+            merged[-1] = head + token
+        else:
+            merged.append(token)
+
+    return ' '.join(merged)
+
+
+def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
+    """Rejoin OCR word fragments in the text of every content cell.
+
+    OCR often splits a word at a syllable boundary into separate tokens,
+    e.g. "zerknit tert" instead of "zerknittert".  Every cell whose
+    ``col_type`` starts with ``column_`` and whose text contains a space is
+    passed through ``_try_merge_word_gaps`` (fragment threshold 5 rather
+    than the 3 used by ``_try_merge_pipe_gaps``).  Cells that still match
+    ``_IPA_RE`` after bracketed ``[...]`` spans are removed are left alone.
+
+    Only ``cell["text"]`` is rewritten, in place.
+
+    Returns:
+        The number of cells modified; 0 when pyphen is unavailable.
+    """
+    hyph_de = _get_hyphenators()[0]
+    if hyph_de is None:
+        return 0
+
+    changed = 0
+    for zone in zones_data:
+        for cell in zone.get("cells", []):
+            if not cell.get("col_type", "").startswith("column_"):
+                continue
+
+            original = cell.get("text", "")
+            if not original or " " not in original:
+                continue
+
+            # Phonetic transcription outside of [...] -> do not touch the cell
+            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
+                continue
+
+            merged = _try_merge_word_gaps(original, hyph_de)
+            if merged != original:
+                cell["text"] = merged
+                changed += 1
+
+    if changed:
+        logger.info(
+            "build-grid session %s: merged word gaps in %d cells",
+            session_id, changed,
+        )
+    return changed
+
+
+def _try_merge_word_gaps(text: str, hyph_de) -> str:
+    """Rejoin OCR word fragments, allowing fragments of up to 5 letters.
+
+    Works like ``_try_merge_pipe_gaps`` but with a relaxed length guard: the
+    shorter of the two letter cores may have up to 5 letters instead of 3.
+    The remaining guards are the same -- left token made of letters only,
+    both cores non-empty, neither core in ``_STOP_WORDS``, at least 4
+    letters combined, and pyphen must find a hyphenation point in the
+    concatenated cores.
+
+    Returns the text re-joined with single spaces.
+    """
+    pieces = text.split(' ')
+    if len(pieces) < 2:
+        return text
+
+    strip_non_alpha = re.compile(
+        r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]'
+    ).sub
+
+    out = [pieces[0]]
+    for fragment in pieces[1:]:
+        left = out[-1]
+        left_core = strip_non_alpha('', left)
+        right_core = strip_non_alpha('', fragment)
+
+        candidate_ok = (
+            left == left_core
+            and left_core and right_core
+            and left_core.lower() not in _STOP_WORDS
+            and right_core.lower() not in _STOP_WORDS
+            and min(len(left_core), len(right_core)) <= 5
+            and len(left_core) + len(right_core) >= 4
+        )
+
+        if candidate_ok:
+            if '-' in hyph_de.inserted(left_core + right_core, hyphen='-'):
+                # Glue the fragment onto the previous token and move on
+                out[-1] = left + fragment
+                continue
+
+        out.append(fragment)
+
+    return ' '.join(out)
+
+
+def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
+    """Return *text* with ``|`` syllable dividers re-inserted by pyphen.
+
+    Processing order:
+    1. Empty text, or text that still matches ``_IPA_RE`` once bracketed
+       ``[...]`` spans are removed, is returned unchanged.
+    2. All existing ``|`` characters are removed.
+    3. Pipe-gap fragments are merged via ``_try_merge_pipe_gaps``.
+    4. The text is split on whitespace and ``,`` ``;`` ``:`` runs (the
+       separators are kept) and every token whose core has at least 3
+       characters and contains a letter is hyphenated with
+       ``_hyphenate_word``.  Punctuation around the core is preserved.
+    5. Tokens pyphen cannot split are kept as they are (no bad guesses).
+    """
+    if not text:
+        return text
+
+    # IPA transcription outside of [...] -> leave the whole cell untouched
+    if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', text)):
+        return text
+
+    # Drop old dividers, then close the gaps OCR left where pipes were printed
+    normalized = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)
+
+    out = []
+    for tok in re.split(r'(\s+|[,;:]+\s*)', normalized):
+        replacement = tok
+        # Separators and empty split artefacts pass through untouched
+        if tok and not re.match(r'^[\s,;:]+$', tok):
+            m = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', tok)
+            if m:
+                lead, word, trail = m.groups()
+                if len(word) >= 3 and re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', word):
+                    hyph = _hyphenate_word(word, hyph_de, hyph_en)
+                    if hyph:
+                        replacement = lead + hyph + trail
+        out.append(replacement)
+
+    return ''.join(out)
+
+
+def insert_syllable_dividers(
+    zones_data: List[Dict],
+    img_bgr: np.ndarray,
+    session_id: str,
+    *,
+    force: bool = False,
+    col_filter: Optional[set] = None,
+) -> int:
+    """Insert or normalise ``|`` syllable dividers in dictionary cells.
+
+    Cells whose ``col_type`` starts with ``column_`` are run through
+    ``_syllabify_text`` (strip existing pipes, merge pipe gaps, re-hyphenate
+    with pyphen) and ``cell["text"]`` is updated in place when it changes.
+
+    Unless *force* is set, two restrictions apply:
+    - Pre-check: if fewer than 1% of all content cells contain ``|``,
+      nothing is done and 0 is returned.
+    - Only cells that already contain ``|`` are processed; no new dividers
+      are added to other cells.
+
+    Args:
+        zones_data: Zone dicts, each optionally holding a ``"cells"`` list.
+        img_bgr: Not used by this function body.
+        session_id: Only used in log messages.
+        force: If True, skip the pipe-ratio pre-check and syllabify every
+            content cell, whether or not it already has pipe dividers.
+        col_filter: If set, only cells whose ``col_type`` is in this set are
+            processed.  ``None`` means all content columns.
+
+    Returns:
+        The number of cells modified; 0 when pyphen is not installed.
+    """
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
+        logger.warning("pyphen not installed -- skipping syllable insertion")
+        return 0
+
+    # Pre-check: how many content cells already carry | from OCR?
+    if not force:
+        content_texts = [
+            cell.get("text", "")
+            for zone in zones_data
+            for cell in zone.get("cells", [])
+            if cell.get("col_type", "").startswith("column_")
+        ]
+        if content_texts:
+            pipe_ratio = sum("|" in t for t in content_texts) / len(content_texts)
+            if pipe_ratio < 0.01:
+                logger.info(
+                    "build-grid session %s: skipping syllable insertion -- "
+                    "only %.1f%% of cells have existing pipes (need >=1%%)",
+                    session_id, pipe_ratio * 100,
+                )
+                return 0
+
+    insertions = 0
+    for zone in zones_data:
+        for cell in zone.get("cells", []):
+            col_type = cell.get("col_type", "")
+            if not col_type.startswith("column_"):
+                continue
+            if col_filter is not None and col_type not in col_filter:
+                continue
+
+            current = cell.get("text", "")
+            if not current:
+                continue
+
+            # Auto mode (force=False): only normalise cells that already
+            # have | from OCR; other words get no new syllable marks.
+            if not force and "|" not in current:
+                continue
+
+            updated = _syllabify_text(current, hyph_de, hyph_en)
+            if updated != current:
+                cell["text"] = updated
+                insertions += 1
+
+    if insertions:
+        logger.info(
+            "build-grid session %s: syllable dividers inserted/normalized "
+            "in %d cells (pyphen)",
+            session_id, insertions,
+        )
+    return insertions
@@ -0,0 +1,493 @@
+"""
+Cell text filtering, column/row word assignment, and bold detection.
+
+This module contains:
+- _assign_row_words_to_columns(): spatial assignment of OCR words to grid columns
+- Cell text noise filtering (_clean_cell_text, _clean_cell_text_lite, etc.)
+- Bold detection via stroke-width analysis (_measure_stroke_width, _classify_bold_cells)
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import re
+import logging
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+
+# ---------------------------------------------------------------------------
+# Column / Row word assignment
+# ---------------------------------------------------------------------------
+
+def _assign_row_words_to_columns(
+    row: RowGeometry,
+    columns: List[PageRegion],
+) -> Dict[int, List[Dict]]:
+    """Distribute the words of one row over the grid columns.
+
+    Every word ends up in exactly one column, chosen by the first rule
+    that applies:
+
+    1. Overlap: the column whose horizontal extent overlaps the word the
+       most (the first such column wins on ties).
+    2. Center range: if the word overlaps no column, the column whose
+       assignment range contains the word's center.  Ranges are split at
+       the midpoint of the gap between neighbouring columns; the first
+       range starts at 0 and the last one ends at ``row.width + 100``.
+    3. Nearest center: otherwise the column whose center is closest.
+
+    Word ``left`` values are compared against ``column.x - row.x``, i.e.
+    words are treated as relative to the row's left edge while columns
+    carry absolute x positions.
+
+    Args:
+        row: Row whose ``words`` (dicts with ``left`` / ``width``) are assigned.
+        columns: Columns in left-to-right order.
+
+    Returns:
+        Dict mapping column index -> words assigned to that column.  Every
+        column index is present, possibly with an empty list.
+    """
+    assigned: Dict[int, List[Dict]] = {idx: [] for idx in range(len(columns))}
+    if not row.words or not columns:
+        return assigned
+
+    origin_x = row.x  # content ROI left edge (absolute)
+    count = len(columns)
+
+    # (left, right) of every column relative to the row origin; computed once
+    # because it does not depend on the word being placed.
+    spans = [(c.x - origin_x, c.x - origin_x + c.width) for c in columns]
+
+    # Non-overlapping fallback ranges: neighbouring columns share the
+    # midpoint of the gap between them as their common boundary.
+    bounds = []
+    for idx, (left, right) in enumerate(spans):
+        lower = 0 if idx == 0 else (spans[idx - 1][1] + left) / 2
+        if idx == count - 1:
+            upper = row.width + 100  # generous for last column
+        else:
+            upper = (right + spans[idx + 1][0]) / 2
+        bounds.append((lower, upper))
+
+    for word in row.words:
+        word_left = word['left']
+        word_right = word_left + word['width']
+        center = word_left + word['width'] / 2
+
+        # Rule 1: largest horizontal overlap.  More robust than the center
+        # for narrow columns, where a word's center may already lie in the
+        # neighbouring column.
+        overlaps = [
+            max(0, min(word_right, right) - max(word_left, left))
+            for left, right in spans
+        ]
+        target = max(range(count), key=overlaps.__getitem__)
+
+        if not overlaps[target] > 0:
+            # Rule 2: center falls into a column's assignment range.
+            target = next(
+                (idx for idx, (lower, upper) in enumerate(bounds)
+                 if lower <= center < upper),
+                None,
+            )
+            if target is None:
+                # Rule 3: nearest column center.
+                target = min(
+                    range(count),
+                    key=lambda idx: abs(
+                        center - (spans[idx][0] + columns[idx].width / 2)
+                    ),
+                )
+
+        assigned[target].append(word)
+
+    return assigned
+
+
+# ---------------------------------------------------------------------------
+# Cell text noise filtering
+# ---------------------------------------------------------------------------
+
+# _RE_REAL_WORD: a run of at least 2 consecutive letters (ASCII Latin,
+# German umlauts/ß and a set of lowercase accented letters).
+# _RE_ALPHA: a single letter from the same character class.
+# NOTE: the accented letters (é, è, ê, ...) are listed in lowercase only, so
+# their uppercase forms do not count as letters here.
+_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
+_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
+
+# Common short EN/DE words (1-3 chars).  Short tokens (fewer than 4 letters)
+# at the end of a cell that appear neither here nor in _KNOWN_ABBREVIATIONS
+# are treated as trailing OCR noise.
+_COMMON_SHORT_WORDS: set = {
+    # EN 1-2 letter
+    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
+    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
+    'or', 'so', 'to', 'up', 'us', 'we',
+    # EN 3 letter
+    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
+    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
+    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
+    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
+    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
+    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
+    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
+    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
+    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
+    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
+    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
+    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
+    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
+    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
+    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
+    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
+    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
+    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
+    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
+    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
+    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
+    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
+    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
+    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
+    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
+    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
+    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
+    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
+    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
+    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
+    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
+    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
+    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
+    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
+    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
+    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
+    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
+    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
+    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
+    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
+    'zap', 'zip', 'zoo',
+    # DE 2-3 letter
+    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
+    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
+    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
+    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
+    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
+    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
+    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
+    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
+    'wut', 'zum', 'zur',
+}
+
+# Known abbreviations found in EN/DE textbooks and dictionaries.
+# Stored WITHOUT trailing period (the noise filter strips periods).
+# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
+_KNOWN_ABBREVIATIONS: set = {
+    # EN dictionary meta-words
+    'sth', 'sb', 'smth', 'smb', 'sbd',
+    # EN general
+    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
+    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
+    # EN references / textbook
+    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
+    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
+    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
+    'ans', 'wb', 'tb', 'vocab',
+    # EN parts of speech / grammar
+    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
+    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
+    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
+    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
+    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
+    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
+    'syn', 'ant', 'opp', 'var', 'orig',
+    # EN titles
+    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
+    # EN pronunciation
+    'br', 'am', 'brit', 'amer',
+    # EN units
+    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
+    # DE general
+    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
+    'bes', 'insb', 'insbes', 'bspw', 'ca',
+    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
+    'inkl', 'exkl', 'zzgl', 'abzgl',
+    # DE references
+    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
+    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
+    's', 'sp', 'zit', 'zs', 'vlg',
+    # DE grammar
+    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
+    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
+    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
+    # DE regional
+    'nordd', 'österr', 'schweiz',
+    # Linguistic
+    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
+    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
+    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
+}
+
+
+def _is_noise_tail_token(token: str) -> bool:
+    """Check if a token at the END of cell text is trailing OCR noise.
+
+    Trailing fragments are very common OCR artifacts from image edges,
+    borders, and neighbouring cells.  This is more aggressive than a
+    general word filter: any short token (fewer than 4 letters) that is
+    neither a common EN/DE short word nor a known abbreviation is
+    considered noise, as is any token that still contains non-letter
+    characters after dictionary punctuation has been removed.
+
+    Examples of noise: "3", "ee", "B", "x7", "B|"
+    Examples to keep:  "sister.", "cupcakes.", "...", "mice", "[eg]",
+                       "(auf)", "e.g.", "doesn't."
+
+    Args:
+        token: A single whitespace-delimited token.
+
+    Returns:
+        True if the token should be dropped as noise, False to keep it.
+    """
+    t = token.strip()
+    if not t:
+        return True
+
+    # Keep ellipsis
+    if t in ('...', '…'):
+        return False
+
+    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
+    # (a leading '[' also covers the '["' and "['" openers)
+    if t.startswith('[') or t.endswith(']'):
+        return False
+
+    # Keep meaningful punctuation tokens used in textbooks
+    # = (definition marker), (= (definition opener), ; (separator)
+    if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
+        return False
+
+    # Pure non-alpha -> noise ("3", ")", "|")
+    alpha_chars = _RE_ALPHA.findall(t)
+    if not alpha_chars:
+        return True
+
+    # Only the letters are used for the dictionary lookups.
+    cleaned = ''.join(alpha_chars)
+    cleaned_lower = cleaned.lower()
+
+    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
+    if cleaned_lower in _KNOWN_ABBREVIATIONS:
+        return False
+
+    # Remove punctuation that is normal in dictionary entries and running
+    # text: "(Salat-)Gurke", "Tanz(veranstaltung)", "wir/uns", "e.g.",
+    # "cupcakes." — and apostrophes, so that contractions such as
+    # "doesn't." or "it's" are not mistaken for OCR noise.  Anything that
+    # is still not a letter afterwards ("3d", "B|", "x7") is real noise.
+    # Stripping trailing punctuation separately is unnecessary because the
+    # same characters are removed everywhere by this substitution.
+    t_inner = re.sub(r"[()\-/.,;:!?'’]", '', t)
+    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
+    if len(t_inner) > len(inner_alpha):
+        return True
+
+    # Clean tokens: 4+ letters are likely real words; shorter ones must be
+    # in the short-word dictionary.  Everything else is noise.
+    return not (len(cleaned) >= 4 or cleaned_lower in _COMMON_SHORT_WORDS)
+
+
+def _is_garbage_text(text: str) -> bool:
+    """Decide whether a whole cell text is OCR garbage from image areas.
+
+    The text is NOT garbage as soon as one of its letter runs (2+ letters)
+    is a common short word, a known abbreviation, or has at least 4 letters
+    with a vowel share (a, e, i, o, u, ä, ö, ü) between 15% and 65%.
+    Texts without any letter run are garbage unless all their letters
+    together spell a known abbreviation (e.g. "e.g.").
+
+    Catches "(ci]oeu", "uanoaain." etc.
+
+    NOTE(review): four-letter words with three vowels (vowel share 75%) are
+    outside the accepted band and count as garbage when they are the only
+    word in the cell.
+    """
+    candidates = _RE_REAL_WORD.findall(text)
+    if not candidates:
+        # No letter run at all: only dotted abbreviations survive.
+        letters = ''.join(_RE_ALPHA.findall(text)).lower()
+        return letters not in _KNOWN_ABBREVIATIONS
+
+    def _looks_real(word: str) -> bool:
+        lowered = word.lower()
+        if lowered in _COMMON_SHORT_WORDS or lowered in _KNOWN_ABBREVIATIONS:
+            return True
+        if len(lowered) < 4:
+            return False
+        # Garbage such as "uanoaain" has too many or too few vowels.
+        vowel_share = sum(1 for ch in lowered if ch in 'aeiouäöü') / len(lowered)
+        return 0.15 <= vowel_share <= 0.65
+
+    return not any(_looks_real(word) for word in candidates)
+
+
+def _clean_cell_text(text: str) -> str:
+    """Strip OCR noise from a cell text using three generic filters.
+
+    1. No letter run of 2+ letters and not a known abbreviation -> empty.
+    2. The whole text is garbage (see ``_is_garbage_text``) -> empty.
+    3. Noise tokens are removed from the end of the text, one at a time,
+       until a token that is worth keeping is reached.
+
+    Returns the remaining tokens joined by single spaces, or '' when
+    nothing is left.
+    """
+    candidate = text.strip()
+    if not candidate:
+        return ''
+
+    # --- Filter 1: no real word at all ---
+    if _RE_REAL_WORD.search(candidate) is None:
+        # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
+        letters = ''.join(_RE_ALPHA.findall(candidate)).lower()
+        if letters not in _KNOWN_ABBREVIATIONS:
+            return ''
+
+    # --- Filter 2: entire text is garbage ---
+    if _is_garbage_text(candidate):
+        return ''
+
+    # --- Filter 3: drop trailing noise tokens ---
+    tokens = candidate.split()
+    keep = len(tokens)
+    while keep and _is_noise_tail_token(tokens[keep - 1]):
+        keep -= 1
+
+    return ' '.join(tokens[:keep])
+
+
+def _clean_cell_text_lite(text: str) -> str:
+    """Reduced noise filter for cell-first OCR (isolated cell crops).
+
+    Each cell is OCR'd on its own, so no neighbouring content can bleed in
+    and trailing-noise stripping is skipped.  Two filters remain:
+
+    1. No letter run of 2+ letters and not a known abbreviation -> empty.
+    2. The whole text is garbage (see ``_is_garbage_text``) -> empty.
+
+    Returns the whitespace-stripped text, or '' when a filter rejects it.
+    """
+    candidate = text.strip()
+    if not candidate:
+        return ''
+
+    # --- Filter 1: no real word at all ---
+    if (
+        not _RE_REAL_WORD.search(candidate)
+        and ''.join(_RE_ALPHA.findall(candidate)).lower() not in _KNOWN_ABBREVIATIONS
+    ):
+        return ''
+
+    # --- Filter 2: entire text is garbage ---
+    return '' if _is_garbage_text(candidate) else candidate
+
+
+# ---------------------------------------------------------------------------
+# Bold detection via stroke-width analysis (relative / page-level)
+# ---------------------------------------------------------------------------
+
+def _measure_stroke_width(gray_crop: np.ndarray) -> float:
+    """Estimate the mean stroke width of the text in a grayscale cell crop.
+
+    The crop is binarised with an inverted Otsu threshold (dark text becomes
+    the foreground), a distance transform is computed on the foreground, and
+    the foreground is eroded repeatedly until it is almost gone.  The result
+    is the mean distance value over the pixels that survive the erosion,
+    expressed as a percentage of the crop height so that it is comparable
+    across resolutions.
+
+    Args:
+        gray_crop: Cell crop; assumed to be a single-channel 8-bit image
+            (required by the Otsu threshold) — TODO confirm at call sites.
+
+    Returns:
+        Stroke measure as % of crop height, or 0.0 if measurement is not
+        possible (OpenCV unavailable, empty crop, crop smaller than 10 px
+        in either dimension, or fewer than 20 foreground pixels).
+    """
+    # The module tolerates a missing OpenCV install (cv2 is None then);
+    # without it there is nothing to measure.
+    if cv2 is None:
+        return 0.0
+    if gray_crop is None or gray_crop.size == 0:
+        return 0.0
+    h, w = gray_crop.shape[:2]
+    if h < 10 or w < 10:
+        return 0.0
+
+    # Binarise: text = white (255), background = black (0)
+    _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+    if cv2.countNonZero(bw) < 20:
+        return 0.0
+
+    # Distance transform: value at each white pixel = distance to nearest black
+    dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
+
+    # Approximate the stroke cores by repeated erosion with a 3x3 cross,
+    # stopping just before fewer than 5 foreground pixels would remain.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    thin = bw.copy()
+    for _ in range(max(1, min(h, w) // 6)):
+        eroded = cv2.erode(thin, kernel)
+        if cv2.countNonZero(eroded) < 5:
+            break
+        thin = eroded
+
+    skeleton_pts = thin > 0
+    if not np.any(skeleton_pts):
+        return 0.0
+    mean_stroke = float(np.mean(dist[skeleton_pts]))
+    return mean_stroke / max(h, 1) * 100  # normalised: % of cell height
+
+
+def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
+                         img_w: int, img_h: int) -> None:
+    """Mark bold cells by comparing each cell's stroke width to the median.
+
+    Pass 1 measures the stroke width of every cell that has text, using the
+    cell's ``bbox_px`` clipped to the image size.  Pass 2 sets
+    ``cell['is_bold']`` to True for cells whose stroke width is more than
+    1.4x the median of all positive measurements, and to False otherwise.
+
+    Cells are modified in place.  Nothing is written (no ``is_bold`` key is
+    set) when ``ocr_img`` is None, when fewer than 3 cells could be
+    measured, or when the median is not positive.
+    """
+    if ocr_img is None:
+        return
+
+    def _stroke_of(cell: Dict[str, Any]) -> float:
+        # Cells without text are never measured.
+        if not cell.get('text', '').strip():
+            return 0.0
+        box = cell['bbox_px']
+        top = max(0, box['y'])
+        bottom = min(img_h, box['y'] + box['h'])
+        left = max(0, box['x'])
+        right = min(img_w, box['x'] + box['w'])
+        if bottom > top and right > left:
+            return _measure_stroke_width(ocr_img[top:bottom, left:right])
+        return 0.0
+
+    # Pass 1: one stroke value per cell, 0.0 meaning "not measurable".
+    cell_strokes: List[float] = [_stroke_of(cell) for cell in cells]
+    measured = [value for value in cell_strokes if value > 0]
+
+    if len(measured) < 3:
+        # Too few cells to compare — leave all as non-bold
+        return
+
+    median_sw = float(np.median(measured))
+    if median_sw <= 0:
+        return
+
+    # Pass 2: cells significantly above median -> bold
+    for cell, stroke in zip(cells, cell_strokes):
+        cell['is_bold'] = stroke > 0 and (stroke / median_sw) > 1.4
@@ -0,0 +1,189 @@
+"""Cell-level IPA phonetic fixes for overlay mode.
+
+In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
+(entry['english']).  But the overlay reads cell['text'] directly, so
+phonetic fixes must be applied to cells too.
+
+Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+from cv_vocab_types import IPA_AVAILABLE
+
+from cv_ocr_ipa_lookup import (
+    _insert_missing_ipa,
+    _replace_phonetics_in_text,
+    _text_has_garbled_ipa,
+)
+from cv_ocr_ipa_repair import (
+    _has_non_dict_trailing,
+    _insert_headword_ipa,
+    _strip_post_bracket_garbled,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def fix_cell_phonetics(
+    cells: List[Dict[str, Any]],
+    pronunciation: str = 'british',
+) -> List[Dict[str, Any]]:
+    """Apply IPA phonetic fixes to cell texts for overlay mode.
+
+    The overlay reads ``cell['text']`` directly, so the phonetic fixes that
+    the normal pipeline applies to vocab entries have to be applied to the
+    cells as well.  Cells are updated in place and the same list is
+    returned.  Nothing happens when IPA support is unavailable.
+
+    Processing depends on the column type; other column types are skipped:
+    - column_en: replace garbled IPA and strip orphan brackets.  If that
+      changed nothing and the text shows garbled IPA or non-dictionary
+      trailing words, insert the missing IPA.  Finally, garbled fragments
+      after a closing bracket are stripped.
+    - column_text: replace garbled IPA without orphan stripping (brackets
+      may be German content like "(probieren)").  Headword IPA is inserted
+      only when the replacement changed nothing and the word boxes show a
+      gap where a bracket was apparently missed; the word boxes are then
+      synced so the overlay keeps one box per token.
+    """
+    if not IPA_AVAILABLE:
+        return cells
+
+    handled_types = {'column_en', 'column_text'}
+    fixed = 0
+
+    for cell in cells:
+        kind = cell.get('col_type', '')
+        if kind not in handled_types:
+            continue
+        original = cell.get('text', '') or ''
+        if not original.strip():
+            continue
+
+        if kind == 'column_en':
+            # Full processing: replace garbled IPA, strip orphan brackets.
+            updated = _replace_phonetics_in_text(original, pronunciation, strip_orphans=True)
+            # Insert IPA when garbled phonetics exist OR when trailing
+            # non-dictionary words suggest garbled IPA in plain ASCII.
+            if updated == original and (
+                _text_has_garbled_ipa(original)
+                or _has_non_dict_trailing(original, pronunciation)
+            ):
+                updated = _insert_missing_ipa(original, pronunciation)
+            # Strip trailing garbled fragments after proper [IPA] brackets
+            # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
+            if ']' in updated:
+                updated = _strip_post_bracket_garbled(updated, pronunciation)
+        else:
+            # column_text: replace garbled IPA, no orphan stripping
+            updated = _replace_phonetics_in_text(original, pronunciation, strip_orphans=False)
+            # Insert headword IPA ONLY with gap evidence in word_boxes;
+            # without it, the original page had no IPA.
+            if updated == original and _has_ipa_gap(original, cell.get('word_boxes', [])):
+                with_ipa = _insert_headword_ipa(original, pronunciation)
+                if with_ipa != original:
+                    updated = with_ipa
+                    _sync_word_boxes_after_ipa_insert(cell, original, updated)
+
+        if updated == original:
+            continue
+
+        logger.debug(f"fix_cell_phonetics: '{original}' → '{updated}'")
+        cell['text'] = updated
+        fixed += 1
+
+    if fixed:
+        logger.info(f"fix_cell_phonetics: {fixed} IPA fixes in {len(cells)} cells")
+    return cells
+
+
+def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
+    """Tell whether the word boxes leave room for a missed IPA bracket.
+
+    On a typical vocab page the layout is:
+        headword [ipa]              German translation
+
+    If OCR missed the bracket, the horizontal gap between the headword and
+    the following box is unusually large because the IPA occupied space on
+    the page; without IPA the words sit close together.
+
+    The headword is the first box whose text contains at least 2 letters
+    (a-z, A-Z, German umlauts, ß), which skips numeric prefixes like
+    "».55" or "0.56"; if no box qualifies, the first box is used.
+
+    Returns:
+        True if the gap between the headword box and the next box exceeds
+        80 px.  False if there are fewer than 2 boxes, the text has no
+        tokens, or the headword is the last box.
+    """
+    if not word_boxes or len(word_boxes) < 2:
+        return False
+    if not text.split():
+        return False
+
+    headword_idx = next(
+        (
+            idx
+            for idx, box in enumerate(word_boxes)
+            if len(re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', box.get('text', ''))) >= 2
+        ),
+        0,
+    )
+    if headword_idx >= len(word_boxes) - 1:
+        return False
+
+    # Horizontal distance from the headword's right edge to the next box.
+    headword = word_boxes[headword_idx]
+    follower = word_boxes[headword_idx + 1]
+    return follower['left'] - (headword['left'] + headword['width']) > 80
+
+
+def _sync_word_boxes_after_ipa_insert(
+    cell: Dict[str, Any],
+    old_text: str,
+    new_text: str,
+) -> None:
+    """Add a synthetic word_box for an IPA token that was inserted into the text.
+
+    E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
+    The new box is placed directly after the box of the preceding token so
+    the 1:1 token-to-box mapping used by the frontend overlay stays intact.
+    It starts 2 px right of that box and copies its top, width, height and
+    confidence (90 if the box has no ``conf``).
+
+    ``cell['word_boxes']`` is modified in place.  Nothing is changed when
+    the cell has no word boxes, when the new text does not have exactly one
+    token more than the old text, when the new token is the first token, or
+    when there is no box for the preceding token.
+    """
+    boxes = cell.get('word_boxes')
+    if not boxes:
+        return
+
+    before = old_text.split()
+    after = new_text.split()
+    if len(after) != len(before) + 1:
+        return  # unexpected change, skip
+
+    # The inserted token sits at the first position where the two token
+    # lists disagree; if all old tokens match, it was appended at the end.
+    inserted_at = next(
+        (
+            pos
+            for pos, (new_tok, old_tok) in enumerate(zip(after, before))
+            if new_tok != old_tok
+        ),
+        len(before),
+    )
+
+    # The token right before the insertion provides the reference geometry.
+    anchor_idx = inserted_at - 1
+    if not 0 <= anchor_idx < len(boxes):
+        return
+
+    anchor = boxes[anchor_idx]
+    boxes.insert(inserted_at, {
+        'text': after[inserted_at],
+        'left': anchor['left'] + anchor['width'] + 2,
+        'top': anchor['top'],
+        'width': anchor['width'],
+        'height': anchor['height'],
+        'conf': anchor.get('conf', 90),
+    })
@@ -0,0 +1,381 @@
+"""
+OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
+
+This module contains the OCR engine wrappers and re-exports all functions
+from the split sub-modules for backward compatibility.
+
+Sub-modules:
+- cv_ocr_word_assembly: Word grouping and text assembly
+- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
+- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
+- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
+- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
+- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import io
+import logging
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    IPA_AVAILABLE,
+    PageRegion,
+    RowGeometry,
+    _britfone_dict,
+    _ipa_convert_american,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    from PIL import Image
+except ImportError:
+    Image = None  # type: ignore[assignment,misc]
+
+
+# ── Re-exports from sub-modules (backward compatibility) ──────────────────
+
+from cv_ocr_word_assembly import (  # noqa: F401
+    _group_words_into_lines,
+    _words_to_reading_order_lines,
+    _rejoin_hyphenated,
+    _words_to_reading_order_text,
+    _words_to_spaced_text,
+)
+
+from cv_ocr_vocab_postprocess import (  # noqa: F401
+    _CHAR_CONFUSION_RULES,
+    _DE_INDICATORS_FOR_EN_I,
+    _fix_character_confusion,
+    _is_singular_plural_pair,
+    _split_comma_entries,
+    _split_by_comma,
+    _find_best_vocab_match,
+    _attach_example_sentences,
+)
+
+from cv_ocr_ipa_lookup import (  # noqa: F401
+    _PHONETIC_BRACKET_RE,
+    _IPA_CHARS,
+    _MIN_WORD_CONF,
+    _GRAMMAR_BRACKET_WORDS,
+    _lookup_ipa,
+    _fix_phonetic_brackets,
+    _is_grammar_bracket_content,
+    _replace_phonetics_in_text,
+    _text_has_garbled_ipa,
+    _decompose_compound,
+    _insert_missing_ipa,
+)
+
+from cv_ocr_ipa_repair import (  # noqa: F401
+    _has_non_dict_trailing,
+    _strip_post_bracket_garbled,
+    fix_ipa_continuation_cell,
+    _insert_headword_ipa,
+)
+
+from cv_ocr_cell_phonetics import (  # noqa: F401
+    fix_cell_phonetics,
+    _has_ipa_gap,
+    _sync_word_boxes_after_ipa_insert,
+)
+
+from cv_ocr_cell_filter import (  # noqa: F401
+    _RE_REAL_WORD,
+    _RE_ALPHA,
+    _COMMON_SHORT_WORDS,
+    _KNOWN_ABBREVIATIONS,
+    _assign_row_words_to_columns,
+    _is_noise_tail_token,
+    _is_garbage_text,
+    _clean_cell_text,
+    _clean_cell_text_lite,
+    _measure_stroke_width,
+    _classify_bold_cells,
+)
+
+
+# ── OCR Engine Wrappers ───────────────────────────────────────────────────
+
+# Lazily created RapidOCR engine instance; filled in by _get_rapid_engine().
+_rapid_engine = None
+# True only if the optional rapidocr package could be imported below.
+RAPIDOCR_AVAILABLE = False
+
+# RapidOCR is an optional dependency: a failed import is logged and leaves
+# RAPIDOCR_AVAILABLE at False (and the _RapidOCRClass / _LangRec /
+# _OCRVersion / _ModelType names undefined).
+try:
+    from rapidocr import RapidOCR as _RapidOCRClass
+    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
+    RAPIDOCR_AVAILABLE = True
+    logger.info("RapidOCR available — can be used as alternative to Tesseract")
+except ImportError:
+    logger.info("RapidOCR not installed — using Tesseract only")
+
+
+def _get_rapid_engine():
+    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.
+
+    The engine is created on the first call and cached in the module-level
+    ``_rapid_engine``; later calls return the cached instance.
+
+    Returns:
+        The shared RapidOCR engine instance.
+
+    Raises:
+        RuntimeError: If no engine is cached and the optional ``rapidocr``
+            package could not be imported (``RAPIDOCR_AVAILABLE`` is False).
+    """
+    global _rapid_engine
+    if _rapid_engine is None:
+        # Without this check a failed optional import surfaces as an opaque
+        # NameError on _RapidOCRClass; fail with a clear message instead.
+        if not RAPIDOCR_AVAILABLE:
+            raise RuntimeError(
+                "RapidOCR is not installed — cannot initialise the RapidOCR engine"
+            )
+        _rapid_engine = _RapidOCRClass(params={
+            "Rec.lang_type": _LangRec.LATIN,
+            "Rec.model_type": _ModelType.SERVER,
+            "Rec.ocr_version": _OCRVersion.PPOCRV5,
+            "Det.unclip_ratio": 1.3,
+            "Det.box_thresh": 0.4,
+            "Global.log_level": "critical",
+        })
+        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
+    return _rapid_engine
+
+
+def ocr_region_rapid(
+    img_bgr: np.ndarray,
+    region: PageRegion,
+) -> List[Dict[str, Any]]:
+    """Run RapidOCR on one page region and return Tesseract-style word dicts.
+
+    The region is cropped out of ``img_bgr`` and passed to the shared
+    RapidOCR engine.  Every detection with non-blank text becomes a dict
+    with ``text``, ``left``, ``top``, ``width``, ``height``, ``conf``
+    (score scaled to 0-100) and ``region_type``.  ``left`` / ``top`` are
+    shifted by the region origin, i.e. expressed in ``img_bgr`` coordinates.
+
+    Returns:
+        List of word dicts; empty if the crop is empty or the engine
+        returned no boxes or texts.
+    """
+    engine = _get_rapid_engine()
+
+    crop = img_bgr[region.y:region.y + region.height,
+                   region.x:region.x + region.width]
+    if crop.size == 0:
+        return []
+
+    result = engine(crop)
+    if result is None or result.boxes is None or result.txts is None:
+        return []
+
+    detections = zip(result.boxes, result.txts, result.scores)
+
+    words = []
+    for box, txt, score in detections:
+        if not txt or not txt.strip():
+            continue
+
+        # Axis-aligned bounding box around the detected polygon.
+        xs = [point[0] for point in box]
+        ys = [point[1] for point in box]
+        left = int(min(xs))
+        top = int(min(ys))
+
+        words.append({
+            'text': txt.strip(),
+            'left': left + region.x,
+            'top': top + region.y,
+            'width': int(max(xs) - left),
+            'height': int(max(ys) - top),
+            'conf': int(score * 100),
+            'region_type': region.type,
+        })
+
+    return words
+
+
+def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
+    """OCR a region with TrOCR and return one word dict per recognised line.
+
+    When TrOCR or its model is unavailable the call falls back to Tesseract
+    (``ocr_region`` on a grayscale copy of the image).  The result carries no
+    per-line geometry from the model: the region height is divided evenly
+    among the recognised lines, and every line gets the same averaged
+    heuristic confidence.  Returns ``[]`` when the crop is empty, nothing is
+    recognised, or any exception is raised during inference.
+    """
+    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available
+
+    if not _check_trocr_available():
+        logger.warning("TrOCR not available, falling back to Tesseract")
+        if region.height <= 0 or region.width <= 0 or img_bgr is None:
+            return []
+        gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+        return ocr_region(gray_page, region, lang="eng+deu", psm=6)
+
+    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
+    if crop.size == 0:
+        return []
+
+    try:
+        import torch
+        from PIL import Image as _PILImage
+
+        processor, model = get_trocr_model(handwritten=handwritten)
+        if processor is None or model is None:
+            logger.warning("TrOCR model not loaded, falling back to Tesseract")
+            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+            return ocr_region(gray_page, region, lang="eng+deu", psm=6)
+
+        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
+        # If no line split is produced, treat the whole crop as a single line.
+        line_images = _split_into_lines(pil_crop) or [pil_crop]
+
+        device = next(model.parameters()).device
+        recognised = []
+        line_scores = []
+        for line_img in line_images:
+            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
+            with torch.no_grad():
+                generated_ids = model.generate(pixel_values, max_length=128)
+            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+            if not decoded:
+                continue
+            recognised.append(decoded)
+            # Heuristic score: very short outputs are trusted less.
+            line_scores.append(0.85 if len(decoded) > 3 else 0.5)
+
+        if not recognised:
+            return []
+
+        avg_conf = int(sum(line_scores) / len(line_scores) * 100)
+        line_h = region.height // max(len(recognised), 1)
+        return [
+            {
+                "text": line_text,
+                "left": region.x,
+                "top": region.y + idx * line_h,
+                "width": region.width,
+                "height": line_h,
+                "conf": avg_conf,
+                "region_type": region.type,
+            }
+            for idx, line_text in enumerate(recognised)
+        ]
+
+    except Exception as e:
+        logger.error(f"ocr_region_trocr failed: {e}")
+        return []
+
+
+def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
+    """Run LightOnOCR-2-1B on a region and return one word dict per text line.
+
+    When LightOnOCR or its model is unavailable the call falls back to
+    RapidOCR (if ``RAPIDOCR_AVAILABLE``) and otherwise to Tesseract via
+    ``ocr_region`` on a grayscale copy of the image.
+
+    The decoded model output is split on newlines; the region height is
+    divided evenly among the non-empty lines and every line is reported with
+    a fixed confidence of 85.  Returns ``[]`` when the crop is empty, the
+    model produces no text, or any exception is raised during inference.
+    """
+    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available
+
+    if not _check_lighton_available():
+        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
+        if RAPIDOCR_AVAILABLE and img_bgr is not None:
+            return ocr_region_rapid(img_bgr, region)
+        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
+        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []
+
+    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
+    if crop.size == 0:
+        return []
+
+    try:
+        import torch
+        from PIL import Image as _PILImage
+
+        processor, model = get_lighton_model()
+        if processor is None or model is None:
+            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
+            if RAPIDOCR_AVAILABLE and img_bgr is not None:
+                return ocr_region_rapid(img_bgr, region)
+            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
+
+        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
+        # Image-only chat turn: the conversation carries no text prompt.
+        conversation = [{"role": "user", "content": [{"type": "image"}]}]
+        inputs = processor.apply_chat_template(
+            conversation, images=[pil_crop],
+            add_generation_prompt=True, return_tensors="pt"
+        ).to(model.device)
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=1024)
+
+        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
+        if not text:
+            return []
+
+        lines = [raw_line.strip() for raw_line in text.split("\n") if raw_line.strip()]
+        line_h = region.height // max(len(lines), 1)
+        words = []
+        for i, line in enumerate(lines):
+            words.append({
+                "text": line,
+                "left": region.x,
+                "top": region.y + i * line_h,
+                "width": region.width,
+                "height": line_h,
+                "conf": 85,
+                "region_type": region.type,
+            })
+        return words
+
+    except Exception as e:
+        logger.error(f"ocr_region_lighton failed: {e}")
+        return []
+
+
+async def ocr_region_paddle(
+    img_bgr: np.ndarray,
+    region: Optional["PageRegion"] = None,
+) -> List[Dict[str, Any]]:
+    """OCR an image (or one region of it) locally first, remotely as fallback.
+
+    Local RapidOCR is tried unless the environment variable
+    ``FORCE_REMOTE_PADDLE`` is ``"1"``.  If it raises or yields no words, the
+    crop is JPEG-encoded (downscaled so its longest side is at most 1500 px)
+    and sent to the remote PaddleOCR service; returned word boxes are scaled
+    back and shifted by the region origin.
+
+    Returns a list of word dicts, or ``[]`` when the crop is empty or JPEG
+    encoding fails.
+    """
+    use_local = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() != "1"
+
+    if use_local:
+        try:
+            local_region = region
+            if local_region is None:
+                h, w = img_bgr.shape[:2]
+                local_region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
+
+            words = ocr_region_rapid(img_bgr, local_region)
+            if words:
+                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
+                return words
+            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
+        except Exception as e:
+            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)
+
+    from services.paddleocr_remote import ocr_remote_paddle
+
+    if region is None:
+        crop = img_bgr
+        offset_x, offset_y = 0, 0
+    else:
+        crop = img_bgr[
+            region.y : region.y + region.height,
+            region.x : region.x + region.width,
+        ]
+        offset_x, offset_y = region.x, region.y
+
+    if crop.size == 0:
+        return []
+
+    h, w = crop.shape[:2]
+    longest_side = max(h, w)
+    scale = 1.0
+    _MAX_DIM = 1500
+    if longest_side > _MAX_DIM:
+        scale = _MAX_DIM / longest_side
+        new_w, new_h = int(w * scale), int(h * scale)
+        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
+        logger.info("ocr_region_paddle: downscaled %dx%d → %dx%d (scale=%.2f)",
+                     w, h, new_w, new_h, scale)
+
+    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
+    if not success:
+        logger.error("ocr_region_paddle: cv2.imencode failed")
+        return []
+
+    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
+    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))
+
+    # Undo the downscale (a no-op factor of 1.0 when nothing was resized).
+    inv_scale = 1.0 / scale
+    for wd in words:
+        wd["left"] = int(wd["left"] * inv_scale) + offset_x
+        wd["top"] = int(wd["top"] * inv_scale) + offset_y
+        wd["width"] = int(wd["width"] * inv_scale)
+        wd["height"] = int(wd["height"] * inv_scale)
+        if region is not None:
+            wd["region_type"] = region.type
+
+    return words
@@ -0,0 +1,476 @@
+"""
+IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
+
+Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
+in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)).  This module
+provides functions to:
+
+- Look up correct IPA pronunciations (British/American) for English words.
+- Detect and replace garbled phonetic brackets with dictionary IPA.
+- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
+- Strip orphan brackets and post-bracket garbled fragments.
+- Handle IPA continuation cells (phonetics on a separate row from headword).
+
+All IPA data comes from open-source dictionaries:
+- Britfone (MIT) for British English
+- eng_to_ipa / CMU (MIT) for American English
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+from cv_vocab_types import (
+    IPA_AVAILABLE,
+    _britfone_dict,
+    _ipa_convert_american,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# --- D. Phonetic Bracket IPA Replacement ---
+
+# Pattern: word followed by any bracket type containing phonetic content.
+# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
+# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
+# This intentionally matches mixed brackets (e.g. {content]) because
+# Tesseract frequently misrecognizes bracket characters.
+# Group 1 is the word directly before the bracket; group 2 is the bracket
+# content (non-greedy, and it cannot itself contain a closing bracket).
+_PHONETIC_BRACKET_RE = re.compile(
+    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
+)
+
+# Unicode IPA characters — used to distinguish correct IPA (from dictionary
+# lookup) from garbled OCR content when stripping orphan brackets.
+# Plain ASCII letters are deliberately absent: only characters that do not
+# occur in ordinary Latin text are listed.
+_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
+
+# Minimum word confidence for full-page Tesseract results (0-100).
+# Words below this threshold are OCR noise (scanner shadows, borders).
+_MIN_WORD_CONF = 30
+
+
+def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
+    """Return the IPA transcription of ``word``, or None when unknown.
+
+    Two sources are consulted in order of preference and the first hit wins:
+    the Britfone dictionary and the American converter.  ``'american'`` tries
+    the converter first; any other value (including ``'british'``) tries
+    Britfone first.  A source that is not loaded is skipped.
+
+    Args:
+        word: English word to look up (lower-cased and stripped first).
+        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
+
+    Returns:
+        IPA string or None if not found.
+    """
+    key = word.lower().strip()
+    if not key:
+        return None
+
+    def _from_britfone() -> Optional[str]:
+        if not _britfone_dict:
+            return None
+        return _britfone_dict.get(key) or None
+
+    def _from_american() -> Optional[str]:
+        if not _ipa_convert_american:
+            return None
+        converted = _ipa_convert_american(key)
+        # A '*' in the converter output disqualifies the result.
+        if converted and '*' not in converted:
+            return converted
+        return None
+
+    if pronunciation == 'american':
+        sources = (_from_american, _from_britfone)
+    else:
+        sources = (_from_britfone, _from_american)
+
+    for source in sources:
+        found = source()
+        if found:
+            return found
+    return None
+
+
+def _fix_phonetic_brackets(
+    entries: List[Dict[str, Any]],
+    pronunciation: str = 'british',
+) -> List[Dict[str, Any]]:
+    """Swap OCR'd phonetic brackets in each entry's English text for dictionary IPA.
+
+    Example: "dance [du:ns]" becomes
+    - British: "dance [dˈɑːns]"  (Britfone, MIT)
+    - American: "dance [dæns]"    (eng_to_ipa/CMU, MIT)
+
+    Only the ``'english'`` key is touched, and only when its text contains an
+    opening bracket.  German and example fields hold meaningful parentheticals
+    ("Eis (gefrorenes Wasser)", "(sich beschweren)") and are never processed.
+    The entries are modified in place and the same list is returned.
+    """
+    if not IPA_AVAILABLE:
+        return entries
+
+    changed = 0
+    for entry in entries:
+        original = entry.get('english', '') or ''
+        has_bracket = '[' in original or '{' in original or '(' in original
+        if not has_bracket:
+            continue
+        updated = _replace_phonetics_in_text(original, pronunciation)
+        if updated != original:
+            logger.debug(f"_fix_phonetic_brackets: '{original}' → '{updated}'")
+            changed += 1
+        entry['english'] = updated
+
+    if changed:
+        logger.info(f"_fix_phonetic_brackets: {changed} IPA replacements in {len(entries)} entries")
+    return entries
+
+
+# Grammar particles that appear in brackets after English words:
+#   cross (with), complain (about/of), agree (on/with), look (sth) up
+# These must NOT be replaced with IPA.  Only used for the English field
+# (German/example fields are never processed for IPA replacement).
+# All entries are lower-case; callers lower-case their tokens before
+# testing membership.
+_GRAMMAR_BRACKET_WORDS = frozenset({
+    # English prepositions/particles commonly in vocab tables
+    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
+    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
+    # English grammar abbreviations used in vocab tables
+    'sth', 'sb', 'adj', 'adv',
+    # Number/plural/grammar annotations ('auch' is the German "also")
+    'pl', 'sg', 'sing', 'no', 'also', 'auch',
+    # Regional English markers
+    'ae', 'be', 'ame', 'bre',
+})
+
+
+def _is_grammar_bracket_content(content: str) -> bool:
+    """Tell whether bracket content consists solely of known grammar words.
+
+    The content is lower-cased and split on slashes and whitespace; it counts
+    as grammar info only when at least one token remains and every token is
+    in ``_GRAMMAR_BRACKET_WORDS``.
+
+    Grammar info:  cross (with), complain (about/of), agree (on/with)
+    NOT grammar:   [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
+    """
+    if not content:
+        return False
+
+    # Handles patterns such as (about/of) and (no pl).
+    tokens = [tok for tok in re.split(r'[/\s]+', content.strip().lower()) if tok]
+    return bool(tokens) and all(tok in _GRAMMAR_BRACKET_WORDS for tok in tokens)
+
+
+def _replace_phonetics_in_text(
+    text: str,
+    pronunciation: str = 'british',
+    strip_orphans: bool = True,
+) -> str:
+    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
+
+    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
+    Any bracket type (even mismatched pairs) following a word is matched; if
+    the word has a dictionary IPA entry, the bracket is rewritten as
+    ``word [ipa]``.  Brackets holding more than three words, or only grammar
+    particles such as ``(with)``, are left untouched.
+
+    Args:
+        text: Cell text to process.
+        pronunciation: Forwarded to ``_lookup_ipa``.
+        strip_orphans: If True, a second pass removes remaining brackets that
+            look like garbled IPA.  Set to False for column_text where
+            brackets may be German content.
+
+    Returns:
+        The processed text with surrounding whitespace stripped, or ``text``
+        unchanged when IPA support is unavailable.
+    """
+    if not IPA_AVAILABLE:
+        return text
+
+    # IPA strings written by the first pass.  The orphan pass below must not
+    # strip them: short dictionary IPA made only of plain letters (e.g. "si")
+    # contains no _IPA_CHARS character and would otherwise look garbled.
+    inserted_ipa = set()
+
+    def replacer(match):
+        word = match.group(1)
+        bracket_content = match.group(2).strip()
+        full_match = match.group(0)
+
+        # Skip if bracket content looks like regular text (multiple words)
+        if len(bracket_content.split()) > 3:
+            return full_match
+
+        # Look up IPA for the word before brackets
+        ipa = _lookup_ipa(word, pronunciation)
+
+        if ipa:
+            # Word has IPA → bracket content is phonetic (garbled or correct).
+            # Exception: grammar particles like cross (with) — keep those.
+            if _is_grammar_bracket_content(bracket_content):
+                return full_match
+            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
+            inserted_ipa.add(ipa.strip())
+            return f"{word} [{ipa}]"
+
+        # No IPA for this word — keep as-is
+        return full_match
+
+    text = _PHONETIC_BRACKET_RE.sub(replacer, text)
+
+    if strip_orphans:
+        # Second pass: strip remaining brackets that are garbled IPA.
+        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
+        # Keep: grammar particles "(about/of)", correct IPA "[dˈɑːns]",
+        # and real-word parentheticals "(sich beschweren)".
+        def _strip_orphan_bracket(m):
+            content = m.group(1).strip()
+            # Keep IPA that the first pass just inserted from the dictionary.
+            if content in inserted_ipa:
+                return m.group(0)
+            # Keep grammar particles: (about/of), (no pl)
+            if _is_grammar_bracket_content(content):
+                return m.group(0)
+            # Keep correct IPA (contains Unicode IPA characters)
+            if any(ch in _IPA_CHARS for ch in content):
+                return m.group(0)
+            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
+            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais);
+            # anything with at least 4 letters is treated as a real word.
+            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
+            if len(content_alpha) >= 4:
+                return m.group(0)
+            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
+            return ''
+
+        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
+
+    text = text.strip()
+
+    return text
+
+
+def _text_has_garbled_ipa(text: str) -> bool:
+    """Report whether ``text`` shows traces of an OCR-mangled phonetic transcription.
+
+    Evidence is any of: a square-bracketed text without IPA symbols, a token
+    starting with a stress mark (or an apostrophe standing in for one), a
+    colon used as a length mark inside a short alphabetic token, an IPA
+    special character, or an apostrophe embedded deep inside a token.
+
+    ``_insert_missing_ipa`` relies on this to make sure IPA is only inserted
+    to *replace* garbled phonetics already present on the page — never to
+    add phonetics where none existed.
+    """
+    ipa_symbols = 'əɪɛɒʊʌæɑɔʃʒθðŋ'
+    ipa_symbols_and_stress = 'ˈˌ' + ipa_symbols
+    delimiters = ('–', '—', '-', '/', '|', ',', ';')
+
+    trimmed = text.strip()
+
+    # A fully square-bracketed text with no IPA symbol inside is garbled
+    # phonetics, e.g. "[n, nn]" or "[1uedtX,1]".  Dictionary-style notes
+    # like "(no pl)" use parentheses and are not affected.
+    if trimmed.startswith('[') and trimmed.endswith(']'):
+        bracket_inner = trimmed[1:-1]
+        if not any(sym in bracket_inner for sym in ipa_symbols_and_stress):
+            return True
+
+    for token in trimmed.split():
+        # Delimiters and one-character tokens carry no evidence.
+        if len(token) <= 1 or token in delimiters:
+            continue
+
+        # Leading apostrophe: OCR read the IPA stress mark as "'".
+        if token.startswith("'") and not token[1:].istitle():
+            return True
+        # Leading real stress mark (ˈ or ˌ).
+        if token.startswith(("\u02c8", "\u02cc")):
+            return True
+
+        # ':' used as a length mark in a short fragment.  Tokens whose
+        # letters-only remainder is not alphabetic (e.g. the time "3:00")
+        # are skipped entirely.
+        if ':' in token and len(token) < 12:
+            letters_and_colons = re.sub(r'[^a-zA-Z:]', '', token)
+            if not letters_and_colons.replace(':', '').isalpha():
+                continue
+            return True
+
+        # Any IPA special character.
+        if any(sym in token for sym in ipa_symbols):
+            return True
+
+        # Apostrophe embedded in a merged token such as "Scotland'skotland".
+        # Requiring ≥3 chars before and ≥3 chars after (starting lowercase)
+        # keeps contractions like don't, won't, o'clock out.
+        if "'" in token and not token.startswith("'"):
+            apostrophe_at = token.index("'")
+            tail = token[apostrophe_at + 1:]
+            if apostrophe_at >= 3 and len(tail) >= 3 and tail[0].islower():
+                return True
+
+    return False
+
+
+def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
+    """Try to decompose a compound word and concatenate IPA for each part.
+
+    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
+    The word is split into exactly two parts of at least 3 characters each,
+    and IPA is only returned if BOTH parts are found in the dictionary.
+    When several splits qualify, the one with the longest first part wins.
+
+    Args:
+        word: Candidate compound (lower-cased and stripped first).
+        pronunciation: Forwarded to ``_lookup_ipa``.
+
+    Returns:
+        The concatenated IPA of both parts, or None when IPA support is
+        unavailable, the word has fewer than 6 characters, or no split works.
+    """
+    if not IPA_AVAILABLE:
+        return None
+    lower = word.lower().strip()
+    if len(lower) < 6:
+        return None  # too short for a compound
+
+    # The longest first part wins, so scan split points from the right and
+    # stop at the first hit instead of probing every position.
+    for split_pos in range(len(lower) - 3, 2, -1):  # min 3 chars each part
+        ipa_first = _lookup_ipa(lower[:split_pos], pronunciation)
+        if not ipa_first:
+            continue  # no need to look up the second part
+        ipa_second = _lookup_ipa(lower[split_pos:], pronunciation)
+        if ipa_second:
+            return ipa_first + ipa_second
+
+    return None
+
+
+def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA pronunciation for English words that have no brackets at all.
+
+    OCR sometimes garbles the phonetic transcription into plain-text fragments
+    (e.g. "scare skea" where "skea" is garbled /skɛə/).  This scans the text
+    for the headword, inserts correct [IPA], and strips the garbled fragments.
+
+    Only inserts for words that:
+    - are standalone (not already followed by a bracket)
+    - have an IPA entry in the dictionary
+    - appear to be English headwords (at the start of text or after common
+      separators like ",", ";", "•")
+
+    This is intentionally conservative: it only inserts at the END of each
+    whitespace-separated token group to avoid breaking phrases.
+
+    Args:
+        text: Cell text to process.
+        pronunciation: Forwarded to ``_lookup_ipa`` / ``_decompose_compound``.
+
+    Returns:
+        ``text`` unchanged when IPA support is unavailable, the text is
+        blank, it already contains an opening bracket, or it has more than
+        6 words.  Otherwise the words re-joined with single spaces, with IPA
+        inserted after the first word that could be resolved.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    # Skip if already has brackets (IPA replacement handles those)
+    if any(ch in text for ch in '[{('):
+        return text
+
+    # Only process short text fragments (typical vocab cells).
+    # Long sentences / paragraphs should not get IPA insertions.
+    words = text.strip().split()
+    if len(words) > 6:
+        return text
+
+    # Try to insert IPA for the first alphanumeric word
+    # Typical patterns: "challenge", "profit", "film", "badge"
+    # The loop stops at the first word for which IPA is inserted.
+    for i, w in enumerate(words):
+        # Clean punctuation for lookup
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if not clean or len(clean) < 2:
+            continue
+        # Skip German/grammar words
+        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        ipa = _lookup_ipa(clean, pronunciation)
+        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
+        if not ipa and '-' in clean:
+            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
+        # Fallback 0b: compound word decomposition
+        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
+        if not ipa:
+            ipa = _decompose_compound(clean, pronunciation)
+        # Fallback 1: IPA-marker split for merged tokens where OCR
+        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
+        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
+        # backwards ≤3 chars for the onset consonant cluster, and
+        # split into headword + OCR IPA.
+        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
+        if not ipa:
+            # Index of the first marker in the raw token, or -1 if none.
+            first_marker = next(
+                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
+            )
+            if first_marker >= 3:
+                split = first_marker
+                # Walk back over at most 3 lowercase letters before the marker.
+                while (split > 0
+                       and split > first_marker - 3
+                       and w[split - 1].isalpha()
+                       and w[split - 1].islower()):
+                    split -= 1
+                if split >= 2:
+                    headword = w[:split]
+                    ocr_ipa = w[split:]
+                    hw_ipa = _lookup_ipa(headword, pronunciation)
+                    if not hw_ipa:
+                        # Try compound decomposition for the headword part
+                        hw_ipa = _decompose_compound(headword, pronunciation)
+                    if hw_ipa:
+                        words[i] = f"{headword} [{hw_ipa}]"
+                    else:
+                        # Word not in dictionary — use OCR IPA
+                        words[i] = f"{headword} [{ocr_ipa}]"
+                    # Everything after the merged token is dropped.
+                    words = words[:i + 1]
+                    ipa = True  # signal that we handled it
+                    break
+        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
+        # markers (e.g. "Scotland'skotland").  Find longest dictionary
+        # prefix using only alpha chars to avoid punctuation matches.
+        if not ipa:
+            alpha = re.sub(r'[^a-zA-Z]', '', clean)
+            if len(alpha) > 5:  # need at least 6 chars for meaningful split
+                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
+                    prefix = alpha[:end]
+                    test_ipa = _lookup_ipa(prefix, pronunciation)
+                    if test_ipa:
+                        ipa = test_ipa
+                        # Rebind w so the insertion below uses the clean prefix.
+                        w = prefix
+                        words[i] = prefix
+                        break
+        if ipa:
+            # NOTE(review): unless Fallback 2 rebound it, w is the raw token,
+            # so punctuation attached to it stays in front of the bracket.
+            words[i] = f"{w} [{ipa}]"
+            # Strip garbled OCR phonetics after the IPA bracket.
+            # On scanned vocab pages, printed IPA is read as garbled
+            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
+            # After inserting correct IPA, remove remaining words that
+            # aren't real English words, delimiters, or German text.
+            kept = words[:i + 1]
+            for j in range(i + 1, len(words)):
+                wj = words[j]
+                # Delimiter — keep this and everything after
+                if wj in ('–', '—', '-', '/', '|', ',', ';'):
+                    kept.extend(words[j:])
+                    break
+                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
+                if re.match(r'^[\d.)\-]+$', wj):
+                    kept.extend(words[j:])
+                    break
+                # Starts with uppercase — likely German or proper noun
+                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
+                if clean_j and clean_j[0].isupper():
+                    kept.extend(words[j:])
+                    break
+                # Known English word (≥2 chars) — keep it and rest
+                if clean_j and len(clean_j) >= 2:
+                    if _lookup_ipa(clean_j, pronunciation):
+                        kept.extend(words[j:])
+                        break
+                # Merged token: dictionary word + garbled IPA stuck together.
+                # E.g. "fictionsalans'fIkfn" starts with "fiction".
+                # Extract the dictionary prefix (≥4 chars) and add it with
+                # IPA, but only if enough chars remain after the prefix (≥3)
+                # to look like garbled IPA, not just a plural 's'.
+                if clean_j and len(clean_j) >= 7:
+                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
+                        prefix_j = clean_j[:pend]
+                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
+                        if prefix_ipa:
+                            kept.append(f"{prefix_j} [{prefix_ipa}]")
+                            break
+                    # This break also discards every word after position j.
+                    break  # rest of this token is garbled
+                # Otherwise — likely garbled phonetics, skip
+            words = kept
+            break
+
+    return ' '.join(words)
+
+
@@ -0,0 +1,287 @@
+"""
+Advanced IPA repair for OCR-extracted vocabulary.
+
+Functions that detect and fix garbled IPA fragments trailing after
+headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
+to stay within the 500 LOC budget.
+
+Contains:
+- _has_non_dict_trailing: detect non-dictionary trailing words
+- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
+- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
+- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+from cv_vocab_types import IPA_AVAILABLE
+from cv_ocr_ipa_lookup import (
+    _lookup_ipa,
+    _GRAMMAR_BRACKET_WORDS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
+    """Report whether a headword is followed only by non-dictionary tokens.
+
+    Acts as an extra trigger for ``_insert_missing_ipa`` in the case where
+    ``_text_has_garbled_ipa`` answers False because the garbled
+    transcription is plain ASCII (e.g. "skea" for /skɛə/).
+
+    Args:
+        text: Cell text to inspect; only texts of 2 to 6 whitespace-separated
+            tokens are considered.
+        pronunciation: Variant passed through to ``_lookup_ipa``.
+
+    Returns:
+        True when a dictionary headword is found and none of the tokens
+        after it is a delimiter, a number, a capitalised word or a
+        dictionary word; False otherwise (including when IPA support is
+        unavailable).
+    """
+    if not IPA_AVAILABLE:
+        return False
+
+    tokens = text.strip().split()
+    if not 2 <= len(tokens) <= 6:
+        return False
+
+    # Locate the headword: the first token with a dictionary entry,
+    # ignoring very short tokens and grammar annotations.
+    headword_pos = None
+    for pos, token in enumerate(tokens):
+        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
+        if len(letters) < 2 or letters.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        if _lookup_ipa(letters, pronunciation):
+            headword_pos = pos
+            break
+
+    # No headword, or nothing after it to judge.
+    if headword_pos is None or headword_pos >= len(tokens) - 1:
+        return False
+
+    # A single "legitimate" trailing token is enough to rule out garbling.
+    for trailing in tokens[headword_pos + 1:]:
+        if trailing in ('–', '—', '-', '/', '|', ',', ';'):
+            return False
+        # Numbering such as "1", "2." or "3)".
+        if re.match(r'^[\d.)\-]+$', trailing):
+            return False
+        alpha = re.sub(r'[^a-zA-Z]', '', trailing)
+        if not alpha:
+            continue
+        if alpha[0].isupper():
+            return False
+        if len(alpha) >= 2 and _lookup_ipa(alpha, pronunciation):
+            return False
+    return True
+
+
+def _strip_post_bracket_garbled(
+    text: str, pronunciation: str = 'british',
+) -> str:
+    """Remove garbled IPA fragments that follow the last ``[IPA]`` bracket.
+
+    Examples::
+
+        sea [sˈiː] si:                  ->  sea [sˈiː]
+        seat [sˈiːt] si:t               ->  seat [sˈiːt]
+        seat [sˈiːt] belt si:t belt     ->  seat [sˈiːt] belt
+
+    The tokens after the last ``]`` are scanned left to right:
+
+    * a delimiter or a capitalised token keeps itself and everything after;
+    * a token containing an IPA marker character is dropped;
+    * a dictionary word is kept; if any later token carries an IPA marker,
+      the rest is treated as a garbled duplicate and only resumes at the
+      next delimiter or capitalised token, otherwise the whole rest is kept;
+    * any other token is dropped.
+
+    Args:
+        text: Cell text, possibly containing ``[...]`` brackets.
+        pronunciation: Variant passed through to ``_lookup_ipa``.
+
+    Returns:
+        ``text`` unchanged when there is no ``]`` or nothing follows it,
+        otherwise the part up to the bracket plus the surviving tokens.
+    """
+    closing = text.rfind(']')
+    # No bracket at all, or the bracket is the final character.
+    if closing < 0 or closing >= len(text) - 1:
+        return text
+
+    head = text[:closing + 1].rstrip()
+    trailing = text[closing + 1:].strip()
+    if not trailing:
+        return text
+
+    delimiters = ('–', '—', '-', '/', '|', ',', ';')
+    ipa_markers = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
+
+    def _looks_garbled(token: str) -> bool:
+        return not ipa_markers.isdisjoint(token)
+
+    def _is_capitalised(token: str) -> bool:
+        letters_only = re.sub(r'[^a-zA-Z]', '', token)
+        return bool(letters_only) and letters_only[0].isupper()
+
+    tokens = trailing.split()
+    survivors: List[str] = []
+    for pos, token in enumerate(tokens):
+        if token in delimiters:
+            survivors += tokens[pos:]
+            break
+        if _looks_garbled(token):
+            continue
+        letters = re.sub(r'[^a-zA-Z]', '', token)
+        if letters and letters[0].isupper():
+            survivors += tokens[pos:]
+            break
+        if len(letters) < 2 or not _lookup_ipa(letters, pronunciation):
+            # Neither marker-bearing nor a dictionary word: drop it.
+            continue
+
+        # Dictionary word (second part of a multi-word headword, say).
+        rest = tokens[pos + 1:]
+        if not any(_looks_garbled(later) for later in rest):
+            survivors += tokens[pos:]
+            break
+        survivors.append(token)
+        # Garbled duplicate follows; resume at the first delimiter or
+        # capitalised token, if there is one.
+        for offset, later in enumerate(rest):
+            if later in delimiters or _is_capitalised(later):
+                survivors += rest[offset:]
+                break
+        break
+
+    if survivors:
+        return head + ' ' + ' '.join(survivors)
+    return head
+
+
+def fix_ipa_continuation_cell(
+    garbled_text: str,
+    headword_text: str,
+    pronunciation: str = 'british',
+) -> str:
+    """Replace garbled IPA in a continuation row with proper IPA.
+
+    Continuation rows appear below the headword and contain only the
+    printed phonetic transcription, which OCR garbles into fragments
+    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).
+
+    When the headword row already carries inline IPA, only the words after
+    the last bracket are looked up; if nothing lookup-worthy follows the
+    last bracket, that bracket group itself is returned.
+
+    Args:
+        garbled_text: The OCR-garbled IPA text from the continuation row.
+        headword_text: The headword text from the previous row
+            (e.g. ``scarf – scarves``).
+        pronunciation: ``'british'`` or ``'american'``.
+
+    Returns:
+        Corrected IPA text, or ``garbled_text`` unchanged if no fix could
+        be applied (IPA support missing, empty inputs, no dictionary hit).
+    """
+    if not IPA_AVAILABLE or not garbled_text or not headword_text:
+        return garbled_text
+
+    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
+    # only generate continuation IPA for words NOT already covered.
+    covered_words: set = set()
+    has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
+    if has_inline_ipa:
+        # Words before the first bracket already have their IPA shown.
+        first_bracket = headword_text.index('[')
+        pre_bracket = headword_text[:first_bracket].strip()
+        for w in pre_bracket.split():
+            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
+            if clean and len(clean) >= 2:
+                covered_words.add(clean)
+
+        last_bracket_end = headword_text.rfind(']')
+        tail = headword_text[last_bracket_end + 1:].strip()
+
+        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
+            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
+            # — return the inline IPA directly (continuation duplicates it).
+            # Search for '[' only to the LEFT of the last ']': a stray,
+            # unmatched '[' in the tail would otherwise produce an inverted
+            # slice and return an empty string.
+            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
+            return headword_text[last_bracket_start:last_bracket_end + 1]
+
+        # Only the tail words need continuation IPA.
+        headword_text = tail
+
+    # Strip existing IPA brackets and parenthetical grammar annotations
+    # like "(no pl)", "(sth)", "(sb)" from headword text.
+    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
+    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
+    if not clean_hw:
+        return garbled_text
+
+    # Split headword by delimiters (– — -)
+    # "scarf – scarves" → ["scarf", "scarves"]
+    # "see - saw - seen" → ["see", "saw", "seen"]
+    parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
+    parts = [p.strip() for p in parts if p.strip()]
+
+    if not parts:
+        return garbled_text
+
+    # Look up IPA for each headword part.
+    # Skip articles (the, a, an) — they never get IPA in vocab books.
+    # Other function words like "down", "up" are kept because they are
+    # integral parts of phrasal verbs (e.g. "close down").
+    # Skip words that already have inline IPA in the headword row.
+    _ARTICLES = {'the', 'a', 'an'}
+    ipa_parts: List[str] = []
+    for part in parts:
+        # A part may be multi-word like "secondary school".
+        word_ipas: List[str] = []
+        for w in part.split():
+            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
+            if not clean_w or len(clean_w) < 2:
+                continue
+            if covered_words and clean_w.lower() in covered_words:
+                continue  # Already has IPA inline in the headword
+            if clean_w.lower() in _ARTICLES:
+                continue  # Articles never get IPA in vocab books
+            ipa = _lookup_ipa(clean_w, pronunciation)
+            if ipa:
+                word_ipas.append(ipa)
+        if word_ipas:
+            # One bracket group per headword part.
+            ipa_parts.append('[' + ' '.join(word_ipas) + ']')
+
+    if not ipa_parts:
+        return garbled_text
+
+    # Join the bracket groups with the en-dash delimiter.
+    result = ' – '.join(ipa_parts)
+    logger.debug(
+        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
+        garbled_text, result, headword_text,
+    )
+    return result
+
+
+def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA for the first English headword in a long mixed-language line.
+
+    Unlike _insert_missing_ipa (for short column_en cells), this handles
+    column_text lines of any length.  It only inserts IPA for the FIRST
+    real word (looking at no more than the first three tokens, so numeric
+    or symbol prefixes like "».55" can be skipped) if that word:
+    - has no bracket following it already
+    - has an IPA entry in the dictionary
+
+    Args:
+        text: The line to process.
+        pronunciation: Variant passed through to ``_lookup_ipa``.
+
+    Returns:
+        The text with ``[ipa]`` inserted after the headword (tokens
+        re-joined with single spaces), or ``text`` unchanged.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    bracket_openers = ('[', '{', '(')
+    words = text.strip().split()
+
+    # A bracket as the second token means IPA/annotation is already present.
+    if len(words) > 1 and words[1].startswith(bracket_openers):
+        return text
+
+    # Try the first few words (skip numeric prefixes like "».55", "0.56").
+    for i in range(min(3, len(words))):
+        w = words[i]
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if not clean or len(clean) < 2:
+            continue
+        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        # The headword may sit behind a skipped prefix, so the bracket
+        # check must look at the token after THIS word, not only at
+        # words[1]; otherwise "».55 apple [ˈæpl] ..." gets a second IPA.
+        if i + 1 < len(words) and words[i + 1].startswith(bracket_openers):
+            return text
+        ipa = _lookup_ipa(clean, pronunciation)
+        if ipa:
+            words[i] = f"{w} [{ipa}]"
+            return ' '.join(words)
+        # Stop at first real word even if no IPA found.
+        break
+
+    return text
@@ -0,0 +1,318 @@
+"""
+Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
+
+- Character confusion fix (I/1/l/|)
+- Comma-separated word form splitting
+- Example sentence attachment to matching vocab entries
+
+Split from cv_ocr_engines.py for maintainability.
+"""
+
+import re
+from typing import Any, Dict, List
+
+
+# =============================================================================
+# Post-Processing: Deterministic Quality Fixes
+# =============================================================================
+
+# --- A. Character Confusion Fix (I/1/l) ---
+
+# Ordered (compiled pattern, replacement) pairs for common OCR character
+# confusions in vocabulary text.
+_CHAR_CONFUSION_RULES = [
+    # "1" at a word boundary directly followed by a lowercase ASCII letter is
+    # rewritten to "I" (the replacement is always "I", never "l").
+    # List prefixes such as "1. Kreuz" or "1, 2, 3" are not affected because a letter must follow.
+    (re.compile(r'\b1([a-z])'), r'I\1'),           # 1ch → Ich, 1want → Iwant
+    # Standalone "1" → "I" (English pronoun), but NOT when followed by a digit, "." or "," (list number)
+    (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'),  # "1 want" → "I want"
+    # "|" → "I", but NOT when preceded by a letter (syllable divider: Ka|me|rad),
+    # NOT when followed by another "|", and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
+    (re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'),  # |ch → Ich, | want → I want
+]
+
+# Cross-language indicators: lowercase German first-person words; if the DE
+# text contains one of these, a standalone "1" in EN is almost certainly "I".
+_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
+
+
+def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Repair typical OCR character mix-ups in vocabulary entries, in place.
+
+    For every entry the ``english``, ``german`` and ``example`` fields are:
+
+    1. run through all ``_CHAR_CONFUSION_RULES`` ("1" / "|" → "I");
+    2. (English only) given one more standalone-"1" → "I" pass when the
+       German text contains a first-person indicator such as "ich"/"mich";
+    3. (English and example only) freed from a stray standalone "y" that
+       precedes a lowercase word ("y you" → "you");
+    4. stripped and written back (missing or ``None`` fields become ``''``).
+
+    Args:
+        entries: Entry dicts; they are modified in place.
+
+    Returns:
+        The same ``entries`` list.
+    """
+    stray_y = re.compile(r'\by\s+([a-z])')
+
+    def _apply_rules(value: str) -> str:
+        for rule, substitute in _CHAR_CONFUSION_RULES:
+            value = rule.sub(substitute, value)
+        return value
+
+    for record in entries:
+        english = _apply_rules(record.get('english', '') or '')
+        german = _apply_rules(record.get('german', '') or '')
+        example = _apply_rules(record.get('example', '') or '')
+
+        # German first-person words make a lone "1" in English an "I".
+        german_tokens = set(german.lower().replace(',', ' ').split())
+        if not german_tokens.isdisjoint(_DE_INDICATORS_FOR_EN_I):
+            english = re.sub(r'\b1\b(?![\d.,])', 'I', english)
+
+        english = stray_y.sub(r'\1', english)
+        example = stray_y.sub(r'\1', example)
+
+        record['english'] = english.strip()
+        record['german'] = german.strip()
+        record['example'] = example.strip()
+
+    return entries
+
+
+# --- B. Comma-Separated Word Form Splitting ---
+
+def _is_singular_plural_pair(parts: List[str]) -> bool:
+    """Detect if comma-separated parts are forms of the same word.
+
+    E.g. "child, children", "Vater, Väter" or "Maus, Mäuse" → True (should
+    NOT be split).  "break, broke, broken" → False (three parts), and
+    "break, broke" → False (shared prefix too short).
+
+    Heuristic, for exactly 2 non-empty parts (compared case-insensitively):
+
+    1. they share a common prefix of at least 2 characters and at least
+       50% of the shorter part; or
+    2. one part equals the other with every a/o/u turned into ä/ö/ü; or
+    3. rule 1 holds after folding ä/ö/ü back to a/o/u in both parts, which
+       covers umlaut plurals that also add a suffix (Maus/Mäuse,
+       Buch/Bücher).
+
+    Irregular pairs without a shared stem (e.g. "mouse, mice") are not
+    recognised by this heuristic.
+
+    Args:
+        parts: The comma-separated parts of one field.
+
+    Returns:
+        True if the two parts look like forms of the same word.
+    """
+    if len(parts) != 2:
+        return False
+
+    a, b = parts[0].lower().strip(), parts[1].lower().strip()
+    if not a or not b:
+        return False
+
+    def _shares_prefix(x: str, y: str) -> bool:
+        """True if x and y agree on >= 2 chars and >= 50% of the shorter."""
+        shared = 0
+        for cx, cy in zip(x, y):
+            if cx != cy:
+                break
+            shared += 1
+        return shared >= max(2, min(len(x), len(y)) * 0.5)
+
+    # Rule 1: plain common prefix (child/children, Kind/Kinder).
+    if _shares_prefix(a, b):
+        return True
+
+    # Rule 2: pure umlaut relation (Vater/Väter, Mutter/Mütter).
+    add_umlaut = str.maketrans('aou', 'äöü')
+    if a.translate(add_umlaut) == b or b.translate(add_umlaut) == a:
+        return True
+
+    # Rule 3: umlaut plus suffix (Maus/Mäuse).  Rule 2 fails here because
+    # it umlauts *every* a/o/u and demands exact equality.
+    drop_umlaut = str.maketrans('äöü', 'aou')
+    return _shares_prefix(a.translate(drop_umlaut), b.translate(drop_umlaut))
+
+
+def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Expand entries whose EN and DE fields list parallel word forms.
+
+    E.g. EN "break, broke, broken" / DE "brechen, brach, gebrochen"
+    becomes three entries: break/brechen, broke/brach, broken/gebrochen.
+
+    An entry is split only when all of the following hold:
+
+    * EN and DE each have more than one top-level comma part, and the
+      same number of parts;
+    * every part is at most 3 words long (word forms, not sentences);
+    * neither side is recognised by ``_is_singular_plural_pair`` as two
+      forms of one word.
+
+    Each split entry is a shallow copy of the original with ``english`` and
+    ``german`` replaced, ``example`` cleared and ``split_from_comma`` set
+    to True.  Afterwards ``row_index`` is renumbered on every returned
+    entry (unsplit entries are updated in place).
+
+    Args:
+        entries: Vocabulary entry dicts.
+
+    Returns:
+        A new list with split and unsplit entries in original order.
+    """
+    def _all_short(forms: List[str]) -> bool:
+        return all(len(form.split()) <= 3 for form in forms)
+
+    expanded: List[Dict[str, Any]] = []
+
+    for item in entries:
+        # Commas inside [...] or (...) do not separate forms.
+        en_forms = _split_by_comma((item.get('english', '') or '').strip())
+        de_forms = _split_by_comma((item.get('german', '') or '').strip())
+
+        splittable = (
+            len(en_forms) > 1
+            and len(de_forms) > 1
+            and len(en_forms) == len(de_forms)
+            and _all_short(en_forms)
+            and _all_short(de_forms)
+            and not (
+                _is_singular_plural_pair(en_forms)
+                or _is_singular_plural_pair(de_forms)
+            )
+        )
+        if not splittable:
+            expanded.append(item)
+            continue
+
+        for en_form, de_form in zip(en_forms, de_forms):
+            expanded.append({
+                **item,
+                'english': en_form.strip(),
+                'german': de_form.strip(),
+                'example': '',  # examples get attached later
+                'split_from_comma': True,
+            })
+
+    # Re-number
+    for position, item in enumerate(expanded):
+        item['row_index'] = position
+
+    return expanded
+
+
+def _split_by_comma(text: str) -> List[str]:
+    """Split ``text`` at commas that sit outside ``[...]`` and ``(...)``.
+
+    Text without any comma is returned as a single-element list, untouched.
+    Otherwise each piece is stripped and empty pieces are dropped.
+    Unbalanced closing brackets never push the nesting depth below zero.
+    """
+    if ',' not in text:
+        return [text]
+
+    pieces: List[str] = []
+    buffer = ''
+    square_depth = 0
+    round_depth = 0
+
+    for char in text:
+        if char == ',' and square_depth == 0 and round_depth == 0:
+            # Top-level comma: close the current piece.
+            pieces.append(buffer.strip())
+            buffer = ''
+            continue
+        if char == '[':
+            square_depth += 1
+        elif char == ']':
+            square_depth = max(0, square_depth - 1)
+        elif char == '(':
+            round_depth += 1
+        elif char == ')':
+            round_depth = max(0, round_depth - 1)
+        buffer += char
+
+    if buffer:
+        pieces.append(buffer.strip())
+
+    return [piece for piece in pieces if piece]
+
+
+# --- C. Example Sentence Attachment ---
+
+def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
+    """Pick the vocab entry whose English words best match an example sentence.
+
+    Scoring per entry (words are lowercase runs of ``a-z``/``äöüß``):
+
+    * 10 points for every entry word that occurs verbatim in the example;
+    * only if there is no verbatim hit: 5 points for every entry word of
+      at least 3 letters whose first 4 letters (or all 3) are the prefix
+      of some example word, e.g. "walk" matches "walked".
+
+    Args:
+        example_text: The example sentence.
+        vocab_entries: Candidate entries; the ``english`` field is used.
+
+    Returns:
+        Index of the highest-scoring entry (the earliest one on a tie),
+        or -1 when nothing scores above zero or an input is empty.
+    """
+    if not vocab_entries or not example_text:
+        return -1
+
+    word_pattern = r'[a-zäöüß]+'
+    sentence_tokens = set(re.findall(word_pattern, example_text.lower()))
+
+    def _score(item: Dict[str, Any]) -> int:
+        english = (item.get('english', '') or '').lower()
+        if not english:
+            return 0
+        headwords = set(re.findall(word_pattern, english))
+
+        exact = len(headwords & sentence_tokens) * 10
+        if exact:
+            return exact
+
+        # Prefix fallback, counted once per headword.
+        fuzzy = 0
+        for headword in headwords:
+            if len(headword) < 3:
+                continue
+            stem = headword[:4]
+            if any(token.startswith(stem) for token in sentence_tokens):
+                fuzzy += 5
+        return fuzzy
+
+    winner = -1
+    top_score = 0
+    for position, item in enumerate(vocab_entries):
+        points = _score(item)
+        if points > top_score:
+            winner, top_score = position, points
+    return winner
+
+
+def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Fold EN-only sentence rows into the vocab entry they illustrate.
+
+    Vocabulary worksheets often look like this::
+
+        Row 1: break, broke, broken / brechen, brach, gebrochen
+        Row 2: a broken arm is painful   (no DE → example)
+        Row 3: egg / Ei                  (has DE → new vocab entry)
+
+    A row counts as an example when all of these hold:
+
+    * it has EN text;
+    * its DE text is at most 1 character long (treated as OCR noise;
+      2-character words such as "Ei" are real translations);
+    * the EN text has at least 4 words or ends in ``.``, ``!`` or ``?``;
+    * at least one vocab entry precedes it.
+
+    The example is attached to the preceding entry chosen by
+    ``_find_best_vocab_match``, falling back to the most recent entry.
+    Several examples (and any existing ``example`` text) are joined with
+    " | ".  Kept entries are modified in place and renumbered via
+    ``row_index``.
+
+    Args:
+        entries: Vocabulary entry dicts in reading order.
+
+    Returns:
+        The list of kept vocab entries (``entries`` itself when empty).
+    """
+    if not entries:
+        return entries
+
+    kept: List[Dict[str, Any]] = []
+    pending: Dict[int, List[str]] = {}  # index into kept → example texts
+
+    for row in entries:
+        english = (row.get('english', '') or '').strip()
+        german = (row.get('german', '') or '').strip()
+
+        lacks_translation = len(german) <= 1
+        sentence_like = (
+            len(english.split()) >= 4
+            or english.rstrip().endswith(('.', '!', '?'))
+        )
+        if not (english and lacks_translation and sentence_like and kept):
+            # Regular vocab row (or nothing to attach to yet).
+            kept.append(row)
+            continue
+
+        target = _find_best_vocab_match(english, kept)
+        if target < 0:
+            # No word overlap → nearest preceding entry.
+            target = len(kept) - 1
+        pending.setdefault(target, []).append(english)
+
+    for target, sentences in pending.items():
+        if not 0 <= target < len(kept):
+            continue
+        owner = kept[target]
+        joined = ' | '.join(sentences)
+        current = (owner.get('example', '') or '').strip()
+        owner['example'] = f"{current} | {joined}" if current else joined
+
+    # Re-number
+    for position, row in enumerate(kept):
+        row['row_index'] = position
+
+    return kept
@@ -0,0 +1,134 @@
+"""
+Word assembly helpers for OCR output.
+
+Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
+into visual lines, rejoins hyphenated words, and produces reading-order
+text.  All functions are pure standard-library; no NumPy or project
+imports required.
+"""
+
+import logging
+from typing import Dict, List
+
+logger = logging.getLogger(__name__)
+
+
+def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
+    """Cluster word dicts into visual lines, each ordered left to right.
+
+    Words are visited in (``top``, ``left``) order.  A word joins the
+    current line while its ``top`` is within ``y_tolerance_px`` of the
+    ``top`` of the word that opened the line; otherwise it opens a new one.
+
+    Returns a list of lines (top to bottom), or ``[]`` for empty input.
+    """
+    if not words:
+        return []
+
+    def _by_left(item: Dict):
+        return item['left']
+
+    grouped: List[List[Dict]] = []
+    bucket: List[Dict] = []
+    anchor_top = None
+
+    for item in sorted(words, key=lambda w: (w['top'], w['left'])):
+        if bucket and abs(item['top'] - anchor_top) > y_tolerance_px:
+            # Too far below the line's first word: close the line.
+            grouped.append(sorted(bucket, key=_by_left))
+            bucket = []
+        if not bucket:
+            anchor_top = item['top']
+        bucket.append(item)
+
+    grouped.append(sorted(bucket, key=_by_left))
+    return grouped
+
+
+def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
+    """Render OCR words as one string per visual line, in reading order.
+
+    Line grouping is delegated to ``_group_words_into_lines``; the words
+    of each line are joined with single spaces.  Empty input gives ``[]``.
+    """
+    if not words:
+        return []
+
+    rendered: List[str] = []
+    for visual_line in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
+        rendered.append(' '.join(token['text'] for token in visual_line))
+    return rendered
+
+
+def _rejoin_hyphenated(lines: List[str]) -> List[str]:
+    """Rejoin words split by line-break hyphenation.
+
+    E.g. ['Fuß-', 'boden'] → ['Fußboden']
+         ['some text-', 'thing here'] → ['something here']
+         ['Donau-', 'dampf-', 'schiff'] → ['Donaudampfschiff']
+
+    A line that ends in ``-`` (ignoring trailing whitespace) is merged with
+    the line after it: the hyphen is dropped and the first word of the
+    next line is glued on.  The check runs against the merged result, so a
+    word hyphenated across three or more lines is rejoined completely.
+    A hyphen at the end of the very last line is kept.
+
+    Args:
+        lines: Visual lines in reading order.
+
+    Returns:
+        The merged lines; inputs of length 0 or 1 are returned as-is.
+    """
+    if len(lines) <= 1:
+        return lines
+
+    result: List[str] = []
+    for line in lines:
+        if result and result[-1].rstrip().endswith('-'):
+            # Previous (possibly already merged) line ends in a hyphen:
+            # drop the hyphen and attach this line's first word.
+            prefix = result.pop().rstrip()[:-1]
+            first_word, _, remainder = line.partition(' ')
+            joined = prefix + first_word
+            result.append(f"{joined} {remainder}" if remainder else joined)
+        else:
+            result.append(line)
+    return result
+
+
+def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+    """Turn OCR words into newline-separated text in reading order.
+
+    Pipeline: ``_words_to_reading_order_lines`` builds one string per
+    visual line, ``_rejoin_hyphenated`` merges line-break hyphenations,
+    and the resulting lines are joined with ``\\n``.
+    """
+    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
+    return '\n'.join(_rejoin_hyphenated(visual_lines))
+
+
+def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+    """Join OCR words preserving proportional horizontal spacing.
+
+    Instead of single spaces between words, inserts multiple spaces based on
+    the pixel gap between words relative to average character width.
+    Useful for box sub-sessions where spatial layout matters.
+
+    Args:
+        words: Word dicts with ``left``, ``width``, ``top`` and ``text``.
+        y_tolerance_px: Passed to ``_group_words_into_lines``.
+
+    Returns:
+        The lines joined with newlines; each gap is rendered as at least
+        one space.
+    """
+    default_char_width_px = 10
+    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
+    result_lines = []
+
+    for line_words in lines:
+        if not line_words:
+            continue
+        sorted_words = sorted(line_words, key=lambda w: w['left'])
+
+        # Average character width over the words that carry text.
+        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
+        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
+        if total_chars > 0 and total_width > 0:
+            avg_char_width = total_width / total_chars
+        else:
+            # No text, or zero-width boxes: a computed width of 0 would
+            # make the gap division below raise ZeroDivisionError.
+            avg_char_width = default_char_width_px
+
+        parts = [sorted_words[0].get('text', '')]
+        for word, next_word in zip(sorted_words, sorted_words[1:]):
+            gap_px = next_word['left'] - (word['left'] + word['width'])
+            # Overlapping or touching boxes still get one space.
+            num_spaces = max(1, round(gap_px / avg_char_width))
+            parts.append(' ' * num_spaces)
+            parts.append(next_word.get('text', ''))
+
+        result_lines.append(''.join(parts))
+
+    return '\n'.join(result_lines)
@@ -0,0 +1,275 @@
+"""
+Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.
+
+Extracted from cv_gutter_repair.py for modularity.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import itertools
+import logging
+import re
+import uuid
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Spellchecker setup (lazy, cached)
+# ---------------------------------------------------------------------------
+
+_spell_de = None
+_spell_en = None
+_SPELL_AVAILABLE = False
+
+
+def _init_spellcheckers():
+    """Lazy-load DE + EN spellcheckers (cached across calls).
+
+    On success the module globals ``_spell_de``, ``_spell_en`` and
+    ``_SPELL_AVAILABLE`` are set. If ``spellchecker`` cannot be imported a
+    warning is logged and the globals stay untouched.
+
+    NOTE: ``_SPELL_AVAILABLE`` is rebound here, so other modules must read it
+    as an attribute of this module at call time; a ``from ... import`` copy
+    keeps the value it had at import time.
+    """
+    global _spell_de, _spell_en, _SPELL_AVAILABLE
+    if _spell_de is not None:
+        return
+    try:
+        from spellchecker import SpellChecker
+        # Build both checkers into locals first: if the second constructor
+        # raises, no half-initialised state is published and a later call
+        # can retry instead of hitting the early return above.
+        spell_de = SpellChecker(language='de', distance=1)
+        spell_en = SpellChecker(language='en', distance=1)
+    except ImportError:
+        logger.warning("pyspellchecker not installed — gutter repair unavailable")
+        return
+    _spell_de = spell_de
+    _spell_en = spell_en
+    _SPELL_AVAILABLE = True
+    logger.info("Gutter repair: spellcheckers loaded (DE + EN)")
+
+
+def _is_known(word: str) -> bool:
+    """Return True when the lower-cased word is known to the DE or EN checker.
+
+    Always returns False when the spellcheckers are not available.
+    """
+    _init_spellcheckers()
+    if _SPELL_AVAILABLE:
+        lookup = [word.lower()]
+        # German is consulted first; English only if German does not know it.
+        return any(checker.known(lookup) for checker in (_spell_de, _spell_en))
+    return False
+
+
+def _spell_candidates(word: str, lang: str = "both") -> List[str]:
+    """Collect de-duplicated spellchecker candidates for a word.
+
+    ``lang`` selects the checkers: ``"both"`` (German, then English),
+    ``"de"``, or anything else for English only. The lower-cased input word
+    itself and empty candidates are never returned. Order follows the order
+    in which the checkers yield candidates.
+    """
+    _init_spellcheckers()
+    if not _SPELL_AVAILABLE:
+        return []
+
+    if lang == "both":
+        checkers = [_spell_de, _spell_en]
+    elif lang == "de":
+        checkers = [_spell_de]
+    else:
+        checkers = [_spell_en]
+
+    lowered = word.lower()
+    # A dict keeps first-seen order while dropping duplicates.
+    collected: Dict[str, None] = {}
+    for checker in checkers:
+        if checker is None:
+            continue
+        for candidate in checker.candidates(lowered) or ():
+            if candidate and candidate != lowered:
+                collected.setdefault(candidate, None)
+
+    return list(collected)
+
+
+# ---------------------------------------------------------------------------
+# Gutter position detection
+# ---------------------------------------------------------------------------
+
+# Minimum word length for spell-fix (very short words are often legitimate)
+_MIN_WORD_LEN_SPELL = 3
+
+# Minimum word length for hyphen-join candidates (fragments at the gutter
+# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
+_MIN_WORD_LEN_HYPHEN = 2
+
+# How close to the right column edge a word must be to count as "gutter-adjacent".
+# Expressed as fraction of column width: a word qualifies when its right edge
+# lies at or beyond this fraction (0.70 = rightmost 30% of the column).
+_GUTTER_EDGE_THRESHOLD = 0.70
+
+# Small common words / abbreviations that should NOT be repaired.
+# Entries listed under both languages ("am", "an", "in", "so") collapse to a
+# single member in the frozenset.
+_STOPWORDS = frozenset([
+    # German
+    "ab", "an", "am", "da", "er", "es", "im", "in", "ja", "ob", "so", "um",
+    "zu", "wo", "du", "eh", "ei", "je", "na", "nu", "oh",
+    # English
+    "a", "am", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in",
+    "is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us",
+    "we",
+])
+
+# Characters that mark a cell as phonetic transcription: square brackets,
+# slash, stress/length marks and a set of IPA letters. Such cells are skipped.
+_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')
+
+
+def _is_ipa_text(text: str) -> bool:
+    """Report whether ``text`` contains at least one character from ``_IPA_RE``."""
+    match = _IPA_RE.search(text)
+    return match is not None
+
+
+def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
+    """Check if a word's right edge is near the right boundary of its column.
+
+    Args:
+        word_bbox: Dict with ``left`` and ``width``; missing keys count as 0.
+        col_x: Left edge of the column.
+        col_width: Width of the column; non-positive widths yield False.
+
+    Returns:
+        True when the word's right edge, expressed as a fraction of the
+        column width measured from ``col_x``, reaches
+        ``_GUTTER_EDGE_THRESHOLD``.
+    """
+    if col_width <= 0:
+        return False
+    word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
+    # Position of the word's right edge relative to the column (0 = left
+    # edge, 1 = right edge; may exceed 1 if the word overhangs the column).
+    relative_pos = (word_right - col_x) / col_width
+    return relative_pos >= _GUTTER_EDGE_THRESHOLD
+
+
+# ---------------------------------------------------------------------------
+# Suggestion types
+# ---------------------------------------------------------------------------
+
+@dataclass
+class GutterSuggestion:
+    """One proposed correction for a word at the gutter edge of a grid cell."""
+
+    # Short identifier: the first 8 hex digits of a random UUID4.
+    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
+    # Kind of correction: "hyphen_join" | "spell_fix"
+    type: str = ""
+    # Location of the affected cell.
+    zone_index: int = 0
+    row_index: int = 0
+    col_index: int = 0
+    col_type: str = ""
+    cell_id: str = ""
+    # The word as found and the proposed replacement.
+    original_text: str = ""
+    suggested_text: str = ""
+    # Only filled for hyphen_join suggestions:
+    next_row_index: int = -1
+    next_row_cell_id: str = ""
+    next_row_text: str = ""
+    missing_chars: str = ""
+    display_parts: List[str] = field(default_factory=list)
+    # Other plausible corrections the user can pick from.
+    alternatives: List[str] = field(default_factory=list)
+    # Meta information.
+    confidence: float = 0.0
+    # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
+    reason: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
+        return asdict(self)
+
+
+# ---------------------------------------------------------------------------
+# Core repair logic
+# ---------------------------------------------------------------------------
+
+_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')
+
+
+def _try_hyphen_join(
+    word_text: str,
+    next_word_text: str,
+    max_missing: int = 3,
+) -> Optional[Tuple[str, str, float]]:
+    """Try to merge two word fragments into one dictionary word.
+
+    Trailing hyphens/whitespace are removed from the first fragment and
+    leading whitespace plus trailing punctuation (e.g. "künden," → "künden")
+    from the second. The plain concatenation is tested first; after that,
+    1..max_missing filler letters are inserted between the fragments.
+
+    Returns (joined_word, missing_chars, confidence) or None. Confidence is
+    0.95 for a direct join and 0.90 minus 0.10 per additional filler letter
+    otherwise.
+    """
+    stem = word_text.rstrip("-").rstrip()
+    tail = _TRAILING_PUNCT_RE.sub('', next_word_text.lstrip())
+    if not (stem and tail):
+        return None
+
+    # Case 1: nothing is missing between the fragments.
+    if _is_known(stem + tail):
+        return (stem + tail, "", 0.95)
+
+    # Case 2: brute-force the gap using the first 15 letters of this list.
+    alphabet = "enristaldhgcmobwfkzpvjyxqu"[:15]
+    for gap_len in range(1, max_missing + 1):
+        confidence = 0.90 - (gap_len - 1) * 0.10
+        for combo in itertools.product(alphabet, repeat=gap_len):
+            filler = "".join(combo)
+            joined = stem + filler + tail
+            if _is_known(joined):
+                return (joined, filler, confidence)
+
+    return None
+
+
+def _try_spell_fix(
+    word_text: str, col_type: str = "",
+) -> Optional[Tuple[str, float, List[str]]]:
+    """Propose a spellchecker correction for a single garbled word.
+
+    Returns (best_correction, confidence, alternatives) or None when the
+    word is too short, already valid once surrounding parentheses are
+    removed, or no candidate exists. At most five alternatives are returned
+    (e.g. "stammelt" vs "stammeln").
+    """
+    if len(word_text) < _MIN_WORD_LEN_SPELL:
+        return None
+
+    # "probieren)" or "(Englisch" are valid words carrying punctuation, not
+    # OCR errors, so no correction is offered for them.
+    bare = word_text.strip("()")
+    if bare and _is_known(bare):
+        return None
+
+    # Language preference is derived from the column type string.
+    lang = "en" if "en" in col_type else "de" if "de" in col_type else "both"
+
+    candidates = _spell_candidates(word_text, lang=lang)
+    if not candidates and lang != "both":
+        # Nothing in the preferred language: retry with both dictionaries.
+        candidates = _spell_candidates(word_text, lang="both")
+    if not candidates:
+        return None
+
+    capitalise = word_text[0].isupper()
+
+    def _recase(w: str) -> str:
+        # Carry an upper-case initial of the input over to the candidate.
+        return w[0].upper() + w[1:] if capitalise and w else w
+
+    # Rank by edit distance; the stable sort keeps ties in candidate order.
+    target = word_text.lower()
+    ranked = sorted(
+        ((_edit_distance(target, c.lower()), c) for c in candidates),
+        key=lambda pair: pair[0],
+    )
+
+    best_dist, best_raw = ranked[0]
+    best = _recase(best_raw)
+    conf = max(0.5, 1.0 - best_dist * 0.15)
+
+    # Remaining candidates become alternatives, capped at five.
+    alts = [_recase(c) for _, c in ranked[1:] if c.lower() != best.lower()][:5]
+
+    return (best, conf, alts)
+
+
+def _edit_distance(a: str, b: str) -> int:
+    """Compute the Levenshtein distance between two strings.
+
+    Uses the row-by-row dynamic-programming scheme, iterating over the
+    longer string and keeping one row sized by the shorter string.
+    """
+    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
+    if not shorter:
+        return len(longer)
+
+    previous = list(range(len(shorter) + 1))
+    for row, ch_long in enumerate(longer, start=1):
+        current = [row]
+        for col, ch_short in enumerate(shorter, start=1):
+            insertion = current[col - 1] + 1
+            deletion = previous[col] + 1
+            substitution = previous[col - 1] + (0 if ch_long == ch_short else 1)
+            current.append(min(insertion, deletion, substitution))
+        previous = current
+    return previous[-1]
@@ -0,0 +1,356 @@
+"""
+Gutter Repair Grid — grid analysis and suggestion application.
+
+Extracted from cv_gutter_repair.py for modularity.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import time
+from typing import Any, Dict, List, Tuple
+
+from cv_gutter_repair_core import (
+    _init_spellcheckers,
+    _is_ipa_text,
+    _is_known,
+    _MIN_WORD_LEN_HYPHEN,
+    _SPELL_AVAILABLE,
+    _STOPWORDS,
+    _TRAILING_PUNCT_RE,
+    _try_hyphen_join,
+    _try_spell_fix,
+    _word_is_at_gutter_edge,
+    GutterSuggestion,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Grid analysis
+# ---------------------------------------------------------------------------
+
+def _column_geometry(col: Dict[str, Any]) -> Dict[str, Any]:
+    """Normalise one column dict to ``{"x", "width", "type"}``.
+
+    Two schemas are accepted: ``x_min_px``/``x_max_px`` (width is the
+    difference) or ``x``/``width`` (width is taken as given).
+    """
+    x = col.get("x_min_px", col.get("x", 0))
+    if "x_max_px" in col:
+        width = col["x_max_px"] - x
+    else:
+        # Already a width: subtracting x here would shrink the column.
+        width = col.get("width", 0)
+    return {
+        "x": x,
+        "width": width,
+        "type": col.get("type", col.get("col_type", "")),
+    }
+
+
+def analyse_grid_for_gutter_repair(
+    grid_data: Dict[str, Any],
+    image_width: int = 0,
+) -> Dict[str, Any]:
+    """Analyse a structured grid and return gutter repair suggestions.
+
+    For every non-empty, non-IPA cell the last word is inspected. If it sits
+    at the right edge of its column and is not a known word, a hyphen join
+    with the first word of the cell below is tried first, then a single-word
+    spell fix.
+
+    Args:
+        grid_data: The grid_editor_result from the session (zones→cells structure).
+        image_width: Image width in pixels. Currently not used by this function.
+
+    Returns:
+        Dict with "suggestions" (list of dicts), "stats" and
+        "duration_seconds". When the spellcheckers are unavailable the
+        suggestions list is empty and "stats" carries an "error" entry.
+    """
+    # Read the availability flag from the core module at call time. The
+    # module-level ``from cv_gutter_repair_core import _SPELL_AVAILABLE``
+    # copies the value at import time and never sees the rebinding done by
+    # ``_init_spellcheckers()``.
+    import cv_gutter_repair_core as _core
+
+    t0 = time.time()
+    _init_spellcheckers()
+
+    if not _core._SPELL_AVAILABLE:
+        return {
+            "suggestions": [],
+            "stats": {"error": "pyspellchecker not installed"},
+            "duration_seconds": 0,
+        }
+
+    zones = grid_data.get("zones", [])
+    suggestions: List[GutterSuggestion] = []
+    words_checked = 0
+    gutter_candidates = 0
+
+    for zi, zone in enumerate(zones):
+        columns = zone.get("columns", [])
+        cells = zone.get("cells", [])
+        if not columns or not cells:
+            continue
+
+        # Column lookup: col_index → {x, width, type}
+        col_info: Dict[int, Dict] = {
+            col.get("index", col.get("col_index", -1)): _column_geometry(col)
+            for col in columns
+        }
+
+        # Cell lookup: (row_index, col_index) → cell
+        cell_map: Dict[Tuple[int, int], Dict] = {}
+        for cell in cells:
+            cell_map[(cell.get("row_index", 0), cell.get("col_index", 0))] = cell
+
+        # All columns are checked: a word is a candidate if it is at the
+        # right edge of its column AND not a known word.
+        for (ri, ci), cell in cell_map.items():
+            text = (cell.get("text") or "").strip()
+            if not text:
+                continue
+            if _is_ipa_text(text):
+                continue
+
+            words_checked += 1
+            col = col_info.get(ci, {})
+            col_type = col.get("type", "")
+
+            # Only the LAST word of the cell is inspected.
+            cell_words = text.split()
+            if not cell_words:
+                continue
+            last_word = cell_words[-1]
+
+            # Skip stopwords
+            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
+                continue
+
+            last_word_clean = last_word.rstrip(".,;:!?)(")
+            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
+                continue
+
+            # Position check: prefer the last word box, else the cell bbox.
+            word_boxes = cell.get("word_boxes", [])
+            if word_boxes:
+                edge_box = word_boxes[-1]
+            else:
+                bbox = cell.get("bbox_px", {})
+                edge_box = {"left": bbox.get("x", 0), "width": bbox.get("w", 0)}
+            if not _word_is_at_gutter_edge(
+                edge_box, col.get("x", 0), col.get("width", 1)
+            ):
+                continue
+
+            # Word is at gutter edge — a known word needs no repair.
+            if _is_known(last_word_clean):
+                continue
+
+            # Explicit hyphen break?
+            ends_with_hyphen = last_word.endswith("-")
+
+            # A visible hyphen plus a known stem is a VALID line-break
+            # hyphenation, not a gutter error. Gutter problems cause the
+            # hyphen to be LOST ("ve" instead of "ver-").
+            # Example: "wunder-" → "wunder" is known → skip.
+            if ends_with_hyphen:
+                stem = last_word_clean.rstrip("-")
+                if stem and _is_known(stem):
+                    continue
+
+            gutter_candidates += 1
+            cell_id = cell.get("cell_id", f"R{ri:02d}_C{ci}")
+
+            # --- Strategy 1: Hyphen join with next row ---
+            next_cell = cell_map.get((ri + 1, ci))
+            if next_cell:
+                next_text = (next_cell.get("text") or "").strip()
+                next_words = next_text.split()
+                if next_words:
+                    first_next = next_words[0]
+                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
+                    first_alpha = next((c for c in first_next if c.isalpha()), "")
+
+                    # Also skip if the joined word is known (covers compound
+                    # words where the stem alone might not be in the dictionary)
+                    if ends_with_hyphen and first_next_clean:
+                        direct = last_word_clean.rstrip("-") + first_next_clean
+                        if _is_known(direct):
+                            continue
+
+                    # Continuation likely if:
+                    # - explicit hyphen, OR
+                    # - next row starts lowercase (= not a new entry)
+                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
+                        result = _try_hyphen_join(last_word_clean, first_next)
+                        if result:
+                            joined, missing, conf = result
+                            # Display form of the current row: the fragment,
+                            # any restored characters, then a hyphen.
+                            if ends_with_hyphen:
+                                fragment = last_word_clean.rstrip("-")
+                            else:
+                                fragment = last_word_clean
+                            display_p1 = fragment + missing + "-"
+
+                            suggestions.append(GutterSuggestion(
+                                type="hyphen_join",
+                                zone_index=zi,
+                                row_index=ri,
+                                col_index=ci,
+                                col_type=col_type,
+                                cell_id=cell_id,
+                                original_text=last_word,
+                                suggested_text=joined,
+                                next_row_index=ri + 1,
+                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
+                                next_row_text=next_text,
+                                missing_chars=missing,
+                                display_parts=[display_p1, first_next],
+                                confidence=conf,
+                                reason="gutter_truncation" if missing else "hyphen_continuation",
+                            ))
+                            continue  # skip spell_fix if hyphen_join found
+
+            # --- Strategy 2: Single-word spell fix (only for longer words) ---
+            fix_result = _try_spell_fix(last_word_clean, col_type)
+            if fix_result:
+                corrected, conf, alts = fix_result
+                suggestions.append(GutterSuggestion(
+                    type="spell_fix",
+                    zone_index=zi,
+                    row_index=ri,
+                    col_index=ci,
+                    col_type=col_type,
+                    cell_id=cell_id,
+                    original_text=last_word,
+                    suggested_text=corrected,
+                    alternatives=alts,
+                    confidence=conf,
+                    reason="gutter_blur",
+                ))
+
+    duration = round(time.time() - t0, 3)
+
+    logger.info(
+        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
+        words_checked, gutter_candidates, len(suggestions), duration,
+    )
+
+    return {
+        "suggestions": [s.to_dict() for s in suggestions],
+        "stats": {
+            "words_checked": words_checked,
+            "gutter_candidates": gutter_candidates,
+            "suggestions_found": len(suggestions),
+        },
+        "duration_seconds": duration,
+    }
+
+
+def apply_gutter_suggestions(
+    grid_data: Dict[str, Any],
+    accepted_ids: List[str],
+    suggestions: List[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Apply accepted gutter repair suggestions to the grid data.
+
+    Modifies cells in-place and returns a summary of changes. In each target
+    cell the LAST occurrence of the suggestion's ``original_text`` is
+    replaced. For ``hyphen_join`` only the current row is changed; the next
+    row is NOT modified, because the continuation ("künden") belongs there
+    in the original layout and only the truncated fragment is restored
+    (e.g. "ve" → "ver-").
+
+    Args:
+        grid_data: The grid_editor_result (zones→cells).
+        accepted_ids: List of suggestion IDs the user accepted.
+        suggestions: The full suggestions list (from analyse_grid_for_gutter_repair).
+
+    Returns:
+        Dict with "applied_count" (number of suggestions that actually
+        changed a cell) and "changes" (one entry per applied suggestion).
+    """
+    accepted_set = set(accepted_ids)
+    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]
+
+    zones = grid_data.get("zones", [])
+    changes: List[Dict[str, Any]] = []
+
+    for s in accepted_suggestions:
+        zi = s.get("zone_index", 0)
+        ri = s.get("row_index", 0)
+        ci = s.get("col_index", 0)
+        stype = s.get("type", "")
+
+        # Reject out-of-range indices; a negative index would otherwise
+        # silently address a zone counted from the end.
+        if not 0 <= zi < len(zones):
+            continue
+
+        # Find the target cell
+        target_cell = next(
+            (
+                cell for cell in zones[zi].get("cells", [])
+                if cell.get("row_index") == ri and cell.get("col_index") == ci
+            ),
+            None,
+        )
+        if not target_cell:
+            continue
+
+        original_word = s.get("original_text", "")
+        extra: Dict[str, Any] = {}
+
+        if stype == "spell_fix":
+            replacement = s.get("suggested_text", "")
+            if not original_word or not replacement:
+                continue
+        elif stype == "hyphen_join":
+            joined = s.get("suggested_text", "")
+            display_parts = s.get("display_parts", [])
+            if not original_word or not joined or not display_parts:
+                continue
+            # The first display part is what goes in the current row.
+            replacement = display_parts[0]
+            extra = {"joined_word": joined}
+        else:
+            continue
+
+        # A cell may carry text=None; treat it like an empty string.
+        old_text = target_cell.get("text") or ""
+
+        # Replace from the right (last occurrence)
+        idx = old_text.rfind(original_word)
+        if idx < 0:
+            continue
+
+        new_text = old_text[:idx] + replacement + old_text[idx + len(original_word):]
+        target_cell["text"] = new_text
+        changes.append({
+            "type": stype,
+            "zone_index": zi,
+            "row_index": ri,
+            "col_index": ci,
+            "cell_id": target_cell.get("cell_id", ""),
+            "old_text": old_text,
+            "new_text": new_text,
+            **extra,
+        })
+
+    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))
+
+    return {
+        # Count what was actually applied, matching the log line above;
+        # accepted suggestions whose cell or word was not found are excluded.
+        "applied_count": len(changes),
+        "changes": changes,
+    }
@@ -0,0 +1,35 @@
+"""
+Gutter Repair — barrel re-export.
+
+All implementation split into:
+  cv_gutter_repair_core  — spellchecker setup, data types, single-word repair
+  cv_gutter_repair_grid  — grid analysis, suggestion application
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+# Core: spellchecker, data types, repair helpers
+from cv_gutter_repair_core import (  # noqa: F401
+    _init_spellcheckers,
+    _is_known,
+    _spell_candidates,
+    _MIN_WORD_LEN_SPELL,
+    _MIN_WORD_LEN_HYPHEN,
+    _GUTTER_EDGE_THRESHOLD,
+    _STOPWORDS,
+    _IPA_RE,
+    _is_ipa_text,
+    _word_is_at_gutter_edge,
+    GutterSuggestion,
+    _TRAILING_PUNCT_RE,
+    _try_hyphen_join,
+    _try_spell_fix,
+    _edit_distance,
+)
+
+# Grid: analysis and application
+from cv_gutter_repair_grid import (  # noqa: F401
+    analyse_grid_for_gutter_repair,
+    apply_gutter_suggestions,
+)
@@ -0,0 +1,92 @@
+"""
+OCR Image Enhancement — Improve scan quality before OCR.
+
+Applies CLAHE contrast enhancement + bilateral filter denoising
+to degraded scans. Only runs when scan_quality.is_degraded is True.
+
+Pattern adapted from handwriting_htr_api.py (lines 50-68) and
+cv_layout.py (lines 229-241).
+
+All operations use OpenCV (Apache-2.0).
+"""
+
+import logging
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def _clahe_on_lightness(
+    img_bgr: np.ndarray, clip_limit: float, tile_size: int,
+) -> np.ndarray:
+    """Apply CLAHE to the L channel of the LAB form of a BGR image."""
+    l_channel, a_channel, b_channel = cv2.split(
+        cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
+    )
+    equaliser = cv2.createCLAHE(
+        clipLimit=clip_limit,
+        tileGridSize=(tile_size, tile_size),
+    )
+    merged = cv2.merge([equaliser.apply(l_channel), a_channel, b_channel])
+    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)
+
+
+def enhance_for_ocr(
+    img_bgr: np.ndarray,
+    is_degraded: bool = False,
+    clip_limit: float = 3.0,
+    tile_size: int = 8,
+    denoise_d: int = 9,
+    denoise_sigma_color: float = 75,
+    denoise_sigma_space: float = 75,
+    sharpen: bool = True,
+) -> np.ndarray:
+    """
+    Improve a scan before it is handed to OCR.
+
+    Good scans (``is_degraded`` False) only receive a light CLAHE pass with
+    fixed settings (clip limit 2.0, 8x8 tiles); the tuning parameters below
+    are ignored in that case. Degraded scans run through CLAHE, a bilateral
+    filter and, optionally, an unsharp mask.
+
+    Args:
+        img_bgr: Input BGR image
+        is_degraded: Whether the scan is degraded (from ScanQualityReport)
+        clip_limit: CLAHE clip limit (higher = more contrast)
+        tile_size: CLAHE tile grid size
+        denoise_d: Bilateral filter diameter
+        denoise_sigma_color: Bilateral filter sigma for color
+        denoise_sigma_space: Bilateral filter sigma for space
+        sharpen: Apply unsharp mask for blurry scans
+
+    Returns:
+        Enhanced BGR image
+    """
+    if not is_degraded:
+        # Good scan: light CLAHE only, then leave the image alone.
+        light = _clahe_on_lightness(img_bgr, 2.0, 8)
+        logger.info("enhance_for_ocr: light CLAHE applied (good scan)")
+        return light
+
+    # Degraded scan: full pipeline.
+    logger.info(
+        f"enhance_for_ocr: full enhancement "
+        f"(CLAHE clip={clip_limit}, denoise d={denoise_d}, sharpen={sharpen})"
+    )
+
+    # Step 1: CLAHE on the L channel only, so colour channels stay as they are.
+    output = _clahe_on_lightness(img_bgr, clip_limit, tile_size)
+
+    # Step 2: bilateral filter (denoises while preserving edges).
+    output = cv2.bilateralFilter(
+        output,
+        d=denoise_d,
+        sigmaColor=denoise_sigma_color,
+        sigmaSpace=denoise_sigma_space,
+    )
+
+    # Step 3: unsharp mask, i.e. 1.5 * image - 0.5 * blurred copy.
+    if sharpen:
+        blurred = cv2.GaussianBlur(output, (0, 0), 3)
+        output = cv2.addWeighted(output, 1.5, blurred, -0.5, 0)
+
+    logger.info("enhance_for_ocr: full enhancement pipeline complete")
+    return output
@@ -0,0 +1,135 @@
+"""German IPA insertion for grid editor cells.
+
+Hybrid approach:
+  1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
+  2. Fallback: epitran rule-based G2P (MIT license)
+
+German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
+Attribution required — see grid editor UI.
+
+Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Dict, List, Optional, Set
+
+logger = logging.getLogger(__name__)
+
+# IPA/phonetic characters — skip cells that already contain IPA
+_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+
+
+def _lookup_ipa_de(word: str) -> Optional[str]:
+    """Look up German IPA for a single word.
+
+    The lower-cased, stripped word is looked up in the pronunciation
+    dictionary first. If that yields nothing and an epitran instance is
+    available, the rule-based transliteration is used, unless it merely
+    echoes the lower-cased input.
+
+    Returns IPA string or None if not found.
+    """
+    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        return None
+
+    lower = word.lower().strip()
+    if not lower:
+        return None
+
+    # 1. Dictionary lookup. The guard covers a missing/empty dictionary when
+    #    only epitran is available (presumably possible — verify against
+    #    cv_vocab_types).
+    ipa = _de_ipa_dict.get(lower) if _de_ipa_dict else None
+    if ipa:
+        return ipa
+
+    # 2. epitran fallback (rule-based)
+    if _epitran_de is not None:
+        try:
+            result = _epitran_de.transliterate(word)
+        except Exception:
+            # Best effort: a failing transliteration must not break the
+            # caller, but the failure is recorded instead of being dropped.
+            logger.debug(
+                "epitran transliteration failed for %r", word, exc_info=True
+            )
+            return None
+        if result and result != word.lower():
+            return result
+
+    return None
+
+
+def _insert_ipa_for_text(text: str) -> str:
+    """Append German IPA in square brackets after each recognised word.
+
+    Works on separator-delimited lists as well:
+      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"
+
+    Text that is empty or already matches ``_IPA_RE`` is returned unchanged,
+    as is text for which no word received a transcription.
+    """
+    if not text or _IPA_RE.search(text):
+        return text
+
+    pieces = []
+    touched = False
+
+    # Split on runs of comma/semicolon/colon; the capture group keeps the
+    # separators (with trailing whitespace) in the result list.
+    for segment in re.split(r'([,;:]+\s*)', text):
+        # Separators and empty fragments are passed through untouched.
+        if not segment or re.match(r'^[,;:\s]+$', segment):
+            pieces.append(segment)
+            continue
+
+        annotated = []
+        for raw in segment.split():
+            # Only letters take part in the lookup; words with fewer than
+            # three letters are never transcribed.
+            letters = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', raw)
+            ipa = _lookup_ipa_de(letters) if len(letters) >= 3 else None
+            if ipa:
+                annotated.append(f"{raw} [{ipa}]")
+                touched = True
+            else:
+                annotated.append(raw)
+
+        pieces.append(' '.join(annotated))
+
+    return ''.join(pieces) if touched else text
+
+
+def insert_german_ipa(
+    cells: List[Dict],
+    target_cols: Set[str],
+) -> int:
+    """Add German IPA transcriptions to the cells of selected columns.
+
+    A cell is processed when its ``col_type`` is in ``target_cols`` and its
+    text is not blank. Changed cells get their ``text`` replaced and the
+    flag ``_ipa_corrected`` set to True.
+
+    Args:
+        cells: Flat list of all cells (modified in-place).
+        target_cols: Set of col_type values to process.
+
+    Returns:
+        Number of cells modified (0 when no German IPA source is available).
+    """
+    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        logger.warning("German IPA not available — skipping")
+        return 0
+
+    modified = 0
+    for cell in cells:
+        if cell.get("col_type", "") not in target_cols:
+            continue
+        original = cell.get("text", "")
+        if not original.strip():
+            continue
+
+        updated = _insert_ipa_for_text(original)
+        if updated == original:
+            continue
+        cell["text"] = updated
+        cell["_ipa_corrected"] = True
+        modified += 1
+
+    if modified:
+        logger.info(f"German IPA inserted in {modified} cells")
+    return modified
@@ -0,0 +1,257 @@
+"""
+Legacy layout analysis using projection profiles.
+
+Extracted from cv_layout_columns.py — contains:
+- analyze_layout()   (projection-profile based column/header/footer detection)
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List
+
+import numpy as np
+
+from cv_vocab_types import PageRegion
+from cv_layout_detection import _find_content_bounds
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+
+def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
+    """Detect columns, header, and footer using projection profiles.
+
+    Uses content-bounds detection to exclude page margins before searching
+    for column separators ("valleys" of low ink density) within the actual
+    text area.  Depending on how many separators survive filtering, the page
+    is split into three columns (``column_en`` / ``column_de`` /
+    ``column_example``), two columns (``column_en`` / ``column_de``) or a
+    single ``column_en`` region.  Column regions extend horizontally to the
+    image edges (x=0 and x=w), not just to the content bounds.
+
+    Requires OpenCV: ``cv2.bitwise_not`` is called unconditionally, so the
+    function fails if the module-level ``cv2`` import fell back to ``None``.
+
+    Args:
+        layout_img: CLAHE-enhanced grayscale image.  Not read by this
+            function's body; only ``ocr_img`` is analysed.
+        ocr_img: Binarized image for text density analysis.
+
+    Returns:
+        List of PageRegion objects describing detected regions: the column
+        regions created here, plus whatever ``_add_header_footer`` adds to
+        the list.
+    """
+    h, w = ocr_img.shape[:2]
+
+    # Invert: black text on white → white text on black for projection
+    inv = cv2.bitwise_not(ocr_img)
+
+    # --- Find actual content bounds (exclude page margins) ---
+    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
+                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")
+
+    if content_w < w * 0.3 or content_h < h * 0.3:
+        # Fallback if detection seems wrong: a content area covering less
+        # than 30% of the image in either dimension is replaced by the
+        # full image.
+        left_x, right_x = 0, w
+        top_y, bottom_y = 0, h
+        content_w, content_h = w, h
+
+    # --- Vertical projection within content area to find column separators ---
+    content_strip = inv[top_y:bottom_y, left_x:right_x]
+    v_proj = np.sum(content_strip, axis=0).astype(float)
+    # Normalise by the column sum of an all-255 strip (content_h * 255).
+    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
+
+    # Smooth the projection profile with a box filter whose width is ~2% of
+    # the content width (at least 5, forced to be odd).
+    kernel_size = max(5, content_w // 50)
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+    # Debug: log projection profile statistics
+    p_mean = float(np.mean(v_proj_smooth))
+    p_median = float(np.median(v_proj_smooth))
+    p_min = float(np.min(v_proj_smooth))
+    p_max = float(np.max(v_proj_smooth))
+    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
+                f"mean={p_mean:.4f}, median={p_median:.4f}")
+
+    # Find valleys using multiple threshold strategies
+    # Strategy 1 (here): a global threshold, the larger of 30% of the median
+    #   and 20% of the mean (catches clear separators)
+    # Strategy 2 (further below): local minima approach (catches subtle gaps)
+    threshold = max(p_median * 0.3, p_mean * 0.2)
+    logger.info(f"Layout: valley threshold={threshold:.4f}")
+
+    in_valley = v_proj_smooth < threshold
+
+    # Find contiguous valley regions.  Each valley is stored as a tuple
+    # (start, end, center, width, depth) in content-relative x coordinates.
+    # A valley is recorded when the profile rises above the threshold again,
+    # so a run that is still open at the right end of the profile is not
+    # recorded.
+    all_valleys = []
+    start = None
+    for x in range(len(v_proj_smooth)):
+        if in_valley[x] and start is None:
+            start = x
+        elif not in_valley[x] and start is not None:
+            valley_width = x - start
+            valley_depth = float(np.min(v_proj_smooth[start:x]))
+            # Valley must be at least 3px wide
+            if valley_width >= 3:
+                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
+            start = None
+
+    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
+                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")
+
+    # Filter: valley centers must lie inside the content area, at least 8%
+    # of the content width away from either edge
+    inner_margin = int(content_w * 0.08)
+    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]
+
+    # If fewer than two valleys survived the strict threshold, try the local
+    # minima approach
+    if len(valleys) < 2:
+        logger.info("Layout: trying local minima approach for column detection")
+        # Divide content into 20 segments and rank them by mean density
+        seg_count = 20
+        seg_width = content_w // seg_count
+        seg_scores = []
+        for i in range(seg_count):
+            sx = i * seg_width
+            ex = min((i + 1) * seg_width, content_w)
+            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
+            seg_scores.append((i, sx, ex, seg_mean))
+
+        seg_scores.sort(key=lambda s: s[3])
+        logger.info(f"Layout: segment scores (lowest 5): "
+                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")
+
+        # Collect low-density segments as candidate valleys (same tuple
+        # layout as above, with the segment mean in the depth slot)
+        candidate_valleys = []
+        for seg_idx, sx, ex, seg_mean in seg_scores:
+            # Must not be at the edges (the two outermost segments per side)
+            if seg_idx <= 1 or seg_idx >= seg_count - 2:
+                continue
+            # Must be significantly lower than overall mean (below 60%)
+            if seg_mean < p_mean * 0.6:
+                center = (sx + ex) // 2
+                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))
+
+        if len(candidate_valleys) >= 2:
+            # Pick the best pair: centers far enough apart, creating
+            # reasonable column widths, and as evenly sized as possible
+            candidate_valleys.sort(key=lambda v: v[2])
+            best_pair = None
+            best_score = float('inf')
+            for i in range(len(candidate_valleys)):
+                for j in range(i + 1, len(candidate_valleys)):
+                    c1 = candidate_valleys[i][2]
+                    c2 = candidate_valleys[j][2]
+                    # Centers must be at least 20% of the content width apart
+                    if (c2 - c1) < content_w * 0.2:
+                        continue
+                    col1 = c1
+                    col2 = c2 - c1
+                    col3 = content_w - c2
+                    # Each column must be at least 12% of the content width
+                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
+                        continue
+                    # Score: widest minus narrowest column; lower is better
+                    parts = sorted([col1, col2, col3])
+                    score = parts[2] - parts[0]
+                    if score < best_score:
+                        best_score = score
+                        best_pair = (candidate_valleys[i], candidate_valleys[j])
+
+            if best_pair:
+                # Replaces any single valley found by the threshold strategy
+                valleys = list(best_pair)
+                logger.info(f"Layout: local minima found 2 valleys: "
+                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+    logger.info(f"Layout: final {len(valleys)} valleys: "
+                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+    regions = []
+
+    if len(valleys) >= 2:
+        # 3-column layout detected
+        valleys.sort(key=lambda v: v[2])
+
+        if len(valleys) == 2:
+            sep1_center = valleys[0][2]
+            sep2_center = valleys[1][2]
+        else:
+            # Pick the two valleys that best divide into 3 parts
+            # Prefer wider valleys (more likely true separators)
+            best_pair = None
+            best_score = float('inf')
+            for i in range(len(valleys)):
+                for j in range(i + 1, len(valleys)):
+                    c1, c2 = valleys[i][2], valleys[j][2]
+                    # Each column should be at least 15% of content width
+                    col1 = c1
+                    col2 = c2 - c1
+                    col3 = content_w - c2
+                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
+                        continue
+                    # Score: lower is better (more even distribution)
+                    parts = sorted([col1, col2, col3])
+                    score = parts[2] - parts[0]
+                    # Bonus for wider valleys (subtract valley width)
+                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
+                    if score < best_score:
+                        best_score = score
+                        best_pair = (c1, c2)
+            if best_pair:
+                sep1_center, sep2_center = best_pair
+            else:
+                # No pair met the 15% constraint: use the two leftmost valleys
+                sep1_center = valleys[0][2]
+                sep2_center = valleys[1][2]
+
+        # Convert from content-relative to absolute coordinates
+        abs_sep1 = sep1_center + left_x
+        abs_sep2 = sep2_center + left_x
+
+        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
+                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")
+
+        # The outer columns start at x=0 and end at x=w, so they include the
+        # horizontal page margins.
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=abs_sep1, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=abs_sep1, y=top_y,
+            width=abs_sep2 - abs_sep1, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_example', x=abs_sep2, y=top_y,
+            width=w - abs_sep2, height=content_h
+        ))
+
+    elif len(valleys) == 1:
+        # 2-column layout
+        abs_sep = valleys[0][2] + left_x
+
+        logger.info(f"Layout: 2 columns at separator x={abs_sep}")
+
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=abs_sep, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=abs_sep, y=top_y,
+            width=w - abs_sep, height=content_h
+        ))
+
+    else:
+        # No columns detected — emit one full-width column_en region
+        logger.warning("Layout: no column separators found, using full page")
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=w, height=content_h
+        ))
+
+    # Add header/footer info (gap-based detection with fallback).
+    # The return value is ignored; the helper is expected to add its regions
+    # to `regions` in place (the summary below looks them up there).
+    # Lazy import to avoid circular dependency with cv_layout.py
+    # NOTE(review): cv_layout_detection is already imported at module level
+    # for _find_content_bounds, so this lazy import may be unnecessary — confirm.
+    from cv_layout_detection import _add_header_footer
+    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
+
+    # Summary log: first top/bottom region type found, and the column count
+    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
+    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
+    col_count = len([r for r in regions if r.type.startswith('column')])
+    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
+
+    return regions
@@ -0,0 +1,494 @@
+"""
+Column type classification for OCR layout analysis.
+
+Entry point: classify_column_types() with 4-level fallback chain.
+Also provides positional_column_regions() and _build_margin_regions().
+Position-based classifiers (Level 2+3) in cv_layout_classify_position.py.
+"""
+
+import logging
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from cv_vocab_types import ColumnGeometry, PageRegion
+
+from cv_layout_scoring import (
+    _score_language,
+    _score_role,
+    _score_dictionary_signals,
+    _classify_dictionary_columns,
+)
+
+from cv_layout_classify_position import (
+    _classify_by_position_enhanced,
+    _classify_by_position_fallback,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Margin Region Building
+# ---------------------------------------------------------------------------
+
+def _build_margin_regions(
+    all_regions: List[PageRegion],
+    left_x: int,
+    right_x: int,
+    img_w: int,
+    top_y: int,
+    content_h: int,
+) -> List[PageRegion]:
+    """Build the left/right page-margin regions around the content columns.
+
+    A ``margin_left`` region spans from the image's left edge to ``left_x``.
+    A ``margin_right`` region spans from the right end of the right-most
+    content region to the image's right edge.  Each one is only created when
+    it is wider than 5 px.
+
+    Args:
+        all_regions: Regions classified so far.  Margin, header and footer
+            types are ignored when locating the right-most content edge.
+        left_x: Left content bound in image coordinates.
+        right_x: Right content bound; used only when ``all_regions`` holds
+            no content region at all.
+        img_w: Full image width.
+        top_y: Y coordinate assigned to every margin region.
+        content_h: Height assigned to every margin region.
+
+    Returns:
+        A new list holding zero, one or two margin regions (left one first).
+    """
+    min_gap_px = 5  # gaps this narrow (or narrower) get no region of their own
+    skipped_types = ('margin_left', 'margin_right', 'header', 'footer',
+                     'margin_top', 'margin_bottom')
+
+    def _margin(kind: str, x: int, width: int) -> PageRegion:
+        """Create one margin region; only kind, x and width vary."""
+        return PageRegion(
+            type=kind, x=x, y=top_y,
+            width=width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='content_bounds',
+        )
+
+    margins: List[PageRegion] = []
+
+    if left_x > min_gap_px:
+        margins.append(_margin('margin_left', 0, left_x))
+
+    # Right margin starts where the right-most content region ends.
+    content_ends = [r.x + r.width for r in all_regions if r.type not in skipped_types]
+    last_col_end = max(content_ends) if content_ends else right_x
+    right_gap = img_w - last_col_end
+    if right_gap > min_gap_px:
+        margins.append(_margin('margin_right', last_col_end, right_gap))
+
+    if margins:
+        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
+                     f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")
+
+    return margins
+
+
+# ---------------------------------------------------------------------------
+# Positional Column Regions
+# ---------------------------------------------------------------------------
+
+def positional_column_regions(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    content_h: int,
+    left_x: int,
+) -> List[PageRegion]:
+    """Label columns from geometry and left-to-right order only.
+
+    No language scoring is involved.  Each geometry is first tested against
+    three structural rules, in this order:
+
+    * ``page_ref``: width ratio below 0.12 and left edge within the first
+      20% of the content width.
+    * ``column_marker``: width ratio below 0.06 with at most 15 words.
+    * ``column_marker`` (lower confidence): at most 2 words and width ratio
+      below 0.15.
+
+    Everything else is a content column.  A single content column becomes
+    ``column_text``.  Several are sorted by x and named ``column_en``,
+    ``column_de``, ``column_example`` (any further columns are also
+    ``column_example``).  With exactly two content columns where the left one
+    is wider than 35% and the right one wider than 20%, the pair is named
+    ``column_en`` / ``column_example`` instead.
+
+    Args:
+        geometries: Detected column geometries.
+        content_w: Content width used to normalise the left offset; a value
+            of 0 makes every relative offset 0.
+        content_h: Height assigned to every produced region.
+        left_x: Left content bound, subtracted from each column's x.
+
+    Returns:
+        Structural regions (in input order) followed by the content regions.
+    """
+    def _region(geom: ColumnGeometry, kind: str, confidence: float) -> PageRegion:
+        """Create a positional region covering *geom* at full content height."""
+        return PageRegion(
+            type=kind, x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=confidence,
+            classification_method='positional',
+        )
+
+    structural: List[PageRegion] = []
+    content_cols: List[ColumnGeometry] = []
+
+    for geom in geometries:
+        # Left edge as a fraction of the content width (0 when width is 0).
+        rel_pos = (geom.x - left_x) / content_w if content_w else 0
+        if geom.width_ratio < 0.12 and rel_pos < 0.20:
+            structural.append(_region(geom, 'page_ref', 0.95))
+        elif geom.width_ratio < 0.06 and geom.word_count <= 15:
+            structural.append(_region(geom, 'column_marker', 0.95))
+        elif geom.word_count <= 2 and geom.width_ratio < 0.15:
+            # Near-empty narrow column: structural, with reduced confidence.
+            structural.append(_region(geom, 'column_marker', 0.85))
+        else:
+            content_cols.append(geom)
+
+    # Nothing but structural columns.
+    if not content_cols:
+        return structural
+
+    # Exactly one content column means a plain text page.
+    if len(content_cols) == 1:
+        return structural + [_region(content_cols[0], 'column_text', 0.9)]
+
+    ordered = sorted(content_cols, key=lambda col: col.x)
+
+    # Two content columns with a very wide left one: the left column is
+    # taken to hold EN+DE combined, so the right one is labelled as examples.
+    wide_pair = (
+        len(ordered) == 2
+        and ordered[0].width_ratio > 0.35
+        and ordered[1].width_ratio > 0.20
+    )
+    if wide_pair:
+        labels = ['column_en', 'column_example']
+    else:
+        labels = ['column_en', 'column_de', 'column_example']
+
+    # Both label lists end in 'column_example', so clamping the index makes
+    # every surplus column an example column.
+    last_label = len(labels) - 1
+    regions = structural + [
+        _region(geom, labels[min(pos, last_label)], 0.95)
+        for pos, geom in enumerate(ordered)
+    ]
+
+    logger.info(f"PositionalColumns: {len(structural)} structural, "
+                f"{len(ordered)} content -> "
+                f"{[r.type for r in regions]}")
+    return regions
+
+
+# ---------------------------------------------------------------------------
+# Main Classification Entry Point
+# ---------------------------------------------------------------------------
+
+def classify_column_types(geometries: List[ColumnGeometry],
+                          content_w: int,
+                          top_y: int,
+                          img_w: int,
+                          img_h: int,
+                          bottom_y: int,
+                          left_x: int = 0,
+                          right_x: int = 0,
+                          inv: Optional[np.ndarray] = None,
+                          document_category: Optional[str] = None,
+                          margin_strip_detected: bool = False) -> List[PageRegion]:
+    """Assign a type to every detected column via a four-level fallback chain.
+
+    The levels are tried in order and the first one that yields a result wins:
+
+    Level 0: Dictionary detection (only when the dictionary signals fire)
+    Level 1: Content-based (language + role scoring)
+    Level 2: Position + language
+    Level 3: Pure position (used whenever Levels 0-2 produced nothing)
+
+    Before the chain runs, a lone column is returned directly as
+    ``column_text``, and a first/last column with fewer than 8 words that is
+    not a sub-column is set aside as ``column_ignore``.  The surviving
+    geometries are re-indexed in place (their ``index`` attribute is
+    overwritten).  Every return path appends margin regions; the four chain
+    levels additionally run ``_add_header_footer`` on their result.
+
+    Args:
+        geometries: List of ColumnGeometry from Phase A.
+        content_w: Total content width.
+        top_y: Top Y of content area.
+        img_w: Full image width.
+        img_h: Full image height.
+        bottom_y: Bottom Y of content area.
+        left_x: Left content bound (from _find_content_bounds).
+        right_x: Right content bound (from _find_content_bounds).
+        inv: Passed through unchanged to ``_add_header_footer``.
+        document_category: User-selected category (e.g. 'woerterbuch').
+        margin_strip_detected: Whether a decorative A-Z margin strip was found.
+
+    Returns:
+        List of PageRegion with types, confidence, and method.
+    """
+    # Imported at call time rather than at module level.
+    from cv_layout_detection import _add_header_footer  # noqa: E402
+
+    content_h = bottom_y - top_y
+    ignore_regions: List[PageRegion] = []
+
+    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
+        """Return *result* extended by margin_left / margin_right regions."""
+        return result + _build_margin_regions(
+            result, left_x, right_x, img_w, top_y, content_h)
+
+    def _plain_text(geom: ColumnGeometry) -> PageRegion:
+        """Region for a lone column; keeps the geometry's own height."""
+        return PageRegion(
+            type='column_text', x=geom.x, y=geom.y,
+            width=geom.width, height=geom.height,
+            classification_confidence=0.9,
+            classification_method='content',
+        )
+
+    def _finish(classified: List[PageRegion]) -> List[PageRegion]:
+        """Shared tail of every level: header/footer, ignored columns, margins."""
+        _add_header_footer(classified, top_y, bottom_y, img_w, img_h, inv=inv)
+        return _with_margins(ignore_regions + classified)
+
+    # A single detected column is a plain text page.
+    if len(geometries) == 1:
+        return _with_margins([_plain_text(geometries[0])])
+
+    # --- Pre-filter: outermost columns with very few words -> column_ignore ---
+    # Sub-columns are exempt: they legitimately hold few words.
+    last_idx = len(geometries) - 1
+    active: List[ColumnGeometry] = []
+    for idx, g in enumerate(geometries):
+        on_edge = idx in (0, last_idx)
+        if on_edge and g.word_count < 8 and not g.is_sub_column:
+            ignore_regions.append(PageRegion(
+                type='column_ignore', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.95,
+                classification_method='content',
+            ))
+            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) -> column_ignore (edge, few words)")
+            continue
+        active.append(g)
+
+    # Renumber the survivors so their indices are contiguous again.
+    for new_idx, g in enumerate(active):
+        g.index = new_idx
+
+    # After filtering, nothing or just one column may be left.
+    if not active:
+        return _with_margins(ignore_regions)
+    if len(active) == 1:
+        ignore_regions.append(_plain_text(active[0]))
+        return _with_margins(ignore_regions)
+
+    # --- Score all remaining columns ---
+    lang_scores = [_score_language(g.words) for g in active]
+    role_scores = [_score_role(g) for g in active]
+
+    logger.info(f"ClassifyColumns: language scores: "
+                f"{[(g.index, ls) for g, ls in zip(active, lang_scores)]}")
+    logger.info(f"ClassifyColumns: role scores: "
+                f"{[(g.index, rs) for g, rs in zip(active, role_scores)]}")
+
+    # --- Level 0: Dictionary detection ---
+    dict_signals = _score_dictionary_signals(
+        active,
+        document_category=document_category,
+        margin_strip_detected=margin_strip_detected,
+    )
+    if dict_signals["is_dictionary"]:
+        dict_regions = _classify_dictionary_columns(
+            active, dict_signals, lang_scores, content_h,
+        )
+        if dict_regions is not None:
+            logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
+                        dict_signals["confidence"])
+            return _finish(dict_regions)
+
+    # --- Level 1: Content-based classification ---
+    content_regions = _classify_by_content(
+        active, lang_scores, role_scores, content_w, content_h)
+    if content_regions is not None:
+        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
+        return _finish(content_regions)
+
+    # --- Level 2: Position + language enhanced ---
+    enhanced_regions = _classify_by_position_enhanced(
+        active, lang_scores, content_w, content_h)
+    if enhanced_regions is not None:
+        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
+        return _finish(enhanced_regions)
+
+    # --- Level 3: Pure position fallback ---
+    logger.info("ClassifyColumns: Level 3 (position fallback)")
+    return _finish(_classify_by_position_fallback(active, content_w, content_h))
+
+
+# ---------------------------------------------------------------------------
+# Level 1: Content-Based Classification
+# ---------------------------------------------------------------------------
+
+def _classify_by_content(geometries: List[ColumnGeometry],
+                         lang_scores: List[Dict[str, float]],
+                         role_scores: List[Dict[str, float]],
+                         content_w: int,
+                         content_h: int) -> Optional[List[PageRegion]]:
+    """Level 1: Classify columns purely by content analysis.
+
+    Works in three steps: (1) peel off structural columns (``page_ref``,
+    ``column_marker``) using role scores and width, (2) pick one EN and one
+    DE column among the rest from the language scores, (3) label every
+    leftover column ``column_example``.
+
+    Requires clear language signals to distinguish EN/DE columns.  When both
+    languages score below 0.15 everywhere, the two leftmost remaining columns
+    are taken as EN and DE by position instead.
+
+    Args:
+        geometries: Columns to classify.
+        lang_scores: Per-column language scores, parallel to ``geometries``;
+            the keys ``'eng'`` and ``'deu'`` are read.
+        role_scores: Per-column role scores, parallel to ``geometries``; the
+            keys ``'reference'``, ``'marker'`` and ``'sentence'`` are read.
+        content_w: Content width; 20% of it defines the "left side".
+        content_h: Height assigned to every produced region.
+
+    Returns:
+        Regions sorted by x, or None if language signals are too weak.
+    """
+    regions = []
+    # Indices (into geometries) of columns that already received a type.
+    assigned = set()
+
+    # Step 1: Assign structural roles first (reference, marker)
+    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
+    # (measured from the x of the first geometry in the list)
+    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0
+
+    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
+        is_left_side = geom.x < left_20_threshold
+        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
+        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
+            regions.append(PageRegion(
+                type='page_ref', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=rs['reference'],
+                classification_method='content',
+            ))
+            assigned.add(i)
+        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=rs['marker'],
+                classification_method='content',
+            ))
+            assigned.add(i)
+        elif geom.width_ratio < 0.05 and not is_left_side:
+            # Narrow column on the right side -> marker, not page_ref
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.8,
+                classification_method='content',
+            ))
+            assigned.add(i)
+
+    # Step 2: Among remaining columns, find EN and DE by language scores
+    # Each entry is (index, geometry, lang_score, role_score).
+    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
+                 for i in range(len(geometries)) if i not in assigned]
+
+    if len(remaining) < 2:
+        # Not enough columns for EN/DE pair: a single leftover column
+        # becomes column_text, and the result is returned as-is.
+        if len(remaining) == 1:
+            i, geom, ls, rs = remaining[0]
+            regions.append(PageRegion(
+                type='column_text', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.6,
+                classification_method='content',
+            ))
+        regions.sort(key=lambda r: r.x)
+        return regions
+
+    # Check if we have enough language signal.  A column is a candidate for
+    # the language that scores strictly higher, provided that score exceeds
+    # 0.05 — so no column can be in both candidate lists.
+    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
+    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
+
+    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
+    # (len(remaining) >= 2 always holds here; the shorter case returned above)
+    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
+        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
+        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
+        if max_eng < 0.15 and max_deu < 0.15:
+            # Both signals weak -- fall back to positional: left=EN, right=DE
+            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
+            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
+            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
+            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
+            # Fixed low confidence: the assignment is a positional guess.
+            en_conf = 0.4
+            de_conf = 0.4
+
+            regions.append(PageRegion(
+                type='column_en', x=best_en[1].x, y=best_en[1].y,
+                width=best_en[1].width, height=content_h,
+                classification_confidence=en_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_en[0])
+
+            regions.append(PageRegion(
+                type='column_de', x=best_de[1].x, y=best_de[1].y,
+                width=best_de[1].width, height=content_h,
+                classification_confidence=de_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_de[0])
+
+            # Assign remaining as example
+            for i, geom, ls, rs in remaining:
+                if i not in assigned:
+                    regions.append(PageRegion(
+                        type='column_example', x=geom.x, y=geom.y,
+                        width=geom.width, height=content_h,
+                        classification_confidence=0.4,
+                        classification_method='content',
+                    ))
+            regions.sort(key=lambda r: r.x)
+            return regions
+
+    if not en_candidates or not de_candidates:
+        # Language signals too weak for content-based classification
+        # (one language has no candidate, yet some score reached 0.15)
+        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
+        return None
+
+    # Pick the best EN and DE candidates
+    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
+    best_de = max(de_candidates, key=lambda x: x[2]['deu'])
+
+    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
+    # Example sentences contain English function words ("the", "a", "is") which inflate
+    # the eng score of the Example column.  When the best EN candidate sits to the RIGHT
+    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
+    # -- it is almost certainly the real vocabulary column.
+    # The swap is only attempted when the DE column itself is confident (deu > 0.5).
+    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
+        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
+        if left_of_de:
+            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
+            logger.info(
+                f"ClassifyColumns: Level 1 position fix -- best EN col {best_en[0]} "
+                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
+                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
+            best_en = alt_en
+
+    if best_en[0] == best_de[0]:
+        # Same column scored highest for both -- ambiguous.
+        # Defensive guard: the candidate filters above are mutually
+        # exclusive, so with the current filters this cannot trigger.
+        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
+        return None
+
+    # Confidence is the winning language score itself (rounded below).
+    en_conf = best_en[2]['eng']
+    de_conf = best_de[2]['deu']
+
+    regions.append(PageRegion(
+        type='column_en', x=best_en[1].x, y=best_en[1].y,
+        width=best_en[1].width, height=content_h,
+        classification_confidence=round(en_conf, 2),
+        classification_method='content',
+    ))
+    assigned.add(best_en[0])
+
+    regions.append(PageRegion(
+        type='column_de', x=best_de[1].x, y=best_de[1].y,
+        width=best_de[1].width, height=content_h,
+        classification_confidence=round(de_conf, 2),
+        classification_method='content',
+    ))
+    assigned.add(best_de[0])
+
+    # Step 3: Remaining columns -> always column_example; the sentence role
+    # score only decides the confidence (the score itself, or a flat 0.5).
+    # NOTE(review): both branches emit 'column_example' — confirm that the
+    # else branch was not meant to produce a different type.
+    for i, geom, ls, rs in remaining:
+        if i in assigned:
+            continue
+        if rs['sentence'] > 0.4:
+            regions.append(PageRegion(
+                type='column_example', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=round(rs['sentence'], 2),
+                classification_method='content',
+            ))
+        else:
+            regions.append(PageRegion(
+                type='column_example', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.5,
+                classification_method='content',
+            ))
+
+    # Return regions in left-to-right order.
+    regions.sort(key=lambda r: r.x)
+    return regions
@@ -0,0 +1,218 @@
+"""
+Position-based column type classification for OCR layout analysis.
+
+Contains Level 2 and Level 3 classification functions:
+  Level 2 – _classify_by_position_enhanced: Position + language confirmation
+  Level 3 – _classify_by_position_fallback: Pure positional (no regression)
+
+Extracted from cv_layout_classify.py during file-size split.
+"""
+
+import logging
+from typing import Dict, List, Optional
+
+from cv_vocab_types import ColumnGeometry, PageRegion
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Level 2: Position-Enhanced Classification
+# ---------------------------------------------------------------------------
+
+def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
+                                    lang_scores: List[Dict[str, float]],
+                                    content_w: int,
+                                    content_h: int) -> Optional[List[PageRegion]]:
+    """Level 2: position-based rules enhanced with language confirmation.
+
+    Applies the positional heuristics below in order, then confirms the
+    EN/DE assignment with the per-column language scores (swapping the two
+    columns if both scores point the other way).
+
+    Rules:
+      1. The first column becomes ``page_ref`` when its width ratio is
+         below 0.12, it passes the left-20 % check and neither its ``eng``
+         nor its ``deu`` score exceeds 0.3.
+      2. Every remaining column with width ratio below 0.06 and at most
+         15 words becomes ``column_marker``.
+      3. If three or more columns remain, the last one becomes
+         ``column_example``.
+      4. The first two remaining columns become ``column_en`` and
+         ``column_de``; a single remaining column becomes ``column_en``.
+      5. Anything still untyped becomes ``column_example``.
+
+    Args:
+        geometries: Column geometries; index 0 is treated as the leftmost
+            column (presumably the list is ordered left to right).
+        lang_scores: One dict per column, parallel to ``geometries``, read
+            via the keys ``'eng'`` and ``'deu'``.
+        content_w: Content width in pixels, used for the left-20 % check.
+        content_h: Height assigned to every produced region.
+
+    Returns:
+        The classified regions sorted by ``x``. An empty list is returned
+        for empty ``geometries``.
+    """
+    if not geometries:
+        # Nothing to classify. Without this guard the geometries[0] /
+        # lang_scores[0] lookups below raise IndexError.
+        return []
+
+    regions: List[PageRegion] = []
+
+    def _add(geom, region_type, confidence):
+        # Single construction point for all regions emitted by this level.
+        regions.append(PageRegion(
+            type=region_type, x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=confidence,
+            classification_method='position_enhanced',
+        ))
+
+    untyped = list(range(len(geometries)))
+    g0 = geometries[0]
+    # NOTE: the threshold is measured from the first column's own x, so the
+    # "left 20 %" check below holds whenever content_w is positive.
+    left_20_threshold = g0.x + content_w * 0.20
+
+    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
+    ls0 = lang_scores[0]
+    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
+    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
+        _add(g0, 'page_ref', 0.8)
+        untyped.remove(0)
+
+    # Rule 2: Narrow columns with few words -> marker
+    # Iterate over a copy because matching indices are removed from untyped.
+    for i in list(untyped):
+        geom = geometries[i]
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
+            _add(geom, 'column_marker', 0.7)
+            untyped.remove(i)
+
+    # Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
+    if len(untyped) >= 3:
+        last_idx = untyped[-1]
+        _add(geometries[last_idx], 'column_example', 0.7)
+        untyped.remove(last_idx)
+
+    # Rule 4: First two remaining -> EN/DE, but check language to possibly swap
+    if len(untyped) >= 2:
+        idx_a, idx_b = untyped[0], untyped[1]
+        ls_a = lang_scores[idx_a]
+        ls_b = lang_scores[idx_b]
+
+        # Default: first=EN, second=DE (old behavior)
+        en_idx, de_idx = idx_a, idx_b
+        conf = 0.7
+
+        # Swap only if both columns' scores indicate the opposite order
+        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
+            en_idx, de_idx = idx_b, idx_a
+            conf = 0.85
+            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")
+
+        _add(geometries[en_idx], 'column_en', conf)
+        _add(geometries[de_idx], 'column_de', conf)
+        untyped = untyped[2:]
+    elif len(untyped) == 1:
+        # A lone text column is assumed to be EN, with low confidence.
+        _add(geometries[untyped[0]], 'column_en', 0.5)
+        untyped = []
+
+    # Remaining -> example
+    for idx in untyped:
+        _add(geometries[idx], 'column_example', 0.5)
+
+    regions.sort(key=lambda r: r.x)
+    return regions
+
+
+# ---------------------------------------------------------------------------
+# Level 3: Position Fallback Classification
+# ---------------------------------------------------------------------------
+
+def _classify_by_position_fallback(geometries: List[ColumnGeometry],
+                                   content_w: int,
+                                   content_h: int) -> List[PageRegion]:
+    """Level 3: pure position-based fallback classification.
+
+    Uses column position, width ratio and word count only; no language
+    information is consulted. Every region is emitted with confidence 1.0
+    and method ``'position_fallback'``.
+
+    Rules, applied in order:
+      1. The first column becomes ``page_ref`` when its width ratio is
+         below 0.12 and it passes the left-20 % check.
+      2. Every remaining column with width ratio below 0.06 and at most
+         15 words becomes ``column_marker``.
+      3. If three or more columns remain, the last one becomes
+         ``column_example``.
+      4. The first remaining column becomes ``column_en`` and the second
+         ``column_de``; a single remaining column becomes ``column_en``.
+      5. Anything still untyped becomes ``column_example``.
+
+    Args:
+        geometries: Column geometries; index 0 is treated as the leftmost
+            column (presumably the list is ordered left to right).
+        content_w: Content width in pixels, used for the left-20 % check.
+        content_h: Height assigned to every produced region.
+
+    Returns:
+        The classified regions sorted by ``x``. An empty list is returned
+        for empty ``geometries``.
+    """
+    if not geometries:
+        # The fallback must never fail: without this guard the
+        # geometries[0] lookup below raises IndexError on empty input.
+        return []
+
+    regions: List[PageRegion] = []
+
+    def _add(geom, region_type):
+        # Single construction point for all regions emitted by this level.
+        regions.append(PageRegion(
+            type=region_type, x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+
+    untyped = list(range(len(geometries)))
+    g0 = geometries[0]
+    # NOTE: the threshold is measured from the first column's own x, so the
+    # "left 20 %" check below holds whenever content_w is positive.
+    left_20_threshold = g0.x + content_w * 0.20
+
+    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
+    if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
+        _add(g0, 'page_ref')
+        untyped.remove(0)
+
+    # Rule 2: Narrow + few words -> marker
+    # Iterate over a copy because matching indices are removed from untyped.
+    for i in list(untyped):
+        geom = geometries[i]
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
+            _add(geom, 'column_marker')
+            untyped.remove(i)
+
+    # Rule 3: Rightmost remaining -> example (if 3+)
+    if len(untyped) >= 3:
+        last_idx = untyped[-1]
+        _add(geometries[last_idx], 'column_example')
+        untyped.remove(last_idx)
+
+    # Rule 4: First remaining -> EN, second -> DE
+    if len(untyped) >= 2:
+        _add(geometries[untyped[0]], 'column_en')
+        _add(geometries[untyped[1]], 'column_de')
+        untyped = untyped[2:]
+    elif len(untyped) == 1:
+        _add(geometries[untyped[0]], 'column_en')
+        untyped = []
+
+    # Rule 5: whatever is left -> example
+    for idx in untyped:
+        _add(geometries[idx], 'column_example')
+
+    regions.sort(key=lambda r: r.x)
+    return regions
@@ -0,0 +1,458 @@
+"""
+Post-processing refinements for column geometry.
+
+Extracted from cv_layout_columns.py — contains:
+- _detect_sub_columns()      (sub-column detection via left-edge alignment)
+- _split_broad_columns()     (broad column splitting via word-coverage gaps)
+- expand_narrow_columns()    (narrow column expansion into whitespace)
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import statistics
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import ColumnGeometry
+
+logger = logging.getLogger(__name__)
+
+
+def _detect_sub_columns(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    left_x: int = 0,
+    top_y: int = 0,
+    header_y: Optional[int] = None,
+    footer_y: Optional[int] = None,
+    _edge_tolerance: int = 8,
+    _min_col_start_ratio: float = 0.10,
+) -> List[ColumnGeometry]:
+    """Split columns whose words reveal an extra sub-column on their left side.
+
+    Per column, the left edges of confident body words are grouped into
+    alignment bins (neighbouring edges at most ``_edge_tolerance`` px apart).
+    The leftmost bin holding at least ``max(3, total * _min_col_start_ratio)``
+    words marks the real column start.  Words lying further left than that
+    bin (minus the tolerance) are split off as their own column if there are
+    at least 2 of them in the body, they make up less than 35 % of the
+    column's words, and the horizontal gap to the main text is wide enough
+    not to be an inline marker (bullet, numbering).
+
+    Coordinate systems: word ``left`` / ``top`` values are relative to the
+    content ROI (*left_x*, *top_y*), whereas ``ColumnGeometry.x`` and
+    *header_y* / *footer_y* are absolute.
+
+    Columns with width ratio below 0.15 or fewer than 5 words are passed
+    through untouched.  The returned list is sorted by ``x`` and re-indexed;
+    when ``content_w <= 0`` the input list is returned as is.
+    """
+    if content_w <= 0:
+        return geometries
+
+    # Header/footer limits in word coordinates (word 'top' is relative to top_y).
+    min_top_rel = (header_y - top_y) if header_y is not None else None
+    max_top_rel = (footer_y - top_y) if footer_y is not None else None
+
+    def _in_body(w):
+        # True when the word is outside the header/footer bands (if any).
+        return ((min_top_rel is None or w['top'] >= min_top_rel)
+                and (max_top_rel is None or w['top'] <= max_top_rel))
+
+    def _piece(parent, x, width, words):
+        # One half of a split column; index is assigned after sorting.
+        return ColumnGeometry(
+            index=0,
+            x=x,
+            y=parent.y,
+            width=width,
+            height=parent.height,
+            word_count=len(words),
+            words=words,
+            width_ratio=width / content_w if content_w > 0 else 0.0,
+            is_sub_column=True,
+        )
+
+    def _split_one(geo):
+        # Returns [geo] when no split applies, else [sub_column, main_column].
+        if geo.width_ratio < 0.15 or geo.word_count < 5:
+            return [geo]
+
+        confident = [w for w in geo.words
+                     if w.get('conf', 0) >= 30 and _in_body(w)]
+        if len(confident) < 3:
+            return [geo]
+
+        # Group sorted left edges into runs of near-equal positions.
+        edges = sorted(w['left'] for w in confident)
+        clusters = [[edges[0]]]
+        for edge in edges[1:]:
+            if edge - clusters[-1][-1] <= _edge_tolerance:
+                clusters[-1].append(edge)
+            else:
+                clusters.append([edge])
+        # Each bin: (center, count, min_edge, max_edge)
+        bins: List[Tuple[int, int, int, int]] = [
+            (sum(c) // len(c), len(c), min(c), max(c)) for c in clusters
+        ]
+
+        # Leftmost bin that is populated enough to be the column start.
+        min_count = max(3, int(len(confident) * _min_col_start_ratio))
+        col_start_bin = next((b for b in bins if b[1] >= min_count), None)
+        if col_start_bin is None:
+            return [geo]
+        start_edge = col_start_bin[2]
+
+        # Partition all words (not only confident ones) around the start bin.
+        split_threshold = start_edge - _edge_tolerance
+        sub_words = []
+        main_words = []
+        for w in geo.words:
+            (sub_words if w['left'] < split_threshold else main_words).append(w)
+
+        # Only body words count towards the size limits, so header/footer
+        # words cannot trigger a split on their own.
+        sub_body = [w for w in sub_words if _in_body(w)]
+        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
+            return [geo]
+
+        # Inline markers ("1.", "•", "-") hug the main text; require a gap of
+        # at least 1.2x the median word height (and no less than 20 px).
+        max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
+        gap_to_main = start_edge - max_sub_right
+        heights = [w.get('height', 20) for w in confident]
+        med_h = statistics.median(heights) if heights else 20
+        min_gap = max(med_h * 1.2, 20)
+        if gap_to_main < min_gap:
+            logger.debug(
+                "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
+                "(likely inline markers, not a sub-column)",
+                geo.index, gap_to_main, min_gap)
+            return [geo]
+
+        # Split halfway between the last sub-word start and the main start,
+        # then shift from ROI-relative to absolute coordinates.
+        max_sub_left = max(w['left'] for w in sub_words)
+        split_rel = (max_sub_left + start_edge) // 2
+        split_abs = split_rel + left_x
+
+        sub_width = split_abs - geo.x
+        main_width = (geo.x + geo.width) - split_abs
+        if sub_width <= 0 or main_width <= 0:
+            return [geo]
+
+        pieces = [
+            _piece(geo, geo.x, sub_width, sub_words),
+            _piece(geo, split_abs, main_width, main_words),
+        ]
+
+        logger.info(
+            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
+            f"(rel={split_rel}), sub={len(sub_words)} words, "
+            f"main={len(main_words)} words, "
+            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
+        )
+        return pieces
+
+    result: List[ColumnGeometry] = []
+    for geo in geometries:
+        result.extend(_split_one(geo))
+
+    # Re-index by left-to-right order
+    result.sort(key=lambda col: col.x)
+    for position, col in enumerate(result):
+        col.index = position
+
+    return result
+
+
+def _split_broad_columns(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    left_x: int = 0,
+    _broad_threshold: float = 0.35,
+    _min_gap_px: int = 15,
+    _min_words_per_split: int = 5,
+) -> List[ColumnGeometry]:
+    """Split overly broad columns at their widest internal word-free gap.
+
+    For every column whose ``width_ratio`` exceeds ``_broad_threshold`` and
+    which holds at least 10 words, a per-pixel word-coverage profile is
+    built, lightly smoothed and normalised.  Runs where the profile drops
+    below 0.5 are gaps; gaps within 10 px of either column edge are treated
+    as margins and ignored.  The widest remaining gap, if at least
+    ``_min_gap_px`` wide, becomes the split position, provided both halves
+    keep at least ``_min_words_per_split`` words.
+
+    Args:
+        geometries: Column geometries from _detect_sub_columns.
+        content_w: Width of the content area in pixels.
+        left_x: Left edge of content ROI in absolute image coordinates
+            (word ``left`` values are relative to it).
+        _broad_threshold: Minimum width_ratio to consider a column "broad".
+        _min_gap_px: Minimum gap width (pixels) to trigger a split.
+        _min_words_per_split: Both halves must have at least this many words.
+
+    Returns:
+        New list of ColumnGeometry sorted by ``x`` and re-indexed (possibly
+        with more columns than the input).
+    """
+    _edge_margin = 10  # pixels from edge to ignore
+
+    def _low_runs(low_mask):
+        # (start, end_exclusive, length) for each run of True in low_mask.
+        # Padding with False makes runs touching either end close properly.
+        padded = np.concatenate(([False], low_mask, [False]))
+        flips = np.flatnonzero(padded[1:] != padded[:-1])
+        return [(int(s), int(e), int(e - s))
+                for s, e in zip(flips[0::2], flips[1::2])]
+
+    def _half(parent, x, width, words):
+        # One half of a split column; index is assigned after sorting.
+        return ColumnGeometry(
+            index=0,
+            x=x,
+            y=parent.y,
+            width=width,
+            height=parent.height,
+            word_count=len(words),
+            words=words,
+            width_ratio=width / content_w if content_w else 0,
+            is_sub_column=True,
+        )
+
+    def _split_one(geo):
+        # Returns [geo] when no split applies, else [left_half, right_half].
+        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
+            return [geo]
+
+        # Word coverage per pixel column, in column-local coordinates.
+        col_left_rel = geo.x - left_x
+        coverage = np.zeros(geo.width, dtype=np.float32)
+        for wd in geo.words:
+            rel_left = wd['left'] - col_left_rel
+            start = max(0, int(rel_left))
+            stop = min(geo.width, int(rel_left + wd.get('width', 0)))
+            if stop > start:
+                coverage[start:stop] += 1.0
+
+        # Light smoothing (kernel=3px) to avoid noise
+        if len(coverage) > 3:
+            kernel = np.ones(3, dtype=np.float32) / 3.0
+            coverage = np.convolve(coverage, kernel, mode='same')
+
+        # Normalise to [0, 1]
+        cmax = coverage.max()
+        if cmax > 0:
+            coverage /= cmax
+
+        # Gaps are runs with coverage < 0.5; keep only those away from the
+        # column edges, since edge gaps are margins rather than separators.
+        all_gaps = _low_runs(coverage < 0.5)
+        internal_gaps = [g for g in all_gaps
+                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
+        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
+
+        logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
+                    f"{[g for g in all_gaps if g[2] >= 5]}, "
+                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
+                    f"best={best_gap}")
+
+        if best_gap is None or best_gap[2] < _min_gap_px:
+            return [geo]
+
+        gap_center = (best_gap[0] + best_gap[1]) // 2
+
+        # Assign each word to a side by the position of its midpoint.
+        left_words = []
+        right_words = []
+        for wd in geo.words:
+            mid = (wd['left'] - col_left_rel) + wd.get('width', 0) / 2.0
+            (left_words if mid < gap_center else right_words).append(wd)
+
+        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
+            return [geo]
+
+        left_w = gap_center
+        right_w = geo.width - gap_center
+        halves = [
+            _half(geo, geo.x, left_w, left_words),
+            _half(geo, geo.x + gap_center, right_w, right_words),
+        ]
+
+        logger.info(
+            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
+            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
+            f"left={len(left_words)} words (w={left_w}), "
+            f"right={len(right_words)} words (w={right_w})"
+        )
+        return halves
+
+    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
+                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
+
+    result: List[ColumnGeometry] = []
+    for geo in geometries:
+        result.extend(_split_one(geo))
+
+    # Re-index left-to-right
+    result.sort(key=lambda col: col.x)
+    for position, col in enumerate(result):
+        col.index = position
+
+    return result
+
+
+def expand_narrow_columns(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    left_x: int,
+    word_dicts: List[Dict],
+) -> List[ColumnGeometry]:
+    """Expand narrow columns into adjacent whitespace gaps.
+
+    Narrow columns (marker, page_ref, < 10% content width) often lose
+    content at image edges due to residual shear.  Each such column is
+    widened towards its neighbours:
+
+    * to the left, up to 4 px right of the left neighbour's rightmost word
+      edge (or 4 px right of the neighbour's start if it has no words);
+    * to the right, up to 4 px left of the right neighbour's leftmost word
+      (or 4 px short of the neighbour's right edge if it has no words).
+
+    No cap relative to the gap size is applied.  After an expansion the
+    column's words are re-collected from ``word_dicts`` by their ``left``
+    value, and any neighbour that now overlaps is shrunk and has its words
+    re-assigned the same way.
+
+    Must be called AFTER _detect_sub_columns() so that sub-column splits
+    (which create the narrowest columns) have already happened.
+
+    Args:
+        geometries: Columns in left-to-right order; neighbours are taken by
+            list position.  The objects are modified in place.
+        content_w: Width of the content area in pixels.
+        left_x: Left edge of the content ROI in absolute coordinates; word
+            ``left`` values are relative to it.
+        word_dicts: All words of the page, used to re-assign words after
+            column boundaries change.
+
+    Returns:
+        The same ``geometries`` list.  It is returned unchanged when it has
+        fewer than two columns or ``content_w`` is not positive.
+    """
+    _NARROW_THRESHOLD_PCT = 10.0
+    _MIN_WORD_MARGIN = 4
+
+    if len(geometries) < 2:
+        return geometries
+    if content_w <= 0:
+        # Without a positive content width no column can be rated as narrow,
+        # and the percentage in the log line below would divide by zero.
+        return geometries
+
+    def _words_in_span(col):
+        # Words whose left edge falls inside the column's current x-range.
+        lo = col.x - left_x
+        hi = lo + col.width
+        return [wd for wd in word_dicts if lo <= wd['left'] < hi]
+
+    logger.info("ExpandNarrowCols: input %d cols: %s",
+                len(geometries),
+                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
+                 for i, g in enumerate(geometries)])
+
+    for i, g in enumerate(geometries):
+        col_pct = g.width / content_w * 100
+        if col_pct >= _NARROW_THRESHOLD_PCT:
+            continue
+
+        expanded = False
+        orig_pct = col_pct
+
+        # --- try expanding to the LEFT ---
+        if i > 0:
+            left_nb = geometries[i - 1]
+            # Gap can be 0 if sub-column split created adjacent columns.
+            # In that case, look at where the neighbor's rightmost words
+            # actually are — there may be unused space we can claim.
+            nb_words_right = [wd['left'] + wd.get('width', 0)
+                              for wd in left_nb.words]
+            if nb_words_right:
+                safe_left_abs = left_x + max(nb_words_right) + _MIN_WORD_MARGIN
+            else:
+                # No words in neighbor → we can take up to neighbor's start
+                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN
+
+            if safe_left_abs < g.x:
+                g.width += (g.x - safe_left_abs)
+                g.x = safe_left_abs
+                expanded = True
+
+        # --- try expanding to the RIGHT ---
+        if i + 1 < len(geometries):
+            right_nb = geometries[i + 1]
+            nb_words_left = [wd['left'] for wd in right_nb.words]
+            if nb_words_left:
+                safe_right_abs = left_x + min(nb_words_left) - _MIN_WORD_MARGIN
+            else:
+                # No words in neighbor → claim it up to its right edge
+                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN
+
+            if safe_right_abs > g.x + g.width:
+                g.width = safe_right_abs - g.x
+                expanded = True
+
+        if not expanded:
+            continue
+
+        g.words = _words_in_span(g)
+        g.word_count = len(g.words)
+        g.width_ratio = g.width / content_w
+        logger.info(
+            "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
+            i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)
+
+        # --- Shrink overlapping neighbors to match new boundaries ---
+        # Left neighbor: its right edge must not exceed our new left edge
+        if i > 0:
+            left_nb = geometries[i - 1]
+            if left_nb.x + left_nb.width > g.x:
+                left_nb.width = max(0, g.x - left_nb.x)
+                left_nb.width_ratio = left_nb.width / content_w
+                left_nb.words = _words_in_span(left_nb)
+                left_nb.word_count = len(left_nb.words)
+
+        # Right neighbor: its left edge must not be before our new right edge
+        if i + 1 < len(geometries):
+            right_nb = geometries[i + 1]
+            my_right = g.x + g.width
+            if right_nb.x < my_right:
+                old_right_edge = right_nb.x + right_nb.width
+                right_nb.x = my_right
+                right_nb.width = max(0, old_right_edge - right_nb.x)
+                right_nb.width_ratio = right_nb.width / content_w
+                right_nb.words = _words_in_span(right_nb)
+                right_nb.word_count = len(right_nb.words)
+
+    return geometries
@@ -0,0 +1,589 @@
+"""
+Core column detection: gap-based geometry and clustering fallback.
+
+Extracted from the original cv_layout_columns.py — contains:
+- _detect_columns_by_clustering()   (fallback clustering)
+- _build_geometries_from_starts()   (geometry construction)
+- detect_column_geometry()          (main column detection)
+
+Post-processing (sub-columns, broad-column split, narrow expansion)
+lives in cv_layout_column_refine.py.
+Legacy projection-profile layout lives in cv_layout_analyze.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import ColumnGeometry
+from cv_layout_detection import _find_content_bounds
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
+# =============================================================================
+
+# --- Phase A: Geometry Detection ---
+
+def _detect_columns_by_clustering(
+    word_dicts: List[Dict],
+    left_edges: List[int],
+    edge_word_indices: List[int],
+    content_w: int,
+    content_h: int,
+    left_x: int,
+    right_x: int,
+    top_y: int,
+    bottom_y: int,
+    inv: Optional[np.ndarray] = None,
+) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
+    """Fallback: detect columns by clustering left-aligned word positions.
+
+    Used when the primary gap-based algorithm finds fewer than 2 gaps.
+    """
+    tolerance = max(10, int(content_w * 0.01))
+    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
+
+    clusters = []
+    cluster_widxs = []
+    cur_edges = [sorted_pairs[0][0]]
+    cur_widxs = [sorted_pairs[0][1]]
+    for edge, widx in sorted_pairs[1:]:
+        if edge - cur_edges[-1] <= tolerance:
+            cur_edges.append(edge)
+            cur_widxs.append(widx)
+        else:
+            clusters.append(cur_edges)
+            cluster_widxs.append(cur_widxs)
+            cur_edges = [edge]
+            cur_widxs = [widx]
+    clusters.append(cur_edges)
+    cluster_widxs.append(cur_widxs)
+
+    MIN_Y_COVERAGE_PRIMARY = 0.30
+    MIN_Y_COVERAGE_SECONDARY = 0.15
+    MIN_WORDS_SECONDARY = 5
+
+    cluster_infos = []
+    for c_edges, c_widxs in zip(clusters, cluster_widxs):
+        if len(c_edges) < 2:
+            continue
+        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
+        y_span = max(y_positions) - min(y_positions)
+        y_coverage = y_span / content_h if content_h > 0 else 0.0
+        cluster_infos.append({
+            'mean_x': int(np.mean(c_edges)),
+            'count': len(c_edges),
+            'min_edge': min(c_edges),
+            'max_edge': max(c_edges),
+            'y_min': min(y_positions),
+            'y_max': max(y_positions),
+            'y_coverage': y_coverage,
+        })
+
+    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
+    primary_set = set(id(c) for c in primary)
+    secondary = [c for c in cluster_infos
+                 if id(c) not in primary_set
+                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
+                 and c['count'] >= MIN_WORDS_SECONDARY]
+    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
+
+    if len(significant) < 3:
+        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
+        return None
+
+    merge_distance = max(30, int(content_w * 0.06))
+    merged = [significant[0].copy()]
+    for s in significant[1:]:
+        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
+            prev = merged[-1]
+            total = prev['count'] + s['count']
+            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
+            prev['mean_x'] = avg_x
+            prev['count'] = total
+            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
+            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
+        else:
+            merged.append(s.copy())
+
+    if len(merged) < 3:
+        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
+        return None
+
+    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")
+
+    margin_px = max(6, int(content_w * 0.003))
+    return _build_geometries_from_starts(
+        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
+        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
+    )
+
+
+def _build_geometries_from_starts(
+    col_starts: List[Tuple[int, int]],
+    word_dicts: List[Dict],
+    left_x: int,
+    right_x: int,
+    top_y: int,
+    bottom_y: int,
+    content_w: int,
+    content_h: int,
+    inv: Optional[np.ndarray] = None,
+) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
+    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
+    geometries = []
+    for i, (start_x, count) in enumerate(col_starts):
+        if i + 1 < len(col_starts):
+            col_width = col_starts[i + 1][0] - start_x
+        else:
+            col_width = right_x - start_x
+
+        col_left_rel = start_x - left_x
+        col_right_rel = col_left_rel + col_width
+        col_words = [w for w in word_dicts
+                     if col_left_rel <= w['left'] < col_right_rel]
+
+        geometries.append(ColumnGeometry(
+            index=i,
+            x=start_x,
+            y=top_y,
+            width=col_width,
+            height=content_h,
+            word_count=len(col_words),
+            words=col_words,
+            width_ratio=col_width / content_w if content_w > 0 else 0.0,
+        ))
+
+    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
+                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
+
+
+def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
+    """Detect column geometry using whitespace-gap analysis with word validation.
+
+    Phase A of the two-phase column detection. Uses vertical projection
+    profiles to find whitespace gaps between columns, then validates that
+    no gap cuts through a word bounding box.
+
+    Falls back to clustering-based detection if fewer than 2 gaps are found.
+
+    Args:
+        ocr_img: Binarized grayscale image for layout analysis.
+        dewarped_bgr: Original BGR image (for Tesseract word detection).
+
+    Returns:
+        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
+        or None if detection fails entirely.
+    """
+    h, w = ocr_img.shape[:2]
+
+    # --- Step 1: Find content bounds ---
+    inv = cv2.bitwise_not(ocr_img)
+    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    if content_w < w * 0.3 or content_h < h * 0.3:
+        left_x, right_x = 0, w
+        top_y, bottom_y = 0, h
+        content_w, content_h = w, h
+
+    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
+                f"y=[{top_y}..{bottom_y}] ({content_h}px)")
+
+    # --- Step 2: Get word bounding boxes from Tesseract ---
+    # Crop from left_x to full image width (not right_x) so words at the right
+    # edge of the last column are included even if they extend past the detected
+    # content boundary (right_x).
+    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
+    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
+
+    try:
+        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
+    except Exception as e:
+        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
+        return None
+
+    word_dicts = []
+    left_edges = []
+    edge_word_indices = []
+    n_words = len(data['text'])
+    for i in range(n_words):
+        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
+        text = str(data['text'][i]).strip()
+        if conf < 30 or not text:
+            continue
+        lx = int(data['left'][i])
+        ty = int(data['top'][i])
+        bw = int(data['width'][i])
+        bh = int(data['height'][i])
+        left_edges.append(lx)
+        edge_word_indices.append(len(word_dicts))
+        word_dicts.append({
+            'text': text, 'conf': conf,
+            'left': lx, 'top': ty, 'width': bw, 'height': bh,
+        })
+
+    if len(left_edges) < 5:
+        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
+        return None
+
+    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
+
+    # --- Step 2b: Segment by sub-headers ---
+    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
+    # text bands that pollute the vertical projection.  We detect large
+    # horizontal gaps (= whitespace rows separating sections) and use only
+    # the tallest content segment for the projection.  This makes column
+    # detection immune to sub-headers, illustrations, and section dividers.
+    content_strip = inv[top_y:bottom_y, left_x:right_x]
+    h_proj_row = np.sum(content_strip, axis=1).astype(float)
+    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row
+
+    # Find horizontal gaps (near-empty rows)
+    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
+    h_in_gap = h_proj_row_norm < H_GAP_THRESH
+    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px
+
+    h_gaps: List[Tuple[int, int]] = []
+    h_gap_start = None
+    for y_idx in range(len(h_in_gap)):
+        if h_in_gap[y_idx]:
+            if h_gap_start is None:
+                h_gap_start = y_idx
+        else:
+            if h_gap_start is not None:
+                if y_idx - h_gap_start >= H_MIN_GAP:
+                    h_gaps.append((h_gap_start, y_idx))
+                h_gap_start = None
+    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
+        h_gaps.append((h_gap_start, len(h_in_gap)))
+
+    # Identify "large" gaps (significantly bigger than median) that indicate
+    # section boundaries (sub-headers, chapter titles).
+    if len(h_gaps) >= 3:
+        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
+        median_gap_h = gap_sizes[len(gap_sizes) // 2]
+        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
+        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
+    else:
+        large_gaps = h_gaps
+
+    # Build content segments between large gaps and pick the tallest
+    seg_boundaries = [0]
+    for gs, ge in large_gaps:
+        seg_boundaries.append(gs)
+        seg_boundaries.append(ge)
+    seg_boundaries.append(content_h)
+
+    segments = []
+    for i in range(0, len(seg_boundaries) - 1, 2):
+        seg_top = seg_boundaries[i]
+        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
+        seg_height = seg_bot - seg_top
+        if seg_height > 20:  # ignore tiny fragments
+            segments.append((seg_top, seg_bot, seg_height))
+
+    if segments:
+        segments.sort(key=lambda s: s[2], reverse=True)
+        best_seg = segments[0]
+        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
+        effective_h = best_seg[2]
+        if len(segments) > 1:
+            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
+                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
+                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
+    else:
+        proj_strip = content_strip
+        effective_h = content_h
+
+    # --- Step 3: Vertical projection profile ---
+    v_proj = np.sum(proj_strip, axis=0).astype(float)
+    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj
+
+    # Smooth the projection to avoid noise-induced micro-gaps
+    kernel_size = max(5, content_w // 80)
+    if kernel_size % 2 == 0:
+        kernel_size += 1  # keep odd for symmetry
+    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+    # --- Step 4: Find whitespace gaps ---
+    # Threshold: areas with very little ink density are gaps
+    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
+    gap_threshold = max(median_density * 0.15, 0.005)
+
+    in_gap = v_smooth < gap_threshold
+    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width
+
+    # Collect contiguous gap regions
+    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
+    gap_start = None
+    for x in range(len(in_gap)):
+        if in_gap[x]:
+            if gap_start is None:
+                gap_start = x
+        else:
+            if gap_start is not None:
+                gap_width = x - gap_start
+                if gap_width >= MIN_GAP_WIDTH:
+                    raw_gaps.append((gap_start, x))
+                gap_start = None
+    # Handle gap at the right edge
+    if gap_start is not None:
+        gap_width = len(in_gap) - gap_start
+        if gap_width >= MIN_GAP_WIDTH:
+            raw_gaps.append((gap_start, len(in_gap)))
+
+    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
+                f"min_width={MIN_GAP_WIDTH}px): "
+                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
+
+    # --- Step 5: Validate gaps against word bounding boxes ---
+    # When using a segment for projection, only validate against words
+    # inside that segment — words from sub-headers or other sections
+    # would incorrectly overlap with real column gaps.
+    if segments and len(segments) > 1:
+        seg_top_abs = best_seg[0]  # relative to content strip
+        seg_bot_abs = best_seg[1]
+        segment_words = [wd for wd in word_dicts
+                         if wd['top'] >= seg_top_abs
+                         and wd['top'] + wd['height'] <= seg_bot_abs]
+        logger.info(f"ColumnGeometry: filtering words to segment: "
+                    f"{len(segment_words)}/{len(word_dicts)} words")
+    else:
+        segment_words = word_dicts
+
+    validated_gaps = []
+    for gap_start_rel, gap_end_rel in raw_gaps:
+        # Check if any word overlaps with this gap region
+        overlapping = False
+        for wd in segment_words:
+            word_left = wd['left']
+            word_right = wd['left'] + wd['width']
+            if word_left < gap_end_rel and word_right > gap_start_rel:
+                overlapping = True
+                break
+
+        if not overlapping:
+            validated_gaps.append((gap_start_rel, gap_end_rel))
+        else:
+            # Try to shift the gap to avoid the overlapping word(s)
+            # Find the tightest word boundaries within the gap region
+            min_word_left = content_w
+            max_word_right = 0
+            for wd in segment_words:
+                word_left = wd['left']
+                word_right = wd['left'] + wd['width']
+                if word_left < gap_end_rel and word_right > gap_start_rel:
+                    min_word_left = min(min_word_left, word_left)
+                    max_word_right = max(max_word_right, word_right)
+
+            # Try gap before the overlapping words
+            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
+                validated_gaps.append((gap_start_rel, min_word_left))
+                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
+            # Try gap after the overlapping words
+            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
+                validated_gaps.append((max_word_right, gap_end_rel))
+                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
+            else:
+                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                             f"discarded (word overlap, no room to shift)")
+
+    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
+                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")
+
+    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
+    # When pixel-based projection fails (e.g. due to illustrations or colored
+    # bands), use word bounding boxes to find clear vertical gaps.  This is
+    # immune to decorative graphics that Tesseract doesn't recognise as words.
+    if len(validated_gaps) < 2:
+        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
+        word_coverage = np.zeros(content_w, dtype=np.int32)
+        for wd in segment_words:
+            wl = max(0, wd['left'])
+            wr = min(wd['left'] + wd['width'], content_w)
+            if wr > wl:
+                word_coverage[wl:wr] += 1
+
+        # Smooth slightly to bridge tiny 1-2px noise gaps between words
+        wc_kernel = max(3, content_w // 300)
+        if wc_kernel % 2 == 0:
+            wc_kernel += 1
+        wc_smooth = np.convolve(word_coverage.astype(float),
+                                np.ones(wc_kernel) / wc_kernel, mode='same')
+
+        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
+        WC_MIN_GAP = max(4, content_w // 300)
+
+        wc_gaps: List[Tuple[int, int]] = []
+        wc_gap_start = None
+        for x in range(len(wc_in_gap)):
+            if wc_in_gap[x]:
+                if wc_gap_start is None:
+                    wc_gap_start = x
+            else:
+                if wc_gap_start is not None:
+                    if x - wc_gap_start >= WC_MIN_GAP:
+                        wc_gaps.append((wc_gap_start, x))
+                    wc_gap_start = None
+        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
+            wc_gaps.append((wc_gap_start, len(wc_in_gap)))
+
+        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
+                    f"(min_width={WC_MIN_GAP}px): "
+                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")
+
+        if len(wc_gaps) >= 2:
+            validated_gaps = wc_gaps
+
+    # --- Step 6: Fallback to clustering if too few gaps ---
+    if len(validated_gaps) < 2:
+        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
+        return _detect_columns_by_clustering(
+            word_dicts, left_edges, edge_word_indices,
+            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
+        )
+
+    # --- Step 7: Derive column boundaries from gaps ---
+    # Sort gaps by position
+    validated_gaps.sort(key=lambda g: g[0])
+
+    # Identify margin gaps (first and last) vs interior gaps
+    # A margin gap touches the edge of the content area (within 2% tolerance)
+    edge_tolerance = max(10, int(content_w * 0.02))
+
+    is_left_margin = validated_gaps[0][0] <= edge_tolerance
+    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance
+
+    # Interior gaps define column boundaries
+    # Column starts at the end of a gap, ends at the start of the next gap
+    col_starts = []
+
+    if is_left_margin:
+        # First column starts after the left margin gap
+        first_gap_end = validated_gaps[0][1]
+        interior_gaps = validated_gaps[1:]
+    else:
+        # No left margin gap — first column starts at content left edge
+        first_gap_end = 0
+        interior_gaps = validated_gaps[:]
+
+    if is_right_margin:
+        # Last gap is right margin — don't use it as column start
+        interior_gaps_for_boundaries = interior_gaps[:-1]
+        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
+    else:
+        interior_gaps_for_boundaries = interior_gaps
+        right_boundary = content_w
+
+    # First column
+    col_starts.append(left_x + first_gap_end)
+
+    # Columns between interior gaps
+    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
+        col_starts.append(left_x + gap_end_rel)
+
+    # Count words per column region (for logging)
+    col_start_counts = []
+    for i, start_x in enumerate(col_starts):
+        if i + 1 < len(col_starts):
+            next_start = col_starts[i + 1]
+        else:
+            # Rightmost column always extends to full image width (w).
+            # The page margin contains only white space — extending the OCR
+            # crop to the image edge is safe and prevents text near the right
+            # border from being cut off.
+            next_start = w
+
+        col_left_rel = start_x - left_x
+        col_right_rel = next_start - left_x
+        n_words_in_col = sum(1 for w in word_dicts
+                             if col_left_rel <= w['left'] < col_right_rel)
+        col_start_counts.append((start_x, n_words_in_col))
+
+    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
+                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
+                f"{col_start_counts}")
+
+    # --- Step 8: Build ColumnGeometry objects ---
+    # Determine right edge for each column
+    all_boundaries = []
+    for i, start_x in enumerate(col_starts):
+        if i + 1 < len(col_starts):
+            end_x = col_starts[i + 1]
+        else:
+            # Rightmost column always extends to full image width (w).
+            end_x = w
+        all_boundaries.append((start_x, end_x))
+
+    geometries = []
+    for i, (start_x, end_x) in enumerate(all_boundaries):
+        col_width = end_x - start_x
+        col_left_rel = start_x - left_x
+        col_right_rel = col_left_rel + col_width
+        col_words = [w for w in word_dicts
+                     if col_left_rel <= w['left'] < col_right_rel]
+
+        geometries.append(ColumnGeometry(
+            index=i,
+            x=start_x,
+            y=top_y,
+            width=col_width,
+            height=content_h,
+            word_count=len(col_words),
+            words=col_words,
+            width_ratio=col_width / content_w if content_w > 0 else 0.0,
+        ))
+
+    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
+                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+
+    # --- Step 9: Filter phantom narrow columns ---
+    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
+    # columns (< 3% of content width) with zero or no words. These are not
+    # real columns — remove them and close the gap between neighbors.
+    min_real_col_w = max(20, int(content_w * 0.03))
+    filtered_geoms = [g for g in geometries
+                      if not (g.word_count < 3 and g.width < min_real_col_w)]
+    if len(filtered_geoms) < len(geometries):
+        n_removed = len(geometries) - len(filtered_geoms)
+        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
+                    f"(width < {min_real_col_w}px and words < 3)")
+        # Extend each remaining column to close gaps with its right neighbor
+        for i, g in enumerate(filtered_geoms):
+            if i + 1 < len(filtered_geoms):
+                g.width = filtered_geoms[i + 1].x - g.x
+            else:
+                g.width = w - g.x
+            g.index = i
+            col_left_rel = g.x - left_x
+            col_right_rel = col_left_rel + g.width
+            g.words = [w for w in word_dicts
+                       if col_left_rel <= w['left'] < col_right_rel]
+            g.word_count = len(g.words)
+        geometries = filtered_geoms
+        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
+                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+
+    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
@@ -0,0 +1,479 @@
+"""
+Document type detection, image preparation, content bounds, and header/footer detection.
+
+Extracted from cv_layout.py — these are the "input-side" helpers that run before
+column/row geometry analysis.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    DocumentTypeResult,
+    PageRegion,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+
+# =============================================================================
+# Document Type Detection
+# =============================================================================
+
+def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
+    """Detect whether the page is a vocab table, generic table, or full text.
+
+    Uses projection profiles and text density analysis — no OCR required.
+    Runs in < 2 seconds.
+
+    Args:
+        ocr_img: Binarized grayscale image (for projection profiles).
+        img_bgr: BGR color image.
+
+    Returns:
+        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
+    """
+    if ocr_img is None or ocr_img.size == 0:
+        return DocumentTypeResult(
+            doc_type='full_text', confidence=0.5, pipeline='full_page',
+            skip_steps=['columns', 'rows'],
+            features={'error': 'empty image'},
+        )
+
+    h, w = ocr_img.shape[:2]
+
+    # --- 1. Vertical projection profile → detect column gaps ---
+    # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
+    # Invert: dark pixels on white background → high values = text.
+    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)
+
+    # Smooth the profile to avoid noise spikes
+    kernel_size = max(3, w // 100)
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')
+
+    # Find significant vertical gaps (columns of near-zero text density)
+    # A gap must be at least 1% of image width and have < 5% of max density
+    max_density = max(vert_smooth.max(), 1)
+    gap_threshold = max_density * 0.05
+    min_gap_width = max(5, w // 100)
+
+    in_gap = False
+    gap_count = 0
+    gap_start = 0
+    vert_gaps = []
+
+    for x in range(w):
+        if vert_smooth[x] < gap_threshold:
+            if not in_gap:
+                in_gap = True
+                gap_start = x
+        else:
+            if in_gap:
+                gap_width = x - gap_start
+                if gap_width >= min_gap_width:
+                    gap_count += 1
+                    vert_gaps.append((gap_start, x, gap_width))
+                in_gap = False
+
+    # Filter out margin gaps (within 10% of image edges)
+    margin_threshold = w * 0.10
+    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
+    internal_gap_count = len(internal_gaps)
+
+    # --- 2. Horizontal projection profile → detect row gaps ---
+    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
+    h_kernel = max(3, h // 200)
+    if h_kernel % 2 == 0:
+        h_kernel += 1
+    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')
+
+    h_max = max(horiz_smooth.max(), 1)
+    h_gap_threshold = h_max * 0.05
+    min_row_gap = max(3, h // 200)
+
+    row_gap_count = 0
+    in_gap = False
+    for y in range(h):
+        if horiz_smooth[y] < h_gap_threshold:
+            if not in_gap:
+                in_gap = True
+                gap_start = y
+        else:
+            if in_gap:
+                if y - gap_start >= min_row_gap:
+                    row_gap_count += 1
+                in_gap = False
+
+    # --- 3. Text density distribution (4×4 grid) ---
+    grid_rows, grid_cols = 4, 4
+    cell_h, cell_w = h // grid_rows, w // grid_cols
+    densities = []
+    for gr in range(grid_rows):
+        for gc in range(grid_cols):
+            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
+                           gc * cell_w:(gc + 1) * cell_w]
+            if cell.size > 0:
+                d = float(np.count_nonzero(cell < 128)) / cell.size
+                densities.append(d)
+
+    density_std = float(np.std(densities)) if densities else 0
+    density_mean = float(np.mean(densities)) if densities else 0
+
+    features = {
+        'vertical_gaps': gap_count,
+        'internal_vertical_gaps': internal_gap_count,
+        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
+        'row_gaps': row_gap_count,
+        'density_mean': round(density_mean, 4),
+        'density_std': round(density_std, 4),
+        'image_size': (w, h),
+    }
+
+    # --- 4. Decision tree ---
+    # Use internal_gap_count (excludes margin gaps) for column detection.
+    if internal_gap_count >= 2 and row_gap_count >= 5:
+        # Multiple internal vertical gaps + many row gaps → table
+        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
+        return DocumentTypeResult(
+            doc_type='vocab_table',
+            confidence=round(confidence, 2),
+            pipeline='cell_first',
+            skip_steps=[],
+            features=features,
+        )
+    elif internal_gap_count >= 1 and row_gap_count >= 3:
+        # Some internal structure, likely a table
+        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
+        return DocumentTypeResult(
+            doc_type='generic_table',
+            confidence=round(confidence, 2),
+            pipeline='cell_first',
+            skip_steps=[],
+            features=features,
+        )
+    elif internal_gap_count == 0:
+        # No internal column gaps → full text (regardless of density)
+        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
+        return DocumentTypeResult(
+            doc_type='full_text',
+            confidence=round(confidence, 2),
+            pipeline='full_page',
+            skip_steps=['columns', 'rows'],
+            features=features,
+        )
+    else:
+        # Ambiguous — default to vocab_table (most common use case)
+        return DocumentTypeResult(
+            doc_type='vocab_table',
+            confidence=0.5,
+            pipeline='cell_first',
+            skip_steps=[],
+            features=features,
+        )
+
+
+# =============================================================================
+# Image Creation (Dual Image Preparation)
+# =============================================================================
+
+def create_ocr_image(img: np.ndarray) -> np.ndarray:
+    """Build the binarized page image that is fed to OCR.
+
+    Pipeline: grayscale conversion, illumination flattening (the grayscale
+    image is divided by a 51x51 Gaussian-blurred copy of itself), Gaussian
+    adaptive thresholding (block size 31, offset 10) and a 3x3 median filter
+    to remove isolated speckles.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Single-channel image produced by ``THRESH_BINARY`` with a maximum
+        value of 255, after median filtering.
+    """
+    blur_kernel = (51, 51)
+    block_size, offset_c = 31, 10
+
+    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Dividing by a heavily blurred copy evens out uneven lighting/shadows.
+    illumination = cv2.GaussianBlur(grey, blur_kernel, 0)
+    flattened = cv2.divide(grey, illumination, scale=255)
+
+    thresholded = cv2.adaptiveThreshold(
+        flattened,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        block_size,
+        offset_c,
+    )
+
+    # Small median kernel: removes salt-and-pepper noise only.
+    return cv2.medianBlur(thresholded, 3)
+
+
+def create_layout_image(img: np.ndarray) -> np.ndarray:
+    """Return a contrast-equalized grayscale copy of *img* for layout analysis.
+
+    The image is converted to grayscale and passed through CLAHE
+    (clip limit 2.0, 8x8 tile grid).
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Enhanced grayscale image.
+    """
+    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+    return equalizer.apply(grey)
+
+
+# =============================================================================
+# Content Bounds Detection
+# =============================================================================
+
+def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
+    """Return a copy of a 1-D bool mask with every True-run shorter than *min_width* cleared.
+
+    The input mask is not modified.
+    """
+    cleaned = mask.copy()
+
+    # Pad with False on both sides so every run has a rising and a falling
+    # edge; the edge positions then alternate start, stop, start, stop, ...
+    padded = np.concatenate(([False], cleaned.astype(bool), [False]))
+    edges = np.flatnonzero(padded[1:] != padded[:-1])
+
+    for run_start, run_stop in zip(edges[0::2], edges[1::2]):
+        if run_stop - run_start < min_width:
+            cleaned[run_start:run_stop] = False
+    return cleaned
+
+
+def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
+    """Locate the bounding box of the page content, ignoring thin edge artefacts.
+
+    Row and column ink densities are thresholded at 0.5 %; contiguous runs
+    narrower than 1 % of the respective image dimension (at least 5 px) are
+    dropped via ``_filter_narrow_runs`` so that thin scan lines at the page
+    edges do not count as content.  The box is padded by 5 px vertically and
+    2 px horizontally, clamped to the image.
+
+    Returns:
+        Tuple of (left_x, right_x, top_y, bottom_y).  An axis without any
+        surviving content keeps the full image extent.
+    """
+    img_h, img_w = inv.shape[:2]
+    ink_threshold = 0.005
+
+    # --- Rows: top / bottom ---
+    row_density = np.sum(inv, axis=1).astype(float) / (img_w * 255)
+    row_mask = _filter_narrow_runs(row_density > ink_threshold, max(5, img_h // 100))
+
+    top_y, bottom_y = 0, img_h
+    ink_rows = np.flatnonzero(row_mask)
+    if ink_rows.size:
+        top_y = max(0, int(ink_rows[0]) - 5)
+        last_row = int(ink_rows[-1])
+        # Index 0 is deliberately not accepted as the last content row.
+        if last_row >= 1:
+            bottom_y = min(img_h, last_row + 5)
+
+    # --- Columns: left / right, measured inside the vertical content band ---
+    band_h = bottom_y - top_y
+    col_sum = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
+    col_density = col_sum / (band_h * 255) if band_h > 0 else col_sum
+    col_mask = _filter_narrow_runs(col_density > ink_threshold, max(5, img_w // 100))
+
+    left_x, right_x = 0, img_w
+    ink_cols = np.flatnonzero(col_mask)
+    if ink_cols.size:
+        left_x = max(0, int(ink_cols[0]) - 2)
+        last_col = int(ink_cols[-1])
+        # Same convention as for rows: index 0 never counts as the last column.
+        if last_col >= 1:
+            right_x = min(img_w, last_col + 2)
+
+    return left_x, right_x, top_y, bottom_y
+
+
+# =============================================================================
+# Header / Footer Detection
+# =============================================================================
+
+def _detect_header_footer_gaps(
+    inv: np.ndarray,
+    img_w: int,
+    img_h: int,
+) -> Tuple[Optional[int], Optional[int]]:
+    """Detect header/footer boundaries via horizontal projection gap analysis.
+
+    The row-wise ink density of *inv* is smoothed and thresholded to find
+    blank horizontal bands ("gaps").  A gap whose midpoint lies in the top
+    (bottom) 20% of the page and whose height exceeds twice the median gap
+    height is taken as the separator between header (footer) and body.  If
+    several gaps qualify, the tallest one wins.  Gaps touching the top or
+    bottom edge are never used as separators.
+
+    Args:
+        inv: Full-page inverted image; non-zero pixels are counted as ink
+            and densities are normalised by 255.
+        img_w: Page width.  Currently not read by this function.
+        img_h: Page height; rows of *inv* beyond it are ignored.
+
+    Returns:
+        (header_y, footer_y) — absolute y-coordinates.
+        header_y = bottom edge of header region (None if no header detected).
+        footer_y = top edge of footer region (None if no footer detected).
+        Both are None when there is nothing to analyse (zero-height image or
+        ``img_h <= 0``) or when no gap is found at all.
+    """
+    HEADER_FOOTER_ZONE = 0.20
+    GAP_MULTIPLIER = 2.0
+
+    # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
+    actual_h = min(inv.shape[0], img_h)
+    if actual_h <= 0:
+        # Nothing to analyse; np.convolve below would raise on an empty array.
+        return None, None
+    roi = inv[:actual_h, :]
+    h_proj = np.sum(roi, axis=1).astype(float)
+    proj_w = roi.shape[1]
+    h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
+
+    # Step 2: Smoothing with an odd-sized moving-average kernel
+    kernel_size = max(3, actual_h // 200)
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+    # Step 3: Gap threshold — 15% of the median non-zero density, floor 0.003
+    positive = h_smooth[h_smooth > 0]
+    median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
+    gap_threshold = max(median_density * 0.15, 0.003)
+
+    in_gap = h_smooth < gap_threshold
+    MIN_GAP_HEIGHT = max(3, actual_h // 500)
+
+    # Step 4: Collect contiguous gaps as half-open (start, end) row ranges
+    raw_gaps: List[Tuple[int, int]] = []
+    gap_start: Optional[int] = None
+    for y in range(len(in_gap)):
+        if in_gap[y]:
+            if gap_start is None:
+                gap_start = y
+        else:
+            if gap_start is not None:
+                gap_height = y - gap_start
+                if gap_height >= MIN_GAP_HEIGHT:
+                    raw_gaps.append((gap_start, y))
+                gap_start = None
+    if gap_start is not None:
+        # A gap still open at the last row runs to the bottom edge.
+        gap_height = len(in_gap) - gap_start
+        if gap_height >= MIN_GAP_HEIGHT:
+            raw_gaps.append((gap_start, len(in_gap)))
+
+    if not raw_gaps:
+        return None, None
+
+    # Step 5: Compute median gap size and large-gap threshold
+    gap_sizes = [g[1] - g[0] for g in raw_gaps]
+    median_gap = float(np.median(gap_sizes))
+    large_gap_threshold = median_gap * GAP_MULTIPLIER
+
+    # Step 6: Find largest qualifying gap in header / footer zones
+    # A separator gap must have content on BOTH sides — edge-touching gaps
+    # (e.g. dewarp padding at bottom) are not valid separators.
+    EDGE_MARGIN = max(5, actual_h // 400)
+    header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
+    footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
+
+    header_y: Optional[int] = None
+    footer_y: Optional[int] = None
+
+    best_header_size = 0
+    for gs, ge in raw_gaps:
+        if gs <= EDGE_MARGIN:
+            continue  # skip gaps touching the top edge
+        gap_mid = (gs + ge) / 2
+        gap_size = ge - gs
+        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
+            if gap_size > best_header_size:
+                best_header_size = gap_size
+                header_y = ge  # bottom edge of gap
+
+    best_footer_size = 0
+    for gs, ge in raw_gaps:
+        if ge >= actual_h - EDGE_MARGIN:
+            continue  # skip gaps touching the bottom edge
+        gap_mid = (gs + ge) / 2
+        gap_size = ge - gs
+        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
+            if gap_size > best_footer_size:
+                best_footer_size = gap_size
+                footer_y = gs  # top edge of gap
+
+    if header_y is not None:
+        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
+                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
+    if footer_y is not None:
+        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
+                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
+
+    return header_y, footer_y
+
+
+def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
+                        min_density: float = 0.005) -> bool:
+    """Check whether a horizontal strip contains meaningful ink.
+
+    Args:
+        inv: Inverted binarized image (white-on-black).
+        y_start: Top of the region (inclusive).
+        y_end: Bottom of the region (exclusive).
+        min_density: Fraction of white pixels required to count as content.
+
+    Returns:
+        True if the region contains text/graphics, False if it is an empty
+        margin.  Also False when the strip covers no pixels at all, i.e. the
+        range is empty, lies outside the image, or the image has zero width.
+    """
+    if y_start >= y_end:
+        return False
+    strip = inv[y_start:y_end, :]
+    pixel_count = strip.shape[0] * strip.shape[1]
+    if pixel_count == 0:
+        # Slicing past the image (or a zero-width image) yields an empty
+        # strip; treat it as "no content" instead of dividing by zero.
+        return False
+    density = float(np.sum(strip)) / (pixel_count * 255)
+    return density > min_density
+
+
+def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
+                       img_w: int, img_h: int,
+                       inv: Optional[np.ndarray] = None) -> None:
+    """Append top and bottom page regions to *regions* (mutated in place).
+
+    Boundary selection, separately for top and bottom:
+      1. the gap-based boundary from ``_detect_header_footer_gaps`` (only
+         evaluated when *inv* is given), if it is more than 10 px away from
+         the respective page edge;
+      2. otherwise the content bound (*top_y* / *bottom_y*), under the same
+         10 px condition;
+      3. otherwise no region is added for that side.
+
+    Region type:
+      - 'header' / 'footer'            — the strip contains ink according to
+        ``_region_has_content``
+      - 'margin_top' / 'margin_bottom' — the strip is empty, or *inv* was not
+        provided and content could not be checked
+    """
+    if inv is None:
+        header_y, footer_y = None, None
+    else:
+        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
+
+    # --- Top region ---
+    if header_y is not None and header_y > 10:
+        top_boundary = header_y
+    elif top_y > 10:
+        top_boundary = top_y
+    else:
+        top_boundary = None
+
+    if top_boundary is not None:
+        has_content = False if inv is None else _region_has_content(inv, 0, top_boundary)
+        rtype = 'margin_top'
+        if has_content:
+            rtype = 'header'
+        top_region = PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)
+        regions.append(top_region)
+        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
+                    f"(has_content={has_content})")
+
+    # --- Bottom region ---
+    if footer_y is not None and footer_y < img_h - 10:
+        bottom_boundary = footer_y
+    elif bottom_y < img_h - 10:
+        bottom_boundary = bottom_y
+    else:
+        bottom_boundary = None
+
+    if bottom_boundary is not None:
+        has_content = False if inv is None else _region_has_content(inv, bottom_boundary, img_h)
+        rtype = 'margin_bottom'
+        if has_content:
+            rtype = 'footer'
+        bottom_region = PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
+                                   height=img_h - bottom_boundary)
+        regions.append(bottom_region)
+        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
+                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
@@ -0,0 +1,274 @@
+"""
+Layout analysis for OCR vocabulary pages — orchestration and re-exports.
+
+This module provides the high-level entry points for layout analysis and
+re-exports all functions from sub-modules for backward compatibility.
+
+Sub-modules:
+- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
+- cv_layout_analyze: Legacy projection-based layout analysis
+- cv_layout_columns: Core column geometry detection
+- cv_layout_column_refine: Sub-column, broad-column, expand operations
+- cv_layout_rows: Row geometry detection
+- cv_layout_row_regularize: Row grid regularization
+- cv_layout_scoring: Language/role scoring, dictionary signals
+- cv_layout_classify: Column type classification (Phase B)
+- cv_layout_classify_position: Position-based classification fallbacks
+"""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion
+
+logger = logging.getLogger(__name__)
+
+
+# ── Re-exports (backward compatibility) ───────────────────────────────────
+
+from cv_layout_detection import (  # noqa: F401
+    detect_document_type,
+    create_ocr_image,
+    create_layout_image,
+    _filter_narrow_runs,
+    _find_content_bounds,
+    _detect_header_footer_gaps,
+    _region_has_content,
+    _add_header_footer,
+)
+
+from cv_layout_analyze import (  # noqa: F401
+    analyze_layout,
+)
+
+from cv_layout_columns import (  # noqa: F401
+    detect_column_geometry,
+    _detect_columns_by_clustering,
+    _build_geometries_from_starts,
+)
+
+from cv_layout_column_refine import (  # noqa: F401
+    _detect_sub_columns,
+    _split_broad_columns,
+    expand_narrow_columns,
+)
+
+from cv_layout_rows import (  # noqa: F401
+    detect_row_geometry,
+    _build_rows_from_word_grouping,
+)
+
+from cv_layout_row_regularize import (  # noqa: F401
+    _regularize_row_grid,
+)
+
+from cv_layout_scoring import (  # noqa: F401
+    _score_language,
+    _score_role,
+    _score_dictionary_signals,
+    _classify_dictionary_columns,
+)
+
+from cv_layout_classify import (  # noqa: F401
+    _build_margin_regions,
+    positional_column_regions,
+    classify_column_types,
+    _classify_by_content,
+)
+
+from cv_layout_classify_position import (  # noqa: F401
+    _classify_by_position_enhanced,
+    _classify_by_position_fallback,
+)
+
+
+# ── Orchestration Functions ───────────────────────────────────────────────
+
+def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
+    """Detect page columns from word positions and turn them into regions.
+
+    Flow:
+      1. ``detect_column_geometry()`` derives column geometries plus content
+         bounds from the page.
+      2. ``_detect_sub_columns()`` and ``_split_broad_columns()`` refine the
+         geometries; header/footer gap positions are passed to the former
+         when an inverted image is available.
+      3. ``positional_column_regions()`` converts the geometries into the
+         returned regions.
+
+    If step 1 yields nothing, the projection-based ``analyze_layout()`` is
+    used instead, on a CLAHE layout image built from *dewarped_bgr*.
+    """
+    page_h, page_w = ocr_img.shape[:2]
+
+    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
+    if geometry is None:
+        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
+        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)
+
+    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = geometry
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    if _inv is None:
+        header_y, footer_y = None, None
+    else:
+        header_y, footer_y = _detect_header_footer_gaps(_inv, page_w, page_h)
+
+    geometries = _detect_sub_columns(
+        geometries,
+        content_w,
+        left_x=left_x,
+        top_y=top_y,
+        header_y=header_y,
+        footer_y=footer_y,
+    )
+    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
+
+    regions = positional_column_regions(geometries, content_w, content_h, left_x)
+
+    # Summary for the log only; it does not influence the result.
+    col_count = sum(
+        1 for r in regions
+        if r.type.startswith('column') or r.type == 'page_ref'
+    )
+    methods = {r.classification_method for r in regions if r.classification_method}
+    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
+                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")
+
+    return regions
+
+
+def detect_column_geometry_zoned(
+    ocr_img: np.ndarray,
+    dewarped_bgr: np.ndarray,
+) -> Optional[Tuple[
+    List[ColumnGeometry],
+    int, int, int, int,
+    List[Dict],
+    np.ndarray,
+    List[Dict],
+    List[DetectedBox],
+]]:
+    """Zone-aware column geometry detection.
+
+    Steps as implemented:
+
+    1. Run detect_column_geometry() on the full page.  If it returns None,
+       this function returns None as well.
+    2. Run detect_boxes() over the detected content area.
+    3. If no boxes are found: return the full-page result together with a
+       single synthetic 'content' zone.
+    4. Otherwise split the page into zones, keep the 'content' zones that
+       are at least 40 px tall, stack their image strips vertically into one
+       combined image and re-run detect_column_geometry() on it.  If there
+       are no such zones, or the combined detection fails, the full-page
+       geometries are used instead.
+    5. Map y/height of the combined-image geometries back to page
+       coordinates (modifies the geometry objects in place).
+    6. Re-assign words to the columns, dropping every word whose center lies
+       inside a detected box.
+
+    Returns:
+        None, or the tuple
+        (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv,
+        zones_data, boxes), where word_dicts and inv always come from the
+        full-page detection and zones_data is a list of plain dicts
+        describing the zones.
+    """
+    from cv_box_detect import detect_boxes, split_page_into_zones
+
+    # Full-page detection: provides content bounds, words and a fallback result.
+    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
+    if geo_result is None:
+        return None
+
+    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)
+
+    if not boxes:
+        # No boxes: the whole content area is one zone.
+        zone_data = [{
+            "index": 0, "zone_type": "content",
+            "y": top_y, "height": content_h,
+            "x": left_x, "width": content_w, "columns": [],
+        }]
+        return (geometries, left_x, right_x, top_y, bottom_y,
+                word_dicts, inv, zone_data, boxes)
+
+    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)
+
+    # (y_start, y_end) of every content zone tall enough to be analysed.
+    content_strips: List[Tuple[int, int]] = []
+    for zone in zones:
+        if zone.zone_type == 'content' and zone.height >= 40:
+            content_strips.append((zone.y, zone.y + zone.height))
+
+    if not content_strips:
+        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
+        zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
+                       "height": content_h, "x": left_x, "width": content_w, "columns": []}]
+        return (geometries, left_x, right_x, top_y, bottom_y,
+                word_dicts, inv, zone_data, boxes)
+
+    # Stack the content strips (full image width) into one box-free image.
+    ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
+    bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
+    combined_ocr = np.vstack(ocr_strips)
+    combined_bgr = np.vstack(bgr_strips)
+
+    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
+                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")
+
+    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
+    if combined_result is not None:
+        # Only combined_geoms is used below; the other unpacked values are not read.
+        combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result
+    else:
+        logger.info("ZonedColumns: combined image column detection failed, using original")
+        combined_geoms = geometries
+
+    # Per strip: (start y in combined image, strip height, start y on the page).
+    strip_offsets: List[Tuple[int, int, int]] = []
+    cum_y = 0
+    for ys, ye in content_strips:
+        h = ye - ys
+        strip_offsets.append((cum_y, h, ys))
+        cum_y += h
+
+    def _combined_y_to_abs(cy: int) -> int:
+        # Translate a y-coordinate of the combined image into a page
+        # y-coordinate.  Values at or beyond the end of the last strip map
+        # to the bottom edge of the last strip.
+        for c_start, s_h, abs_start in strip_offsets:
+            if cy < c_start + s_h:
+                return abs_start + (cy - c_start)
+        last_c, last_h, last_abs = strip_offsets[-1]
+        return last_abs + last_h
+
+    if combined_result is not None:
+        # Rewrite y/height of the combined-image geometries in place.
+        for g in combined_geoms:
+            abs_y = _combined_y_to_abs(g.y)
+            abs_y_end = _combined_y_to_abs(g.y + g.height)
+            g.y = abs_y
+            g.height = abs_y_end - abs_y
+
+    if word_dicts:
+        # Word coordinates are offset by (left_x, top_y) to obtain the
+        # absolute center that is compared against the box rectangles.
+        content_words = []
+        for w in word_dicts:
+            w_abs_cx = w['left'] + left_x + w['width'] / 2
+            w_abs_cy = w['top'] + top_y + w['height'] / 2
+            inside_box = any(
+                box.x <= w_abs_cx <= box.x + box.width
+                and box.y <= w_abs_cy <= box.y + box.height
+                for box in boxes
+            )
+            if not inside_box:
+                content_words.append(w)
+
+        # Assign the remaining words to columns by horizontal center only
+        # (left edge inclusive, right edge exclusive).
+        target_geoms = combined_geoms if combined_result is not None else geometries
+        for g in target_geoms:
+            g_left_rel = g.x - left_x
+            g_right_rel = g_left_rel + g.width
+            g.words = [
+                w for w in content_words
+                if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
+            ]
+            g.word_count = len(g.words)
+
+        excluded_count = len(word_dicts) - len(content_words)
+        if excluded_count:
+            logger.info(
+                "ZonedColumns: enriched geometries with %d content words "
+                "(excluded %d box-interior words)",
+                len(content_words), excluded_count,
+            )
+
+    # Serialise the zones into plain dicts; "columns" is left empty here.
+    zones_data: List[Dict] = []
+    for zone in zones:
+        zone_dict: Dict = {
+            "index": zone.index,
+            "zone_type": zone.zone_type,
+            "y": zone.y,
+            "height": zone.height,
+            "x": zone.x,
+            "width": zone.width,
+            "columns": [],
+        }
+        if zone.box is not None:
+            zone_dict["box"] = {
+                "x": zone.box.x, "y": zone.box.y,
+                "width": zone.box.width, "height": zone.box.height,
+                "confidence": zone.box.confidence,
+                "border_thickness": zone.box.border_thickness,
+            }
+        zones_data.append(zone_dict)
+
+    # An empty combined result also falls back to the full-page geometries.
+    all_geometries = combined_geoms if combined_geoms else geometries
+
+    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
+                f"{len(all_geometries)} total columns (combined-image approach)")
+
+    return (all_geometries, left_x, right_x, top_y, bottom_y,
+            word_dicts, inv, zones_data, boxes)
@@ -0,0 +1,329 @@
+"""
+Row grid regularization for document layout analysis.
+
+Provides word-center-based row boundary refinement to improve
+gap-based row detection. Extracted from cv_layout_rows.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+
+from cv_vocab_types import RowGeometry
+
+logger = logging.getLogger(__name__)
+
+
+def _regularize_row_grid(
+    rows: List['RowGeometry'],
+    word_dicts: List[Dict],
+    left_x: int, right_x: int,
+    top_y: int,
+    content_w: int, content_h: int,
+    inv: np.ndarray,
+) -> List['RowGeometry']:
+    """Rebuild row boundaries from word center-lines with section-break awareness.
+
+    Instead of overlaying a rigid grid, this derives row positions bottom-up
+    from the words themselves:
+
+    Step A: Group all content words into line clusters by Y-proximity.
+        Tolerance = 40% of median gap-based row height (at least 10 px).
+    Step B: For each cluster compute:
+        - center_y = median of (word_top + word_height/2) for all words
+        - letter_h = median of word heights (excluding outliers > 2× median)
+    Step B2: Merge clusters whose centers are closer than 30% of row height
+        (at least 8 px; spurious splits from OCR jitter).
+    Step C: Compute pitches (distances between consecutive centers).
+        Detect section breaks where gap > 1.8× median pitch.
+    Step D: Split clusters into sections at the section breaks.
+    Step E: Within each section, place row boundaries at midpoints between
+        consecutive line centers:
+        - First row top = center - local_pitch/2
+        - Last row bottom = center + local_pitch/2
+        - Interior boundaries = (center_i + center_{i+1}) / 2
+        This ensures rows tile seamlessly without gaps or overlaps.
+        Single-line sections get a row centred on the line instead.
+    Step F: Re-assign words to the nearest grid row by vertical center
+        distance; a word farther than one median pitch from every row
+        center is left unassigned.
+    Step G: Validate that >= 85% of words land in a grid row; otherwise
+        fall back to the original gap-based rows.
+    Step H: Merge with preserved header/footer rows and re-index.
+
+    Guard: Requires >= 5 content rows from gap-based detection to activate.
+    This prevents the regularizer from running on very small images (e.g.
+    box sub-sessions with only a few rows) where the gap-based detection
+    is already accurate enough.
+
+    Header/footer rows from the gap-based detection are preserved.
+
+    Args:
+        rows: Gap-based rows; rows with row_type 'content' are rebuilt, all
+            others are carried over.
+        word_dicts: Not read by this function (words are taken from
+            ``rows[i].words``).
+        left_x: x-coordinate written to every rebuilt row.
+        right_x: Not read by this function.
+        top_y: Offset added to word 'top' values to obtain the y-coordinates
+            stored on the rebuilt rows.
+        content_w: Width written to every rebuilt row.
+        content_h: Content height; multi-line section rows are clamped to
+            [top_y, top_y + content_h].
+        inv: Not read by this function.
+
+    Returns:
+        The input *rows* unchanged whenever a guard or the validation fails;
+        otherwise a new list of the non-content rows plus the rebuilt
+        content rows, sorted by y and re-indexed from 0 (the ``index`` of
+        the carried-over non-content rows is updated in place).
+    """
+    content_rows = [r for r in rows if r.row_type == 'content']
+    non_content = [r for r in rows if r.row_type != 'content']
+
+    if len(content_rows) < 5:
+        return rows
+
+    # --- Step A: Group ALL words into line clusters ---
+    # Collect words that belong to content rows (deduplicated)
+    content_words: List[Dict] = []
+    seen_keys: set = set()
+    for r in content_rows:
+        for w in r.words:
+            # A word is identified by its bounding box.
+            key = (w['left'], w['top'], w['width'], w['height'])
+            if key not in seen_keys:
+                seen_keys.add(key)
+                content_words.append(w)
+
+    if len(content_words) < 5:
+        return rows
+
+    # Compute median word height (excluding outliers like tall brackets/IPA)
+    word_heights = sorted(w['height'] for w in content_words)
+    median_wh = word_heights[len(word_heights) // 2]
+
+    # Compute median gap-based row height — this is the actual line height
+    # as detected by the horizontal projection.  We use 40% of this as
+    # grouping tolerance.  This is much more reliable than using word height
+    # alone, because words on the same line can have very different heights
+    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
+    gap_row_heights = sorted(r.height for r in content_rows)
+    median_row_h = gap_row_heights[len(gap_row_heights) // 2]
+
+    # Tolerance: 40% of row height.  Words on the same line should have
+    # centers within this range.  Even if a word's bbox is taller/shorter,
+    # its center should stay within half a row height of the line center.
+    y_tol = max(10, int(median_row_h * 0.4))
+
+    # Sort by center_y, then group by proximity
+    words_by_center = sorted(content_words,
+                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
+    line_clusters: List[List[Dict]] = []
+    current_line: List[Dict] = [words_by_center[0]]
+    # The reference center is that of the first word of the current cluster;
+    # it is not updated as further words join.
+    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
+
+    for w in words_by_center[1:]:
+        w_center = w['top'] + w['height'] / 2
+        if abs(w_center - current_center) <= y_tol:
+            current_line.append(w)
+        else:
+            current_line.sort(key=lambda w: w['left'])
+            line_clusters.append(current_line)
+            current_line = [w]
+            current_center = w_center
+
+    if current_line:
+        current_line.sort(key=lambda w: w['left'])
+        line_clusters.append(current_line)
+
+    if len(line_clusters) < 3:
+        return rows
+
+    # --- Step B: Compute center_y per cluster ---
+    # center_y = median of (word_top + word_height/2) across all words in cluster
+    # letter_h = median of word heights, but excluding outlier-height words
+    #            (>2× median) so that tall brackets/IPA don't skew the height
+    cluster_info: List[Dict] = []
+    for cl_words in line_clusters:
+        centers = [w['top'] + w['height'] / 2 for w in cl_words]
+        # Filter outlier heights for letter_h computation
+        normal_heights = [w['height'] for w in cl_words
+                          if w['height'] <= median_wh * 2.0]
+        if not normal_heights:
+            # Every word is an outlier: fall back to all heights.
+            normal_heights = [w['height'] for w in cl_words]
+        center_y = float(np.median(centers))
+        letter_h = float(np.median(normal_heights))
+        cluster_info.append({
+            'center_y_rel': center_y,  # relative to content ROI
+            'center_y_abs': center_y + top_y,  # absolute
+            'letter_h': letter_h,
+            'words': cl_words,
+        })
+
+    cluster_info.sort(key=lambda c: c['center_y_rel'])
+
+    # --- Step B2: Merge clusters that are too close together ---
+    # Even with center-based grouping, some edge cases can produce
+    # spurious clusters.  Merge any pair whose centers are closer
+    # than 30% of the row height (they're definitely the same text line).
+    merge_threshold = max(8, median_row_h * 0.3)
+    merged: List[Dict] = [cluster_info[0]]
+    for cl in cluster_info[1:]:
+        prev = merged[-1]
+        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
+            # Merge: combine words, recompute center
+            combined_words = prev['words'] + cl['words']
+            centers = [w['top'] + w['height'] / 2 for w in combined_words]
+            normal_heights = [w['height'] for w in combined_words
+                              if w['height'] <= median_wh * 2.0]
+            if not normal_heights:
+                normal_heights = [w['height'] for w in combined_words]
+            prev['center_y_rel'] = float(np.median(centers))
+            prev['center_y_abs'] = prev['center_y_rel'] + top_y
+            prev['letter_h'] = float(np.median(normal_heights))
+            prev['words'] = combined_words
+        else:
+            merged.append(cl)
+
+    cluster_info = merged
+
+    if len(cluster_info) < 3:
+        return rows
+
+    # --- Step C: Compute pitches and detect section breaks ---
+    pitches: List[float] = []
+    for i in range(1, len(cluster_info)):
+        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
+        pitches.append(pitch)
+
+    if not pitches:
+        return rows
+
+    median_pitch = float(np.median(pitches))
+    # A pitch of 5 px or less is treated as unusable; keep the gap-based rows.
+    if median_pitch <= 5:
+        return rows
+
+    # A section break is where the gap between line centers is much larger
+    # than the normal pitch (sub-headings, section titles, etc.)
+    BREAK_FACTOR = 1.8
+
+    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
+    sections: List[List[Dict]] = []
+    current_section: List[Dict] = [cluster_info[0]]
+
+    for i in range(1, len(cluster_info)):
+        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
+        if gap > median_pitch * BREAK_FACTOR:
+            sections.append(current_section)
+            current_section = [cluster_info[i]]
+        else:
+            current_section.append(cluster_info[i])
+
+    if current_section:
+        sections.append(current_section)
+
+    # --- Step E: Build row boundaries per section ---
+    # Rows are created with index=0; the real index is assigned in Step H.
+    grid_rows: List[RowGeometry] = []
+
+    for section in sections:
+        if not section:
+            continue
+
+        if len(section) == 1:
+            # Single-line section (likely a heading)
+            # NOTE: unlike the multi-line branch below, this row is not
+            # clamped to the content bounds.
+            cl = section[0]
+            half_h = max(cl['letter_h'], median_pitch * 0.4)
+            row_top = cl['center_y_abs'] - half_h
+            row_bot = cl['center_y_abs'] + half_h
+            grid_rows.append(RowGeometry(
+                index=0,
+                x=left_x,
+                y=round(row_top),
+                width=content_w,
+                height=round(row_bot - row_top),
+                word_count=len(cl['words']),
+                words=cl['words'],
+                row_type='content',
+                gap_before=0,
+            ))
+            continue
+
+        # Compute local pitch for this section
+        local_pitches = []
+        for i in range(1, len(section)):
+            local_pitches.append(
+                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
+            )
+        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch
+
+        # Row boundaries are placed at midpoints between consecutive centers.
+        # First row: top = center - local_pitch/2
+        # Last row: bottom = center + local_pitch/2
+        for i, cl in enumerate(section):
+            if i == 0:
+                row_top = cl['center_y_abs'] - local_pitch / 2
+            else:
+                # Midpoint between this center and previous center
+                prev_center = section[i - 1]['center_y_abs']
+                row_top = (prev_center + cl['center_y_abs']) / 2
+
+            if i == len(section) - 1:
+                row_bot = cl['center_y_abs'] + local_pitch / 2
+            else:
+                next_center = section[i + 1]['center_y_abs']
+                row_bot = (cl['center_y_abs'] + next_center) / 2
+
+            # Clamp to reasonable bounds
+            row_top = max(top_y, row_top)
+            row_bot = min(top_y + content_h, row_bot)
+
+            # Rows thinner than 5 px after clamping are dropped.
+            if row_bot - row_top < 5:
+                continue
+
+            grid_rows.append(RowGeometry(
+                index=0,
+                x=left_x,
+                y=round(row_top),
+                width=content_w,
+                height=round(row_bot - row_top),
+                word_count=len(cl['words']),
+                words=cl['words'],
+                row_type='content',
+                gap_before=0,
+            ))
+
+    if not grid_rows:
+        return rows
+
+    # --- Step F: Re-assign words to grid rows ---
+    # Words may have shifted slightly; assign each word to the row whose
+    # center is closest to the word's vertical center.
+    for gr in grid_rows:
+        gr.words = []
+
+    for w in content_words:
+        w_center = w['top'] + top_y + w['height'] / 2
+        best_row = None
+        best_dist = float('inf')
+        for gr in grid_rows:
+            row_center = gr.y + gr.height / 2
+            dist = abs(w_center - row_center)
+            if dist < best_dist:
+                best_dist = dist
+                best_row = gr
+        # Words at least one median pitch away from every row stay unassigned.
+        if best_row is not None and best_dist < median_pitch:
+            best_row.words.append(w)
+
+    for gr in grid_rows:
+        gr.word_count = len(gr.words)
+
+    # --- Step G: Validate ---
+    words_placed = sum(gr.word_count for gr in grid_rows)
+    if len(content_words) > 0:
+        match_ratio = words_placed / len(content_words)
+        if match_ratio < 0.85:
+            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
+                        f"of words, keeping gap-based rows")
+            return rows
+
+    # Remove empty grid rows (no words assigned)
+    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
+
+    # --- Step H: Merge header/footer + re-index ---
+    result = list(non_content) + grid_rows
+    result.sort(key=lambda r: r.y)
+    for i, r in enumerate(result):
+        r.index = i
+
+    # Statistics for the log message only.
+    row_heights = [gr.height for gr in grid_rows]
+    min_h = min(row_heights) if row_heights else 0
+    max_h = max(row_heights) if row_heights else 0
+    logger.info(f"RowGrid: word-center grid applied "
+                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
+                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
+                f"{len(sections)} sections, "
+                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
+                f"was {len(content_rows)} gap-based rows)")
+
+    return result
@@ -0,0 +1,352 @@
+"""
+Row geometry detection for document layout analysis.
+
+Provides horizontal whitespace-gap analysis to detect text rows,
+word-center grid regularization, and fallback word-grouping.
+
+Extracted from cv_layout.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+from cv_vocab_types import RowGeometry
+from cv_ocr_word_assembly import _group_words_into_lines
+from cv_layout_row_regularize import _regularize_row_grid
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Row Geometry Detection (horizontal whitespace-gap analysis)
+# =============================================================================
+
+def detect_row_geometry(
+    inv: np.ndarray,
+    word_dicts: List[Dict],
+    left_x: int, right_x: int,
+    top_y: int, bottom_y: int,
+) -> List['RowGeometry']:
+    """Detect row geometry using horizontal whitespace-gap analysis.
+
+    Algorithm overview (two phases):
+
+    Phase 1 — Gap-based detection (Steps 1–6):
+      1. Build a horizontal projection profile: for each y-pixel, sum the
+         ink density across the content width. Only pixels within/near
+         Tesseract word bounding boxes contribute (word_mask), so that
+         images/illustrations don't merge adjacent text rows.
+      2. Smooth the projection and find contiguous regions below a
+         threshold (= gaps / horizontal whitespace between text lines).
+         The threshold is 15% of the median non-zero density.
+      3. Validate gaps against word bounding boxes — discard any gap
+         that overlaps a word, or shift the gap boundary to avoid the word.
+      4. Build rows from the spans between validated gaps.
+      5. Detect header/footer rows: gaps in the top/bottom 15% of the
+         page that are >= 2× the median gap size mark section boundaries.
+
+    Phase 2 — Word-center regularization (_regularize_row_grid, Step 7):
+      For each word, compute its vertical center (top + height/2).
+      Group words into line clusters by Y-proximity (tolerance = 40% of
+      the median gap-based row height).
+      For each cluster, the line center = median of all word centers.
+      The "pitch" = distance between consecutive line centers.
+      Section breaks are detected where the pitch exceeds 1.8× the median.
+      Within each section, row boundaries are placed at the midpoints
+      between consecutive line centers:
+        - Row top = midpoint to previous line center (or center - pitch/2 for first)
+        - Row bottom = midpoint to next line center (or center + pitch/2 for last)
+      This ensures rows tile without gaps or overlaps.
+
+    Fallback:
+      If < 2 gaps are found (very dense or uniform text), falls back to
+      _build_rows_from_word_grouping() which groups words by Y proximity.
+
+    Args:
+        inv: Inverted binarized image (white text on black bg, full page).
+        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
+        left_x, right_x: Absolute X bounds of the content area.
+        top_y, bottom_y: Absolute Y bounds of the content area.
+
+    Returns:
+        List of RowGeometry objects sorted top to bottom.
+    """
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    if content_h < 10 or content_w < 10:
+        logger.warning("detect_row_geometry: content area too small")
+        return []
+
+    # --- Step 1: Horizontal projection profile ---
+    # For each y-pixel row, sum ink density across the content width.
+    # A word-coverage mask ensures only pixels near Tesseract words contribute,
+    # so that illustrations/images don't inflate the density and merge rows.
+    content_strip = inv[top_y:bottom_y, left_x:right_x]
+    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
+    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
+    for wd in word_dicts:
+        y1 = max(0, wd['top'] - WORD_PAD_Y)
+        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
+        x1 = max(0, wd['left'])
+        x2 = min(content_w, wd['left'] + wd['width'])
+        word_mask[y1:y2, x1:x2] = 255
+
+    masked_strip = cv2.bitwise_and(content_strip, word_mask)
+    h_proj = np.sum(masked_strip, axis=1).astype(float)
+    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
+
+    # --- Step 2: Smoothing + gap threshold ---
+    # Smooth the projection to reduce noise, then threshold at 15% of the
+    # median non-zero density. Pixels below this threshold are considered
+    # "gap" (horizontal whitespace between text lines).
+    # MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
+    kernel_size = max(3, content_h // 200)
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
+    gap_threshold = max(median_density * 0.15, 0.003)
+
+    in_gap = h_smooth < gap_threshold
+    MIN_GAP_HEIGHT = max(3, content_h // 500)
+
+    # --- Step 3: Collect contiguous gap regions ---
+    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
+    gap_start = None
+    for y in range(len(in_gap)):
+        if in_gap[y]:
+            if gap_start is None:
+                gap_start = y
+        else:
+            if gap_start is not None:
+                gap_height = y - gap_start
+                if gap_height >= MIN_GAP_HEIGHT:
+                    raw_gaps.append((gap_start, y))
+                gap_start = None
+    if gap_start is not None:
+        gap_height = len(in_gap) - gap_start
+        if gap_height >= MIN_GAP_HEIGHT:
+            raw_gaps.append((gap_start, len(in_gap)))
+
+    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
+                f"min_height={MIN_GAP_HEIGHT}px)")
+
+    # --- Step 4: Validate gaps against word bounding boxes ---
+    # A gap is valid only if no word's bounding box overlaps it vertically.
+    # If a word overlaps, try to shift the gap boundary above or below the
+    # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
+    validated_gaps = []
+    for gap_start_rel, gap_end_rel in raw_gaps:
+        overlapping = False
+        for wd in word_dicts:
+            word_top = wd['top']
+            word_bottom = wd['top'] + wd['height']
+            if word_top < gap_end_rel and word_bottom > gap_start_rel:
+                overlapping = True
+                break
+
+        if not overlapping:
+            validated_gaps.append((gap_start_rel, gap_end_rel))
+        else:
+            # Try to shift the gap to avoid overlapping words
+            min_word_top = content_h
+            max_word_bottom = 0
+            for wd in word_dicts:
+                word_top = wd['top']
+                word_bottom = wd['top'] + wd['height']
+                if word_top < gap_end_rel and word_bottom > gap_start_rel:
+                    min_word_top = min(min_word_top, word_top)
+                    max_word_bottom = max(max_word_bottom, word_bottom)
+
+            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
+                validated_gaps.append((gap_start_rel, min_word_top))
+            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
+                validated_gaps.append((max_word_bottom, gap_end_rel))
+            else:
+                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                             f"discarded (word overlap, no room to shift)")
+
+    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")
+
+    # --- Fallback if too few gaps ---
+    if len(validated_gaps) < 2:
+        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
+        return _build_rows_from_word_grouping(
+            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
+        )
+
+    validated_gaps.sort(key=lambda g: g[0])
+
+    # --- Step 5: Header/footer detection via gap size ---
+    HEADER_FOOTER_ZONE = 0.15
+    GAP_MULTIPLIER = 2.0
+
+    gap_sizes = [g[1] - g[0] for g in validated_gaps]
+    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
+    large_gap_threshold = median_gap * GAP_MULTIPLIER
+
+    header_boundary_rel = None  # y below which is header
+    footer_boundary_rel = None  # y above which is footer
+
+    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
+    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))
+
+    # Find largest gap in header zone
+    best_header_gap = None
+    for gs, ge in validated_gaps:
+        gap_mid = (gs + ge) / 2
+        gap_size = ge - gs
+        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
+            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
+                best_header_gap = (gs, ge)
+
+    if best_header_gap is not None:
+        header_boundary_rel = best_header_gap[1]
+        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
+                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
+                    f"median_gap={median_gap:.0f}px)")
+
+    # Find largest gap in footer zone
+    best_footer_gap = None
+    for gs, ge in validated_gaps:
+        gap_mid = (gs + ge) / 2
+        gap_size = ge - gs
+        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
+            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
+                best_footer_gap = (gs, ge)
+
+    if best_footer_gap is not None:
+        footer_boundary_rel = best_footer_gap[0]
+        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
+                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")
+
+    # --- Step 6: Build RowGeometry objects from gaps ---
+    # Rows are the spans between consecutive gaps. The gap midpoints define
+    # where one row ends and the next begins. Each row's height extends
+    # from the end of the previous gap to the start of the next gap.
+    row_boundaries = []  # (start_y_rel, end_y_rel)
+
+    # Top of content to first gap
+    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
+        row_boundaries.append((0, validated_gaps[0][0]))
+
+    # Between gaps
+    for i in range(len(validated_gaps) - 1):
+        row_start = validated_gaps[i][1]
+        row_end = validated_gaps[i + 1][0]
+        if row_end - row_start > 0:
+            row_boundaries.append((row_start, row_end))
+
+    # Last gap to bottom of content
+    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
+        row_boundaries.append((validated_gaps[-1][1], content_h))
+
+    rows = []
+    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
+        # Determine row type
+        row_mid = (row_start_rel + row_end_rel) / 2
+        if header_boundary_rel is not None and row_mid < header_boundary_rel:
+            row_type = 'header'
+        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
+            row_type = 'footer'
+        else:
+            row_type = 'content'
+
+        # Collect words in this row
+        row_words = [w for w in word_dicts
+                     if w['top'] + w['height'] / 2 >= row_start_rel
+                     and w['top'] + w['height'] / 2 < row_end_rel]
+
+        # Gap before this row
+        gap_before = 0
+        if idx == 0 and validated_gaps[0][0] > 0:
+            gap_before = validated_gaps[0][0]
+        elif idx > 0:
+            # Find the gap just before this row boundary
+            for gs, ge in validated_gaps:
+                if ge == row_start_rel:
+                    gap_before = ge - gs
+                    break
+
+        rows.append(RowGeometry(
+            index=idx,
+            x=left_x,
+            y=top_y + row_start_rel,
+            width=content_w,
+            height=row_end_rel - row_start_rel,
+            word_count=len(row_words),
+            words=row_words,
+            row_type=row_type,
+            gap_before=gap_before,
+        ))
+
+    # --- Step 7: Word-center grid regularization ---
+    # Refine the gap-based rows using word vertical centers. For each word,
+    # compute center_y = top + height/2. Group into line clusters, compute
+    # the pitch (distance between consecutive line centers), and place row
+    # boundaries at the midpoints between centers. This gives more precise
+    # and evenly-spaced rows than the gap-based approach alone.
+    # Also detects section breaks (headings, paragraphs) where the pitch
+    # exceeds 1.8× the median, and handles each section independently.
+    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
+                                content_w, content_h, inv)
+
+    type_counts = {}
+    for r in rows:
+        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
+    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")
+
+    return rows
+
+
+def _build_rows_from_word_grouping(
+    word_dicts: List[Dict],
+    left_x: int, right_x: int,
+    top_y: int, bottom_y: int,
+    content_w: int, content_h: int,
+) -> List['RowGeometry']:
+    """Fallback: build rows by grouping words by Y position.
+
+    Uses _group_words_into_lines() with a generous tolerance.
+    No header/footer detection in fallback mode.
+    """
+    if not word_dicts:
+        return []
+
+    y_tolerance = max(20, content_h // 100)
+    lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
+
+    rows = []
+    for idx, line_words in enumerate(lines):
+        if not line_words:
+            continue
+        min_top = min(w['top'] for w in line_words)
+        max_bottom = max(w['top'] + w['height'] for w in line_words)
+        row_height = max_bottom - min_top
+
+        rows.append(RowGeometry(
+            index=idx,
+            x=left_x,
+            y=top_y + min_top,
+            width=content_w,
+            height=row_height,
+            word_count=len(line_words),
+            words=line_words,
+            row_type='content',
+            gap_before=0,
+        ))
+
+    logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
+    return rows
@@ -0,0 +1,441 @@
+"""
+Language scoring, role scoring, and dictionary detection/classification.
+
+Extracted from cv_layout.py to keep modules under 500 LOC.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from collections import Counter
+from typing import Any, Dict, List, Optional
+
+from cv_vocab_types import (
+    ColumnGeometry,
+    ENGLISH_FUNCTION_WORDS,
+    GERMAN_FUNCTION_WORDS,
+    PageRegion,
+)
+
+logger = logging.getLogger(__name__)
+
+# --- Dictionary / Wörterbuch Detection ---
+
+# Lower-case article words used by _score_dictionary_signals() to recognise
+# dictionary pages. They are matched against stripped, lower-cased OCR text in
+# three ways: as the content of a dedicated narrow article column, as a prefix
+# of multi-word entries ("der Zustand"), and as stand-alone words within a
+# column.
+_DICT_ARTICLE_WORDS = {
+    # German definite/indefinite articles (including inflected forms)
+    "die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
+    # English articles and the infinitive marker "to"
+    "the", "a", "an", "to",
+}
+
+
+# --- Phase B: Content-Based Classification ---
+
+def _score_language(words: List[Dict]) -> Dict[str, float]:
+    """Score the language of a column's words.
+
+    Analyzes function words, umlauts, and capitalization patterns
+    to determine whether text is English or German.
+
+    Args:
+        words: List of word dicts with 'text' and 'conf' keys.
+
+    Returns:
+        Dict with 'eng' and 'deu' scores (0.0-1.0).
+    """
+    if not words:
+        return {'eng': 0.0, 'deu': 0.0}
+
+    # Only consider words with decent confidence
+    good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
+    if not good_words:
+        return {'eng': 0.0, 'deu': 0.0}
+
+    total = len(good_words)
+    en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
+    de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
+
+    # Check for umlauts (strong German signal)
+    raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
+    umlaut_count = sum(1 for t in raw_texts
+                       for c in t if c in 'äöüÄÖÜß')
+
+    # German capitalization: nouns are capitalized mid-sentence
+    # Count words that start with uppercase but aren't at position 0
+    cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
+
+    en_score = en_hits / total if total > 0 else 0.0
+    de_score = de_hits / total if total > 0 else 0.0
+
+    # Boost German score for umlauts
+    if umlaut_count > 0:
+        de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
+
+    # Boost German score for high capitalization ratio (typical for German nouns)
+    if total > 5:
+        cap_ratio = cap_words / total
+        if cap_ratio > 0.3:
+            de_score = min(1.0, de_score + 0.1)
+
+    return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
+
+
+def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
+    """Score the role of a column based on its geometry and content patterns.
+
+    Args:
+        geom: ColumnGeometry with words and dimensions.
+
+    Returns:
+        Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
+    """
+    scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
+
+    if not geom.words:
+        return scores
+
+    texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
+    if not texts:
+        return scores
+
+    avg_word_len = sum(len(t) for t in texts) / len(texts)
+    has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
+    digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
+    digit_ratio = digit_words / len(texts) if texts else 0.0
+
+    # Reference: narrow + mostly numbers/page references
+    if geom.width_ratio < 0.12:
+        scores['reference'] = 0.5
+        if digit_ratio > 0.4:
+            scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
+
+    # Marker: narrow + few short entries
+    if geom.width_ratio < 0.06 and geom.word_count <= 15:
+        scores['marker'] = 0.7
+        if avg_word_len < 4:
+            scores['marker'] = 0.9
+    # Very narrow non-edge column → strong marker regardless of word count
+    if geom.width_ratio < 0.04 and geom.index > 0:
+        scores['marker'] = max(scores['marker'], 0.9)
+
+    # Sentence: longer words + punctuation present
+    if geom.width_ratio > 0.15 and has_punctuation > 2:
+        scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
+        if avg_word_len > 4:
+            scores['sentence'] = min(1.0, scores['sentence'] + 0.2)
+
+    # Vocabulary: medium width + medium word length
+    if 0.10 < geom.width_ratio < 0.45:
+        scores['vocabulary'] = 0.4
+        if 3 < avg_word_len < 8:
+            scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)
+
+    return {k: round(v, 3) for k, v in scores.items()}
+
+
+def _score_dictionary_signals(
+    geometries: List[ColumnGeometry],
+    document_category: Optional[str] = None,
+    margin_strip_detected: bool = False,
+) -> Dict[str, Any]:
+    """Score dictionary-specific patterns across all columns.
+
+    Combines 4 independent signals to determine if the page is a dictionary:
+      1. Alphabetical ordering of words in each column
+      2. Article column detection (der/die/das, to)
+      3. First-letter uniformity (most headwords share a letter)
+      4. Decorative A-Z margin strip (detected upstream)
+
+    Args:
+        geometries: List of ColumnGeometry with words.
+        document_category: User-selected category (e.g. 'woerterbuch').
+        margin_strip_detected: Whether a decorative A-Z margin strip was found.
+
+    Returns:
+        Dict with 'is_dictionary', 'confidence', 'article_col_index',
+        'headword_col_index', and 'signals' sub-dict.
+    """
+    result: Dict[str, Any] = {
+        "is_dictionary": False,
+        "confidence": 0.0,
+        "article_col_index": None,
+        "headword_col_index": None,
+        "signals": {},
+    }
+
+    if not geometries or len(geometries) < 2:
+        return result
+
+    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
+    best_alpha_score = 0.0
+    best_alpha_col = -1
+    for geom in geometries:
+        texts = [
+            w["text"].strip().lower()
+            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
+            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
+        ]
+        if len(texts) < 5:
+            continue
+        # Deduplicate consecutive identical words (OCR double-reads)
+        deduped = [texts[0]]
+        for t in texts[1:]:
+            if t != deduped[-1]:
+                deduped.append(t)
+        if len(deduped) < 5:
+            continue
+        # Count consecutive pairs in alphabetical order
+        ordered_pairs = sum(
+            1 for i in range(len(deduped) - 1)
+            if deduped[i] <= deduped[i + 1]
+        )
+        alpha_score = ordered_pairs / (len(deduped) - 1)
+        if alpha_score > best_alpha_score:
+            best_alpha_score = alpha_score
+            best_alpha_col = geom.index
+
+    result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
+    result["signals"]["alphabetical_col"] = best_alpha_col
+
+    # --- Signal 2: Article detection (weight 0.25) ---
+    # Check three patterns:
+    # (a) Dedicated narrow article column (der/die/das only)
+    # (b) Inline articles: multi-word texts starting with "der X", "die X"
+    # (c) High article word frequency: many individual words ARE articles
+    #     (common when OCR splits "der Zustand" into separate word_boxes)
+    best_article_density = 0.0
+    best_article_col = -1
+    best_inline_article_ratio = 0.0
+    best_article_word_ratio = 0.0
+
+    for geom in geometries:
+        texts = [
+            w["text"].strip().lower()
+            for w in geom.words
+            if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0
+        ]
+        if len(texts) < 3:
+            continue
+
+        # (a) Dedicated article column: narrow, mostly article words
+        article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
+        if geom.width_ratio <= 0.20:
+            density = article_count / len(texts)
+            if density > best_article_density:
+                best_article_density = density
+                best_article_col = geom.index
+
+        # (b) Inline articles: "der Zustand", "die Zutat", etc.
+        inline_count = sum(
+            1 for t in texts
+            if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS)
+        )
+        inline_ratio = inline_count / len(texts)
+        if inline_ratio > best_inline_article_ratio:
+            best_inline_article_ratio = inline_ratio
+
+        # (c) Article word frequency in any column (for OCR-split word_boxes)
+        # In dictionaries, articles appear frequently among headwords
+        # Require at least 10% articles and >= 3 article words
+        if article_count >= 3:
+            art_ratio = article_count / len(texts)
+            # Only count if column has enough non-article words too
+            # (pure article column is handled by (a))
+            non_art = len(texts) - article_count
+            if non_art >= 3 and art_ratio > best_article_word_ratio:
+                best_article_word_ratio = art_ratio
+
+    # Use the strongest signal
+    effective_article_score = max(
+        best_article_density,
+        best_inline_article_ratio,
+        best_article_word_ratio * 0.8,  # slight discount for raw word ratio
+    )
+
+    result["signals"]["article_density"] = round(best_article_density, 3)
+    result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
+    result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
+    result["signals"]["article_col"] = best_article_col
+
+    # --- Signal 3: First-letter uniformity (weight 0.25) ---
+    best_uniformity = 0.0
+    best_uniform_col = -1
+    has_letter_transition = False
+    for geom in geometries:
+        texts = [
+            w["text"].strip().lower()
+            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
+            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
+        ]
+        if len(texts) < 5:
+            continue
+        # Count first letters
+        first_letters = [t[0] for t in texts if t[0].isalpha()]
+        if not first_letters:
+            continue
+        letter_counts = Counter(first_letters)
+        most_common_letter, most_common_count = letter_counts.most_common(1)[0]
+        uniformity = most_common_count / len(first_letters)
+
+        # Check for orderly letter transitions (A→B or Y→Z)
+        # Group consecutive words by first letter, check if groups are in order
+        groups = []
+        current_letter = first_letters[0]
+        for fl in first_letters:
+            if fl != current_letter:
+                groups.append(current_letter)
+                current_letter = fl
+        groups.append(current_letter)
+        if len(groups) >= 2 and len(groups) <= 5:
+            # Check if groups are alphabetically ordered
+            if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)):
+                has_letter_transition = True
+                # Boost uniformity for orderly transitions
+                uniformity = max(uniformity, 0.70)
+
+        if uniformity > best_uniformity:
+            best_uniformity = uniformity
+            best_uniform_col = geom.index
+
+    result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
+    result["signals"]["uniform_col"] = best_uniform_col
+    result["signals"]["has_letter_transition"] = has_letter_transition
+
+    # --- Signal 4: Decorative margin strip (weight 0.15) ---
+    result["signals"]["margin_strip_detected"] = margin_strip_detected
+
+    # --- Combine signals ---
+    s1 = min(best_alpha_score, 1.0) * 0.35
+    s2 = min(effective_article_score, 1.0) * 0.25
+    s3 = min(best_uniformity, 1.0) * 0.25
+    s4 = (1.0 if margin_strip_detected else 0.0) * 0.15
+
+    combined = s1 + s2 + s3 + s4
+
+    # Boost if user set document_category to 'woerterbuch'
+    if document_category == "woerterbuch":
+        combined = min(1.0, combined + 0.20)
+        result["signals"]["category_boost"] = True
+
+    result["confidence"] = round(combined, 3)
+
+    # Threshold: combined >= 0.40 to classify as dictionary
+    # (at least 2 strong signals or 3 moderate ones)
+    if combined >= 0.40:
+        result["is_dictionary"] = True
+        # Identify headword column: best alphabetical OR best uniform
+        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
+            result["headword_col_index"] = best_alpha_col
+        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
+            result["headword_col_index"] = best_uniform_col
+        if best_article_col >= 0 and best_article_density >= 0.30:
+            result["article_col_index"] = best_article_col
+        # If inline articles are strong but no dedicated column, note it
+        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
+            result["signals"]["inline_articles_detected"] = True
+
+    logger.info(
+        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
+        combined, result["is_dictionary"], result["signals"],
+    )
+
+    return result
+
+
+def _classify_dictionary_columns(
+    geometries: List[ColumnGeometry],
+    dict_signals: Dict[str, Any],
+    lang_scores: List[Dict[str, float]],
+    content_h: int,
+) -> Optional[List[PageRegion]]:
+    """Classify columns for a detected dictionary page.
+
+    Assigns column_headword, column_article, column_ipa, and
+    column_de/column_en based on dictionary signals and language scores.
+
+    Returns None if classification fails.
+    """
+    if not dict_signals.get("is_dictionary"):
+        return None
+
+    regions: List[PageRegion] = []
+    assigned = set()
+    article_idx = dict_signals.get("article_col_index")
+    headword_idx = dict_signals.get("headword_col_index")
+
+    # 1. Assign article column if detected
+    if article_idx is not None:
+        for geom in geometries:
+            if geom.index == article_idx:
+                regions.append(PageRegion(
+                    type="column_article",
+                    x=geom.x, y=geom.y,
+                    width=geom.width, height=content_h,
+                    classification_confidence=round(
+                        dict_signals["signals"].get("article_density", 0.5), 2),
+                    classification_method="dictionary",
+                ))
+                assigned.add(geom.index)
+                break
+
+    # 2. Assign headword column
+    if headword_idx is not None and headword_idx not in assigned:
+        for geom in geometries:
+            if geom.index == headword_idx:
+                regions.append(PageRegion(
+                    type="column_headword",
+                    x=geom.x, y=geom.y,
+                    width=geom.width, height=content_h,
+                    classification_confidence=round(
+                        dict_signals["confidence"], 2),
+                    classification_method="dictionary",
+                ))
+                assigned.add(geom.index)
+                break
+
+    # 3. Assign remaining columns by language + content
+    remaining = [g for g in geometries if g.index not in assigned]
+    for geom in remaining:
+        ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}
+
+        # Check if column contains IPA (brackets like [, /, ˈ)
+        ipa_chars = sum(
+            1 for w in geom.words
+            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
+        )
+        ipa_ratio = ipa_chars / max(len(geom.words), 1)
+
+        if ipa_ratio > 0.25:
+            col_type = "column_ipa"
+            conf = round(min(1.0, ipa_ratio), 2)
+        elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
+            col_type = "column_de"
+            conf = round(ls["deu"], 2)
+        elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
+            col_type = "column_en"
+            conf = round(ls["eng"], 2)
+        else:
+            # Positional fallback: leftmost unassigned = EN, next = DE
+            left_unassigned = sorted(
+                [g for g in remaining if g.index not in assigned],
+                key=lambda g: g.x,
+            )
+            if geom == left_unassigned[0] if left_unassigned else None:
+                col_type = "column_en"
+            else:
+                col_type = "column_de"
+            conf = 0.4
+
+        regions.append(PageRegion(
+            type=col_type,
+            x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=conf,
+            classification_method="dictionary",
+        ))
+        assigned.add(geom.index)
+
+    regions.sort(key=lambda r: r.x)
+    return regions
@@ -0,0 +1,37 @@
+"""
+CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
+
+Re-export facade — all logic lives in the sub-modules:
+
+  cv_vocab_types      Dataklassen, Konstanten, IPA, Feature-Flags
+  cv_preprocessing    Bild-I/O, Orientierung, Deskew, Dewarp
+  cv_layout           Dokumenttyp, Spalten, Zeilen, Klassifikation
+  cv_ocr_engines      OCR-Engines, Vocab-Postprocessing, Text-Cleaning
+  cv_cell_grid        Cell-Grid (v2 + Legacy), Vocab-Konvertierung
+  cv_review           LLM/Spell Review, Pipeline-Orchestrierung
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+from cv_vocab_types import *       # noqa: F401,F403
+from cv_preprocessing import *     # noqa: F401,F403
+from cv_layout import *            # noqa: F401,F403
+from cv_ocr_engines import *       # noqa: F401,F403
+from cv_cell_grid import *         # noqa: F401,F403
+from cv_box_detect import *         # noqa: F401,F403
+from cv_review import *            # noqa: F401,F403
+
+# Private names used by consumers — not covered by wildcard re-exports.
+from cv_preprocessing import _apply_shear  # noqa: F401
+from cv_layout import (  # noqa: F401
+    _detect_header_footer_gaps,
+    _detect_sub_columns,
+    _split_broad_columns,
+)
+from cv_ocr_engines import (  # noqa: F401
+    _fix_character_confusion,
+    _fix_phonetic_brackets,
+)
+from cv_cell_grid import _cells_to_vocab_entries  # noqa: F401
+from cv_words_first import build_grid_from_words  # noqa: F401
@@ -0,0 +1,437 @@
+"""
+CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any, Dict, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Deskew via Hough Lines
+# =============================================================================
+
+def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
+    """Correct small page rotation using probabilistic Hough line detection.
+
+    The image is binarised (inverted Otsu), long line segments are detected,
+    and the median angle of all segments within +/-15 degrees of horizontal
+    is taken as the rotation estimate. The estimate is clamped to +/-5
+    degrees and estimates below 0.1 degrees are ignored.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Tuple of (corrected image, applied rotation angle in degrees). The
+        input image and 0.0 are returned when OpenCV is unavailable, when
+        fewer than three segments are found, or when the estimate is
+        negligible.
+    """
+    # `cv2` is None when its guarded import failed; bail out like the
+    # word-alignment variant does instead of raising AttributeError.
+    if not CV2_AVAILABLE or cv2 is None:
+        return img, 0.0
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Only segments spanning at least a quarter of the image width count.
+    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
+                            minLineLength=img.shape[1] // 4, maxLineGap=20)
+
+    if lines is None or len(lines) < 3:
+        return img, 0.0
+
+    # Keep near-horizontal segments; steeper ones are not text baselines.
+    angles = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+        if abs(angle) < 15:
+            angles.append(angle)
+
+    if not angles:
+        return img, 0.0
+
+    median_angle = float(np.median(angles))
+
+    # Clamp to +/-5 degrees; float() keeps the result a plain Python float.
+    if abs(median_angle) > 5.0:
+        median_angle = float(5.0 * np.sign(median_angle))
+
+    if abs(median_angle) < 0.1:
+        return img, 0.0
+
+    h, w = img.shape[:2]
+    center = (w // 2, h // 2)
+    # A baseline sloping by +a degrees in image coordinates is levelled by a
+    # counter-clockwise (positive) OpenCV rotation of the same magnitude.
+    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+
+    logger.info(f"Deskew: corrected {median_angle:.2f}\u00b0 rotation")
+    return corrected, median_angle
+
+
+# =============================================================================
+# Deskew via Word Alignment
+# =============================================================================
+
+def deskew_image_by_word_alignment(
+    image_data: bytes,
+    lang: str = "eng+deu",
+    downscale_factor: float = 0.5,
+) -> Tuple[bytes, float]:
+    """Correct rotation by fitting a line through left-most word starts per text line.
+
+    A quick Tesseract pass on a downscaled copy yields word boxes. For every
+    text line the left-most word is taken; the points that lie within 3% of
+    the image width around the median x form the left margin. A straight
+    line x = slope * y + c is fitted through them and the image is rotated so
+    that this margin becomes vertical.
+
+    Args:
+        image_data: Raw image bytes (PNG/JPEG).
+        lang: Tesseract language string for the quick pass.
+        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
+
+    Returns:
+        Tuple of (rotated image as PNG bytes, applied rotation angle in
+        degrees, clamped to +/-5). The original bytes and 0.0 are returned
+        when the dependencies are missing, the image cannot be decoded, OCR
+        fails, fewer than five usable lines/points exist, or the angle is
+        below 0.05 degrees.
+    """
+    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
+        return image_data, 0.0
+
+    # A non-positive factor would produce an empty resize target and a
+    # division by zero when scaling coordinates back.
+    if downscale_factor <= 0:
+        logger.warning("deskew_by_word_alignment: invalid downscale_factor, skipping")
+        return image_data, 0.0
+
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img is None:
+        logger.warning("deskew_by_word_alignment: could not decode image")
+        return image_data, 0.0
+
+    orig_h, orig_w = img.shape[:2]
+
+    # Never ask cv2.resize for a zero-sized target.
+    small_w = max(1, int(orig_w * downscale_factor))
+    small_h = max(1, int(orig_h * downscale_factor))
+    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
+
+    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
+    try:
+        data = pytesseract.image_to_data(
+            pil_small, lang=lang, config="--psm 6 --oem 3",
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception as e:
+        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
+        return image_data, 0.0
+
+    # Group usable words (non-empty text, confidence >= 20) by text line.
+    line_groups: Dict[tuple, list] = defaultdict(list)
+    for i in range(len(data["text"])):
+        text = (data["text"][i] or "").strip()
+        try:
+            # Tolerate confidences delivered as float or numeric string.
+            conf = int(float(data["conf"][i]))
+        except (TypeError, ValueError):
+            continue
+        if not text or conf < 20:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        line_groups[key].append(i)
+
+    if len(line_groups) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
+        return image_data, 0.0
+
+    # One point per line: left edge and vertical centre of the left-most
+    # word, scaled back to full-resolution coordinates.
+    scale = 1.0 / downscale_factor
+    points = []
+    for indices in line_groups.values():
+        best_idx = min(indices, key=lambda i: data["left"][i])
+        lx = data["left"][best_idx] * scale
+        top = data["top"][best_idx] * scale
+        h = data["height"][best_idx] * scale
+        cy = top + h / 2.0
+        points.append((lx, cy))
+
+    xs = np.array([p[0] for p in points])
+    ys = np.array([p[1] for p in points])
+    median_x = float(np.median(xs))
+    tolerance = orig_w * 0.03
+
+    # Drop indented lines and other columns: keep points near the median x.
+    mask = np.abs(xs - median_x) <= tolerance
+    filtered_xs = xs[mask]
+    filtered_ys = ys[mask]
+
+    if len(filtered_xs) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
+        return image_data, 0.0
+
+    # slope = dx/dy of the left margin.
+    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
+    slope = coeffs[0]
+    margin_tilt_deg = float(np.degrees(np.arctan(slope)))
+
+    # A margin drifting right with increasing y (positive slope) means the
+    # page is turned counter-clockwise. OpenCV treats positive angles as
+    # counter-clockwise, so the correcting rotation is the NEGATED tilt;
+    # rotating by +tilt would double the skew.
+    angle_deg = max(-5.0, min(5.0, -margin_tilt_deg))
+
+    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
+                f"(total lines: {len(line_groups)})")
+
+    if abs(angle_deg) < 0.05:
+        return image_data, 0.0
+
+    center = (orig_w // 2, orig_h // 2)
+    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
+                              flags=cv2.INTER_LINEAR,
+                              borderMode=cv2.BORDER_REPLICATE)
+
+    success, png_buf = cv2.imencode(".png", rotated)
+    if not success:
+        logger.warning("deskew_by_word_alignment: PNG encoding failed")
+        return image_data, 0.0
+
+    return png_buf.tobytes(), angle_deg
+
+
+# =============================================================================
+# Projection Gradient Scoring
+# =============================================================================
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+    """Return the sum of squared first differences of a projection profile.
+
+    Larger values mean sharper jumps between neighbouring bins.
+    """
+    steps = np.diff(profile)
+    return float(np.square(steps).sum())
+
+
+# =============================================================================
+# Iterative Deskew (Vertical-Edge Projection)
+# =============================================================================
+
+def deskew_image_iterative(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+    coarse_step: float = 0.1,
+    fine_range: float = 0.15,
+    fine_step: float = 0.02,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Iterative deskew using vertical-edge projection optimisation.
+
+    A horizontal-gradient (Sobel-x) edge map of the central image region is
+    rotated through a coarse and then a fine range of candidate angles. For
+    every candidate the column sums of the edge map are scored with
+    `_projection_gradient_score`; the highest-scoring angle wins and is
+    applied to the full image (clamped to +/-5 degrees).
+
+    Args:
+        img: BGR image (full resolution).
+        coarse_range: half-range in degrees for the coarse sweep.
+        coarse_step: step size in degrees for the coarse sweep.
+        fine_range: half-range around the coarse winner for the fine sweep.
+        fine_step: step size in degrees for the fine sweep.
+
+    Returns:
+        (rotated_bgr, angle_degrees, debug_dict). The input image and 0.0
+        are returned when OpenCV is unavailable, the image is too small to
+        score, no edges are found, or the winning angle is below 0.05
+        degrees. In the first three cases debug_dict holds an "error" key.
+    """
+    if not CV2_AVAILABLE or cv2 is None:
+        return img, 0.0, {"error": "opencv not available"}
+
+    h, w = img.shape[:2]
+    debug: Dict[str, Any] = {}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Work on the central 70% x 80% only, away from the page borders.
+    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
+    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
+    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
+
+    crop_h, crop_w = gray_crop.shape[:2]
+    crop_center = (crop_w // 2, crop_h // 2)
+
+    # Margin cut off after every trial rotation so replicated border pixels
+    # do not enter the profile.
+    trim_y = max(4, int(crop_h * 0.03))
+    trim_x = max(4, int(crop_w * 0.03))
+
+    # If nothing (or a single column) survives trimming, every candidate
+    # scores 0 and max() would pick the first angle, i.e. -coarse_range.
+    # Refuse to rotate such images instead.
+    if crop_h <= 2 * trim_y or crop_w < 2 * trim_x + 2:
+        return img, 0.0, {"error": "image too small"}
+
+    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
+    edges = np.abs(sobel_x)
+    edge_max = edges.max()
+    if edge_max > 0:
+        # Normalise to 0..255 so the uint8 conversion cannot overflow.
+        edges = (edges / edge_max * 255).astype(np.uint8)
+    else:
+        return img, 0.0, {"error": "no edges detected"}
+
+    def _sweep_edges(angles: np.ndarray) -> list:
+        """Score every candidate angle; returns a list of (angle, score)."""
+        results = []
+        for angle in angles:
+            if abs(angle) < 1e-6:
+                rotated = edges
+            else:
+                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
+                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_REPLICATE)
+            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
+            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
+            score = _projection_gradient_score(v_profile)
+            results.append((float(angle), score))
+        return results
+
+    # The half-step added to the upper bound makes np.arange include it.
+    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
+    coarse_results = _sweep_edges(coarse_angles)
+    best_coarse = max(coarse_results, key=lambda x: x[1])
+    best_coarse_angle, best_coarse_score = best_coarse
+
+    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
+    debug["coarse_best_score"] = round(best_coarse_score, 1)
+    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
+
+    fine_lo = best_coarse_angle - fine_range
+    fine_hi = best_coarse_angle + fine_range
+    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
+    fine_results = _sweep_edges(fine_angles)
+    best_fine = max(fine_results, key=lambda x: x[1])
+    best_fine_angle, best_fine_score = best_fine
+
+    debug["fine_best_angle"] = round(best_fine_angle, 2)
+    debug["fine_best_score"] = round(best_fine_score, 1)
+    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
+
+    # The fine sweep may step slightly past the coarse range; clamp it.
+    final_angle = best_fine_angle
+    final_angle = max(-5.0, min(5.0, final_angle))
+
+    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")
+
+    if abs(final_angle) < 0.05:
+        return img, 0.0, debug
+
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h),
+                              flags=cv2.INTER_LINEAR,
+                              borderMode=cv2.BORDER_REPLICATE)
+
+    return rotated, final_angle, debug
+
+
+# =============================================================================
+# Text-Line Slope Measurement
+# =============================================================================
+
+def _measure_textline_slope(img: np.ndarray) -> float:
+    """Estimate the remaining text-line slope in degrees from OCR word centres.
+
+    Words (at least two characters, confidence >= 30) are grouped by
+    Tesseract line. For each line with three or more words spanning at least
+    15% of the image width, a least-squares line y = m * x + b is fitted
+    through the word centres. The result is the trimmed mean of the per-line
+    angles; 0.0 is returned when dependencies are missing or fewer than
+    three lines qualify.
+    """
+    from math import atan, degrees
+
+    if not (TESSERACT_AVAILABLE and CV2_AVAILABLE):
+        return 0.0
+
+    _, img_w = img.shape[:2]
+    mono = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    ocr = pytesseract.image_to_data(
+        Image.fromarray(mono),
+        output_type=pytesseract.Output.DICT,
+        config="--psm 6",
+    )
+
+    # Word centres collected per (block, paragraph, line) triple.
+    centres_by_line: Dict[tuple, list] = defaultdict(list)
+    for idx, raw_text in enumerate(ocr["text"]):
+        word = (raw_text or "").strip()
+        if len(word) < 2 or int(ocr["conf"][idx]) < 30:
+            continue
+        line_id = (ocr["block_num"][idx], ocr["par_num"][idx], ocr["line_num"][idx])
+        centre_x = ocr["left"][idx] + ocr["width"][idx] / 2.0
+        centre_y = ocr["top"][idx] + ocr["height"][idx] / 2.0
+        centres_by_line[line_id].append((centre_x, centre_y))
+
+    line_angles: list = []
+    for centres in centres_by_line.values():
+        if len(centres) < 3:
+            continue
+        centres.sort(key=lambda c: c[0])
+        col_x = np.array([c[0] for c in centres], dtype=np.float64)
+        col_y = np.array([c[1] for c in centres], dtype=np.float64)
+        # Short lines give unstable slopes; require 15% of the image width.
+        if col_x[-1] - col_x[0] < img_w * 0.15:
+            continue
+        design = np.vstack([col_x, np.ones_like(col_x)]).T
+        gradient = np.linalg.lstsq(design, col_y, rcond=None)[0][0]
+        line_angles.append(degrees(atan(gradient)))
+
+    if len(line_angles) < 3:
+        return 0.0
+
+    # Trimmed mean: drop the lowest and highest ~10% (at least one each)
+    # whenever enough values remain.
+    line_angles.sort()
+    cut = max(1, len(line_angles) // 10)
+    core = line_angles[cut:-cut] if len(line_angles) > 2 * cut else line_angles
+    if not core:
+        return 0.0
+
+    return sum(core) / len(core)
+
+
+# =============================================================================
+# Two-Pass Deskew
+# =============================================================================
+
+def deskew_two_pass(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Multi-pass deskew: iterative projection plus two residual checks.
+
+    Despite the name, three stages run in sequence:
+      1. `deskew_image_iterative` on a copy of the input.
+      2. `deskew_image_by_word_alignment` on the PNG-encoded result; its
+         output is adopted only when the reported angle is >= 0.3 degrees.
+      3. `_measure_textline_slope` on the current result; a residual of
+         >= 0.3 degrees is rotated away.
+    Failures in stage 2 or 3 are logged and treated as "no correction".
+
+    Args:
+        img: BGR image.
+        coarse_range: half-range in degrees forwarded to the iterative pass.
+
+    Returns:
+        (corrected_bgr, total_angle_degrees, debug_dict) where the total is
+        the sum of the three per-pass angles and debug_dict records each
+        pass's angle and method name.
+    """
+    debug: Dict[str, Any] = {}
+
+    # --- Pass 1: iterative projection ---
+    corrected, angle1, dbg1 = deskew_image_iterative(
+        img.copy(), coarse_range=coarse_range,
+    )
+    debug["pass1_angle"] = round(angle1, 3)
+    debug["pass1_method"] = "iterative"
+    debug["pass1_debug"] = dbg1
+
+    # --- Pass 2: word-alignment residual check ---
+    # The word-alignment routine works on encoded bytes, so the pass-1
+    # result is round-tripped through PNG.
+    angle2 = 0.0
+    try:
+        ok, buf = cv2.imencode(".png", corrected)
+        if ok:
+            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
+            if abs(angle2) >= 0.3:
+                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
+                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
+                if corrected2 is not None:
+                    corrected = corrected2
+                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
+                                f"(total={angle1 + angle2:.2f}\u00b0)")
+                else:
+                    # Decoding failed: keep the pass-1 image and count nothing.
+                    angle2 = 0.0
+            else:
+                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
+                # Below threshold: the rotated bytes are discarded, so the
+                # angle must not be added to the total.
+                angle2 = 0.0
+    except Exception as e:
+        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
+        angle2 = 0.0
+
+    # --- Pass 3: Tesseract text-line regression residual check ---
+    angle3 = 0.0
+    try:
+        residual = _measure_textline_slope(corrected)
+        debug["pass3_raw"] = round(residual, 3)
+        if abs(residual) >= 0.3:
+            h3, w3 = corrected.shape[:2]
+            center3 = (w3 // 2, h3 // 2)
+            # The measured baseline slope is used directly as the rotation angle.
+            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
+            corrected = cv2.warpAffine(
+                corrected, M3, (w3, h3),
+                flags=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_REPLICATE,
+            )
+            angle3 = residual
+            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
+        else:
+            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
+    except Exception as e:
+        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
+
+    # Only angles that were actually applied contribute to the total.
+    total_angle = angle1 + angle2 + angle3
+    debug["pass2_angle"] = round(angle2, 3)
+    debug["pass2_method"] = "word_alignment"
+    debug["pass3_angle"] = round(angle3, 3)
+    debug["pass3_method"] = "textline_regression"
+    debug["total_angle"] = round(total_angle, 3)
+
+    logger.info(
+        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
+        angle1, angle2, angle3, total_angle,
+    )
+
+    return corrected, total_angle, debug
@@ -0,0 +1,474 @@
+"""
+CV Preprocessing Dewarp — Vertical shear detection and correction.
+
+Provides four shear detection methods (vertical edge, projection variance,
+Hough lines, text-line drift), ensemble combination, quality gating,
+and the main dewarp_image() function.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import math
+import time
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Shear Detection Methods
+# =============================================================================
+
+def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
+    """Detect vertical shear angle via strongest vertical edge tracking (Method A).
+
+    The image is cut into 20 horizontal strips. In each strip the column
+    with the strongest (smoothed) vertical-edge response inside the left 40%
+    of the width is located. A straight line x = slope * y + c is fitted
+    through those positions after outlier removal; its slope gives the shear.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Dict with "method" ("vertical_edge"), "shear_degrees" and
+        "confidence". Both numbers stay 0.0 when fewer than 8 strips yield a
+        position or fewer than 6 survive outlier filtering.
+    """
+    h, w = img.shape[:2]
+    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+    # A 3x3 Sobel on 8-bit input can reach magnitudes far above 255. A bare
+    # astype(uint8) would wrap those values around and scramble exactly the
+    # strongest edges, so saturate first.
+    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)
+
+    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    num_strips = 20
+    strip_h = h // num_strips
+    edge_positions = []
+
+    for i in range(num_strips):
+        y_start = i * strip_h
+        y_end = min((i + 1) * strip_h, h)
+        strip = binary[y_start:y_end, :]
+
+        projection = np.sum(strip, axis=0).astype(np.float64)
+        if projection.max() == 0:
+            continue
+
+        # Only the left 40% is searched for the edge.
+        search_w = int(w * 0.4)
+        left_proj = projection[:search_w]
+        if left_proj.max() == 0:
+            continue
+
+        # Odd smoothing kernel of roughly 1% of the image width.
+        kernel_size = max(3, w // 100)
+        if kernel_size % 2 == 0:
+            kernel_size += 1
+        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
+        x_pos = float(np.argmax(smoothed))
+        y_center = (y_start + y_end) / 2.0
+        edge_positions.append((y_center, x_pos))
+
+    if len(edge_positions) < 8:
+        return result
+
+    ys = np.array([p[0] for p in edge_positions])
+    xs = np.array([p[1] for p in edge_positions])
+
+    # Discard positions more than two standard deviations from the median x.
+    median_x = np.median(xs)
+    std_x = max(np.std(xs), 1.0)
+    mask = np.abs(xs - median_x) < 2 * std_x
+    ys = ys[mask]
+    xs = xs[mask]
+
+    if len(ys) < 6:
+        return result
+
+    straight_coeffs = np.polyfit(ys, xs, 1)
+    slope = straight_coeffs[0]
+    fitted = np.polyval(straight_coeffs, ys)
+    residuals = xs - fitted
+    rmse = float(np.sqrt(np.mean(residuals ** 2)))
+
+    shear_degrees = math.degrees(math.atan(slope))
+
+    # More inlier strips and a tighter fit raise confidence; the fit-quality
+    # factor never drops below 0.5.
+    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(float(confidence), 2)
+
+    return result
+
+
+def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
+    """Find the shear angle that maximises row-projection variance (Method B).
+
+    A half-resolution inverted-Otsu binarisation is sheared through a coarse
+    sweep (-3 to +3 degrees in 0.5 degree steps) and then a fine sweep
+    (+/-0.5 degrees in 0.05 degree steps around the coarse winner). The
+    angle whose row sums have the largest variance is reported.
+
+    Returns:
+        Dict with "method" ("projection"), "shear_degrees" and "confidence".
+    """
+    outcome = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
+
+    img_h, img_w = img.shape[:2]
+    mono = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    ink = cv2.threshold(mono, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+    half = cv2.resize(ink, (img_w // 2, img_h // 2), interpolation=cv2.INTER_AREA)
+    half_h, half_w = half.shape
+
+    def _row_variance(angle_deg):
+        """Variance of the row sums after shearing `half` by angle_deg."""
+        if abs(angle_deg) < 0.001:
+            sheared = half
+        else:
+            tangent = math.tan(math.radians(angle_deg))
+            # Horizontal shear about the vertical centre of the image.
+            warp = np.float32([[1, tangent, -half_h / 2.0 * tangent], [0, 1, 0]])
+            sheared = cv2.warpAffine(half, warp, (half_w, half_h),
+                                     flags=cv2.INTER_NEAREST,
+                                     borderMode=cv2.BORDER_CONSTANT)
+        row_sums = np.sum(sheared, axis=1).astype(float)
+        return float(np.var(row_sums))
+
+    coarse_scores = []
+    for step in range(-6, 7):
+        candidate = step * 0.5
+        coarse_scores.append((candidate, _row_variance(candidate)))
+    coarse_peak = max(coarse_scores, key=lambda pair: pair[1])[0]
+
+    fine_scores = []
+    for step in range(-10, 11):
+        candidate = coarse_peak + step * 0.05
+        fine_scores.append((candidate, _row_variance(candidate)))
+    best_angle, best_variance = max(fine_scores, key=lambda pair: pair[1])
+
+    # Confidence: how far the winner stands above the mean of all trials.
+    every_score = coarse_scores + fine_scores
+    mean_variance = sum(v for _, v in every_score) / len(every_score)
+    confidence = 0.0
+    if mean_variance > 0 and best_variance > mean_variance:
+        confidence = min(1.0, (best_variance - mean_variance) / (mean_variance + 1.0) * 0.6)
+
+    outcome["shear_degrees"] = round(best_angle, 3)
+    outcome["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return outcome
+
+
+def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
+    """Estimate shear from near-horizontal Hough segments (Method C).
+
+    Canny edges are fed to a probabilistic Hough transform. Segments within
+    +/-5 degrees of horizontal are kept and their length-weighted median
+    angle is computed; the reported shear is the negated median. Confidence
+    is the share of kept segments within 1 degree of the median, scaled by
+    0.85.
+
+    Returns:
+        Dict with "method" ("hough_lines"), "shear_degrees" and
+        "confidence"; both numbers stay 0.0 when fewer than three segments
+        are available.
+    """
+    outcome = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    _, img_w = img.shape[:2]
+    mono = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    edge_map = cv2.Canny(mono, 50, 150, apertureSize=3)
+
+    segments = cv2.HoughLinesP(
+        edge_map, rho=1, theta=np.pi / 360,
+        threshold=int(img_w * 0.08),
+        minLineLength=int(img_w * 0.15),
+        maxLineGap=20,
+    )
+    if segments is None or len(segments) < 3:
+        return outcome
+
+    # (angle in degrees, segment length) for every near-horizontal segment.
+    flat_segments: List[Tuple[float, float]] = []
+    for segment in segments:
+        xa, ya, xb, yb = segment[0]
+        if xa == xb:
+            continue
+        tilt = float(np.degrees(np.arctan2(yb - ya, xb - xa)))
+        if abs(tilt) > 5.0:
+            continue
+        span = float(np.sqrt((xb - xa) ** 2 + (yb - ya) ** 2))
+        flat_segments.append((tilt, span))
+
+    if len(flat_segments) < 3:
+        return outcome
+
+    # Length-weighted median: first sorted angle at which the cumulative
+    # length reaches half of the total length.
+    tilts = np.array([tilt for tilt, _ in flat_segments])
+    spans = np.array([span for _, span in flat_segments])
+    order = np.argsort(tilts)
+    ranked_tilts = tilts[order]
+    running = np.cumsum(spans[order])
+    half_pos = int(np.searchsorted(running, running[-1] / 2.0))
+    median_angle = float(ranked_tilts[min(half_pos, len(ranked_tilts) - 1)])
+
+    in_agreement = sum(1 for tilt, _ in flat_segments if abs(tilt - median_angle) < 1.0)
+    confidence = min(1.0, in_agreement / max(len(flat_segments), 1)) * 0.85
+
+    outcome["shear_degrees"] = round(-median_angle, 3)
+    outcome["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return outcome
+
+
+def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear by measuring text-line straightness (Method D).
+
+    Tesseract (sparse-text mode) is run on a half-resolution grayscale copy.
+    Words with similar left x are clustered into columns; for every column
+    spanning at least 30% of the image height a line x = drift * y + c is
+    fitted. The median drift across columns gives the shear angle.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Dict with "method" ("text_lines"), "shear_degrees" and "confidence".
+        Both numbers stay 0.0 when OCR is unavailable or fails, fewer than
+        15 words are found, or fewer than two columns/drifts qualify.
+    """
+    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    # `Image` and `pytesseract` are None when their guarded import failed;
+    # without this check Image.fromarray below would raise outside the try
+    # block and take the whole dewarp step down.
+    if not TESSERACT_AVAILABLE or pytesseract is None or Image is None:
+        return result
+
+    h, w = img.shape[:2]
+    scale = 0.5
+    small = cv2.resize(img, (int(w * scale), int(h * scale)),
+                       interpolation=cv2.INTER_AREA)
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
+    pil_img = Image.fromarray(gray)
+
+    try:
+        data = pytesseract.image_to_data(
+            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception:
+        return result
+
+    # Collect (left x, centre y, width) of every usable word.
+    words = []
+    for i in range(len(data['text'])):
+        text = (data['text'][i] or "").strip()
+        try:
+            # Tolerate confidences delivered as float or numeric string.
+            conf = int(float(data['conf'][i]))
+        except (TypeError, ValueError):
+            continue
+        if not text or conf < 20 or len(text) < 2:
+            continue
+        left_x = float(data['left'][i])
+        cy = data['top'][i] + data['height'][i] / 2.0
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))
+
+    if len(words) < 15:
+        return result
+
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)
+
+    # Greedy 1-D clustering on the left x: a word joins the current column
+    # while it stays within x_tol of the column's running x estimate.
+    words_by_x = sorted(words, key=lambda word: word[0])
+    columns: List[List[Tuple[float, float]]] = []
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]
+
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            # Exponential moving average lets the estimate follow a drift.
+            cur_x = cur_x * 0.8 + lx * 0.2
+        else:
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)
+
+    if len(columns) < 2:
+        return result
+
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        # Coordinates are in the downscaled image, hence h * scale.
+        if y_range < h * scale * 0.3:
+            continue
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])
+
+    if len(drifts) < 2:
+        return result
+
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))
+
+    # Confidence blends the number of columns (saturating at 4) with how
+    # consistent their drifts are.
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f\u00b0, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
+    return result
+
+
+# =============================================================================
+# Quality Check and Shear Application
+# =============================================================================
+
+def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
+    """Return True when the corrected image has a higher row-projection variance.
+
+    Both images are binarised (inverted Otsu), halved in size, and reduced
+    to the variance of their row sums; the correction passes only if that
+    variance strictly increased.
+    """
+    def _row_sum_variance(bgr: np.ndarray) -> float:
+        mono = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
+        ink = cv2.threshold(mono, 0, 255,
+                            cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+        half_size = (ink.shape[1] // 2, ink.shape[0] // 2)
+        half = cv2.resize(ink, half_size, interpolation=cv2.INTER_AREA)
+        row_sums = np.sum(half, axis=1).astype(float)
+        return float(np.var(row_sums))
+
+    baseline = _row_sum_variance(original)
+    candidate = _row_sum_variance(corrected)
+    return candidate > baseline
+
+
+def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Shear an image horizontally about its vertical centre.
+
+    Each row is shifted in x by tan(shear_degrees) times its distance from
+    the middle row; border pixels are replicated.
+    """
+    rows, cols = img.shape[:2]
+    tangent = math.tan(math.radians(shear_degrees))
+    # x' = x + tangent * (y - rows / 2), y' = y
+    affine = np.float32([[1, tangent, -rows / 2.0 * tangent], [0, 1, 0]])
+    return cv2.warpAffine(
+        img, affine, (cols, rows),
+        flags=cv2.INTER_LINEAR,
+        borderMode=cv2.BORDER_REPLICATE,
+    )
+
+
+# =============================================================================
+# Ensemble Shear Combination
+# =============================================================================
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+    """Merge several shear detections into one weighted estimate (v2).
+
+    Detections with confidence below 0.35 are ignored; "text_lines"
+    detections get their confidence multiplied by 1.5. With several
+    survivors, those further than 1 degree from the confidence-weighted mean
+    are dropped (unless that would drop all of them) and the weighted mean
+    is recomputed. A bonus of 0.15 is added to the average confidence when
+    the surviving angles lie within 0.5 degrees of each other.
+
+    Returns:
+        (shear_degrees, confidence, method) where method is "none", a single
+        method name, or the "+"-joined names of the contributing methods.
+    """
+    min_conf = 0.35
+    weight_boost = {"text_lines": 1.5}
+
+    candidates = [
+        (
+            det["shear_degrees"],
+            det["confidence"] * weight_boost.get(det["method"], 1.0),
+            det["method"],
+        )
+        for det in detections
+        if not det["confidence"] < min_conf
+    ]
+
+    if not candidates:
+        return 0.0, 0.0, "none"
+
+    if len(candidates) == 1:
+        only_deg, only_conf, only_method = candidates[0]
+        return only_deg, min(only_conf, 1.0), only_method
+
+    def _weighted(items):
+        """Return (confidence-weighted mean angle, sum of confidences)."""
+        weight_sum = sum(conf for _, conf, _ in items)
+        return sum(deg * conf for deg, conf, _ in items) / weight_sum, weight_sum
+
+    rough_mean, _ = _weighted(candidates)
+
+    # Outlier rejection; fall back to everything if nothing is close.
+    inliers = [item for item in candidates if abs(item[0] - rough_mean) <= 1.0] or candidates
+
+    final_deg, inlier_weight = _weighted(inliers)
+
+    angles = [deg for deg, _, _ in inliers]
+    agreement_bonus = 0.15 if max(angles) - min(angles) < 0.5 else 0.0
+    ensemble_conf = min(1.0, inlier_weight / len(inliers) + agreement_bonus)
+
+    joined_methods = "+".join(name for _, _, name in inliers)
+    return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), joined_methods
+
+
+# =============================================================================
+# Main Dewarp Function
+# =============================================================================
+
+def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
+    """Detect and remove vertical shear from a deskewed page (v2, quality-gated).
+
+    Detection methods used in ensemble mode:
+        A. _detect_shear_angle()           -- vertical edge profile
+        B. _detect_shear_by_projection()   -- horizontal text-line variance
+        C. _detect_shear_by_hough()        -- Hough lines on table borders
+        D. _detect_shear_by_text_lines()   -- text-line straightness
+
+    No correction is applied when the estimate is below 0.08 degrees, the
+    confidence is below 0.4, or (for estimates of at least 0.5 degrees) the
+    quality check reports no improvement.
+
+    Args:
+        img: BGR image (already deskewed).
+        use_ensemble: If False, only method A is used.
+
+    Returns:
+        Tuple of (corrected_image, dewarp_info) where dewarp_info holds
+        "method", "shear_degrees", "confidence" and "detections".
+    """
+    no_correction = {
+        "method": "none",
+        "shear_degrees": 0.0,
+        "confidence": 0.0,
+        "detections": [],
+    }
+
+    if not CV2_AVAILABLE:
+        return img, no_correction
+
+    started = time.time()
+
+    if use_ensemble:
+        detections = [
+            _detect_shear_angle(img),
+            _detect_shear_by_projection(img),
+            _detect_shear_by_hough(img),
+            _detect_shear_by_text_lines(img),
+        ]
+        shear_deg, confidence, method = _ensemble_shear(detections)
+    else:
+        single = _detect_shear_angle(img)
+        detections = [single]
+        shear_deg = single["shear_degrees"]
+        confidence = single["confidence"]
+        method = single["method"]
+
+    elapsed = time.time() - started
+
+    # The log line always has four A-D slots; missing ones are shown as 0.0.
+    per_method_values: List[float] = []
+    for slot in range(4):
+        if slot < len(detections):
+            per_method_values.append(detections[slot]["shear_degrees"])
+            per_method_values.append(detections[slot]["confidence"])
+        else:
+            per_method_values.extend((0.0, 0.0))
+
+    logger.info(
+        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
+        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
+        shear_deg, confidence, method, elapsed,
+        *per_method_values,
+    )
+
+    detection_summary = [
+        {
+            "method": det["method"],
+            "shear_degrees": det["shear_degrees"],
+            "confidence": det["confidence"],
+        }
+        for det in detections
+    ]
+
+    # Too small or too uncertain: leave the image untouched.
+    if abs(shear_deg) < 0.08 or confidence < 0.4:
+        no_correction["detections"] = detection_summary
+        return img, no_correction
+
+    corrected = _apply_shear(img, -shear_deg)
+
+    # Larger corrections must pass the projection-variance check.
+    needs_gate = abs(shear_deg) >= 0.5
+    if needs_gate and not _dewarp_quality_check(img, corrected):
+        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
+                     "projection variance did not improve", shear_deg)
+        no_correction["detections"] = detection_summary
+        return img, no_correction
+
+    return corrected, {
+        "method": method,
+        "shear_degrees": shear_deg,
+        "confidence": confidence,
+        "detections": detection_summary,
+    }
+
+
+def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Undo a caller-supplied shear angle.
+
+    Angles smaller than 0.001 degrees return the input unchanged; otherwise
+    the image is sheared by the negated angle.
+    """
+    negligible = abs(shear_degrees) < 0.001
+    return img if negligible else _apply_shear(img, -shear_degrees)
@@ -0,0 +1,157 @@
+"""
+Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+
+Re-export facade -- the deskew and dewarp logic lives in the sub-modules:
+
+  cv_preprocessing_deskew   Rotation correction (Hough, word-alignment, iterative, two-pass)
+  cv_preprocessing_dewarp   Vertical shear detection and correction (4 methods + ensemble)
+
+This file contains the image I/O and orientation detection functions.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Guarded imports
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+# Re-export all deskew functions
+from cv_preprocessing_deskew import (  # noqa: F401
+    deskew_image,
+    deskew_image_by_word_alignment,
+    deskew_image_iterative,
+    deskew_two_pass,
+    _projection_gradient_score,
+    _measure_textline_slope,
+)
+
+# Re-export all dewarp functions
+from cv_preprocessing_dewarp import (  # noqa: F401
+    _apply_shear,
+    _detect_shear_angle,
+    _detect_shear_by_hough,
+    _detect_shear_by_projection,
+    _detect_shear_by_text_lines,
+    _dewarp_quality_check,
+    _ensemble_shear,
+    dewarp_image,
+    dewarp_image_manual,
+)
+
+
+# =============================================================================
+# Image I/O
+# =============================================================================
+
+def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
+    """Render a PDF page to a high-resolution numpy array (BGR).
+
+    Args:
+        pdf_data: Raw PDF bytes.
+        page_number: 0-indexed page number.
+        zoom: Zoom factor (3.0 = 432 DPI).
+
+    Returns:
+        numpy array in BGR format.
+    """
+    import fitz  # PyMuPDF
+
+    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
+    if page_number >= pdf_doc.page_count:
+        raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")
+
+    page = pdf_doc[page_number]
+    mat = fitz.Matrix(zoom, zoom)
+    pix = page.get_pixmap(matrix=mat)
+
+    img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
+    if pix.n == 4:  # RGBA
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
+    elif pix.n == 3:  # RGB
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
+    else:  # Grayscale
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
+
+    pdf_doc.close()
+    return img_bgr
+
+
+def render_image_high_res(image_data: bytes) -> np.ndarray:
+    """Load an image (PNG/JPEG) into a numpy array (BGR).
+
+    Args:
+        image_data: Raw image bytes.
+
+    Returns:
+        numpy array in BGR format.
+    """
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise ValueError("Could not decode image data")
+    return img_bgr
+
+
+# =============================================================================
+# Orientation Detection (0/90/180/270)
+# =============================================================================
+
+def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
+    """Detect page orientation via Tesseract OSD and rotate if needed.
+
+    Returns:
+        (corrected_image, rotation_degrees) -- rotation is 0, 90, 180, or 270.
+    """
+    if pytesseract is None:
+        return img_bgr, 0
+
+    try:
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+        pil_img = Image.fromarray(gray)
+
+        osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT)
+        rotate = osd.get("rotate", 0)
+        confidence = osd.get("orientation_conf", 0.0)
+
+        logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")
+
+        if rotate == 0 or confidence < 1.0:
+            return img_bgr, 0
+
+        if rotate == 180:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
+        elif rotate == 90:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE)
+        elif rotate == 270:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE)
+        else:
+            return img_bgr, 0
+
+        logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
+        return corrected, rotate
+
+    except Exception as e:
+        logger.warning(f"OSD orientation detection failed: {e}")
+        return img_bgr, 0
@@ -0,0 +1,388 @@
+"""
+CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.
+
+Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
+_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import json
+import logging
+import os
+import re
+import time
+from typing import Dict, List, Tuple
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Runtime configuration, read once from the environment at import time.
+# Base URL of the Ollama server; "/api/chat" is appended by the review functions.
+_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
+# Default model name used when a caller does not pass one explicitly.
+OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
+# Number of entries per LLM request in the streaming path.
+# NOTE(review): int() raises ValueError at import if the variable is not an integer.
+_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
+logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
+
+# Which correction backend the review functions use.
+REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")   # "spell" (default) | "llm"
+
+# Regex: entry contains IPA phonetic brackets like "dance [da:ns]"
+# (a [...] group holding at least one of the listed IPA/stress characters).
+_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')
+
+# Regex: digit adjacent to a letter -- OCR digit<->letter confusion
+# (one of 0/1/5/6/8 directly before or after an ASCII or German letter).
+_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
+
+
+def _entry_needs_review(entry: Dict) -> bool:
+    """Check if an entry should be sent for review.
+
+    Sends all non-empty entries that don't have IPA phonetic transcriptions.
+    """
+    en = entry.get("english", "") or ""
+    de = entry.get("german", "") or ""
+
+    if not en.strip() and not de.strip():
+        return False
+    if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
+        return False
+    return True
+
+
+def _build_llm_prompt(table_lines: List[Dict]) -> str:
+    """Build the LLM correction prompt for a batch of entries."""
+    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
+
+DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
+
+NUR diese Korrekturen sind erlaubt:
+- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
+- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
+- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
+- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
+- Ziffer 6 statt G oder g: "6eld" -> "Geld"
+- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"
+
+ABSOLUT VERBOTEN -- aendere NIEMALS:
+- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
+- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
+- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
+- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
+- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
+- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
+- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
+- Beispielsaetze in der ex-Spalte -- NIEMALS aendern
+
+Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
+
+Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
+Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
+
+/no_think
+
+Eingabe:
+{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
+
+
+def _is_spurious_change(old_val: str, new_val: str) -> bool:
+    """Detect LLM changes that are likely wrong and should be discarded.
+
+    Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
+    legitimate OCR corrections. Everything else is rejected.
+    """
+    if not old_val or not new_val:
+        return False
+
+    if old_val.lower() == new_val.lower():
+        return True
+
+    old_words = old_val.split()
+    new_words = new_val.split()
+    if abs(len(old_words) - len(new_words)) > 1:
+        return True
+
+    _OCR_CHAR_MAP = {
+        '0': set('oOgG'),
+        '1': set('lLiI'),
+        '5': set('sS'),
+        '6': set('gG'),
+        '8': set('bB'),
+        '|': set('lLiI1'),
+        'l': set('iI|1'),
+    }
+    has_valid_fix = False
+    if len(old_val) == len(new_val):
+        for oc, nc in zip(old_val, new_val):
+            if oc != nc:
+                if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
+                    has_valid_fix = True
+                elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
+                    has_valid_fix = True
+    else:
+        _OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
+        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
+            has_valid_fix = True
+
+    if not has_valid_fix:
+        return True
+
+    return False
+
+
+def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
+    """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
+    changes = []
+    entries_out = []
+    for i, orig in enumerate(originals):
+        if i < len(corrected):
+            c = corrected[i]
+            entry = dict(orig)
+            for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
+                new_val = c.get(key, "").strip()
+                old_val = (orig.get(field_name, "") or "").strip()
+                if new_val and new_val != old_val:
+                    if _is_spurious_change(old_val, new_val):
+                        continue
+                    changes.append({
+                        "row_index": orig.get("row_index", i),
+                        "field": field_name,
+                        "old": old_val,
+                        "new": new_val,
+                    })
+                    entry[field_name] = new_val
+                    entry["llm_corrected"] = True
+            entries_out.append(entry)
+        else:
+            entries_out.append(dict(orig))
+    return changes, entries_out
+
+
+def _sanitize_for_json(text: str) -> str:
+    """Remove or escape control characters that break JSON parsing."""
+    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
+
+
+def _parse_llm_json_array(text: str) -> List[Dict]:
+    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
+    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+    text = re.sub(r'```json\s*', '', text)
+    text = re.sub(r'```\s*', '', text)
+    text = _sanitize_for_json(text)
+    match = re.search(r'\[.*\]', text, re.DOTALL)
+    if match:
+        try:
+            return json.loads(match.group())
+        except (ValueError, json.JSONDecodeError) as e:
+            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
+    else:
+        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
+    return []
+
+
+async def llm_review_entries(
+    entries: List[Dict],
+    model: str = None,
+) -> Dict:
+    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
+    from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE
+
+    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
+        return spell_review_entries_sync(entries)
+    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
+        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
+
+    model = model or OLLAMA_REVIEW_MODEL
+
+    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
+
+    if not reviewable:
+        return {
+            "entries_original": entries,
+            "entries_corrected": [dict(e) for e in entries],
+            "changes": [],
+            "skipped_count": len(entries),
+            "model_used": model,
+            "duration_ms": 0,
+        }
+
+    review_entries = [e for _, e in reviewable]
+    table_lines = [
+        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
+        for e in review_entries
+    ]
+
+    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
+                len(review_entries), len(entries), model, len(entries) - len(reviewable))
+
+    prompt = _build_llm_prompt(table_lines)
+
+    t0 = time.time()
+    async with httpx.AsyncClient(timeout=300.0) as client:
+        resp = await client.post(
+            f"{_OLLAMA_URL}/api/chat",
+            json={
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "stream": False,
+                "think": False,
+                "options": {"temperature": 0.1, "num_predict": 8192},
+            },
+        )
+        resp.raise_for_status()
+        content = resp.json().get("message", {}).get("content", "")
+    duration_ms = int((time.time() - t0) * 1000)
+
+    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
+
+    corrected = _parse_llm_json_array(content)
+    changes, corrected_entries = _diff_batch(review_entries, corrected)
+
+    all_corrected = [dict(e) for e in entries]
+    for batch_idx, (orig_idx, _) in enumerate(reviewable):
+        if batch_idx < len(corrected_entries):
+            all_corrected[orig_idx] = corrected_entries[batch_idx]
+
+    return {
+        "entries_original": entries,
+        "entries_corrected": all_corrected,
+        "changes": changes,
+        "skipped_count": len(entries) - len(reviewable),
+        "model_used": model,
+        "duration_ms": duration_ms,
+    }
+
+
+async def llm_review_entries_streaming(
+    entries: List[Dict],
+    model: str = None,
+    batch_size: int = _REVIEW_BATCH_SIZE,
+):
+    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
+
+    Phase 0 (always): Run _fix_character_confusion and emit any changes.
+
+    Event order on the spell path: events from ``spell_review_entries_streaming``
+    are passed through, with one extra "batch" event carrying the Phase 0
+    changes inserted right after the first "meta" event.
+
+    Event order on the LLM path: optional Phase 0 "batch" event, one "meta"
+    event, one "batch" event per LLM request, then one "complete" event.
+
+    Args:
+        entries: Vocabulary entries; english/german/example/row_index are read.
+        model: Ollama model name; falls back to ``OLLAMA_REVIEW_MODEL`` when falsy.
+        batch_size: Number of reviewable entries per LLM request.
+    """
+    from cv_ocr_engines import _fix_character_confusion
+    from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE
+
+    # Phase 0: snapshot the text fields, run the character-confusion fixer and
+    # diff against the snapshot.
+    # NOTE(review): this presumes _fix_character_confusion edits `entries` in place -- confirm.
+    _CONF_FIELDS = ('english', 'german', 'example')
+    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
+    _fix_character_confusion(entries)
+    char_changes = [
+        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
+        for i in range(len(entries))
+        for f in _CONF_FIELDS
+        if originals[i][f] != entries[i].get(f, '')
+    ]
+
+    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
+        _meta_sent = False
+        async for event in spell_review_entries_streaming(entries, batch_size):
+            yield event
+            # Emit the Phase 0 changes once, directly after the first 'meta' event.
+            if not _meta_sent and event.get('type') == 'meta' and char_changes:
+                _meta_sent = True
+                yield {
+                    'type': 'batch',
+                    'changes': char_changes,
+                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
+                    'progress': {'current': 0, 'total': len(entries)},
+                }
+        return
+
+    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
+        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
+
+    # LLM path
+    # Here the Phase 0 changes go out before the 'meta' event.
+    # NOTE(review): 'row_index' in char_changes is the list position, whereas
+    # the LLM batches below report each entry's own "row_index" value.
+    if char_changes:
+        yield {
+            'type': 'batch',
+            'changes': char_changes,
+            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
+            'progress': {'current': 0, 'total': len(entries)},
+        }
+
+    model = model or OLLAMA_REVIEW_MODEL
+
+    # Split entries into those sent to the LLM (with their list position) and
+    # those skipped by _entry_needs_review.
+    reviewable = []
+    skipped_indices = []
+    for i, e in enumerate(entries):
+        if _entry_needs_review(e):
+            reviewable.append((i, e))
+        else:
+            skipped_indices.append(i)
+
+    total_to_review = len(reviewable)
+
+    yield {
+        "type": "meta",
+        "total_entries": len(entries),
+        "to_review": total_to_review,
+        "skipped": len(skipped_indices),
+        "model": model,
+        "batch_size": batch_size,
+    }
+
+    all_changes = []
+    all_corrected = [dict(e) for e in entries]
+    total_duration_ms = 0
+    reviewed_count = 0
+
+    # NOTE(review): range() raises ValueError for batch_size == 0.
+    for batch_start in range(0, total_to_review, batch_size):
+        batch_items = reviewable[batch_start:batch_start + batch_size]
+        batch_entries = [e for _, e in batch_items]
+
+        table_lines = [
+            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
+            for e in batch_entries
+        ]
+
+        prompt = _build_llm_prompt(table_lines)
+
+        logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
+                    batch_start // batch_size, len(batch_entries), model)
+
+        # One non-streaming chat request per batch; a new HTTP client is opened each time.
+        t0 = time.time()
+        async with httpx.AsyncClient(timeout=300.0) as client:
+            resp = await client.post(
+                f"{_OLLAMA_URL}/api/chat",
+                json={
+                    "model": model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "stream": False,
+                    "think": False,
+                    "options": {"temperature": 0.1, "num_predict": 8192},
+                },
+            )
+            resp.raise_for_status()
+            content = resp.json().get("message", {}).get("content", "")
+        batch_ms = int((time.time() - t0) * 1000)
+        total_duration_ms += batch_ms
+
+        corrected = _parse_llm_json_array(content)
+        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)
+
+        # Write the corrected entries back to their original list positions.
+        for batch_idx, (orig_idx, _) in enumerate(batch_items):
+            if batch_idx < len(batch_corrected):
+                all_corrected[orig_idx] = batch_corrected[batch_idx]
+
+        all_changes.extend(batch_changes)
+        reviewed_count += len(batch_items)
+
+        yield {
+            "type": "batch",
+            "batch_index": batch_start // batch_size,
+            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
+            "changes": batch_changes,
+            "duration_ms": batch_ms,
+            "progress": {"current": reviewed_count, "total": total_to_review},
+        }
+
+    # Final summary; 'changes' holds only the LLM changes, not the Phase 0 ones.
+    yield {
+        "type": "complete",
+        "changes": all_changes,
+        "model_used": model,
+        "duration_ms": total_duration_ms,
+        "total_entries": len(entries),
+        "reviewed": total_to_review,
+        "skipped": len(skipped_indices),
+        "corrections_found": len(all_changes),
+        "entries_corrected": all_corrected,
+    }
@@ -0,0 +1,430 @@
+"""
+CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
+
+Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import time
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV_PIPELINE_AVAILABLE,
+    PageRegion,
+    PipelineResult,
+    VocabRow,
+)
+from cv_preprocessing import (
+    deskew_image,
+    dewarp_image,
+    render_image_high_res,
+    render_pdf_high_res,
+)
+from cv_layout import (
+    analyze_layout,
+    create_layout_image,
+    create_ocr_image,
+)
+from cv_ocr_engines import (
+    _group_words_into_lines,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Stage 6: Multi-Pass OCR
+# =============================================================================
+
+def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
+               psm: int, fallback_psm: Optional[int] = None,
+               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
+    """Run Tesseract OCR on a specific region with given PSM.
+
+    Args:
+        ocr_img: Binarized full-page image.
+        region: Region to crop and OCR.
+        lang: Tesseract language string.
+        psm: Page Segmentation Mode.
+        fallback_psm: If confidence too low, retry with this PSM per line.
+        min_confidence: Minimum average confidence before fallback.
+
+    Returns:
+        List of word dicts with text, position, confidence.
+    """
+    crop = ocr_img[region.y:region.y + region.height,
+                   region.x:region.x + region.width]
+
+    if crop.size == 0:
+        return []
+
+    pil_img = Image.fromarray(crop)
+
+    config = f'--psm {psm} --oem 3'
+    try:
+        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
+                                         output_type=pytesseract.Output.DICT)
+    except Exception as e:
+        logger.warning(f"Tesseract failed for region {region.type}: {e}")
+        return []
+
+    words = []
+    for i in range(len(data['text'])):
+        text = data['text'][i].strip()
+        conf = int(data['conf'][i])
+        if not text or conf < 10:
+            continue
+        words.append({
+            'text': text,
+            'left': data['left'][i] + region.x,
+            'top': data['top'][i] + region.y,
+            'width': data['width'][i],
+            'height': data['height'][i],
+            'conf': conf,
+            'region_type': region.type,
+        })
+
+    if words and fallback_psm is not None:
+        avg_conf = sum(w['conf'] for w in words) / len(words)
+        if avg_conf < min_confidence:
+            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
+                        f"trying fallback PSM {fallback_psm}")
+            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)
+
+    return words
+
+
+def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
+                              lang: str, psm: int) -> List[Dict[str, Any]]:
+    """OCR a region line by line (fallback for low-confidence regions)."""
+    crop = ocr_img[region.y:region.y + region.height,
+                   region.x:region.x + region.width]
+
+    if crop.size == 0:
+        return []
+
+    inv = cv2.bitwise_not(crop)
+    h_proj = np.sum(inv, axis=1)
+    threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0
+
+    lines = []
+    in_text = False
+    line_start = 0
+    for y in range(len(h_proj)):
+        if h_proj[y] > threshold and not in_text:
+            line_start = y
+            in_text = True
+        elif h_proj[y] <= threshold and in_text:
+            if y - line_start > 5:
+                lines.append((line_start, y))
+            in_text = False
+    if in_text and len(h_proj) - line_start > 5:
+        lines.append((line_start, len(h_proj)))
+
+    all_words = []
+    config = f'--psm {psm} --oem 3'
+
+    for line_y_start, line_y_end in lines:
+        pad = 3
+        y1 = max(0, line_y_start - pad)
+        y2 = min(crop.shape[0], line_y_end + pad)
+        line_crop = crop[y1:y2, :]
+
+        if line_crop.size == 0:
+            continue
+
+        pil_img = Image.fromarray(line_crop)
+        try:
+            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
+                                             output_type=pytesseract.Output.DICT)
+        except Exception:
+            continue
+
+        for i in range(len(data['text'])):
+            text = data['text'][i].strip()
+            conf = int(data['conf'][i])
+            if not text or conf < 10:
+                continue
+            all_words.append({
+                'text': text,
+                'left': data['left'][i] + region.x,
+                'top': data['top'][i] + region.y + y1,
+                'width': data['width'][i],
+                'height': data['height'][i],
+                'conf': conf,
+                'region_type': region.type,
+            })
+
+    return all_words
+
+
+def run_multi_pass_ocr(ocr_img: np.ndarray,
+                       regions: List[PageRegion],
+                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
+    """Run OCR on each detected region with optimized settings."""
+    results: Dict[str, List[Dict]] = {}
+
+    _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
+    for region in regions:
+        if region.type in _ocr_skip:
+            continue
+
+        if region.type == 'column_en':
+            words = ocr_region(ocr_img, region, lang='eng', psm=4)
+        elif region.type == 'column_de':
+            words = ocr_region(ocr_img, region, lang='deu', psm=4)
+        elif region.type == 'column_example':
+            words = ocr_region(ocr_img, region, lang=lang, psm=6,
+                              fallback_psm=7, min_confidence=40.0)
+        else:
+            words = ocr_region(ocr_img, region, lang=lang, psm=6)
+
+        results[region.type] = words
+        logger.info(f"OCR {region.type}: {len(words)} words")
+
+    return results
+
+
+# =============================================================================
+# Stage 7: Line Alignment -> Vocabulary Entries
+# =============================================================================
+
+def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
+                          regions: List[PageRegion],
+                          y_tolerance_px: int = 25) -> List[VocabRow]:
+    """Align OCR results from different columns into vocabulary rows."""
+    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
+        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
+        return []
+
+    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
+    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
+    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)
+
+    def line_y_center(line: List[Dict]) -> float:
+        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)
+
+    def line_text(line: List[Dict]) -> str:
+        return ' '.join(w['text'] for w in line)
+
+    def line_confidence(line: List[Dict]) -> float:
+        return sum(w['conf'] for w in line) / len(line) if line else 0
+
+    vocab_rows: List[VocabRow] = []
+
+    for en_line in en_lines:
+        en_y = line_y_center(en_line)
+        en_text = line_text(en_line)
+        en_conf = line_confidence(en_line)
+
+        if len(en_text.strip()) < 2:
+            continue
+
+        de_text = ""
+        de_conf = 0.0
+        best_de_dist = float('inf')
+        best_de_idx = -1
+        for idx, de_line in enumerate(de_lines):
+            dist = abs(line_y_center(de_line) - en_y)
+            if dist < y_tolerance_px and dist < best_de_dist:
+                best_de_dist = dist
+                best_de_idx = idx
+
+        if best_de_idx >= 0:
+            de_text = line_text(de_lines[best_de_idx])
+            de_conf = line_confidence(de_lines[best_de_idx])
+
+        ex_text = ""
+        ex_conf = 0.0
+        best_ex_dist = float('inf')
+        best_ex_idx = -1
+        for idx, ex_line in enumerate(ex_lines):
+            dist = abs(line_y_center(ex_line) - en_y)
+            if dist < y_tolerance_px and dist < best_ex_dist:
+                best_ex_dist = dist
+                best_ex_idx = idx
+
+        if best_ex_idx >= 0:
+            ex_text = line_text(ex_lines[best_ex_idx])
+            ex_conf = line_confidence(ex_lines[best_ex_idx])
+
+        avg_conf = en_conf
+        conf_count = 1
+        if de_conf > 0:
+            avg_conf += de_conf
+            conf_count += 1
+        if ex_conf > 0:
+            avg_conf += ex_conf
+            conf_count += 1
+
+        vocab_rows.append(VocabRow(
+            english=en_text.strip(),
+            german=de_text.strip(),
+            example=ex_text.strip(),
+            confidence=avg_conf / conf_count,
+            y_position=int(en_y),
+        ))
+
+    # Handle multi-line wrapping in example column
+    matched_ex_ys = set()
+    for row in vocab_rows:
+        if row.example:
+            matched_ex_ys.add(row.y_position)
+
+    for ex_line in ex_lines:
+        ex_y = line_y_center(ex_line)
+        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
+        if already_matched:
+            continue
+
+        best_row = None
+        best_dist = float('inf')
+        for row in vocab_rows:
+            dist = ex_y - row.y_position
+            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
+                best_dist = dist
+                best_row = row
+
+        if best_row:
+            continuation = line_text(ex_line).strip()
+            if continuation:
+                best_row.example = (best_row.example + " " + continuation).strip()
+
+    vocab_rows.sort(key=lambda r: r.y_position)
+
+    return vocab_rows
+
+
+# =============================================================================
+# Stage 8: Optional LLM Post-Correction
+# =============================================================================
+
+async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
+                           confidence_threshold: float = 50.0,
+                           enabled: bool = False) -> List[VocabRow]:
+    """Optionally send low-confidence regions to Qwen-VL for correction."""
+    if not enabled:
+        return vocab_rows
+
+    logger.info(f"LLM post-correction skipped (not yet implemented)")
+    return vocab_rows
+
+
+# =============================================================================
+# Orchestrator
+# =============================================================================
+
+async def run_cv_pipeline(
+    pdf_data: Optional[bytes] = None,
+    image_data: Optional[bytes] = None,
+    page_number: int = 0,
+    zoom: float = 3.0,
+    enable_dewarp: bool = True,
+    enable_llm_correction: bool = False,
+    lang: str = "eng+deu",
+) -> PipelineResult:
+    """Run the complete CV document reconstruction pipeline."""
+    if not CV_PIPELINE_AVAILABLE:
+        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")
+
+    result = PipelineResult()
+    total_start = time.time()
+
+    try:
+        # Stage 1: Render
+        t = time.time()
+        if pdf_data:
+            img = render_pdf_high_res(pdf_data, page_number, zoom)
+        elif image_data:
+            img = render_image_high_res(image_data)
+        else:
+            return PipelineResult(error="No input data (pdf_data or image_data required)")
+        result.stages['render'] = round(time.time() - t, 2)
+        result.image_width = img.shape[1]
+        result.image_height = img.shape[0]
+        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")
+
+        # Stage 2: Deskew
+        t = time.time()
+        img, angle = deskew_image(img)
+        result.stages['deskew'] = round(time.time() - t, 2)
+        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")
+
+        # Stage 3: Dewarp
+        if enable_dewarp:
+            t = time.time()
+            img, _dewarp_info = dewarp_image(img)
+            result.stages['dewarp'] = round(time.time() - t, 2)
+
+        # Stage 4: Dual image preparation
+        t = time.time()
+        ocr_img = create_ocr_image(img)
+        layout_img = create_layout_image(img)
+        result.stages['image_prep'] = round(time.time() - t, 2)
+
+        # Stage 5: Layout analysis
+        t = time.time()
+        regions = analyze_layout(layout_img, ocr_img)
+        result.stages['layout'] = round(time.time() - t, 2)
+        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
+        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")
+
+        # Stage 6: Multi-pass OCR
+        t = time.time()
+        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
+        result.stages['ocr'] = round(time.time() - t, 2)
+        total_words = sum(len(w) for w in ocr_results.values())
+        result.word_count = total_words
+        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")
+
+        # Stage 7: Line alignment
+        t = time.time()
+        vocab_rows = match_lines_to_vocab(ocr_results, regions)
+        result.stages['alignment'] = round(time.time() - t, 2)
+
+        # Stage 8: Optional LLM correction
+        if enable_llm_correction:
+            t = time.time()
+            vocab_rows = await llm_post_correct(img, vocab_rows)
+            result.stages['llm_correction'] = round(time.time() - t, 2)
+
+        # Convert to output format
+        result.vocabulary = [
+            {
+                "english": row.english,
+                "german": row.german,
+                "example": row.example,
+                "confidence": round(row.confidence, 1),
+            }
+            for row in vocab_rows
+            if row.english or row.german
+        ]
+
+        result.duration_seconds = round(time.time() - total_start, 2)
+        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")
+
+    except Exception as e:
+        logger.error(f"CV Pipeline error: {e}")
+        import traceback
+        logger.debug(traceback.format_exc())
+        result.error = str(e)
+        result.duration_seconds = round(time.time() - total_start, 2)
+
+    return result
@@ -0,0 +1,46 @@
+"""
+Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
+
+Re-export facade -- all logic lives in the sub-modules:
+
+  cv_review_pipeline   Stages 6-8: OCR, line alignment, orchestrator
+  cv_review_spell      Rule-based spell-checker OCR correction
+  cv_review_llm        LLM-based OCR correction, prompt building, streaming
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+# Re-export everything for backward compatibility
+from cv_review_pipeline import (  # noqa: F401
+    ocr_region,
+    run_multi_pass_ocr,
+    match_lines_to_vocab,
+    llm_post_correct,
+    run_cv_pipeline,
+)
+
+from cv_review_spell import (  # noqa: F401
+    _SPELL_AVAILABLE,
+    _spell_dict_knows,
+    _spell_fix_field,
+    _spell_fix_token,
+    _try_split_merged_word,
+    _normalize_page_ref,
+    spell_review_entries_sync,
+    spell_review_entries_streaming,
+)
+
+from cv_review_llm import (  # noqa: F401
+    OLLAMA_REVIEW_MODEL,
+    REVIEW_ENGINE,
+    _REVIEW_BATCH_SIZE,
+    _build_llm_prompt,
+    _diff_batch,
+    _entry_needs_review,
+    _is_spurious_change,
+    _parse_llm_json_array,
+    _sanitize_for_json,
+    llm_review_entries,
+    llm_review_entries_streaming,
+)
@@ -0,0 +1,315 @@
+"""
+CV Review Spell — Rule-based OCR spell correction (no LLM).
+
+Provides dictionary-backed digit-to-letter substitution, umlaut correction,
+general spell correction, merged-word splitting, and page-ref normalization.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+import time
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+try:
+    from spellchecker import SpellChecker as _SpellChecker
+    _en_spell = _SpellChecker(language='en', distance=1)
+    _de_spell = _SpellChecker(language='de', distance=1)
+    _SPELL_AVAILABLE = True
+    logger.info("pyspellchecker loaded (EN+DE)")
+except ImportError:
+    _SPELL_AVAILABLE = False
+    _en_spell = None  # type: ignore[assignment]
+    _de_spell = None  # type: ignore[assignment]
+    logger.warning("pyspellchecker not installed")
+
+
+# ---- Page-Ref Normalization ----
+# Normalizes OCR variants like "p-60", "p 61", "p60" -> "p.60"
+_PAGE_REF_RE = re.compile(r'\bp[\s\-]?(\d+)', re.IGNORECASE)
+
+
+def _normalize_page_ref(text: str) -> str:
+    """Normalize page references: 'p-60' / 'p 61' / 'p60' -> 'p.60'."""
+    if not text:
+        return text
+    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
+
+
+# Suspicious OCR chars -> ordered list of most-likely correct replacements
+_SPELL_SUBS: Dict[str, List[str]] = {
+    '0': ['O', 'o'],
+    '1': ['l', 'I'],
+    '5': ['S', 's'],
+    '6': ['G', 'g'],
+    '8': ['B', 'b'],
+    '|': ['I', 'l', '1'],
+}
+_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
+
+# Tokenizer: word tokens (letters + pipe) alternating with separators
+_SPELL_TOKEN_RE = re.compile(r'([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]*)')
+
+
+def _spell_dict_knows(word: str) -> bool:
+    """True if word is known in EN or DE dictionary."""
+    if not _SPELL_AVAILABLE:
+        return False
+    w = word.lower()
+    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
+
+
+def _try_split_merged_word(token: str) -> Optional[str]:
+    """Try to split a merged word like 'atmyschool' into 'at my school'.
+
+    Uses dynamic programming to find the shortest sequence of dictionary
+    words that covers the entire token. Only returns a result when the
+    split produces at least 2 words and ALL parts are known dictionary words.
+
+    Preserves original capitalisation by mapping back to the input string.
+    """
+    if not _SPELL_AVAILABLE or len(token) < 4:
+        return None
+
+    lower = token.lower()
+    n = len(lower)
+
+    # dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
+    dp: list = [None] * (n + 1)
+    dp[0] = ([], 0)
+
+    for i in range(1, n + 1):
+        for j in range(max(0, i - 20), i):
+            if dp[j] is None:
+                continue
+            candidate = lower[j:i]
+            word_len = i - j
+            if word_len == 1 and candidate not in ('a', 'i'):
+                continue
+            if _spell_dict_knows(candidate):
+                prev_words, prev_sq = dp[j]
+                new_words = prev_words + [word_len]
+                new_sq = prev_sq + word_len * word_len
+                new_key = (-len(new_words), new_sq)
+                if dp[i] is None:
+                    dp[i] = (new_words, new_sq)
+                else:
+                    old_key = (-len(dp[i][0]), dp[i][1])
+                    if new_key >= old_key:
+                        dp[i] = (new_words, new_sq)
+
+    if dp[n] is None or len(dp[n][0]) < 2:
+        return None
+
+    result = []
+    pos = 0
+    for wlen in dp[n][0]:
+        result.append(token[pos:pos + wlen])
+        pos += wlen
+
+    logger.debug("Split merged word: %r -> %r", token, " ".join(result))
+    return " ".join(result)
+
+
+def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
+    """Return corrected form of token, or None if no fix needed/possible.
+
+    *field* is 'english' or 'german' -- used to pick the right dictionary.
+    """
+    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
+
+    # 1. Already known word -> no fix needed
+    if _spell_dict_knows(token):
+        return None
+
+    # 2. Digit/pipe substitution
+    if has_suspicious:
+        if token == '|':
+            return 'I'
+        for i, ch in enumerate(token):
+            if ch not in _SPELL_SUBS:
+                continue
+            for replacement in _SPELL_SUBS[ch]:
+                candidate = token[:i] + replacement + token[i + 1:]
+                if _spell_dict_knows(candidate):
+                    return candidate
+        first = token[0]
+        if first in _SPELL_SUBS and len(token) >= 2:
+            rest = token[1:]
+            if rest.isalpha() and rest.islower():
+                candidate = _SPELL_SUBS[first][0] + rest
+                if not candidate[0].isdigit():
+                    return candidate
+
+    # 3. OCR umlaut confusion
+    if len(token) >= 3 and token.isalpha() and field == "german":
+        _UMLAUT_SUBS = {'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
+                         'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc'}
+        for i, ch in enumerate(token):
+            if ch in _UMLAUT_SUBS:
+                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
+                if _spell_dict_knows(candidate):
+                    return candidate
+
+    # 4. General spell correction for unknown words (no digits/pipes)
+    if not has_suspicious and len(token) >= 3 and token.isalpha():
+        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
+        if spell is not None:
+            correction = spell.correction(token.lower())
+            if correction and correction != token.lower():
+                if token[0].isupper():
+                    correction = correction[0].upper() + correction[1:]
+                if _spell_dict_knows(correction):
+                    return correction
+
+    # 5. Merged-word split
+    if len(token) >= 4 and token.isalpha():
+        split = _try_split_merged_word(token)
+        if split:
+            return split
+
+    return None
+
+
+def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
+    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed)."""
+    if not text:
+        return text, False
+    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
+    if not has_suspicious and not any(c.isalpha() for c in text):
+        return text, False
+    # Pattern: | immediately before . or , -> numbered list prefix
+    fixed = re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
+    changed = fixed != text
+    # Tokenize and fix word by word
+    parts: List[str] = []
+    pos = 0
+    for m in _SPELL_TOKEN_RE.finditer(fixed):
+        token, sep = m.group(1), m.group(2)
+        correction = _spell_fix_token(token, field=field)
+        if correction:
+            parts.append(correction)
+            changed = True
+        else:
+            parts.append(token)
+        parts.append(sep)
+        pos = m.end()
+    if pos < len(fixed):
+        parts.append(fixed[pos:])
+    return ''.join(parts), changed
+
+
+def spell_review_entries_sync(entries: List[Dict]) -> Dict:
+    """Rule-based OCR correction: spell-checker + structural heuristics.
+
+    Deterministic -- never translates, never touches IPA, never hallucinates.
+    Uses SmartSpellChecker for language-aware corrections with context-based
+    disambiguation (a/I), multi-digit substitution, and cross-language guard.
+    """
+    from cv_review_llm import _entry_needs_review
+
+    t0 = time.time()
+    changes: List[Dict] = []
+    all_corrected: List[Dict] = []
+
+    # Use SmartSpellChecker if available
+    _smart = None
+    try:
+        from smart_spell import SmartSpellChecker
+        _smart = SmartSpellChecker()
+        logger.debug("spell_review: using SmartSpellChecker")
+    except Exception:
+        logger.debug("spell_review: SmartSpellChecker not available, using legacy")
+
+    _LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
+
+    for i, entry in enumerate(entries):
+        e = dict(entry)
+        # Page-ref normalization
+        old_ref = (e.get("source_page") or "").strip()
+        if old_ref:
+            new_ref = _normalize_page_ref(old_ref)
+            if new_ref != old_ref:
+                changes.append({
+                    "row_index": e.get("row_index", i),
+                    "field": "source_page",
+                    "old": old_ref,
+                    "new": new_ref,
+                })
+                e["source_page"] = new_ref
+                e["llm_corrected"] = True
+        if not _entry_needs_review(e):
+            all_corrected.append(e)
+            continue
+        for field_name in ("english", "german", "example"):
+            old_val = (e.get(field_name) or "").strip()
+            if not old_val:
+                continue
+
+            if _smart:
+                lang_code = _LANG_MAP.get(field_name, "en")
+                result = _smart.correct_text(old_val, lang=lang_code)
+                new_val = result.corrected
+                was_changed = result.changed
+            else:
+                lang = "german" if field_name in ("german", "example") else "english"
+                new_val, was_changed = _spell_fix_field(old_val, field=lang)
+
+            if was_changed and new_val != old_val:
+                changes.append({
+                    "row_index": e.get("row_index", i),
+                    "field": field_name,
+                    "old": old_val,
+                    "new": new_val,
+                })
+                e[field_name] = new_val
+                e["llm_corrected"] = True
+        all_corrected.append(e)
+    duration_ms = int((time.time() - t0) * 1000)
+    model_name = "smart-spell-checker" if _smart else "spell-checker"
+    return {
+        "entries_original": entries,
+        "entries_corrected": all_corrected,
+        "changes": changes,
+        "skipped_count": 0,
+        "model_used": model_name,
+        "duration_ms": duration_ms,
+    }
+
+
+async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
+    """Async generator yielding SSE-compatible events for spell-checker review."""
+    total = len(entries)
+    yield {
+        "type": "meta",
+        "total_entries": total,
+        "to_review": total,
+        "skipped": 0,
+        "model": "spell-checker",
+        "batch_size": batch_size,
+    }
+    result = spell_review_entries_sync(entries)
+    changes = result["changes"]
+    yield {
+        "type": "batch",
+        "batch_index": 0,
+        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
+        "changes": changes,
+        "duration_ms": result["duration_ms"],
+        "progress": {"current": total, "total": total},
+    }
+    yield {
+        "type": "complete",
+        "changes": changes,
+        "model_used": "spell-checker",
+        "duration_ms": result["duration_ms"],
+        "total_entries": total,
+        "reviewed": total,
+        "skipped": 0,
+        "corrections_found": len(changes),
+        "entries_corrected": result["entries_corrected"],
+    }
@@ -0,0 +1,215 @@
+"""
+Shared types, constants, and availability guards for the CV vocabulary pipeline.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import json
+import logging
+import os
+import re  # noqa: F401 — re-exported for downstream modules
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import numpy as np  # noqa: F401
+
+logger = logging.getLogger(__name__)
+
+# --- Availability Guards ---
+
+try:
+    import cv2  # noqa: F401
+    CV2_AVAILABLE = True
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+    CV2_AVAILABLE = False
+    logger.warning("OpenCV not available — CV pipeline disabled")
+
+try:
+    import pytesseract  # noqa: F401
+    from PIL import Image  # noqa: F401
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+    TESSERACT_AVAILABLE = False
+    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
+
+CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
+
+# --- IPA Dictionary ---
+
+IPA_AVAILABLE = False
+_ipa_convert_american = None
+_britfone_dict: Dict[str, str] = {}
+
+try:
+    import eng_to_ipa as _eng_to_ipa
+    _ipa_convert_american = _eng_to_ipa.convert
+    IPA_AVAILABLE = True
+    logger.info("eng_to_ipa available — American IPA lookup enabled")
+except ImportError:
+    logger.info("eng_to_ipa not installed — American IPA disabled")
+
+# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
+_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
+if os.path.exists(_britfone_path):
+    try:
+        with open(_britfone_path, 'r', encoding='utf-8') as f:
+            _britfone_dict = json.load(f)
+        IPA_AVAILABLE = True
+        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
+    except Exception as e:
+        logger.warning(f"Failed to load Britfone: {e}")
+else:
+    logger.info("Britfone not found — British IPA disabled")
+
+# --- German IPA Dictionary (CC-BY-SA, Wiktionary) ---
+
+DE_IPA_AVAILABLE = False
+_de_ipa_dict: Dict[str, str] = {}
+
+_de_ipa_path = os.path.join(os.path.dirname(__file__), 'data', 'de_ipa.tsv')
+if os.path.exists(_de_ipa_path):
+    try:
+        with open(_de_ipa_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.rstrip('\n').split('\t', 1)
+                if len(parts) == 2:
+                    _de_ipa_dict[parts[0]] = parts[1]
+        DE_IPA_AVAILABLE = True
+        logger.info(f"German IPA loaded — {len(_de_ipa_dict)} entries (CC-BY-SA, Wiktionary)")
+    except Exception as e:
+        logger.warning(f"Failed to load German IPA: {e}")
+else:
+    logger.info("German IPA not found — German IPA disabled")
+
+# --- epitran German fallback (MIT license) ---
+
+_epitran_de = None
+try:
+    import epitran as _epitran_module
+    _epitran_de = _epitran_module.Epitran('deu-Latn')
+    logger.info("epitran loaded — German rule-based IPA fallback enabled")
+except ImportError:
+    logger.info("epitran not installed — German IPA fallback disabled")
+except Exception as e:
+    logger.warning(f"Failed to init epitran: {e}")
+
+# --- Language Detection Constants ---
+
+GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
+    'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
+    'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
+    'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
+    'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}
+
+ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
+    'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
+    'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+    'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
+    'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
+
+
+# --- Data Classes ---
+
+@dataclass
+class PageRegion:
+    """A detected, typed rectangular region on the page.
+
+    The rectangle is given by ``x``, ``y``, ``width`` and ``height``
+    (presumably absolute pixels in the analysed image -- confirm with the
+    layout code that creates these objects).
+    """
+    # Region kind.  Known values: 'column_en', 'column_de', 'column_example',
+    # 'page_ref', 'column_marker', 'column_text', 'header', 'footer',
+    # 'margin_top', 'margin_bottom', 'column_headword', 'column_article',
+    # 'column_ipa'.
+    type: str
+    x: int
+    y: int
+    width: int
+    height: int
+    classification_confidence: float = 1.0   # 0.0-1.0
+    classification_method: str = ""          # 'content', 'position_enhanced', 'position_fallback'
+
+
+@dataclass
+class ColumnGeometry:
+    """Geometrically detected column, before type classification."""
+    index: int              # 0-based, left to right
+    x: int
+    y: int
+    width: int
+    height: int
+    word_count: int
+    words: List[Dict]       # word dicts from Tesseract (text, conf, left, top, ...)
+    width_ratio: float      # width / content_width (0.0-1.0)
+    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
+
+
+@dataclass
+class RowGeometry:
+    """Geometrically detected row with header/footer classification."""
+    index: int              # 0-based, top to bottom
+    x: int                  # absolute left (= content left_x)
+    y: int                  # absolute y start
+    width: int              # content width
+    height: int             # row height in px
+    word_count: int
+    words: List[Dict]
+    row_type: str = 'content'  # 'content' | 'header' | 'footer'
+    gap_before: int = 0     # gap in px above this row
+
+
+@dataclass
+class VocabRow:
+    """A single vocabulary entry assembled from multi-column OCR.
+
+    All fields default to empty/zero so a row can be created first and
+    filled column by column.
+    """
+    english: str = ""
+    german: str = ""
+    example: str = ""
+    source_page: str = ""     # presumably a page reference such as 'p.60' -- confirm with the producer
+    confidence: float = 0.0   # NOTE(review): scale not visible here; the pipeline rounds it to one decimal
+    y_position: int = 0       # presumably the vertical position of the row on the page -- confirm
+
+
+@dataclass
+class PipelineResult:
+    """Complete result of the CV pipeline.
+
+    Every field has a default, so an empty result can be created up front
+    and filled stage by stage.
+    """
+    vocabulary: List[Dict[str, Any]] = field(default_factory=list)  # output entries as plain dicts
+    word_count: int = 0                 # presumably the total number of OCR words -- confirm with the pipeline
+    columns_detected: int = 0           # presumably the number of detected column regions -- confirm
+    duration_seconds: float = 0.0       # total run time in seconds
+    stages: Dict[str, float] = field(default_factory=dict)  # presumably stage name -> elapsed seconds
+    error: Optional[str] = None         # None unless an error was recorded
+    image_width: int = 0
+    image_height: int = 0
+
+
+@dataclass
+class DocumentTypeResult:
+    """Result of automatic document type detection.
+
+    ``doc_type``, ``confidence`` and ``pipeline`` are required; the two
+    collection fields default to fresh empty containers per instance.
+    """
+    doc_type: str           # 'vocab_table' | 'full_text' | 'generic_table'
+    confidence: float       # 0.0-1.0
+    pipeline: str           # 'cell_first' | 'full_page'
+    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
+    features: Dict[str, Any] = field(default_factory=dict)  # debug info
+
+
+@dataclass
+class DetectedBox:
+    """An embedded box (e.g. grammar tip, exercise) detected on the page.
+
+    The rectangle is described by ``x``, ``y``, ``width`` and ``height``.
+    """
+    x: int              # absolute pixel position
+    y: int
+    width: int
+    height: int
+    confidence: float   # 0.0-1.0
+    border_thickness: int = 0   # presumably in pixels -- confirm with the box detector
+
+
+@dataclass
+class PageZone:
+    """A horizontal zone of the page -- either normal content or a detected box.
+
+    The geometry fields are required; the remaining fields are optional
+    annotations that default to ``None`` or to fresh empty lists.
+    """
+    index: int          # 0-based, top to bottom
+    zone_type: str      # 'content' | 'box'
+    y: int              # absolute pixel y
+    height: int
+    x: int
+    width: int
+    box: Optional[DetectedBox] = None   # presumably set only for zone_type == 'box' -- confirm
+    columns: List[ColumnGeometry] = field(default_factory=list)
+    image_overlays: List[Dict] = field(default_factory=list)
+    layout_hint: Optional[str] = None   # 'left_of_vsplit', 'right_of_vsplit'
+    vsplit_group: Optional[int] = None  # group ID for side-by-side rendering
@@ -0,0 +1,404 @@
+"""
+Words-First Grid Builder (Bottom-Up).
+
+Builds a cell grid from Tesseract word_boxes directly, without requiring
+pre-detected columns or rows.  Algorithm:
+
+  1. Cluster words into columns by X-gap analysis
+  2. Cluster words into rows by Y-proximity
+  3. Build cells at (column, row) intersections
+
+Returns the same (cells, columns_meta) format as build_cell_grid_v2().
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+import statistics
+from typing import Any, Dict, List, Optional, Tuple
+
+from cv_ocr_engines import (
+    _group_words_into_lines,
+    _words_to_reading_order_text,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# 1. Column clustering
+# ---------------------------------------------------------------------------
+
+def _cluster_columns(
+    words: List[Dict],
+    img_w: int,
+    min_gap_pct: float = 3.0,
+    max_columns: Optional[int] = None,
+) -> List[Dict[str, Any]]:
+    """Cluster words into columns by finding large horizontal gaps.
+
+    Args:
+        max_columns: If set, limits the number of columns by merging
+            the closest adjacent pairs until the count matches.
+            Prevents phantom columns from degraded OCR.
+
+    Returns a list of column dicts:
+        [{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
+    sorted left-to-right.
+    """
+    if not words:
+        return []
+
+    # Sort by X center
+    sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
+
+    # Collect word heights to compute adaptive threshold
+    heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
+    median_h = statistics.median(heights) if heights else 30
+
+    # Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
+    min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
+
+    # Find X-gap boundaries between consecutive words (sorted by X-center)
+    # For each word, compute right edge; for next word, compute left edge
+    # Collect gaps with their sizes for max_columns enforcement
+    gaps: List[Tuple[float, float]] = []  # (gap_size, split_x)
+    for i in range(len(sorted_w) - 1):
+        right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
+        left_edge = sorted_w[i + 1]['left']
+        gap = left_edge - right_edge
+        if gap > min_gap_px:
+            split_x = (right_edge + left_edge) / 2
+            gaps.append((gap, split_x))
+
+    # If max_columns is set, keep only the (max_columns - 1) largest gaps
+    if max_columns and len(gaps) >= max_columns:
+        gaps.sort(key=lambda g: g[0], reverse=True)
+        gaps = gaps[:max_columns - 1]
+        logger.info(
+            f"_cluster_columns: limited to {max_columns} columns "
+            f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
+        )
+
+    boundaries = sorted(g[1] for g in gaps)
+
+    # Build column ranges from boundaries
+    col_edges = [0.0] + boundaries + [float(img_w)]
+    columns = []
+    for ci in range(len(col_edges) - 1):
+        columns.append({
+            'index': ci,
+            'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
+            'x_min': col_edges[ci],
+            'x_max': col_edges[ci + 1],
+        })
+
+    return columns
+
+
+# ---------------------------------------------------------------------------
+# 2. Row clustering
+# ---------------------------------------------------------------------------
+
+def _cluster_rows(
+    words: List[Dict],
+) -> List[Dict[str, Any]]:
+    """Cluster words into visual rows by Y-proximity.
+
+    Uses half the median word height as Y-tolerance.
+
+    Returns a list of row dicts:
+        [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
+    sorted top-to-bottom.
+    """
+    if not words:
+        return []
+
+    heights = [w['height'] for w in words if w.get('height', 0) > 0]
+    median_h = statistics.median(heights) if heights else 20
+    y_tol = max(median_h * 0.5, 5)
+
+    lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
+
+    rows = []
+    for ri, line_words in enumerate(lines):
+        y_min = min(w['top'] for w in line_words)
+        y_max = max(w['top'] + w['height'] for w in line_words)
+        rows.append({
+            'index': ri,
+            'y_min': y_min,
+            'y_max': y_max,
+            'y_center': (y_min + y_max) / 2,
+        })
+
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# 3. Build cells
+# ---------------------------------------------------------------------------
+
+def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
+    """Return column index for a word based on overlap, then center, then nearest.
+
+    Three-pass strategy (consistent with _assign_row_words_to_columns):
+    1. Overlap-based: assign to column with maximum horizontal overlap.
+    2. Midpoint-range: if no overlap, use midpoints between adjacent columns.
+    3. Nearest center: last resort fallback.
+    """
+    w_left = word['left']
+    w_right = w_left + word['width']
+    w_center = w_left + word['width'] / 2
+
+    # Pass 1: overlap-based
+    best_col = -1
+    best_overlap = 0
+    for col in columns:
+        overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min']))
+        if overlap > best_overlap:
+            best_overlap = overlap
+            best_col = col['index']
+    if best_col >= 0 and best_overlap > 0:
+        return best_col
+
+    # Pass 2: midpoint-range (non-overlapping assignment zones)
+    for ci, col in enumerate(columns):
+        if ci == 0:
+            assign_left = 0
+        else:
+            assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2
+        if ci == len(columns) - 1:
+            assign_right = float('inf')
+        else:
+            assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2
+        if assign_left <= w_center < assign_right:
+            return col['index']
+
+    # Pass 3: nearest column center
+    return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index']
+
+
+def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
+    """Return row index for a word based on its Y-center.
+
+    When rows overlap (e.g. due to tall border-ghost characters inflating
+    a row's y_max), prefer the row whose y_center is closest.
+    """
+    y_center = word['top'] + word['height'] / 2
+    # Find all rows whose y_range contains this word's center
+    matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
+    if matching:
+        return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
+    # Fallback: nearest row by Y-center
+    return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
+
+
+def _build_cells(
+    words: List[Dict],
+    columns: List[Dict],
+    rows: List[Dict],
+    img_w: int,
+    img_h: int,
+) -> List[Dict[str, Any]]:
+    """Build cell dicts from word assignments to (column, row) pairs.
+
+    Every word is assigned to one column and one row; each (column, row)
+    pair that received at least one word becomes a cell.  Pairs without
+    words produce no cell.
+
+    Args:
+        words: Word dicts with 'left', 'top', 'width', 'height' and
+            optionally 'text' and 'conf'.
+        columns: Column dicts as produced by ``_cluster_columns``.
+        rows: Row dicts as produced by ``_cluster_rows``.
+        img_w: Image width, used for the percentage bbox (0 -> all 0).
+        img_h: Image height, used for the percentage bbox (0 -> all 0).
+
+    Returns:
+        Cell dicts ordered by row index, then column index.  Each has the
+        keys cell_id, row_index, col_index, col_type, text, confidence,
+        bbox_px, bbox_pct, word_boxes, ocr_engine and is_bold.  Returns an
+        empty list when ``columns`` or ``rows`` is empty.
+    """
+    if not columns or not rows:
+        return []
+
+    # Bucket words into (col_idx, row_idx)
+    buckets: Dict[Tuple[int, int], List[Dict]] = {}
+    for w in words:
+        ci = _assign_word_to_column(w, columns)
+        ri = _assign_word_to_row(w, rows)
+        buckets.setdefault((ci, ri), []).append(w)
+
+    cells = []
+    # Emit cells row by row, and left to right within a row.
+    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
+        # The returned 'index' values are used as list positions here.
+        col = columns[ci]
+        # NOTE(review): `row` is not referenced again in this function.
+        row = rows[ri]
+
+        # Compute tight bbox from actual word positions
+        x_min = min(w['left'] for w in cell_words)
+        y_min = min(w['top'] for w in cell_words)
+        x_max = max(w['left'] + w['width'] for w in cell_words)
+        y_max = max(w['top'] + w['height'] for w in cell_words)
+        bw = x_max - x_min
+        bh = y_max - y_min
+
+        # Text from words in reading order
+        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
+
+        # Average confidence over words with a positive confidence only
+        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
+        avg_conf = sum(confs) / len(confs) if confs else 0.0
+
+        # Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
+        # PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
+        # but the overlay slide mechanism expects one box per word. Split multi-word
+        # boxes into individual word positions proportional to character length.
+        # Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
+        #
+        # Sort in reading order: group by Y (same visual line), then sort by X.
+        # Simple (top, left) sort fails when words on the same line have slightly
+        # different top values (1-6px), causing wrong word order.
+        y_tol_wb = max(10, int(bh * 0.4))  # same tolerance as used for `text` above
+        reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
+        ordered_cell_words = [w for line in reading_lines for w in line]
+
+        word_boxes = []
+        for w in ordered_cell_words:
+            raw_text = w.get('text', '').strip()
+            # Split by whitespace, at "[" boundaries (IPA), and after leading "!"
+            # e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
+            # e.g. "profit['proft]" → ["profit", "['proft]"]
+            # e.g. "!Betonung" → ["!", "Betonung"]
+            tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
+            tokens = [t for t in tokens if t]  # remove empty strings
+            if len(tokens) <= 1:
+                # Single word — keep as-is
+                word_boxes.append({
+                    'text': raw_text,
+                    'left': w['left'],
+                    'top': w['top'],
+                    'width': w['width'],
+                    'height': w['height'],
+                    'conf': w.get('conf', 0),
+                })
+            else:
+                # Multi-word phrase — split proportionally by character count
+                total_chars = sum(len(t) for t in tokens)
+                if total_chars == 0:
+                    continue
+                # Small gap between words (2% of box width per gap)
+                n_gaps = len(tokens) - 1
+                gap_px = w['width'] * 0.02
+                usable_w = w['width'] - gap_px * n_gaps
+                # `cursor` stays a float; only the stored values are rounded,
+                # so rounding errors do not accumulate across tokens.
+                cursor = w['left']
+                for t in tokens:
+                    token_w = max(1, usable_w * len(t) / total_chars)
+                    word_boxes.append({
+                        'text': t,
+                        'left': round(cursor),
+                        'top': w['top'],
+                        'width': round(token_w),
+                        'height': w['height'],
+                        'conf': w.get('conf', 0),
+                    })
+                    cursor += token_w + gap_px
+
+        cells.append({
+            'cell_id': f"R{ri:02d}_C{ci}",
+            'row_index': ri,
+            'col_index': ci,
+            'col_type': col['type'],
+            'text': text,
+            'confidence': round(avg_conf, 1),
+            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
+            # Percentages of the image size; 0 when the size is unknown (0).
+            'bbox_pct': {
+                'x': round(x_min / img_w * 100, 2) if img_w else 0,
+                'y': round(y_min / img_h * 100, 2) if img_h else 0,
+                'w': round(bw / img_w * 100, 2) if img_w else 0,
+                'h': round(bh / img_h * 100, 2) if img_h else 0,
+            },
+            'word_boxes': word_boxes,
+            'ocr_engine': 'words_first',
+            'is_bold': False,
+        })
+
+    return cells
+
+
+# ---------------------------------------------------------------------------
+# 4. Public API
+# ---------------------------------------------------------------------------
+
+def build_grid_from_words(
+    word_dicts: List[Dict],
+    img_w: int,
+    img_h: int,
+    min_confidence: int = 30,
+    box_rects: Optional[List[Dict]] = None,
+    max_columns: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Build a cell grid bottom-up from Tesseract word boxes.
+
+    Args:
+        word_dicts: Flat list of word dicts with keys:
+            text, left, top, width, height, conf
+            (absolute pixel coordinates).
+        img_w: Image width in pixels.
+        img_h: Image height in pixels.
+        min_confidence: Minimum OCR confidence to keep a word.
+        box_rects: Optional list of box dicts with keys x, y, width, height.
+            Words inside these boxes are excluded from column clustering
+            (box-internal columns are detected separately in sub-sessions).
+
+    Returns:
+        (cells, columns_meta) — same format as build_cell_grid_v2().
+        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
+        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
+    """
+    if not word_dicts:
+        logger.info("build_grid_from_words: no words — returning empty grid")
+        return [], []
+
+    # Filter by confidence
+    words = [
+        w for w in word_dicts
+        if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
+    ]
+    if not words:
+        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
+        return [], []
+
+    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
+
+    # Exclude words inside detected boxes — box columns are detected separately
+    if box_rects:
+        content_words = []
+        for w in words:
+            w_cx = w['left'] + w['width'] / 2
+            w_cy = w['top'] + w['height'] / 2
+            inside = any(
+                b['x'] <= w_cx <= b['x'] + b['width']
+                and b['y'] <= w_cy <= b['y'] + b['height']
+                for b in box_rects
+            )
+            if not inside:
+                content_words.append(w)
+        excluded = len(words) - len(content_words)
+        if excluded:
+            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
+                        excluded, len(box_rects))
+        words = content_words
+        if not words:
+            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
+            return [], []
+
+    # Step 1: cluster columns
+    columns = _cluster_columns(words, img_w, max_columns=max_columns)
+    logger.info("build_grid_from_words: %d column(s) detected%s",
+                len(columns), f" (max={max_columns})" if max_columns else "")
+
+    # Step 2: cluster rows
+    rows = _cluster_rows(words)
+    logger.info("build_grid_from_words: %d row(s) detected", len(rows))
+
+    # Step 3: build cells
+    cells = _build_cells(words, columns, rows, img_w, img_h)
+    logger.info("build_grid_from_words: %d cells built", len(cells))
+
+    # Build columns_meta in same format as build_cell_grid_v2
+    columns_meta = []
+    for col in columns:
+        x = int(col['x_min'])
+        w = int(col['x_max'] - col['x_min'])
+        columns_meta.append({
+            'index': col['index'],
+            'type': col['type'],
+            'x': x,
+            'width': w,
+        })
+
+    return cells, columns_meta