breakpilot-lehrer/klausur-service/backend/cv_cell_grid.py

"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, Generator, List, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _RE_ALPHA,
    _assign_row_words_to_columns,
    _attach_example_sentences,
    _clean_cell_text,
    _clean_cell_text_lite,
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
    _words_to_spaced_text,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment,misc]

# Minimum OCR word confidence to keep (used across multiple functions)
_MIN_WORD_CONF = 30

# ---------------------------------------------------------------------------

def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column×row intersection.

    No padding beyond cell boundaries → no neighbour bleeding.
    """
    # Display bbox: exact column × row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                            row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        # Cell crops typically have height 35-55px but width >300px.
        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
        # For very short heights (< 80px), force 3× upscale for better OCR
        # of small characters like periods, ellipsis, and phonetic symbols.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3× upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                        row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                    row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                    row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                        row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result


# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
#
# Broad columns (>= threshold): Full-page Tesseract word assignment.
#   Better for multi-word content (sentences, IPA brackets, punctuation).
#   Examples: EN vocabulary, DE translation, example sentences.
#
# Narrow columns (< threshold): Isolated cell-crop OCR.
#   Prevents neighbour bleeding from adjacent broad columns.
#   Examples: page_ref, marker, numbering columns.
#
# 15% was empirically validated across vocab table scans with 3-5 columns.
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
# The 15% boundary cleanly separates the two groups.
_NARROW_COL_THRESHOLD_PCT = 15.0


def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
    # positions without expanding to fill gaps from removed rows.  Useful for
    # overlay rendering where pixel-precise positioning matters more than
    # full-coverage OCR crops.
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                # Single full-width column (box sub-session): preserve spacing
                is_single_full_column = (
                    len(relevant_cols) == 1
                    and img_w > 0
                    and relevant_cols[0].width / img_w > 0.9
                )

                if words:
                    y_tol = max(15, row.height)
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                        logger.info(f"R{row_idx:02d}: {len(words)} words, "
                                    f"text={text!r:.100}")
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
                    if is_single_full_column:
                        logger.info(f"R{row_idx:02d}: 0 words (row has "
                                    f"{row.word_count} total, y={row.y}..{row.y+row.height})")

                # Apply noise filter — but NOT for single-column sub-sessions:
                # 1. _clean_cell_text strips trailing non-alpha tokens (e.g. €0.50,
                #    £1, €2.50) which are valid content in box layouts.
                # 2. _clean_cell_text joins tokens with single space, destroying
                #    the proportional spacing from _words_to_spaced_text.
                if not is_single_full_column:
                    text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    # Bold detection disabled: cell-level stroke-width analysis cannot
    # distinguish bold from non-bold when cells contain mixed formatting
    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
    # TODO: word-level bold detection would require per-word bounding boxes.

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta


def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        engine_name = "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells


# ---------------------------------------------------------------------------
# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
# ---------------------------------------------------------------------------

def _compute_cell_padding(col_width: int, img_w: int) -> int:
    """Adaptive padding for OCR crops based on column width.

    Narrow columns (page_ref, marker) need more surrounding context so
    Tesseract can segment characters correctly.  Wide columns keep the
    minimal 4 px padding to avoid pulling in neighbours.
    """
    col_pct = col_width / img_w * 100 if img_w > 0 else 100
    if col_pct < 5:
        return max(20, col_width // 2)
    if col_pct < 10:
        return max(12, col_width // 4)
    if col_pct < 15:
        return 8
    return 4


def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
                               max_scale: int = 3) -> np.ndarray:
    """Upscale tiny crops so Tesseract gets enough pixel data.

    If either dimension is below *min_dim*, the crop is bicubic-upscaled
    so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
    """
    h, w = crop.shape[:2]
    if h >= min_dim and w >= min_dim:
        return crop
    scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
    if scale <= 1.0:
        return crop
    new_w = int(w * scale)
    new_h = int(h * scale)
    return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)


def _select_psm_for_column(col_type: str, col_width: int,
                            row_height: int) -> int:
    """Choose the best Tesseract PSM for a given column geometry.

    - page_ref columns are almost always single short tokens → PSM 8
    - Very narrow or short cells → PSM 7 (single text line)
    - Everything else → PSM 6 (uniform block)
    """
    if col_type in ('page_ref', 'marker'):
        return 8  # single word
    if col_width < 100 or row_height < 30:
        return 7  # single line
    return 6  # uniform block


def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup."""
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                # Build a temporary full-size image with the upscaled crop
                # placed at origin so ocr_region can crop it cleanly.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # No upscaling needed, use adaptive PSM
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            # Apply same confidence filter to fallback words
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

        # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
        if not text.strip() and _run_fallback and not use_rapid:
            _fb_region = PageRegion(
                type=col.type, x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            cell_lang = lang_map.get(col.type, lang)
            psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
            if psm7_words:
                psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if psm7_words:
                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
                if p7_text.strip():
                    text = p7_text
                    avg_conf = round(
                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                    )
                    used_engine = 'cell_ocr_psm7'

        # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
        # If a narrow cell is still empty, OCR the entire row strip with
        # RapidOCR (which handles small text better) and assign words by
        # X-position overlap with this column.
        if not text.strip() and is_narrow and img_bgr is not None:
            row_region = PageRegion(
                type='_row_strip', x=0, y=row.y,
                width=img_w, height=row.height,
            )
            strip_words = ocr_region_rapid(img_bgr, row_region)
            if strip_words:
                # Filter to words overlapping this column's X-range
                col_left = col.x
                col_right = col.x + col.width
                col_words = []
                for sw in strip_words:
                    sw_left = sw.get('left', 0)
                    sw_right = sw_left + sw.get('width', 0)
                    overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                    if overlap > sw.get('width', 1) * 0.3:
                        col_words.append(sw)
                if col_words:
                    col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
                if col_words:
                    rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                    if rs_text.strip():
                        text = rs_text
                        avg_conf = round(
                            sum(w['conf'] for w in col_words) / len(col_words), 1
                        )
                        used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }


def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only single-character
    detections. A real content row always has at least one token with 2+ characters.
    """
    if row.word_count == 0:
        return True
    texts = [w.get('text', '').strip() for w in row.words]
    return all(len(t) <= 1 for t in texts)


def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row always extends to top_bound; the last row to bottom_bound.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    orig = [(r.y, r.y + r.height) for r in rows]  # snapshot before mutation

    for i, row in enumerate(rows):
        # New top: midpoint between previous row's bottom and this row's top
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        row.height = max(5, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )


def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except column_ignore)
    is intersected with every content row to produce numbered cells.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta


def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Resolve engine choice (same as build_cell_grid)
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter artifact rows + heal gaps (same logic as build_cell_grid)
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            yield cell, columns_meta, total_cells


def _cells_to_vocab_entries(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Map generic cells to vocab entries with english/german/example fields.

    Groups cells by row_index, maps col_type → field name, and produces
    one entry per row (only rows with at least one non-empty field).
    """
    # Determine image dimensions from first cell (for row-level bbox)
    col_type_to_field = {
        'column_en': 'english',
        'column_de': 'german',
        'column_example': 'example',
        'page_ref': 'source_page',
        'column_marker': 'marker',
        'column_text': 'text',  # generic single-column (box sub-sessions)
    }
    bbox_key_map = {
        'column_en': 'bbox_en',
        'column_de': 'bbox_de',
        'column_example': 'bbox_ex',
        'page_ref': 'bbox_ref',
        'column_marker': 'bbox_marker',
        'column_text': 'bbox_text',
    }

    # Group cells by row_index
    rows: Dict[int, List[Dict]] = {}
    for cell in cells:
        ri = cell['row_index']
        rows.setdefault(ri, []).append(cell)

    entries: List[Dict[str, Any]] = []
    for row_idx in sorted(rows.keys()):
        row_cells = rows[row_idx]
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'text': '',  # generic single-column (box sub-sessions)
            'source_page': '',
            'marker': '',
            'confidence': 0.0,
            'bbox': None,
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
            'bbox_ref': None,
            'bbox_marker': None,
            'bbox_text': None,
            'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
        }

        confidences = []
        for cell in row_cells:
            col_type = cell['col_type']
            field = col_type_to_field.get(col_type)
            if field:
                entry[field] = cell['text']
            bbox_field = bbox_key_map.get(col_type)
            if bbox_field:
                entry[bbox_field] = cell['bbox_pct']
            if cell['confidence'] > 0:
                confidences.append(cell['confidence'])

        # Compute row-level bbox as union of all cell bboxes
        all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
        if all_bboxes:
            min_x = min(b['x'] for b in all_bboxes)
            min_y = min(b['y'] for b in all_bboxes)
            max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
            max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
            entry['bbox'] = {
                'x': round(min_x, 2),
                'y': round(min_y, 2),
                'w': round(max_x2 - min_x, 2),
                'h': round(max_y2 - min_y, 2),
            }

        entry['confidence'] = round(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0

        # Only include if at least one mapped field has text
        has_content = any(
            entry.get(f)
            for f in col_type_to_field.values()
        )
        if has_content:
            entries.append(entry)

    return entries


# Regex: line starts with phonetic bracket content only (no real word before it)
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)


def _is_phonetic_only_text(text: str) -> bool:
    """Check if text consists only of phonetic transcription.

    Phonetic-only patterns:
      ['mani serva]   →  True
      [dɑːns]         →  True
      ["a:mand]       →  True
      almond ['a:mand] → False (has real word before bracket)
      Mandel           → False
    """
    t = text.strip()
    if not t:
        return False
    # Must contain at least one bracket
    if '[' not in t and ']' not in t:
        return False
    # Remove all bracket content and surrounding punctuation/whitespace
    without_brackets = re.sub(r"\[.*?\]", '', t)
    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
    # If nothing meaningful remains, it's phonetic-only
    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
    return len(alpha_remaining) < 2


def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that contain only phonetic transcription into previous entry.

    In dictionary pages, phonetic transcription sometimes wraps to the next
    row.  E.g.:
      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
      Row 29: EN="['mani serva]"       DE=""

    Row 29 is phonetic-only → merge into row 28's EN field.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
        if merged and _is_phonetic_only_text(en) and not de:
            prev = merged[-1]
            prev_en = (prev.get('english') or '').strip()
            # Append phonetic to previous entry's EN
            if prev_en:
                prev['english'] = prev_en + ' ' + en
            else:
                prev['english'] = en
            # If there was an example, append to previous too
            if ex:
                prev_ex = (prev.get('example') or '').strip()
                prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
            logger.debug(
                f"Merged phonetic row {entry.get('row_index')} "
                f"into previous entry: {prev['english']!r}"
            )
            continue

        merged.append(entry)

    return merged


def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries where text wraps to the next row.

    A row is a continuation of the previous entry when:
    - EN has text, but DE is empty
    - EN starts with a lowercase letter (not a new vocab entry)
    - Previous entry's EN does NOT end with a sentence terminator (.!?)
    - The continuation text has fewer than 4 words (not an example sentence)
    - The row was not already merged as phonetic

    Example:
      Row 5: EN="to put up"       DE="aufstellen"
      Row 6: EN="with sth."       DE=""
      → Merged: EN="to put up with sth."  DE="aufstellen"
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()

        if merged and en and not de:
            # Check: not phonetic (already handled)
            if _is_phonetic_only_text(en):
                merged.append(entry)
                continue

            # Check: starts with lowercase
            first_alpha = next((c for c in en if c.isalpha()), '')
            starts_lower = first_alpha and first_alpha.islower()

            # Check: fewer than 4 words (not an example sentence)
            word_count = len(en.split())
            is_short = word_count < 4

            # Check: previous entry doesn't end with sentence terminator
            prev = merged[-1]
            prev_en = (prev.get('english') or '').strip()
            prev_ends_sentence = prev_en and prev_en[-1] in '.!?'

            if starts_lower and is_short and not prev_ends_sentence:
                # Merge into previous entry
                prev['english'] = (prev_en + ' ' + en).strip()
                # Merge example if present
                ex = (entry.get('example') or '').strip()
                if ex:
                    prev_ex = (prev.get('example') or '').strip()
                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
                logger.debug(
                    f"Merged continuation row {entry.get('row_index')} "
                    f"into previous entry: {prev['english']!r}"
                )
                continue

        merged.append(entry)

    return merged


def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    # Vocab mapping: cells → entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE → examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw → {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries