Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable.
Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletion


@@ -0,0 +1,498 @@
"""
Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
Extracted from cv_cell_grid.py.
License: Apache 2.0. PRIVACY: All processing happens locally.
"""
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
_clean_cell_text_lite,
_words_to_reading_order_text,
_words_to_spaced_text,
    ocr_region,
    ocr_region_lighton,
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
_MIN_WORD_CONF,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
# ---------------------------------------------------------------------------
def _ocr_cell_crop(
row_idx: int,
col_idx: int,
row: RowGeometry,
col: PageRegion,
ocr_img: np.ndarray,
img_bgr: Optional[np.ndarray],
img_w: int,
img_h: int,
engine_name: str,
lang: str,
lang_map: Dict[str, str],
) -> Dict[str, Any]:
"""OCR a single cell by cropping the exact column x row intersection.
No padding beyond cell boundaries -> no neighbour bleeding.
"""
# Display bbox: exact column x row intersection
disp_x = col.x
disp_y = row.y
disp_w = col.width
disp_h = row.height
# Crop boundaries: add small internal padding (3px each side) to avoid
# clipping characters near column/row edges (e.g. parentheses, descenders).
# Stays within image bounds but may extend slightly beyond strict cell.
# 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
_PAD = 3
cx = max(0, disp_x - _PAD)
cy = max(0, disp_y - _PAD)
cx2 = min(img_w, disp_x + disp_w + _PAD)
cy2 = min(img_h, disp_y + disp_h + _PAD)
cw = cx2 - cx
ch = cy2 - cy
empty_cell = {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
'col_index': col_idx,
'col_type': col.type,
'text': '',
'confidence': 0.0,
'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
'bbox_pct': {
'x': round(disp_x / img_w * 100, 2) if img_w else 0,
'y': round(disp_y / img_h * 100, 2) if img_h else 0,
'w': round(disp_w / img_w * 100, 2) if img_w else 0,
'h': round(disp_h / img_h * 100, 2) if img_h else 0,
},
'ocr_engine': 'cell_crop_v2',
'is_bold': False,
}
if cw <= 0 or ch <= 0:
logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
return empty_cell
# --- Pixel-density check: skip truly empty cells ---
if ocr_img is not None:
crop = ocr_img[cy:cy + ch, cx:cx + cw]
if crop.size > 0:
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
if dark_ratio < 0.005:
logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
row_idx, col_idx, dark_ratio, cw, ch)
return empty_cell
# --- Prepare crop for OCR ---
cell_lang = lang_map.get(col.type, lang)
psm = _select_psm_for_column(col.type, col.width, row.height)
text = ''
avg_conf = 0.0
used_engine = 'cell_crop_v2'
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
words = ocr_region_trocr(img_bgr, cell_region,
handwritten=(engine_name == "trocr-handwritten"))
elif engine_name == "lighton" and img_bgr is not None:
cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
words = ocr_region_lighton(img_bgr, cell_region)
elif engine_name == "rapid" and img_bgr is not None:
# Upscale small BGR crops for RapidOCR.
bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
if bgr_crop.size == 0:
words = []
else:
crop_h, crop_w = bgr_crop.shape[:2]
if crop_h < 80:
# Force 3x upscale for short rows — small chars need more pixels
scale = 3.0
bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
interpolation=cv2.INTER_CUBIC)
else:
bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
up_h, up_w = bgr_up.shape[:2]
scale_x = up_w / max(crop_w, 1)
scale_y = up_h / max(crop_h, 1)
was_scaled = (up_w != crop_w or up_h != crop_h)
logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
words = ocr_region_rapid(bgr_up, tmp_region)
# Remap positions back to original image coords
if words and was_scaled:
for w in words:
w['left'] = int(w['left'] / scale_x) + cx
w['top'] = int(w['top'] / scale_y) + cy
w['width'] = int(w['width'] / scale_x)
w['height'] = int(w['height'] / scale_y)
elif words:
for w in words:
w['left'] += cx
w['top'] += cy
else:
# Tesseract: upscale tiny crops for better recognition
if ocr_img is not None:
crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
upscaled = _ensure_minimum_crop_size(crop_slice)
up_h, up_w = upscaled.shape[:2]
tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
# Remap word positions back to original image coordinates
if words and (up_w != cw or up_h != ch):
sx = cw / max(up_w, 1)
sy = ch / max(up_h, 1)
for w in words:
w['left'] = int(w['left'] * sx) + cx
w['top'] = int(w['top'] * sy) + cy
w['width'] = int(w['width'] * sx)
w['height'] = int(w['height'] * sy)
elif words:
for w in words:
w['left'] += cx
w['top'] += cy
else:
words = []
# Filter low-confidence words
if words:
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
if words:
y_tol = max(15, ch)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
else:
logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
row_idx, col_idx, cw, ch, psm, engine_name)
# --- PSM 7 fallback for still-empty Tesseract cells ---
if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
upscaled = _ensure_minimum_crop_size(crop_slice)
up_h, up_w = upscaled.shape[:2]
tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
if psm7_words:
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if psm7_words:
p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
if p7_text.strip():
text = p7_text
avg_conf = round(
sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
)
used_engine = 'cell_crop_v2_psm7'
# Remap PSM7 word positions back to original image coords
if up_w != cw or up_h != ch:
sx = cw / max(up_w, 1)
sy = ch / max(up_h, 1)
for w in psm7_words:
w['left'] = int(w['left'] * sx) + cx
w['top'] = int(w['top'] * sy) + cy
w['width'] = int(w['width'] * sx)
w['height'] = int(w['height'] * sy)
else:
for w in psm7_words:
w['left'] += cx
w['top'] += cy
words = psm7_words
# --- Noise filter ---
if text.strip():
pre_filter = text
text = _clean_cell_text_lite(text)
if not text:
logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
row_idx, col_idx, pre_filter)
avg_conf = 0.0
result = dict(empty_cell)
result['text'] = text
result['confidence'] = avg_conf
result['ocr_engine'] = used_engine
# Store individual word bounding boxes (absolute image coordinates)
# for pixel-accurate overlay positioning in the frontend.
if words and text.strip():
result['word_boxes'] = [
{
'text': w.get('text', ''),
'left': w['left'],
'top': w['top'],
'width': w['width'],
'height': w['height'],
'conf': w.get('conf', 0),
}
for w in words
if w.get('text', '').strip()
]
return result
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
_NARROW_COL_THRESHOLD_PCT = 15.0
# ---------------------------------------------------------------------------
# build_cell_grid_v2 — hybrid grid builder (current default)
# ---------------------------------------------------------------------------
def build_cell_grid_v2(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
row_geometries: List[RowGeometry],
img_w: int,
img_h: int,
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
    Drop-in replacement for build_cell_grid() -- same return type, same
    signature plus the optional skip_heal_gaps flag.
Strategy:
    - Broad columns (>=15% of image width): Use pre-assigned full-page Tesseract
words (from row.words). Handles IPA brackets, punctuation, sentence
continuity correctly.
    - Narrow columns (<15% of image width): Use isolated cell-crop OCR to prevent
neighbour bleeding from adjacent broad columns.
"""
engine_name = "tesseract"
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
engine_name = "rapid"
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
# Filter to content rows only
content_rows = [r for r in row_geometries if r.row_type == 'content']
if not content_rows:
logger.warning("build_cell_grid_v2: no content rows found")
return [], []
# Filter phantom rows (word_count=0) and artifact rows
before = len(content_rows)
content_rows = [r for r in content_rows if r.word_count > 0]
skipped = before - len(content_rows)
if skipped > 0:
logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
if not content_rows:
logger.warning("build_cell_grid_v2: no content rows with words found")
return [], []
before_art = len(content_rows)
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
artifact_skipped = before_art - len(content_rows)
if artifact_skipped > 0:
logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
if not content_rows:
logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
return [], []
# Filter columns
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
'margin_bottom', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
logger.warning("build_cell_grid_v2: no usable columns found")
return [], []
# Heal row gaps -- use header/footer boundaries
content_rows.sort(key=lambda r: r.y)
header_rows = [r for r in row_geometries if r.row_type == 'header']
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
if header_rows:
top_bound = max(r.y + r.height for r in header_rows)
else:
top_bound = content_rows[0].y
if footer_rows:
bottom_bound = min(r.y for r in footer_rows)
else:
bottom_bound = content_rows[-1].y + content_rows[-1].height
# skip_heal_gaps: When True, keep cell positions at their exact row geometry
# positions without expanding to fill gaps from removed rows.
if not skip_heal_gaps:
_heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
relevant_cols.sort(key=lambda c: c.x)
columns_meta = [
{'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
for ci, c in enumerate(relevant_cols)
]
lang_map = {
'column_en': 'eng',
'column_de': 'deu',
'column_example': 'eng+deu',
}
# --- Classify columns as broad vs narrow ---
narrow_col_indices = set()
for ci, col in enumerate(relevant_cols):
col_pct = (col.width / img_w * 100) if img_w > 0 else 0
if col_pct < _NARROW_COL_THRESHOLD_PCT:
narrow_col_indices.add(ci)
broad_col_count = len(relevant_cols) - len(narrow_col_indices)
logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
f"{len(narrow_col_indices)} narrow columns (cell-crop)")
# --- Phase 1: Broad columns via full-page word assignment ---
cells: List[Dict[str, Any]] = []
for row_idx, row in enumerate(content_rows):
# Assign full-page words to columns for this row
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols):
if col_idx not in narrow_col_indices:
# BROAD column: use pre-assigned full-page words
words = col_words.get(col_idx, [])
# Filter low-confidence words
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
# Single full-width column (box sub-session): preserve spacing
is_single_full_column = (
len(relevant_cols) == 1
and img_w > 0
and relevant_cols[0].width / img_w > 0.9
)
if words:
y_tol = max(15, row.height)
if is_single_full_column:
text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
logger.info(f"R{row_idx:02d}: {len(words)} words, "
f"text={text!r:.100}")
else:
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
else:
text = ''
avg_conf = 0.0
if is_single_full_column:
logger.info(f"R{row_idx:02d}: 0 words (row has "
f"{row.word_count} total, y={row.y}..{row.y+row.height})")
# Apply noise filter -- but NOT for single-column sub-sessions
if not is_single_full_column:
text = _clean_cell_text(text)
cell = {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
'col_index': col_idx,
'col_type': col.type,
'text': text,
'confidence': avg_conf,
'bbox_px': {
'x': col.x, 'y': row.y,
'w': col.width, 'h': row.height,
},
'bbox_pct': {
'x': round(col.x / img_w * 100, 2) if img_w else 0,
'y': round(row.y / img_h * 100, 2) if img_h else 0,
'w': round(col.width / img_w * 100, 2) if img_w else 0,
'h': round(row.height / img_h * 100, 2) if img_h else 0,
},
'ocr_engine': 'word_lookup',
'is_bold': False,
}
# Store word bounding boxes for pixel-accurate overlay
if words and text.strip():
cell['word_boxes'] = [
{
'text': w.get('text', ''),
'left': w['left'],
'top': w['top'],
'width': w['width'],
'height': w['height'],
'conf': w.get('conf', 0),
}
for w in words
if w.get('text', '').strip()
]
cells.append(cell)
# --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
narrow_tasks = []
for row_idx, row in enumerate(content_rows):
for col_idx, col in enumerate(relevant_cols):
if col_idx in narrow_col_indices:
narrow_tasks.append((row_idx, col_idx, row, col))
if narrow_tasks:
max_workers = 4 if engine_name == "tesseract" else 2
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = {
pool.submit(
_ocr_cell_crop,
ri, ci, row, col,
ocr_img, img_bgr, img_w, img_h,
engine_name, lang, lang_map,
): (ri, ci)
for ri, ci, row, col in narrow_tasks
}
for future in as_completed(futures):
try:
cell = future.result()
cells.append(cell)
except Exception as e:
ri, ci = futures[future]
logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
# Sort cells by (row_index, col_index)
cells.sort(key=lambda c: (c['row_index'], c['col_index']))
# Remove all-empty rows
rows_with_text: set = set()
for cell in cells:
if cell['text'].strip():
rows_with_text.add(cell['row_index'])
before_filter = len(cells)
cells = [c for c in cells if c['row_index'] in rows_with_text]
empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
if empty_rows_removed > 0:
logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
f"engine={engine_name} (hybrid)")
return cells, columns_meta
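
A minimal usage sketch, assuming the binarized page image, classified column regions, and row geometries come from the upstream layout-detection steps; all variable names here are illustrative placeholders.

from cv_cell_grid_build import build_cell_grid_v2

# Hedged sketch: ocr_img, column_regions, row_geometries, img_w, img_h and
# img_bgr are assumed to be produced by the earlier pipeline steps.
cells, columns_meta = build_cell_grid_v2(
    ocr_img, column_regions, row_geometries, img_w, img_h,
    lang="eng+deu", ocr_engine="tesseract", img_bgr=img_bgr,
)
for cell in cells[:5]:
    print(cell['cell_id'], cell['col_type'], repr(cell['text']), cell['confidence'])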


@@ -0,0 +1,60 @@
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
Re-export hub — all public and private names remain importable from here
for backward compatibility. The actual implementations live in:
cv_cell_grid_helpers.py — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
cv_cell_grid_build.py — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
cv_cell_grid_legacy.py — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
cv_cell_grid_streaming.py — streaming variants (build_cell_grid_v2_streaming, ...)
cv_cell_grid_merge.py — row-merging logic (_merge_wrapped_rows, ...)
cv_cell_grid_vocab.py — vocab extraction (_cells_to_vocab_entries, build_word_grid)
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
# --- Helpers ---
from cv_cell_grid_helpers import ( # noqa: F401
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
# --- v2 build (current default) ---
from cv_cell_grid_build import ( # noqa: F401
_NARROW_COL_THRESHOLD_PCT,
_ocr_cell_crop,
build_cell_grid_v2,
)
# --- Legacy build (DEPRECATED) ---
from cv_cell_grid_legacy import ( # noqa: F401
_ocr_single_cell,
build_cell_grid,
)
# --- Streaming variants ---
from cv_cell_grid_streaming import ( # noqa: F401
build_cell_grid_streaming,
build_cell_grid_v2_streaming,
)
# --- Row merging ---
from cv_cell_grid_merge import ( # noqa: F401
_PHONETIC_ONLY_RE,
_is_phonetic_only_text,
_merge_continuation_rows,
_merge_phonetic_continuation_rows,
_merge_wrapped_rows,
)
# --- Vocab extraction ---
from cv_cell_grid_vocab import ( # noqa: F401
_cells_to_vocab_entries,
build_word_grid,
)
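
A small sketch of the backward-compatibility guarantee the hub provides: both import paths below resolve to the same object, so existing callers need no changes.

from cv_cell_grid import build_cell_grid_v2              # via the re-export hub
from cv_cell_grid_build import build_cell_grid_v2 as v2_direct

assert build_cell_grid_v2 is v2_direct                   # same object either way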


@@ -0,0 +1,136 @@
"""
Shared helpers for cell-grid construction (v2 + legacy).
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
cv_cell_grid_legacy.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import List
import numpy as np
from cv_vocab_types import RowGeometry
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# Minimum OCR word confidence to keep (used across multiple functions)
_MIN_WORD_CONF = 30
def _compute_cell_padding(col_width: int, img_w: int) -> int:
"""Adaptive padding for OCR crops based on column width.
Narrow columns (page_ref, marker) need more surrounding context so
Tesseract can segment characters correctly. Wide columns keep the
minimal 4 px padding to avoid pulling in neighbours.
"""
col_pct = col_width / img_w * 100 if img_w > 0 else 100
if col_pct < 5:
return max(20, col_width // 2)
if col_pct < 10:
return max(12, col_width // 4)
if col_pct < 15:
return 8
return 4
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
max_scale: int = 3) -> np.ndarray:
"""Upscale tiny crops so Tesseract gets enough pixel data.
If either dimension is below *min_dim*, the crop is bicubic-upscaled
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
"""
h, w = crop.shape[:2]
if h >= min_dim and w >= min_dim:
return crop
    scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
    if scale <= 1.0 or cv2 is None:  # cv2 missing -> return the crop unscaled
        return crop
new_w = int(w * scale)
new_h = int(h * scale)
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
def _select_psm_for_column(col_type: str, col_width: int,
row_height: int) -> int:
"""Choose the best Tesseract PSM for a given column geometry.
- page_ref columns are almost always single short tokens -> PSM 8
- Very narrow or short cells -> PSM 7 (single text line)
- Everything else -> PSM 6 (uniform block)
"""
if col_type in ('page_ref', 'marker'):
return 8 # single word
if col_width < 100 or row_height < 30:
return 7 # single line
return 6 # uniform block
def _is_artifact_row(row: RowGeometry) -> bool:
"""Return True if this row contains only scan artifacts, not real text.
Artifact rows (scanner shadows, noise) typically produce only single-character
detections. A real content row always has at least one token with 2+ characters.
"""
if row.word_count == 0:
return True
texts = [w.get('text', '').strip() for w in row.words]
return all(len(t) <= 1 for t in texts)
def _heal_row_gaps(
rows: List[RowGeometry],
top_bound: int,
bottom_bound: int,
) -> None:
"""Expand row y/height to fill vertical gaps caused by removed adjacent rows.
After filtering out empty or artifact rows, remaining content rows may have
gaps between them where the removed rows used to be. This function mutates
each row to extend upward/downward to the midpoint of such gaps so that
OCR crops cover the full available content area.
The first row always extends to top_bound; the last row to bottom_bound.
"""
if not rows:
return
rows.sort(key=lambda r: r.y)
n = len(rows)
orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation
for i, row in enumerate(rows):
# New top: midpoint between previous row's bottom and this row's top
if i == 0:
new_top = top_bound
else:
prev_bot = orig[i - 1][1]
my_top = orig[i][0]
gap = my_top - prev_bot
new_top = prev_bot + gap // 2 if gap > 1 else my_top
# New bottom: midpoint between this row's bottom and next row's top
if i == n - 1:
new_bottom = bottom_bound
else:
my_bot = orig[i][1]
next_top = orig[i + 1][0]
gap = next_top - my_bot
new_bottom = my_bot + gap // 2 if gap > 1 else my_bot
row.y = new_top
row.height = max(5, new_bottom - new_top)
logger.debug(
f"_heal_row_gaps: {n} rows -> y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
f"(bounds: top={top_bound}, bottom={bottom_bound})"
)
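
The padding and PSM heuristics above are pure functions, so a few illustrative inputs make the thresholds concrete (values chosen for this sketch, not taken from a test suite):

from cv_cell_grid_helpers import _compute_cell_padding, _select_psm_for_column

print(_select_psm_for_column('page_ref', 60, 25))    # 8: single short token
print(_select_psm_for_column('column_en', 90, 40))   # 7: narrow/short cell, single line
print(_select_psm_for_column('column_en', 400, 45))  # 6: uniform block
print(_compute_cell_padding(60, 2000))               # 30: a 3% column gets max(20, 60 // 2)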


@@ -0,0 +1,436 @@
"""
Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
_words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
# ---------------------------------------------------------------------------
def _ocr_single_cell(
row_idx: int,
col_idx: int,
row: RowGeometry,
col: PageRegion,
ocr_img: np.ndarray,
img_bgr: Optional[np.ndarray],
img_w: int,
img_h: int,
use_rapid: bool,
engine_name: str,
lang: str,
lang_map: Dict[str, str],
preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
"""Populate a single cell (column x row intersection) via word lookup."""
# Display bbox: exact column x row intersection (no padding)
disp_x = col.x
disp_y = row.y
disp_w = col.width
disp_h = row.height
# OCR crop: adaptive padding -- narrow columns get more context
pad = _compute_cell_padding(col.width, img_w)
cell_x = max(0, col.x - pad)
cell_y = max(0, row.y - pad)
cell_w = min(col.width + 2 * pad, img_w - cell_x)
cell_h = min(row.height + 2 * pad, img_h - cell_y)
is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False
if disp_w <= 0 or disp_h <= 0:
return {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
'col_index': col_idx,
'col_type': col.type,
'text': '',
'confidence': 0.0,
'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
'bbox_pct': {
'x': round(col.x / img_w * 100, 2),
'y': round(row.y / img_h * 100, 2),
'w': round(col.width / img_w * 100, 2),
'h': round(row.height / img_h * 100, 2),
},
'ocr_engine': 'word_lookup',
}
# --- PRIMARY: Word-lookup from full-page Tesseract ---
words = preassigned_words if preassigned_words is not None else []
used_engine = 'word_lookup'
# Filter low-confidence words
if words:
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
if words:
y_tol = max(15, row.height)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
else:
text = ''
avg_conf = 0.0
# --- FALLBACK: Cell-OCR for empty cells ---
_run_fallback = False
if not text.strip() and cell_w > 0 and cell_h > 0:
if ocr_img is not None:
crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
if crop.size > 0:
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
_run_fallback = dark_ratio > 0.005
if _run_fallback:
# For narrow columns, upscale the crop before OCR
if is_narrow and ocr_img is not None:
_crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
_upscaled = _ensure_minimum_crop_size(_crop_slice)
if _upscaled is not _crop_slice:
_up_h, _up_w = _upscaled.shape[:2]
_tmp_region = PageRegion(
type=col.type, x=0, y=0, width=_up_w, height=_up_h,
)
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
cell_lang = lang_map.get(col.type, lang)
fallback_words = ocr_region(_upscaled, _tmp_region,
lang=cell_lang, psm=_cell_psm)
# Remap word positions back to original image coordinates
_sx = cell_w / max(_up_w, 1)
_sy = cell_h / max(_up_h, 1)
for _fw in (fallback_words or []):
_fw['left'] = int(_fw['left'] * _sx) + cell_x
_fw['top'] = int(_fw['top'] * _sy) + cell_y
_fw['width'] = int(_fw['width'] * _sx)
_fw['height'] = int(_fw['height'] * _sy)
else:
cell_region = PageRegion(
type=col.type, x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
cell_lang = lang_map.get(col.type, lang)
fallback_words = ocr_region(ocr_img, cell_region,
lang=cell_lang, psm=_cell_psm)
else:
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
elif engine_name == "lighton" and img_bgr is not None:
fallback_words = ocr_region_lighton(img_bgr, cell_region)
elif use_rapid and img_bgr is not None:
fallback_words = ocr_region_rapid(img_bgr, cell_region)
else:
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
cell_lang = lang_map.get(col.type, lang)
fallback_words = ocr_region(ocr_img, cell_region,
lang=cell_lang, psm=_cell_psm)
if fallback_words:
fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if fallback_words:
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
fb_y_tol = max(10, int(fb_avg_h * 0.5))
fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
if fb_text.strip():
text = fb_text
avg_conf = round(
sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
)
used_engine = 'cell_ocr_fallback'
# --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
if not text.strip() and _run_fallback and not use_rapid:
_fb_region = PageRegion(
type=col.type, x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
cell_lang = lang_map.get(col.type, lang)
psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
if psm7_words:
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if psm7_words:
p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
if p7_text.strip():
text = p7_text
avg_conf = round(
sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
)
used_engine = 'cell_ocr_psm7'
# --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
if not text.strip() and is_narrow and img_bgr is not None:
row_region = PageRegion(
type='_row_strip', x=0, y=row.y,
width=img_w, height=row.height,
)
strip_words = ocr_region_rapid(img_bgr, row_region)
if strip_words:
col_left = col.x
col_right = col.x + col.width
col_words = []
for sw in strip_words:
sw_left = sw.get('left', 0)
sw_right = sw_left + sw.get('width', 0)
overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
if overlap > sw.get('width', 1) * 0.3:
col_words.append(sw)
if col_words:
col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if col_words:
rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
if rs_text.strip():
text = rs_text
avg_conf = round(
sum(w['conf'] for w in col_words) / len(col_words), 1
)
used_engine = 'row_strip_rapid'
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
if text.strip():
text = _clean_cell_text(text)
if not text:
avg_conf = 0.0
return {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
'col_index': col_idx,
'col_type': col.type,
'text': text,
'confidence': avg_conf,
'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
'bbox_pct': {
'x': round(disp_x / img_w * 100, 2),
'y': round(disp_y / img_h * 100, 2),
'w': round(disp_w / img_w * 100, 2),
'h': round(disp_h / img_h * 100, 2),
},
'ocr_engine': used_engine,
}
# ---------------------------------------------------------------------------
# build_cell_grid — legacy grid builder (DEPRECATED)
# ---------------------------------------------------------------------------
def build_cell_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
row_geometries: List[RowGeometry],
img_w: int,
img_h: int,
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Generic Cell-Grid: Columns x Rows -> cells with OCR text.
DEPRECATED: Use build_cell_grid_v2 instead.
"""
# Resolve engine choice
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
# Filter to content rows only (skip header/footer)
content_rows = [r for r in row_geometries if r.row_type == 'content']
if not content_rows:
logger.warning("build_cell_grid: no content rows found")
return [], []
before = len(content_rows)
content_rows = [r for r in content_rows if r.word_count > 0]
skipped = before - len(content_rows)
if skipped > 0:
logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
if not content_rows:
logger.warning("build_cell_grid: no content rows with words found")
return [], []
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
logger.warning("build_cell_grid: no usable columns found")
return [], []
before_art = len(content_rows)
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
artifact_skipped = before_art - len(content_rows)
if artifact_skipped > 0:
logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
if not content_rows:
logger.warning("build_cell_grid: no content rows after artifact filtering")
return [], []
_heal_row_gaps(
content_rows,
top_bound=min(c.y for c in relevant_cols),
bottom_bound=max(c.y + c.height for c in relevant_cols),
)
relevant_cols.sort(key=lambda c: c.x)
columns_meta = [
{
'index': col_idx,
'type': col.type,
'x': col.x,
'width': col.width,
}
for col_idx, col in enumerate(relevant_cols)
]
lang_map = {
'column_en': 'eng',
'column_de': 'deu',
'column_example': 'eng+deu',
}
cells: List[Dict[str, Any]] = []
for row_idx, row in enumerate(content_rows):
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols):
cell = _ocr_single_cell(
row_idx, col_idx, row, col,
ocr_img, img_bgr, img_w, img_h,
use_rapid, engine_name, lang, lang_map,
preassigned_words=col_words[col_idx],
)
cells.append(cell)
# --- BATCH FALLBACK: re-OCR empty cells by column strip ---
empty_by_col: Dict[int, List[int]] = {}
for ci, cell in enumerate(cells):
if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
bpx = cell['bbox_px']
x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
if w > 0 and h > 0 and ocr_img is not None:
crop = ocr_img[y:y + h, x:x + w]
if crop.size > 0:
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
if dark_ratio > 0.005:
empty_by_col.setdefault(cell['col_index'], []).append(ci)
for col_idx, cell_indices in empty_by_col.items():
if len(cell_indices) < 3:
continue
min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
col_x = cells[cell_indices[0]]['bbox_px']['x']
col_w = cells[cell_indices[0]]['bbox_px']['w']
strip_region = PageRegion(
type=relevant_cols[col_idx].type,
x=col_x, y=min_y,
width=col_w, height=max_y_h - min_y,
)
strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
elif engine_name == "lighton" and img_bgr is not None:
strip_words = ocr_region_lighton(img_bgr, strip_region)
elif use_rapid and img_bgr is not None:
strip_words = ocr_region_rapid(img_bgr, strip_region)
else:
strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
if not strip_words:
continue
        strip_words = [w for w in strip_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if not strip_words:
continue
for ci in cell_indices:
cell_y = cells[ci]['bbox_px']['y']
cell_h = cells[ci]['bbox_px']['h']
cell_mid_y = cell_y + cell_h / 2
matched_words = [
w for w in strip_words
if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
]
if matched_words:
matched_words.sort(key=lambda w: w['left'])
batch_text = ' '.join(w['text'] for w in matched_words)
batch_text = _clean_cell_text(batch_text)
if batch_text.strip():
cells[ci]['text'] = batch_text
cells[ci]['confidence'] = round(
sum(w['conf'] for w in matched_words) / len(matched_words), 1
)
cells[ci]['ocr_engine'] = 'batch_column_ocr'
batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
if batch_filled > 0:
logger.info(
f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
f"empty cells in column {col_idx}"
)
# Remove all-empty rows
rows_with_text: set = set()
for cell in cells:
if cell['text'].strip():
rows_with_text.add(cell['row_index'])
before_filter = len(cells)
cells = [c for c in cells if c['row_index'] in rows_with_text]
empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
if empty_rows_removed > 0:
logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
logger.info(f"build_cell_grid: {len(cells)} cells from "
f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
f"engine={engine_name}")
return cells, columns_meta
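
The dark-ratio probe that gates the fallbacks in both builders is easy to see in isolation. A synthetic sketch (the array below is made up for illustration):

import numpy as np

crop = np.full((40, 120), 255, dtype=np.uint8)   # all-white cell
crop[18:22, 10:30] = 0                           # one short dark stroke
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
print(round(dark_ratio, 4))   # 0.0167
print(dark_ratio > 0.005)     # True: this cell would be OCR'd, not skipped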


@@ -0,0 +1,235 @@
"""
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
Extracted from cv_cell_grid.py.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _RE_ALPHA
logger = logging.getLogger(__name__)
# Regex: line starts with phonetic bracket content only (no real word before it)
_PHONETIC_ONLY_RE = re.compile(
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
def _is_phonetic_only_text(text: str) -> bool:
"""Check if text consists only of phonetic transcription.
Phonetic-only patterns:
['mani serva] -> True
[dance] -> True
["a:mand] -> True
almond ['a:mand] -> False (has real word before bracket)
Mandel -> False
"""
t = text.strip()
if not t:
return False
# Must contain at least one bracket
if '[' not in t and ']' not in t:
return False
# Remove all bracket content and surrounding punctuation/whitespace
without_brackets = re.sub(r"\[.*?\]", '', t)
without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
# If nothing meaningful remains, it's phonetic-only
alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
return len(alpha_remaining) < 2
def _merge_phonetic_continuation_rows(
entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Merge rows that contain only phonetic transcription into previous entry.
In dictionary pages, phonetic transcription sometimes wraps to the next
row. E.g.:
Row 28: EN="it's a money-saver" DE="es spart Kosten"
Row 29: EN="['mani serva]" DE=""
Row 29 is phonetic-only -> merge into row 28's EN field.
"""
if len(entries) < 2:
return entries
merged: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english') or '').strip()
de = (entry.get('german') or '').strip()
ex = (entry.get('example') or '').strip()
# Check if this entry is phonetic-only (EN has only phonetics, DE empty)
if merged and _is_phonetic_only_text(en) and not de:
prev = merged[-1]
prev_en = (prev.get('english') or '').strip()
# Append phonetic to previous entry's EN
if prev_en:
prev['english'] = prev_en + ' ' + en
else:
prev['english'] = en
# If there was an example, append to previous too
if ex:
prev_ex = (prev.get('example') or '').strip()
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
logger.debug(
f"Merged phonetic row {entry.get('row_index')} "
f"into previous entry: {prev['english']!r}"
)
continue
merged.append(entry)
return merged
def _merge_wrapped_rows(
entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Merge rows where the primary column (EN) is empty -- cell wrap continuation.
In textbook vocabulary tables, columns are often narrow, so the author
wraps text within a cell. OCR treats each physical line as a separate row.
The key indicator: if the EN column is empty but DE/example have text,
this row is a continuation of the previous row's cells.
Example (original textbook has ONE row):
Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
Row 3: EN="" DE="(bei)" EX="part in the concert."
-> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
Also handles the reverse case: DE empty but EN has text (wrap in EN column).
"""
if len(entries) < 2:
return entries
merged: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english') or '').strip()
de = (entry.get('german') or '').strip()
ex = (entry.get('example') or '').strip()
if not merged:
merged.append(entry)
continue
prev = merged[-1]
prev_en = (prev.get('english') or '').strip()
prev_de = (prev.get('german') or '').strip()
prev_ex = (prev.get('example') or '').strip()
# Case 1: EN is empty -> continuation of previous row
if not en and (de or ex) and prev_en:
            if de:
                # Join with a space unless the previous DE text ends mid-token ('-' or '(').
                sep = '' if prev_de.endswith(('-', '(')) else ' '
                prev['german'] = (prev_de + sep + de).strip()
if ex:
sep = ' ' if prev_ex else ''
prev['example'] = (prev_ex + sep + ex).strip()
logger.debug(
f"Merged wrapped row {entry.get('row_index')} into previous "
f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
)
continue
# Case 2: DE is empty, EN has text that looks like continuation
if en and not de and prev_de:
is_paren = en.startswith('(')
first_alpha = next((c for c in en if c.isalpha()), '')
starts_lower = first_alpha and first_alpha.islower()
if (is_paren or starts_lower) and len(en.split()) < 5:
sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
prev['english'] = (prev_en + sep + en).strip()
if ex:
sep2 = ' ' if prev_ex else ''
prev['example'] = (prev_ex + sep2 + ex).strip()
logger.debug(
f"Merged wrapped row {entry.get('row_index')} into previous "
f"(empty DE): EN={prev['english']!r}"
)
continue
merged.append(entry)
if len(merged) < len(entries):
logger.info(
f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
f"continuation rows ({len(entries)} -> {len(merged)})"
)
return merged
def _merge_continuation_rows(
entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Merge multi-line vocabulary entries where text wraps to the next row.
A row is a continuation of the previous entry when:
- EN has text, but DE is empty
- EN starts with a lowercase letter (not a new vocab entry)
- Previous entry's EN does NOT end with a sentence terminator (.!?)
- The continuation text has fewer than 4 words (not an example sentence)
- The row was not already merged as phonetic
Example:
Row 5: EN="to put up" DE="aufstellen"
Row 6: EN="with sth." DE=""
-> Merged: EN="to put up with sth." DE="aufstellen"
"""
if len(entries) < 2:
return entries
merged: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english') or '').strip()
de = (entry.get('german') or '').strip()
if merged and en and not de:
# Check: not phonetic (already handled)
if _is_phonetic_only_text(en):
merged.append(entry)
continue
# Check: starts with lowercase
first_alpha = next((c for c in en if c.isalpha()), '')
starts_lower = first_alpha and first_alpha.islower()
# Check: fewer than 4 words (not an example sentence)
word_count = len(en.split())
is_short = word_count < 4
# Check: previous entry doesn't end with sentence terminator
prev = merged[-1]
prev_en = (prev.get('english') or '').strip()
prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
if starts_lower and is_short and not prev_ends_sentence:
# Merge into previous entry
prev['english'] = (prev_en + ' ' + en).strip()
# Merge example if present
ex = (entry.get('example') or '').strip()
if ex:
prev_ex = (prev.get('example') or '').strip()
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
logger.debug(
f"Merged continuation row {entry.get('row_index')} "
f"into previous entry: {prev['english']!r}"
)
continue
merged.append(entry)
return merged
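
A worked sketch of _merge_wrapped_rows on the docstring's own example, using hand-built entry dicts (illustrative only):

from cv_cell_grid_merge import _merge_wrapped_rows

entries = [
    {'row_index': 2, 'english': 'take part (in)',
     'german': 'teilnehmen (an), mitmachen', 'example': 'More than 200 singers took'},
    {'row_index': 3, 'english': '', 'german': '(bei)', 'example': 'part in the concert.'},
]
merged = _merge_wrapped_rows(entries)
print(len(merged))           # 1
print(merged[0]['german'])   # teilnehmen (an), mitmachen (bei)
print(merged[0]['example'])  # More than 200 singers took part in the concert.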


@@ -0,0 +1,217 @@
"""
Streaming variants of cell-grid builders (v2 + legacy).
Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
useful for progress reporting.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Any, Dict, Generator, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
)
from cv_cell_grid_helpers import (
_heal_row_gaps,
_is_artifact_row,
)
from cv_cell_grid_build import _ocr_cell_crop
from cv_cell_grid_legacy import _ocr_single_cell
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# build_cell_grid_v2_streaming
# ---------------------------------------------------------------------------
def build_cell_grid_v2_streaming(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
row_geometries: List[RowGeometry],
img_w: int,
img_h: int,
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
"""Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.
Yields:
(cell_dict, columns_meta, total_cells)
"""
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
content_rows = [r for r in row_geometries if r.row_type == 'content']
if not content_rows:
return
content_rows = [r for r in content_rows if r.word_count > 0]
if not content_rows:
return
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
'margin_bottom', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
return
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
if not content_rows:
return
# Use header/footer boundaries for heal_row_gaps
content_rows.sort(key=lambda r: r.y)
header_rows = [r for r in row_geometries if r.row_type == 'header']
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
if header_rows:
top_bound = max(r.y + r.height for r in header_rows)
else:
top_bound = content_rows[0].y
if footer_rows:
bottom_bound = min(r.y for r in footer_rows)
else:
bottom_bound = content_rows[-1].y + content_rows[-1].height
_heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
relevant_cols.sort(key=lambda c: c.x)
columns_meta = [
{'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
for ci, c in enumerate(relevant_cols)
]
lang_map = {
'column_en': 'eng',
'column_de': 'deu',
'column_example': 'eng+deu',
}
total_cells = len(content_rows) * len(relevant_cols)
for row_idx, row in enumerate(content_rows):
for col_idx, col in enumerate(relevant_cols):
cell = _ocr_cell_crop(
row_idx, col_idx, row, col,
ocr_img, img_bgr, img_w, img_h,
engine_name, lang, lang_map,
)
yield cell, columns_meta, total_cells
# ---------------------------------------------------------------------------
# build_cell_grid_streaming — legacy streaming variant
# ---------------------------------------------------------------------------
def build_cell_grid_streaming(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
row_geometries: List[RowGeometry],
img_w: int,
img_h: int,
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
"""Like build_cell_grid(), but yields each cell as it is OCR'd.
DEPRECATED: Use build_cell_grid_v2_streaming instead.
Yields:
(cell_dict, columns_meta, total_cells) for each cell.
"""
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
content_rows = [r for r in row_geometries if r.row_type == 'content']
if not content_rows:
return
before = len(content_rows)
content_rows = [r for r in content_rows if r.word_count > 0]
skipped = before - len(content_rows)
if skipped > 0:
logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
if not content_rows:
return
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
return
before_art = len(content_rows)
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
artifact_skipped = before_art - len(content_rows)
if artifact_skipped > 0:
logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
if not content_rows:
return
_heal_row_gaps(
content_rows,
top_bound=min(c.y for c in relevant_cols),
bottom_bound=max(c.y + c.height for c in relevant_cols),
)
relevant_cols.sort(key=lambda c: c.x)
columns_meta = [
{
'index': col_idx,
'type': col.type,
'x': col.x,
'width': col.width,
}
for col_idx, col in enumerate(relevant_cols)
]
lang_map = {
'column_en': 'eng',
'column_de': 'deu',
'column_example': 'eng+deu',
}
total_cells = len(content_rows) * len(relevant_cols)
for row_idx, row in enumerate(content_rows):
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols):
cell = _ocr_single_cell(
row_idx, col_idx, row, col,
ocr_img, img_bgr, img_w, img_h,
use_rapid, engine_name, lang, lang_map,
preassigned_words=col_words[col_idx],
)
yield cell, columns_meta, total_cells
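
A sketch of the progress-reporting consumer the module docstring has in mind; the inputs are assumed to come from the usual upstream steps:

from cv_cell_grid_streaming import build_cell_grid_v2_streaming

done = 0
for cell, columns_meta, total in build_cell_grid_v2_streaming(
        ocr_img, column_regions, row_geometries, img_w, img_h):
    done += 1
    print(f"{done}/{total}  {cell['cell_id']}: {cell['text']!r}")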


@@ -0,0 +1,200 @@
"""
Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
Extracted from cv_cell_grid.py.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Any, Dict, List
from cv_ocr_engines import (
_attach_example_sentences,
_fix_phonetic_brackets,
_split_comma_entries,
)
from cv_cell_grid_legacy import build_cell_grid
from cv_cell_grid_merge import (
_merge_continuation_rows,
_merge_phonetic_continuation_rows,
_merge_wrapped_rows,
)
logger = logging.getLogger(__name__)
def _cells_to_vocab_entries(
cells: List[Dict[str, Any]],
columns_meta: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Map generic cells to vocab entries with english/german/example fields.
Groups cells by row_index, maps col_type -> field name, and produces
one entry per row (only rows with at least one non-empty field).
"""
col_type_to_field = {
'column_en': 'english',
'column_de': 'german',
'column_example': 'example',
'page_ref': 'source_page',
'column_marker': 'marker',
'column_text': 'text', # generic single-column (box sub-sessions)
}
bbox_key_map = {
'column_en': 'bbox_en',
'column_de': 'bbox_de',
'column_example': 'bbox_ex',
'page_ref': 'bbox_ref',
'column_marker': 'bbox_marker',
'column_text': 'bbox_text',
}
# Group cells by row_index
rows: Dict[int, List[Dict]] = {}
for cell in cells:
ri = cell['row_index']
rows.setdefault(ri, []).append(cell)
entries: List[Dict[str, Any]] = []
for row_idx in sorted(rows.keys()):
row_cells = rows[row_idx]
entry: Dict[str, Any] = {
'row_index': row_idx,
'english': '',
'german': '',
'example': '',
'text': '', # generic single-column (box sub-sessions)
'source_page': '',
'marker': '',
'confidence': 0.0,
'bbox': None,
'bbox_en': None,
'bbox_de': None,
'bbox_ex': None,
'bbox_ref': None,
'bbox_marker': None,
'bbox_text': None,
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
}
confidences = []
for cell in row_cells:
col_type = cell['col_type']
field = col_type_to_field.get(col_type)
if field:
entry[field] = cell['text']
bbox_field = bbox_key_map.get(col_type)
if bbox_field:
entry[bbox_field] = cell['bbox_pct']
if cell['confidence'] > 0:
confidences.append(cell['confidence'])
# Compute row-level bbox as union of all cell bboxes
all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
if all_bboxes:
min_x = min(b['x'] for b in all_bboxes)
min_y = min(b['y'] for b in all_bboxes)
max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
entry['bbox'] = {
'x': round(min_x, 2),
'y': round(min_y, 2),
'w': round(max_x2 - min_x, 2),
'h': round(max_y2 - min_y, 2),
}
entry['confidence'] = round(
sum(confidences) / len(confidences), 1
) if confidences else 0.0
# Only include if at least one mapped field has text
has_content = any(
entry.get(f)
for f in col_type_to_field.values()
)
if has_content:
entries.append(entry)
return entries
def build_word_grid(
ocr_img,
column_regions,
row_geometries,
img_w: int,
img_h: int,
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr=None,
pronunciation: str = "british",
) -> List[Dict[str, Any]]:
"""Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.
Wrapper around build_cell_grid() that adds vocabulary-specific logic:
- Maps cells to english/german/example entries
- Applies character confusion fixes, IPA lookup, comma splitting, etc.
- Falls back to returning raw cells if no vocab columns detected.
Args:
ocr_img: Binarized full-page image (for Tesseract).
column_regions: Classified columns from Step 3.
row_geometries: Rows from Step 4.
img_w, img_h: Image dimensions.
lang: Default Tesseract language.
ocr_engine: 'tesseract', 'rapid', or 'auto'.
img_bgr: BGR color image (required for RapidOCR).
pronunciation: 'british' or 'american' for IPA lookup.
Returns:
List of entry dicts with english/german/example text and bbox info (percent).
"""
cells, columns_meta = build_cell_grid(
ocr_img, column_regions, row_geometries, img_w, img_h,
lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
)
if not cells:
return []
# Check if vocab layout is present
col_types = {c['type'] for c in columns_meta}
if not (col_types & {'column_en', 'column_de'}):
logger.info("build_word_grid: no vocab columns -- returning raw cells")
return cells
# Vocab mapping: cells -> entries
entries = _cells_to_vocab_entries(cells, columns_meta)
# --- Post-processing pipeline (deterministic, no LLM) ---
n_raw = len(entries)
# 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
entries = _merge_wrapped_rows(entries)
# 0a. Merge phonetic-only continuation rows into previous entry
entries = _merge_phonetic_continuation_rows(entries)
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
entries = _merge_continuation_rows(entries)
# 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
# llm_review_entries_streaming so changes are visible to the user in Step 6.
# 2. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
# 3. Split comma-separated word forms (break, broke, broken -> 3 entries)
entries = _split_comma_entries(entries)
# 4. Attach example sentences (rows without DE -> examples for preceding entry)
entries = _attach_example_sentences(entries)
engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
logger.info(f"build_word_grid: {len(entries)} entries from "
f"{n_raw} raw -> {len(entries)} after post-processing "
f"(engine={engine_name})")
return entries
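
End-to-end usage sketch, assuming the page has already been analyzed by Steps 3/4 as described in the docstring; variable names are illustrative:

from cv_cell_grid_vocab import build_word_grid

entries = build_word_grid(
    ocr_img, column_regions, row_geometries, img_w, img_h,
    ocr_engine="tesseract", img_bgr=img_bgr, pronunciation="british",
)
for e in entries:
    print(f"{e['english']!r}  ->  {e['german']!r}  (conf {e['confidence']})")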