Restructure: Move grid_* + vocab_* into packages (klausur-service)

grid/ package (16 files):
  grid/build/  — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compat shims. Internal imports are relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 21:30:20 +02:00
parent 098a2ff092
commit 59c400b9aa
58 changed files with 8803 additions and 8659 deletions
@@ -31,6 +31,7 @@
 # Two indivisible route handlers (~230 LOC each) that cannot be split further
 **/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01
 **/vocab/worksheet/compare_api.py | owner=klausur | reason=Same file moved to vocab/ package | review=2026-10-01
 # TypeScript Data Catalogs (admin-lehrer/lib/sdk/)
 # Pure exported const arrays/objects with type definitions, no business logic.
@@ -0,0 +1,10 @@
 """
 Grid package — restructured from grid_* flat modules.
 Backward-compatible re-exports: consumers can still use
 ``from grid_build_core import ...`` etc. via the shim files in backend/.
 Sub-packages:
  - grid.build   — grid construction pipeline (_build_grid_core and phases)
  - grid.editor  — FastAPI endpoints, helper functions, column/zone logic
 """
@@ -0,0 +1,11 @@
 """
 Grid Build sub-package — grid construction pipeline.
 Modules:
  - core      — _build_grid_core() main entry point
  - zones     — image loading, graphic/box detection, zone-aware grid building
  - cleanup   — junk row removal, artifact cleanup, pipe dividers
  - text_ops  — color annotation, heading detection, IPA correction
  - cell_ops  — bullet removal, garbled cells, word-box reordering
  - finalize  — dictionary detection, spell checking, result assembly
 """
@@ -0,0 +1,305 @@
 """
 Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
 garbled cell cleanup, word-box reordering, and max_columns enforcement.
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Tuple
 from cv_ocr_engines import (
    _words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
 )
 logger = logging.getLogger(__name__)
 def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
     """Remove bullet/artifact word_boxes from every cell, in place (Step 5i).

     For each cell that has at least two word_boxes the function applies:
       (a)  drop tiny, low-confidence, non-black boxes;
       (a2) drop isolated symbol boxes listed in ``_REMOVE_SYMBOLS``;
       (b)  resolve horizontally overlapping neighbours, either by merging
            them (syllable-split words) or by dropping one of the two;
       (c)  drop one of two near-adjacent blue boxes with identical text.

     The cell text is rebuilt from the surviving boxes unless the cell is
     flagged ``_ipa_corrected``. When the net number of non-merge removals
     is non-zero, cells left with neither word_boxes nor text are pruned
     from every zone and the count is logged.

     Args:
         zones_data: Zone dicts; their ``cells`` lists and the cells'
             ``word_boxes`` / ``text`` entries are modified in place.
     """
     # A "word": Latin letters (incl. accented range) or hyphens, optionally
     # followed by trailing punctuation.
     _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
     _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
     # Net count of removed boxes; merges are subtracted again further down.
     bullet_removed = 0
     for z in zones_data:
         for cell in z.get("cells", []):
             wbs = cell.get("word_boxes") or []
             # Cells with fewer than two boxes are never touched.
             if len(wbs) < 2:
                 continue
             # Indices into wbs that will be dropped at the end.
             to_remove: set = set()
             # Rule (a): tiny coloured symbols
             # (non-black, area below 200, confidence below 85)
             for i, wb in enumerate(wbs):
                 cn = wb.get("color_name", "black")
                 if (cn != "black"
                         and wb.get("width", 0) * wb.get("height", 0) < 200
                         and wb.get("conf", 100) < 85):
                     to_remove.add(i)
             # Rule (a2): isolated non-alphanumeric symbols
             # (at most 2 chars, no letter/digit, and listed in _REMOVE_SYMBOLS)
             for i, wb in enumerate(wbs):
                 t = (wb.get("text") or "").strip()
                 if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                     if t in _REMOVE_SYMBOLS:
                         to_remove.add(i)
             # Rule (b) + (c): overlap and duplicate detection
             # Only neighbours in left-to-right order are compared; the
             # original indices are kept alongside the sorted boxes.
             to_merge: List[Tuple[int, int]] = []
             indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
             for p in range(len(indexed) - 1):
                 i1, w1 = indexed[p]
                 i2, w2 = indexed[p + 1]
                 x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                 x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                 overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                 min_w = min(w1.get("width", 1), w2.get("width", 1))
                 gap = x2s - x1e
                 # Horizontal overlap as a fraction of the narrower box.
                 overlap_pct = overlap / min_w if min_w > 0 else 0
                 if overlap_pct > 0.20:
                     t1 = (w1.get("text") or "").strip()
                     t2 = (w2.get("text") or "").strip()
                     # Syllable-split words
                     # (moderate overlap, both alphabetic) -> merge later.
                     if (overlap_pct <= 0.75
                             and _ALPHA_WORD_RE.match(t1)
                             and _ALPHA_WORD_RE.match(t2)):
                         to_merge.append((i1, i2))
                         continue
                     # High overlap with short prefix
                     # (texts differ ignoring case/trailing punctuation and the
                     # shorter one has at most 4 chars) -> merge as well.
                     if (overlap_pct > 0.75
                             and _ALPHA_WORD_RE.match(t1)
                             and _ALPHA_WORD_RE.match(t2)
                             and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                             and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                         to_merge.append((i1, i2))
                         continue
                     # Overlap of at most 40% is tolerated: keep both boxes.
                     if overlap_pct <= 0.40:
                         continue
                     c1 = w1.get("conf", 50)
                     c2 = w2.get("conf", 50)
                     # Very high overlap: prefer IPA-dictionary word
                     # (only purely alphabetic texts are looked up).
                     if overlap_pct > 0.90 and t1.lower() != t2.lower():
                         in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                         in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                         if in_dict_1 and not in_dict_2:
                             to_remove.add(i2)
                             continue
                         elif in_dict_2 and not in_dict_1:
                             to_remove.add(i1)
                             continue
                     # Otherwise drop the lower-confidence box; on a confidence
                     # tie drop the taller one (the right box if heights match).
                     if c1 < c2:
                         to_remove.add(i1)
                     elif c2 < c1:
                         to_remove.add(i2)
                     else:
                         if w1.get("height", 0) > w2.get("height", 0):
                             to_remove.add(i1)
                         else:
                             to_remove.add(i2)
                 # Rule (c): two blue boxes less than 6 units apart with the
                 # same text -> drop the lower-confidence one (left on a tie).
                 elif (gap < 6
                       and w1.get("color_name") == "blue"
                       and w2.get("color_name") == "blue"
                       and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                     c1 = w1.get("conf", 50)
                     c2 = w2.get("conf", 50)
                     to_remove.add(i1 if c1 <= c2 else i2)
             # Execute merges first (syllable-split words)
             if to_merge:
                 # Maps an absorbed box index to the index it was merged into,
                 # so chained merges (a+b, b+c) all land in the first box.
                 merge_parent: Dict[int, int] = {}
                 for mi1, mi2 in to_merge:
                     actual_mi1 = mi1
                     while actual_mi1 in merge_parent:
                         actual_mi1 = merge_parent[actual_mi1]
                     # Never merge with a box already scheduled for removal.
                     if actual_mi1 in to_remove or mi2 in to_remove:
                         continue
                     if mi2 in merge_parent:
                         continue
                     mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                     # Left part loses its trailing punctuation before joining.
                     mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                     mt2 = (mw2.get("text") or "").strip()
                     merged_text = mt1 + mt2
                     # The merged box is the bounding box of both parts.
                     mx = min(mw1["left"], mw2["left"])
                     my = min(mw1["top"], mw2["top"])
                     mr = max(mw1["left"] + mw1["width"],
                              mw2["left"] + mw2["width"])
                     mb = max(mw1["top"] + mw1["height"],
                              mw2["top"] + mw2["height"])
                     mw1["text"] = merged_text
                     mw1["left"] = mx
                     mw1["top"] = my
                     mw1["width"] = mr - mx
                     mw1["height"] = mb - my
                     mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                     to_remove.add(mi2)
                     merge_parent[mi2] = actual_mi1
                     # The absorbed box is in to_remove but is not a bullet:
                     # cancel it out of the removal counter.
                     bullet_removed -= 1
             if to_remove:
                 bullet_removed += len(to_remove)
                 filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                 cell["word_boxes"] = filtered
                 # Cells flagged _ipa_corrected keep their existing text.
                 if not cell.get("_ipa_corrected"):
                     cell["text"] = _words_to_reading_order_text(filtered)
     if bullet_removed:
         # Prune cells that ended up with neither word_boxes nor text.
         for z in zones_data:
             z["cells"] = [c for c in z.get("cells", [])
                           if (c.get("word_boxes") or c.get("text", "").strip())]
         logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
 def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
     """Drop cells whose text is nothing but OCR noise (Step 5j-pre).

     A cell counts as garbled when its text, after trimming whitespace and
     trailing punctuation, is empty, consists only of non-word characters /
     digits (a few connector symbols are exempt), is a very short token that
     is neither alphabetic nor a known short word, is an all-caps token of
     up to three characters, or is a short letter/digit mix that is not a
     page reference. When at least one cell was dropped, rows that no longer
     have any cell are removed from every zone as well.

     Args:
         zones_data: Zone dicts; ``cells`` (and possibly ``rows``) are
             replaced in place.
     """
     known_short_words = {
         # German
         "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
         "ob", "so", "um", "zu", "wo", "je", "oh", "or",
         "die", "der", "das", "dem", "den", "des", "ein", "und",
         "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
         # English
         "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
         "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
         "on", "or", "so", "to", "up", "us", "we",
         "the", "and", "but", "for", "not",
     }
     junk_only_re = re.compile(r'^[\W\d\s]+$')
     page_ref_re = re.compile(r'^[pPsS]\.?\d+$')
     exempt_symbols = ('=', '(=', '=)', ';', ':', '-', '–', '—')

     def _is_garbled(cell: Dict[str, Any]) -> bool:
         token = (cell.get("text") or "").strip().rstrip(".,;:!?'\"")
         if not token:
             return True
         if junk_only_re.match(token):
             # Pure symbol/digit text: only the exempt connectors survive.
             return token.strip() not in exempt_symbols
         lowered = token.lower()
         if len(token) <= 2 and lowered not in known_short_words and not token.isalpha():
             return True
         if len(token) <= 3 and token.isupper() and lowered not in known_short_words:
             return True
         if len(token) > 5:
             return False
         has_digit = re.search(r'\d', token) is not None
         has_letter = re.search(r'[A-Za-z]', token) is not None
         return has_digit and has_letter and page_ref_re.match(token) is None

     dropped_total = 0
     for zone in zones_data:
         original_cells = zone.get("cells", [])
         survivors = [cell for cell in original_cells if not _is_garbled(cell)]
         dropped_total += len(original_cells) - len(survivors)
         zone["cells"] = survivors

     if not dropped_total:
         return

     # Keep only rows that still own at least one cell.
     for zone in zones_data:
         live_rows = {cell.get("row_index") for cell in zone.get("cells", [])}
         zone["rows"] = [row for row in zone.get("rows", []) if row["index"] in live_rows]
     logger.info("Step 5j-pre: removed %d artifact cells", dropped_total)
 def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
     """Put each cell's word_boxes into reading order (Step 5j).

     Cells with fewer than two word_boxes are skipped. For the others the
     boxes are grouped into lines by ``_group_words_into_lines`` (15 px
     vertical tolerance) and flattened line by line; the cell is only
     rewritten when the resulting sequence differs from the current one.

     Args:
         zones_data: Zone dicts; ``word_boxes`` of affected cells are
             replaced in place.
     """
     changed_cells = 0
     for zone in zones_data:
         for cell in zone.get("cells", []):
             boxes = cell.get("word_boxes") or []
             if len(boxes) < 2:
                 continue
             reading_order: List[Any] = []
             for line in _group_words_into_lines(boxes, y_tolerance_px=15):
                 reading_order.extend(line)
             # Compare by object identity, not by value.
             unchanged = len(reading_order) == len(boxes) and all(
                 new is old for new, old in zip(reading_order, boxes)
             )
             if unchanged:
                 continue
             cell["word_boxes"] = reading_order
             changed_cells += 1
     if changed_cells:
         logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", changed_cells)
 def _enforce_max_columns(
     zones_data: List[Dict[str, Any]],
     max_columns: int,
 ) -> None:
     """Enforce max_columns by merging narrowest columns (Step 5k).

     Only zones with ``zone_type == "content"`` are processed. While a zone
     has more than ``max_columns`` columns, the narrowest column (by pixel
     width) is merged into its right neighbour, or into its left neighbour
     when it is the right-most column. The target column's bounds are
     widened, cells of the narrow column are moved to the target column and
     combined with an existing cell of the same row if there is one.
     Finally columns are re-indexed 0..N-1 in left-to-right order.

     Compared to a naive implementation this function
       * joins text and word_boxes of combined cells in reading order
         (the left column's content comes first), and
       * re-indexes cells through an old->new mapping, so a cell can never
         be remapped twice when the old indices are not in x order.

     Args:
         zones_data: Zone dicts; ``columns`` and ``cells`` are modified in
             place.
         max_columns: Maximum number of columns allowed per content zone.
     """

     def _left_edge(col: Dict[str, Any]) -> Any:
         return col.get("x_min_px", col.get("x_min_pct", 0))

     def _right_edge(col: Dict[str, Any]) -> Any:
         return col.get("x_max_px", col.get("x_max_pct", 100))

     for z in zones_data:
         if z.get("zone_type") != "content":
             continue
         cols = z.get("columns", [])
         cells = z.get("cells", [])
         if len(cols) <= max_columns:
             continue
         logger.info(
             "max_columns=%d: zone %s has %d columns -> merging",
             max_columns, z.get("zone_index"), len(cols),
         )
         cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
         while len(cols) > max_columns:
             narrowest = cols_by_width.pop(0)
             ni = narrowest["index"]
             sorted_by_x = sorted(cols, key=_left_edge)
             pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
             if pos + 1 < len(sorted_by_x):
                 # Prefer the right neighbour; the narrow column is then the
                 # left part of the merged column.
                 merge_target = sorted_by_x[pos + 1]
                 narrowest_is_left = True
             elif pos > 0:
                 merge_target = sorted_by_x[pos - 1]
                 narrowest_is_left = False
             else:
                 # Single column left: nothing to merge into.
                 break
             ti = merge_target["index"]

             # Widen the target column so it covers both columns.
             merge_target["x_min_px"] = min(_left_edge(merge_target), _left_edge(narrowest))
             merge_target["x_max_px"] = max(_right_edge(merge_target), _right_edge(narrowest))
             if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                 merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                 merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])

             for cell in cells:
                 if cell.get("col_index") != ni:
                     continue
                 cell["col_index"] = ti
                 existing = next(
                     (c for c in cells
                      if c is not cell
                      and not c.get("_merged")
                      and c.get("col_index") == ti
                      and c.get("row_index") == cell.get("row_index")),
                     None,
                 )
                 if existing is None:
                     # No cell in the target column for this row: the moved
                     # cell simply takes that place.
                     continue
                 # Keep reading order: content of the left column first.
                 first, second = (cell, existing) if narrowest_is_left else (existing, cell)
                 existing["text"] = (
                     (first.get("text") or "") + " " + (second.get("text") or "")
                 ).strip()
                 existing["word_boxes"] = (
                     (first.get("word_boxes") or []) + (second.get("word_boxes") or [])
                 )
                 cell["_merged"] = True

             z["cells"] = [c for c in cells if not c.get("_merged")]
             cells = z["cells"]
             # Remove by identity and in place, so z["columns"] stays the
             # same list object.
             cols[:] = [c for c in cols if c is not narrowest]

         # Re-index columns 0..N-1 (left to right). The mapping is built
         # before any index is changed so every cell is remapped exactly once.
         ordered_cols = sorted(cols, key=_left_edge)
         remap = {col["index"]: new_idx for new_idx, col in enumerate(ordered_cols)}
         for new_idx, col in enumerate(ordered_cols):
             col["index"] = new_idx
         for cell in cells:
             old_idx = cell.get("col_index")
             if old_idx in remap:
                 cell["col_index"] = remap[old_idx]
         logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
@@ -0,0 +1,390 @@
 """
 Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
 divider removal, connector normalization, border strip detection, and
 alphabet sidebar removal.
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List
 from cv_ocr_engines import _words_to_reading_order_text
 logger = logging.getLogger(__name__)
 _PIPE_RE = re.compile(r"^\|+$")
 def _cleanup_zones(
     zones_data: List[Dict[str, Any]],
     border_prefiltered: bool,
     session_id: str,
 ) -> bool:
     """Run all Phase-3 cleanup passes over the zones, in a fixed order.

     Order: junk rows, artifact cells, oversized word_boxes, pipe dividers,
     connector columns, border strips, alphabet sidebars.

     Args:
         zones_data: List of zone dicts (modified in place).
         border_prefiltered: Whether border words were already pre-filtered;
             passed on to the border-strip pass.
         session_id: Accepted for the caller's convenience; not used in
             this function's body.

     Returns:
         The flag returned by the border-strip pass.
     """
     in_place_passes = (
         _remove_junk_rows,
         _remove_artifact_cells,
         _remove_oversized_word_boxes,
         _remove_pipe_dividers,
         _normalize_connector_columns,
     )
     for cleanup_pass in in_place_passes:
         cleanup_pass(zones_data)

     flag_after_border_pass = _remove_border_strips(zones_data, border_prefiltered)
     _remove_alphabet_sidebars(zones_data)
     return flag_after_border_pass
 def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
     """Remove rows where ALL cells contain only short, low-confidence text.

     Also removes 'oversized stub' rows and 'scattered debris' rows. A row
     is junk when one of these rules matches its word_boxes:

       1. every box has confidence below 50 and at most 3 characters;
       2. the row has at most 3 boxes, at most 5 characters in total, a box
          taller than 1.8x the zone's median box height, and no box looks
          like a page reference (``p.12``, ``S. 3`` ...);
       3. the longest box text has at most 2 characters.

     Rows whose cells carry no word_boxes at all are left untouched: there
     is nothing to judge them by (previously this case raised ValueError).

     Args:
         zones_data: Zone dicts; ``cells`` and ``rows`` are replaced in
             place for zones that contain junk rows.
     """
     _JUNK_CONF_THRESHOLD = 50
     _JUNK_MAX_TEXT_LEN = 3
     page_ref_re = re.compile(r'^[pPsS]\.?\s*\d+$')

     for z in zones_data:
         cells = z.get("cells", [])
         rows = z.get("rows", [])
         if not cells or not rows:
             continue

         # Compute median word height across the zone for oversized detection
         all_wb_heights = sorted(
             wb["height"]
             for cell in cells
             for wb in cell.get("word_boxes") or []
             if wb.get("height", 0) > 0
         )
         median_wb_h = all_wb_heights[len(all_wb_heights) // 2] if all_wb_heights else 28

         # Group cells by row once instead of rescanning all cells per row.
         cells_by_row: Dict[Any, List[Dict[str, Any]]] = {}
         for cell in cells:
             cells_by_row.setdefault(cell.get("row_index"), []).append(cell)

         junk_row_indices = set()
         for row in rows:
             ri = row["index"]
             row_wbs = [
                 wb for cell in cells_by_row.get(ri, ())
                 for wb in cell.get("word_boxes") or []
             ]
             if not row_wbs:
                 # No cells, or cells without word_boxes: cannot be judged.
                 continue
             texts = [(wb.get("text") or "").strip() for wb in row_wbs]

             # Rule 1: ALL word_boxes are low-conf AND short text
             if all(
                 wb.get("conf", 0) < _JUNK_CONF_THRESHOLD and len(text) <= _JUNK_MAX_TEXT_LEN
                 for wb, text in zip(row_wbs, texts)
             ):
                 junk_row_indices.add(ri)
                 continue

             # Rule 2: oversized stub -- <=3 words, short total text,
             # and word height > 1.8x median
             if len(row_wbs) <= 3:
                 total_len = sum(len(text) for text in texts)
                 max_h = max(wb.get("height", 0) for wb in row_wbs)
                 has_page_ref = any(page_ref_re.match(text) for text in texts)
                 if total_len <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
                     junk_row_indices.add(ri)
                     continue

             # Rule 3: scattered debris -- rows with only tiny fragments
             if max(len(text) for text in texts) <= 2:
                 junk_row_indices.add(ri)

         if junk_row_indices:
             z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
             z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
             logger.info(
                 "build-grid: removed %d junk rows from zone %d: %s",
                 len(junk_row_indices), z.get("zone_index", 0),
                 sorted(junk_row_indices),
             )
 def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
     """Remove individual cells with a single very-short, low-conf word.

     A cell is an artifact when it has exactly one word_box whose text has
     at most 2 characters and whose confidence is below 65 (a missing
     confidence counts as 100).

     Artifact cells are tracked by object identity rather than by
     ``cell_id``, so cells with a missing or duplicated ``cell_id`` are
     never removed just because they share that id with an artifact.

     Args:
         zones_data: Zone dicts; ``cells`` is replaced in place for zones
             that contain artifact cells.
     """
     _ARTIFACT_MAX_LEN = 2
     _ARTIFACT_CONF_THRESHOLD = 65

     def _is_artifact(cell: Dict[str, Any]) -> bool:
         wbs = cell.get("word_boxes") or []
         if len(wbs) != 1:
             return False
         text = (wbs[0].get("text") or "").strip()
         conf = wbs[0].get("conf", 100)
         return len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD

     for z in zones_data:
         cells = z.get("cells", [])
         if not cells:
             continue
         artifacts = [cell for cell in cells if _is_artifact(cell)]
         if not artifacts:
             continue
         artifact_obj_ids = {id(cell) for cell in artifacts}
         z["cells"] = [c for c in cells if id(c) not in artifact_obj_ids]
         logger.info(
             "build-grid: removed %d artifact cells from zone %d: %s",
             len(artifacts), z.get("zone_index", 0),
             [c.get("text") for c in artifacts],
         )
 def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
     """Drop word_boxes taller than three times the zone's median box height.

     The median is taken over all word_boxes of the zone with a positive
     height. Cells that lose boxes get their text rebuilt from the remaining
     ones; if anything was dropped in a zone, cells without word_boxes are
     removed from that zone.

     Args:
         zones_data: Zone dicts; cells and their ``word_boxes`` / ``text``
             are modified in place.
     """
     for zone in zones_data:
         zone_cells = zone.get("cells", [])
         if not zone_cells:
             continue

         heights = sorted(
             wb["height"]
             for cell in zone_cells
             for wb in cell.get("word_boxes") or []
             if wb.get("height", 0) > 0
         )
         if not heights:
             continue
         oversized_threshold = heights[len(heights) // 2] * 3

         dropped = 0
         for cell in zone_cells:
             boxes = cell.get("word_boxes") or []
             survivors = [wb for wb in boxes if wb.get("height", 0) <= oversized_threshold]
             lost = len(boxes) - len(survivors)
             if lost <= 0:
                 continue
             dropped += lost
             cell["word_boxes"] = survivors
             cell["text"] = _words_to_reading_order_text(survivors)

         if not dropped:
             continue
         zone["cells"] = [c for c in zone_cells if c.get("word_boxes")]
         logger.info(
             "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
             dropped, oversized_threshold, zone.get("zone_index", 0),
         )
 def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
     """Strip pipe characters that are column-divider artifacts.

     Pass 1 removes word_boxes whose text consists only of pipes (zones that
     carry a ``vsplit_group`` are skipped) and prunes cells that end up with
     neither word_boxes nor text. Pass 2 runs over all zones and trims pipes
     from the edges of cell text; pipes inside a word are kept.

     Args:
         zones_data: Zone dicts; cells and their ``word_boxes`` / ``text``
             are modified in place.
     """
     # Pass 1: pure-pipe word_boxes.
     for zone in zones_data:
         if zone.get("vsplit_group") is not None:
             # pipes already removed before split
             continue
         pipe_boxes = 0
         for cell in zone.get("cells", []):
             boxes = cell.get("word_boxes") or []
             keep = [
                 wb for wb in boxes
                 if _PIPE_RE.match((wb.get("text") or "").strip()) is None
             ]
             lost = len(boxes) - len(keep)
             if lost <= 0:
                 continue
             pipe_boxes += lost
             cell["word_boxes"] = keep
             cell["text"] = _words_to_reading_order_text(keep)
         if not pipe_boxes:
             continue
         zone["cells"] = [
             c for c in zone.get("cells", [])
             if (c.get("word_boxes") or c.get("text", "").strip())
         ]
         logger.info(
             "build-grid: removed %d pipe-divider word_boxes from zone %d",
             pipe_boxes, zone.get("zone_index", 0),
         )

     # Pass 2: pipes at the edges of cell text are OCR artifacts; pipes
     # embedded in words act as syllable separators and must survive.
     for zone in zones_data:
         for cell in zone.get("cells", []):
             raw = cell.get("text", "")
             if "|" not in raw:
                 continue
             trimmed = raw.strip("|").strip()
             if trimmed != raw.strip():
                 cell["text"] = trimmed
 def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
     """Repair connector columns in which OCR appended noise to a repeated word.

     A column qualifies when it has at least three cells and one text of at
     most 10 characters accounts for at least 60% of them (e.g. "oder" in
     every row of a synonym dictionary). Cells whose text starts with that
     dominant text and is at most two characters longer are reset to it; a
     cell's single word_box, if it has exactly one, is updated too.

     Args:
         zones_data: Zone dicts; cell ``text`` (and word_box ``text``) is
             modified in place.
     """
     for zone in zones_data:
         columns = zone.get("columns", [])
         all_cells = zone.get("cells", [])
         if not (columns and all_cells):
             continue

         for column in columns:
             col_idx = column.get("index")
             members = [c for c in all_cells if c.get("col_index") == col_idx]
             if len(members) < 3:
                 continue

             # Frequency of each non-empty text in this column.
             frequency: Dict[str, int] = {}
             for member in members:
                 stripped = (member.get("text") or "").strip()
                 if stripped:
                     frequency[stripped] = frequency.get(stripped, 0) + 1
             if not frequency:
                 continue

             # First text with the highest count wins ties.
             dominant_text, dominant_count = max(frequency.items(), key=lambda kv: kv[1])
             is_connector = (
                 len(dominant_text) <= 10
                 and dominant_count >= len(members) * 0.6
             )
             if not is_connector:
                 continue

             max_noisy_len = len(dominant_text) + 2
             repaired = 0
             for member in members:
                 stripped = (member.get("text") or "").strip()
                 if stripped == dominant_text:
                     continue
                 if not stripped.startswith(dominant_text) or len(stripped) > max_noisy_len:
                     continue
                 member["text"] = dominant_text
                 boxes = member.get("word_boxes") or []
                 if len(boxes) == 1:
                     boxes[0]["text"] = dominant_text
                 repaired += 1

             if repaired:
                 logger.info(
                     "build-grid: normalized %d outlier cells in connector column %d "
                     "(dominant='%s') zone %d",
                     repaired, col_idx, dominant_text, zone.get("zone_index", 0),
                 )
 def _remove_border_strips(
     zones_data: List[Dict[str, Any]],
     border_prefiltered: bool,
 ) -> bool:
     """Detect and remove page-border decoration strips (Step 4e).

     For every zone with at least 10 word_boxes, the boxes are sorted by
     their left edge and scanned from both sides for the first horizontal
     gap wider than 30 px. The boxes outside that gap form a candidate
     strip; it is removed only if it holds fewer than 20% of the zone's
     boxes. The left candidate takes precedence over the right one.
     Affected cells get their text rebuilt, and cells left with neither
     word_boxes nor text are dropped.

     Args:
         zones_data: Zone dicts; cells and their ``word_boxes`` / ``text``
             are modified in place.
         border_prefiltered: When true the whole step is skipped.

     Returns:
         ``border_prefiltered`` exactly as passed in; this function never
         changes the flag.
     """
     # NOTE(review): initialised once and never reset per zone, so the
     # per-zone log message below reports a running total across zones.
     border_strip_removed = 0
     if border_prefiltered:
         logger.info("Step 4e: skipped (border pre-filter already applied)")
         return border_prefiltered
     for z in zones_data:
         cells = z.get("cells", [])
         if not cells:
             continue
         # (left, word_box, owning cell) for every box in the zone.
         all_wbs_with_cell: list = []
         for cell in cells:
             for wb in cell.get("word_boxes") or []:
                 all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
         # Too few boxes to tell a strip from content.
         if len(all_wbs_with_cell) < 10:
             continue
         all_wbs_with_cell.sort(key=lambda t: t[0])
         total = len(all_wbs_with_cell)
         # -- Left-edge scan --
         # Walk left to right, tracking the right-most edge seen so far;
         # stop at the first gap > 30 px to the next box.
         left_strip_count = 0
         left_gap = 0
         running_right = 0
         for gi in range(total - 1):
             running_right = max(
                 running_right,
                 all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
             )
             gap = all_wbs_with_cell[gi + 1][0] - running_right
             if gap > 30:
                 left_strip_count = gi + 1
                 left_gap = gap
                 break
         # -- Right-edge scan --
         # Walk right to left, comparing the left-most edge seen so far
         # with the right edge of the preceding box (in left-sorted order).
         right_strip_count = 0
         right_gap = 0
         running_left = all_wbs_with_cell[-1][0]
         for gi in range(total - 1, 0, -1):
             running_left = min(running_left, all_wbs_with_cell[gi][0])
             prev_right = (
                 all_wbs_with_cell[gi - 1][0]
                 + all_wbs_with_cell[gi - 1][1].get("width", 0)
             )
             gap = running_left - prev_right
             if gap > 30:
                 right_strip_count = total - gi
                 right_gap = gap
                 break
         # Object ids of the boxes that make up the strip to remove.
         strip_wbs: set = set()
         strip_side = ""
         strip_gap = 0
         strip_count = 0
         # A strip must hold less than 20% of the boxes; at most one side
         # is removed per zone, left first.
         if left_strip_count > 0 and left_strip_count / total < 0.20:
             strip_side = "left"
             strip_count = left_strip_count
             strip_gap = left_gap
             strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
         elif right_strip_count > 0 and right_strip_count / total < 0.20:
             strip_side = "right"
             strip_count = right_strip_count
             strip_gap = right_gap
             strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
         if not strip_wbs:
             continue
         for cell in cells:
             wbs = cell.get("word_boxes") or []
             filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
             if len(filtered) < len(wbs):
                 border_strip_removed += len(wbs) - len(filtered)
                 cell["word_boxes"] = filtered
                 cell["text"] = _words_to_reading_order_text(filtered)
         # Drop cells that have neither word_boxes nor text left.
         z["cells"] = [c for c in cells
                       if (c.get("word_boxes") or c.get("text", "").strip())]
         logger.info(
             "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
             "(gap=%dpx, strip=%d/%d wbs)",
             border_strip_removed, strip_side, z.get("zone_index", 0),
             strip_gap, strip_count, total,
         )
     return border_prefiltered
 def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
     """Remove decorative edge columns (alphabet sidebar safety net).

     Dictionary pages have A-Z letter sidebars that OCR reads as single-
     character word_boxes. For zones with at least three columns, the first
     and last ``column_N`` column types are inspected; an edge column with
     at least three cells is removed when its average text length is at
     most 1.5 and at least 70% of its texts have at most one character.
     At most one edge column is removed per zone.

     Column types are ordered by their numeric suffix, so ``column_10``
     sorts after ``column_9`` (a plain string sort would pick the wrong
     right edge for zones with ten or more columns).

     Args:
         zones_data: Zone dicts; ``cells`` and ``columns`` are replaced in
             place for zones where an edge column is removed.
     """

     def _col_type_rank(col_type: str) -> Tuple[int, int, str]:
         # Numeric suffixes sort numerically; anything else sorts after
         # them, by name.
         suffix = col_type[len("column_"):]
         if suffix.isascii() and suffix.isdigit():
             return (0, int(suffix), col_type)
         return (1, 0, col_type)

     for z in zones_data:
         columns = z.get("columns", [])
         cells = z.get("cells", [])
         if len(columns) < 3 or not cells:
             continue

         col_cells: Dict[str, List[Dict]] = {}
         for cell in cells:
             # A missing or None col_type is treated like a non-column cell.
             ct = cell.get("col_type") or ""
             if ct.startswith("column_"):
                 col_cells.setdefault(ct, []).append(cell)

         col_types_ordered = sorted(col_cells, key=_col_type_rank)
         if len(col_types_ordered) < 3:
             continue

         for edge_ct in (col_types_ordered[0], col_types_ordered[-1]):
             edge_cells_list = col_cells[edge_ct]
             if len(edge_cells_list) < 3:
                 continue
             texts = [(c.get("text") or "").strip() for c in edge_cells_list]
             avg_len = sum(len(t) for t in texts) / len(texts)
             single_ratio = sum(1 for t in texts if len(t) <= 1) / len(texts)
             if avg_len > 1.5 or single_ratio < 0.7:
                 continue

             edge_ids = {id(c) for c in edge_cells_list}
             z["cells"] = [c for c in cells if id(c) not in edge_ids]
             z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
             logger.info(
                 "Step 4f: removed decorative edge column '%s' from zone %d "
                 "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                 edge_ct, z.get("zone_index", 0), len(edge_cells_list),
                 avg_len, single_ratio * 100,
             )
             break  # only remove one edge per zone
@@ -0,0 +1,213 @@
 """
 Grid Build Core — the main _build_grid_core() function.
 Extracted from grid_editor_api.py for maintainability.
 Takes merged OCR word positions and builds a structured, zone-aware grid.
 The function delegates to phase-specific modules:
 - zones.py    — image loading, graphic/box detection, zone grids
 - cleanup.py  — junk rows, artifacts, pipes, border strips
 - text_ops.py — color, headings, IPA, page refs
 - finalize.py — bullets, max_columns, dictionary, spelling, result
 """
 import logging
 import time
 from typing import Any, Dict, List, Optional
 from grid.editor.filters import (
    _flatten_word_boxes,
    _get_content_bounds,
    _filter_decorative_margin,
    _filter_footer_words,
    _filter_header_junk,
 )
 from .zones import _build_zones
 from .cleanup import _cleanup_zones
 from .text_ops import _process_text
 from .finalize import _finalize_grid
 logger = logging.getLogger(__name__)
 async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
 ) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.

    Filters the OCR words of a session, then hands them through the zone,
    cleanup, text and finalize phases and returns the assembled result.

    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db().  Reads the keys
            ``word_result``, ``structure_result`` and ``document_category``.
        ipa_mode: "auto" (only when English headwords detected), "all"
            (force IPA on all content columns), "en" (English column only),
            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words), "en" (English only),
            "de" (German only), or "none" (skip).
        enhance: Only logged by this function; not forwarded to any phase.
        max_columns: Forwarded to the finalize phase as the column cap.
        min_conf: When positive, words whose ``conf`` is below this value
            are dropped; words without a ``conf`` key count as 100.

    Returns:
        StructuredGrid result dict.  ``duration_seconds`` covers the whole
        call, including the finalize phase.

    Raises:
        ValueError: If session data is incomplete.
    """
    # Monotonic clock: the value is only ever used as a duration.
    t0 = time.perf_counter()

    # ── Phase 1: Input Validation & Word Filtering ──────────────────
    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")

    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")

    # 2a-pre. Apply min_conf filter if specified
    if min_conf and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)
    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]),
                enhance, max_columns, min_conf)

    # 2b. Filter decorative margin columns (alphabet graphics)
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)

    # Read document_category from session
    document_category = session.get("document_category")

    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)

    # 2c2. Filter OCR junk from header illustrations
    _filter_header_junk(all_words, img_h, logger, session_id)

    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = []
    if structure_result:
        for er in structure_result.get("exclude_regions", []):
            exclude_rects.append({
                "x": er["x"], "y": er["y"],
                "w": er["w"], "h": er["h"],
            })
    if exclude_rects:
        kept = [w for w in all_words
                if not _word_center_in_any_rect(w, exclude_rects)]
        removed = len(all_words) - len(kept)
        if removed:
            all_words = kept
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )

    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = []
    if structure_result:
        for g in structure_result.get("graphics", []):
            graphic_rects.append({
                "x": g["x"], "y": g["y"],
                "w": g["w"], "h": g["h"],
            })
    if graphic_rects:
        before = len(all_words)
        all_words = [w for w in all_words
                     if not _word_center_in_any_rect(w, graphic_rects)]
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )

    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)

    # ── Phase 2: Image Processing & Zone Detection ──────────────────
    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h,
        img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]

    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
    # The returned flag is not needed by any later phase.
    _cleanup_zones(zones_data, border_prefiltered, session_id)

    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )

    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
    result = _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=time.perf_counter() - t0,
    )
    # The duration handed to _finalize_grid is taken before it runs, so it
    # misses the finalize phase itself; report the full elapsed time.
    result["duration_seconds"] = round(time.perf_counter() - t0, 2)
    return result


 def _word_center_in_any_rect(
    word: Dict[str, Any],
    rects: List[Dict[str, Any]],
 ) -> bool:
    """Return True when the word's centre lies inside any rect (edges inclusive)."""
    cx = word["left"] + word.get("width", 0) / 2
    cy = word["top"] + word.get("height", 0) / 2
    return any(
        r["x"] <= cx <= r["x"] + r["w"] and r["y"] <= cy <= r["y"] + r["h"]
        for r in rects
    )
@@ -0,0 +1,452 @@
 """
 Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
 dictionary detection, syllable dividers, spell checking, empty column
 removal, and result assembly.
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 from .cell_ops import (
    _remove_bullets_and_artifacts,
    _remove_garbled_cells,
    _normalize_word_order,
    _enforce_max_columns,
 )
 logger = logging.getLogger(__name__)
 def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
 ) -> dict:
    """Run the last pipeline steps on the zones and build the result dict.

    Steps, in order: bullet/artifact removal, garbled-cell removal,
    word-box ordering, optional max_columns enforcement, dictionary
    detection, word-gap merge, pipe auto-correction, syllable dividers,
    merged-word splitting, IPA bracket spacing, spell checking, empty
    column removal, internal flag cleanup, result assembly.

    ``zones_data`` is modified in place.  Word-gap merge and pipe
    auto-correction are best-effort: any failure is logged as a warning
    and the pipeline continues.
    """
    from collections import Counter

    # Counted once, before any of the steps below run.
    column_total = sum(len(zone.get("columns", [])) for zone in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)
    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)
    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)
    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dictionary_info = _detect_dictionary(
        zones_data=zones_data,
        img_w=img_w,
        img_h=img_h,
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
    )

    # --- Word-gap merge ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as exc:
        logger.warning("Word-gap merge failed: %s", exc)

    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as exc:
        logger.warning("Pipe autocorrect failed: %s", exc)

    # --- Syllable divider insertion ---
    inserted_dividers = _insert_syllable_dividers(
        zones_data=zones_data,
        img_bgr=img_bgr,
        session_id=session_id,
        syllable_mode=syllable_mode,
        dict_detection=dictionary_info,
        en_col_type=en_col_type,
        all_content_cols=all_content_cols,
        total_cols=column_total,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)
    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)
    # --- SmartSpellChecker ---
    _run_spell_checker(
        zones_data=zones_data,
        session_id=session_id,
        en_col_type=en_col_type,
        total_cols=column_total,
    )

    # --- Debug log cell counts per column (content zones only) ---
    for zone in zones_data:
        if zone.get("zone_type") != "content":
            continue
        per_column = Counter(cell.get("col_index") for cell in zone.get("cells", []))
        logger.info(
            "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
            zone.get("zone_index", 0),
            len(zone.get("columns", [])),
            dict(sorted(per_column.items())),
        )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Internal bookkeeping flag must not leak into the returned cells.
    for zone in zones_data:
        for cell in zone.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data=zones_data,
        all_words=all_words,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        ipa_target_cols=ipa_target_cols,
        skip_ipa=skip_ipa,
        dict_detection=dictionary_info,
        page_number_info=page_number_info,
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
        syllable_insertions=inserted_dividers,
    )
 def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
 ) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid.

    Every zone with at least 2 columns and 10 cells is converted into
    per-column pseudo geometries and scored; the highest-confidence
    result across all zones is returned.

    The step is best-effort: if the scoring modules cannot be imported or
    scoring raises, a warning is logged and the best result found so far
    (initially "not a dictionary", confidence 0.0) is returned.

    Args:
        zones_data: Zone dicts with ``cells`` and ``columns``.
        img_w: Image width, used for each column's width ratio.
        img_h: Image height, used as the height of every pseudo column.
        document_category: Passed through to the scorer.
        margin_strip_detected: Passed through to the scorer.

    Returns:
        Detection dict; always has ``is_dictionary`` and ``confidence``.
    """
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        # Both imports live inside the try so a missing module degrades to
        # "no detection" instead of aborting the whole grid build.
        from cv_layout import _score_dictionary_signals
        from cv_vocab_types import ColumnGeometry

        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue

            # One pass to bucket cells per column (keeps cell order).
            cells_by_col: Dict[Any, List[Dict[str, Any]]] = {}
            for cell in zone_cells:
                cells_by_col.setdefault(cell.get("col_index"), []).append(cell)

            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_words = []
                for cell in cells_by_col.get(ci, []):
                    word_boxes = cell.get("word_boxes") or []
                    for wb in word_boxes:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    # Cells without word boxes still contribute their text,
                    # positioned by the cell bounding box.
                    if not word_boxes and cell.get("text"):
                        bbox = cell.get("bbox_px") or {}
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": bbox.get("y", 0),
                            "left": bbox.get("x", 0),
                            "height": bbox.get("h", 20),
                            "width": bbox.get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))

            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
 def _insert_syllable_dividers(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    session_id: str,
    syllable_mode: str,
    dict_detection: Dict[str, Any],
    en_col_type: Optional[str],
    all_content_cols: set,
    total_cols: int,
 ) -> int:
    """Insert syllable dividers for dictionary pages. Returns insertion count.

    Behaviour per ``syllable_mode``:
      - "none": strip every "|" from cell texts and return 0.
      - "all" / "en" / "de": always run (forced) insertion.
      - anything else: run only when ``dict_detection`` reports a
        dictionary with a non-None ``article_col_index``.

    Nothing is inserted when ``img_bgr`` is None.  "en" limits insertion
    to ``en_col_type`` (empty filter when unknown); "de" limits it to all
    other content columns, but only when ``en_col_type`` is known and
    ``total_cols >= 3`` — otherwise no column filter is applied.

    Insertion is best-effort: failures are logged and 0 is returned.
    """
    if syllable_mode == "none":
        for z in zones_data:
            for cell in z.get("cells", []):
                # Text may be None; treat it like an empty string.
                text = cell.get("text") or ""
                if "|" in text:
                    cell["text"] = text.replace("|", "")
        return 0
    if img_bgr is None:
        return 0

    explicit_mode = syllable_mode in ("all", "de", "en")
    eligible = explicit_mode or (
        dict_detection.get("is_dictionary")
        and dict_detection.get("article_col_index") is not None
    )
    if not eligible:
        return 0

    col_filter: Optional[set] = None
    if syllable_mode == "en":
        col_filter = {en_col_type} if en_col_type else set()
    elif syllable_mode == "de" and en_col_type and total_cols >= 3:
        col_filter = all_content_cols - {en_col_type}

    try:
        from cv_syllable_detect import insert_syllable_dividers
        return insert_syllable_dividers(
            zones_data, img_bgr, session_id,
            force=explicit_mode,
            col_filter=col_filter,
        )
    except Exception as e:
        logger.warning("Syllable insertion failed: %s", e)
    return 0
 def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
 ) -> None:
    """Split run-together words in cell texts via the cv_review splitter.

    Each whitespace-separated token is reduced to its alphabetic core by
    peeling off, in this order, a trailing "[...]" part, trailing
    punctuation, and an apostrophe tail (apostrophe at position 2 or
    later).  Cores of 4+ letters are offered to the splitter; when it
    returns a split, the peeled-off tail is appended again.  Cells are
    updated in place.  Does nothing when cv_review cannot be imported or
    reports that spell support is unavailable.
    """
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            return

        def _resplit(token: str) -> Optional[str]:
            """Return the split form of *token*, or None to keep it as is."""
            core = token
            ipa_tail = ""
            bracket_at = core.find('[')
            if bracket_at > 0:
                core, ipa_tail = core[:bracket_at], core[bracket_at:]
            trimmed = core.rstrip(".,!?;:'\")")
            tail = core[len(trimmed):] + ipa_tail
            core = trimmed
            apostrophe_at = core.find("'")
            if apostrophe_at >= 2:
                tail = core[apostrophe_at:] + tail
                core = core[:apostrophe_at]
            if len(core) < 4 or not core.isalpha():
                return None
            pieces = _try_split_merged_word(core)
            return pieces + tail if pieces else None

        cells_changed = 0
        for zone in zones_data:
            for cell in zone.get("cells", []):
                original_text = cell.get("text", "")
                if not original_text:
                    continue
                rebuilt = []
                touched = False
                for token in original_text.split():
                    replacement = _resplit(token)
                    if replacement is None:
                        rebuilt.append(token)
                    else:
                        rebuilt.append(replacement)
                        touched = True
                if touched:
                    cell["text"] = " ".join(rebuilt)
                    cells_changed += 1
        if cells_changed:
            logger.info("build-grid session %s: split %d merged words", session_id, cells_changed)
    except ImportError:
        pass
 def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
    """Insert a missing space before bracketed phonetics in cell texts.

    'word[ipa]' becomes 'word [ipa]'.  Only brackets that directly follow
    a Latin or German letter and hold at least two characters are touched;
    cells are updated in place.
    """
    glued_bracket = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
    for zone in zones_data:
        for cell in zone.get("cells", []):
            current = cell.get("text", "")
            if not current or "[" not in current:
                continue
            spaced = glued_bracket.sub(r'\1 \2', current)
            if spaced != current:
                cell["text"] = spaced
 def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
 ) -> None:
    """Spell-correct the text of every non-empty ``column_*`` cell in place.

    The language hint is "en"/"de" per column only when ``en_col_type``
    is known and ``total_cols >= 3``; in every other case it is "auto".
    A missing smart_spell module is logged at debug level, any other
    failure as a warning; neither is raised.
    """
    try:
        from smart_spell import SmartSpellChecker
        checker = SmartSpellChecker()
        per_column_language = bool(total_cols >= 3 and en_col_type)
        corrected_cells = 0
        for zone in zones_data:
            for cell in zone.get("cells", []):
                original = cell.get("text", "")
                if not (original and original.strip()):
                    continue
                col_type = cell.get("col_type", "")
                if not col_type.startswith("column_"):
                    continue
                if per_column_language:
                    lang = "en" if col_type == en_col_type else "de"
                else:
                    lang = "auto"
                outcome = checker.correct_text(original, lang=lang)
                if outcome.changed:
                    cell["text"] = outcome.corrected
                    corrected_cells += 1
        if corrected_cells:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, corrected_cells,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as exc:
        logger.warning("SmartSpellChecker error in build-grid: %s", exc)
 def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Remove columns that have no cells assigned and re-index the rest.

    For each zone where at least one column is unused, the surviving
    columns are renumbered from 0 (``col_index``, ``index``, ``label``)
    and every cell's ``col_index`` is mapped to the new numbering.

    Cell ``col_type`` is rewritten only when it is missing or already a
    ``column_*`` type.  Special types such as "heading" are preserved so
    that dropping an empty column does not silently turn headings back
    into ordinary content cells.

    Args:
        zones_data: Zone dicts, modified in place.
    """
    for z in zones_data:
        cells = z.get("cells", [])
        used_col_indices = {c.get("col_index") for c in cells}
        old_cols = z.get("columns", [])
        new_cols = [
            col for col in old_cols
            if col.get("col_index", col.get("index", -1)) in used_col_indices
        ]
        if len(new_cols) == len(old_cols):
            continue  # nothing to drop in this zone

        # A single remaining column is labelled "column_text".
        multi_column = len(new_cols) > 1

        old_to_new = {}
        for new_i, col in enumerate(new_cols):
            old_i = col.get("col_index", col.get("index", new_i))
            old_to_new[old_i] = new_i
            col["col_index"] = new_i
            col["index"] = new_i
            col["label"] = f"column_{new_i + 1}" if multi_column else "column_text"

        for cell in cells:
            old_ci = cell.get("col_index", 0)
            new_ci = old_to_new.get(old_ci, old_ci)
            cell["col_index"] = new_ci
            current_type = cell.get("col_type") or ""
            if not current_type or current_type.startswith("column_"):
                cell["col_type"] = f"column_{new_ci + 1}" if multi_column else "column_text"

        z["columns"] = new_cols
 def _assemble_result(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    session_id: str,
    ipa_mode: str,
    syllable_mode: str,
    ipa_target_cols: set,
    skip_ipa: bool,
    dict_detection: Dict[str, Any],
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
    syllable_insertions: int,
 ) -> dict:
    """Build the final result dict (Phase 6).

    Aggregates per-zone totals, counts word boxes per ``color_name``
    (missing name counts as "black"), and derives layout metrics from the
    heights of all non-header rows (average falls back to 30.0 when no row
    has a positive height; the font suggestion is 60% of the average,
    never below 10).

    Returns:
        Dict with the keys ``session_id``, ``image_width``,
        ``image_height``, ``zones``, ``boxes_detected``, ``summary``,
        ``formatting``, ``layout_metrics``, ``dictionary_detection``,
        ``processing_modes``, ``page_number`` and ``duration_seconds``.
    """
    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
    total_rows = sum(len(z.get("rows", [])) for z in zones_data)

    # Collect color statistics
    color_stats: Dict[str, int] = {}
    for z in zones_data:
        for cell in z.get("cells", []):
            # "word_boxes" may be present but None; treat that as empty.
            for wb in cell.get("word_boxes") or []:
                cn = wb.get("color_name", "black")
                color_stats[cn] = color_stats.get(cn, 0) + 1

    # Compute layout metrics from content (non-header) rows only
    content_row_heights: List[float] = []
    for z in zones_data:
        for row in z.get("rows", []):
            if row.get("is_header", False):
                continue
            h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
            if h > 0:
                content_row_heights.append(h)
    avg_row_height = (
        sum(content_row_heights) / len(content_row_heights)
        if content_row_heights else 30.0
    )
    font_size_suggestion = max(10, int(avg_row_height * 0.6))

    return {
        "session_id": session_id,
        "image_width": img_w,
        "image_height": img_h,
        "zones": zones_data,
        "boxes_detected": boxes_detected,
        "summary": {
            "total_zones": len(zones_data),
            "total_columns": total_columns,
            "total_rows": total_rows,
            "total_cells": total_cells,
            "total_words": len(all_words),
            "recovered_colored": recovered_count,
            "color_stats": color_stats,
        },
        "formatting": {
            "bold_columns": [],
            "header_rows": [],
        },
        "layout_metrics": {
            "page_width_px": img_w,
            "page_height_px": img_h,
            "avg_row_height_px": round(avg_row_height, 1),
            "font_size_suggestion_px": font_size_suggestion,
        },
        "dictionary_detection": {
            "is_dictionary": dict_detection.get("is_dictionary", False),
            "confidence": dict_detection.get("confidence", 0.0),
            "signals": dict_detection.get("signals", {}),
            "article_col_index": dict_detection.get("article_col_index"),
            "headword_col_index": dict_detection.get("headword_col_index"),
        },
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
            # IPA counts as applied only when not skipped and a target exists.
            "ipa_applied": bool(ipa_target_cols) and not skip_ipa,
            "syllables_applied": syllable_insertions > 0,
        },
        "page_number": page_number_info,
        "duration_seconds": round(duration, 2),
    }
@@ -0,0 +1,489 @@
 """
 Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
 parenthesis fix, IPA phonetic correction, page ref extraction, and
 slash-IPA conversion.
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional, Set, Tuple
 from cv_color_detect import detect_word_colors
 from cv_ocr_engines import (
    fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
    _lookup_ipa,
 )
 from grid.editor.headers import (
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
 )
 logger = logging.getLogger(__name__)
 def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
 ) -> Dict[str, Any]:
    """Apply the text-level passes of the grid build to all zones.

    Passes, in order: word colour annotation (needs an image), heading
    detection by colour, unmatched ")" repair, IPA handling, heading
    detection by single-cell rows, bracket stripping on headings, page
    ref / footer extraction, slash-IPA conversion.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: Image passed to colour detection; the colour pass is
            skipped when it is None.
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode; "none" strips bracketed parts
            from all ``column_*`` cells instead of correcting them.
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    any_bracket_re = re.compile(r'\s*\[[^\]]+\]')
    trailing_bracket_re = re.compile(r'\s*\[[^\]]*\]\s*$')

    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        word_boxes: List[Dict] = [
            wb
            for zone in zones_data
            for cell in zone.get("cells", [])
            for wb in cell.get("word_boxes", [])
        ]
        detect_word_colors(img_bgr, word_boxes)

    # 5a. Heading detection by color + height
    colored_headings = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if colored_headings:
        logger.info("Detected %d heading rows by color+height", colored_headings)

    # 5b. A ")" without any "(" gets an opening parenthesis prepended
    for zone in zones_data:
        for cell in zone.get("cells", []):
            content = cell.get("text", "")
            if "(" not in content and ")" in content:
                cell["text"] = "(" + content

    # 5c. IPA phonetic correction
    all_cells = [cell for zone in zones_data for cell in zone.get("cells", [])]
    total_cols = sum(len(zone.get("columns", [])) for zone in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = ipa_mode == "none"

    if skip_ipa:
        # ipa_mode=none: strip ALL square brackets from ALL content columns
        for cell in all_cells:
            if not cell.get("col_type", "").startswith("column_"):
                continue
            content = cell.get("text", "")
            if "[" not in content:
                continue
            without_brackets = any_bracket_re.sub("", content)
            if without_brackets != content:
                cell["text"] = without_brackets.strip()
                cell["_ipa_corrected"] = True
    elif total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    else:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            col_type = cell.get("col_type", "")
            if col_type.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(col_type)

    # 5e. Heading detection by single-cell rows
    single_cell_headings = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_cell_headings:
        logger.info("Detected %d heading rows by single-cell heuristic", single_cell_headings)

    # 5f. Strip a trailing bracketed part from headings (never to empty)
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            content = cell.get("text", "")
            shortened = trailing_bracket_re.sub('', content).strip()
            if shortened and shortened != content:
                cell["text"] = shortened

    # 5g. Extract page_ref cells and footer rows
    _extract_page_refs_and_footers(zones_data, page_number_info)

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
 def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
 ) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
    en_col_type = None
    all_content_cols: set = set()
    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)
    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols
    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True
    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
    fix_cell_phonetics(all_cells, pronunciation="british")
    for cell in all_cells:
        orig = cell.pop("_orig_col_type", None)
        if orig:
            cell["col_type"] = orig
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True
    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)
    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True
    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                    if not cell_text:
                        continue
                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )
                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                else:
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
                    if len(_words_in_text) >= 3:
                        continue
                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
    return en_col_type, ipa_target_cols, all_content_cols
 def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
 ) -> None:
    """Extract page_ref cells and footer rows from content zones.
    Modifies zones_data in place. Updates page_number_info if a page number
    footer is found.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue
        # Extract column_1 cells that look like page references
        page_refs = []
        page_ref_cell_ids = set()
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })
            page_ref_cell_ids.add(cell.get("cell_id"))
        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
            last_cells = [c for c in z["cells"]
                          if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })
        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)
        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                page_number_info = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
                if pn_digits:
                    page_number_info["number"] = int(pn_digits)
        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in z["cells"]:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"
        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )
        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers
 def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
 ) -> None:
    """Convert slash-delimited IPA to bracket notation.
    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
    """
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    slash_ipa_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue
            def _replace_slash_ipa(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)
                inner_raw = ocr_ipa.strip("/").strip()
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
                if ipa:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                inner = inner_raw.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
                return m.group(0)
            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)
            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1
            if new_text != text:
                cell["text"] = new_text
    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
@@ -0,0 +1,464 @@
 """
 Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
 detection and zone-aware grid building.
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_graphic_detect import detect_graphic_elements
 from cv_color_detect import recover_colored_text
 from cv_vocab_types import PageZone
 from ocr_pipeline_session_store import get_session_image
 from grid.editor.filters import (
    _filter_border_strip_words,
    _filter_border_ghosts,
    _words_in_zone,
 )
 from grid.editor.zones import (
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
 )
 logger = logging.getLogger(__name__)
 async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
 ) -> Dict[str, Any]:
    """Load the session image, detect graphics/boxes, and build per-zone grids.
    The first available image among "cropped", "dewarped" and "original" is
    decoded. When decoding succeeds, CV-detected graphic regions are appended
    to ``graphic_rects`` and words centred inside them are dropped, colored
    text missed by OCR is recovered and appended, and bordered boxes are
    detected. If boxes are found, the page is split into zones and one grid
    is built per zone; otherwise a single "content" zone covering the content
    area is built from the (filtered) words.
    Args:
        session_id: Session whose stored image is loaded; also used in logs.
        session: Not read by this function.
        all_words: OCR word dicts; mutated in place (words are removed and
            recovered words appended).
        graphic_rects: Known graphic rectangles with "x"/"y"/"w"/"h" keys;
            freshly detected rectangles are appended in place.
        content_x, content_y, content_w, content_h: Content-area bounds
            passed to box detection, zone splitting and the fallback zone.
        img_w, img_h: Image size used for the percentage bounding boxes.
    Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr (the decoded image, or None if no image could be decoded).
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None
    # 3. Load image for box detection (first available variant wins)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            # Only words with at least 3 characters are passed to the detector
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words inside newly detected graphic regions
                # (a word is "inside" when its centre point lies in the rect)
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )
            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                # (checked against ALL graphic rects, not only the fresh ones)
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )
            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)
            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )
                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )
                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )
                # 3b. Detect vertical dividers and split content zones
                # NOTE(review): border_prefiltered_vd is never read afterwards.
                page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )
                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )
                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )
                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # _raw_columns is only needed for the column merge above
                    grid.pop("_raw_columns", None)
                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }
                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }
                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays
                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group
                    zones_data.append(zone_entry)
    # 4. Fallback: no boxes detected -> single zone with all words
    # (also reached when no image was available or decoding failed)
    if not zones_data:
        before = len(all_words)
        # Drop recovered words of at most 2 characters
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # NOTE(review): counts all_words, not the filtered_words the grid
            # was built from (the per-zone path counts the filtered words) —
            # confirm this is intended.
            "word_count": len(all_words),
            **grid,
        })
    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
 def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
 ) -> tuple:
    """Detect vertical dividers and split content zones.
    Returns (expanded_zones, border_prefiltered_from_vsplit).
    """
    vsplit_group_counter = 0
    expanded_zones: List = []
    for pz in page_zones:
        if pz.zone_type != "content":
            expanded_zones.append(pz)
            continue
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        divider_xs = _detect_vertical_dividers(
            zone_words, pz.x, pz.width, pz.y, pz.height
        )
        if divider_xs:
            sub_zones = _split_zone_at_vertical_dividers(
                pz, divider_xs, vsplit_group_counter
            )
            expanded_zones.extend(sub_zones)
            vsplit_group_counter += 1
            # Remove pipe words so they don't appear in sub-zones
            pipe_ids = set(
                id(w) for w in zone_words
                if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
            )
            all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
            logger.info(
                "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
                pz.index, [int(x) for x in divider_xs], len(sub_zones),
            )
        else:
            expanded_zones.append(pz)
    # Re-index zones
    for i, pz in enumerate(expanded_zones):
        pz.index = i
    return expanded_zones, False
 def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
 ) -> List[Dict[str, Any]]:
    """Build grids for each zone independently (first pass)."""
    zone_grids: List[Dict] = []
    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )
        # Filter recovered single-char artifacts in ALL zones
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len(w.get("text", "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )
        # Filter words inside image overlay regions (merged box zones)
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )
        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        bp = False
        if bs_removed:
            bp = True
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )
        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": bp,
        })
    return zone_grids
 def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
 ) -> None:
    """Second pass: merge column boundaries from all content zones.
    Modifies zone_grids in place.
    """
    content_zones = [
        zg for zg in zone_grids
        if zg["pz"].zone_type == "content"
        and zg["pz"].vsplit_group is None
    ]
    if len(content_zones) <= 1:
        return
    # Collect column split points (x_min of non-first columns)
    all_split_xs: List[float] = []
    for zg in content_zones:
        raw_cols = zg["grid"].get("_raw_columns", [])
        for col in raw_cols[1:]:
            all_split_xs.append(col["x_min"])
    if not all_split_xs:
        return
    all_split_xs.sort()
    merge_distance = max(25, int(content_w * 0.03))
    merged_xs = [all_split_xs[0]]
    for x in all_split_xs[1:]:
        if x - merged_xs[-1] < merge_distance:
            merged_xs[-1] = (merged_xs[-1] + x) / 2
        else:
            merged_xs.append(x)
    total_cols = len(merged_xs) + 1
    max_zone_cols = max(
        len(zg["grid"].get("_raw_columns", []))
        for zg in content_zones
    )
    if total_cols < max_zone_cols:
        return
    cx_min = min(w["left"] for w in all_words)
    cx_max = max(w["left"] + w["width"] for w in all_words)
    merged_columns: List[Dict[str, Any]] = []
    prev_x = cx_min
    for i, sx in enumerate(merged_xs):
        merged_columns.append({
            "index": i,
            "type": f"column_{i + 1}",
            "x_min": prev_x,
            "x_max": sx,
        })
        prev_x = sx
    merged_columns.append({
        "index": len(merged_xs),
        "type": f"column_{len(merged_xs) + 1}",
        "x_min": prev_x,
        "x_max": cx_max,
    })
    # Re-build ALL content zones with merged columns
    for zg in zone_grids:
        pz = zg["pz"]
        if pz.zone_type == "content":
            grid = _build_zone_grid(
                zg["words"], pz.x, pz.y,
                pz.width, pz.height,
                pz.index, img_w, img_h,
                global_columns=merged_columns,
                skip_first_row_header=bool(pz.image_overlays),
            )
            zg["grid"] = grid
    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(content_zones),
        total_cols, max_zone_cols,
    )
@@ -0,0 +1,15 @@
 """
 Grid Editor sub-package — FastAPI endpoints and helper functions.
 Modules:
  - api          — barrel re-export (combined router + _build_grid_core)
  - api_grid     — build-grid, save-grid, get-grid endpoints
  - api_gutter   — gutter-repair endpoints
  - api_box      — build-box-grids endpoints
  - api_unified  — build-unified-grid endpoints
  - helpers      — barrel re-export of all helper symbols
  - columns      — column detection, cross-column splitting
  - filters      — word/zone filtering, border ghosts
  - headers      — header/heading detection, colspan detection
  - zones        — vertical dividers, zone splitting/merging
 """
@@ -0,0 +1,31 @@
 """
 Grid Editor API — barrel re-export.
 The actual endpoints live in:
  - api_grid.py    (build-grid, rerun-ocr, save-grid, get-grid)
  - api_gutter.py  (gutter-repair, gutter-repair/apply)
  - api_box.py     (build-box-grids)
  - api_unified.py (build-unified-grid, unified-grid)
 This module re-exports the combined router and key symbols so that
 existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core`
 continue to work unchanged.
 """
 from fastapi import APIRouter
 from .api_grid import router as _grid_router
 from .api_gutter import router as _gutter_router
 from .api_box import router as _box_router
 from .api_unified import router as _unified_router
 # Re-export _build_grid_core so callers that do
 # `from grid_editor_api import _build_grid_core` keep working.
 from grid.build.core import _build_grid_core  # noqa: F401
 # Merge all sub-routers into one combined router
 router = APIRouter()
 router.include_router(_grid_router)
 router.include_router(_gutter_router)
 router.include_router(_box_router)
 router.include_router(_unified_router)
@@ -0,0 +1,177 @@
 """
 Grid Editor API — box-grid-review endpoints.
 """
 import logging
 from fastapi import APIRouter, HTTPException, Request
 from .filters import _words_in_zone
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-box-grids")
async def build_box_grids(session_id: str, request: Request):
    """Rebuild the grid structure of every detected box.

    Box coordinates come from ``structure_result.boxes`` (falling back to
    ``ground_truth.structure_result``); OCR words come from
    ``word_result.raw_paddle_words`` (falling back to
    ``raw_tesseract_words``).  All existing zones with ``zone_type == "box"``
    are dropped and replaced by freshly built ones, then the zone list is
    re-sorted by ``bbox_px.y`` and persisted as ``grid_editor_result``.

    Optional JSON body: ``{ "overrides": { "0": "bullet_list" } }`` mapping
    box_index -> forced layout_type.  A missing, unparsable or non-object
    body is treated as "no overrides".

    Returns a dict with ``session_id``, ``box_zones_rebuilt``,
    ``total_detected_boxes``, ``spell_fixes`` and the new ``zones`` list
    (or a short message dict when no boxes were detected).

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: no grid data or no raw OCR words are available.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Raw OCR words (with top/left/width/height keys)
    word_result = session.get("word_result") or {}
    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
    if not all_words:
        raise HTTPException(status_code=400, detail="No raw OCR words available.")

    # Detected boxes: session-level structure_result first, ground truth second
    structure_result = session.get("structure_result") or {}
    if not structure_result:
        gt = session.get("ground_truth") or {}
        structure_result = gt.get("structure_result") or {}
    detected_boxes = structure_result.get("boxes") or []
    if not detected_boxes:
        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}

    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0)
    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0)

    # Filter out false-positive boxes whose centre lies in the header/footer
    # margin (top or bottom 7% of the image height).
    if img_h > 0:
        margin_frac = 0.07
        margin_top = img_h * margin_frac
        margin_bottom = img_h * (1 - margin_frac)
        filtered = []
        for box in detected_boxes:
            by = box.get("y", 0)
            bh = box.get("h", 0)
            box_center_y = by + bh / 2
            if box_center_y < margin_top or box_center_y > margin_bottom:
                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
                            by, bh, box_center_y, margin_top, margin_bottom)
                continue
            filtered.append(box)
        detected_boxes = filtered

    # The body is optional: any parse failure or non-object payload simply
    # means "no overrides" instead of failing the whole rebuild.
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    layout_overrides = body.get("overrides")
    if not isinstance(layout_overrides, dict):
        layout_overrides = {}

    from cv_box_layout import build_box_zone_grid

    # Drop the old box zones first, THEN look for the highest remaining
    # zone_index.  Doing it the other way round makes the box zone indices
    # grow on every rebuild because the discarded box zones still count.
    zones = [z for z in grid_data.get("zones", []) if z.get("zone_type") != "box"]
    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)

    # The spell checker is optional; build it once instead of once per box.
    spell_checker = None
    try:
        from smart_spell import SmartSpellChecker
        spell_checker = SmartSpellChecker()
    except ImportError:
        pass

    def _pct(value, total):
        """Return *value* as a percentage of *total* (0 when total is falsy)."""
        return round(value / total * 100, 2) if total else 0

    box_count = 0
    spell_fixes = 0

    for box_idx, box in enumerate(detected_boxes):
        bx = box.get("x", 0)
        by = box.get("y", 0)
        bw = box.get("w", 0)
        bh = box.get("h", 0)

        if bw <= 0 or bh <= 0:
            continue

        # Raw OCR words inside this box
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
            continue

        # Indexed by box position, so skipped boxes leave gaps in the numbering.
        zone_idx = max_zone_idx + 1 + box_idx
        forced_layout = layout_overrides.get(str(box_idx))

        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )

        # Spell-correct every non-empty cell; spell_fixes counts changed cells.
        if spell_checker is not None:
            for cell in box_grid.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                result = spell_checker.correct_text(text, lang="auto")
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fixes += 1

        zones.append({
            "zone_index": zone_idx,
            "zone_type": "box",
            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
            "bbox_pct": {
                "x": _pct(bx, img_w),
                "y": _pct(by, img_h),
                "w": _pct(bw, img_w),
                "h": _pct(bh, img_h),
            },
            "border": None,
            "word_count": len(zone_words),
            "columns": box_grid["columns"],
            "rows": box_grid["rows"],
            "cells": box_grid["cells"],
            "header_rows": box_grid.get("header_rows", []),
            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
            "box_grid_reviewed": False,
            "box_bg_color": box.get("bg_color_name", ""),
            "box_bg_hex": box.get("bg_color_hex", ""),
        })
        box_count += 1

    # Sort zones by y-position for correct reading order
    zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0))

    grid_data["zones"] = zones
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
        session_id, box_count, spell_fixes, len(detected_boxes),
    )

    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "total_detected_boxes": len(detected_boxes),
        "spell_fixes": spell_fixes,
        "zones": zones,
    }
@@ -0,0 +1,334 @@
 """
 Grid Editor API — grid build, save, and retrieve endpoints.
 """
 import logging
 from fastapi import APIRouter, HTTPException, Query, Request
 from grid.build.core import _build_grid_core
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
):
    """Build a zone-aware structured grid from the session's stored word results.

    The heavy lifting is delegated to ``_build_grid_core``; this handler
    translates the query parameters, records an automatic reference snapshot
    under ``ground_truth.auto_grid_snapshot`` and persists the grid as
    ``grid_editor_result`` while setting ``current_step`` to 11.

    Query params:
        ipa_mode / syllable_mode: one of auto, all, de, en, none; passed
            through unchanged to ``_build_grid_core``.
        enhance: passed through unchanged.
        max_cols: values <= 0 are forwarded as ``None`` (no limit).
        min_conf: values <= 0 are forwarded as ``None`` (automatic).

    Returns the grid dict produced by ``_build_grid_core``.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: ``_build_grid_core`` raised ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # 0 (or negative) means "not set" for both numeric knobs.
    column_limit = max_cols if max_cols > 0 else None
    conf_floor = min_conf if min_conf > 0 else None

    try:
        result = await _build_grid_core(
            session_id,
            session,
            ipa_mode=ipa_mode,
            syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=column_limit,
            min_conf=conf_floor,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Imported lazily to avoid a circular dependency with ocr_pipeline_regression.
    from ocr_pipeline_regression import _build_reference_snapshot

    # Label the snapshot with the pipeline family the OCR engine belongs to.
    engine = (session.get("word_result") or {}).get("ocr_engine", "")
    if engine == "paddle_direct":
        auto_pipeline = "paddle-direct"
    elif engine in ("kombi", "rapid_kombi"):
        auto_pipeline = "kombi"
    else:
        auto_pipeline = "pipeline"

    # Keep the untouched automatic result for later comparison with manual edits.
    gt = session.get("ground_truth") or {}
    gt["auto_grid_snapshot"] = _build_reference_snapshot(result, pipeline=auto_pipeline)

    # Persist to DB and advance current_step to 11 (reconstruction complete)
    await update_session_db(session_id, grid_editor_result=result, ground_truth=gt, current_step=11)

    summary = result.get("summary", {})
    logger.info(
        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
        "%d boxes in %.2fs",
        session_id,
        len(result.get("zones", [])),
        summary.get("total_columns", 0),
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
        result.get("boxes_detected", 0),
        result.get("duration_seconds", 0),
    )

    return result
@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
async def rerun_ocr_and_build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
    """Re-run OCR with quality settings, then rebuild the grid.

    Unlike build-grid (which only rebuilds from existing words), this
    endpoint runs OCR again on the cached cropped (or, failing that,
    dewarped) image and then builds the grid from the new words.

    Steps executed: scan-quality scoring -> optional enhancement ->
    RapidOCR + Tesseract -> merge/deduplicate -> optional Vision-LLM fusion
    -> persist ``word_result`` -> grid build -> persist ``grid_editor_result``.

    Scan-quality scoring, enhancement, RapidOCR and Vision-LLM fusion are
    best-effort: a failure is logged as a warning and the step is skipped.

    The returned grid additionally carries ``scan_quality`` and
    ``ocr_stats``; both are attached after the grid has been persisted, so
    they are part of the response only.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: no cropped/dewarped image is cached, or
            ``_build_grid_core`` raised ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    import time as _time
    t0 = _time.time()

    def _word_boxes(words):
        """Project OCR word dicts onto the fields stored in word_result."""
        return [
            {"text": w["text"], "left": w["left"], "top": w["top"],
             "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
            for w in words
        ]

    # 1. Load the cropped/dewarped image from cache or session
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")

    img_h, img_w = dewarped_bgr.shape[:2]
    ocr_input = dewarped_bgr.copy()

    # 2. Scan quality assessment (also supplies the default confidence floor)
    scan_quality_info = {}
    try:
        from scan_quality import score_scan_quality
        quality_report = score_scan_quality(ocr_input)
        scan_quality_info = quality_report.to_dict()
        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
    except Exception as e:
        logger.warning("rerun-ocr: scan quality failed: %s", e)
        actual_min_conf = min_conf if min_conf > 0 else 40

    # 3. Image enhancement (Step 3) — only for scans flagged as degraded.
    # enhance_applied records whether enhancement actually ran, so the
    # response does not claim success when the step raised and was skipped.
    is_degraded = scan_quality_info.get("is_degraded", False)
    enhance_applied = False
    if enhance and is_degraded:
        try:
            from ocr_image_enhance import enhance_for_ocr
            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
            enhance_applied = True
            logger.info("rerun-ocr: CLAHE enhancement applied")
        except Exception as e:
            logger.warning("rerun-ocr: enhancement failed: %s", e)

    # 4. Run dual-engine OCR
    from PIL import Image
    import pytesseract

    # RapidOCR (best-effort)
    rapid_words = []
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
    except Exception as e:
        logger.warning("rerun-ocr: RapidOCR failed: %s", e)

    # Tesseract — the channel axis is reversed before handing the array to PIL
    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
    tess_words = []
    for i, raw_text in enumerate(data["text"]):
        text = (raw_text or "").strip()
        conf_raw = str(data["conf"][i])
        # Non-integer confidence strings are treated as -1 (always filtered).
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < actual_min_conf:
            continue
        tess_words.append({
            "text": text, "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i], "conf": conf,
        })

    # 5. Merge OCR results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words

    # 6. Store updated word_result in session
    word_result = {
        "cells": [{"text": " ".join(w["text"] for w in merged_words),
                   "word_boxes": _word_boxes(merged_words)}],
        "image_width": img_w,
        "image_height": img_h,
        "ocr_engine": "rapid_kombi",
        "word_count": len(merged_words),
        "raw_paddle_words": rapid_words,
    }

    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
    vision_applied = False
    if vision_fusion:
        try:
            from vision_ocr_fusion import vision_fuse_ocr
            category = doc_category or session.get("document_category") or "vokabelseite"
            logger.info("rerun-ocr: running Vision-LLM fusion (category=%s)", category)
            merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
            vision_applied = True
            # Rebuild storage from fused words
            word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                     "word_boxes": _word_boxes(merged_words)}]
            word_result["word_count"] = len(merged_words)
            word_result["ocr_engine"] = "vision_fusion"
        except Exception as e:
            logger.warning("rerun-ocr: Vision-LLM fusion failed: %s", e)

    await update_session_db(session_id, word_result=word_result)

    # Reload session with updated word_result
    session = await get_session_db(session_id)

    ocr_duration = _time.time() - t0
    logger.info(
        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
        "(enhance=%s, min_conf=%d, quality=%s)",
        session_id, len(merged_words), len(rapid_words), len(tess_words),
        len(merged_words), ocr_duration, enhance, actual_min_conf,
        scan_quality_info.get("quality_pct", "?"),
    )

    # 7. Build grid from new words
    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Persist grid
    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    # Add quality info to response (attached after persisting)
    result["scan_quality"] = scan_quality_info
    result["ocr_stats"] = {
        "rapid_words": len(rapid_words),
        "tess_words": len(tess_words),
        "merged_words": len(merged_words),
        "min_conf_used": actual_min_conf,
        "enhance_applied": enhance_applied,
        "vision_fusion_applied": vision_applied,
        "document_category": doc_category or session.get("document_category", ""),
        "ocr_duration_seconds": round(ocr_duration, 1),
    }

    total_duration = _time.time() - t0
    logger.info(
        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
        session_id,
        len(result.get("zones", [])),
        result.get("summary", {}).get("total_columns", 0),
        result.get("summary", {}).get("total_cells", 0),
        total_duration,
    )

    return result
@router.post("/sessions/{session_id}/save-grid")
async def save_grid(session_id: str, request: Request):
    """Save edited grid data from the frontend Excel-like editor.

    Expects a JSON object containing at least a ``zones`` list.  The stored
    ``grid_editor_result`` is rebuilt from the body; ``image_width``,
    ``image_height``, ``boxes_detected``, ``summary`` and ``formatting``
    fall back to the previously stored values when absent, and
    ``duration_seconds`` is always taken from the previous result.  The
    result is flagged ``edited=True`` and ``current_step`` is set to 11.

    Returns ``{"session_id": ..., "saved": True}``.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the body is not valid JSON, is not an object,
            lacks ``zones``, or ``zones`` is not a list.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Malformed JSON is a client error, not a server crash.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc

    # Validate basic structure before anything is persisted.
    if not isinstance(body, dict) or "zones" not in body:
        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")
    zones = body["zones"]
    if not isinstance(zones, list):
        raise HTTPException(status_code=400, detail="'zones' must be a list")

    # Preserve metadata from the original build
    existing = session.get("grid_editor_result") or {}
    result = {
        "session_id": session_id,
        "image_width": body.get("image_width", existing.get("image_width", 0)),
        "image_height": body.get("image_height", existing.get("image_height", 0)),
        "zones": zones,
        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
        "summary": body.get("summary", existing.get("summary", {})),
        "formatting": body.get("formatting", existing.get("formatting", {})),
        "duration_seconds": existing.get("duration_seconds", 0),
        "edited": True,
    }

    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    logger.info("save-grid session %s: %d zones saved", session_id, len(zones))

    return {"session_id": session_id, "saved": True}
@router.get("/sessions/{session_id}/grid-editor")
async def get_grid(session_id: str):
    """Return the stored ``grid_editor_result`` of a session.

    Raises:
        HTTPException 404: the session does not exist, or it has no
            grid editor data yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_state = session.get("grid_editor_result")
    if grid_state:
        return grid_state

    raise HTTPException(
        status_code=404,
        detail="No grid editor data. Run build-grid first.",
    )
@@ -0,0 +1,110 @@
 """
 Grid Editor API — gutter repair endpoints.
 """
 import logging
 from fastapi import APIRouter, HTTPException, Request
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/gutter-repair")
async def gutter_repair(session_id: str):
    """Analyse the stored grid for gutter-edge OCR errors.

    Delegates to ``cv_gutter_repair.analyse_grid_for_gutter_repair`` and
    stores its result under ``ground_truth.gutter_repair`` before
    returning it.  According to the analyser's contract the suggestions
    cover:
      - Words truncated/blurred at the book binding (spell_fix)
      - Words split across rows with missing hyphen chars (hyphen_join)

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the session has no grid data yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(
            status_code=400,
            detail="No grid data. Run build-grid first.",
        )

    from cv_gutter_repair import analyse_grid_for_gutter_repair

    result = analyse_grid_for_gutter_repair(
        grid_data,
        image_width=grid_data.get("image_width", 0),
    )

    # Suggestions live inside ground_truth so no DB migration is needed.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["gutter_repair"] = result
    await update_session_db(session_id, ground_truth=ground_truth)

    suggestion_count = result.get("stats", {}).get("suggestions_found", 0)
    elapsed = result.get("duration_seconds", 0)
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        suggestion_count,
        elapsed,
    )

    return result
@router.post("/sessions/{session_id}/gutter-repair/apply")
async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body::

        {
          "accepted": ["suggestion_id_1", "suggestion_id_2", ...],
          "text_overrides": {"suggestion_id_1": "alternative text"}   # optional
        }

    ``text_overrides`` replaces a suggestion's ``suggested_text`` before the
    suggestions are handed to ``cv_gutter_repair.apply_gutter_suggestions``.
    The grid is then persisted as ``grid_editor_result``.

    Returns the dict produced by ``apply_gutter_suggestions``, or
    ``{"applied_count": 0, "changes": []}`` when nothing was accepted.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: no grid data, no stored gutter-repair result,
            or the body is not a JSON object.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")

    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )

    # A malformed or non-object body is a client error, not a server crash.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    accepted_ids = body.get("accepted", [])
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}

    # text_overrides: { suggestion_id: "alternative_text" }
    # Lets the user pick a different correction from the alternatives list.
    # null or a non-object value is treated as "no overrides".
    text_overrides = body.get("text_overrides")
    if not isinstance(text_overrides, dict):
        text_overrides = {}

    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])

    # Apply user-selected alternatives before passing to apply
    for s in suggestions:
        override = text_overrides.get(s.get("id", ""))
        if override:
            s["suggested_text"] = override

    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )

    return result
@@ -0,0 +1,71 @@
 """
 Grid Editor API — unified grid endpoints.
 """
 import logging
 from fastapi import APIRouter, HTTPException
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-unified-grid")
async def build_unified_grid_endpoint(session_id: str):
    """Build the unified grid for a session from its multi-zone grid.

    Passes the stored zones, image dimensions and layout metrics to
    ``unified_grid.build_unified_grid`` and persists the outcome as
    ``unified_grid_result``; the multi-zone ``grid_editor_result`` is not
    overwritten.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the session has no grid data yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    from unified_grid import build_unified_grid

    builder_input = {
        "zones": grid_data.get("zones", []),
        "image_width": grid_data.get("image_width", 0),
        "image_height": grid_data.get("image_height", 0),
        "layout_metrics": grid_data.get("layout_metrics", {}),
    }
    result = build_unified_grid(**builder_input)

    # Stored in its own field so the original multi-zone grid survives.
    await update_session_db(session_id, unified_grid_result=result)

    summary = result.get("summary", {})
    logger.info(
        "build-unified-grid session %s: %d rows, %d cells",
        session_id,
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
    )

    return result
@router.get("/sessions/{session_id}/unified-grid")
async def get_unified_grid(session_id: str):
    """Return the stored ``unified_grid_result`` of a session.

    Raises:
        HTTPException 404: the session does not exist, or no unified grid
            has been built for it yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    unified = session.get("unified_grid_result")
    if unified:
        return unified

    raise HTTPException(
        status_code=404,
        detail="No unified grid. Run build-unified-grid first.",
    )
@@ -0,0 +1,492 @@
 """
 Grid Editor — column detection, cross-column splitting, marker merging.
 Split from grid_editor_helpers.py for maintainability.
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Cross-column word splitting
 # ---------------------------------------------------------------------------
 _spell_cache: Optional[Any] = None
 _spell_loaded = False
 def _is_recognized_word(text: str) -> bool:
    """Check if *text* is a recognized German or English word.
    Uses the spellchecker library (same as cv_syllable_detect.py).
    Returns True for real words like "oder", "Kabel", "Zeitung".
    Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
    """
    global _spell_cache, _spell_loaded
    if not text or len(text) < 2:
        return False
    if not _spell_loaded:
        _spell_loaded = True
        try:
            from spellchecker import SpellChecker
            _spell_cache = SpellChecker(language="de")
        except Exception:
            pass
    if _spell_cache is None:
        return False
    return text.lower() in _spell_cache
 def _split_cross_column_words(
    words: List[Dict],
    columns: List[Dict],
 ) -> List[Dict]:
    """Split word boxes that span across column boundaries.
    When OCR merges adjacent words from different columns (e.g. "sichzie"
    spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
    split the word box at the column boundary so each piece is assigned
    to the correct column.
    Only splits when:
    - The word has significant overlap (>15% of its width) on both sides
    - AND the word is not a recognized real word (OCR merge artifact), OR
      the word contains a case transition (lowercase->uppercase) near the
      boundary indicating two merged words like "dasZimmer".
    """
    if len(columns) < 2:
        return words
    # Column boundaries = midpoints between adjacent column edges
    boundaries = []
    for i in range(len(columns) - 1):
        boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
        boundaries.append(boundary)
    new_words: List[Dict] = []
    split_count = 0
    for w in words:
        w_left = w["left"]
        w_width = w["width"]
        w_right = w_left + w_width
        text = (w.get("text") or "").strip()
        if not text or len(text) < 4 or w_width < 10:
            new_words.append(w)
            continue
        # Find the first boundary this word straddles significantly
        split_boundary = None
        for b in boundaries:
            if w_left < b < w_right:
                left_part = b - w_left
                right_part = w_right - b
                # Both sides must have at least 15% of the word width
                if left_part > w_width * 0.15 and right_part > w_width * 0.15:
                    split_boundary = b
                    break
        if split_boundary is None:
            new_words.append(w)
            continue
        # Compute approximate split position in the text.
        left_width = split_boundary - w_left
        split_ratio = left_width / w_width
        approx_pos = len(text) * split_ratio
        # Strategy 1: look for a case transition (lowercase->uppercase) near
        # the approximate split point — e.g. "dasZimmer" splits at 'Z'.
        split_char = None
        search_lo = max(1, int(approx_pos) - 3)
        search_hi = min(len(text), int(approx_pos) + 2)
        for i in range(search_lo, search_hi):
            if text[i - 1].islower() and text[i].isupper():
                split_char = i
                break
        # Strategy 2: if no case transition, only split if the whole word
        # is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
        # Real words like "oder", "Kabel", "Zeitung" must not be split.
        if split_char is None:
            clean = re.sub(r"[,;:.!?]+$", "", text)  # strip trailing punct
            if _is_recognized_word(clean):
                new_words.append(w)
                continue
            # Not a real word — use floor of proportional position
            split_char = max(1, min(len(text) - 1, int(approx_pos)))
        left_text = text[:split_char].rstrip()
        right_text = text[split_char:].lstrip()
        if len(left_text) < 2 or len(right_text) < 2:
            new_words.append(w)
            continue
        right_width = w_width - round(left_width)
        new_words.append({
            **w,
            "text": left_text,
            "width": round(left_width),
        })
        new_words.append({
            **w,
            "text": right_text,
            "left": round(split_boundary),
            "width": right_width,
        })
        split_count += 1
        logger.info(
            "split cross-column word %r -> %r + %r at boundary %.0f",
            text, left_text, right_text, split_boundary,
        )
    if split_count:
        logger.info("split %d cross-column word(s)", split_count)
    return new_words
 def _cluster_columns_by_alignment(
    words: List[Dict],
    zone_w: int,
    rows: List[Dict],
 ) -> List[Dict[str, Any]]:
    """Detect columns by clustering left-edge alignment across rows.

    Hybrid approach:
      1. Group words by row, find "group start" positions within each row
         (words preceded by a large gap or first word in row)
      2. Cluster group-start left-edges by X-proximity across rows
      3. Filter by row coverage (how many rows have a group start here)
      4. Merge nearby clusters
      5. Build column boundaries

    This filters out mid-phrase word positions (e.g. IPA transcriptions,
    second words in multi-word entries) by only considering positions
    where a new word group begins within a row.

    Args:
        words: Word dicts; ``left``, ``top``, ``width`` and ``height`` are
            read from every word.
        zone_w: Zone width in the same unit as the word coordinates.  It
            scales the clustering tolerance, the gap-threshold cap, the
            merge distance and the column margin.
        rows: Row dicts; ``y_center`` and ``index`` are read from every row.

    Returns:
        A list of column dicts with the keys ``index``, ``type``, ``x_min``
        and ``x_max``, ordered left to right.  ``type`` is ``"column_text"``
        when a single column is returned and ``"column_<n>"`` (1-based)
        otherwise.  An empty list is returned when *words* or *rows* is
        empty.
    """
    if not words or not rows:
        return []
    total_rows = len(rows)
    # Unreachable in practice: an empty ``rows`` already returned above.
    if total_rows == 0:
        return []
    # --- Group words by row ---
    # Each word goes to the row whose y_center is closest to the word's own
    # vertical centre; rows without any word never appear in ``row_words``.
    row_words: Dict[int, List[Dict]] = {}
    for w in words:
        y_center = w["top"] + w["height"] / 2
        best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
        row_words.setdefault(best["index"], []).append(w)
    # --- Compute adaptive gap threshold for group-start detection ---
    # Collect every positive horizontal gap between neighbouring words of the
    # same row (overlapping or touching words contribute nothing).
    all_gaps: List[float] = []
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        for i in range(len(sorted_rw) - 1):
            right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
            gap = sorted_rw[i + 1]["left"] - right
            if gap > 0:
                all_gaps.append(gap)
    if all_gaps:
        sorted_gaps = sorted(all_gaps)
        # Index len // 2 picks the upper median for even-sized samples.
        median_gap = sorted_gaps[len(sorted_gaps) // 2]
        heights = [w["height"] for w in words if w.get("height", 0) > 0]
        median_h = sorted(heights)[len(heights) // 2] if heights else 25
        # For small word counts (boxes, sub-zones): PaddleOCR returns
        # multi-word blocks, so ALL inter-word gaps are potential column
        # boundaries.  Use a low threshold based on word height — any gap
        # wider than ~1x median word height is a column separator.
        if len(words) <= 60:
            gap_threshold = max(median_h * 1.0, 25)
            logger.info(
                "alignment columns (small zone): gap_threshold=%.0f "
                "(median_h=%.0f, %d words, %d gaps: %s)",
                gap_threshold, median_h, len(words), len(sorted_gaps),
                [int(g) for g in sorted_gaps[:10]],
            )
        else:
            # Standard approach for large zones (full pages)
            gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
            # Cap at 25% of zone width -- but only when that cap itself is
            # above the 30 px floor, so the threshold never drops below it.
            max_gap = zone_w * 0.25
            if gap_threshold > max_gap > 30:
                logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w)
                gap_threshold = max_gap
    else:
        # No positive gap anywhere (e.g. one word per row): fixed fallback.
        gap_threshold = 50
    # --- Find group-start positions (left-edges that begin a new column) ---
    start_positions: List[tuple] = []  # (left_edge, row_index)
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        # First word in row is always a group start
        start_positions.append((sorted_rw[0]["left"], ri))
        for i in range(1, len(sorted_rw)):
            right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
            gap = sorted_rw[i]["left"] - right_prev
            if gap >= gap_threshold:
                start_positions.append((sorted_rw[i]["left"], ri))
    start_positions.sort(key=lambda x: x[0])
    logger.info(
        "alignment columns: %d group-start positions from %d words "
        "(gap_threshold=%.0f, %d rows)",
        len(start_positions), len(words), gap_threshold, total_rows,
    )
    # Defensive only: every populated row contributed its first word above,
    # so this list is non-empty whenever ``words`` is.
    if not start_positions:
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
    # --- Cluster group-start positions by X-proximity ---
    # Chain-linking: an edge joins the current cluster when it lies within
    # ``tolerance`` of the previously added edge, so a cluster may end up
    # wider than ``tolerance`` overall.
    tolerance = max(10, int(zone_w * 0.01))
    clusters: List[Dict[str, Any]] = []
    cur_edges = [start_positions[0][0]]
    cur_rows = {start_positions[0][1]}
    for left, row_idx in start_positions[1:]:
        if left - cur_edges[-1] <= tolerance:
            cur_edges.append(left)
            cur_rows.add(row_idx)
        else:
            clusters.append({
                "mean_x": int(sum(cur_edges) / len(cur_edges)),
                "min_edge": min(cur_edges),
                "max_edge": max(cur_edges),
                "count": len(cur_edges),
                "distinct_rows": len(cur_rows),
                "row_coverage": len(cur_rows) / total_rows,
            })
            cur_edges = [left]
            cur_rows = {row_idx}
    # Flush the cluster that was still open when the loop ended.
    clusters.append({
        "mean_x": int(sum(cur_edges) / len(cur_edges)),
        "min_edge": min(cur_edges),
        "max_edge": max(cur_edges),
        "count": len(cur_edges),
        "distinct_rows": len(cur_rows),
        "row_coverage": len(cur_rows) / total_rows,
    })
    # --- Filter by row coverage ---
    # These thresholds must be high enough to avoid false columns in flowing
    # text (random inter-word gaps) while still detecting real columns in
    # vocabulary worksheets (which typically have >80% row coverage).
    MIN_COVERAGE_PRIMARY = 0.35
    MIN_COVERAGE_SECONDARY = 0.12
    MIN_WORDS_SECONDARY = 4
    MIN_DISTINCT_ROWS = 3
    # Content extent, used below to decide whether a cluster sits near the
    # left or right content margin.
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    content_span = content_x_max - content_x_min
    primary = [
        c for c in clusters
        if c["row_coverage"] >= MIN_COVERAGE_PRIMARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]
    # Cluster dicts are compared by identity because they are unhashable.
    primary_ids = {id(c) for c in primary}
    secondary = [
        c for c in clusters
        if id(c) not in primary_ids
        and c["row_coverage"] >= MIN_COVERAGE_SECONDARY
        and c["count"] >= MIN_WORDS_SECONDARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]
    # Tertiary: narrow margin columns (page refs, markers) that cover too
    # few rows for secondary but sit near the left or right content margin
    # and are clearly separated from every primary/secondary cluster.  Words
    # that consistently start outside the established columns MUST be a
    # separate column, regardless of how few rows they cover; the only real
    # requirement is a clear spatial gap.
    used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
    sig_xs = [c["mean_x"] for c in primary + secondary]
    MIN_COVERAGE_TERTIARY = 0.02  # at least 1 row effectively
    tertiary = []
    for c in clusters:
        if id(c) in used_ids:
            continue
        # Never true here: every cluster is built from at least one row.
        if c["distinct_rows"] < 1:
            continue
        if c["row_coverage"] < MIN_COVERAGE_TERTIARY:
            continue
        # Must be near left or right content margin (within 15%)
        rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
        if not (rel_pos < 0.15 or rel_pos > 0.85):
            continue
        # Must have significant gap to nearest significant cluster
        # (skipped entirely when there is no primary/secondary cluster).
        if sig_xs:
            min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs)
            if min_dist < max(30, content_span * 0.02):
                continue
        tertiary.append(c)
    if tertiary:
        for c in tertiary:
            logger.info(
                "  tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
                c["mean_x"], c["min_edge"], c["max_edge"],
                c["count"], c["distinct_rows"], c["row_coverage"] * 100,
            )
    significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])
    for c in significant:
        logger.info(
            "  significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
            c["mean_x"], c["min_edge"], c["max_edge"],
            c["count"], c["distinct_rows"], c["row_coverage"] * 100,
        )
    # NOTE: the summary reports primary and secondary only; tertiary clusters
    # are included in the "significant" total but not listed separately.
    logger.info(
        "alignment columns: %d clusters, %d primary, %d secondary -> %d significant",
        len(clusters), len(primary), len(secondary), len(significant),
    )
    if not significant:
        # Fallback: single column covering all content
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
    # --- Merge nearby clusters ---
    # Work on copies so the dicts in ``clusters`` stay untouched.  mean_x
    # becomes the count-weighted integer mean; row_coverage is not
    # recomputed for a merged cluster (it is not read after this point).
    merge_distance = max(25, int(zone_w * 0.03))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s["mean_x"] - merged[-1]["mean_x"] < merge_distance:
            prev = merged[-1]
            total = prev["count"] + s["count"]
            prev["mean_x"] = (
                prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"]
            ) // total
            prev["count"] = total
            prev["min_edge"] = min(prev["min_edge"], s["min_edge"])
            prev["max_edge"] = max(prev["max_edge"], s["max_edge"])
            prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"])
        else:
            merged.append(s.copy())
    logger.info(
        "alignment columns: %d after merge (distance=%d)",
        len(merged), merge_distance,
    )
    # --- Build column boundaries ---
    # A column starts ``margin`` px left of its cluster's left-most edge
    # (clamped to the content start) and ends ``margin`` px left of the next
    # cluster's left-most edge; the last column runs to the content end.
    margin = max(5, int(zone_w * 0.005))
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    columns: List[Dict[str, Any]] = []
    for i, cluster in enumerate(merged):
        x_min = max(content_x_min, cluster["min_edge"] - margin)
        if i + 1 < len(merged):
            x_max = merged[i + 1]["min_edge"] - margin
        else:
            x_max = content_x_max
        columns.append({
            "index": i,
            "type": f"column_{i + 1}" if len(merged) > 1 else "column_text",
            "x_min": x_min,
            "x_max": x_max,
        })
    return columns
 _MARKER_CHARS = set("*-+#>")
 def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
 ) -> List[Dict]:
    """Merge narrow marker columns (bullets, numbering) into adjacent text.
    Bullet points (*, -) and numbering (1., 2.) create narrow columns
    at the left edge of a zone.  These are inline markers that indent text,
    not real separate columns.  Merge them with their right neighbour.
    Does NOT merge columns containing alphabetic words like "to", "in",
    "der", "die", "das" — those are legitimate content columns.
    """
    if len(columns) < 2:
        return columns
    merged: List[Dict] = []
    skip: set = set()
    for i, col in enumerate(columns):
        if i in skip:
            continue
        # Find words in this column
        col_words = [
            w for w in words
            if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
        ]
        col_width = col["x_max"] - col["x_min"]
        # Narrow column with mostly short words -> MIGHT be inline markers
        if col_words and col_width < 80:
            avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
            if avg_len <= 2 and i + 1 < len(columns):
                # Check if words are actual markers (symbols/numbers) vs
                # real alphabetic words like "to", "in", "der", "die"
                texts = [(w.get("text") or "").strip() for w in col_words]
                alpha_count = sum(
                    1 for t in texts
                    if t and t[0].isalpha() and t not in _MARKER_CHARS
                )
                alpha_ratio = alpha_count / len(texts) if texts else 0
                # If >=50% of words are alphabetic, this is a real column
                if alpha_ratio >= 0.5:
                    logger.info(
                        "  kept narrow column %d (w=%d, avg_len=%.1f, "
                        "alpha=%.0f%%) -- contains real words",
                        i, col_width, avg_len, alpha_ratio * 100,
                    )
                else:
                    # Merge into next column
                    next_col = columns[i + 1].copy()
                    next_col["x_min"] = col["x_min"]
                    merged.append(next_col)
                    skip.add(i + 1)
                    logger.info(
                        "  merged inline marker column %d (w=%d, avg_len=%.1f) "
                        "into column %d",
                        i, col_width, avg_len, i + 1,
                    )
                    continue
        merged.append(col)
    # Re-index
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
    return merged
@@ -0,0 +1,402 @@
 """
 Grid Editor — word/zone filtering, border ghosts, decorative margins, footers.
 Split from grid_editor_helpers.py for maintainability.
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 from typing import Any, Dict, List, Optional, Tuple
 logger = logging.getLogger(__name__)
 def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
    """Remove page-border decoration strip words BEFORE column detection.
    Scans from each page edge inward to find the first significant x-gap
    (>30 px).  If the edge cluster contains <15 % of total words, those
    words are removed as border-strip artifacts (alphabet letters,
    illustration fragments).
    Must run BEFORE ``_build_zone_grid`` so that column detection only
    sees real content words and doesn't produce inflated row counts.
    """
    if len(words) < 10:
        return words, 0
    sorted_words = sorted(words, key=lambda w: w.get("left", 0))
    total = len(sorted_words)
    # -- Left-edge scan (running max right-edge) --
    left_count = 0
    running_right = 0
    for gi in range(total - 1):
        running_right = max(
            running_right,
            sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
        )
        if sorted_words[gi + 1].get("left", 0) - running_right > 30:
            left_count = gi + 1
            break
    # -- Right-edge scan (running min left) --
    right_count = 0
    running_left = sorted_words[-1].get("left", 0)
    for gi in range(total - 1, 0, -1):
        running_left = min(running_left, sorted_words[gi].get("left", 0))
        prev_right = (
            sorted_words[gi - 1].get("left", 0)
            + sorted_words[gi - 1].get("width", 0)
        )
        if running_left - prev_right > 30:
            right_count = total - gi
            break
    # Validate candidate strip: real border decorations are mostly short
    # words (alphabet letters like "A", "Bb", stray marks).  Multi-word
    # content like "der Ranzen" or "die Schals" (continuation of German
    # translations) must NOT be removed.
    def _is_decorative_strip(candidates: List[Dict]) -> bool:
        if not candidates:
            return False
        short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
        return short / len(candidates) >= 0.45
    strip_ids: set = set()
    if left_count > 0 and left_count / total < 0.20:
        candidates = sorted_words[:left_count]
        if _is_decorative_strip(candidates):
            strip_ids = {id(w) for w in candidates}
    elif right_count > 0 and right_count / total < 0.20:
        candidates = sorted_words[total - right_count:]
        if _is_decorative_strip(candidates):
            strip_ids = {id(w) for w in candidates}
    if not strip_ids:
        return words, 0
    return [w for w in words if id(w) not in strip_ids], len(strip_ids)
 # Characters that are typically OCR artefacts from box border lines.
 # Intentionally excludes ! (red markers) and . , ; (real punctuation).
 _GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+")
 def _filter_border_ghosts(
    words: List[Dict],
    boxes: List,
 ) -> tuple:
    """Remove words sitting on box borders that are OCR artefacts.
    Returns (filtered_words, removed_count).
    """
    if not boxes or not words:
        return words, 0
    # Build border bands from detected boxes
    x_bands: List[tuple] = []
    y_bands: List[tuple] = []
    for b in boxes:
        bt = (
            b.border_thickness
            if hasattr(b, "border_thickness")
            else b.get("border_thickness", 3)
        )
        # Skip borderless boxes (images/graphics) -- no border line to produce ghosts
        if bt == 0:
            continue
        bx = b.x if hasattr(b, "x") else b.get("x", 0)
        by = b.y if hasattr(b, "y") else b.get("y", 0)
        bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
        bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
        margin = max(bt * 2, 10) + 6
        x_bands.append((bx - margin, bx + margin))
        x_bands.append((bx + bw - margin, bx + bw + margin))
        y_bands.append((by - margin, by + margin))
        y_bands.append((by + bh - margin, by + bh + margin))
    def _is_ghost(w: Dict) -> bool:
        text = (w.get("text") or "").strip()
        if not text:
            return False
        # Check if any word edge (not just center) touches a border band
        w_left = w["left"]
        w_right = w["left"] + w["width"]
        w_top = w["top"]
        w_bottom = w["top"] + w["height"]
        on_border = (
            any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
            or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
        )
        if not on_border:
            return False
        if len(text) == 1 and text in _GRID_GHOST_CHARS:
            return True
        return False
    filtered = [w for w in words if not _is_ghost(w)]
    return filtered, len(words) - len(filtered)
 def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Extract all word_boxes from cells into a flat list of word dicts."""
    words: List[Dict] = []
    for cell in cells:
        for wb in cell.get("word_boxes") or []:
            if wb.get("text", "").strip():
                words.append({
                    "text": wb["text"],
                    "left": wb["left"],
                    "top": wb["top"],
                    "width": wb["width"],
                    "height": wb["height"],
                    "conf": wb.get("conf", 0),
                })
    return words
 def _words_in_zone(
    words: List[Dict],
    zone_y: int,
    zone_h: int,
    zone_x: int,
    zone_w: int,
 ) -> List[Dict]:
    """Filter words whose Y-center falls within a zone's bounds."""
    zone_y_end = zone_y + zone_h
    zone_x_end = zone_x + zone_w
    result = []
    for w in words:
        cy = w["top"] + w["height"] / 2
        cx = w["left"] + w["width"] / 2
        if zone_y <= cy <= zone_y_end and zone_x <= cx <= zone_x_end:
            result.append(w)
    return result
 def _get_content_bounds(words: List[Dict]) -> tuple:
    """Get content bounds from word positions."""
    if not words:
        return 0, 0, 0, 0
    x_min = min(w["left"] for w in words)
    y_min = min(w["top"] for w in words)
    x_max = max(w["left"] + w["width"] for w in words)
    y_max = max(w["top"] + w["height"] for w in words)
    return x_min, y_min, x_max - x_min, y_max - y_min
 def _filter_decorative_margin(
    words: List[Dict],
    img_w: int,
    log: Any,
    session_id: str,
 ) -> Dict[str, Any]:
    """Remove words that belong to a decorative alphabet strip on a margin.
    Some vocabulary worksheets have a vertical A-Z alphabet graphic along
    the left or right edge.  OCR reads each letter as an isolated single-
    character word.  These decorative elements are not content and confuse
    column/row detection.
    Detection criteria (phase 1 -- find the strip using single-char words):
      - Words are in the outer 30% of the page (left or right)
      - Nearly all words are single characters (letters or digits)
      - At least 8 such words form a vertical strip (>=8 unique Y positions)
      - Average horizontal spread of the strip is small (< 80px)
    Phase 2 -- once a strip is confirmed, also remove any short word (<=3
    chars) in the same narrow x-range.  This catches multi-char OCR
    artifacts like "Vv" that belong to the same decorative element.
    Modifies *words* in place.
    Returns:
        Dict with 'found' (bool), 'side' (str), 'letters_detected' (int).
    """
    no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
    if not words or img_w <= 0:
        return no_strip
    margin_cutoff = img_w * 0.30
    # Phase 1: find candidate strips using short words (1-2 chars).
    # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
    # rather than singles, so accept <=2-char words as strip candidates.
    left_strip = [
        w for w in words
        if len((w.get("text") or "").strip()) <= 2
        and w["left"] + w.get("width", 0) / 2 < margin_cutoff
    ]
    right_strip = [
        w for w in words
        if len((w.get("text") or "").strip()) <= 2
        and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
    ]
    for strip, side in [(left_strip, "left"), (right_strip, "right")]:
        if len(strip) < 6:
            continue
        # Check vertical distribution: should have many distinct Y positions
        y_centers = sorted(set(
            int(w["top"] + w.get("height", 0) / 2) // 20 * 20  # bucket
            for w in strip
        ))
        if len(y_centers) < 6:
            continue
        # Check horizontal compactness
        x_positions = [w["left"] for w in strip]
        x_min = min(x_positions)
        x_max = max(x_positions)
        x_spread = x_max - x_min
        if x_spread > 80:
            continue
        # Phase 2: strip confirmed -- also collect short words in same x-range
        # Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U")
        strip_x_lo = x_min - 20
        strip_x_hi = x_max + 60  # word width + tolerance
        all_strip_words = [
            w for w in words
            if len((w.get("text") or "").strip()) <= 3
            and strip_x_lo <= w["left"] <= strip_x_hi
            and (w["left"] + w.get("width", 0) / 2 < margin_cutoff
                 if side == "left"
                 else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff)
        ]
        strip_set = set(id(w) for w in all_strip_words)
        before = len(words)
        words[:] = [w for w in words if id(w) not in strip_set]
        removed = before - len(words)
        if removed:
            log.info(
                "build-grid session %s: removed %d decorative %s-margin words "
                "(strip x=%d-%d)",
                session_id, removed, side, strip_x_lo, strip_x_hi,
            )
        return {"found": True, "side": side, "letters_detected": len(strip)}
    return no_strip
 def _filter_footer_words(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
 ) -> Optional[Dict]:
    """Remove isolated words in the bottom 5% of the page (page numbers).
    Modifies *words* in place and returns a page_number metadata dict
    if a page number was extracted, or None.
    """
    if not words or img_h <= 0:
        return None
    footer_y = img_h * 0.95
    footer_words = [
        w for w in words
        if w["top"] + w.get("height", 0) / 2 > footer_y
    ]
    if not footer_words:
        return None
    # Only remove if footer has very few words (<= 3) with short text
    total_text = "".join((w.get("text") or "").strip() for w in footer_words)
    if len(footer_words) <= 3 and len(total_text) <= 10:
        # Extract page number metadata before removing
        page_number_info = {
            "text": total_text.strip(),
            "y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
        }
        # Try to parse as integer
        digits = "".join(c for c in total_text if c.isdigit())
        if digits:
            page_number_info["number"] = int(digits)
        footer_set = set(id(w) for w in footer_words)
        words[:] = [w for w in words if id(w) not in footer_set]
        log.info(
            "build-grid session %s: extracted page number '%s' and removed %d footer words",
            session_id, total_text, len(footer_words),
        )
        return page_number_info
    return None
 def _filter_header_junk(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
 ) -> None:
    """Remove OCR junk from header illustrations above the real content.
    Textbook pages often have decorative header graphics (illustrations,
    icons) that OCR reads as low-confidence junk characters.  Real content
    typically starts further down the page.
    Algorithm:
      1. Find the "content start" -- the first Y position where a dense
         horizontal row of 3+ high-confidence words begins.
      2. Above that line, remove words with conf < 75 and text <= 3 chars.
         These are almost certainly OCR artifacts from illustrations.
    Modifies *words* in place.
    """
    if not words or img_h <= 0:
        return
    # --- Find content start: first horizontal row with >=3 high-conf words ---
    # Sort words by Y
    sorted_by_y = sorted(words, key=lambda w: w["top"])
    content_start_y = 0
    _ROW_TOLERANCE = img_h * 0.02  # words within 2% of page height = same row
    _MIN_ROW_WORDS = 3
    _MIN_CONF = 80
    i = 0
    while i < len(sorted_by_y):
        row_y = sorted_by_y[i]["top"]
        # Collect words in this row band
        row_words = []
        j = i
        while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE:
            row_words.append(sorted_by_y[j])
            j += 1
        # Count high-confidence words with real text (> 1 char)
        high_conf = [
            w for w in row_words
            if w.get("conf", 0) >= _MIN_CONF
            and len((w.get("text") or "").strip()) > 1
        ]
        if len(high_conf) >= _MIN_ROW_WORDS:
            content_start_y = row_y
            break
        i = j if j > i else i + 1
    if content_start_y <= 0:
        return  # no clear content start found
    # --- Remove low-conf short junk above content start ---
    junk = [
        w for w in words
        if w["top"] + w.get("height", 0) < content_start_y
        and w.get("conf", 0) < 75
        and len((w.get("text") or "").strip()) <= 3
    ]
    if not junk:
        return
    junk_set = set(id(w) for w in junk)
    before = len(words)
    words[:] = [w for w in words if id(w) not in junk_set]
    removed = before - len(words)
    if removed:
        log.info(
            "build-grid session %s: removed %d header junk words above y=%d "
            "(content start)",
            session_id, removed, content_start_y,
        )
@@ -0,0 +1,499 @@
 """
 Grid Editor — header/heading detection and colspan (merged cell) detection.
 Split from grid_editor_helpers.py.  Pure computation, no HTTP/DB side effects.
 Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import re
 from typing import Dict, List, Optional
 from cv_ocr_engines import _text_has_garbled_ipa
 logger = logging.getLogger(__name__)
 def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
    """Detect heading rows by color + height after color annotation.

    A row is treated as a heading if:
    1. ALL of its word_boxes have color_name != 'black' (typically 'blue');
       a word box without color_name counts as black.
    2. Its mean word height is > 1.2x the median word height of the zone.

    Zones without cells or rows, or with fewer than 2 columns, are skipped,
    as are rows already flagged ``is_header``.  A heading row made of
    several cells is replaced by one spanning ``col_type == "heading"``
    cell; a single-cell heading row is only re-typed.  The zone dicts are
    modified in place (``z["cells"]`` and ``row["is_header"]``).

    Cells whose ``word_boxes`` or ``text`` is ``None`` are tolerated when
    merging, matching the ``or []`` handling used during detection.

    Args:
        zones_data: Zone dicts carrying ``cells``, ``rows`` and ``columns``.
        img_w: Image width in pixels; 0 yields 0 for the x/w percentages.
        img_h: Image height in pixels; 0 yields 0 for the y/h percentages.

    Returns:
        Number of heading rows detected across all zones.
    """
    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if not cells or not rows or len(columns) < 2:
            continue
        # Median word height across the zone (upper middle element of the
        # sorted positive heights).
        all_heights = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                h = wb.get("height", 0)
                if h > 0:
                    all_heights.append(h)
        if not all_heights:
            continue
        all_heights.sort()
        median_h = all_heights[len(all_heights) // 2]
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue  # already detected as header
            ri = row["index"]
            row_wbs = [
                wb
                for cell in cells
                if cell.get("row_index") == ri
                for wb in cell.get("word_boxes") or []
            ]
            if not row_wbs:
                continue
            # Condition 1: ALL words are non-black
            if any(wb.get("color_name", "black") == "black" for wb in row_wbs):
                continue
            # Condition 2: mean height > 1.2x median
            mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
            if mean_h <= median_h * 1.2:
                continue
            heading_row_indices.append(ri)
        # ``cells`` is the list as it was before any merge below; lookups by
        # row index against it still find the original cells of each row.
        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue
            if len(header_cells) == 1:
                # Single cell -- just mark it as heading
                header_cells[0]["col_type"] = "heading"
            else:
                # Collect all word_boxes and text from all columns, left to
                # right.  ``or`` guards against keys stored with value None.
                all_wb = []
                all_text_parts = []
                for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                    all_wb.extend(hc.get("word_boxes") or [])
                    part = (hc.get("text") or "").strip()
                    if part:
                        all_text_parts.append(part)
                # Remove all cells for this row, replace with one spanning cell
                z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
                if all_wb:
                    x_min = min(wb["left"] for wb in all_wb)
                    y_min = min(wb["top"] for wb in all_wb)
                    x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                    y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                    # Use the actual starting col_index from the first cell
                    first_col = min(hc["col_index"] for hc in header_cells)
                    zone_idx = z.get("zone_index", 0)
                    z["cells"].append({
                        "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
                        "zone_index": zone_idx,
                        "row_index": hri,
                        "col_index": first_col,
                        "col_type": "heading",
                        "text": " ".join(all_text_parts),
                        "confidence": 0.0,
                        "bbox_px": {"x": x_min, "y": y_min,
                                    "w": x_max - x_min, "h": y_max - y_min},
                        "bbox_pct": {
                            "x": round(x_min / img_w * 100, 2) if img_w else 0,
                            "y": round(y_min / img_h * 100, 2) if img_h else 0,
                            "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                            "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                        },
                        "word_boxes": all_wb,
                        "ocr_engine": "words_first",
                        "is_bold": True,
                    })
            # Mark row as header
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
 def _detect_heading_rows_by_single_cell(
    zones_data: List[Dict], img_w: int, img_h: int,
 ) -> int:
    """Detect heading rows that have only a single content cell.

    Black headings like "Theme" have normal color and height, so they are
    missed by ``_detect_heading_rows_by_color``.  The distinguishing signal
    is that they occupy only one column while normal vocabulary rows fill
    at least 2-3 columns.

    A row qualifies as a heading if:
    1. It is not already marked as a header, and it is neither the first
       nor the last non-header row of the zone.
    2. It has exactly ONE content cell, i.e. a cell whose col_type starts
       with ``column_`` (column_1 only counts when its text is an article
       word such as "der"/"the").
    3. That single cell is NOT in the last column (continuation/example
       lines like "2. Ver\u00e4nderung, Wechsel" often sit alone there)
       and its col_index is at most one above the lowest col_index.
    4. The text is non-empty and does not start with ``[`` (IPA
       continuation) or ``(`` (continuation such as "(usw.)").
    5. The text is not garbled IPA (text containing real IPA symbols is
       kept) and is not a longer word (>4 letters) starting lowercase.
    6. The zone has >=3 columns and >=5 rows, and at least 40% of its rows
       have >=2 content cells (multi-column vocab layout).

    If more than 25% of the eligible rows of a zone would become headings
    the zone is skipped entirely.  Each detected row is replaced by one
    ``col_type == "heading"`` cell and its row gets ``is_header = True``;
    the zone dicts are modified in place.  Cells whose ``word_boxes`` or
    ``text`` is ``None`` are tolerated when merging.

    Args:
        zones_data: Zone dicts carrying ``cells``, ``rows`` and ``columns``.
        img_w: Image width in pixels; 0 yields 0 for the x/w percentages.
        img_h: Image height in pixels; 0 yields 0 for the y/h percentages.

    Returns:
        Number of heading rows detected across all zones.
    """
    # column_1 cells that contain a dictionary article word (die/der/das
    # etc.) ARE content -- they appear in dictionary layouts where the
    # leftmost column holds grammatical articles.
    _ARTICLE_WORDS = {
        "die", "der", "das", "dem", "den", "des", "ein", "eine",
        "the", "a", "an",
    }
    # Characters that mark text as real IPA rather than garbled OCR output.
    _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")

    def _is_content_cell(c: Dict) -> bool:
        """Return True for column_* cells, excluding non-article column_1."""
        ct = c.get("col_type", "")
        if not ct.startswith("column_"):
            return False
        if ct != "column_1":
            return True
        return (c.get("text") or "").strip().lower() in _ARTICLE_WORDS

    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if len(columns) < 3 or len(rows) < 5:
            continue
        # Lowest and highest col_index in use; the highest one is treated
        # as the example/sentence column.
        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
        if not col_indices:
            continue
        first_content_col = col_indices[0]
        last_col = col_indices[-1]
        # Count content cells per row
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            if _is_content_cell(cell):
                ri = cell.get("row_index", -1)
                row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
        # At least 40% of the rows must have >=2 content cells
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
        if multi_col_rows < len(rows) * 0.4:
            continue
        # Exclude first and last non-header rows -- these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]
        heading_row_indices = []
        for row in non_header_rows:
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            content_cells = [
                c for c in cells
                if c.get("row_index") == ri and _is_content_cell(c)
            ]
            if len(content_cells) != 1:
                continue
            cell = content_cells[0]
            # Not in the last column (continuation/example lines)
            if cell.get("col_index") == last_col:
                continue
            text = (cell.get("text") or "").strip()
            # "[" starts an IPA continuation, "(" a continuation line such
            # as "(usw.)" or "(TV-Serie)".
            if not text or text.startswith(("[", "(")):
                continue
            # Single cell NOT in the first content column is likely a
            # continuation/overflow line, not a heading.  Real headings
            # ("Theme 1", "Unit 3: ...") appear in the first or second
            # content column.
            if cell.get("col_index", 0) > first_content_col + 1:
                continue
            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
            # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]").
            # The cheap character test runs first so the helper is only
            # consulted when no real IPA symbol is present.
            if (not any(ch in _REAL_IPA_CHARS for ch in text)
                    and _text_has_garbled_ipa(text)):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch").  Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)
        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue
        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue
            # Collect all word_boxes and text, left to right.  ``or``
            # guards against keys stored with value None.
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes") or [])
                part = (hc.get("text") or "").strip()
                if part:
                    all_text_parts.append(part)
            first_col_idx = min(hc["col_index"] for hc in header_cells)
            # Remove old cells for this row, add spanning heading cell
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # Fallback to first cell bbox
                bp = header_cells[0].get("bbox_px") or {}
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)
            zone_idx = z.get("zone_index", 0)
            z["cells"].append({
                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
                "zone_index": zone_idx,
                "row_index": hri,
                "col_index": first_col_idx,
                "col_type": "heading",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                "is_bold": False,
            })
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
 def _detect_header_rows(
    rows: List[Dict],
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> List[int]:
    """Return the indices of header rows; only the first row is a candidate.

    Row 0 is reported as a header when either signal fires:
    * the vertical gap between row 0 and row 1 exceeds half the average
      row height, or
    * the tallest word whose vertical centre lies inside row 0 is more
      than 1.3x the median word height of the zone (20 is used as the
      median when the zone has no words).

    With fewer than two rows, or with ``skip_first_row_header`` set, the
    result is always empty.  ``zone_y`` and ``columns`` are accepted for
    interface compatibility but are not read here.

    Spanning-header detection (rows spanning all columns) is intentionally
    not performed: it produced too many false positives on vocabulary
    worksheets where IPA transcriptions or short entries naturally span
    multiple columns with few words.
    """
    if len(rows) < 2 or skip_first_row_header:
        return []
    top_row, next_row = rows[0], rows[1]
    # Signal 1: unusually large gap below the first row
    mean_row_height = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
    gap_below_top = next_row["y_min"] - top_row["y_max"]
    has_gap = gap_below_top > mean_row_height * 0.5
    # Signal 2: first-row words taller than usual (bold/header text)
    sorted_heights = sorted(w["height"] for w in zone_words)
    if sorted_heights:
        median_height = sorted_heights[len(sorted_heights) // 2]
    else:
        median_height = 20
    top_row_heights = [
        w["height"]
        for w in zone_words
        if top_row["y_min"] <= w["top"] + w["height"] / 2 <= top_row["y_max"]
    ]
    is_tall = bool(top_row_heights) and max(top_row_heights) > median_height * 1.3
    return [0] if (has_gap or is_tall) else []
 def _detect_colspan_cells(
    zone_words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    cells: List[Dict],
    img_w: int,
    img_h: int,
 ) -> List[Dict]:
    """Detect and merge cells that span multiple columns (colspan).

    A word-block that covers two or more columns (see ``_cols_covered``
    below) marks a merged cell.  In the affected row, the cell sitting in
    the span's first column is replaced by one ``spanning_header`` cell
    built from the ORIGINAL word-block; the cells in the span's remaining
    columns are dropped.  Cells outside any span are passed through.

    Args:
        zone_words: Word-blocks of the zone (``left``/``top``/``width``/
            ``height``/``text``), before any cross-column splitting.
        columns: Column dicts with ``index``, ``x_min`` and ``x_max``.
        rows: Row dicts, handed to ``_assign_word_to_row``.
        cells: Cells already built for the zone.
        img_w: Image width in pixels; 0 yields 0 for the x/w percentages.
        img_h: Image height in pixels; 0 yields 0 for the y/h percentages.

    Returns:
        ``cells`` itself when there are fewer than 2 columns, no words, no
        rows, or no spanning word-block; otherwise a new list of cells.
    """
    if len(columns) < 2 or not zone_words or not rows:
        return cells
    from cv_words_first import _assign_word_to_row

    def _cols_covered(w_left: float, w_right: float) -> List[int]:
        """Return list of column indices that a word-block covers."""
        covered = []
        for col in columns:
            col_mid = (col["x_min"] + col["x_max"]) / 2
            # Word covers a column if it extends past the column's midpoint
            if w_left < col_mid < w_right:
                covered.append(col["index"])
            # Also include column if word starts within it
            elif col["x_min"] <= w_left < col["x_max"]:
                covered.append(col["index"])
        return sorted(set(covered))

    # Group original word-blocks by row
    row_word_blocks: Dict[int, List[Dict]] = {}
    for w in zone_words:
        ri = _assign_word_to_row(w, rows)
        row_word_blocks.setdefault(ri, []).append(w)
    # For each row, collect the word-blocks that span multiple columns
    rows_to_merge: Dict[int, List[Dict]] = {}  # row_index -> spanning blocks
    for ri, wblocks in row_word_blocks.items():
        spanning = []
        for w in wblocks:
            w_left = w["left"]
            covered = _cols_covered(w_left, w_left + w["width"])
            if len(covered) >= 2:
                spanning.append({"word": w, "cols": covered})
        if spanning:
            rows_to_merge[ri] = spanning
    if not rows_to_merge:
        return cells
    # Merge cells for spanning rows
    new_cells = []
    for cell in cells:
        ri = cell.get("row_index", -1)
        if ri not in rows_to_merge:
            new_cells.append(cell)
            continue
        # Find the first span (if any) that contains this cell's column
        ci = cell.get("col_index", -1)
        span = next((s for s in rows_to_merge[ri] if ci in s["cols"]), None)
        if span is None:
            new_cells.append(cell)
            continue
        # Only emit the merged cell for the FIRST column in the span; cells
        # in the span's other columns are dropped.
        if ci != span["cols"][0]:
            continue
        # Use the ORIGINAL word-block text (not the split cell texts
        # which may have broken words like "euros a" + "nd cents")
        orig_word = span["word"]
        merged_text = orig_word.get("text", "").strip()
        # The merged bbox is simply the bbox of the original word-block
        x_min = orig_word["left"]
        y_min = orig_word["top"]
        x_max = orig_word["left"] + orig_word["width"]
        y_max = orig_word["top"] + orig_word["height"]
        new_cells.append({
            "cell_id": cell["cell_id"],
            "row_index": ri,
            "col_index": span["cols"][0],
            "col_type": "spanning_header",
            "colspan": len(span["cols"]),
            "text": merged_text,
            "confidence": cell.get("confidence", 0),
            "bbox_px": {"x": x_min, "y": y_min,
                        "w": x_max - x_min, "h": y_max - y_min},
            "bbox_pct": {
                "x": round(x_min / img_w * 100, 2) if img_w else 0,
                "y": round(y_min / img_h * 100, 2) if img_h else 0,
                "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
            },
            "word_boxes": [orig_word],
            "ocr_engine": cell.get("ocr_engine", ""),
            "is_bold": cell.get("is_bold", False),
        })
        logger.info(
            "colspan detected: row %d, cols %s -> merged %d cells (%r)",
            ri, span["cols"], len(span["cols"]), merged_text[:50],
        )
    return new_cells
@@ -0,0 +1,58 @@
 """
 Grid Editor helper functions — barrel re-export module.
 This file re-exports all public symbols from the split sub-modules
 so that existing ``from grid_editor_helpers import ...`` statements
 continue to work without changes.
 Sub-modules:
  - columns  — column detection, cross-column splitting, marker merging
  - filters  — word/zone filtering, border ghosts, decorative margins
  - headers  — header/heading detection, colspan detection
  - zones    — vertical dividers, zone splitting/merging, zone grid building
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 # --- Re-export: columns ---------------------------------------------------
 from .columns import (  # noqa: F401
    _is_recognized_word,
    _split_cross_column_words,
    _cluster_columns_by_alignment,
    _MARKER_CHARS,
    _merge_inline_marker_columns,
 )
 # --- Re-export: filters ----------------------------------------------------
 from .filters import (  # noqa: F401
    _filter_border_strip_words,
    _GRID_GHOST_CHARS,
    _filter_border_ghosts,
    _flatten_word_boxes,
    _words_in_zone,
    _get_content_bounds,
    _filter_decorative_margin,
    _filter_footer_words,
    _filter_header_junk,
 )
 # --- Re-export: headers ----------------------------------------------------
 from .headers import (  # noqa: F401
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
    _detect_header_rows,
    _detect_colspan_cells,
 )
 # --- Re-export: zones -------------------------------------------------------
 from .zones import (  # noqa: F401
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
 )
 # --- Re-export from cv_words_first (used by cv_box_layout.py) ---------------
 from cv_words_first import _cluster_rows  # noqa: F401
@@ -0,0 +1,389 @@
 """
 Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
 Split from grid_editor_helpers.py for maintainability.
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 from cv_vocab_types import PageZone
 from cv_words_first import _cluster_rows, _build_cells
 from .columns import (
    _cluster_columns_by_alignment,
    _merge_inline_marker_columns,
    _split_cross_column_words,
 )
 from .headers import (
    _detect_header_rows,
    _detect_colspan_cells,
 )
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Vertical divider detection and zone splitting
 # ---------------------------------------------------------------------------
 # Matches text that consists solely of one or more pipe characters.
 _PIPE_RE_VSPLIT = re.compile(r"^\|+$")
 def _detect_vertical_dividers(
    words: List[Dict],
    zone_x: int,
    zone_w: int,
    zone_y: int,
    zone_h: int,
 ) -> List[float]:
    """Locate vertical divider lines from pipe-only word boxes.

    Word boxes whose stripped text is nothing but ``|`` characters are
    clustered by x-center (neighbouring centers closer than
    ``max(15, 2% of zone_w)`` join the same cluster).  A cluster becomes a
    divider when it
    * holds at least 5 pipes,
    * has its mean x between 15% and 85% of the zone width, and
    * the pipes within tolerance of that mean span at least 50% of the
      zone height.

    ``zone_y`` is accepted but not read.

    Returns:
        Sorted divider x-positions; empty when there are no words, the
        zone has non-positive width/height, fewer than 5 pipes exist, or
        no cluster qualifies.
    """
    if not words or zone_w <= 0 or zone_h <= 0:
        return []
    pipe_boxes = [
        box for box in words
        if _PIPE_RE_VSPLIT.match((box.get("text") or "").strip())
    ]
    if len(pipe_boxes) < 5:
        return []
    x_tol = max(15, int(zone_w * 0.02))
    # Chain-cluster the sorted x-centers: a center joins the current group
    # when it is within tolerance of that group's most recent member.
    groups: List[List[float]] = []
    for center in sorted(box["left"] + box["width"] / 2 for box in pipe_boxes):
        if groups and center - groups[-1][-1] <= x_tol:
            groups[-1].append(center)
        else:
            groups.append([center])
    found: List[float] = []
    for group in groups:
        if len(group) < 5:
            continue
        group_x = sum(group) / len(group)
        # Must be between 15% and 85% of zone width
        fraction = (group_x - zone_x) / zone_w
        if fraction < 0.15 or fraction > 0.85:
            continue
        # Vertical coverage: top and bottom edges of every pipe lying
        # within tolerance of the group's mean x.
        edges = []
        for box in pipe_boxes:
            if abs(box["left"] + box["width"] / 2 - group_x) <= x_tol:
                edges.append(box["top"])
                edges.append(box["top"] + box["height"])
        covered = max(edges) - min(edges) if edges else 0
        if covered < zone_h * 0.5:
            continue
        found.append(group_x)
    return sorted(found)
 def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
 ) -> List["PageZone"]:
    """Cut *zone* into side-by-side sub-zones at the given divider x-positions.

    The sub-zone edges are the zone's left edge, each divider, and the
    zone's right edge, truncated to ``int``.  Every sub-zone copies the
    parent's ``zone_type``, ``y``, ``height``, ``box`` and
    ``image_overlays``, is tagged with ``vsplit_group_id`` and gets a
    ``layout_hint``: ``left_of_vsplit`` for the first piece,
    ``right_of_vsplit`` for the last, ``middle_of_vsplit`` otherwise.
    All pieces are created with ``index=0``.
    """
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    last_pos = len(edges) - 2
    pieces = []
    for pos, (left_edge, right_edge) in enumerate(zip(edges, edges[1:])):
        # The first piece wins the "left" label even when it is the only one
        if pos == 0:
            hint = "left_of_vsplit"
        elif pos == last_pos:
            hint = "right_of_vsplit"
        else:
            hint = "middle_of_vsplit"
        x_start = int(left_edge)
        x_end = int(right_edge)
        pieces.append(PageZone(
            index=0,  # re-indexed later
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=x_start,
            width=x_end - x_start,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=hint,
            vsplit_group=vsplit_group_id,
        ))
    return pieces
 def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
    content_w: int,
 ) -> List:
    """Fuse content zones that are only separated by box zones.

    Pattern: [content, box, content, (box, content)...] becomes a single
    content zone that spans from the topmost to the bottommost content
    zone of the run, placed at ``content_x`` with width ``content_w``.
    Each absorbed box zone is recorded in the merged zone's
    ``image_overlays`` (position/size, plus the box's geometry,
    confidence and border thickness when the zone has a ``box``).

    A box zone is absorbed only when it is directly followed by another
    content zone; every other zone is passed through unchanged.

    Returns:
        ``zones`` itself when it has fewer than 3 entries.  Otherwise a
        new list; the ``index`` attribute of every zone in that list is
        reassigned to its position (this mutates passed-through zones).
    """
    if len(zones) < 3:
        return zones
    total = len(zones)
    merged_zones: List = []
    pos = 0
    while pos < total:
        head = zones[pos]
        if head.zone_type != "content":
            merged_zones.append(head)
            pos += 1
            continue
        # Starting from a content zone, absorb consecutive [box, content]
        # pairs -- a box counts only if a content zone follows it.
        content_run = [head]
        boxes_between = []
        nxt = pos + 1
        while nxt < total:
            box_then_content = (
                zones[nxt].zone_type == "box"
                and nxt + 1 < total
                and zones[nxt + 1].zone_type == "content"
            )
            if not box_then_content:
                break
            boxes_between.append(zones[nxt])
            content_run.append(zones[nxt + 1])
            nxt += 2
        if not boxes_between:
            # Nothing to merge -- emit just the content zone
            merged_zones.append(head)
            pos += 1
            continue
        # One large content zone covering the whole run
        top = min(c.y for c in content_run)
        bottom = max(c.y + c.height for c in content_run)
        overlays = []
        for box_zone in boxes_between:
            entry = {
                "y": box_zone.y,
                "height": box_zone.height,
                "x": box_zone.x,
                "width": box_zone.width,
            }
            if box_zone.box:
                entry["box"] = {
                    "x": box_zone.box.x,
                    "y": box_zone.box.y,
                    "width": box_zone.box.width,
                    "height": box_zone.box.height,
                    "confidence": box_zone.box.confidence,
                    "border_thickness": box_zone.box.border_thickness,
                }
            overlays.append(entry)
        merged_zones.append(PageZone(
            index=0,  # re-indexed below
            zone_type="content",
            y=top,
            height=bottom - top,
            x=content_x,
            width=content_w,
            image_overlays=overlays,
        ))
        pos = nxt
    # Re-index zones
    for new_index, zone in enumerate(merged_zones):
        zone.index = new_index
    logger.info(
        "zone-merge: %d zones -> %d zones after merging across boxes",
        len(zones), len(merged_zones),
    )
    return merged_zones
 def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.
    Args:
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone.  Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure.  Box zones always detect columns independently.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }
    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)
    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
            logger.info(
                "  zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w['top'], w['left'], w['height'], w['width'],
                w.get('text', '')[:40],
            )
        for r in rows:
            logger.info(
                "  zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
            )
    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
    # Merge inline marker columns (bullets, numbering) into adjacent text
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)
    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }
    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2).  Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)
    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)
    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns.  _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)
    # Prefix cell IDs with zone index
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index
    # Detect header rows (pass columns for spanning header detection)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)
    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue
            # Collect all word_boxes and text from all columns
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            # Remove all header cells, replace with one spanning cell
            cells = [c for c in cells if c["row_index"] != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                cells.append({
                    "cell_id": f"R{hri:02d}_C0",
                    "row_index": hri,
                    "col_index": 0,
                    "col_type": "spanning_header",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })
    # Convert columns to output format with percentages
    out_columns = []
    for col in columns:
        x_min = col["x_min"]
        x_max = col["x_max"]
        out_columns.append({
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(x_min),
            "x_max_px": round(x_max),
            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
            "bold": False,
        })
    # Convert rows to output format with percentages
    out_rows = []
    for row in rows:
        out_rows.append({
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
            "is_header": row["index"] in header_rows,
        })
    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
@@ -1,305 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/cell_ops.py
-Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
+import importlib as _importlib
-garbled cell cleanup, word-box reordering, and max_columns enforcement.
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.build.cell_ops")
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Tuple
 from cv_ocr_engines import (
    _words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
 )
 logger = logging.getLogger(__name__)
 def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove bullet/artifact word_boxes and merge split words (Step 5i).

    Works in place on every cell that has at least two word_boxes and
    applies, in this order:

    * Rule (a): drop small (< 200 px² area), non-black, low-confidence
      (< 85) boxes.
    * Rule (a2): drop boxes whose text is one of ``_REMOVE_SYMBOLS``.
    * Rules (b)/(c): walk neighbouring boxes (sorted by ``left``) and,
      depending on their horizontal overlap, either schedule a merge
      (syllable-split words) or drop one of the two (duplicates).
    * Execute the scheduled merges, then filter the removed boxes and
      rebuild the cell text (unless the cell is flagged ``_ipa_corrected``).

    Finally, if the net removal counter is non-zero, cells that are left
    with neither word_boxes nor text are dropped from every zone.

    Args:
        zones_data: Zone dicts; their ``cells`` (and the word_box dicts
            inside them) are modified in place.
    """
    # Purely alphabetic token (incl. Latin-1/Latin Extended letters and
    # hyphens), optionally followed by trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Stand-alone symbols that are removed by rule (a2).
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
    # Net count of removed boxes; merges are subtracted again (see below).
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            # Single-box cells are never touched by this step.
            if len(wbs) < 2:
                continue
            # Indices into ``wbs`` that will be filtered out at the end.
            to_remove: set = set()
            # Rule (a): tiny coloured symbols
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)
            # Rule (a2): isolated non-alphanumeric symbols
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)
            # Rule (b) + (c): overlap and duplicate detection
            # Pairs (left index, right index) of boxes to be fused.
            to_merge: List[Tuple[int, int]] = []
            # Keep the original indices while sorting by x position so that
            # only horizontally adjacent boxes are compared.
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Overlap is measured relative to the narrower of the two boxes.
                overlap_pct = overlap / min_w if min_w > 0 else 0
                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()
                    # Syllable-split words
                    # (moderate overlap between two alphabetic tokens -> merge)
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue
                    # High overlap with short prefix
                    # (different alphabetic tokens, the shorter one <= 4 chars
                    # once trailing punctuation is ignored -> merge as well)
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    # Overlaps of at most 40% that were not merged are left alone.
                    if overlap_pct <= 0.40:
                        continue
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    # Very high overlap: prefer IPA-dictionary word
                    # (_lookup_ipa is an external helper; a truthy result is
                    # treated here as "word is known")
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue
                    # Otherwise drop the lower-confidence box; on a tie drop
                    # w1 if it is strictly taller, else w2.
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)
                # Near-adjacent (gap < 6 px) blue boxes with identical text are
                # treated as duplicates; the one with lower-or-equal confidence
                # (w1 on a tie) is dropped.
                elif (gap < 6
                      and w1.get("color_name") == "blue"
                      and w2.get("color_name") == "blue"
                      and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
            # Execute merges first (syllable-split words)
            if to_merge:
                # Maps an absorbed box index to the index it was merged into,
                # so chained merges (a+b, then b+c) land in the same box.
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    # Never merge with a box that is already scheduled for removal.
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Trailing punctuation of the left part is dropped before
                    # the two texts are concatenated.
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Union bounding box of both word_boxes.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    # The absorbed box is filtered out together with the removals ...
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # ... but must not be counted as a removed artifact: this
                    # offsets the ``+= len(to_remove)`` below.
                    bullet_removed -= 1
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Cells flagged as IPA-corrected keep their existing text.
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    # Only when the net counter is non-zero: drop cells that ended up with
    # neither word_boxes nor non-blank text, and log the net count.
    if bullet_removed:
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
 def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
    _COMMON_SHORT_WORDS = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
    artifact_cells_removed = 0
    for z in zones_data:
        before = len(z.get("cells", []))
        kept = []
        for cell in z.get("cells", []):
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
                    is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
                  and not re.match(r'^[pPsS]\.?\d+$', core)):
                is_artifact = True
            if is_artifact:
                kept.append(None)
            else:
                kept.append(cell)
        z["cells"] = [c for c in kept if c is not None]
        artifact_cells_removed += before - len(z["cells"])
    if artifact_cells_removed:
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
 def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Normalise word_box order to reading order (Step 5j)."""
    wb_reordered = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
            sorted_wbs = [w for line in lines for w in line]
            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
                cell["word_boxes"] = sorted_wbs
                wb_reordered += 1
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
 def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
 ) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k)."""
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue
        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )
        cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
        while len(cols) > max_columns:
            narrowest = cols_by_width.pop(0)
            ni = narrowest["index"]
            sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
            pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
            if pos + 1 < len(sorted_by_x):
                merge_target = sorted_by_x[pos + 1]
            elif pos > 0:
                merge_target = sorted_by_x[pos - 1]
            else:
                break
            ti = merge_target["index"]
            merge_target["x_min_px"] = min(
                merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
            )
            merge_target["x_max_px"] = max(
                merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
            )
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
            for cell in cells:
                if cell.get("col_index") == ni:
                    cell["col_index"] = ti
                    existing = next(
                        (c for c in cells if c["col_index"] == ti
                         and c["row_index"] == cell["row_index"]
                         and c is not cell),
                        None,
                    )
                    if existing:
                        existing["text"] = (
                            (existing.get("text", "") + " " + cell.get("text", "")).strip()
                        )
                        existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
                        cell["_merged"] = True
            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            cols.remove(narrowest)
            cols_by_width = [c for c in cols_by_width if c["index"] != ni]
        # Re-index columns 0..N-1
        for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
            old_idx = col["index"]
            col["index"] = new_idx
            for cell in cells:
                if cell.get("col_index") == old_idx:
                    cell["col_index"] = new_idx
        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
@@ -1,390 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/cleanup.py
-Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
+import importlib as _importlib
-divider removal, connector normalization, border strip detection, and
+import sys as _sys
-alphabet sidebar removal.
+_sys.modules[__name__] = _importlib.import_module("grid.build.cleanup")
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List
 from cv_ocr_engines import _words_to_reading_order_text
 logger = logging.getLogger(__name__)
 _PIPE_RE = re.compile(r"^\|+$")
 def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
 ) -> bool:
    """Run all cleanup passes over the zone data, in a fixed order.

    Order: junk rows, artifact cells, oversized word boxes, pipe dividers,
    connector-column normalisation, border strips, alphabet sidebars.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered;
            handed to the border-strip pass.
        session_id: Accepted for logging purposes; not used by this
            function itself.

    Returns:
        The flag returned by the border-strip pass.
    """
    passes_before_border = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in passes_before_border:
        cleanup_pass(zones_data)
    updated_flag = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return updated_flag
 def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
    """Remove rows where ALL cells contain only short, low-confidence text.
    Also removes 'oversized stub' rows and 'scattered debris' rows.
    """
    _JUNK_CONF_THRESHOLD = 50
    _JUNK_MAX_TEXT_LEN = 3
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not cells or not rows:
            continue
        # Compute median word height across the zone for oversized detection
        all_wb_heights = [
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        ]
        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
        junk_row_indices = set()
        for row in rows:
            ri = row["index"]
            row_cells = [c for c in cells if c.get("row_index") == ri]
            if not row_cells:
                continue
            row_wbs = [
                wb for cell in row_cells
                for wb in cell.get("word_boxes") or []
            ]
            # Rule 1: ALL word_boxes are low-conf AND short text
            all_junk = True
            for wb in row_wbs:
                text = (wb.get("text") or "").strip()
                conf = wb.get("conf", 0)
                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
                    all_junk = False
                    break
            if all_junk and row_wbs:
                junk_row_indices.add(ri)
                continue
            # Rule 2: oversized stub -- <=3 words, short total text,
            # and word height > 1.8x median
            if len(row_wbs) <= 3:
                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
                has_page_ref = any(
                    re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
                    for wb in row_wbs
                )
                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
                    junk_row_indices.add(ri)
                    continue
            # Rule 3: scattered debris -- rows with only tiny fragments
            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
            if longest <= 2:
                junk_row_indices.add(ri)
                continue
        if junk_row_indices:
            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
            logger.info(
                "build-grid: removed %d junk rows from zone %d: %s",
                len(junk_row_indices), z["zone_index"],
                sorted(junk_row_indices),
            )
 def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove individual cells with a single very-short, low-conf word."""
    _ARTIFACT_MAX_LEN = 2
    _ARTIFACT_CONF_THRESHOLD = 65
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        artifact_ids = set()
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            if len(wbs) != 1:
                continue
            wb = wbs[0]
            text = (wb.get("text") or "").strip()
            conf = wb.get("conf", 100)
            if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
                artifact_ids.add(cell.get("cell_id"))
        if artifact_ids:
            z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
            logger.info(
                "build-grid: removed %d artifact cells from zone %d: %s",
                len(artifact_ids), z.get("zone_index", 0),
                [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
            )
 def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
    """Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        all_wh = [
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        ]
        if not all_wh:
            continue
        med_h = sorted(all_wh)[len(all_wh) // 2]
        oversized_threshold = med_h * 3
        removed_oversized = 0
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
            if len(filtered) < len(wbs):
                removed_oversized += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_oversized:
            z["cells"] = [c for c in cells if c.get("word_boxes")]
            logger.info(
                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
                removed_oversized, oversized_threshold, z.get("zone_index", 0),
            )
 def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
    """Remove pipe-character word_boxes (column divider artifacts)."""
    for z in zones_data:
        if z.get("vsplit_group") is not None:
            continue  # pipes already removed before split
        removed_pipes = 0
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
            if len(filtered) < len(wbs):
                removed_pipes += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_pipes:
            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
            logger.info(
                "build-grid: removed %d pipe-divider word_boxes from zone %d",
                removed_pipes, z.get("zone_index", 0),
            )
    # Strip pipe chars ONLY from cell edges (OCR artifacts).
    # Preserve pipes embedded in words as syllable separators.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if "|" in text:
                cleaned = text.strip("|").strip()
                if cleaned != text.strip():
                    cell["text"] = cleaned
 def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Normalize narrow connector columns where OCR appends noise chars.
    In synonym dictionaries a narrow column repeats the same word
    (e.g. "oder") in every row. OCR sometimes appends noise chars.
    """
    for z in zones_data:
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if not cols or not cells:
            continue
        for col in cols:
            ci = col.get("index")
            col_cells = [c for c in cells if c.get("col_index") == ci]
            if len(col_cells) < 3:
                continue
            text_counts: Dict[str, int] = {}
            for c in col_cells:
                t = (c.get("text") or "").strip()
                if t:
                    text_counts[t] = text_counts.get(t, 0) + 1
            if not text_counts:
                continue
            dominant_text = max(text_counts, key=text_counts.get)  # type: ignore[arg-type]
            dominant_count = text_counts[dominant_text]
            if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
                continue
            fixed = 0
            for c in col_cells:
                t = (c.get("text") or "").strip()
                if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
                    c["text"] = dominant_text
                    wbs = c.get("word_boxes") or []
                    if len(wbs) == 1:
                        wbs[0]["text"] = dominant_text
                    fixed += 1
            if fixed:
                logger.info(
                    "build-grid: normalized %d outlier cells in connector column %d "
                    "(dominant='%s') zone %d",
                    fixed, ci, dominant_text, z.get("zone_index", 0),
                )
 def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
 ) -> bool:
    """Detect and remove page-border decoration strips.
    Returns updated border_prefiltered flag.
    """
    border_strip_removed = 0
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        all_wbs_with_cell: list = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
        if len(all_wbs_with_cell) < 10:
            continue
        all_wbs_with_cell.sort(key=lambda t: t[0])
        total = len(all_wbs_with_cell)
        # -- Left-edge scan --
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
            )
            gap = all_wbs_with_cell[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break
        # -- Right-edge scan --
        right_strip_count = 0
        right_gap = 0
        running_left = all_wbs_with_cell[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, all_wbs_with_cell[gi][0])
            prev_right = (
                all_wbs_with_cell[gi - 1][0]
                + all_wbs_with_cell[gi - 1][1].get("width", 0)
            )
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break
        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
        if not strip_wbs:
            continue
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                border_strip_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        z["cells"] = [c for c in cells
                      if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            border_strip_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )
    return border_prefiltered
 def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
    """Remove decorative edge columns (alphabet sidebar safety net).
    Dictionary pages have A-Z letter sidebars that OCR reads as single-
    character word_boxes.
    """
    for z in zones_data:
        columns = z.get("columns", [])
        cells = z.get("cells", [])
        if len(columns) < 3 or not cells:
            continue
        col_cells: Dict[str, List[Dict]] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_"):
                col_cells.setdefault(ct, []).append(cell)
        col_types_ordered = sorted(col_cells.keys())
        if len(col_types_ordered) < 3:
            continue
        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
            edge_cells_list = col_cells.get(edge_ct, [])
            if len(edge_cells_list) < 3:
                continue
            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
            avg_len = sum(len(t) for t in texts) / len(texts)
            single_char = sum(1 for t in texts if len(t) <= 1)
            single_ratio = single_char / len(texts)
            if avg_len > 1.5:
                continue
            if single_ratio < 0.7:
                continue
            removed_count = len(edge_cells_list)
            edge_ids = {id(c) for c in edge_cells_list}
            z["cells"] = [c for c in cells if id(c) not in edge_ids]
            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
            logger.info(
                "Step 4f: removed decorative edge column '%s' from zone %d "
                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                edge_ct, z.get("zone_index", 0), removed_count,
                avg_len, single_ratio * 100,
            )
            break  # only remove one edge per zone
@@ -1,213 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/core.py
-Grid Build Core — the main _build_grid_core() function.
+import importlib as _importlib
-
+import sys as _sys
-Extracted from grid_editor_api.py for maintainability.
+_sys.modules[__name__] = _importlib.import_module("grid.build.core")
 Takes merged OCR word positions and builds a structured, zone-aware grid.
 The function delegates to phase-specific modules:
 - grid_build_zones.py   — image loading, graphic/box detection, zone grids
 - grid_build_cleanup.py — junk rows, artifacts, pipes, border strips
 - grid_build_text_ops.py — color, headings, IPA, page refs
 - grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result
 """
 import logging
 import time
 from typing import Any, Dict, List, Optional
 from grid_editor_helpers import (
    _flatten_word_boxes,
    _get_content_bounds,
    _filter_decorative_margin,
    _filter_footer_words,
    _filter_header_junk,
 )
 from grid_build_zones import _build_zones
 from grid_build_cleanup import _cleanup_zones
 from grid_build_text_ops import _process_text
 from grid_build_finalize import _finalize_grid
 logger = logging.getLogger(__name__)
 async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
 ) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.
    Validates the session's OCR word results, filters unwanted words
    (low confidence, decorative margin, footer, header junk, user exclude
    regions, structure graphics) and then runs the zone, cleanup, text and
    finalize phases in order.
    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db().  Read keys:
            "word_result", "document_category", "structure_result".
        ipa_mode: "auto" (only when English headwords detected), "all"
            (force IPA on all content columns), "en" (English column only),
            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words), "en" (English only),
            "de" (German only), or "none" (skip).
        enhance: Only written to the log in this function; it is not
            forwarded to any phase here.
        max_columns: Forwarded unchanged to _finalize_grid().
        min_conf: When set and positive, words whose "conf" is below this
            value are dropped before any other filter runs.  Words without
            a "conf" key are treated as confidence 100 and therefore kept.
    Returns:
        StructuredGrid result dict.
    Raises:
        ValueError: If session data is incomplete (no word_result cells,
            missing/zero image dimensions, or no word boxes in the cells).
    """
    t0 = time.time()
    # ── Phase 1: Input Validation & Word Filtering ──────────────────
    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")
    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")
    # 2a-pre. Apply min_conf filter if specified
    if min_conf and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)
    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]),
                enhance, max_columns, min_conf)
    # 2b. Filter decorative margin columns (alphabet graphics)
    # NOTE(review): the 2b/2c/2c2 helpers receive all_words and their results
    # are not assigned back to it, so they presumably filter the list in
    # place — confirm against grid_editor_helpers.
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)
    # Read document_category from session
    document_category = session.get("document_category")
    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
    # 2c2. Filter OCR junk from header illustrations
    _filter_header_junk(all_words, img_h, logger, session_id)
    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = []
    if structure_result:
        for er in structure_result.get("exclude_regions", []):
            exclude_rects.append({
                "x": er["x"], "y": er["y"],
                "w": er["w"], "h": er["h"],
            })
    if exclude_rects:
        before = len(all_words)
        filtered = []
        for w in all_words:
            # A word is excluded when its centre point lies inside a region
            # (region borders inclusive).
            w_cx = w["left"] + w.get("width", 0) / 2
            w_cy = w["top"] + w.get("height", 0) / 2
            inside = any(
                er["x"] <= w_cx <= er["x"] + er["w"]
                and er["y"] <= w_cy <= er["y"] + er["h"]
                for er in exclude_rects
            )
            if not inside:
                filtered.append(w)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )
    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = []
    if structure_result:
        for g in structure_result.get("graphics", []):
            graphic_rects.append({
                "x": g["x"], "y": g["y"],
                "w": g["w"], "h": g["h"],
            })
    if graphic_rects:
        before = len(all_words)
        # Same centre-point test as for the exclude regions above.
        all_words = [
            w for w in all_words
            if not any(
                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                for gr in graphic_rects
            )
        ]
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )
    # Content bounds are computed from the words that survived all filters.
    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
    # ── Phase 2: Image Processing & Zone Detection ──────────────────
    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h,
        img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]
    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
    # NOTE(review): the returned border_prefiltered value is not read again
    # in this function.
    border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)
    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )
    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
    # NOTE(review): duration is taken before _finalize_grid() runs, so the
    # value handed to it does not include the finalize phase — confirm
    # this is intended.
    duration = time.time() - t0
    result = _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
    )
    return result
@@ -1,452 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/finalize.py
-Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
+import importlib as _importlib
-dictionary detection, syllable dividers, spell checking, empty column
+import sys as _sys
-removal, and result assembly.
+_sys.modules[__name__] = _importlib.import_module("grid.build.finalize")
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 from grid_build_cell_ops import (
    _remove_bullets_and_artifacts,
    _remove_garbled_cells,
    _normalize_word_order,
    _enforce_max_columns,
 )
 logger = logging.getLogger(__name__)
 def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
 ) -> dict:
    """Apply the last post-processing steps and build the result dict.

    Steps, in order: bullet/artifact removal, garbled-cell removal, word
    order normalisation, optional max_columns enforcement, dictionary
    detection, word-gap merge, pipe auto-correction, syllable dividers,
    merged-word splitting, IPA bracket spacing, spell check, empty column
    removal, internal flag cleanup, and result assembly.

    Returns:
        The dict produced by _assemble_result().
    """
    from collections import Counter

    # Column total is taken up front, before any step below runs.
    column_total = 0
    for zone in zones_data:
        column_total += len(zone.get("columns", []))

    # 5i / 5j-pre / 5j: word_box and cell level cleanup
    _remove_bullets_and_artifacts(zones_data)
    _remove_garbled_cells(zones_data)
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # Dictionary detection on the assembled grid
    detection = _detect_dictionary(
        zones_data,
        img_w,
        img_h,
        document_category,
        margin_strip_detected,
    )

    # Word-gap merge (best effort)
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as exc:
        logger.warning("Word-gap merge failed: %s", exc)

    # Pipe auto-correction (best effort)
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as exc:
        logger.warning("Pipe autocorrect failed: %s", exc)

    # Syllable dividers, merged words, IPA spacing, spell check
    inserted = _insert_syllable_dividers(
        zones_data,
        img_bgr,
        session_id,
        syllable_mode,
        detection,
        en_col_type,
        all_content_cols,
        column_total,
    )
    _split_merged_words(zones_data, session_id)
    _fix_ipa_spacing(zones_data)
    _run_spell_checker(zones_data, session_id, en_col_type, column_total)

    # Log cell counts per column for content zones before columns are dropped
    for zone in zones_data:
        if zone.get("zone_type") != "content":
            continue
        per_column = Counter(c.get("col_index") for c in zone.get("cells", []))
        zone_columns = zone.get("columns", [])
        logger.info(
            "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
            zone.get("zone_index", 0), len(zone_columns), dict(sorted(per_column.items())),
        )

    _remove_empty_columns(zones_data)

    # Internal marker must not leak into the returned grid
    for zone in zones_data:
        for cell in zone.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data=zones_data,
        all_words=all_words,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        ipa_target_cols=ipa_target_cols,
        skip_ipa=skip_ipa,
        dict_detection=detection,
        page_number_info=page_number_info,
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
        syllable_insertions=inserted,
    )
 def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
 ) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid."""
    from cv_layout import _score_dictionary_signals
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
 def _insert_syllable_dividers(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    session_id: str,
    syllable_mode: str,
    dict_detection: Dict[str, Any],
    en_col_type: Optional[str],
    all_content_cols: set,
    total_cols: int,
 ) -> int:
    """Insert syllable dividers for dictionary pages. Returns insertion count."""
    syllable_insertions = 0
    if syllable_mode == "none" or img_bgr is None:
        if syllable_mode == "none":
            for z in zones_data:
                for cell in z.get("cells", []):
                    t = cell.get("text", "")
                    if "|" in t:
                        cell["text"] = t.replace("|", "")
        return syllable_insertions
    _syllable_eligible = False
    if syllable_mode in ("all", "de", "en"):
        _syllable_eligible = True
    elif (dict_detection.get("is_dictionary")
            and dict_detection.get("article_col_index") is not None):
        _syllable_eligible = True
    _syllable_col_filter: Optional[set] = None
    if syllable_mode == "en":
        _syllable_col_filter = {en_col_type} if en_col_type else set()
    elif syllable_mode == "de":
        if en_col_type and total_cols >= 3:
            _syllable_col_filter = all_content_cols - {en_col_type}
    if _syllable_eligible:
        try:
            from cv_syllable_detect import insert_syllable_dividers
            force_syllables = (syllable_mode in ("all", "de", "en"))
            syllable_insertions = insert_syllable_dividers(
                zones_data, img_bgr, session_id,
                force=force_syllables,
                col_filter=_syllable_col_filter,
            )
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)
    return syllable_insertions
 def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
 ) -> None:
    """Split merged words using dictionary lookup."""
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                        suffix = contraction + suffix
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        pass
 def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
    """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if text and "[" in text:
                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
                if fixed != text:
                    cell["text"] = fixed
 def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
 ) -> None:
    """Run SmartSpellChecker on all cells."""
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                elif total_cols <= 2:
                    lang = "auto"
                else:
                    lang = "auto"
                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1
        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)
 def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Remove columns that have no cells assigned."""
    for z in zones_data:
        cells = z.get("cells", [])
        used_col_indices = {c.get("col_index") for c in cells}
        old_cols = z.get("columns", [])
        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
        if len(new_cols) < len(old_cols):
            old_to_new = {}
            for new_i, col in enumerate(new_cols):
                old_i = col.get("col_index", col.get("index", new_i))
                old_to_new[old_i] = new_i
                col["col_index"] = new_i
                col["index"] = new_i
                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
            for cell in cells:
                old_ci = cell.get("col_index", 0)
                cell["col_index"] = old_to_new.get(old_ci, old_ci)
                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
            z["columns"] = new_cols
 def _assemble_result(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    session_id: str,
    ipa_mode: str,
    syllable_mode: str,
    ipa_target_cols: set,
    skip_ipa: bool,
    dict_detection: Dict[str, Any],
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
    syllable_insertions: int,
 ) -> dict:
    """Build the final result dict (Phase 6)."""
    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
    total_rows = sum(len(z.get("rows", [])) for z in zones_data)
    # Collect color statistics
    color_stats: Dict[str, int] = {}
    for z in zones_data:
        for cell in z.get("cells", []):
            for wb in cell.get("word_boxes", []):
                cn = wb.get("color_name", "black")
                color_stats[cn] = color_stats.get(cn, 0) + 1
    # Compute layout metrics
    all_content_row_heights: List[float] = []
    for z in zones_data:
        for row in z.get("rows", []):
            if not row.get("is_header", False):
                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
                if h > 0:
                    all_content_row_heights.append(h)
    avg_row_height = (
        sum(all_content_row_heights) / len(all_content_row_heights)
        if all_content_row_heights else 30.0
    )
    font_size_suggestion = max(10, int(avg_row_height * 0.6))
    return {
        "session_id": session_id,
        "image_width": img_w,
        "image_height": img_h,
        "zones": zones_data,
        "boxes_detected": boxes_detected,
        "summary": {
            "total_zones": len(zones_data),
            "total_columns": total_columns,
            "total_rows": total_rows,
            "total_cells": total_cells,
            "total_words": len(all_words),
            "recovered_colored": recovered_count,
            "color_stats": color_stats,
        },
        "formatting": {
            "bold_columns": [],
            "header_rows": [],
        },
        "layout_metrics": {
            "page_width_px": img_w,
            "page_height_px": img_h,
            "avg_row_height_px": round(avg_row_height, 1),
            "font_size_suggestion_px": font_size_suggestion,
        },
        "dictionary_detection": {
            "is_dictionary": dict_detection.get("is_dictionary", False),
            "confidence": dict_detection.get("confidence", 0.0),
            "signals": dict_detection.get("signals", {}),
            "article_col_index": dict_detection.get("article_col_index"),
            "headword_col_index": dict_detection.get("headword_col_index"),
        },
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
            "syllables_applied": syllable_insertions > 0,
        },
        "page_number": page_number_info,
        "duration_seconds": round(duration, 2),
    }
@@ -1,489 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/text_ops.py
-Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
+import importlib as _importlib
-parenthesis fix, IPA phonetic correction, page ref extraction, and
+import sys as _sys
-slash-IPA conversion.
+_sys.modules[__name__] = _importlib.import_module("grid.build.text_ops")
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional, Set, Tuple
 from cv_color_detect import detect_word_colors
 from cv_ocr_engines import (
    fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
    _lookup_ipa,
 )
 from grid_editor_helpers import (
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
 )
 logger = logging.getLogger(__name__)
 def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
 ) -> Dict[str, Any]:
    """Annotate colors, detect headings, handle IPA and extract page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None, which skips color annotation).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode; "none" strips square brackets from
            all ``column_*`` cells instead of correcting IPA.
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        word_boxes: List[Dict] = [
            wb
            for zone in zones_data
            for cell in zone.get("cells", [])
            for wb in cell.get("word_boxes", [])
        ]
        detect_word_colors(img_bgr, word_boxes)

    # 5a. Heading detection by color + height
    color_headings = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if color_headings:
        logger.info("Detected %d heading rows by color+height", color_headings)

    # 5b. A closing parenthesis without any opening one gets a leading "("
    for zone in zones_data:
        for cell in zone.get("cells", []):
            content = cell.get("text", "")
            if ")" in content and "(" not in content:
                cell["text"] = "(" + content

    # 5c. IPA phonetic correction
    every_cell = [cell for zone in zones_data for cell in zone.get("cells", [])]
    column_total = sum(len(zone.get("columns", [])) for zone in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = ipa_mode == "none"

    if skip_ipa:
        # ipa_mode=none: strip ALL square brackets from ALL content columns
        bracket_re = re.compile(r'\s*\[[^\]]+\]')
        for cell in every_cell:
            if not cell.get("col_type", "").startswith("column_"):
                continue
            content = cell.get("text", "")
            if "[" not in content:
                continue
            without_brackets = bracket_re.sub("", content)
            if without_brackets != content:
                cell["text"] = without_brackets.strip()
                cell["_ipa_corrected"] = True
    elif column_total >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            every_cell, column_total, ipa_mode, zones_data
        )
    else:
        # Fewer than 3 columns: only collect the non-empty content columns
        # (needed by finalize)
        for cell in every_cell:
            col_type = cell.get("col_type", "")
            if col_type.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(col_type)

    # 5e. Heading detection by single-cell rows
    single_cell_headings = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_cell_headings:
        logger.info("Detected %d heading rows by single-cell heuristic", single_cell_headings)

    # 5f. Strip a trailing [...] from heading cells (never down to empty text)
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            content = cell.get("text", "")
            shortened = re.sub(r'\s*\[[^\]]*\]\s*$', '', content).strip()
            if shortened and shortened != content:
                cell["text"] = shortened

    # 5g. Extract page_ref cells and footer rows
    _extract_page_refs_and_footers(zones_data, page_number_info)

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
 def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
 ) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
    en_col_type = None
    all_content_cols: set = set()
    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)
    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols
    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True
    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
    fix_cell_phonetics(all_cells, pronunciation="british")
    for cell in all_cells:
        orig = cell.pop("_orig_col_type", None)
        if orig:
            cell["col_type"] = orig
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True
    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)
    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True
    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                    if not cell_text:
                        continue
                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )
                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                else:
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
                    if len(_words_in_text) >= 3:
                        continue
                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
    return en_col_type, ipa_target_cols, all_content_cols
 def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
 ) -> None:
    """Extract page_ref cells and footer rows from content zones.
    Modifies zones_data in place. Updates page_number_info if a page number
    footer is found.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue
        # Extract column_1 cells that look like page references
        page_refs = []
        page_ref_cell_ids = set()
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })
            page_ref_cell_ids.add(cell.get("cell_id"))
        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
            last_cells = [c for c in z["cells"]
                          if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })
        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)
        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                page_number_info = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
                if pn_digits:
                    page_number_info["number"] = int(pn_digits)
        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in z["cells"]:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"
        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )
        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers
 def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
 ) -> None:
    """Convert slash-delimited IPA to bracket notation.
    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
    """
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    slash_ipa_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue
            def _replace_slash_ipa(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)
                inner_raw = ocr_ipa.strip("/").strip()
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
                if ipa:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                inner = inner_raw.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
                return m.group(0)
            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)
            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1
            if new_text != text:
                cell["text"] = new_text
    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
@@ -1,462 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/build/zones.py
-Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
+import importlib as _importlib
-detection and zone-aware grid building.
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.build.zones")
 Extracted from grid_build_core.py for maintainability.
 """
 import logging
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_graphic_detect import detect_graphic_elements
 from cv_color_detect import recover_colored_text
 from cv_vocab_types import PageZone
 from ocr_pipeline_session_store import get_session_image
 from grid_editor_helpers import (
    _filter_border_strip_words,
    _filter_border_ghosts,
    _words_in_zone,
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
 )
 logger = logging.getLogger(__name__)
 async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
 ) -> Dict[str, Any]:
    """Load the session image, detect graphics/boxes and build per-zone grids.

    The session image is fetched in the order "cropped", "dewarped",
    "original". When it decodes, graphic regions and bordered boxes are
    detected, the page is split into zones and one grid is built per zone.
    When no zone results from that (no image, decode failure or no boxes),
    a single content zone covering the whole content area is built instead.

    Side effects: ``all_words`` is mutated in place (words inside newly
    detected graphics, border ghosts and pipe dividers are removed; recovered
    colored words are appended) and ``graphic_rects`` is extended in place
    with freshly detected graphic rectangles.

    Args:
        session_id: Session whose image is loaded; also used in log messages.
        session: Not read by this function.
        all_words: OCR word dicts; the code reads ``left``/``top`` and
            optionally ``width``/``height``/``text``/``recovered``.
        graphic_rects: Known graphic rectangles with ``x``/``y``/``w``/``h``.
        content_x, content_y, content_w, content_h: Content-area bounds
            passed to box detection and zone splitting.
        img_w, img_h: Image size used to derive percentage bounding boxes.

    Returns:
        A dict with the keys ``zones_data``, ``boxes_detected``,
        ``recovered_count``, ``border_prefiltered`` and ``img_bgr``
        (``None`` when no image could be loaded or decoded).
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None
    # 3. Load image for box detection (first available variant wins)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            # Only words with at least 3 characters are handed to the detector.
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                # In-place: the caller's list also receives the new rectangles.
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words inside newly detected graphic regions
                # (a word counts as inside when its center lies in the rect).
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )
            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions (all known
                # rectangles, not only the freshly detected ones)
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )
            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)
            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )
                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )
                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )
                # 3b. Detect vertical dividers and split content zones
                # NOTE(review): border_prefiltered_vd is never read afterwards.
                page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )
                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )
                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )
                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # "_raw_columns" is only needed by the column merge above
                    # and is dropped before the grid is merged into the entry.
                    grid.pop("_raw_columns", None)
                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }
                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }
                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays
                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group
                    zones_data.append(zone_entry)
    # 4. Fallback: no boxes detected -> single zone with all words
    # (also reached when no image could be loaded or decoded)
    if not zones_data:
        before = len(all_words)
        # Drop recovered words of at most 2 characters (treated as artifacts).
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # NOTE(review): counts all_words, not filtered_words, whereas the
            # box path above counts the filtered per-zone words — confirm.
            "word_count": len(all_words),
            **grid,
        })
    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
 def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
 ) -> tuple:
    """Detect vertical dividers and split content zones.
    Returns (expanded_zones, border_prefiltered_from_vsplit).
    """
    vsplit_group_counter = 0
    expanded_zones: List = []
    for pz in page_zones:
        if pz.zone_type != "content":
            expanded_zones.append(pz)
            continue
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        divider_xs = _detect_vertical_dividers(
            zone_words, pz.x, pz.width, pz.y, pz.height
        )
        if divider_xs:
            sub_zones = _split_zone_at_vertical_dividers(
                pz, divider_xs, vsplit_group_counter
            )
            expanded_zones.extend(sub_zones)
            vsplit_group_counter += 1
            # Remove pipe words so they don't appear in sub-zones
            pipe_ids = set(
                id(w) for w in zone_words
                if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
            )
            all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
            logger.info(
                "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
                pz.index, [int(x) for x in divider_xs], len(sub_zones),
            )
        else:
            expanded_zones.append(pz)
    # Re-index zones
    for i, pz in enumerate(expanded_zones):
        pz.index = i
    return expanded_zones, False
 def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
 ) -> List[Dict[str, Any]]:
    """Build grids for each zone independently (first pass)."""
    zone_grids: List[Dict] = []
    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )
        # Filter recovered single-char artifacts in ALL zones
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len(w.get("text", "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )
        # Filter words inside image overlay regions (merged box zones)
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )
        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        bp = False
        if bs_removed:
            bp = True
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )
        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": bp,
        })
    return zone_grids
 def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
 ) -> None:
    """Second pass: merge column boundaries from all content zones.
    Modifies zone_grids in place.
    """
    content_zones = [
        zg for zg in zone_grids
        if zg["pz"].zone_type == "content"
        and zg["pz"].vsplit_group is None
    ]
    if len(content_zones) <= 1:
        return
    # Collect column split points (x_min of non-first columns)
    all_split_xs: List[float] = []
    for zg in content_zones:
        raw_cols = zg["grid"].get("_raw_columns", [])
        for col in raw_cols[1:]:
            all_split_xs.append(col["x_min"])
    if not all_split_xs:
        return
    all_split_xs.sort()
    merge_distance = max(25, int(content_w * 0.03))
    merged_xs = [all_split_xs[0]]
    for x in all_split_xs[1:]:
        if x - merged_xs[-1] < merge_distance:
            merged_xs[-1] = (merged_xs[-1] + x) / 2
        else:
            merged_xs.append(x)
    total_cols = len(merged_xs) + 1
    max_zone_cols = max(
        len(zg["grid"].get("_raw_columns", []))
        for zg in content_zones
    )
    if total_cols < max_zone_cols:
        return
    cx_min = min(w["left"] for w in all_words)
    cx_max = max(w["left"] + w["width"] for w in all_words)
    merged_columns: List[Dict[str, Any]] = []
    prev_x = cx_min
    for i, sx in enumerate(merged_xs):
        merged_columns.append({
            "index": i,
            "type": f"column_{i + 1}",
            "x_min": prev_x,
            "x_max": sx,
        })
        prev_x = sx
    merged_columns.append({
        "index": len(merged_xs),
        "type": f"column_{len(merged_xs) + 1}",
        "x_min": prev_x,
        "x_max": cx_max,
    })
    # Re-build ALL content zones with merged columns
    for zg in zone_grids:
        pz = zg["pz"]
        if pz.zone_type == "content":
            grid = _build_zone_grid(
                zg["words"], pz.x, pz.y,
                pz.width, pz.height,
                pz.index, img_w, img_h,
                global_columns=merged_columns,
                skip_first_row_header=bool(pz.image_overlays),
            )
            zg["grid"] = grid
    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(content_zones),
        total_cols, max_zone_cols,
    )
@@ -1,31 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/api.py
-Grid Editor API — barrel re-export.
+import importlib as _importlib
-
+import sys as _sys
-The actual endpoints live in:
+_sys.modules[__name__] = _importlib.import_module("grid.editor.api")
  - grid_editor_api_grid.py   (build-grid, rerun-ocr, save-grid, get-grid)
  - grid_editor_api_gutter.py (gutter-repair, gutter-repair/apply)
  - grid_editor_api_box.py    (build-box-grids)
  - grid_editor_api_unified.py (build-unified-grid, unified-grid)
 This module re-exports the combined router and key symbols so that
 existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core`
 continue to work unchanged.
 """
 from fastapi import APIRouter
 from grid_editor_api_grid import router as _grid_router
 from grid_editor_api_gutter import router as _gutter_router
 from grid_editor_api_box import router as _box_router
 from grid_editor_api_unified import router as _unified_router
 # Re-export _build_grid_core so callers that do
 # `from grid_editor_api import _build_grid_core` keep working.
 from grid_build_core import _build_grid_core  # noqa: F401
 # Merge all sub-routers into one combined router: the grid, gutter, box and
 # unified sub-routers are mounted (in this order) on a single APIRouter, so
 # importers of this module only need this one `router` object.
 router = APIRouter()
 router.include_router(_grid_router)
 router.include_router(_gutter_router)
 router.include_router(_box_router)
 router.include_router(_unified_router)
@@ -1,177 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/api_box.py
-Grid Editor API — box-grid-review endpoints.
+import importlib as _importlib
-"""
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.editor.api_box")
 import logging
 from fastapi import APIRouter, HTTPException, Request
 from grid_editor_helpers import _words_in_zone
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-box-grids")
 async def build_box_grids(session_id: str, request: Request):
    """Rebuild the box zones of a session's grid from the detected boxes.

    Box coordinates come from ``structure_result.boxes`` (falling back to the
    copy stored under ``ground_truth``); the OCR words come from
    ``word_result.raw_paddle_words`` or, if that is empty,
    ``raw_tesseract_words``. Existing zones of type ``"box"`` are dropped and
    rebuilt, the zone list is re-sorted by vertical position and written back
    to ``grid_editor_result``.

    Optional JSON body: ``{"overrides": {"0": "bullet_list"}}`` mapping the
    box index (as a string) to a forced layout type. A missing, unparsable or
    non-object body, or a non-object ``overrides`` value, is treated as
    "no overrides" instead of failing the request.

    Returns:
        A dict with ``session_id``, ``box_zones_rebuilt``,
        ``total_detected_boxes`` (counted after the header/footer filter),
        ``spell_fixes`` and the updated ``zones``. When the session has no
        detected boxes at all, a short summary dict is returned instead and
        nothing is persisted.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if there is no
            grid data or no raw OCR words.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")
    # Raw OCR words (dicts with top/left/width/height keys).
    word_result = session.get("word_result") or {}
    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
    if not all_words:
        raise HTTPException(status_code=400, detail="No raw OCR words available.")
    # Detected boxes: prefer the session-level structure_result, fall back to
    # the copy kept inside ground_truth.
    structure_result = session.get("structure_result") or {}
    if not structure_result:
        gt = session.get("ground_truth") or {}
        structure_result = gt.get("structure_result") or {}
    detected_boxes = structure_result.get("boxes") or []
    if not detected_boxes:
        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}
    # Image size: the grid result wins, the OCR word result is the fallback.
    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0)
    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0)
    # Filter out false-positive boxes in header/footer margins: a box is
    # dropped when its vertical centre lies in the top or bottom 7% of the page.
    if img_h > 0:
        margin_frac = 0.07  # 7% of image height
        margin_top = img_h * margin_frac
        margin_bottom = img_h * (1 - margin_frac)
        filtered = []
        for box in detected_boxes:
            by = box.get("y", 0)
            bh = box.get("h", 0)
            box_center_y = by + bh / 2
            if box_center_y < margin_top or box_center_y > margin_bottom:
                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
                            by, bh, box_center_y, margin_top, margin_bottom)
                continue
            filtered.append(box)
        detected_boxes = filtered
    # The body is optional, so any parse failure simply means "no overrides".
    try:
        body = await request.json()
    except Exception:
        body = {}
    # Valid JSON that is not an object (null, list, string, ...) has no
    # "overrides" either; without this guard `.get` would raise and yield a 500.
    if not isinstance(body, dict):
        body = {}
    layout_overrides = body.get("overrides")
    if not isinstance(layout_overrides, dict):
        layout_overrides = {}
    from cv_box_layout import build_box_zone_grid
    zones = grid_data.get("zones", [])
    # Highest existing zone_index, taken BEFORE the old box zones are removed,
    # so new box zones never reuse an index of a zone that existed so far.
    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)
    # Remove old box zones (they are rebuilt below).
    zones = [z for z in zones if z.get("zone_type") != "box"]
    box_count = 0
    spell_fixes = 0
    # Created on first use and then shared by all boxes.
    # NOTE(review): assumes a SmartSpellChecker instance can be reused across
    # correct_text() calls — confirm against smart_spell.
    ssc = None
    for box_idx, box in enumerate(detected_boxes):
        bx = box.get("x", 0)
        by = box.get("y", 0)
        bw = box.get("w", 0)
        bh = box.get("h", 0)
        if bw <= 0 or bh <= 0:
            continue
        # Filter raw OCR words inside this box.
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
            continue
        # box_idx counts skipped boxes too, so zone indices may have gaps.
        zone_idx = max_zone_idx + 1 + box_idx
        forced_layout = layout_overrides.get(str(box_idx))
        # Build box grid.
        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )
        # Apply SmartSpellChecker to all box cells; spell checking is
        # best-effort and silently skipped when the module is unavailable.
        try:
            if ssc is None:
                from smart_spell import SmartSpellChecker
                ssc = SmartSpellChecker()
            for cell in box_grid.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                result = ssc.correct_text(text, lang="auto")
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fixes += 1
        except ImportError:
            pass
        # Build zone entry (percent bbox falls back to 0 without image size).
        zone_entry = {
            "zone_index": zone_idx,
            "zone_type": "box",
            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
            "bbox_pct": {
                "x": round(bx / img_w * 100, 2) if img_w else 0,
                "y": round(by / img_h * 100, 2) if img_h else 0,
                "w": round(bw / img_w * 100, 2) if img_w else 0,
                "h": round(bh / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(zone_words),
            "columns": box_grid["columns"],
            "rows": box_grid["rows"],
            "cells": box_grid["cells"],
            "header_rows": box_grid.get("header_rows", []),
            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
            "box_grid_reviewed": False,
            "box_bg_color": box.get("bg_color_name", ""),
            "box_bg_hex": box.get("bg_color_hex", ""),
        }
        zones.append(zone_entry)
        box_count += 1
    # Sort zones by y-position for correct reading order.
    zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0))
    grid_data["zones"] = zones
    await update_session_db(session_id, grid_editor_result=grid_data)
    logger.info(
        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
        session_id, box_count, spell_fixes, len(detected_boxes),
    )
    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "total_detected_boxes": len(detected_boxes),
        "spell_fixes": spell_fixes,
        "zones": zones,
    }
@@ -1,337 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/api_grid.py
-Grid Editor API — grid build, save, and retrieve endpoints.
+import importlib as _importlib
-"""
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.editor.api_grid")
 import logging
 import time
 from typing import Any, Dict
 from fastapi import APIRouter, HTTPException, Query, Request
 from grid_build_core import _build_grid_core
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-grid")
 async def build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
 ):
    """Build the structured, zone-aware grid for a session and persist it.

    The grid is produced by ``_build_grid_core`` from the session's existing
    word results; this handler adds an automatic reference snapshot (stored
    under ``ground_truth["auto_grid_snapshot"]``) so that later manual
    corrections can be compared against the automatic result.

    Query params:
        ipa_mode: one of auto|all|de|en|none, forwarded to the core
            ("auto" = only when English IPA is detected, "all" = force,
            "none" = skip, per the original endpoint documentation).
        syllable_mode: one of auto|all|de|en|none, forwarded to the core
            ("auto" = only when the original has dividers).
        enhance: forwarded to the core unchanged.
        max_cols: maximum column count; 0 is passed on as ``None``.
        min_conf: minimum OCR confidence; 0 is passed on as ``None``.

    Returns:
        The grid dict returned by ``_build_grid_core``.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the core raises
            ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # A query value of 0 means "not set"; the core receives None instead.
    column_limit = max_cols if max_cols > 0 else None
    confidence_floor = min_conf if min_conf > 0 else None
    try:
        result = await _build_grid_core(
            session_id,
            session,
            ipa_mode=ipa_mode,
            syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=column_limit,
            min_conf=confidence_floor,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    # Imported here rather than at module level to avoid a circular
    # dependency with ocr_pipeline_regression.
    from ocr_pipeline_regression import _build_reference_snapshot
    # Map the OCR engine recorded with the words to the snapshot pipeline name.
    engine = (session.get("word_result") or {}).get("ocr_engine", "")
    if engine == "paddle_direct":
        auto_pipeline = "paddle-direct"
    else:
        auto_pipeline = "kombi" if engine in ("kombi", "rapid_kombi") else "pipeline"
    auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline)
    ground_truth = session.get("ground_truth") or {}
    ground_truth["auto_grid_snapshot"] = auto_snapshot
    # Persist grid + snapshot and advance current_step to 11
    # (reconstruction complete).
    await update_session_db(
        session_id,
        grid_editor_result=result,
        ground_truth=ground_truth,
        current_step=11,
    )
    summary = result.get("summary", {})
    logger.info(
        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
        "%d boxes in %.2fs",
        session_id,
        len(result.get("zones", [])),
        summary.get("total_columns", 0),
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
        result.get("boxes_detected", 0),
        result.get("duration_seconds", 0),
    )
    return result
@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
 async def rerun_ocr_and_build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
 ):
    """Re-run OCR with quality settings, then rebuild the grid.

    Unlike build-grid (which only rebuilds from existing words), this
    endpoint runs OCR again on the cached cropped (or, failing that,
    dewarped) image and then builds the grid from the new words.

    Steps executed: scan quality assessment -> optional image enhancement ->
    RapidOCR + Tesseract -> merge/deduplicate -> optional Vision-LLM fusion ->
    store ``word_result`` -> grid build -> store ``grid_editor_result``.

    Scan quality scoring, enhancement, RapidOCR and Vision-LLM fusion are
    best-effort: a failure is logged as a warning and the pipeline continues.
    Tesseract and the merge step are not guarded.

    Returns:
        The grid dict from ``_build_grid_core`` with two extra keys,
        ``scan_quality`` and ``ocr_stats``, which are added after the persist
        call.

    Raises:
        HTTPException: 404 for an unknown session; 400 when no cropped or
            dewarped image is cached or when the grid core raises ValueError.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    import time as _time
    # Start of the wall-clock measurement used for both duration log lines.
    t0 = _time.time()
    # 1. Load the cropped/dewarped image from cache or session
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")
    # NOTE(review): `np` is not referenced anywhere else in this function.
    import numpy as np
    img_h, img_w = dewarped_bgr.shape[:2]
    # Work on a copy so the cached image is not modified by enhancement.
    ocr_input = dewarped_bgr.copy()
    # 2. Scan quality assessment
    scan_quality_info = {}
    try:
        from scan_quality import score_scan_quality
        quality_report = score_scan_quality(ocr_input)
        scan_quality_info = quality_report.to_dict()
        # An explicit min_conf (> 0) wins over the recommended value.
        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
    except Exception as e:
        logger.warning(f"rerun-ocr: scan quality failed: {e}")
        # Without a quality report, fall back to a fixed threshold of 40.
        actual_min_conf = min_conf if min_conf > 0 else 40
    # 3. Image enhancement (Step 3) — only when requested AND the quality
    # report flagged the scan as degraded.
    is_degraded = scan_quality_info.get("is_degraded", False)
    if enhance and is_degraded:
        try:
            from ocr_image_enhance import enhance_for_ocr
            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
            logger.info("rerun-ocr: CLAHE enhancement applied")
        except Exception as e:
            logger.warning(f"rerun-ocr: enhancement failed: {e}")
    # 4. Run dual-engine OCR
    from PIL import Image
    import pytesseract
    # RapidOCR over the full page; a failure leaves rapid_words empty.
    rapid_words = []
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
    except Exception as e:
        logger.warning(f"rerun-ocr: RapidOCR failed: {e}")
    # Tesseract. The slice reverses the last axis (channel order) before the
    # array is handed to PIL — presumably BGR -> RGB, going by the variable name.
    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
    # NOTE(review): synchronous call inside an async handler — confirm this is
    # acceptable for the event loop.
    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
    tess_words = []
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        # Confidence values that are not plain integers are treated as -1.
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        # Skip empty entries and words below the confidence threshold.
        if not text or conf < actual_min_conf:
            continue
        tess_words.append({
            "text": text, "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i], "conf": conf,
        })
    # 5. Merge OCR results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        # Only reached when both engines produced nothing (tess_words is empty).
        merged_words = tess_words
    # 6. Store updated word_result in session: a single cell holding the
    # joined text plus one word box per merged word.
    cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
                          "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
                         for w in merged_words]
    word_result = {
        "cells": [{"text": " ".join(w["text"] for w in merged_words),
                    "word_boxes": cells_for_storage}],
        "image_width": img_w,
        "image_height": img_h,
        "ocr_engine": "rapid_kombi",
        "word_count": len(merged_words),
        # The unsplit RapidOCR words are stored under the "paddle" key.
        "raw_paddle_words": rapid_words,
    }
    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
    vision_applied = False
    if vision_fusion:
        try:
            from vision_ocr_fusion import vision_fuse_ocr
            # Category priority: query param, then session value, then default.
            category = doc_category or session.get("document_category") or "vokabelseite"
            logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})")
            merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
            vision_applied = True
            # Rebuild storage from fused words
            cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"],
                                  "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
                                 for w in merged_words]
            word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                     "word_boxes": cells_for_storage}]
            word_result["word_count"] = len(merged_words)
            word_result["ocr_engine"] = "vision_fusion"
        except Exception as e:
            logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}")
    await update_session_db(session_id, word_result=word_result)
    # Reload session with updated word_result
    session = await get_session_db(session_id)
    ocr_duration = _time.time() - t0
    logger.info(
        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
        "(enhance=%s, min_conf=%d, quality=%s)",
        session_id, len(merged_words), len(rapid_words), len(tess_words),
        len(merged_words), ocr_duration, enhance, actual_min_conf,
        scan_quality_info.get("quality_pct", "?"),
    )
    # 7. Build grid from new words
    # NOTE(review): the raw query value `min_conf` is forwarded here, not the
    # `actual_min_conf` used for the Tesseract filter above — confirm intended.
    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    # Persist grid (no ground_truth snapshot is written by this endpoint).
    await update_session_db(session_id, grid_editor_result=result, current_step=11)
    # Add quality info to response
    result["scan_quality"] = scan_quality_info
    result["ocr_stats"] = {
        "rapid_words": len(rapid_words),
        "tess_words": len(tess_words),
        "merged_words": len(merged_words),
        "min_conf_used": actual_min_conf,
        "enhance_applied": enhance and is_degraded,
        "vision_fusion_applied": vision_applied,
        "document_category": doc_category or session.get("document_category", ""),
        "ocr_duration_seconds": round(ocr_duration, 1),
    }
    total_duration = _time.time() - t0
    logger.info(
        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
        session_id,
        len(result.get("zones", [])),
        result.get("summary", {}).get("total_columns", 0),
        result.get("summary", {}).get("total_cells", 0),
        total_duration,
    )
    return result
@router.post("/sessions/{session_id}/save-grid")
 async def save_grid(session_id: str, request: Request):
    """Save edited grid data sent by the frontend grid editor.

    The request body must be a JSON object containing at least ``zones``.
    ``image_width``, ``image_height``, ``boxes_detected``, ``summary`` and
    ``formatting`` are taken from the body when present and otherwise carried
    over from the previously stored ``grid_editor_result``;
    ``duration_seconds`` always comes from the stored result. The saved grid
    is flagged with ``edited=True`` and ``current_step`` is set to 11.

    Returns:
        ``{"session_id": ..., "saved": True}``.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not valid JSON, is not a JSON object, or lacks ``zones``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses; a malformed body is a client error, not a server error.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    # Validate basic structure. The isinstance check matters: `"zones" in None`
    # raises TypeError, and a JSON list containing the string "zones" would
    # pass a bare membership test and then fail on `.get` below.
    if not isinstance(body, dict) or "zones" not in body:
        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")
    # Preserve metadata from the original build where the editor omitted it.
    existing = session.get("grid_editor_result") or {}
    result = {
        "session_id": session_id,
        "image_width": body.get("image_width", existing.get("image_width", 0)),
        "image_height": body.get("image_height", existing.get("image_height", 0)),
        "zones": body["zones"],
        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
        "summary": body.get("summary", existing.get("summary", {})),
        "formatting": body.get("formatting", existing.get("formatting", {})),
        "duration_seconds": existing.get("duration_seconds", 0),
        "edited": True,
    }
    await update_session_db(session_id, grid_editor_result=result, current_step=11)
    logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"]))
    return {"session_id": session_id, "saved": True}
@router.get("/sessions/{session_id}/grid-editor")
 async def get_grid(session_id: str):
    """Return the stored ``grid_editor_result`` of a session.

    Raises:
        HTTPException: 404 when the session is unknown or when it has no
            (or an empty) grid editor result yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    grid_state = session.get("grid_editor_result")
    if grid_state:
        return grid_state
    raise HTTPException(
        status_code=404,
        detail="No grid editor data. Run build-grid first.",
    )
@@ -1,110 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/api_gutter.py
-Grid Editor API — gutter repair endpoints.
+import importlib as _importlib
-"""
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.editor.api_gutter")
 import logging
 from fastapi import APIRouter, HTTPException, Request
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/gutter-repair")
 async def gutter_repair(session_id: str):
    """Analyse the session's grid for gutter-edge OCR errors.

    Delegates to ``analyse_grid_for_gutter_repair`` and returns its result.
    According to the endpoint's original description the analysis detects
    words truncated or blurred at the book binding (``spell_fix``) and words
    split across rows with missing hyphen characters (``hyphen_join``).

    The result is also stored under ``ground_truth["gutter_repair"]`` so that
    the apply endpoint can look the suggestions up later.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no grid has been
            built yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(
            status_code=400,
            detail="No grid data. Run build-grid first.",
        )
    from cv_gutter_repair import analyse_grid_for_gutter_repair
    result = analyse_grid_for_gutter_repair(
        grid_data,
        image_width=grid_data.get("image_width", 0),
    )
    # Suggestions live inside ground_truth so no DB migration is needed.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["gutter_repair"] = result
    await update_session_db(session_id, ground_truth=ground_truth)
    stats = result.get("stats", {})
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        stats.get("suggestions_found", 0),
        result.get("duration_seconds", 0),
    )
    return result
@router.post("/sessions/{session_id}/gutter-repair/apply")
 async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body::

        {
          "accepted": ["suggestion_id_1", "suggestion_id_2", ...],
          "text_overrides": {"suggestion_id_1": "alternative text"}
        }

    ``text_overrides`` is optional and lets the user pick a different
    correction than the suggested one; a missing or non-object value is
    ignored. The suggestions are read from ``ground_truth["gutter_repair"]``
    (written by the gutter-repair endpoint). Only ``grid_editor_result`` is
    persisted afterwards.

    Returns:
        The result of ``apply_gutter_suggestions``, or
        ``{"applied_count": 0, "changes": []}`` when nothing was accepted.

    Raises:
        HTTPException: 404 for an unknown session; 400 when there is no grid
            data, no stored gutter repair data, or the body is not a valid
            JSON object.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")
    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )
    # A malformed or non-object body is a client error; without these checks
    # it would surface as an unhandled exception (HTTP 500).
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
    accepted_ids = body.get("accepted", [])
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}
    # text_overrides: { suggestion_id: "alternative_text" }
    # Allows the user to pick a different correction from the alternatives
    # list. Anything that is not an object (e.g. null) means "no overrides".
    text_overrides = body.get("text_overrides")
    if not isinstance(text_overrides, dict):
        text_overrides = {}
    from cv_gutter_repair import apply_gutter_suggestions
    suggestions = gutter_result.get("suggestions", [])
    # Apply user-selected alternatives before passing to apply; empty
    # override strings are ignored.
    for s in suggestions:
        override = text_overrides.get(s.get("id", ""))
        if override:
            s["suggested_text"] = override
    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
    # Save updated grid back to session.
    await update_session_db(session_id, grid_editor_result=grid_data)
    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )
    return result
@@ -1,71 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/api_unified.py
-Grid Editor API — unified grid endpoints.
+import importlib as _importlib
-"""
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("grid.editor.api_unified")
 import logging
 from fastapi import APIRouter, HTTPException
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-unified-grid")
 async def build_unified_grid_endpoint(session_id: str):
    """Build and store the unified grid of a session.

    Feeds the zones, image size and layout metrics of the existing
    ``grid_editor_result`` into ``build_unified_grid``. Per the endpoint's
    original description the outcome is a single-zone grid in which box
    zones are integrated into the main row sequence.

    The result is saved as ``unified_grid_result``; the multi-zone
    ``grid_editor_result`` is left as it is.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no grid has been
            built yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")
    from unified_grid import build_unified_grid
    source_zones = grid_data.get("zones", [])
    page_width = grid_data.get("image_width", 0)
    page_height = grid_data.get("image_height", 0)
    metrics = grid_data.get("layout_metrics", {})
    result = build_unified_grid(
        zones=source_zones,
        image_width=page_width,
        image_height=page_height,
        layout_metrics=metrics,
    )
    # Stored in its own field so the original multi-zone grid is preserved.
    await update_session_db(session_id, unified_grid_result=result)
    summary = result.get("summary", {})
    logger.info(
        "build-unified-grid session %s: %d rows, %d cells",
        session_id,
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
    )
    return result
@router.get("/sessions/{session_id}/unified-grid")
 async def get_unified_grid(session_id: str):
    """Return the stored ``unified_grid_result`` of a session.

    Raises:
        HTTPException: 404 when the session is unknown or when no (or an
            empty) unified grid has been stored for it.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    unified = session.get("unified_grid_result")
    if unified:
        return unified
    raise HTTPException(
        status_code=404,
        detail="No unified grid. Run build-unified-grid first.",
    )
@@ -1,492 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/columns.py
-Grid Editor — column detection, cross-column splitting, marker merging.
+import importlib as _importlib
-
+import sys as _sys
-Split from grid_editor_helpers.py for maintainability.
+_sys.modules[__name__] = _importlib.import_module("grid.editor.columns")
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Cross-column word splitting
 # ---------------------------------------------------------------------------
 # Lazily created spellchecker instance (None until loaded, or if loading failed).
 _spell_cache: Optional[Any] = None
 # True once a load attempt has been made, successful or not.
 _spell_loaded = False


 def _is_recognized_word(text: str) -> bool:
    """Return True if *text* is found in the spellchecker dictionary.

    The dictionary is created on first use with ``SpellChecker(language="de")``
    and cached in the module-level ``_spell_cache``.  The lookup is done on
    the lower-cased text.

    NOTE(review): only the German dictionary is loaded here, although the
    previous docstring promised "German or English" -- confirm which is
    intended.

    Returns False when:
      - *text* is empty or shorter than 2 characters,
      - the spellchecker could not be loaded (a warning is logged once),
      - the word is not in the dictionary.
    """
    global _spell_cache, _spell_loaded
    if not text or len(text) < 2:
        return False
    if not _spell_loaded:
        # Set the flag first so a failed load is attempted only once.
        _spell_loaded = True
        try:
            from spellchecker import SpellChecker
            _spell_cache = SpellChecker(language="de")
        except Exception as exc:
            # Best effort: keep going without a dictionary, but say so.
            # Without it every word is reported as unrecognized.
            logger.warning(
                "spellchecker unavailable (%s); all words will be treated "
                "as unrecognized",
                exc,
            )
    if _spell_cache is None:
        return False
    return text.lower() in _spell_cache
 def _split_cross_column_words(
    words: List[Dict],
    columns: List[Dict],
 ) -> List[Dict]:
    """Cut word boxes that straddle a column boundary into two boxes.

    A boundary is the midpoint between one column's ``x_max`` and the next
    column's ``x_min``.  A word is a split candidate when it has at least
    4 characters, is at least 10 px wide and the first boundary inside it
    leaves more than 15% of the word's width on each side.

    The character position of the cut is chosen as follows:
      1. A lowercase->uppercase transition close to the proportional
         position (e.g. "dasZimmer" -> "das" + "Zimmer") wins.
      2. Otherwise the word is split at the proportional position, but only
         if ``_is_recognized_word`` rejects it (trailing punctuation
         removed first); recognized words are kept whole.

    A split is dropped if either half would have fewer than 2 characters.
    Words that are not split are passed through as the same dict objects;
    split halves are new dicts copied from the original word.
    """
    if len(columns) < 2:
        return words

    # One cut line per pair of neighbouring columns.
    cut_lines = [
        (lhs["x_max"] + rhs["x_min"]) / 2
        for lhs, rhs in zip(columns, columns[1:])
    ]

    result: List[Dict] = []
    n_split = 0
    for word in words:
        x0 = word["left"]
        span = word["width"]
        x1 = x0 + span
        text = (word.get("text") or "").strip()
        if not text or len(text) < 4 or span < 10:
            result.append(word)
            continue

        # First cut line that lies inside the word with >15% on both sides.
        cut = next(
            (
                b for b in cut_lines
                if x0 < b < x1
                and b - x0 > span * 0.15
                and x1 - b > span * 0.15
            ),
            None,
        )
        if cut is None:
            result.append(word)
            continue

        # Character index that corresponds to the pixel position of the cut.
        left_px = cut - x0
        pivot = int(len(text) * (left_px / span))

        # Preferred: a lower->upper case change in a small window around it.
        window = range(max(1, pivot - 3), min(len(text), pivot + 2))
        cut_idx = next(
            (k for k in window if text[k - 1].islower() and text[k].isupper()),
            None,
        )

        if cut_idx is None:
            # No case change: only OCR merge artifacts may be split.
            stem = re.sub(r"[,;:.!?]+$", "", text)
            if _is_recognized_word(stem):
                result.append(word)
                continue
            cut_idx = max(1, min(len(text) - 1, pivot))

        head = text[:cut_idx].rstrip()
        tail = text[cut_idx:].lstrip()
        if len(head) < 2 or len(tail) < 2:
            result.append(word)
            continue

        head_width = round(left_px)
        result.append({**word, "text": head, "width": head_width})
        result.append({
            **word,
            "text": tail,
            "left": round(cut),
            "width": span - head_width,
        })
        n_split += 1
        logger.info(
            "split cross-column word %r -> %r + %r at boundary %.0f",
            text, head, tail, cut,
        )

    if n_split:
        logger.info("split %d cross-column word(s)", n_split)
    return result
 def _cluster_columns_by_alignment(
    words: List[Dict],
    zone_w: int,
    rows: List[Dict],
 ) -> List[Dict[str, Any]]:
    """Detect columns by clustering left-edge alignment across rows.
    Hybrid approach:
      1. Group words by row, find "group start" positions within each row
         (words preceded by a large gap or first word in row)
      2. Cluster group-start left-edges by X-proximity across rows
      3. Filter by row coverage (how many rows have a group start here)
      4. Merge nearby clusters
      5. Build column boundaries
    This filters out mid-phrase word positions (e.g. IPA transcriptions,
    second words in multi-word entries) by only considering positions
    where a new word group begins within a row.

    Args:
        words: word dicts; "left", "top", "width" and "height" are read
            from each of them.
        zone_w: zone width; scales the clustering tolerance, the cap of
            the gap threshold, the merge distance and the column margin.
        rows: row dicts; "y_center" and "index" are read from each.

    Returns:
        A list of column dicts with "index", "type", "x_min" and "x_max",
        ordered left to right.  The type is "column_text" when a single
        column results, otherwise "column_1", "column_2", ...  An empty
        list is returned when *words* or *rows* is empty.
    """
    if not words or not rows:
        return []
    total_rows = len(rows)
    # Defensive only: an empty *rows* was already handled by the guard above.
    if total_rows == 0:
        return []
    # --- Group words by row ---
    row_words: Dict[int, List[Dict]] = {}
    for w in words:
        # Each word goes to the row whose y_center is nearest to the
        # word's own vertical centre.
        y_center = w["top"] + w["height"] / 2
        best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
        row_words.setdefault(best["index"], []).append(w)
    # --- Compute adaptive gap threshold for group-start detection ---
    all_gaps: List[float] = []
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        for i in range(len(sorted_rw) - 1):
            right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
            gap = sorted_rw[i + 1]["left"] - right
            # Overlapping or touching neighbours (gap <= 0) are ignored.
            if gap > 0:
                all_gaps.append(gap)
    if all_gaps:
        sorted_gaps = sorted(all_gaps)
        # Element at len // 2 (the upper middle value for even lengths).
        median_gap = sorted_gaps[len(sorted_gaps) // 2]
        heights = [w["height"] for w in words if w.get("height", 0) > 0]
        # Falls back to 25 when no word has a positive height.
        median_h = sorted(heights)[len(heights) // 2] if heights else 25
        # For small word counts (boxes, sub-zones): PaddleOCR returns
        # multi-word blocks, so ALL inter-word gaps are potential column
        # boundaries.  Use a low threshold based on word height — any gap
        # wider than ~1x median word height is a column separator.
        if len(words) <= 60:
            gap_threshold = max(median_h * 1.0, 25)
            logger.info(
                "alignment columns (small zone): gap_threshold=%.0f "
                "(median_h=%.0f, %d words, %d gaps: %s)",
                gap_threshold, median_h, len(words), len(sorted_gaps),
                [int(g) for g in sorted_gaps[:10]],
            )
        else:
            # Standard approach for large zones (full pages)
            gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
            # Cap at 25% of zone width
            max_gap = zone_w * 0.25
            # The cap is only applied when it is itself larger than 30.
            if gap_threshold > max_gap > 30:
                logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w)
                gap_threshold = max_gap
    else:
        # No positive gap anywhere: use a fixed default threshold.
        gap_threshold = 50
    # --- Find group-start positions (left-edges that begin a new column) ---
    start_positions: List[tuple] = []  # (left_edge, row_index)
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        # First word in row is always a group start
        start_positions.append((sorted_rw[0]["left"], ri))
        for i in range(1, len(sorted_rw)):
            right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
            gap = sorted_rw[i]["left"] - right_prev
            if gap >= gap_threshold:
                start_positions.append((sorted_rw[i]["left"], ri))
    start_positions.sort(key=lambda x: x[0])
    logger.info(
        "alignment columns: %d group-start positions from %d words "
        "(gap_threshold=%.0f, %d rows)",
        len(start_positions), len(words), gap_threshold, total_rows,
    )
    # NOTE(review): this branch looks unreachable -- every entry of
    # row_words holds at least one word and therefore adds a start
    # position above.  Confirm before removing.
    if not start_positions:
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
    # --- Cluster group-start positions by X-proximity ---
    tolerance = max(10, int(zone_w * 0.01))
    clusters: List[Dict[str, Any]] = []
    cur_edges = [start_positions[0][0]]
    cur_rows = {start_positions[0][1]}
    for left, row_idx in start_positions[1:]:
        # Each edge is compared with the previous edge (not the cluster
        # mean), so a cluster can grow wider than `tolerance` in total.
        if left - cur_edges[-1] <= tolerance:
            cur_edges.append(left)
            cur_rows.add(row_idx)
        else:
            clusters.append({
                "mean_x": int(sum(cur_edges) / len(cur_edges)),
                "min_edge": min(cur_edges),
                "max_edge": max(cur_edges),
                "count": len(cur_edges),
                "distinct_rows": len(cur_rows),
                "row_coverage": len(cur_rows) / total_rows,
            })
            cur_edges = [left]
            cur_rows = {row_idx}
    # Flush the cluster that was still open when the loop ended.
    clusters.append({
        "mean_x": int(sum(cur_edges) / len(cur_edges)),
        "min_edge": min(cur_edges),
        "max_edge": max(cur_edges),
        "count": len(cur_edges),
        "distinct_rows": len(cur_rows),
        "row_coverage": len(cur_rows) / total_rows,
    })
    # --- Filter by row coverage ---
    # These thresholds must be high enough to avoid false columns in flowing
    # text (random inter-word gaps) while still detecting real columns in
    # vocabulary worksheets (which typically have >80% row coverage).
    MIN_COVERAGE_PRIMARY = 0.35
    MIN_COVERAGE_SECONDARY = 0.12
    MIN_WORDS_SECONDARY = 4
    MIN_DISTINCT_ROWS = 3
    # Content boundary for left-margin detection
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    content_span = content_x_max - content_x_min
    primary = [
        c for c in clusters
        if c["row_coverage"] >= MIN_COVERAGE_PRIMARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]
    # Clusters are tracked by object identity from here on.
    primary_ids = {id(c) for c in primary}
    secondary = [
        c for c in clusters
        if id(c) not in primary_ids
        and c["row_coverage"] >= MIN_COVERAGE_SECONDARY
        and c["count"] >= MIN_WORDS_SECONDARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]
    # Tertiary: narrow left-margin columns (page refs, markers) that have
    # too few rows for secondary but are clearly left-aligned and separated
    # from the main content.  These appear at the far left or far right and
    # have a large gap to the nearest significant cluster.
    used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
    sig_xs = [c["mean_x"] for c in primary + secondary]
    # Tertiary: clusters that are clearly to the LEFT of the first
    # significant column (or RIGHT of the last).  If words consistently
    # start at a position left of the established first column boundary,
    # they MUST be a separate column — regardless of how few rows they
    # cover.  The only requirement is a clear spatial gap.
    MIN_COVERAGE_TERTIARY = 0.02  # at least 1 row effectively
    tertiary = []
    for c in clusters:
        if id(c) in used_ids:
            continue
        if c["distinct_rows"] < 1:
            continue
        if c["row_coverage"] < MIN_COVERAGE_TERTIARY:
            continue
        # Must be near left or right content margin (within 15%)
        # (a zero content span is treated as "middle", i.e. never tertiary)
        rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
        if not (rel_pos < 0.15 or rel_pos > 0.85):
            continue
        # Must have significant gap to nearest significant cluster
        if sig_xs:
            min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs)
            if min_dist < max(30, content_span * 0.02):
                continue
        tertiary.append(c)
    if tertiary:
        for c in tertiary:
            logger.info(
                "  tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
                c["mean_x"], c["min_edge"], c["max_edge"],
                c["count"], c["distinct_rows"], c["row_coverage"] * 100,
            )
    significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])
    for c in significant:
        logger.info(
            "  significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
            c["mean_x"], c["min_edge"], c["max_edge"],
            c["count"], c["distinct_rows"], c["row_coverage"] * 100,
        )
    # NOTE: the tertiary count is not listed on its own in this message; it
    # is only contained in the "significant" total.
    logger.info(
        "alignment columns: %d clusters, %d primary, %d secondary -> %d significant",
        len(clusters), len(primary), len(secondary), len(significant),
    )
    if not significant:
        # Fallback: single column covering all content
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
    # --- Merge nearby clusters ---
    merge_distance = max(25, int(zone_w * 0.03))
    # Work on copies so the dicts in `clusters` are not modified.
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s["mean_x"] - merged[-1]["mean_x"] < merge_distance:
            prev = merged[-1]
            total = prev["count"] + s["count"]
            # Mean weighted by word count; integer division keeps it an int.
            prev["mean_x"] = (
                prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"]
            ) // total
            prev["count"] = total
            prev["min_edge"] = min(prev["min_edge"], s["min_edge"])
            prev["max_edge"] = max(prev["max_edge"], s["max_edge"])
            # distinct_rows takes the larger value; row_coverage is not
            # recomputed for the merged cluster.
            prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"])
        else:
            merged.append(s.copy())
    logger.info(
        "alignment columns: %d after merge (distance=%d)",
        len(merged), merge_distance,
    )
    # --- Build column boundaries ---
    margin = max(5, int(zone_w * 0.005))
    # Same values as the content bounds computed before the coverage filter
    # (*words* is not modified in between).
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    columns: List[Dict[str, Any]] = []
    for i, cluster in enumerate(merged):
        # A column starts `margin` px left of its leftmost edge (but not
        # left of the content) and ends `margin` px before the next
        # cluster's leftmost edge; the last column ends at the content's
        # right edge.
        x_min = max(content_x_min, cluster["min_edge"] - margin)
        if i + 1 < len(merged):
            x_max = merged[i + 1]["min_edge"] - margin
        else:
            x_max = content_x_max
        columns.append({
            "index": i,
            "type": f"column_{i + 1}" if len(merged) > 1 else "column_text",
            "x_min": x_min,
            "x_max": x_max,
        })
    return columns
 # Single characters that count as list markers rather than as words.
 _MARKER_CHARS = set("*-+#>")


 def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
 ) -> List[Dict]:
    """Merge narrow marker columns (bullets, numbering) into adjacent text.

    Bullet points (*, -) and numbering (1., 2.) create narrow columns at
    the left edge of a zone.  These are inline markers that indent text,
    not real separate columns, so such a column is merged into its right
    neighbour (the neighbour's ``x_min`` is extended to the left).

    A column is a merge candidate when it is narrower than 80 px, contains
    at least one word (by horizontal centre), its words average at most
    2 characters and it has a right neighbour.  Candidates in which at
    least 50% of the words start with a letter ("to", "in", "der", ...)
    are kept as legitimate content columns.

    Args:
        columns: column dicts with "x_min" and "x_max", left to right.
        words: word dicts; "left" and "width" are read, "text" may be
            missing or None.

    Returns:
        A new list of re-indexed column dicts ("index" and "type" are
        rewritten).  The dicts in *columns* are not modified.  With fewer
        than two columns the input list is returned unchanged.
    """
    if len(columns) < 2:
        return columns

    merged: List[Dict] = []
    skip: set = set()
    for i, col in enumerate(columns):
        if i in skip:
            continue
        # Words whose horizontal centre lies inside this column.
        col_words = [
            w for w in words
            if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
        ]
        col_width = col["x_max"] - col["x_min"]
        # Narrow column with mostly short words -> MIGHT be inline markers
        if col_words and col_width < 80:
            # A missing or None text counts as an empty string.
            raw_texts = [(w.get("text") or "") for w in col_words]
            avg_len = sum(len(t) for t in raw_texts) / len(raw_texts)
            if avg_len <= 2 and i + 1 < len(columns):
                # Distinguish real markers (symbols/numbers) from short
                # alphabetic words like "to", "in", "der", "die".
                texts = [t.strip() for t in raw_texts]
                alpha_count = sum(
                    1 for t in texts
                    if t and t[0].isalpha() and t not in _MARKER_CHARS
                )
                alpha_ratio = alpha_count / len(texts)
                # If >=50% of words are alphabetic, this is a real column
                if alpha_ratio >= 0.5:
                    logger.info(
                        "  kept narrow column %d (w=%d, avg_len=%.1f, "
                        "alpha=%.0f%%) -- contains real words",
                        i, col_width, avg_len, alpha_ratio * 100,
                    )
                else:
                    # Merge into next column
                    next_col = columns[i + 1].copy()
                    next_col["x_min"] = col["x_min"]
                    merged.append(next_col)
                    skip.add(i + 1)
                    logger.info(
                        "  merged inline marker column %d (w=%d, avg_len=%.1f) "
                        "into column %d",
                        i, col_width, avg_len, i + 1,
                    )
                    continue
        # Copy so the re-indexing below never touches the caller's dicts.
        merged.append(col.copy())

    # Re-index
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
    return merged
@@ -1,402 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/filters.py
-Grid Editor — word/zone filtering, border ghosts, decorative margins, footers.
+import importlib as _importlib
-
+import sys as _sys
-Split from grid_editor_helpers.py for maintainability.
+_sys.modules[__name__] = _importlib.import_module("grid.editor.filters")
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 from typing import Any, Dict, List, Optional, Tuple
 logger = logging.getLogger(__name__)
 def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
    """Remove page-border decoration strip words BEFORE column detection.

    Scans from each page edge inward to find the first significant x-gap
    (>30 px).  The words outside that gap form a candidate edge cluster.
    A candidate is removed as a border-strip artifact (alphabet letters,
    illustration fragments) when

      - it holds fewer than 20 % of all words, and
      - at least 45 % of its words are at most 2 characters long, so that
        multi-word content such as "der Ranzen" is never removed.

    The left edge is tried first.  If the left candidate is not removed,
    the right edge is tried; at most one side is stripped per call.
    Inputs with fewer than 10 words are returned unchanged.

    Must run BEFORE ``_build_zone_grid`` so that column detection only
    sees real content words and doesn't produce inflated row counts.

    Returns:
        ``(remaining_words, removed_count)``.  When nothing is removed the
        input list itself is returned with a count of 0.
    """
    if len(words) < 10:
        return words, 0

    sorted_words = sorted(words, key=lambda w: w.get("left", 0))
    total = len(sorted_words)

    # -- Left-edge scan (running max right-edge) --
    left_count = 0
    running_right = 0
    for gi in range(total - 1):
        running_right = max(
            running_right,
            sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
        )
        if sorted_words[gi + 1].get("left", 0) - running_right > 30:
            left_count = gi + 1
            break

    # -- Right-edge scan (running min left) --
    right_count = 0
    running_left = sorted_words[-1].get("left", 0)
    for gi in range(total - 1, 0, -1):
        running_left = min(running_left, sorted_words[gi].get("left", 0))
        prev_right = (
            sorted_words[gi - 1].get("left", 0)
            + sorted_words[gi - 1].get("width", 0)
        )
        if running_left - prev_right > 30:
            right_count = total - gi
            break

    def _is_decorative_strip(candidates: List[Dict]) -> bool:
        # Real border decorations are mostly very short words.
        if not candidates:
            return False
        short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
        return short / len(candidates) >= 0.45

    # Try the left edge, then the right edge.  A left candidate that fails
    # validation must not stop the right edge from being examined.
    for count, side in ((left_count, "left"), (right_count, "right")):
        if count <= 0 or count / total >= 0.20:
            continue
        if side == "left":
            candidates = sorted_words[:count]
        else:
            candidates = sorted_words[total - count:]
        if _is_decorative_strip(candidates):
            strip_ids = {id(w) for w in candidates}
            return [w for w in words if id(w) not in strip_ids], len(strip_ids)

    return words, 0
 # Single characters that OCR tends to produce from box border lines.
 # "!" (red markers) and ". , ;" (real punctuation) are deliberately absent.
 _GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+")


 def _filter_border_ghosts(
    words: List[Dict],
    boxes: List,
 ) -> tuple:
    """Drop single-character OCR artefacts that sit on a box border.

    Every box with a non-zero border thickness contributes a tolerance
    band around each of its four edges.  A word is removed when its text
    is exactly one character from ``_GRID_GHOST_CHARS`` and one of its own
    edges (left/right or top/bottom) falls inside such a band.

    Boxes may be objects (``x``, ``y``, ``width``, ``height``,
    ``border_thickness`` attributes) or dicts (keys ``x``, ``y``,
    ``w``/``width``, ``h``/``height``, ``border_thickness``; the thickness
    defaults to 3 for dicts).

    Returns (filtered_words, removed_count).
    """
    if not boxes or not words:
        return words, 0

    def _field(box, attr, from_mapping):
        # Attribute if the box has it, otherwise the dict-style lookup.
        return getattr(box, attr) if hasattr(box, attr) else from_mapping(box)

    x_bands: List[tuple] = []
    y_bands: List[tuple] = []
    for box in boxes:
        thickness = _field(box, "border_thickness", lambda d: d.get("border_thickness", 3))
        # Borderless boxes (images/graphics) have no line that could
        # produce ghost characters.
        if thickness == 0:
            continue
        box_x = _field(box, "x", lambda d: d.get("x", 0))
        box_y = _field(box, "y", lambda d: d.get("y", 0))
        box_w = _field(box, "width", lambda d: d.get("w", d.get("width", 0)))
        box_h = _field(box, "height", lambda d: d.get("h", d.get("height", 0)))
        pad = max(thickness * 2, 10) + 6
        for edge in (box_x, box_x + box_w):
            x_bands.append((edge - pad, edge + pad))
        for edge in (box_y, box_y + box_h):
            y_bands.append((edge - pad, edge + pad))

    def _touches(bands, first, second):
        # True if either coordinate lies inside any band (bounds inclusive).
        return any(lo <= first <= hi or lo <= second <= hi for lo, hi in bands)

    def _is_ghost(word: Dict) -> bool:
        text = (word.get("text") or "").strip()
        if not text:
            return False
        # Word edges, not the centre, are tested against the bands.
        x0 = word["left"]
        x1 = word["left"] + word["width"]
        y0 = word["top"]
        y1 = word["top"] + word["height"]
        if not (_touches(x_bands, x0, x1) or _touches(y_bands, y0, y1)):
            return False
        return len(text) == 1 and text in _GRID_GHOST_CHARS

    survivors = [w for w in words if not _is_ghost(w)]
    return survivors, len(words) - len(survivors)
 def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Collect the word boxes of all cells into one flat list of word dicts.

    Cells without a "word_boxes" entry (or with None) contribute nothing.
    Word boxes whose text is missing, None or whitespace-only are skipped.
    Each returned dict is a fresh dict with exactly the keys "text",
    "left", "top", "width", "height" and "conf" ("conf" defaults to 0).
    """
    return [
        {
            "text": wb["text"],
            "left": wb["left"],
            "top": wb["top"],
            "width": wb["width"],
            "height": wb["height"],
            "conf": wb.get("conf", 0),
        }
        for cell in cells
        for wb in cell.get("word_boxes") or []
        # `or ""` makes a None text behave like an empty one.
        if (wb.get("text") or "").strip()
    ]
 def _words_in_zone(
    words: List[Dict],
    zone_y: int,
    zone_h: int,
    zone_x: int,
    zone_w: int,
 ) -> List[Dict]:
    """Return the words whose centre point lies inside the zone rectangle.

    Both the vertical and the horizontal centre of a word must fall within
    the zone; the zone's edges are inclusive.  The order of *words* is
    preserved.
    """
    bottom = zone_y + zone_h
    right = zone_x + zone_w

    def _centre_inside(word: Dict) -> bool:
        mid_y = word["top"] + word["height"] / 2
        mid_x = word["left"] + word["width"] / 2
        return zone_y <= mid_y <= bottom and zone_x <= mid_x <= right

    return [word for word in words if _centre_inside(word)]
 def _get_content_bounds(words: List[Dict]) -> tuple:
    """Return the bounding box of all words as ``(x, y, width, height)``.

    An empty word list yields ``(0, 0, 0, 0)``.
    """
    if not words:
        return 0, 0, 0, 0
    left_edge = min(w["left"] for w in words)
    top_edge = min(w["top"] for w in words)
    right_edge = max(w["left"] + w["width"] for w in words)
    bottom_edge = max(w["top"] + w["height"] for w in words)
    box_w = right_edge - left_edge
    box_h = bottom_edge - top_edge
    return left_edge, top_edge, box_w, box_h
 def _filter_decorative_margin(
    words: List[Dict],
    img_w: int,
    log: Any,
    session_id: str,
 ) -> Dict[str, Any]:
    """Remove words that belong to a decorative alphabet strip on a margin.

    Some vocabulary worksheets have a vertical A-Z alphabet graphic along
    the left or right edge.  OCR reads the letters as isolated short words
    ("A", "Bb", ...), which are not content and confuse column/row
    detection.

    Phase 1 -- find a strip.  For each side (left first, then right) the
    candidates are words of at most 2 characters whose horizontal centre
    lies in the outer 30% of the page.  A side is confirmed when
      - it has at least 6 candidates,
      - they fall into at least 6 distinct 20-px vertical buckets, and
      - their left edges spread over at most 80 px.

    Phase 2 -- remove.  For the first confirmed side, every word of at
    most 3 characters in the same margin whose left edge lies between
    (strip min - 20) and (strip max + 60) is removed.  This also catches
    slightly longer artefacts next to the letters.

    Modifies *words* in place.

    Returns:
        Dict with 'found' (bool), 'side' (str) and 'letters_detected'
        (number of phase-1 candidates on the confirmed side).
    """
    if not words or img_w <= 0:
        return {"found": False, "side": "", "letters_detected": 0}

    margin_cutoff = img_w * 0.30

    def _text_len(word: Dict) -> int:
        return len((word.get("text") or "").strip())

    def _in_margin(word: Dict, side: str) -> bool:
        centre = word["left"] + word.get("width", 0) / 2
        if side == "left":
            return centre < margin_cutoff
        return centre > img_w - margin_cutoff

    # Phase 1 candidates for both sides, collected up front.
    candidates = {
        side: [w for w in words if _text_len(w) <= 2 and _in_margin(w, side)]
        for side in ("left", "right")
    }

    for side, strip in candidates.items():
        if len(strip) < 6:
            continue
        # Vertical distribution: centres bucketed in 20-px steps.
        y_buckets = {
            int(w["top"] + w.get("height", 0) / 2) // 20 * 20
            for w in strip
        }
        if len(y_buckets) < 6:
            continue
        # Horizontal compactness of the left edges.
        lefts = [w["left"] for w in strip]
        x_min = min(lefts)
        x_max = max(lefts)
        if x_max - x_min > 80:
            continue

        # Phase 2: widen the x-range a little and collect short words in it.
        strip_x_lo = x_min - 20
        strip_x_hi = x_max + 60  # word width + tolerance
        doomed = {
            id(w) for w in words
            if _text_len(w) <= 3
            and strip_x_lo <= w["left"] <= strip_x_hi
            and _in_margin(w, side)
        }
        before = len(words)
        words[:] = [w for w in words if id(w) not in doomed]
        removed = before - len(words)
        if removed:
            log.info(
                "build-grid session %s: removed %d decorative %s-margin words "
                "(strip x=%d-%d)",
                session_id, removed, side, strip_x_lo, strip_x_hi,
            )
        return {"found": True, "side": side, "letters_detected": len(strip)}

    return {"found": False, "side": "", "letters_detected": 0}
 def _filter_footer_words(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
 ) -> Optional[Dict]:
    """Strip a short footer (typically the page number) from *words*.

    Footer words are those whose vertical centre lies in the bottom 5% of
    the page.  They are removed only when there are at most 3 of them and
    their stripped texts, joined together, are at most 10 characters long.

    Modifies *words* in place.

    Returns:
        None if nothing was removed.  Otherwise a dict with "text" (the
        joined footer text), "y_pct" (top of the first footer word as a
        percentage of the page height, one decimal) and, if the text
        contains digits, "number" (those digits parsed as an int).
    """
    if not words or img_h <= 0:
        return None

    footer_y = img_h * 0.95
    footer_words = [
        w for w in words
        if w["top"] + w.get("height", 0) / 2 > footer_y
    ]
    if not footer_words:
        return None

    total_text = "".join((w.get("text") or "").strip() for w in footer_words)
    # Too much text down there: treat it as real content and keep it.
    if len(footer_words) > 3 or len(total_text) > 10:
        return None

    # Capture the metadata before the words disappear.
    page_number_info = {
        "text": total_text.strip(),
        "y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
    }
    digits = "".join(filter(str.isdigit, total_text))
    if digits:
        page_number_info["number"] = int(digits)

    footer_ids = {id(w) for w in footer_words}
    words[:] = [w for w in words if id(w) not in footer_ids]
    log.info(
        "build-grid session %s: extracted page number '%s' and removed %d footer words",
        session_id, total_text, len(footer_words),
    )
    return page_number_info
 def _filter_header_junk(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
 ) -> None:
    """Remove short low-confidence OCR junk above the first real text row.

    Textbook pages often carry decorative header graphics that OCR reads
    as low-confidence junk characters; the real content starts below.

    Steps:
      1. Walk the words from top to bottom in bands of 2% of the page
         height.  The first band containing at least 3 words with
         conf >= 80 and more than one character marks the content start.
      2. Every word that ends above that line, has conf < 75 and at most
         3 characters is removed.

    Nothing happens if no content start is found (or it is at y <= 0).
    Modifies *words* in place.
    """
    if not words or img_h <= 0:
        return

    # --- Step 1: locate the content start ---
    by_top = sorted(words, key=lambda w: w["top"])
    band_height = img_h * 0.02  # words within 2% of page height = same row
    min_row_words = 3
    min_conf = 80

    content_start_y = 0
    pos = 0
    while pos < len(by_top):
        anchor_y = by_top[pos]["top"]
        # Extend the band while words start close enough to the anchor.
        end = pos
        while end < len(by_top) and by_top[end]["top"] - anchor_y < band_height:
            end += 1
        strong = sum(
            1 for w in by_top[pos:end]
            if w.get("conf", 0) >= min_conf
            and len((w.get("text") or "").strip()) > 1
        )
        if strong >= min_row_words:
            content_start_y = anchor_y
            break
        pos = end if end > pos else pos + 1

    if content_start_y <= 0:
        return  # no clear content start found

    # --- Step 2: drop short, low-confidence words above it ---
    junk_ids = {
        id(w) for w in words
        if w["top"] + w.get("height", 0) < content_start_y
        and w.get("conf", 0) < 75
        and len((w.get("text") or "").strip()) <= 3
    }
    if not junk_ids:
        return

    before = len(words)
    words[:] = [w for w in words if id(w) not in junk_ids]
    removed = before - len(words)
    if removed:
        log.info(
            "build-grid session %s: removed %d header junk words above y=%d "
            "(content start)",
            session_id, removed, content_start_y,
        )
@@ -1,499 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/headers.py
-Grid Editor — header/heading detection and colspan (merged cell) detection.
+import importlib as _importlib
-Split from grid_editor_helpers.py.  Pure computation, no HTTP/DB side effects.
+import sys as _sys
-Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+_sys.modules[__name__] = _importlib.import_module("grid.editor.headers")
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 from cv_ocr_engines import _text_has_garbled_ipa
 logger = logging.getLogger(__name__)
 def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
    """Detect heading rows by color + height after color annotation.
    A row is a heading if:
    1. ALL word_boxes have color_name != 'black' (typically 'blue')
    2. Mean word height > 1.2x median height of all words in the zone
    Detected heading rows are merged into a single spanning cell.
    Returns count of headings detected.
    """
    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if not cells or not rows or len(columns) < 2:
            continue
        # Compute median word height across the zone
        all_heights = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                h = wb.get("height", 0)
                if h > 0:
                    all_heights.append(h)
        if not all_heights:
            continue
        all_heights_sorted = sorted(all_heights)
        median_h = all_heights_sorted[len(all_heights_sorted) // 2]
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue  # already detected as header
            ri = row["index"]
            row_cells = [c for c in cells if c.get("row_index") == ri]
            row_wbs = [
                wb for cell in row_cells
                for wb in cell.get("word_boxes") or []
            ]
            if not row_wbs:
                continue
            # Condition 1: ALL words are non-black
            all_colored = all(
                wb.get("color_name", "black") != "black"
                for wb in row_wbs
            )
            if not all_colored:
                continue
            # Condition 2: mean height > 1.2x median
            mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
            if mean_h <= median_h * 1.2:
                continue
            heading_row_indices.append(ri)
        # Merge heading cells into spanning cells
        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if len(header_cells) <= 1:
                # Single cell -- just mark it as heading
                if header_cells:
                    header_cells[0]["col_type"] = "heading"
                    heading_count += 1
                    # Mark row as header
                    for row in rows:
                        if row["index"] == hri:
                            row["is_header"] = True
                continue
            # Collect all word_boxes and text from all columns
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            # Remove all cells for this row, replace with one spanning cell
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                # Use the actual starting col_index from the first cell
                first_col = min(hc["col_index"] for hc in header_cells)
                zone_idx = z.get("zone_index", 0)
                z["cells"].append({
                    "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
                    "zone_index": zone_idx,
                    "row_index": hri,
                    "col_index": first_col,
                    "col_type": "heading",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })
            # Mark row as header
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
 def _detect_heading_rows_by_single_cell(
    zones_data: List[Dict], img_w: int, img_h: int,
 ) -> int:
    """Detect heading rows that have only a single content cell.
    Black headings like "Theme" have normal color and height, so they are
    missed by ``_detect_heading_rows_by_color``.  The distinguishing signal
    is that they occupy only one column while normal vocabulary rows fill
    at least 2-3 columns.
    A row qualifies as a heading if:
    1. It is not already marked as a header/heading.
    2. It has exactly ONE cell whose col_type starts with ``column_``
       (excluding column_1 / page_ref which only carries page numbers).
    3. That single cell is NOT in the last column (continuation/example
       lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4).
    4. The text does not start with ``[`` (IPA continuation).
    5. The zone has >=3 columns and >=5 rows (avoids false positives in
       tiny zones).
    6. The majority of rows in the zone have >=2 content cells (ensures
       we are in a multi-column vocab layout).
    """
    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if len(columns) < 3 or len(rows) < 5:
            continue
        # Determine the last col_index (example/sentence column)
        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
        if not col_indices:
            continue
        last_col = col_indices[-1]
        # Count content cells per row (column_* but not column_1/page_ref).
        # Exception: column_1 cells that contain a dictionary article word
        # (die/der/das etc.) ARE content -- they appear in dictionary layouts
        # where the leftmost column holds grammatical articles.
        _ARTICLE_WORDS = {
            "die", "der", "das", "dem", "den", "des", "ein", "eine",
            "the", "a", "an",
        }
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            if ct == "column_1":
                ctext = (cell.get("text") or "").strip().lower()
                if ctext not in _ARTICLE_WORDS:
                    continue
            ri = cell.get("row_index", -1)
            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
        # Majority of rows must have >=2 content cells
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
        if multi_col_rows < len(rows) * 0.4:
            continue
        # Exclude first and last non-header rows -- these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            content_cells = [
                c for c in row_cells
                if c.get("col_type", "").startswith("column_")
                and (c.get("col_type") != "column_1"
                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
            ]
            if len(content_cells) != 1:
                continue
            cell = content_cells[0]
            # Not in the last column (continuation/example lines)
            if cell.get("col_index") == last_col:
                continue
            text = (cell.get("text") or "").strip()
            if not text or text.startswith("["):
                continue
            # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
            if text.startswith("("):
                continue
            # Single cell NOT in the first content column is likely a
            # continuation/overflow line, not a heading.  Real headings
            # ("Theme 1", "Unit 3: ...") appear in the first or second
            # content column.
            first_content_col = col_indices[0] if col_indices else 0
            if cell.get("col_index", 0) > first_content_col + 1:
                continue
            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
            # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
            _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch").  Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)
        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue
        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue
            # Collect all word_boxes and text
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            first_col_idx = min(hc["col_index"] for hc in header_cells)
            # Remove old cells for this row, add spanning heading cell
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # Fallback to first cell bbox
                bp = header_cells[0].get("bbox_px", {})
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)
            zone_idx = z.get("zone_index", 0)
            z["cells"].append({
                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
                "zone_index": zone_idx,
                "row_index": hri,
                "col_index": first_col_idx,
                "col_type": "heading",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                "is_bold": False,
            })
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
 def _detect_header_rows(
    rows: List[Dict],
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> List[int]:
    """Detect header rows: first-row heuristic + spanning header detection.
    A "spanning header" is a row whose words stretch across multiple column
    boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
    """
    if len(rows) < 2:
        return []
    headers = []
    if not skip_first_row_header:
        first_row = rows[0]
        second_row = rows[1]
        # Gap between first and second row > 0.5x average row height
        avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
        gap = second_row["y_min"] - first_row["y_max"]
        if gap > avg_h * 0.5:
            headers.append(0)
        # Also check if first row words are taller than average (bold/header text)
        all_heights = [w["height"] for w in zone_words]
        median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
        first_row_words = [
            w for w in zone_words
            if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
        ]
        if first_row_words:
            first_h = max(w["height"] for w in first_row_words)
            if first_h > median_h * 1.3:
                if 0 not in headers:
                    headers.append(0)
    # Note: Spanning-header detection (rows spanning all columns) has been
    # disabled because it produces too many false positives on vocabulary
    # worksheets where IPA transcriptions or short entries naturally span
    # multiple columns with few words.  The first-row heuristic above is
    # sufficient for detecting real headers.
    return headers
 def _detect_colspan_cells(
    zone_words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    cells: List[Dict],
    img_w: int,
    img_h: int,
 ) -> List[Dict]:
    """Detect and merge cells that span multiple columns (colspan).
    A word-block (PaddleOCR phrase) that extends significantly past a column
    boundary into the next column indicates a merged cell.  This replaces
    the incorrectly split cells with a single cell spanning multiple columns.
    Works for both full-page scans and box zones.
    """
    if len(columns) < 2 or not zone_words or not rows:
        return cells
    from cv_words_first import _assign_word_to_row
    # Column boundaries (midpoints between adjacent columns)
    col_boundaries = []
    for ci in range(len(columns) - 1):
        col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)
    def _cols_covered(w_left: float, w_right: float) -> List[int]:
        """Return list of column indices that a word-block covers."""
        covered = []
        for col in columns:
            col_mid = (col["x_min"] + col["x_max"]) / 2
            # Word covers a column if it extends past the column's midpoint
            if w_left < col_mid < w_right:
                covered.append(col["index"])
            # Also include column if word starts within it
            elif col["x_min"] <= w_left < col["x_max"]:
                covered.append(col["index"])
        return sorted(set(covered))
    # Group original word-blocks by row
    row_word_blocks: Dict[int, List[Dict]] = {}
    for w in zone_words:
        ri = _assign_word_to_row(w, rows)
        row_word_blocks.setdefault(ri, []).append(w)
    # For each row, check if any word-block spans multiple columns
    rows_to_merge: Dict[int, List[Dict]] = {}  # row_index -> list of spanning word-blocks
    for ri, wblocks in row_word_blocks.items():
        spanning = []
        for w in wblocks:
            w_left = w["left"]
            w_right = w_left + w["width"]
            covered = _cols_covered(w_left, w_right)
            if len(covered) >= 2:
                spanning.append({"word": w, "cols": covered})
        if spanning:
            rows_to_merge[ri] = spanning
    if not rows_to_merge:
        return cells
    # Merge cells for spanning rows
    new_cells = []
    for cell in cells:
        ri = cell.get("row_index", -1)
        if ri not in rows_to_merge:
            new_cells.append(cell)
            continue
        # Check if this cell's column is part of a spanning block
        ci = cell.get("col_index", -1)
        is_part_of_span = False
        for span in rows_to_merge[ri]:
            if ci in span["cols"]:
                is_part_of_span = True
                # Only emit the merged cell for the FIRST column in the span
                if ci == span["cols"][0]:
                    # Use the ORIGINAL word-block text (not the split cell texts
                    # which may have broken words like "euros a" + "nd cents")
                    orig_word = span["word"]
                    merged_text = orig_word.get("text", "").strip()
                    all_wb = [orig_word]
                    # Compute merged bbox
                    if all_wb:
                        x_min = min(wb["left"] for wb in all_wb)
                        y_min = min(wb["top"] for wb in all_wb)
                        x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                        y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                    else:
                        x_min = y_min = x_max = y_max = 0
                    new_cells.append({
                        "cell_id": cell["cell_id"],
                        "row_index": ri,
                        "col_index": span["cols"][0],
                        "col_type": "spanning_header",
                        "colspan": len(span["cols"]),
                        "text": merged_text,
                        "confidence": cell.get("confidence", 0),
                        "bbox_px": {"x": x_min, "y": y_min,
                                    "w": x_max - x_min, "h": y_max - y_min},
                        "bbox_pct": {
                            "x": round(x_min / img_w * 100, 2) if img_w else 0,
                            "y": round(y_min / img_h * 100, 2) if img_h else 0,
                            "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                            "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                        },
                        "word_boxes": all_wb,
                        "ocr_engine": cell.get("ocr_engine", ""),
                        "is_bold": cell.get("is_bold", False),
                    })
                    logger.info(
                        "colspan detected: row %d, cols %s -> merged %d cells (%r)",
                        ri, span["cols"], len(span["cols"]), merged_text[:50],
                    )
                break
        if not is_part_of_span:
            new_cells.append(cell)
    return new_cells
@@ -1,58 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/helpers.py
-Grid Editor helper functions — barrel re-export module.
+import importlib as _importlib
-
+import sys as _sys
-This file re-exports all public symbols from the split sub-modules
+_sys.modules[__name__] = _importlib.import_module("grid.editor.helpers")
 so that existing ``from grid_editor_helpers import ...`` statements
 continue to work without changes.
 Sub-modules:
  - grid_editor_columns  — column detection, cross-column splitting, marker merging
  - grid_editor_filters  — word/zone filtering, border ghosts, decorative margins
  - grid_editor_headers  — header/heading detection, colspan detection
  - grid_editor_zones    — vertical dividers, zone splitting/merging, zone grid building
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 # --- Re-export: columns ---------------------------------------------------
 from grid_editor_columns import (  # noqa: F401
    _is_recognized_word,
    _split_cross_column_words,
    _cluster_columns_by_alignment,
    _MARKER_CHARS,
    _merge_inline_marker_columns,
 )
 # --- Re-export: filters ----------------------------------------------------
 from grid_editor_filters import (  # noqa: F401
    _filter_border_strip_words,
    _GRID_GHOST_CHARS,
    _filter_border_ghosts,
    _flatten_word_boxes,
    _words_in_zone,
    _get_content_bounds,
    _filter_decorative_margin,
    _filter_footer_words,
    _filter_header_junk,
 )
 # --- Re-export: headers ----------------------------------------------------
 from grid_editor_headers import (  # noqa: F401
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
    _detect_header_rows,
    _detect_colspan_cells,
 )
 # --- Re-export: zones -------------------------------------------------------
 from grid_editor_zones import (  # noqa: F401
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
 )
 # --- Re-export from cv_words_first (used by cv_box_layout.py) ---------------
 from cv_words_first import _cluster_rows  # noqa: F401
@@ -1,389 +1,4 @@
-"""
+# Backward-compat shim -- module moved to grid/editor/zones.py
-Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
+import importlib as _importlib
-
+import sys as _sys
-Split from grid_editor_helpers.py for maintainability.
+_sys.modules[__name__] = _importlib.import_module("grid.editor.zones")
 All functions are pure computation — no HTTP, DB, or session side effects.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import re
 from typing import Any, Dict, List, Optional
 from cv_vocab_types import PageZone
 from cv_words_first import _cluster_rows, _build_cells
 from grid_editor_columns import (
    _cluster_columns_by_alignment,
    _merge_inline_marker_columns,
    _split_cross_column_words,
 )
 from grid_editor_headers import (
    _detect_header_rows,
    _detect_colspan_cells,
 )
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Vertical divider detection and zone splitting
 # ---------------------------------------------------------------------------
 _PIPE_RE_VSPLIT = re.compile(r"^\|+$")
 def _detect_vertical_dividers(
    words: List[Dict],
    zone_x: int,
    zone_w: int,
    zone_y: int,
    zone_h: int,
 ) -> List[float]:
    """Detect vertical divider lines from pipe word_boxes at consistent x.
    Returns list of divider x-positions (empty if no dividers found).
    """
    if not words or zone_w <= 0 or zone_h <= 0:
        return []
    # Collect pipe word_boxes
    pipes = [
        w for w in words
        if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
    ]
    if len(pipes) < 5:
        return []
    # Cluster pipe x-centers by proximity
    tolerance = max(15, int(zone_w * 0.02))
    pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
    clusters: List[List[float]] = [[pipe_xs[0]]]
    for x in pipe_xs[1:]:
        if x - clusters[-1][-1] <= tolerance:
            clusters[-1].append(x)
        else:
            clusters.append([x])
    dividers: List[float] = []
    for cluster in clusters:
        if len(cluster) < 5:
            continue
        mean_x = sum(cluster) / len(cluster)
        # Must be between 15% and 85% of zone width
        rel_pos = (mean_x - zone_x) / zone_w
        if rel_pos < 0.15 or rel_pos > 0.85:
            continue
        # Check vertical coverage: pipes must span >= 50% of zone height
        cluster_pipes = [
            w for w in pipes
            if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
        ]
        ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
        y_span = max(ys) - min(ys) if ys else 0
        if y_span < zone_h * 0.5:
            continue
        dividers.append(mean_x)
    return sorted(dividers)
 def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
 ) -> List["PageZone"]:
    """Cut *zone* into side-by-side sub-zones at the given x positions.

    The zone's left edge, the divider positions and the zone's right edge
    form the segment boundaries.  Each segment becomes a new ``PageZone``
    that copies ``zone_type``, ``y``, ``height``, ``box`` and
    ``image_overlays`` from *zone*, uses integer-truncated x bounds, and is
    tagged with ``vsplit_group_id`` plus a layout hint: the first segment is
    ``left_of_vsplit``, the last is ``right_of_vsplit``, the others are
    ``middle_of_vsplit``.

    Args:
        zone: The zone to split.
        divider_xs: Divider x-positions; used in the order given.
        vsplit_group_id: Group id stored on every resulting sub-zone.

    Returns:
        The sub-zones from left to right, each created with ``index=0``.
    """
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    final_pos = len(edges) - 2
    pieces = []
    for pos, (left_edge, right_edge) in enumerate(zip(edges, edges[1:])):
        # The "first" check wins when there is only a single segment.
        if pos == 0:
            hint = "left_of_vsplit"
        elif pos == final_pos:
            hint = "right_of_vsplit"
        else:
            hint = "middle_of_vsplit"
        x_start = int(left_edge)
        x_end = int(right_edge)
        pieces.append(PageZone(
            index=0,  # placeholder; presumably re-indexed by the caller
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=x_start,
            width=x_end - x_start,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=hint,
            vsplit_group=vsplit_group_id,
        ))
    return pieces
 def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
    content_w: int,
 ) -> List:
    """Merge content zones separated by box zones into single zones.
    Box zones become image_overlays on the merged content zone.
    Pattern: [content, box*, content] -> [merged_content with overlay]
    Box zones NOT between two content zones stay as standalone zones.
    """
    if len(zones) < 3:
        return zones
    # Group consecutive runs of [content, box+, content]
    result: List = []
    i = 0
    while i < len(zones):
        z = zones[i]
        if z.zone_type != "content":
            result.append(z)
            i += 1
            continue
        # Start of a potential merge group: content zone
        group_contents = [z]
        group_boxes = []
        j = i + 1
        # Absorb [box, content] pairs -- only absorb a box if it's
        # confirmed to be followed by another content zone.
        while j < len(zones):
            if (zones[j].zone_type == "box"
                    and j + 1 < len(zones)
                    and zones[j + 1].zone_type == "content"):
                group_boxes.append(zones[j])
                group_contents.append(zones[j + 1])
                j += 2
            else:
                break
        if len(group_contents) >= 2 and group_boxes:
            # Merge: create one large content zone spanning all
            y_min = min(c.y for c in group_contents)
            y_max = max(c.y + c.height for c in group_contents)
            overlays = []
            for bz in group_boxes:
                overlay = {
                    "y": bz.y,
                    "height": bz.height,
                    "x": bz.x,
                    "width": bz.width,
                }
                if bz.box:
                    overlay["box"] = {
                        "x": bz.box.x,
                        "y": bz.box.y,
                        "width": bz.box.width,
                        "height": bz.box.height,
                        "confidence": bz.box.confidence,
                        "border_thickness": bz.box.border_thickness,
                    }
                overlays.append(overlay)
            merged = PageZone(
                index=0,  # re-indexed below
                zone_type="content",
                y=y_min,
                height=y_max - y_min,
                x=content_x,
                width=content_w,
                image_overlays=overlays,
            )
            result.append(merged)
            i = j
        else:
            # No merge possible -- emit just the content zone
            result.append(z)
            i += 1
    # Re-index zones
    for idx, z in enumerate(result):
        z.index = idx
    logger.info(
        "zone-merge: %d zones -> %d zones after merging across boxes",
        len(zones), len(result),
    )
    return result
 def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.
    Args:
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone.  Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure.  Box zones always detect columns independently.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }
    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)
    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
            logger.info(
                "  zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w['top'], w['left'], w['height'], w['width'],
                w.get('text', '')[:40],
            )
        for r in rows:
            logger.info(
                "  zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
            )
    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
    # Merge inline marker columns (bullets, numbering) into adjacent text
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)
    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }
    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2).  Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)
    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)
    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns.  _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)
    # Prefix cell IDs with zone index
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index
    # Detect header rows (pass columns for spanning header detection)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)
    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue
            # Collect all word_boxes and text from all columns
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            # Remove all header cells, replace with one spanning cell
            cells = [c for c in cells if c["row_index"] != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                cells.append({
                    "cell_id": f"R{hri:02d}_C0",
                    "row_index": hri,
                    "col_index": 0,
                    "col_type": "spanning_header",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })
    # Convert columns to output format with percentages
    out_columns = []
    for col in columns:
        x_min = col["x_min"]
        x_max = col["x_max"]
        out_columns.append({
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(x_min),
            "x_max_px": round(x_max),
            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
            "bold": False,
        })
    # Convert rows to output format with percentages
    out_rows = []
    for row in rows:
        out_rows.append({
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
            "is_header": row["index"] in header_rows,
        })
    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
@@ -0,0 +1,6 @@
 """
 Vocab package — restructured from vocab_* flat modules.
 Backward-compatible re-exports: consumers can still use
 ``from vocab_worksheet_api import ...`` etc. via the shim files in backend/.
 """
@@ -0,0 +1,196 @@
 """
 Vocab Learn Bridge — Converts vocabulary session data into Learning Units.
 Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators).
 Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation.
 DATENSCHUTZ: All communication stays within Docker network (breakpilot-network).
 """
 import os
 import json
 import logging
 import httpx
 from typing import List, Dict, Any, Optional
 logger = logging.getLogger(__name__)
 BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001")
 def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert vocab-session entries into the analysis_data dict that is sent to
    the backend-lehrer generators (MC, Cloze, QA).

    Entries whose ``english`` and ``german`` fields are both empty are left
    out of the text representations.  Missing keys and explicit ``None``
    values (e.g. an ``example_sentence`` that was stored as NULL) are treated
    as empty strings instead of raising.

    Args:
        session_name: Used as the ``title`` of the result.
        vocabulary: Entry dicts; the keys read here are ``english``,
            ``german`` and ``example_sentence``.

    Returns:
        Dict with the keys:
        - title: Display name (``session_name``)
        - subject: Fixed to "English Vocabulary"
        - grade_level: Fixed to "5-8"
        - canonical_text: One "en = de (example)" line per entry
        - printed_blocks: One {"text": "en — de | example"} dict per entry
        - vocabulary: The input list, passed through unchanged
    """
    def _clean(entry: Dict[str, Any], key: str) -> str:
        # "or ''" also covers keys that are present but hold None;
        # dict.get(key, "") alone would hand None to .strip() and crash.
        return (entry.get(key) or "").strip()

    canonical_lines = []
    printed_blocks = []
    for v in vocabulary:
        en = _clean(v, "english")
        de = _clean(v, "german")
        example = _clean(v, "example_sentence")
        if not en and not de:
            continue
        line = f"{en} = {de}"
        block_text = f"{en} — {de}"
        if example:
            line += f" ({example})"
            block_text += f" | {example}"
        canonical_lines.append(line)
        printed_blocks.append({"text": block_text})
    return {
        "title": session_name,
        "subject": "English Vocabulary",
        "grade_level": "5-8",
        "canonical_text": "\n".join(canonical_lines),
        "printed_blocks": printed_blocks,
        "vocabulary": vocabulary,
    }
 async def create_learning_unit(
    session_name: str,
    vocabulary: List[Dict[str, Any]],
    grade: Optional[str] = None,
 ) -> Dict[str, Any]:
    """
    Create a Learning Unit in backend-lehrer from vocabulary data.

    Steps:
    1. Create the unit via POST /api/learning-units/
    2. Write the generator input (analysis data) to
       ~/Arbeitsblaetter/Lerneinheiten/<unit_id>_analyse.json
    3. Return the created unit info

    Args:
        session_name: Title of the new unit.
        vocabulary: Vocabulary entry dicts; must not be empty.
        grade: Grade label sent to the backend; "5-8" when not given.

    Returns:
        Dict with unit_id, unit (the backend's JSON response),
        analysis_path, vocabulary_count and status ("created").

    Raises:
        ValueError: If ``vocabulary`` is empty.
        RuntimeError: If the HTTP request fails or returns an error status
            (the httpx error is chained as the cause), or if the response
            carries no ``id``.
    """
    if not vocabulary:
        raise ValueError("No vocabulary entries provided")
    analysis_data = vocab_to_analysis_data(session_name, vocabulary)
    create_payload = {
        "title": session_name,
        "subject": "Englisch",
        "grade": grade or "5-8",
    }
    # 1. Create Learning Unit. The client is only needed for this one call,
    #    so it is closed again before any disk I/O happens.
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            resp = await client.post(
                f"{BACKEND_LEHRER_URL}/api/learning-units/",
                json=create_payload,
            )
            resp.raise_for_status()
            unit = resp.json()
        except httpx.HTTPError as e:
            logger.error("Failed to create learning unit: %s", e)
            # Chain the original error so the traceback keeps the root cause.
            raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}") from e
    unit_id = unit.get("id")
    if not unit_id:
        raise RuntimeError("Learning Unit created but no ID returned")
    logger.info("Created learning unit %s with %d vocabulary entries", unit_id, len(vocabulary))
    # 2. Save analysis_data as JSON file for generators
    analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
    os.makedirs(analysis_dir, exist_ok=True)
    analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json")
    with open(analysis_path, "w", encoding="utf-8") as f:
        json.dump(analysis_data, f, ensure_ascii=False, indent=2)
    logger.info("Saved analysis data to %s", analysis_path)
    return {
        "unit_id": unit_id,
        "unit": unit,
        "analysis_path": analysis_path,
        "vocabulary_count": len(vocabulary),
        "status": "created",
    }
 async def generate_learning_modules(
    unit_id: str,
    analysis_path: str,
 ) -> Dict[str, Any]:
    """
    Trigger QA, MC, and Cloze generation from analysis data.

    Loads the analysis JSON from ``analysis_path`` and posts it to the
    backend-lehrer endpoints ``generate-qa``, ``generate-mc`` and
    ``generate-cloze`` of the given unit, in that order.  A failure of one
    generator does not stop the others.

    Args:
        unit_id: ID of the learning unit, used in the endpoint URLs.
        analysis_path: Path of the JSON file holding the analysis data.

    Returns:
        Dict with ``unit_id`` plus one entry per generator (``mc``,
        ``cloze``, ``qa``).  Each entry has a ``status`` of "generated"
        (with ``data``), "skipped" (non-200 response, with ``reason``) or
        "error" (exception, with ``reason``).

    Raises:
        OSError / ValueError: If the analysis file cannot be read or parsed.
    """
    results = {
        "unit_id": unit_id,
        "mc": {"status": "pending"},
        "cloze": {"status": "pending"},
        "qa": {"status": "pending"},
    }
    # Load analysis data
    with open(analysis_path, "r", encoding="utf-8") as f:
        analysis_data = json.load(f)
    # "or []" keeps a null "vocabulary" value from breaking len().
    vocab_count = len(analysis_data.get("vocabulary") or [])
    # (result key, label for log messages, endpoint suffix, extra payload).
    # QA comes first, as before; question counts are capped per generator.
    jobs = [
        ("qa", "QA", "generate-qa", {"num_questions": min(vocab_count, 20)}),
        ("mc", "MC", "generate-mc", {"num_questions": min(vocab_count, 10)}),
        ("cloze", "Cloze", "generate-cloze", {}),
    ]
    async with httpx.AsyncClient(timeout=120.0) as client:
        for key, label, endpoint, extra in jobs:
            payload = {"analysis_data": analysis_data}
            payload.update(extra)
            try:
                resp = await client.post(
                    f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/{endpoint}",
                    json=payload,
                )
                if resp.status_code == 200:
                    results[key] = {"status": "generated", "data": resp.json()}
                else:
                    logger.warning("%s generation returned %s", label, resp.status_code)
                    results[key] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
            except Exception as e:
                # Deliberately broad: generation is best-effort, and one
                # failing generator must not prevent the remaining ones.
                logger.warning("%s generation failed: %s", label, e)
                results[key] = {"status": "error", "reason": str(e)}
    return results
@@ -0,0 +1,427 @@
 """
 Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
 Replaces in-memory storage with database persistence.
 See migrations/001_vocab_sessions.sql for schema.
 """
 import os
 import uuid
 import logging
 import json
 from typing import Optional, List, Dict, Any
 import asyncpg
 logger = logging.getLogger(__name__)
 # Database configuration
 DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
 )
 # Connection pool (initialized lazily)
 _pool: Optional[asyncpg.Pool] = None
 async def get_pool() -> asyncpg.Pool:
    """Return the module-wide connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    # First call: build the pool once and keep it in the module global.
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
 async def init_vocab_tables():
    """
    Initialize vocab tables if they don't exist.

    Checks information_schema for a ``vocab_sessions`` table.  When it is
    missing, the SQL in ``migrations/001_vocab_sessions.sql`` is executed.
    The migration file is looked up next to this module first and then one
    directory above it; if it is found in neither place only a warning is
    logged and no tables are created.

    This is called at startup.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Check if tables exist
        tables_exist = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)
        if tables_exist:
            logger.debug("Vocab tables already exist")
            return
        logger.info("Creating vocab tables...")
        # NOTE(review): this module now lives inside the vocab/ package, so a
        # migrations/ directory that stayed next to the package is one level
        # up from here. Both spots are tried; confirm the real location.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        candidates = [
            os.path.join(module_dir, "migrations", "001_vocab_sessions.sql"),
            os.path.join(os.path.dirname(module_dir), "migrations", "001_vocab_sessions.sql"),
        ]
        migration_path = next((p for p in candidates if os.path.exists(p)), None)
        if migration_path is None:
            logger.warning("Migration file not found: %s", " or ".join(candidates))
            return
        # Explicit encoding so the SQL is read the same way on every host.
        with open(migration_path, "r", encoding="utf-8") as f:
            sql = f.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")
 # =============================================================================
 # SESSION OPERATIONS
 # =============================================================================
 async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
 ) -> Dict[str, Any]:
    """Insert a vocabulary session row and return it as a dict.

    The row is created with status 'pending' and a vocabulary_count of 0.
    ``session_id`` must be a valid UUID string.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        inserted = await conn.fetchrow("""
            INSERT INTO vocab_sessions (
                id, name, description, source_language, target_language,
                status, vocabulary_count
            ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
            RETURNING *
        """, session_uuid, name, description, source_language, target_language)
    return _row_to_dict(inserted)
 async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one session by its UUID string; None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        record = await conn.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, session_uuid)
    return _row_to_dict(record) if record else None
 async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
 ) -> List[Dict[str, Any]]:
    """Return sessions ordered newest first, paged by limit/offset.

    When ``status`` is truthy only sessions with that status are returned.
    """
    pool = await get_pool()
    # Pick the statement and its positional arguments up front so that the
    # connection block below contains a single fetch.
    if status:
        query = """
                SELECT * FROM vocab_sessions
                WHERE status = $1
                ORDER BY created_at DESC
                LIMIT $2 OFFSET $3
            """
        params = (status, limit, offset)
    else:
        query = """
                SELECT * FROM vocab_sessions
                ORDER BY created_at DESC
                LIMIT $1 OFFSET $2
            """
        params = (limit, offset)
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *params)
    sessions = []
    for record in records:
        sessions.append(_row_to_dict(record))
    return sessions
 async def update_session_db(
    session_id: str,
    **kwargs
 ) -> Optional[Dict[str, Any]]:
    """Update a session with the given fields and return the updated row.

    Only keyword arguments named in the allow-list below are written; all
    others are ignored.  Values for the JSON columns are serialized with
    ``json.dumps``.  Empty containers and 0 are stored as JSON ("[]", "{}",
    "0"); only ``None`` is stored as NULL.

    Returns:
        The updated session as a dict, the current session when no allowed
        field was passed, or None when no row has the given id.
    """
    pool = await get_pool()
    allowed_fields = (
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
        'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'
    )
    json_fields = ('ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages')
    # Build dynamic UPDATE query. Column names come from the allow-list only,
    # never from caller input, so interpolating them into the SQL is safe.
    assignments = []
    values = []
    for key, value in kwargs.items():
        if key not in allowed_fields:
            continue
        # "is not None" rather than truthiness: [] / {} / 0 are valid JSON
        # values and must not be turned into NULL.
        if key in json_fields and value is not None:
            value = json.dumps(value)
        values.append(value)
        assignments.append(f"{key} = ${len(values)}")
    if not assignments:
        return await get_session_db(session_id)
    # The session id is the last positional parameter.
    values.append(uuid.UUID(session_id))
    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(assignments)}
            WHERE id = ${len(values)}
            RETURNING *
        """, *values)
    return _row_to_dict(row) if row else None
 async def delete_session_db(session_id: str) -> bool:
    """Delete a session; True when exactly one row was removed.

    Related data is expected to go with it (cascades).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        status_tag = await conn.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, session_uuid)
    # execute() hands back the command status string.
    return status_tag == "DELETE 1"
 # =============================================================================
 # VOCABULARY OPERATIONS
 # =============================================================================
 async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
 ) -> List[Dict[str, Any]]:
    """Add vocabulary entries to a session and refresh its entry count.

    All inserts and the count update run in one transaction, so a failure
    part-way through leaves neither a partial batch nor a stale
    ``vocabulary_count`` behind.

    Args:
        session_id: UUID string of the owning session.
        vocab_list: Entry dicts; the keys read are english, german,
            example_sentence, example_sentence_gap, word_type, source_page.

    Returns:
        The inserted rows as dicts, in input order ([] for an empty list).
    """
    if not vocab_list:
        return []
    pool = await get_pool()
    # Parse once instead of once per row.
    session_uuid = uuid.UUID(session_id)
    results = []
    async with pool.acquire() as conn:
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                    INSERT INTO vocab_entries (
                        id, session_id, english, german, example_sentence,
                        example_sentence_gap, word_type, source_page
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    RETURNING *
                """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))
            # Update vocabulary count
            await conn.execute("""
                UPDATE vocab_sessions
                SET vocabulary_count = (
                    SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
                )
                WHERE id = $1
            """, session_uuid)
    return results
 async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
 ) -> List[Dict[str, Any]]:
    """Return the vocabulary entries of a session as dicts.

    With ``source_page`` given, only that page's entries are returned,
    ordered by creation time.  Otherwise all entries are returned, ordered
    by page (entries without a page last) and then creation time.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        if source_page is None:
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, session_uuid)
        else:
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, session_uuid, source_page)
    entries = []
    for record in records:
        entries.append(_row_to_dict(record))
    return entries
 async def update_vocabulary_db(
    entry_id: str,
    **kwargs
 ) -> Optional[Dict[str, Any]]:
    """Update one vocabulary entry and return the updated row as a dict.

    Only keyword arguments named in the allow-list are written.  Returns
    None when no allowed field was passed or no row has the given id.
    """
    pool = await get_pool()
    allowed_fields = [
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    ]
    # Keep the caller's keyword order; drop everything not on the allow-list.
    updates = [(col, val) for col, val in kwargs.items() if col in allowed_fields]
    if not updates:
        return None
    set_clause = ', '.join(
        f"{col} = ${pos}" for pos, (col, _) in enumerate(updates, start=1)
    )
    values = [val for _, val in updates]
    # The entry id takes the placeholder after the last SET value.
    where_idx = len(updates) + 1
    values.append(uuid.UUID(entry_id))
    async with pool.acquire() as conn:
        record = await conn.fetchrow(f"""
            UPDATE vocab_entries
            SET {set_clause}
            WHERE id = ${where_idx}
            RETURNING *
        """, *values)
    if not record:
        return None
    return _row_to_dict(record)
 async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete all vocabulary of one page and refresh the session's count.

    Returns the number of deleted rows, taken from the last token of the
    status string that execute() returns for the DELETE.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        delete_status = await conn.execute("""
            DELETE FROM vocab_entries
            WHERE session_id = $1 AND source_page = $2
        """, uuid.UUID(session_id), page)
        # Re-derive vocabulary_count from the rows that are left.
        await conn.execute("""
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """, uuid.UUID(session_id))
    if not delete_status:
        return 0
    return int(delete_status.split()[-1])
 # =============================================================================
 # WORKSHEET OPERATIONS
 # =============================================================================
 async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
 ) -> Dict[str, Any]:
    """Insert a worksheet record under a fresh UUID and return it as a dict.

    ``worksheet_types`` is stored as a JSON-encoded string.
    """
    pool = await get_pool()
    new_id = uuid.uuid4()
    async with pool.acquire() as conn:
        params = (
            new_id,
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path,
        )
        inserted = await conn.fetchrow("""
            INSERT INTO vocab_worksheets (
                id, session_id, worksheet_types, pdf_path, solution_path
            ) VALUES ($1, $2, $3, $4, $5)
            RETURNING *
        """, *params)
    return _row_to_dict(inserted)
 async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one worksheet by its UUID string; None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        worksheet_uuid = uuid.UUID(worksheet_id)
        record = await conn.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, worksheet_uuid)
    return _row_to_dict(record) if record else None
 async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Delete every worksheet of a session and return how many were removed.

    The number is taken from the last token of the status string that
    execute() returns for the DELETE.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        delete_status = await conn.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, session_uuid)
    if not delete_status:
        return 0
    return int(delete_status.split()[-1])
 # =============================================================================
 # PDF CACHE OPERATIONS
 # =============================================================================
 # Process-local PDF cache, keyed by worksheet id. Entries stay until
 # clear_cached_pdf_data() removes them.
 _pdf_cache: Dict[str, bytes] = {}
 def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Store (or overwrite) the PDF bytes kept for a worksheet."""
    _pdf_cache.update({worksheet_id: pdf_data})
 def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the cached PDF bytes of a worksheet, or None when absent."""
    try:
        return _pdf_cache[worksheet_id]
    except KeyError:
        return None
 def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Drop a worksheet's cached PDF; does nothing when it is not cached."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]
 # =============================================================================
 # HELPER FUNCTIONS
 # =============================================================================
 def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
    """Turn a database record into a plain dict with JSON-friendly values.

    - ``id`` / ``session_id`` are converted with ``str()``
    - ``created_at`` / ``updated_at`` / ``generated_at`` become ISO strings
    - the JSON columns are decoded when they arrive as strings
    Keys that are missing or None are left alone.  ``None`` input gives {}.
    """
    if row is None:
        return {}
    data = dict(row)

    def _convert(keys, convert):
        # Apply `convert` in place to every listed key holding a value.
        for key in keys:
            if data.get(key) is not None:
                data[key] = convert(data[key])

    _convert(('id', 'session_id'), str)
    _convert(('created_at', 'updated_at', 'generated_at'), lambda ts: ts.isoformat())
    _convert(
        ('ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types'),
        # Values that are already decoded (non-strings) pass through as-is.
        lambda raw: json.loads(raw) if isinstance(raw, str) else raw,
    )
    return data
@@ -0,0 +1,5 @@
 """
 Vocab worksheet sub-package.
 Main entry point: ``from vocab.worksheet.api import router``
 """
@@ -0,0 +1,472 @@
 """
 Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
 extract-with-boxes, deskewed images, and learning unit generation.
 The two large handlers (compare_ocr_methods, analyze_grid) live in
 compare_api.py (formerly vocab_worksheet_compare_api.py) in this package
 and are included via compare_router.
 """
 from fastapi import APIRouter, Body, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from typing import Optional, Dict, Any
 from datetime import datetime
 import os
 import io
 import json
 import logging
 def _get_sessions():
    """Return the ``_sessions`` object of the sibling ``api`` module.

    The import runs at call time instead of module import time
    (presumably to avoid a circular import with ``.api`` — TODO confirm).
    """
    from .api import _sessions
    return _sessions
 def _get_local_storage_path():
    """Return ``LOCAL_STORAGE_PATH`` from the sibling ``api`` module.

    The import runs at call time instead of module import time
    (presumably to avoid a circular import with ``.api`` — TODO confirm).
    """
    from .api import LOCAL_STORAGE_PATH
    return LOCAL_STORAGE_PATH
 from .generation import convert_pdf_page_to_image
 # Try to import Tesseract extractor
 try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, TESSERACT_AVAILABLE,
    )
 except ImportError:
    TESSERACT_AVAILABLE = False
 # Try to import Grid Detection Service
 try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
 except ImportError:
    GRID_SERVICE_AVAILABLE = False
 logger = logging.getLogger(__name__)
 analysis_router = APIRouter()
 def _ocr_export_dir():
    """Return the path of the "ocr-exports" directory below the local storage path."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ocr-exports")
 def _ground_truth_dir():
    """Return the path of the "ground-truth" directory below the local storage path."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ground-truth")
 # =============================================================================
 # OCR Export Endpoints (for cross-app OCR data sharing)
 # =============================================================================
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Store an OCR export as JSON and point "latest.json" at it.

    Intended for cross-app sharing (admin-v2 -> studio-v2): both apps proxy
    to klausur-service via /klausur-api/, so this endpoint acts as shared
    storage reachable from both.

    Writes ``<session_id>_page<page_number>.json`` into the OCR export
    directory and then rewrites ``latest.json`` with the session id, page
    number and a UTC timestamp.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")
    export_dir = _ocr_export_dir()
    os.makedirs(export_dir, exist_ok=True)

    def _write_json(filename, payload):
        # Same dump settings for the export itself and for the pointer file.
        with open(os.path.join(export_dir, filename), 'w', encoding='utf-8') as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    # Save the export data
    _write_json(f"{session_id}_page{page_number}.json", data)
    # Update latest pointer
    _write_json("latest.json", {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.utcnow().isoformat(),
    })
    response = {"success": True}
    response["session_id"] = session_id
    response["page_number"] = page_number
    response["message"] = "OCR export saved successfully"
    return response
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export of one session page.

    Responds with HTTP 404 when no export file exists for that page.
    """
    filename = f"{session_id}_page{page_number}.json"
    export_path = os.path.join(_ocr_export_dir(), filename)
    if os.path.exists(export_path):
        with open(export_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    raise HTTPException(status_code=404, detail="OCR export not found")
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the OCR export that "latest.json" points to.

    Responds with HTTP 404 when either the pointer file or the export file
    it names does not exist.
    """
    export_dir = _ocr_export_dir()

    def _read_json_or_404(path, missing_detail):
        if not os.path.exists(path):
            raise HTTPException(status_code=404, detail=missing_detail)
        with open(path, 'r', encoding='utf-8') as fh:
            return json.load(fh)

    pointer = _read_json_or_404(
        os.path.join(export_dir, "latest.json"),
        "No OCR exports found",
    )
    session_id = pointer.get("session_id")
    page_number = pointer.get("page_number")
    return _read_json_or_404(
        os.path.join(export_dir, f"{session_id}_page{page_number}.json"),
        "Latest OCR export file not found",
    )
 # =============================================================================
 # Extract with Boxes & Deskewed Image
 # =============================================================================
 async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Build vocabulary entries with bounding boxes from an image.

    Runs Tesseract for word boxes, converts them to regions with
    GridDetectionService, detects the grid and emits one entry per grid row
    that has text in its English, German or example column.

    Returns a dict with 'entries', 'image_width' and 'image_height'.
    Each entry has row_index, english, german, example, confidence, bbox,
    bbox_en, bbox_de, bbox_ex.  Per the original contract, bbox coordinates
    are in percent (0-100).  'entries' is empty when OCR, region conversion
    or grid detection yields nothing.

    Raises HTTPException(500) when Tesseract or GridDetectionService is not
    available.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: word-level boxes from Tesseract
    ocr = await extract_bounding_boxes(image_bytes, lang=lang)
    words = ocr.get("words", [])
    img_w = ocr.get("image_width", 0)
    img_h = ocr.get("image_height", 0)

    def _payload(found):
        return {"entries": found, "image_width": img_w, "image_height": img_h}

    if not words or img_w == 0 or img_h == 0:
        return _payload([])

    # Step 2: OCR regions
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return _payload([])

    # Step 3: grid detection
    grid = service.detect_grid(regions)
    if not grid.cells:
        return _payload([])

    # Step 4: one entry per grid row, cells sorted into en/de/ex slots
    from services.grid_detection_service import ColumnType
    empty_box = {"x": 0, "y": 0, "w": 0, "h": 0}
    entries = []
    for row_idx, row_cells in enumerate(grid.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_hits = 0
        for cell in row_cells:
            box = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                   "w": round(cell.width, 2), "h": round(cell.height, 2)}
            stripped = cell.text.strip()
            kind = cell.column_type
            if kind == ColumnType.ENGLISH:
                texts["en"], boxes["en"] = stripped, box
            elif kind == ColumnType.GERMAN:
                texts["de"], boxes["de"] = stripped, box
            elif kind == ColumnType.EXAMPLE:
                texts["ex"], boxes["ex"] = stripped, box
            # Every non-empty cell counts towards the row confidence,
            # whatever its column type.
            if stripped:
                conf_total += cell.confidence
                conf_hits += 1
        # Skip completely empty rows
        if not any(texts.values()):
            continue
        # Whole-row box = union of the slot boxes that were found
        present = [b for b in boxes.values() if b is not None]
        if present:
            left = min(b["x"] for b in present)
            top = min(b["y"] for b in present)
            right = max(b["x"] + b["w"] for b in present)
            bottom = max(b["y"] + b["h"] for b in present)
            row_bbox = {"x": round(left, 2), "y": round(top, 2),
                        "w": round(right - left, 2), "h": round(bottom - top, 2)}
        else:
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}
        if conf_hits > 0:
            avg_conf = round(conf_total / conf_hits * 100, 1)
        else:
            avg_conf = 0
        entries.append({
            "row_index": row_idx,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": boxes["en"] or dict(empty_box),
            "bbox_de": boxes["de"] or dict(empty_box),
            "bbox_ex": boxes["ex"] or dict(empty_box),
        })
    return _payload(entries)
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary entries plus bounding boxes for one PDF page.

    Intended for ground truth labeling. The 0-indexed page is rendered at
    full resolution, deskewed when the optional CV helper can be imported
    and reports itself available, and then handed to
    ``extract_entries_with_boxes``. The image that was fed to the extractor
    and the resulting entries are cached on the session under
    ``deskewed_images`` and ``gt_entries``, keyed by the page number as a
    string.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if the session has
            no PDF or ``page_number`` is outside the PDF's page range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Full-resolution render of the requested page.
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskewing is best-effort: any failure (including a missing module) is
    # logged and the un-deskewed render is used with an angle of 0.0.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as exc:
        logger.warning(f"Deskew failed for page {page_number}: {exc}")

    page_key = str(page_number)

    # Keep the image so the deskewed-image endpoint can serve it later.
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # Extraction runs on the (possibly deskewed) image.
    result = await extract_entries_with_boxes(image_data)
    entries = result["entries"]
    session.setdefault("gt_entries", {})[page_key] = entries

    return {
        "success": True,
        "entries": entries,
        "entry_count": len(entries),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Serves the image cached on the session under ``deskewed_images`` for this
    page. If none is cached, the page is rendered from the session's PDF at
    full resolution instead.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if a fallback
            render is needed but the session has no PDF or ``page_number``
            is outside the PDF's page range.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    deskewed = session.get("deskewed_images", {}).get(str(page_number))
    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Same range check as the other page-based endpoints, so an out-of-range
    # page yields a 400 instead of reaching the PDF renderer.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
 # =============================================================================
 # Ground Truth Labeling
 # =============================================================================
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects a body with an ``entries`` list; each entry is an object with
    english, german, example, status ('confirmed' | 'edited' | 'skipped')
    and bbox fields. The entries are cached on the session under
    ``ground_truth`` (keyed by the page number as a string) and written as
    JSON to ``<ground truth dir>/<session_id>_page<page_number>.json``.

    Returns:
        A dict with the saved count, per-status counts and the file path.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if ``entries`` is
            missing, empty, or not a list of objects.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")
    # Reject malformed payloads up front; otherwise they would be cached and
    # written to disk before the status tally below fails on them.
    if not isinstance(entries, list) or not all(isinstance(e, dict) for e in entries):
        raise HTTPException(status_code=400, detail="Entries must be a list of objects")

    # Save in session
    session = sessions[session_id]
    session.setdefault("ground_truth", {})[str(page_number)] = entries

    # Also save to disk
    gt_dir = _ground_truth_dir()
    os.makedirs(gt_dir, exist_ok=True)
    gt_path = os.path.join(gt_dir, f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)
    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Entries with any other (or no) status are saved but not tallied.
    statuses = [e.get("status") for e in entries]
    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": statuses.count("confirmed"),
        "edited": statuses.count("edited"),
        "skipped": statuses.count("skipped"),
        "file_path": gt_path,
    }
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return previously saved ground truth entries for a page.

    The session's in-memory ``ground_truth`` cache is consulted first
    (``source`` = "cache"); otherwise the JSON file in the ground truth
    directory is read (``source`` = "disk").

    Raises:
        HTTPException: 404 if the session is unknown or nothing has been
            saved for this page.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # In-memory copy wins when present and non-empty.
    in_memory = sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Otherwise fall back to the file written when the labels were saved.
    gt_file = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(gt_file):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(gt_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return {"success": True, "entries": payload.get("entries", []), "source": "disk"}
 # ─── Learning Module Generation ─────────────────────────────────────────────
 class GenerateLearningUnitRequest(BaseModel):
    """Optional request body for the generate-learning-unit endpoint."""
    # Forwarded as ``grade`` when the learning unit is created.
    grade: Optional[str] = None
    # When true, module generation is attempted after the unit is created.
    generate_modules: bool = True
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(
    session_id: str,
    request: Optional[GenerateLearningUnitRequest] = None,
):
    """
    Create a Learning Unit from the vocabulary in this session.

    1. Takes vocabulary from the session
    2. Creates a Learning Unit via ``vocab.learn_bridge``
    3. Optionally triggers module generation for the new unit

    Module generation is best-effort: if it fails, the created unit is still
    returned and the failure is reported under ``generation``.

    Returns:
        The dict returned by ``create_learning_unit``, with a ``generation``
        key added when module generation was requested.

    Raises:
        HTTPException: 404 unknown session; 400 no vocabulary or a
            ``ValueError`` from the bridge; 501 if the bridge cannot be
            imported; 502 on a ``RuntimeError`` from the bridge.
    """
    if request is None:
        # The body is optional; fall back to the model's defaults.
        request = GenerateLearningUnitRequest()

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    vocabulary = session.get("vocabulary", [])
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab.learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: Create Learning Unit
        result = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        # Step 2: Generate modules if requested
        if request.generate_modules:
            try:
                gen_result = await generate_learning_modules(
                    unit_id=result["unit_id"],
                    analysis_path=result["analysis_path"],
                )
                result["generation"] = gen_result
            except Exception as e:
                # Deliberately broad: the unit already exists, so report the
                # failure in the response instead of failing the request.
                logger.warning(f"Module generation failed (unit created): {e}")
                result["generation"] = {"status": "error", "reason": str(e)}

        return result

    # Chain the original exception so the root cause stays in the traceback.
    except ImportError as e:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available") from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e)) from e
 # =============================================================================
 # Include compare_ocr_methods & analyze_grid from companion module
 # =============================================================================
 # Imported at the bottom of the module rather than with the other imports
 # (hence the E402 suppression); the compare routes are then served through
 # analysis_router.
 from .compare_api import compare_router  # noqa: E402
 analysis_router.include_router(compare_router)
@@ -0,0 +1,498 @@
 """
 Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
 vocabulary editing, worksheet generation, and PDF downloads.
 Sub-routers (included at bottom):
 - upload_api (formerly vocab_worksheet_upload_api): PDF upload, thumbnails, page processing
 - analysis_api (formerly vocab_worksheet_analysis_api): OCR compare, grid analysis, ground truth
 """
 from fastapi import APIRouter, HTTPException, UploadFile, File, Query
 from fastapi.responses import StreamingResponse
 from typing import List, Dict, Any
 from datetime import datetime
 import uuid
 import os
 import io
 import logging
 logger = logging.getLogger(__name__)
 # --- Imports from extracted sub-modules ---
 from .models import (
    SessionStatus,
    VocabularyEntry,
    SessionCreate,
    SessionResponse,
    VocabularyResponse,
    VocabularyUpdate,
    WorksheetGenerateRequest,
    WorksheetResponse,
 )
 from .extraction import extract_vocabulary_from_image
 from .generation import (
    generate_worksheet_html, generate_worksheet_pdf,
    convert_pdf_page_to_image,
 )
 # --- Database integration (used by main.py lifespan) ---
 try:
    from vocab.session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
 except ImportError:
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = None
    init_vocab_tables = None
    list_sessions_db = None
    get_session_db = None
 # Module-level database connection pool; stays None until set_db_pool() is called.
 _db_pool = None
 def set_db_pool(pool) -> None:
    """Set the database connection pool (called from main.py lifespan).

    Rebinds the module-level ``_db_pool`` to *pool*; nothing else is done
    with the pool here.
    """
    global _db_pool
    _db_pool = pool
 async def _init_vocab_table():
     """Initialize the vocab tables in the database, if the store is available.

     ``init_vocab_tables`` is None when ``vocab.session_store`` could not be
     imported; in that case nothing is initialized and a warning is logged
     instead of claiming the table is ready. Errors raised by the
     initializer are logged and swallowed (best-effort startup step).
     """
     if not init_vocab_tables:
         logger.warning("vocab.session_store unavailable; skipping vocab table initialization")
         return
     try:
         await init_vocab_tables()
     except Exception as e:
         logger.warning(f"Failed to init vocab tables: {e}")
     else:
         # Only report readiness when the initializer actually ran cleanly.
         logger.info("vocab_session_cache table ready")
 async def _load_all_sessions():
     """Populate the in-memory session cache from the database.

     Up to 500 rows are requested from ``list_sessions_db``. A row is copied
     into ``_sessions`` only if it has an id (``id`` or ``session_id``) that
     is not already cached; ``created_at`` is stored as a string. Any error
     while loading is logged as a warning and swallowed.
     """
     if not list_sessions_db:
         logger.info("Loaded 0 vocab sessions from database")
         return

     try:
         rows = await list_sessions_db(limit=500)
         loaded = 0
         for row in rows:
             session_key = row.get("id") or row.get("session_id")
             # Skip rows without an id and never overwrite a live session.
             if not session_key or session_key in _sessions:
                 continue
             _sessions[session_key] = {
                 "id": session_key,
                 "name": row.get("name", ""),
                 "description": row.get("description", ""),
                 "status": row.get("status", "created"),
                 "vocabulary_count": row.get("vocabulary_count", 0),
                 "source_language": row.get("source_language", "en"),
                 "target_language": row.get("target_language", "de"),
                 "created_at": str(row.get("created_at", "")),
             }
             loaded += 1
         logger.info(f"Loaded {loaded} vocab sessions from database")
     except Exception as exc:
         logger.warning(f"Failed to load sessions from database: {exc}")
 # --- Router & module-level state ---
 # Routes declared on this router (and on the sub-routers included into it at
 # the bottom of the file) are served under the /api/v1/vocab prefix.
 router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
 # Root directory for per-session files; overridable via VOCAB_STORAGE_PATH.
 LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
 # In-memory session records, keyed by session id.
 _sessions: Dict[str, Dict[str, Any]] = {}
 # In-memory records of generated worksheets, keyed by worksheet id.
 _worksheets: Dict[str, Dict[str, Any]] = {}
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Register a new, empty vocabulary extraction session.

    A fresh UUID is assigned, the session record is stored in the in-memory
    ``_sessions`` dict in PENDING state, and a storage directory named after
    the id is created under ``LOCAL_STORAGE_PATH``.
    """
    new_id = str(uuid.uuid4())
    pending = SessionStatus.PENDING.value
    created = datetime.utcnow()

    # Register the record first, then create its directory on disk.
    _sessions[new_id] = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": pending,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": created,
    }
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    return SessionResponse(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=pending,
        vocabulary_count=0,
        image_path=None,
        created_at=created,
    )
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first, capped at ``limit``.

    Sessions are ordered by the string form of ``created_at``. Records
    created through the API hold a ``datetime`` there, while records loaded
    from the database hold a string; comparing the two directly raises
    ``TypeError``, so the sort key is normalized to ``str``.
    """
    newest_first = sorted(
        _sessions.values(),
        # Date-time text sorts chronologically as plain strings.
        key=lambda record: str(record.get("created_at") or ""),
        reverse=True,
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the stored metadata of a single session.

    Raises:
        HTTPException: 404 if no session with this id is cached.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Optional fields fall back to None / 0 when the record lacks them.
    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Accept a textbook page (PNG, JPG, JPEG or PDF) and extract its vocabulary.

    The file type is accepted if either the filename extension or the
    declared content type matches. For a PDF, page 0 is converted to an
    image first. The image is written to the session directory as
    ``source.<ext>``, then passed to ``extract_vocabulary_from_image``; the
    session record is updated with the extracted vocabulary, its count and
    the extraction confidence.

    Returns:
        A summary dict; an ``error`` key is included when the extractor
        reported one.

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported
            file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = _sessions.get(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Classify the upload by extension and by declared content type.
    extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in image_extensions or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Extension used for the stored file.
    if is_pdf:
        save_extension = 'png'  # PDFs are converted to PNG before saving
    elif extension in image_extensions:
        save_extension = extension
    else:
        # Recognised by content type only.
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the image next to the session's other files.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as out:
        out.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)

    extracted = SessionStatus.EXTRACTED.value
    entry_count = len(vocabulary)
    session["vocabulary"] = [item.dict() for item in vocabulary]
    session["vocabulary_count"] = entry_count
    session["extraction_confidence"] = confidence
    session["status"] = extracted

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": entry_count,
        "extraction_confidence": confidence,
        "status": extracted,
    }
    if error:
        result["error"] = error
    return result
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary currently stored on a session.

    Stored entries are plain dicts; each is rebuilt into a
    ``VocabularyEntry`` for the response.

    Raises:
        HTTPException: 404 if the session is unknown.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    stored = record.get("vocabulary", [])
    return VocabularyResponse(
        session_id=session_id,
        vocabulary=[VocabularyEntry(**item) for item in stored],
        extraction_confidence=record.get("extraction_confidence"),
    )
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with manually corrected entries.

    The submitted entries overwrite the stored list as plain dicts, and the
    stored count is updated to match.

    Raises:
        HTTPException: 404 if the session is unknown.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    new_count = len(update.vocabulary)
    record["vocabulary"] = [item.dict() for item in update.vocabulary]
    record["vocabulary_count"] = new_count

    return {
        "session_id": session_id,
        "vocabulary_count": new_count,
        "message": "Vocabulary updated successfully",
    }
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from a session's vocabulary.

    One HTML section is rendered per requested worksheet type, the sections
    are joined with page breaks and converted to a single PDF stored in the
    session directory. When ``request.include_solutions`` is set, a second
    PDF with solutions shown is produced the same way. The worksheet is
    registered in ``_worksheets`` and the session is marked COMPLETED.

    Raises:
        HTTPException: 404 unknown session; 400 no vocabulary; 500 if
            rendering either PDF fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]
    page_break = '<div style="page-break-after: always;"></div>'

    def _render_html(show_solutions: bool, title_suffix: str) -> str:
        # One section per worksheet type, each followed by a page break.
        return "".join(
            generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value}{title_suffix}",
                show_solutions=show_solutions,
                repetitions=request.repetitions,
                line_height=request.line_height,
            ) + page_break
            for wtype in request.worksheet_types
        )

    async def _render_pdf(html: str, label: str):
        # Both PDFs report rendering failures the same way (HTTP 500).
        try:
            return await generate_worksheet_pdf(html)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"{label} generation failed: {e}") from e

    pdf_bytes = await _render_pdf(_render_html(False, ""), "PDF")

    # Sessions loaded from the database never had their directory created,
    # so make sure it exists before writing into it.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested
    solution_path = None
    if request.include_solutions:
        solution_bytes = await _render_pdf(_render_html(True, " (Loesung)"), "Solution PDF")
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    # Update session status
    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send a generated worksheet PDF as a file download.

    Raises:
        HTTPException: 404 if the worksheet is unknown or its PDF file no
            longer exists on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    path = record["pdf_path"]
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and streamed from there.
    with open(path, 'rb') as src:
        payload = src.read()

    download_headers = {"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers=download_headers,
    )
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send a worksheet's solution PDF as a file download.

    Raises:
        HTTPException: 404 if the worksheet is unknown, has no solution
            path recorded, or the file is missing on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    path = record.get("solution_path")
    has_file = bool(path) and os.path.exists(path)
    if not has_file:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(path, 'rb') as src:
        payload = src.read()

    download_headers = {"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers=download_headers,
    )
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Serve the source image that was stored for a session.

    The media type is derived from the stored file's extension (png, jpg or
    jpeg); anything else is served as ``application/octet-stream``.

    Raises:
        HTTPException: 404 if the session is unknown, has no image path, or
            the file is missing on disk.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    path = record.get("image_path")
    if not (path and os.path.exists(path)):
        raise HTTPException(status_code=404, detail="Image not found")

    # Map the file extension to a media type.
    suffix = path.rsplit('.', 1)[-1].lower()
    if suffix == 'png':
        media_type = 'image/png'
    elif suffix in ('jpg', 'jpeg'):
        media_type = 'image/jpeg'
    else:
        media_type = 'application/octet-stream'

    with open(path, 'rb') as src:
        payload = src.read()

    return StreamingResponse(
        io.BytesIO(payload),
        media_type=media_type,
    )
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a session, its files on disk and its worksheet records.

    The session directory is deleted first (if it exists), then the session
    is dropped from ``_sessions`` and every worksheet record that points to
    it is dropped from ``_worksheets``.

    Raises:
        HTTPException: 404 if the session is unknown.
    """
    import shutil

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Files on disk go first.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        shutil.rmtree(session_dir)

    del _sessions[session_id]

    # Collect the ids before deleting so the dict is not mutated mid-iteration.
    orphaned = [wid for wid, ws in _worksheets.items() if ws["session_id"] == session_id]
    for wid in orphaned:
        del _worksheets[wid]

    return {"message": "Session deleted successfully", "session_id": session_id}
 # --- Include sub-routers ---
 # NOTE(review): these imports sit at the bottom of the module, after
 # _sessions and the routes above are defined — presumably because the
 # sub-modules import from this module (compare_api reads _sessions lazily);
 # confirm before moving them to the top of the file.
 from .upload_api import upload_router
 from .analysis_api import analysis_router
 router.include_router(upload_router)
 router.include_router(analysis_router)
@@ -0,0 +1,542 @@
 """
 Vocabulary Worksheet Compare & Grid Analysis API.
 Split from vocab_worksheet_analysis_api.py — contains the two largest
 route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
 """
 from fastapi import APIRouter, HTTPException, Query
 import base64
 import json
 import logging
 import os
 from .extraction import extract_vocabulary_from_image
 # Ollama endpoint; overridable via the OLLAMA_URL environment variable.
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 # Vision model name (env VISION_MODEL); reported as the "model" of the
 # Vision LLM method in the OCR comparison results.
 VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
 def _get_sessions():
    """Return the in-memory session dict defined in the sibling ``api`` module.

    The import happens at call time rather than at module import time —
    presumably to avoid a circular import with ``api``; confirm before
    hoisting it to the top of the file.
    """
    from .api import _sessions
    return _sessions
 from .generation import convert_pdf_page_to_image
 # Try to import Tesseract extractor
 try:
    from tesseract_vocab_extractor import (
        run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
 except ImportError:
    TESSERACT_AVAILABLE = False
 # Try to import CV Pipeline
 try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
 except ImportError:
    CV_PIPELINE_AVAILABLE = False
 # Try to import Grid Detection Service
 try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
 except ImportError:
    GRID_SERVICE_AVAILABLE = False
 logger = logging.getLogger(__name__)
 compare_router = APIRouter()
 # =============================================================================
 # OCR Compare & Grid Analysis Endpoints
 # =============================================================================
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
 async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run multiple OCR methods on a page and compare results.
    This endpoint:
    1. Gets the page image from the session's uploaded PDF
    2. Runs Vision LLM extraction (primary method)
    3. Optionally runs Tesseract extraction
    4. Compares found vocabulary across methods
    5. Returns structured comparison results
    page_number is 0-indexed.
    """
    import time
    logger.info(f"Compare OCR for session {session_id}, page {page_number}")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    methods_results = {}
    all_vocab_sets = {}
    # --- Method: Vision LLM ---
    try:
        start = time.time()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.time() - start
        vocab_list = []
        for v in vocab:
            entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
            vocab_list.append({
                "english": entry.get("english", ""),
                "german": entry.get("german", ""),
                "example": entry.get("example_sentence", ""),
            })
        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": 0,
            "vocabulary_count": 0,
            "vocabulary": [],
            "confidence": 0,
            "success": False,
            "error": str(e),
        }
        all_vocab_sets["vision_llm"] = set()
    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.time()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.time() - start
            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])
            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result
            vocab_list_tess = []
            for v in tess_vocab:
                vocab_list_tess.append({
                    "english": v.get("english", ""),
                    "german": v.get("german", ""),
                    "example": v.get("example", ""),
                })
            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = {
                (v["english"].lower().strip(), v["german"].lower().strip())
                for v in vocab_list_tess if v["english"] and v["german"]
            }
            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr",
                "duration_seconds": 0,
                "vocabulary_count": 0,
                "vocabulary": [],
                "confidence": 0,
                "success": False,
                "error": str(e),
            }
            all_vocab_sets["tesseract"] = set()
    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.time()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.time() - start
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = []
            for v in cv_vocab:
                vocab_list_cv.append({
                    "english": v.get("english", ""),
                    "german": v.get("german", ""),
                    "example": v.get("example", ""),
                })
            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = {
                (v["english"].lower().strip(), v["german"].lower().strip())
                for v in vocab_list_cv if v["english"] and v["german"]
            }
        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": 0,
                "vocabulary_count": 0,
                "vocabulary": [],
                "confidence": 0,
                "success": False,
                "error": str(e),
            }
            all_vocab_sets["cv_pipeline"] = set()
    # --- Build comparison ---
    all_unique = set()
    for vs in all_vocab_sets.values():
        all_unique |= vs
    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)
    total_methods = max(len(all_vocab_sets), 1)
    agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
    # Find best method
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
 async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Two strategies are tried in order:

    1. Tesseract + GridDetectionService: used when ``use_tesseract`` is true and
       both TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE are set. The per-page
       Tesseract result is taken from the session cache if present, otherwise
       Tesseract is run here and the result is cached in the session.
    2. Vision LLM (Ollama chat API): used when strategy 1 is disabled, yields
       no usable regions, or raises. The model is asked for a JSON description
       of the table, which is then laid out on a uniform percentage grid.

    Args:
        session_id: Key into the session store returned by ``_get_sessions()``.
        page_number: 0-indexed page of the session's uploaded PDF.
        use_tesseract: Set to false to skip strategy 1 entirely.

    Returns:
        ``{"success": True, "grid": {...}}`` on success, where ``grid["source"]``
        is ``"tesseract+grid_service"`` or ``"vision_llm"``. Failures of the
        LLM strategy are reported as ``{"success": False, "error": "..."}``
        rather than raised. According to the original author the grid dict is
        the GridData structure consumed by the frontend GridOverlay component.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if the session has no
            PDF or ``page_number`` is outside ``[0, pdf_page_count)``.
    """
    import httpx
    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    # A missing "pdf_page_count" is treated as a single-page document.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number.")
    # Convert page to image
    # NOTE: this call sits outside every try block, so a rendering failure
    # propagates to the caller instead of becoming a {"success": False} result.
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    # Per-page cache entry; it is written below when Tesseract has to be run here.
    tess_page_data = session.get(f"tesseract_page_{page_number}")
    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            # Run Tesseract if not already cached
            if not tess_page_data:
                logger.info("Running Tesseract for grid analysis (not cached)")
                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
                tess_page_data = await _run_tess(image_data, lang="eng+deu")
                session[f"tesseract_page_{page_number}"] = tess_page_data
                # The three keys below are not page-specific: they always hold
                # the values of the most recently processed page.
                session["tesseract_words"] = tess_page_data.get("words", [])
                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
            tess_words = tess_page_data.get("words", [])
            img_w = tess_page_data.get("image_width", 0)
            img_h = tess_page_data.get("image_height", 0)
            # Words and a non-zero image size are all required; otherwise fall
            # through to the LLM strategy.
            if tess_words and img_w > 0 and img_h > 0:
                service = GridDetectionService()
                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
                if regions:
                    grid_result = service.detect_grid(regions)
                    grid_dict = grid_result.to_dict()
                    # NOTE(review): an earlier comment here announced merging the
                    # LLM text into the grid, but no merge is performed; only the
                    # provenance fields below are added to the service output.
                    grid_dict["source"] = "tesseract+grid_service"
                    grid_dict["word_count"] = len(tess_words)
                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                                f"{grid_result.stats.get('recognized', 0)} recognized")
                    return {"success": True, "grid": grid_dict}
            logger.info("Tesseract data insufficient, falling back to LLM")
        except Exception as e:
            # Best effort: any failure in strategy 1 is logged and the LLM
            # strategy is attempted instead.
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            import traceback
            logger.debug(traceback.format_exc())
    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")
    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
 Your task: Identify the TABLE STRUCTURE and extract each cell's content.
 Return a JSON object with this EXACT structure:
 {
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
 }
 Rules:
 - row and col are 0-indexed
 - column_type is one of: "english", "german", "example", "unknown"
 - Detect whether each column contains English words, German translations, or example sentences
 - Include ALL non-empty cells
 - confidence is 0.0-1.0 based on how clear the text is
 - If a cell is empty, don't include it
 - Return ONLY the JSON, no other text"""
    try:
        import asyncio
        raw_text = ""
        max_retries = 3
        # Only HTTP 500 is retried (waiting 10s, then 20s). Any other non-200
        # status, or a 500 on the last attempt, is returned as an error result.
        for attempt in range(max_retries):
            # A fresh client is created for every attempt.
            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={
                        "model": VISION_MODEL,
                        "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 8192},
                    },
                    timeout=300.0,
                )
            if response.status_code == 500 and attempt < max_retries - 1:
                wait_time = 10 * (attempt + 1)
                logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(wait_time)
                continue
            elif response.status_code != 200:
                error_detail = response.text[:200] if response.text else "Unknown error"
                return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
            raw_text = response.json().get("message", {}).get("content", "")
            break
        # Parse JSON from response
        import re
        # Greedy match: takes everything from the first "{" to the last "}" so
        # that text surrounding the JSON object is ignored.
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}
        # A malformed object raises here and is reported by the generic
        # exception handler at the bottom of this function.
        grid_raw = json.loads(json_match.group())
        num_rows = grid_raw.get("rows", 0)
        num_cols = grid_raw.get("columns", 0)
        column_types = grid_raw.get("column_types", [])
        entries = grid_raw.get("entries", [])
        if num_rows == 0 or num_cols == 0:
            return {"success": False, "error": "No grid structure detected"}
        # Ensure column_types has the right length
        while len(column_types) < num_cols:
            column_types.append("unknown")
        # Build cell grid with percentage-based coordinates
        # The LLM returns no geometry, so every cell gets the same size: the
        # page is divided evenly into num_rows x num_cols cells.
        row_height = 100.0 / num_rows
        col_width = 100.0 / num_cols
        # Track which cells have content
        # Later entries with the same (row, col) overwrite earlier ones.
        cell_map = {}
        for entry in entries:
            r = entry.get("row", 0)
            c = entry.get("col", 0)
            cell_map[(r, c)] = entry
        cells = []
        recognized_count = 0
        empty_count = 0
        problematic_count = 0
        for r in range(num_rows):
            row_cells = []
            for c in range(num_cols):
                x = c * col_width
                y = r * row_height
                if (r, c) in cell_map:
                    entry = cell_map[(r, c)]
                    # NOTE(review): a JSON null for "text" makes .strip() raise
                    # AttributeError, which fails the whole analysis via the
                    # generic handler below.
                    text = entry.get("text", "").strip()
                    conf = entry.get("confidence", 0.8)
                    col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
                    if text:
                        # Cells with text count as "problematic" below 0.5 confidence.
                        status = "recognized" if conf >= 0.5 else "problematic"
                        if status == "recognized":
                            recognized_count += 1
                        else:
                            problematic_count += 1
                    else:
                        status = "empty"
                        empty_count += 1
                else:
                    # No entry reported for this position: emit an empty placeholder cell.
                    text = ""
                    conf = 0.0
                    col_type = column_types[c] if c < len(column_types) else "unknown"
                    status = "empty"
                    empty_count += 1
                row_cells.append({
                    "row": r,
                    "col": c,
                    "x": round(x, 2),
                    "y": round(y, 2),
                    "width": round(col_width, 2),
                    "height": round(row_height, 2),
                    "text": text,
                    "confidence": conf,
                    "status": status,
                    "column_type": col_type,
                })
            cells.append(row_cells)
        total = num_rows * num_cols
        # Coverage = share of cells that contain any text (recognized or problematic).
        coverage = (recognized_count + problematic_count) / max(total, 1)
        # Column and row boundaries as percentages
        col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
        row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
        grid_data = {
            "rows": num_rows,
            "columns": num_cols,
            "cells": cells,
            "column_types": column_types,
            "column_boundaries": col_boundaries,
            "row_boundaries": row_boundaries,
            "deskew_angle": 0.0,
            "source": "vision_llm",
            "stats": {
                "recognized": recognized_count,
                "problematic": problematic_count,
                "empty": empty_count,
                "manual": 0,
                "total": total,
                "coverage": round(coverage, 3),
            },
        }
        return {"success": True, "grid": grid_data}
    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        # Catch-all for the LLM strategy (bad JSON, unexpected field types,
        # connection errors): reported to the client instead of raised.
        logger.error(f"Grid analysis failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
@@ -0,0 +1,325 @@
 """Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
 Contains:
 - VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
 - extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
 - _get_demo_vocabulary(): Demo data for testing
 - parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
 """
 import base64
 import json
 import logging
 import os
 import re
 import uuid
 from typing import List
 import httpx
 from .models import VocabularyEntry
 logger = logging.getLogger(__name__)
 # Ollama Configuration
 # Both settings are read from the environment once, at import time; the
 # literals are only the fallbacks used when the variable is not set.
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
 # =============================================================================
 # Vision LLM Vocabulary Extraction
 # =============================================================================
 # German-language instruction sent as the user message together with the page
 # image. It asks the model for a single JSON object of the form
 # {"vocabulary": [{"english", "german", "example", optional "word_type"}]},
 # with "example" set to null when a row has no example sentence.
 VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
 AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
 {
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
 }
 REGELN:
 1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
 2. Behalte die exakte Schreibweise bei
 3. Bei fehlenden Beispielsaetzen: "example": null
 4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
 5. Gib NUR valides JSON zurueck, keine Erklaerungen
 6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
 Beispiel-Output:
 {
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
 }"""
 async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
 ) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary entries from a single page image.

    The Vision LLM path is the default. The hybrid PaddleOCR + LLM path is
    only attempted when ``use_hybrid`` is true, and every failure or empty
    result of that path falls through to the Vision LLM.

    Args:
        image_data: Raw image bytes of the page.
        filename: Original filename, used in log messages only.
        page_number: 0-indexed page number; reported 1-indexed in error
            messages and in ``source_page`` of the returned entries.
        use_hybrid: Try the hybrid extractor before the Vision LLM.

    Returns:
        Tuple ``(vocabulary_entries, confidence, error_message)``. On success
        ``error_message`` is the empty string; on failure the list is empty,
        the confidence is 0.0 and the message is a German, user-facing text.
    """
    import traceback

    page_label = f"Seite {page_number + 1}"

    # ----------------------------------------------------------------------
    # Optional first attempt: hybrid extractor (PaddleOCR + LLM gateway)
    # ----------------------------------------------------------------------
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
            raw_entries, hybrid_confidence, hybrid_error = await extract_vocabulary_hybrid(image_data, page_number)
            if hybrid_error:
                # Reported problem: continue with the Vision LLM below.
                logger.warning(f"Hybrid extraction had issues: {hybrid_error}")
            elif raw_entries:
                converted = []
                for item in raw_entries:
                    # Entries lacking either language are dropped.
                    if not (item.get("english") and item.get("german")):
                        continue
                    converted.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=item.get("english", ""),
                            german=item.get("german", ""),
                            example_sentence=item.get("example"),
                            source_page=page_number + 1
                        )
                    )
                logger.info(f"Hybrid extraction: {len(converted)} entries from {filename}")
                return converted, hybrid_confidence, ""
        except ImportError as exc:
            logger.warning(f"Hybrid extractor not available: {exc}. Falling back to Vision LLM.")
        except Exception as exc:
            logger.warning(f"Hybrid extraction failed: {exc}. Falling back to Vision LLM.")
            logger.debug(traceback.format_exc())

    # ----------------------------------------------------------------------
    # Vision LLM via the Ollama chat API (model taken from VISION_MODEL)
    # ----------------------------------------------------------------------
    logger.info(f"Using VISION LLM extraction for {filename}")
    try:
        # Cheap reachability probe before the expensive vision request.
        async with httpx.AsyncClient(timeout=10.0) as probe:
            try:
                tags_response = await probe.get(f"{OLLAMA_URL}/api/tags")
                if tags_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"{page_label}: Ollama nicht verfuegbar"
            except Exception as exc:
                logger.error(f"Ollama health check failed: {exc}")
                return [], 0.0, f"{page_label}: Verbindung zu Ollama fehlgeschlagen"

        user_message = {
            "role": "user",
            "content": VOCAB_EXTRACTION_PROMPT,
            "images": [base64.b64encode(image_data).decode("utf-8")],
        }
        request_body = {
            "model": VISION_MODEL,
            "messages": [user_message],
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 4096},
        }
        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # The per-request timeout (5 minutes per page) is passed explicitly
        # in addition to the client-level one.
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=request_body,
                timeout=300.0
            )
            response.raise_for_status()
            reply = response.json()
            extracted_text = reply.get("message", {}).get("content", "")
        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        vocabulary = parse_vocabulary_json(extracted_text)
        one_based_page = page_number + 1
        for item in vocabulary:
            item.source_page = one_based_page

        # Fixed heuristic: any result counts as high confidence, none as low.
        if len(vocabulary) > 0:
            confidence = 0.85
        else:
            confidence = 0.1
        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
        return vocabulary, confidence, ""
    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"{page_label}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as exc:
        logger.error(f"Vocabulary extraction failed for {filename}: {exc}")
        logger.error(traceback.format_exc())
        return [], 0.0, f"{page_label}: Fehler - {str(exc)[:50]}"
 def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return demo vocabulary for testing when Vision LLM is not available."""
    demo_entries = [
        {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
        {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
        {"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
        {"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
        {"english": "success", "german": "Erfolg", "example": "The project was a success."},
        {"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
        {"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
        {"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
    ]
    return [
        VocabularyEntry(
            id=str(uuid.uuid4()),
            english=e["english"],
            german=e["german"],
            example_sentence=e.get("example"),
        )
        for e in demo_entries
    ]
 def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of an LLM response.

    The response may contain text around the JSON object and the JSON itself
    may be slightly malformed, so four strategies are tried in order:

    1. strict ``json.loads`` on the span from the first ``{`` to the last ``}``;
    2. the same after removing stray control characters, parsed with
       ``strict=False`` so raw newlines/tabs inside string values are accepted;
    3. additionally removing trailing commas and quoting bare object keys;
    4. a regex scan of the whole text for ``{"english": ..., "german": ...}``
       fragments (this last strategy cannot recover ``word_type``).

    Entries whose ``english`` or ``german`` value is missing, ``null``, not a
    string or blank are skipped individually, as are entries with implausibly
    long values (more than 100 / 200 characters).

    Args:
        text: Raw text returned by the model.

    Returns:
        The parsed entries, each with a new random id. An empty list is
        returned when nothing can be extracted; this function does not raise.
    """
    # Control characters other than tab, newline and carriage return.
    stray_control_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

    def as_text(value) -> str:
        """Return a stripped string; anything that is not a string becomes ''."""
        return value.strip() if isinstance(value, str) else ""

    def try_parse_json(json_str: str):
        """Return the parsed object, or None when strategies 1-3 all fail."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # Strategy 2: drop stray control characters and let the decoder accept
        # raw newlines/tabs inside strings. Escaping every newline by hand
        # would also escape the structural ones between tokens and break
        # any pretty-printed response.
        cleaned = stray_control_chars.sub('', json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass
        # Strategy 3: repair common syntax slips on top of the cleaned text
        # Remove trailing commas before } or ]
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        # Quote bare keys. This is a heuristic and can also hit ", word:"
        # inside a string value; in that case parsing fails again and the
        # regex fallback takes over.
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            return None

    try:
        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []
        data = try_parse_json(text[start:end])

        if not isinstance(data, dict):
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            vocabulary = []
            for raw_english, raw_german, raw_example in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                english = raw_english.strip()
                german = raw_german.strip()
                if not english or not german:
                    continue
                vocabulary.append(
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        # An unmatched optional group is '', reported as None.
                        example_sentence=raw_example.strip() if raw_example else None,
                    )
                )
            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        raw_entries = data.get("vocabulary")
        if not isinstance(raw_entries, list):
            # Missing, null or wrongly typed "vocabulary" field.
            return []
        vocabulary = []
        for entry in raw_entries:
            if not isinstance(entry, dict):
                continue
            # One null or non-string field must only drop this entry, not
            # abort parsing of the whole page.
            english = as_text(entry.get("english"))
            german = as_text(entry.get("german"))
            # Skip entries that look like hallucinations (very long values)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue
            if not english or not german:
                continue
            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    example_sentence=entry.get("example"),
                    word_type=entry.get("word_type"),
                )
            )
        return vocabulary
    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
@@ -0,0 +1,258 @@
 """
 Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
 Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
 Functions:
  - generate_worksheet_html(): Build HTML for various worksheet types
  - generate_worksheet_pdf():  Convert HTML to PDF via WeasyPrint
  - get_pdf_page_count():      Count pages in a PDF (PyMuPDF)
  - convert_pdf_page_to_image(): Render single PDF page to PNG
  - convert_pdf_to_images():     Render multiple PDF pages to PNG
 """
 import logging
 from typing import List
 from fastapi import HTTPException
 from .models import VocabularyEntry, WorksheetType
 logger = logging.getLogger(__name__)
 # Optional dependency: WeasyPrint
 # The import is attempted once at module load and the outcome is recorded in
 # WEASYPRINT_AVAILABLE. OSError is caught in addition to ImportError
 # (presumably for installations whose native libraries are missing -- confirm).
 try:
    from weasyprint import HTML as _WeasyHTML
    WEASYPRINT_AVAILABLE = True
 except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")
 # Optional dependency: PyMuPDF
 # Same pattern: FITZ_AVAILABLE records whether the import succeeded.
 try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
 except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")
 # =============================================================================
 # Worksheet HTML Generation
 # =============================================================================
 def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
 ) -> str:
    """Generate the HTML document for one worksheet.

    All text taken from ``title`` and from the vocabulary entries is
    HTML-escaped before it is inserted, because it originates from user input
    and OCR/LLM output and the result is handed to an HTML renderer.

    Args:
        vocabulary: Entries to put on the sheet.
        worksheet_type: Layout to produce. EN_TO_DE / DE_TO_EN render a
            two-column translation table, COPY_PRACTICE a table with writing
            space, GAP_FILL numbered example sentences with the word blanked
            out. Any other value yields a sheet with header only.
        title: Heading of the sheet.
        show_solutions: Fill in the answers instead of leaving blanks.
        repetitions: Number of copies of the word shown per row in
            COPY_PRACTICE sheets when ``show_solutions`` is true.
        line_height: "normal", "large" or "extra-large"; unknown values fall
            back to the "normal" spacing.

    Returns:
        A complete HTML document as a string.
    """
    import re
    from html import escape

    def esc(value) -> str:
        """Escape a value for use as HTML text (non-strings are stringified)."""
        return escape(str(value))

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")
    parts = [f"""<!DOCTYPE html>
 <html>
 <head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
 </head>
 <body>
    <h1>{esc(title)}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
 """]

    def translation_table(heading: str, prompt_attr: str, answer_attr: str) -> None:
        """Append a two-column table: prompt word and blank (or solution)."""
        parts.append(f'<div class="section"><div class="section-title">{heading}</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            prompt = esc(getattr(entry, prompt_attr))
            if show_solutions:
                answer = esc(getattr(entry, answer_attr))
                parts.append(f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-answer">{answer}</td></tr>')
            else:
                parts.append(f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-blank"></td></tr>')
        parts.append('</table></div>')

    if worksheet_type == WorksheetType.EN_TO_DE:
        translation_table('Uebersetze ins Deutsche:', 'english', 'german')
    elif worksheet_type == WorksheetType.DE_TO_EN:
        translation_table('Uebersetze ins Englische:', 'german', 'english')
    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            word_html = esc(entry.english)
            parts.append(f'<tr><td class="vocab-word">{word_html}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {word_html} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')
    elif worksheet_type == WorksheetType.GAP_FILL:
        gap_markup = '<span class="gap"></span>'
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                sentence = entry.example_sentence
                gap_sentence = esc(sentence)
                # Blank out the first word of the English term that occurs in
                # the sentence (case-insensitive substring match).
                for word in entry.english.split():
                    if word.lower() in sentence.lower():
                        # Split the raw sentence in a single pass, escape the
                        # pieces and join them with the gap markup. Repeated
                        # str.replace calls could rewrite the inserted
                        # markup itself (e.g. for the word "class").
                        pieces = re.split(re.escape(word), sentence, flags=re.IGNORECASE)
                        gap_sentence = gap_markup.join(esc(piece) for piece in pieces)
                        break
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {esc(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({esc(entry.german)})</p>')
            parts.append('</div>')
    parts.append('</body></html>')
    return ''.join(parts)
 # =============================================================================
 # Worksheet PDF Generation
 # =============================================================================
 async def generate_worksheet_pdf(html: str) -> bytes:
    """Render an HTML worksheet to PDF bytes via WeasyPrint.

    Args:
        html: Complete HTML document to render.

    Returns:
        The rendered PDF. If an ImportError occurs (WeasyPrint not
        installed), the HTML itself encoded as UTF-8 is returned instead.

    Raises:
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        # Imported lazily so the module loads without WeasyPrint installed.
        from weasyprint import HTML as WeasyDocument
        document = WeasyDocument(string=html)
        return document.write_pdf()
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        fallback_bytes = html.encode('utf-8')
        return fallback_bytes
    except Exception as exc:
        logger.error(f"PDF generation failed: {exc}")
        raise
 # =============================================================================
 # PDF Utilities (PyMuPDF)
 # =============================================================================
 def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return the number of pages in a PDF, or 0 if it cannot be read.

    Args:
        pdf_data: PDF file content as bytes.

    Returns:
        The page count reported by PyMuPDF. Any failure (including PyMuPDF
        not being importable) is logged and reported as 0.
    """
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            return pdf_document.page_count
        finally:
            # Always release the document, even if reading the count fails.
            pdf_document.close()
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
 async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: 500 if PyMuPDF cannot be imported; 400 for any other
            failure (empty PDF, page out of range, rendering error).
    """
    try:
        import fitz  # PyMuPDF
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")
            # NOTE(review): negative page numbers are not rejected here;
            # PyMuPDF may interpret them as counting from the end — confirm
            # whether they should be refused.
            if page_number >= pdf_document.page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")
            page = pdf_document[page_number]
            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close the document on every path, including the validation
            # errors raised above (previously these leaked the document).
            pdf_document.close()
        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data
    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
 async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.

    Returns:
        One PNG byte string per converted page, in the order requested.
        Page numbers at or beyond the page count are skipped silently.

    Raises:
        HTTPException: 500 if PyMuPDF cannot be imported; 400 for any other
            failure (empty PDF, rendering error).
    """
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")
            # If no pages specified, convert all
            if pages is None:
                pages = list(range(pdf_document.page_count))
            images = []
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)
            for page_num in pages:
                # NOTE(review): negative page numbers pass this check —
                # confirm whether that is intended.
                if page_num < pdf_document.page_count:
                    page = pdf_document[page_num]
                    pix = page.get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
        finally:
            # Close the document on every path so a failed render or the
            # empty-PDF error above does not leak it.
            pdf_document.close()
        logger.info(f"Converted {len(images)} PDF pages to images")
        return images
    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
@@ -0,0 +1,86 @@
 """Pydantic models and enums for the Vocab Worksheet API."""
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
 from pydantic import BaseModel
 # =============================================================================
 # Enums
 # =============================================================================
 class WorksheetType(str, Enum):
    # Exercise layouts a generated worksheet can contain.
    # The ``str`` mixin makes every member a string equal to its value,
    # so members can be compared with and serialized as plain strings.
    EN_TO_DE = "en_to_de"          # English -> German translation
    DE_TO_EN = "de_to_en"          # German -> English translation
    COPY_PRACTICE = "copy"         # Write word multiple times
    GAP_FILL = "gap_fill"          # Fill in the blanks
    COMBINED = "combined"          # All types combined
 class SessionStatus(str, Enum):
    # Lifecycle states of a vocab worksheet session (string-valued enum).
    # NOTE(review): the upload endpoint stores the raw status string
    # "pdf_uploaded", which has no member here — confirm whether one
    # should be added.
    PENDING = "pending"            # Session created, no upload yet
    PROCESSING = "processing"      # OCR in progress
    EXTRACTED = "extracted"        # Vocabulary extracted, ready to edit
    COMPLETED = "completed"        # Worksheet generated
 # =============================================================================
 # Pydantic Models
 # =============================================================================
 class VocabularyEntry(BaseModel):
    # One vocabulary item: an English/German word pair plus optional
    # context. Only ``id``, ``english`` and ``german`` are required; every
    # other field defaults to None.
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)
 class SessionCreate(BaseModel):
    # Request payload for creating a session. Only ``name`` is required;
    # the language pair defaults to English -> German.
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)
 class SessionResponse(BaseModel):
    # Response payload describing a session. ``status`` is typed as a plain
    # string here rather than as ``SessionStatus``.
    # ``description`` and ``image_path`` may be None but declare no default.
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime
 class VocabularyResponse(BaseModel):
    # Response payload carrying the vocabulary list of one session.
    # ``extraction_confidence`` may be None but declares no default.
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]
 class VocabularyUpdate(BaseModel):
    # Request payload carrying a complete list of vocabulary entries.
    vocabulary: List[VocabularyEntry]
 class WorksheetGenerateRequest(BaseModel):
    # Request payload for worksheet generation. Only ``worksheet_types`` is
    # required; all other options have defaults.
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    repetitions: int = 3  # For copy practice
    line_height: str = "normal"  # normal, large, extra-large
 class WorksheetResponse(BaseModel):
    # Response payload describing a generated worksheet.
    # ``worksheet_types`` holds plain strings here, not ``WorksheetType``
    # members. ``solution_path`` may be None but declares no default.
    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime
@@ -0,0 +1,481 @@
 """
 Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
 Extracted from vocab_worksheet_api.py to keep file sizes manageable.
 Pipeline steps:
  orientation → deskew → dewarp → crop → scan-quality → enhance →
  dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
  vocab extraction → row merging
 """
 import logging
 import uuid
 from typing import Optional
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Optional heavy dependencies (not available in every environment)
 # ---------------------------------------------------------------------------
 try:
    import cv2
    import numpy as np
 except ImportError:
    cv2 = None  # type: ignore[assignment]
    np = None  # type: ignore[assignment]
    logger.warning("cv2 / numpy not available — OCR pipeline disabled")
 try:
    from PIL import Image
 except ImportError:
    Image = None  # type: ignore[assignment]
 try:
    import pytesseract
 except ImportError:
    pytesseract = None  # type: ignore[assignment]
 # CV pipeline helpers
 try:
    from cv_vocab_pipeline import (
        deskew_two_pass,
        dewarp_image,
        detect_and_fix_orientation,
        _cells_to_vocab_entries,
        _fix_phonetic_brackets,
    )
 except ImportError:
    deskew_two_pass = None  # type: ignore[assignment]
    dewarp_image = None  # type: ignore[assignment]
    detect_and_fix_orientation = None  # type: ignore[assignment]
    _cells_to_vocab_entries = None  # type: ignore[assignment]
    _fix_phonetic_brackets = None  # type: ignore[assignment]
 try:
    from cv_cell_grid import (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    )
 except ImportError:
    _merge_wrapped_rows = None  # type: ignore[assignment]
    _merge_phonetic_continuation_rows = None  # type: ignore[assignment]
    _merge_continuation_rows = None  # type: ignore[assignment]
 try:
    from cv_ocr_engines import ocr_region_rapid
 except ImportError:
    ocr_region_rapid = None  # type: ignore[assignment]
 try:
    from cv_vocab_types import PageRegion
 except ImportError:
    PageRegion = None  # type: ignore[assignment]
 try:
    from ocr_pipeline_ocr_merge import (
        _split_paddle_multi_words,
        _merge_paddle_tesseract,
        _deduplicate_words,
    )
 except ImportError:
    _split_paddle_multi_words = None  # type: ignore[assignment]
    _merge_paddle_tesseract = None  # type: ignore[assignment]
    _deduplicate_words = None  # type: ignore[assignment]
 try:
    from cv_words_first import build_grid_from_words
 except ImportError:
    build_grid_from_words = None  # type: ignore[assignment]
 try:
    from ocr_pipeline_session_store import (
        create_session_db as create_pipeline_session_db,
        update_session_db as update_pipeline_session_db,
    )
 except ImportError:
    create_pipeline_session_db = None  # type: ignore[assignment]
    update_pipeline_session_db = None  # type: ignore[assignment]
 # ---------------------------------------------------------------------------
 # Main pipeline function
 # ---------------------------------------------------------------------------
 async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
 ) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID; its first 8 characters are used
            in the name of the pipeline session that is created.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        enable_enhance: If True, run image enhancement when the scan-quality
            report marks the page as degraded.
        max_columns: Passed through to ``build_grid_from_words``.
        override_min_conf: If truthy, used as the minimum Tesseract word
            confidence instead of the scan-quality recommendation (or 40).

    Returns:
        A 3-tuple ``(entries, rotation_deg, scan_quality_report)`` where
        entries is a list of dicts, rotation_deg is the orientation
        correction applied (0, 90, 180, 270) and scan_quality_report is the
        result of ``score_scan_quality`` or None if that step failed.
    """
    import time as _time
    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f"  orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f"  orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Deskew (deskew_two_pass)
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    logger.info(f"  deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f"  dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f"  crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f"  crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f"  crop: failed ({e}), continuing with uncropped image")

    # 5b. Scan quality assessment
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f"  scan quality: failed ({e})")
    # NOTE(review): this is a truthiness test, so an explicit override of 0
    # is treated like "no override" — confirm whether `is not None` is meant.
    if override_min_conf:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40

    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info("  enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f"  enhancement: failed ({e})")

    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]
    # RapidOCR (local ONNX); a failure here degrades to Tesseract-only input
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f"  RapidOCR failed: {e}")
        rapid_words = []
    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        # Anything that is not a plain (optionally negative) integer string
        # is mapped to -1 and is therefore filtered out below.
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })
    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only
    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f"  ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")

    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }
    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        # dewarped_png and cropped_png are both encoded from the same array
        # (dewarped_bgr already holds the cropped image), so encode once.
        dewarped_png_bytes = dwp_buf.tobytes()
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dewarped_png_bytes,
            cropped_png=dewarped_png_bytes,
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f"  grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")
        # Save grid result to pipeline session (best effort: a failure here
        # must not discard the grid result, but it is logged, not hidden)
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception as e:
            logger.warning(f"Could not save grid result to pipeline session: {e}")
    except Exception as e:
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None

    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"
    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue
            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue
            # Heuristic: if first column is very narrow (<15% of zone width)
            # and the zone has at least 3 columns, it's likely a marker/ref
            # column — skip it for vocab. Depends only on the zone's columns,
            # so it is computed once per zone rather than once per row.
            first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
            zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
            skip_first = first_col_width / zone_width < 0.15 and len(sorted_cols) >= 3
            # Group cells by row: {row_index: {col_index: text}}
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()
            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = []
                for col in sorted_cols:
                    ci = col.get("col_index", col.get("index", -1))
                    texts.append(row.get(ci, ""))
                if not any(texts):
                    continue
                # Map by position, skipping narrow first column (page refs/markers)
                data_texts = texts[1:] if skip_first else texts
                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)
        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})
        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map2.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"

    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = []
            for v in page_vocabulary:
                internal.append({
                    'row_index': len(internal),
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                })
            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)
            # Only replace the vocabulary (with fresh ids) if rows were merged
            if len(internal) < n_before:
                # Rebuild page_vocabulary from merged entries
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f"  row merging: {n_before} → {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f"  row merging failed (non-critical): {e}")
    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
    return page_vocabulary, rotation, scan_quality_report
@@ -0,0 +1,490 @@
 """
 Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
 Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
 Routes (no prefix — included into the main /api/v1/vocab router):
  POST /sessions/{session_id}/upload-pdf-info
  GET  /sessions/{session_id}/pdf-thumbnail/{page_number}
  GET  /sessions/{session_id}/pdf-page-image/{page_number}
  POST /sessions/{session_id}/process-single-page/{page_number}
  POST /sessions/{session_id}/process-pages
 """
 import io
 import logging
 import os
 import uuid
 from typing import List
 from fastapi import APIRouter, HTTPException, Query, UploadFile, File
 from fastapi.responses import StreamingResponse
 from .models import SessionStatus
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Local storage path
 # ---------------------------------------------------------------------------
 LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
 # ---------------------------------------------------------------------------
 # Optional heavy dependencies
 # ---------------------------------------------------------------------------
 try:
    import numpy as np
    from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
    OCR_PIPELINE_AVAILABLE = True
 except ImportError:
    np = None  # type: ignore[assignment]
    OCR_PIPELINE_AVAILABLE = False
    logger.warning("OCR pipeline imports not available in upload module")
 # Sub-module imports (already split out)
 from .generation import (
    convert_pdf_page_to_image,
    convert_pdf_to_images,
    get_pdf_page_count,
 )
 from .extraction import extract_vocabulary_from_image
 try:
    from .ocr import _run_ocr_pipeline_for_page
 except ImportError:
    _run_ocr_pipeline_for_page = None  # type: ignore[assignment]
    logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
 # ---------------------------------------------------------------------------
 # In-memory session store (shared with main module)
 # ---------------------------------------------------------------------------
 def _get_sessions():
    """Return the ``_sessions`` object defined in the sibling ``api`` module.

    The import runs on every call instead of at module load time —
    presumably to avoid a circular import with ``api``; TODO confirm.
    """
    from .api import _sessions
    return _sessions
 # ---------------------------------------------------------------------------
 # Router (no prefix — will be included into the main vocab router)
 # ---------------------------------------------------------------------------
 upload_router = APIRouter()
 # =============================================================================
 # POST /sessions/{session_id}/upload-pdf-info
 # =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
 async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
 ):
    """
    Store an uploaded PDF for a session and report its page information.

    The PDF is written to ``<LOCAL_STORAGE_PATH>/<session_id>/source.pdf``
    and also kept in the session, together with its page count and the
    per-page orientation corrections, so the user can pick pages before
    processing.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if neither the
            file extension nor the content type identifies a PDF.
    """
    logger.info(f"PDF info request for session {session_id}")
    store = _get_sessions()
    if session_id not in store:
        raise HTTPException(status_code=404, detail="Session not found")
    session = store[session_id]

    # Validate file type: either the extension or the MIME type must say PDF
    uploaded_name = file.filename
    extension = uploaded_name.rsplit('.', 1)[-1].lower() if uploaded_name else ''
    content_type = file.content_type or ''
    looks_like_pdf = extension == 'pdf' or content_type == 'application/pdf'
    if not looks_like_pdf:
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Save PDF temporarily
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(content)

    # Get page count
    page_count = get_pdf_page_count(content)

    # Store PDF data in session for later processing
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Detect orientation for each page so thumbnails are shown correctly;
    # only pages that need a correction get an entry.
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for page_index in range(page_count):
            human_page = page_index + 1
            try:
                rendered = render_pdf_high_res(content, page_index, zoom=2.0)
                _, detected = detect_and_fix_orientation(rendered)
                if detected:
                    page_rotations[page_index] = detected
                    logger.info(f"Page {human_page}: orientation {detected}°")
            except Exception as exc:
                logger.warning(f"Orientation detection failed for page {human_page}: {exc}")
    session["page_rotations"] = page_rotations

    response = {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }
    return response
 # =============================================================================
 # GET /sessions/{session_id}/pdf-thumbnail/{page_number}
 # =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
 async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.
    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.
    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
 # =============================================================================
 # GET /sessions/{session_id}/pdf-page-image/{page_number}
 # =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
 async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).
    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
 # =============================================================================
 # POST /sessions/{session_id}/process-single-page/{page_number}
 # =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
 async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
 ):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
    Falls back to LLM vision extraction when the OCR pipeline is unavailable.
    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)
    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
    The response reports the page as 1-based (``page_number + 1``) although
    the path parameter is 0-indexed. Extraction failures are reported with
    ``success: False`` in the response body rather than as an HTTP error.
    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF is stored
            or the page number is out of range.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    # Derive pipeline-level variable names for the quality report
    # (a value of 0 for max_cols / min_conf means "no limit" / "no override").
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None
    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            # Any pipeline failure becomes a success=False payload; the session is left untouched.
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        # NOTE(review): `confidence` is returned here but never used below.
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        page_vocabulary = []
        for entry in vocabulary:
            # Accept objects exposing .dict(), plain objects, or mappings.
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            # Make sure every entry has an id.
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)
    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
    # Store rotation for this page (used by image/thumbnail endpoints)
    # NOTE(review): in the fallback path this writes 0 over any rotation already stored for the page.
    session.setdefault("page_rotations", {})[page_number] = rotation_deg
    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value
    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        # NOTE(review): fixed value, not derived from the extraction result.
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }
    # Add scan quality report + active steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq
    return result
 # =============================================================================
 # POST /sessions/{session_id}/process-pages  (DEPRECATED)
 # =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
 async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
 ):
    """
    Process specific pages of an uploaded PDF.
    DEPRECATED: Use /process-single-page/{page_number} instead for better results.
    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page
    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)
    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []
    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence
            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)
            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")
    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0
    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value
    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path
    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages
    return result
@@ -1,196 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/learn_bridge.py
-Vocab Learn Bridge — Converts vocabulary session data into Learning Units.
+import importlib as _importlib
-
+import sys as _sys
-Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators).
+_sys.modules[__name__] = _importlib.import_module("vocab.learn_bridge")
 Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation.
 DATENSCHUTZ: All communication stays within Docker network (breakpilot-network).
 """
 import os
 import json
 import logging
 import httpx
 from typing import List, Dict, Any, Optional
 logger = logging.getLogger(__name__)
 BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001")
 def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert vocabulary entries from a vocab session into the analysis_data format
    expected by backend-lehrer generators (MC, Cloze, QA).

    Entries with neither an English nor a German term are left out of
    ``canonical_text`` and ``printed_blocks``; the ``vocabulary`` key still
    carries the input list unchanged. Fields that are missing or ``None``
    are treated as empty strings.

    The generators consume:
    - title: Display name
    - subject: Subject area
    - grade_level: Target grade
    - canonical_text: Full text representation
    - printed_blocks: Individual text blocks
    - vocabulary: Original vocab data (for vocab-specific modules)
    """
    canonical_lines = []
    printed_blocks = []
    for v in vocabulary:
        # `or ""` also covers keys that are present with a None value,
        # where .get(key, "") would return None and .strip() would fail.
        en = (v.get("english") or "").strip()
        de = (v.get("german") or "").strip()
        example = (v.get("example_sentence") or "").strip()
        if not en and not de:
            continue
        line = f"{en} = {de}"
        if example:
            line += f" ({example})"
        canonical_lines.append(line)
        block_text = f"{en} — {de}"
        if example:
            block_text += f" | {example}"
        printed_blocks.append({"text": block_text})
    return {
        "title": session_name,
        "subject": "English Vocabulary",
        "grade_level": "5-8",
        "canonical_text": "\n".join(canonical_lines),
        "printed_blocks": printed_blocks,
        "vocabulary": vocabulary,
    }
 async def create_learning_unit(
    session_name: str,
    vocabulary: List[Dict[str, Any]],
    grade: Optional[str] = None,
 ) -> Dict[str, Any]:
    """
    Create a Learning Unit in backend-lehrer from vocabulary data.

    Steps:
    1. Create unit via POST /api/learning-units/
    2. Write the analysis data to ``<unit_id>_analyse.json`` under
       ``~/Arbeitsblaetter/Lerneinheiten`` for the generators
    3. Return the created unit info

    Args:
        session_name: Used as the unit title.
        vocabulary: Vocabulary entries of the session; must not be empty.
        grade: Grade label sent to backend-lehrer; defaults to "5-8".

    Returns dict with unit_id, unit, analysis_path, vocabulary_count, status.

    Raises:
        ValueError: If ``vocabulary`` is empty.
        RuntimeError: If the HTTP request fails or the response has no ``id``.
    """
    if not vocabulary:
        raise ValueError("No vocabulary entries provided")
    analysis_data = vocab_to_analysis_data(session_name, vocabulary)
    async with httpx.AsyncClient(timeout=30.0) as client:
        # 1. Create Learning Unit
        create_payload = {
            "title": session_name,
            "subject": "Englisch",
            "grade": grade or "5-8",
        }
        try:
            resp = await client.post(
                f"{BACKEND_LEHRER_URL}/api/learning-units/",
                json=create_payload,
            )
            resp.raise_for_status()
            unit = resp.json()
        except httpx.HTTPError as e:
            logger.error(f"Failed to create learning unit: {e}")
            # Chain the cause so the original HTTP error stays in the traceback.
            raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}") from e
        unit_id = unit.get("id")
        if not unit_id:
            raise RuntimeError("Learning Unit created but no ID returned")
        logger.info(f"Created learning unit {unit_id} with {len(vocabulary)} vocabulary entries")
        # 2. Save analysis_data as JSON file for generators
        analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
        os.makedirs(analysis_dir, exist_ok=True)
        analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json")
        with open(analysis_path, "w", encoding="utf-8") as f:
            json.dump(analysis_data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved analysis data to {analysis_path}")
        return {
            "unit_id": unit_id,
            "unit": unit,
            "analysis_path": analysis_path,
            "vocabulary_count": len(vocabulary),
            "status": "created",
        }
 async def _request_module_generation(
    client: "httpx.AsyncClient",
    unit_id: str,
    endpoint: str,
    label: str,
    payload: Dict[str, Any],
 ) -> Dict[str, Any]:
    """POST one generation request to backend-lehrer and map the outcome to a status dict."""
    try:
        resp = await client.post(
            f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/{endpoint}",
            json=payload,
        )
        if resp.status_code == 200:
            return {"status": "generated", "data": resp.json()}
        logger.warning(f"{label} generation returned {resp.status_code}")
        return {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
    except Exception as e:
        # Best effort: one failing module must not abort the others.
        logger.warning(f"{label} generation failed: {e}")
        return {"status": "error", "reason": str(e)}


 async def generate_learning_modules(
    unit_id: str,
    analysis_path: str,
 ) -> Dict[str, Any]:
    """
    Trigger QA, MC, and Cloze generation from analysis data.

    Loads the analysis JSON from ``analysis_path`` and calls the three
    backend-lehrer generation endpoints one after another (QA, then MC,
    then Cloze). A failing request is recorded in the result and does not
    stop the remaining ones.

    Args:
        unit_id: ID of the learning unit the modules belong to.
        analysis_path: Path of the analysis JSON file to send.

    Returns dict with ``unit_id`` and one entry each for ``mc``, ``cloze``
    and ``qa``; every entry has a ``status`` of "generated", "skipped"
    (non-200 response) or "error" (exception).
    """
    results = {
        "unit_id": unit_id,
        "mc": {"status": "pending"},
        "cloze": {"status": "pending"},
        "qa": {"status": "pending"},
    }
    # Load analysis data
    with open(analysis_path, "r", encoding="utf-8") as f:
        analysis_data = json.load(f)
    vocab_count = len(analysis_data.get("vocabulary", []))
    # Generate via backend-lehrer API
    async with httpx.AsyncClient(timeout=120.0) as client:
        # Generate QA (includes Leitner fields)
        results["qa"] = await _request_module_generation(
            client, unit_id, "generate-qa", "QA",
            {"analysis_data": analysis_data, "num_questions": min(vocab_count, 20)},
        )
        # Generate MC
        results["mc"] = await _request_module_generation(
            client, unit_id, "generate-mc", "MC",
            {"analysis_data": analysis_data, "num_questions": min(vocab_count, 10)},
        )
        # Generate Cloze
        results["cloze"] = await _request_module_generation(
            client, unit_id, "generate-cloze", "Cloze",
            {"analysis_data": analysis_data},
        )
    return results
@@ -1,428 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/session_store.py
-Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
+import importlib as _importlib
-
+import sys as _sys
-Replaces in-memory storage with database persistence.
+_sys.modules[__name__] = _importlib.import_module("vocab.session_store")
 See migrations/001_vocab_sessions.sql for schema.
 """
 import os
 import uuid
 import logging
 import json
 from typing import Optional, List, Dict, Any
 from datetime import datetime
 import asyncpg
 logger = logging.getLogger(__name__)
 # Database configuration
 # Read from the DATABASE_URL environment variable.
 # NOTE(review): the fallback URL embeds a username and password in source;
 # confirm it is only meant for local/dev setups.
 DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
 )
 # Connection pool (initialized lazily)
 # Stays None until get_pool() creates it on first use.
 _pool: Optional[asyncpg.Pool] = None
 async def get_pool() -> asyncpg.Pool:
    """Return the module-wide asyncpg pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    # First call: build the pool from DATABASE_URL and remember it.
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
 async def init_vocab_tables():
    """
    Create the vocab tables when ``vocab_sessions`` is missing.

    Runs ``migrations/001_vocab_sessions.sql`` (resolved relative to this
    module) if the table does not exist; logs a warning when the migration
    file cannot be found. Intended to be called at startup.
    """
    db_pool = await get_pool()
    async with db_pool.acquire() as conn:
        # Check if tables exist
        already_present = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)
        if already_present:
            logger.debug("Vocab tables already exist")
            return
        logger.info("Creating vocab tables...")
        # Locate the migration script next to this module
        module_dir = os.path.dirname(__file__)
        migration_path = os.path.join(module_dir, "migrations/001_vocab_sessions.sql")
        if not os.path.exists(migration_path):
            logger.warning(f"Migration file not found: {migration_path}")
            return
        with open(migration_path, "r") as migration_file:
            migration_sql = migration_file.read()
        await conn.execute(migration_sql)
        logger.info("Vocab tables created successfully")
 # =============================================================================
 # SESSION OPERATIONS
 # =============================================================================
 async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
 ) -> Dict[str, Any]:
    """Insert a new row into ``vocab_sessions`` and return it as a dict.

    The row is created with status 'pending' and a vocabulary count of 0.
    ``session_id`` is parsed as a UUID.
    """
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        insert_args = (
            uuid.UUID(session_id),
            name,
            description,
            source_language,
            target_language,
        )
        inserted = await connection.fetchrow("""
            INSERT INTO vocab_sessions (
                id, name, description, source_language, target_language,
                status, vocabulary_count
            ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
            RETURNING *
        """, *insert_args)
        return _row_to_dict(inserted)
 async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one ``vocab_sessions`` row by ID; return None when it does not exist."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        found = await connection.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
        return _row_to_dict(found) if found else None
 async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
 ) -> List[Dict[str, Any]]:
    """Return a page of sessions, newest first, optionally restricted to one status."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        if not status:
            # No (or empty) status: list everything
            records = await connection.fetch("""
                SELECT * FROM vocab_sessions
                ORDER BY created_at DESC
                LIMIT $1 OFFSET $2
            """, limit, offset)
        else:
            records = await connection.fetch("""
                SELECT * FROM vocab_sessions
                WHERE status = $1
                ORDER BY created_at DESC
                LIMIT $2 OFFSET $3
            """, status, limit, offset)
        return list(map(_row_to_dict, records))
 async def update_session_db(
    session_id: str,
    **kwargs
 ) -> Optional[Dict[str, Any]]:
    """Update the whitelisted columns of a session and return the updated row.

    Keyword arguments that are not in the column whitelist are ignored. If
    none remain, the current row is returned unchanged. Values for the JSON
    columns are serialised with ``json.dumps``; an empty list or dict is
    stored as ``[]`` / ``{}`` and only ``None`` is stored as NULL.

    Returns:
        The updated row as a dict, or None when no row has this ID.
    """
    pool = await get_pool()
    # Column names are interpolated into the SQL text, so only names from
    # this fixed whitelist may ever reach the query.
    allowed_fields = {
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
        'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages',
    }
    json_fields = {'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'}
    # Build dynamic UPDATE query
    fields = []
    values = []
    for key, value in kwargs.items():
        if key not in allowed_fields:
            continue
        # Convert dicts/lists to JSON for JSONB columns. Test for None
        # explicitly so that empty containers are not turned into NULL.
        if key in json_fields and value is not None:
            value = json.dumps(value)
        values.append(value)
        fields.append(f"{key} = ${len(values)}")
    if not fields:
        return await get_session_db(session_id)
    values.append(uuid.UUID(session_id))
    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(fields)}
            WHERE id = ${len(values)}
            RETURNING *
        """, *values)
        if row:
            return _row_to_dict(row)
        return None
 async def delete_session_db(session_id: str) -> bool:
    """Delete a session (related data cascades); return True if exactly one row was removed."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        session_uuid = uuid.UUID(session_id)
        command_status = await connection.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, session_uuid)
        # The command status string carries the number of deleted rows
        return command_status == "DELETE 1"
 # =============================================================================
 # VOCABULARY OPERATIONS
 # =============================================================================
 async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
 ) -> List[Dict[str, Any]]:
    """Insert vocabulary entries for a session and refresh its vocabulary count.

    All inserts and the count update run in one transaction, so a failure
    part-way through leaves neither partial entries nor a stale count.

    Args:
        session_id: UUID string of the owning session.
        vocab_list: Entries to insert; each gets a newly generated ID.

    Returns:
        The inserted rows as dicts, in input order (empty list for empty input).
    """
    if not vocab_list:
        return []
    pool = await get_pool()
    results = []
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                INSERT INTO vocab_entries (
                    id, session_id, english, german, example_sentence,
                    example_sentence_gap, word_type, source_page
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                RETURNING *
            """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))
            # Update vocabulary count
            await conn.execute("""
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """, session_uuid)
    return results
 async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
 ) -> List[Dict[str, Any]]:
    """Return a session's vocabulary entries, optionally only those of one source page."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        if source_page is None:
            # Whole session: ordered by page (entries without page last), then creation time
            records = await connection.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, uuid.UUID(session_id))
        else:
            records = await connection.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, uuid.UUID(session_id), source_page)
        return list(map(_row_to_dict, records))
 async def update_vocabulary_db(
    entry_id: str,
    **kwargs
 ) -> Optional[Dict[str, Any]]:
    """Update the whitelisted columns of one vocabulary entry.

    Returns the updated row as a dict, or None when no whitelisted field
    was passed or no entry has this ID.
    """
    db_pool = await get_pool()
    allowed_fields = [
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    ]
    # Keep only whitelisted columns, in the order they were passed
    updates = [(column, new_value) for column, new_value in kwargs.items() if column in allowed_fields]
    if not updates:
        return None
    assignments = [f"{column} = ${position}" for position, (column, _) in enumerate(updates, start=1)]
    params = [new_value for _, new_value in updates]
    # The entry ID is the last placeholder, after all column values
    id_position = len(updates) + 1
    params.append(uuid.UUID(entry_id))
    async with db_pool.acquire() as connection:
        updated = await connection.fetchrow(f"""
            UPDATE vocab_entries
            SET {', '.join(assignments)}
            WHERE id = ${id_position}
            RETURNING *
        """, *params)
        return _row_to_dict(updated) if updated else None
 async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete a session's entries for one source page and refresh the vocabulary count.

    Returns the number of deleted entries, parsed from the command status.
    """
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        command_status = await connection.execute("""
            DELETE FROM vocab_entries
            WHERE session_id = $1 AND source_page = $2
        """, uuid.UUID(session_id), page)
        # Update vocabulary count
        await connection.execute("""
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """, uuid.UUID(session_id))
        # The last token of the command status is the deleted-row count
        if not command_status:
            return 0
        return int(command_status.split()[-1])
 # =============================================================================
 # WORKSHEET OPERATIONS
 # =============================================================================
 async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
 ) -> Dict[str, Any]:
    """Insert a ``vocab_worksheets`` row with a newly generated ID and return it as a dict.

    ``worksheet_types`` is stored JSON-encoded.
    """
    db_pool = await get_pool()
    new_id = str(uuid.uuid4())
    async with db_pool.acquire() as connection:
        insert_args = (
            uuid.UUID(new_id),
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path,
        )
        inserted = await connection.fetchrow("""
            INSERT INTO vocab_worksheets (
                id, session_id, worksheet_types, pdf_path, solution_path
            ) VALUES ($1, $2, $3, $4, $5)
            RETURNING *
        """, *insert_args)
        return _row_to_dict(inserted)
 async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one ``vocab_worksheets`` row by ID; return None when it does not exist."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        found = await connection.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, uuid.UUID(worksheet_id))
        return _row_to_dict(found) if found else None
 async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Delete every worksheet of a session; return the number of deleted rows."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        command_status = await connection.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, uuid.UUID(session_id))
        # The last token of the command status is the deleted-row count
        if not command_status:
            return 0
        return int(command_status.split()[-1])
 # =============================================================================
 # PDF CACHE OPERATIONS
 # =============================================================================
 # Simple in-memory cache for PDF data (temporary until served)
 _pdf_cache: Dict[str, bytes] = {}
 def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Store (or overwrite) the PDF bytes kept in memory for a worksheet."""
    _pdf_cache.update({worksheet_id: pdf_data})
 def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the cached PDF bytes for a worksheet, or None when nothing is cached."""
    try:
        return _pdf_cache[worksheet_id]
    except KeyError:
        return None
 def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Drop a worksheet's cached PDF bytes; a missing entry is not an error."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]
 # =============================================================================
 # HELPER FUNCTIONS
 # =============================================================================
 def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
    """Turn a database record into a plain dict with JSON-friendly values.

    UUID columns become strings, timestamp columns become ISO-8601 strings,
    and JSON columns that arrive as text are decoded. ``None`` values are
    left as they are; a ``None`` row yields an empty dict.
    """
    if row is None:
        return {}
    data = dict(row)
    # UUID columns -> str
    for column in ('id', 'session_id'):
        raw = data.get(column)
        if raw is not None:
            data[column] = str(raw)
    # Timestamp columns -> ISO string
    for column in ('created_at', 'updated_at', 'generated_at'):
        raw = data.get(column)
        if raw is not None:
            data[column] = raw.isoformat()
    # JSON columns: decode only when they arrive as text
    for column in ('ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types'):
        raw = data.get(column)
        if isinstance(raw, str):
            data[column] = json.loads(raw)
    return data
@@ -1,472 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/analysis_api.py
-Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
+import importlib as _importlib
-extract-with-boxes, deskewed images, and learning unit generation.
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.analysis_api")
 The two large handlers (compare_ocr_methods, analyze_grid) live in
 vocab_worksheet_compare_api.py and are included via compare_router.
 """
 from fastapi import APIRouter, Body, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from typing import Optional, Dict, Any
 from datetime import datetime
 import os
 import io
 import json
 import logging
 def _get_sessions():
    """Return the ``_sessions`` object defined in ``vocab_worksheet_api``."""
    # Imported at call time instead of module import time; presumably this
    # avoids a circular import with vocab_worksheet_api -- TODO confirm.
    from vocab_worksheet_api import _sessions
    return _sessions
 def _get_local_storage_path():
    """Return ``LOCAL_STORAGE_PATH`` as defined in ``vocab_worksheet_api``."""
    # Imported at call time instead of module import time; presumably this
    # avoids a circular import with vocab_worksheet_api -- TODO confirm.
    from vocab_worksheet_api import LOCAL_STORAGE_PATH
    return LOCAL_STORAGE_PATH
 from vocab_worksheet_generation import convert_pdf_page_to_image
 # Try to import Tesseract extractor
 # If the import fails, only TESSERACT_AVAILABLE is defined here;
 # extract_bounding_boxes stays undefined, so callers must check the flag first.
 try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, TESSERACT_AVAILABLE,
    )
 except ImportError:
    TESSERACT_AVAILABLE = False
 # Try to import Grid Detection Service
 # GRID_SERVICE_AVAILABLE records whether GridDetectionService could be imported;
 # the class name is undefined when the flag is False.
 try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
 except ImportError:
    GRID_SERVICE_AVAILABLE = False
 logger = logging.getLogger(__name__)
 analysis_router = APIRouter()
 def _ocr_export_dir():
    """Return the ``ocr-exports`` directory path below the local storage root."""
    # Computed on every call so the storage root is resolved lazily.
    return os.path.join(_get_local_storage_path(), "ocr-exports")
 def _ground_truth_dir():
    """Return the ``ground-truth`` directory path below the local storage root."""
    # Computed on every call so the storage root is resolved lazily.
    return os.path.join(_get_local_storage_path(), "ground-truth")
 # =============================================================================
 # OCR Export Endpoints (for cross-app OCR data sharing)
 # =============================================================================
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Persist one page's OCR export so that another app can read it back.

    The request body is written to ``<session_id>_page<page_number>.json`` in
    the OCR export directory, and ``latest.json`` in the same directory is
    rewritten to point at this session/page. Returns a confirmation dict.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    export_dir = _ocr_export_dir()
    os.makedirs(export_dir, exist_ok=True)

    def _write_json(filename, payload):
        # Both files share the same encoding and pretty-printing settings.
        with open(os.path.join(export_dir, filename), 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    # The export itself
    _write_json(f"{session_id}_page{page_number}.json", data)

    # Pointer to the most recent export
    pointer = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.utcnow().isoformat(),
    }
    _write_json("latest.json", pointer)

    response = {"success": True}
    response["session_id"] = session_id
    response["page_number"] = page_number
    response["message"] = "OCR export saved successfully"
    return response
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for the given session and page, or 404."""
    export_file = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
    if os.path.exists(export_file):
        with open(export_file, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    raise HTTPException(status_code=404, detail="OCR export not found")
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the OCR export that ``latest.json`` points at.

    Responds with 404 when either the pointer file or the export it names
    is missing.
    """
    pointer_file = os.path.join(_ocr_export_dir(), "latest.json")
    if not os.path.exists(pointer_file):
        raise HTTPException(status_code=404, detail="No OCR exports found")
    with open(pointer_file, 'r', encoding='utf-8') as handle:
        pointer = json.load(handle)

    # Rebuild the export file name from the pointer's session/page pair.
    latest_session = pointer.get("session_id")
    latest_page = pointer.get("page_number")
    export_file = os.path.join(_ocr_export_dir(), f"{latest_session}_page{latest_page}.json")
    if os.path.exists(export_file):
        with open(export_file, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    raise HTTPException(status_code=404, detail="Latest OCR export file not found")
 # =============================================================================
 # Extract with Boxes & Deskewed Image
 # =============================================================================
 async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Build vocabulary rows with bounding boxes from a page image.

    Runs Tesseract for word boxes, then GridDetectionService to arrange them
    into a grid. The result dict carries an ``entries`` list plus
    ``image_width`` and ``image_height``. Every entry holds row_index,
    english, german, example, confidence, bbox, bbox_en, bbox_de and bbox_ex;
    all bbox coordinates are in percent (0-100).

    Raises HTTPException(500) when Tesseract or the grid service is unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # 1) Word-level boxes from Tesseract
    ocr = await extract_bounding_boxes(image_bytes, lang=lang)
    words = ocr.get("words", [])
    width = ocr.get("image_width", 0)
    height = ocr.get("image_height", 0)

    def _payload(rows):
        return {"entries": rows, "image_width": width, "image_height": height}

    if not words or width == 0 or height == 0:
        return _payload([])

    # 2) Percentage-based OCR regions
    grid_service = GridDetectionService()
    regions = grid_service.convert_tesseract_regions(words, width, height)
    if not regions:
        return _payload([])

    # 3) Grid detection
    grid = grid_service.detect_grid(regions)
    if not grid.cells:
        return _payload([])

    # 4) One entry per non-empty grid row
    from services.grid_detection_service import ColumnType
    columns = (
        ("en", ColumnType.ENGLISH),
        ("de", ColumnType.GERMAN),
        ("ex", ColumnType.EXAMPLE),
    )

    rows = []
    for row_idx, row_cells in enumerate(grid.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_cells = 0

        for cell in row_cells:
            box = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                   "w": round(cell.width, 2), "h": round(cell.height, 2)}
            stripped = cell.text.strip()
            # First matching column wins; a later cell of the same column
            # type overwrites an earlier one.
            for slot, column_type in columns:
                if cell.column_type == column_type:
                    texts[slot] = stripped
                    boxes[slot] = box
                    break
            # Confidence is averaged over every cell that has text,
            # whatever its column type.
            if stripped:
                conf_total += cell.confidence
                conf_cells += 1

        # Rows without any text are dropped.
        if not (texts["en"] or texts["de"] or texts["ex"]):
            continue

        # Whole-row box = union of the column boxes that were found.
        found = [b for b in (boxes["en"], boxes["de"], boxes["ex"]) if b is not None]
        if found:
            left = min(b["x"] for b in found)
            top = min(b["y"] for b in found)
            right = max(b["x"] + b["w"] for b in found)
            bottom = max(b["y"] + b["h"] for b in found)
            row_box = {"x": round(left, 2), "y": round(top, 2),
                       "w": round(right - left, 2), "h": round(bottom - top, 2)}
        else:
            row_box = {"x": 0, "y": 0, "w": 100, "h": 3}

        if conf_cells > 0:
            avg_conf = round(conf_total / conf_cells * 100, 1)
        else:
            avg_conf = round(0, 1)

        rows.append({
            "row_index": row_idx,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_conf,
            "bbox": row_box,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return _payload(rows)
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary rows with bounding boxes for ground truth labeling.

    The requested PDF page (0-indexed) is rendered at high resolution,
    deskewed when possible, and passed to extract_entries_with_boxes. Both
    the page image and the resulting entries are cached on the session.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # High-resolution render of the requested page
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskewing is best-effort: any failure leaves the original render in place.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    page_key = str(page_number)

    # Keep the (possibly deskewed) image so it can be served later.
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # OCR runs on the same image that was just cached.
    extraction = await extract_entries_with_boxes(image_data)
    entries = extraction["entries"]
    session.setdefault("gt_entries", {})[page_key] = entries

    return {
        "success": True,
        "entries": entries,
        "entry_count": len(entries),
        "image_width": extraction["image_width"],
        "image_height": extraction["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Serves the image cached on the session when one exists for the page.
    Otherwise falls back to rendering the original high-resolution page from
    the session's PDF.

    Raises:
        HTTPException(404): unknown session.
        HTTPException(400): no PDF on the session, or ``page_number`` is
            negative or beyond the session's recorded page count.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    deskewed = session.get("deskewed_images", {}).get(str(page_number))
    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Validate the page before rendering, in line with extract_with_boxes.
    # The upper bound is only enforced when the session recorded a page count,
    # so sessions without that field keep working as before.
    page_count = session.get("pdf_page_count")
    if page_number < 0 or (page_count is not None and page_number >= page_count):
        if page_count is None:
            detail = "Invalid page number (0-indexed)."
        else:
            detail = f"Invalid page number. PDF has {page_count} pages (0-indexed)."
        raise HTTPException(status_code=400, detail=detail)

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
 # =============================================================================
 # Ground Truth Labeling
 # =============================================================================
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Store ground truth labels for one page, in the session and on disk.

    The body must contain a non-empty ``entries`` list; each entry has
    english, german, example, a status ('confirmed' | 'edited' | 'skipped'),
    and bbox fields. The response reports how many entries carry each status.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # In-memory copy on the session
    sessions[session_id].setdefault("ground_truth", {})[str(page_number)] = entries

    # On-disk copy
    gt_dir = _ground_truth_dir()
    os.makedirs(gt_dir, exist_ok=True)
    gt_path = os.path.join(gt_dir, f"{session_id}_page{page_number}.json")
    record = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as handle:
        json.dump(record, handle, ensure_ascii=False, indent=2)
    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Tally the three known statuses in one pass.
    tally = {"confirmed": 0, "edited": 0, "skipped": 0}
    for entry in entries:
        status = entry.get("status")
        for label in tally:
            if status == label:
                tally[label] += 1

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": tally["confirmed"],
        "edited": tally["edited"],
        "skipped": tally["skipped"],
        "file_path": gt_path,
    }
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return saved ground truth for a page, preferring the session cache over disk."""
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # 1) In-memory copy kept on the session
    in_memory = sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # 2) JSON file written by save_ground_truth
    gt_file = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if os.path.exists(gt_file):
        with open(gt_file, 'r', encoding='utf-8') as handle:
            stored = json.load(handle)
        return {"success": True, "entries": stored.get("entries", []), "source": "disk"}

    raise HTTPException(status_code=404, detail="No ground truth found for this page")
 # ─── Learning Module Generation ─────────────────────────────────────────────
 class GenerateLearningUnitRequest(BaseModel):
    """Optional request body for the generate-learning-unit endpoint."""
    # Passed through as ``grade`` to create_learning_unit; may be omitted.
    grade: Optional[str] = None
    # When True, module generation is attempted after the unit is created.
    generate_modules: bool = True
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None):
    """
    Turn this session's vocabulary into a Learning Unit.

    The unit is created through vocab_learn_bridge.create_learning_unit; when
    ``generate_modules`` is set, generate_learning_modules is called as well.
    A failure of that second step is recorded under ``generation`` in the
    result and does not fail the request.

    Error mapping: ImportError -> 501, ValueError -> 400, RuntimeError -> 502.
    """
    options = GenerateLearningUnitRequest() if request is None else request

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    vocabulary = session.get("vocabulary", [])
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab_learn_bridge import create_learning_unit, generate_learning_modules

        # Create the unit first; everything else hangs off its result.
        unit = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=options.grade,
        )

        if not options.generate_modules:
            return unit

        # Module generation is best-effort: the unit already exists.
        try:
            unit["generation"] = await generate_learning_modules(
                unit_id=unit["unit_id"],
                analysis_path=unit["analysis_path"],
            )
        except Exception as e:
            logger.warning(f"Module generation failed (unit created): {e}")
            unit["generation"] = {"status": "error", "reason": str(e)}
        return unit

    except ImportError:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e))
 # =============================================================================
 # Include compare_ocr_methods & analyze_grid from companion module
 # =============================================================================
 # Imported after analysis_router's own routes are defined, hence the
 # E402 (module-level import not at top of file) suppression.
 from vocab_worksheet_compare_api import compare_router  # noqa: E402
 # Attach the companion module's routes to this router.
 analysis_router.include_router(compare_router)
@@ -1,499 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/api.py
-Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
+import importlib as _importlib
-vocabulary editing, worksheet generation, and PDF downloads.
+import sys as _sys
-
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.api")
 Sub-routers (included at bottom):
 - vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
 - vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
 """
 from fastapi import APIRouter, HTTPException, UploadFile, File, Query
 from fastapi.responses import StreamingResponse
 from typing import List, Dict, Any
 from datetime import datetime
 import uuid
 import os
 import io
 import logging
 logger = logging.getLogger(__name__)
 # --- Imports from extracted sub-modules ---
 from vocab_worksheet_models import (
    WorksheetType,
    SessionStatus,
    VocabularyEntry,
    SessionCreate,
    SessionResponse,
    VocabularyResponse,
    VocabularyUpdate,
    WorksheetGenerateRequest,
    WorksheetResponse,
 )
 from vocab_worksheet_extraction import extract_vocabulary_from_image
 from vocab_worksheet_generation import (
    generate_worksheet_html, generate_worksheet_pdf,
    convert_pdf_page_to_image,
 )
 # --- Database integration (used by main.py lifespan) ---
 try:
    from vocab_session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
 except ImportError:
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = None
    init_vocab_tables = None
    list_sessions_db = None
    get_session_db = None
 # Module-level handle to the database connection pool; None until set_db_pool runs.
 _db_pool = None
 def set_db_pool(pool):
    """Set the database connection pool (called from main.py lifespan)."""
    # Rebinds the module-level name; nothing is validated or closed here.
    global _db_pool
    _db_pool = pool
 async def _init_vocab_table():
    """Initialize the vocab tables through ``init_vocab_tables``, if available.

    ``init_vocab_tables`` is ``None`` when vocab_session_store could not be
    imported. In that case nothing is created, so the function logs that
    initialization was skipped instead of claiming the table is ready.
    Errors raised during initialization are logged as warnings and swallowed.
    """
    if not init_vocab_tables:
        # No session store: do not report a table as ready that was never created.
        logger.warning("vocab_session_store not available; skipping vocab table initialization")
        return
    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")
 async def _load_all_sessions():
    """Copy up to 500 stored vocab sessions into the in-memory ``_sessions`` dict.

    Sessions already present in memory are left untouched. Any error while
    loading is logged as a warning and swallowed.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return
    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            sid = row.get("id") or row.get("session_id")
            # Skip rows without an id and sessions that are already cached.
            if not sid or sid in _sessions:
                continue
            _sessions[sid] = dict(
                id=sid,
                name=row.get("name", ""),
                description=row.get("description", ""),
                status=row.get("status", "created"),
                vocabulary_count=row.get("vocabulary_count", 0),
                source_language=row.get("source_language", "en"),
                target_language=row.get("target_language", "de"),
                created_at=str(row.get("created_at", "")),
            )
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")
 # --- Router & module-level state ---
 # All routes registered on this router are served under /api/v1/vocab.
 router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
 # Root directory for per-session files; overridable via VOCAB_STORAGE_PATH.
 LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
 # In-memory session records keyed by session id.
 _sessions: Dict[str, Dict[str, Any]] = {}
 # In-memory records of generated worksheets keyed by worksheet id.
 _worksheets: Dict[str, Dict[str, Any]] = {}
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Register a new vocabulary extraction session and create its storage folder."""
    session_id = str(uuid.uuid4())

    record = {
        "id": session_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": SessionStatus.PENDING.value,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": datetime.utcnow(),
    }
    _sessions[session_id] = record

    # Per-session directory for uploads and generated files
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, session_id), exist_ok=True)

    # The response exposes a subset of the stored record.
    response_fields = (
        "id", "name", "description", "source_language", "target_language",
        "status", "vocabulary_count", "image_path", "created_at",
    )
    return SessionResponse(**{field: record[field] for field in response_fields})
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first, capped at ``limit``.

    ``created_at`` is not uniformly typed in the session dict: sessions
    created through the API store a ``datetime``, while sessions restored by
    ``_load_all_sessions`` store a string. Sorting on the raw values would
    raise ``TypeError`` as soon as both kinds are present, so the sort key
    normalizes every value to a naive ``datetime`` first.
    """
    def _created_at_key(session):
        value = session.get("created_at")
        if isinstance(value, str):
            try:
                value = datetime.fromisoformat(value)
            except ValueError:
                # Empty or unparsable timestamps sort as the oldest entries.
                return datetime.min
        if not isinstance(value, datetime):
            return datetime.min
        if value.tzinfo is not None:
            # Aware and naive datetimes cannot be compared; reduce aware
            # values to naive UTC.
            offset = value.utcoffset()
            if offset is not None:
                value = value - offset
            value = value.replace(tzinfo=None)
        return value

    sessions = sorted(
        _sessions.values(),
        key=_created_at_key,
        reverse=True
    )[:limit]
    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in sessions
    ]
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return one session's summary, or 404 if the id is unknown."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    # Required fields are indexed directly; optional ones fall back to defaults.
    summary = {
        "id": record["id"],
        "name": record["name"],
        "description": record.get("description"),
        "source_language": record["source_language"],
        "target_language": record["target_language"],
        "status": record["status"],
        "vocabulary_count": record.get("vocabulary_count", 0),
        "image_path": record.get("image_path"),
        "created_at": record["created_at"],
    }
    return SessionResponse(**summary)
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded textbook page (image or PDF) and extract its vocabulary.

    Accepted formats: PNG, JPG, JPEG, PDF. For a PDF, page 0 is converted to
    an image before it is saved and passed to the extractor. The session is
    updated with the extracted vocabulary and a summary dict is returned; an
    extractor error message, if any, is included under ``error``.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # File type is judged by extension OR content type, whichever matches.
    extension = (file.filename or '').rsplit('.', 1)[-1].lower()
    content_type = file.content_type or ''
    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')

    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in image_extensions or content_type in image_content_types
    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Extension of the file written to disk
    if is_pdf:
        save_extension = 'png'  # PDFs are stored as their PNG rendering
    elif extension in image_extensions:
        save_extension = extension
    else:
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    payload = await file.read()
    logger.info(f"Read {len(payload)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        payload = await convert_pdf_page_to_image(payload, page_number=0)
        logger.info(f"PDF converted, image size: {len(payload)} bytes")

    # Write the image into the session's directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as out:
        out.write(payload)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Vocabulary extraction on the stored image bytes
    vocabulary, confidence, error = await extract_vocabulary_from_image(payload, file.filename or "image.png", page_number=0)
    vocab_count = len(vocabulary)

    session["vocabulary"] = [item.dict() for item in vocabulary]
    session["vocabulary_count"] = vocab_count
    session["extraction_confidence"] = confidence
    session["status"] = SessionStatus.EXTRACTED.value

    summary = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": vocab_count,
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error:
        summary["error"] = error
    return summary
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary currently stored on a session, or 404 if the id is unknown."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # Stored entries are plain dicts; rebuild the models for the response.
    entries = []
    for raw in session.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw))

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=session.get("extraction_confidence"),
    )
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with the submitted (manually corrected) entries."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    record["vocabulary"] = [entry.dict() for entry in update.vocabulary]
    entry_count = len(update.vocabulary)
    record["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from a session's vocabulary.

    One HTML section is rendered per requested worksheet type and the
    sections are combined into a single PDF. When ``include_solutions`` is
    set, a second PDF with solutions is produced the same way. Both PDFs are
    rendered before anything is written, so a failure in the solution PDF
    does not leave a half-written worksheet on disk.

    Raises:
        HTTPException(404): unknown session.
        HTTPException(400): the session has no vocabulary.
        HTTPException(500): rendering the worksheet or solution PDF failed.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    def _render_html(show_solutions):
        # One section per worksheet type, each followed by a page break.
        suffix = " (Loesung)" if show_solutions else ""
        return "".join(
            generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value}{suffix}",
                show_solutions=show_solutions,
                repetitions=request.repetitions,
                line_height=request.line_height,
            ) + '<div style="page-break-after: always;"></div>'
            for wtype in request.worksheet_types
        )

    async def _render_pdf(show_solutions):
        # Worksheet and solution PDFs share the same error reporting.
        html = _render_html(show_solutions)
        try:
            return await generate_worksheet_pdf(html)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")

    pdf_bytes = await _render_pdf(False)
    solution_bytes = await _render_pdf(True) if request.include_solutions else None

    # The session directory may be missing (e.g. for sessions restored from
    # the database), so create it before writing.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    solution_path = None
    if solution_bytes is not None:
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info for the download endpoints
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send the generated worksheet PDF as an attachment, or 404 if it is missing."""
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and streamed from there.
    with open(pdf_path, 'rb') as source:
        body = io.BytesIO(source.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        body,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition}
    )
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Return the solution PDF of a worksheet as a file attachment.

    Args:
        worksheet_id: Key of the worksheet in the in-memory ``_worksheets`` store.

    Returns:
        A ``StreamingResponse`` carrying the solution PDF bytes.

    Raises:
        HTTPException: 404 when the worksheet id is unknown, when no solution
            path was recorded for it, or when the recorded file is missing.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    location = _worksheets[worksheet_id].get("solution_path")
    # A missing/empty path and a vanished file are reported identically.
    if not (location and os.path.exists(location)):
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(location, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image that was uploaded for a session.

    The media type is derived from the text after the last dot of the stored
    path: ``png`` -> image/png, ``jpg``/``jpeg`` -> image/jpeg, anything else
    -> application/octet-stream.

    Raises:
        HTTPException: 404 when the session is unknown, has no ``image_path``,
            or the file does not exist.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    source_path = _sessions[session_id].get("image_path")
    if not (source_path and os.path.exists(source_path)):
        raise HTTPException(status_code=404, detail="Image not found")

    # Pick the MIME type from the (case-insensitive) file suffix.
    suffix = source_path.rsplit('.', 1)[-1].lower()
    if suffix == 'png':
        mime = 'image/png'
    elif suffix in ('jpg', 'jpeg'):
        mime = 'image/jpeg'
    else:
        mime = 'application/octet-stream'

    with open(source_path, 'rb') as handle:
        raw = handle.read()

    return StreamingResponse(
        io.BytesIO(raw),
        media_type=mime,
    )
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a vocabulary session, its storage directory and its worksheets.

    Order of operations: delete ``LOCAL_STORAGE_PATH/<session_id>`` from disk
    (if present), drop the session from ``_sessions``, then drop every entry of
    ``_worksheets`` whose ``session_id`` matches.

    Raises:
        HTTPException: 404 when the session id is unknown.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Files first: the directory is named after the session id.
    storage_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(storage_dir):
        import shutil
        shutil.rmtree(storage_dir)

    # Then the in-memory session record.
    del _sessions[session_id]

    # Finally the worksheets; iterate over a snapshot because entries are
    # deleted from the dict inside the loop.
    for wid, sheet in tuple(_worksheets.items()):
        if sheet["session_id"] != session_id:
            continue
        del _worksheets[wid]

    return {"message": "Session deleted successfully", "session_id": session_id}
 # --- Include sub-routers ---
 # Registers the upload and analysis endpoints on this module's router.
 # NOTE(review): these imports sit at the end of the module, presumably so the
 # sub-modules can import names defined above (e.g. the session store) without
 # a circular-import failure — confirm before moving them to the top.
 from vocab_worksheet_upload_api import upload_router
 from vocab_worksheet_analysis_api import analysis_router
 router.include_router(upload_router)
 router.include_router(analysis_router)
@@ -1,545 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/compare_api.py
-Vocabulary Worksheet Compare & Grid Analysis API.
+import importlib as _importlib
-
+import sys as _sys
-Split from vocab_worksheet_analysis_api.py — contains the two largest
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.compare_api")
 route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
 """
 from fastapi import APIRouter, HTTPException, Query
 from typing import Dict, Any
 import base64
 import json
 import logging
 import os
 from vocab_worksheet_extraction import extract_vocabulary_from_image
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
 def _get_sessions():
     """Return the session store owned by ``vocab_worksheet_api``.

     The module is imported at call time rather than at import time.
     NOTE(review): presumably this avoids a circular import between this
     module and vocab_worksheet_api — confirm.
     """
     import vocab_worksheet_api as _api_module
     return _api_module._sessions
 from vocab_worksheet_generation import convert_pdf_page_to_image
 # Try to import Tesseract extractor
 try:
    from tesseract_vocab_extractor import (
        run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
 except ImportError:
    TESSERACT_AVAILABLE = False
 # Try to import CV Pipeline
 try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
 except ImportError:
    CV_PIPELINE_AVAILABLE = False
 # Try to import Grid Detection Service
 try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
 except ImportError:
    GRID_SERVICE_AVAILABLE = False
 logger = logging.getLogger(__name__)
 compare_router = APIRouter()
 # =============================================================================
 # OCR Compare & Grid Analysis Endpoints
 # =============================================================================
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run the available OCR methods on one PDF page and compare their vocabulary.

    Steps:
    1. Render the requested page of the session's uploaded PDF to an image.
    2. Run Vision LLM extraction (always).
    3. Run the Tesseract pipeline when TESSERACT_AVAILABLE. Its words, image
       size and per-page result are stored on the session, and its bounding
       boxes are matched onto the Vision LLM vocabulary when that succeeded.
    4. Run the CV pipeline when CV_PIPELINE_AVAILABLE.
    5. Compare the normalized (english, german) pairs across all methods.

    A method that raises is reported with ``success=False`` and an error
    string; it never aborts the comparison.

    Args:
        session_id: Key of the session in the session store.
        page_number: 0-indexed page of the uploaded PDF.

    Returns:
        Dict with ``session_id``, ``page_number``, ``methods`` (per-method
        results), ``comparison`` (found_by_all_methods, found_by_some_methods,
        total_unique_vocabulary, agreement_rate) and ``recommendation``.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the session has no
            PDF or the page number is out of range.
    """
    import time

    # extract_vocabulary_from_image() sends its request with the model configured
    # in vocab_worksheet_extraction, which is read from a different environment
    # variable than this module's VISION_MODEL. Report the model actually used.
    from vocab_worksheet_extraction import VISION_MODEL as extraction_model

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        start = time.time()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.time() - start

        vocab_list = [
            _cmp_vocab_entry(_cmp_as_mapping(v), example_key="example_sentence")
            for v in vocab
        ]
        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": extraction_model,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = _cmp_vocab_pairs(vocab_list)
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = _cmp_failed_result("Vision LLM", extraction_model, e)
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.time()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.time() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            vocab_list_tess = [_cmp_vocab_entry(v) for v in tess_vocab]
            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = _cmp_vocab_pairs(vocab_list_tess)

            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = _cmp_failed_result("Tesseract OCR", "tesseract-ocr", e)
            all_vocab_sets["tesseract"] = set()

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.time()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.time() - start

            # A pipeline-level error invalidates whatever vocabulary it returned.
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = [_cmp_vocab_entry(v) for v in cv_vocab]

            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = _cmp_vocab_pairs(vocab_list_cv)
        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = _cmp_failed_result(
                "CV Pipeline (Document Reconstruction)",
                "opencv + tesseract (multi-pass)",
                e,
            )
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    all_unique = set()
    for vs in all_vocab_sets.values():
        all_unique |= vs

    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    # Share of unique pairs that every method found; 0 when nothing was found.
    agreement_rate = len(found_by_all) / len(all_unique) if all_unique else 0

    # Find best method: the one with the most distinct (english, german) pairs.
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }


def _cmp_as_mapping(item):
    """Return a dict view of a vocabulary item (object with .dict(), plain object, or mapping)."""
    if hasattr(item, 'dict'):
        return item.dict()
    if hasattr(item, '__dict__'):
        return item.__dict__.copy()
    return dict(item)


def _cmp_vocab_entry(raw, example_key="example"):
    """Reduce a raw vocabulary mapping to the english/german/example triple used in responses."""
    return {
        "english": raw.get("english", ""),
        "german": raw.get("german", ""),
        "example": raw.get(example_key, ""),
    }


def _cmp_vocab_pairs(vocab_list):
    """Return the set of lower-cased, stripped (english, german) pairs; entries missing either side are skipped."""
    return {
        (v["english"].lower().strip(), v["german"].lower().strip())
        for v in vocab_list
        if v["english"] and v["german"]
    }


def _cmp_failed_result(name, model, exc):
    """Build the per-method result reported when a method raised ``exc``."""
    return {
        "name": name,
        "model": model,
        "duration_seconds": 0,
        "vocabulary_count": 0,
        "vocabulary": [],
        "confidence": 0,
        "success": False,
        "error": str(exc),
    }
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Hybrid approach:
    1. When ``use_tesseract`` is set and both Tesseract and the grid detection
       service are importable, use Tesseract bounding boxes (cached on the
       session by compare-ocr, or computed here) with GridDetectionService.
    2. Otherwise, or when strategy 1 yields nothing or raises, ask the Vision
       LLM for the grid structure. Its JSON answer is validated before use, so
       null/ill-typed fields degrade to defaults instead of failing the call.

    Args:
        session_id: Key of the session in the session store.
        page_number: 0-indexed page of the uploaded PDF.
        use_tesseract: Allow strategy 1.

    Returns:
        ``{"success": True, "grid": {...}}`` with the GridData structure used by
        the frontend GridOverlay component, or
        ``{"success": False, "error": "..."}`` when the analysis failed.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the session has no
            PDF or the page number is out of range.
    """
    import httpx

    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail="Invalid page number.")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    tess_page_data = session.get(f"tesseract_page_{page_number}")
    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            # Run Tesseract if not already cached
            if not tess_page_data:
                logger.info("Running Tesseract for grid analysis (not cached)")
                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
                tess_page_data = await _run_tess(image_data, lang="eng+deu")
                session[f"tesseract_page_{page_number}"] = tess_page_data
                session["tesseract_words"] = tess_page_data.get("words", [])
                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)

            tess_words = tess_page_data.get("words", [])
            img_w = tess_page_data.get("image_width", 0)
            img_h = tess_page_data.get("image_height", 0)

            if tess_words and img_w > 0 and img_h > 0:
                service = GridDetectionService()
                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
                if regions:
                    grid_result = service.detect_grid(regions)
                    grid_dict = grid_result.to_dict()
                    grid_dict["source"] = "tesseract+grid_service"
                    grid_dict["word_count"] = len(tess_words)
                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                                f"{grid_result.stats.get('recognized', 0)} recognized")
                    return {"success": True, "grid": grid_dict}

            logger.info("Tesseract data insufficient, falling back to LLM")
        except Exception as e:
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            import traceback
            logger.debug(traceback.format_exc())

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")
    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
 Your task: Identify the TABLE STRUCTURE and extract each cell's content.
 Return a JSON object with this EXACT structure:
 {
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
 }
 Rules:
 - row and col are 0-indexed
 - column_type is one of: "english", "german", "example", "unknown"
 - Detect whether each column contains English words, German translations, or example sentences
 - Include ALL non-empty cells
 - confidence is 0.0-1.0 based on how clear the text is
 - If a cell is empty, don't include it
 - Return ONLY the JSON, no other text"""

    try:
        import asyncio

        raw_text = ""
        max_retries = 3
        for attempt in range(max_retries):
            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={
                        "model": VISION_MODEL,
                        "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 8192},
                    },
                    timeout=300.0,
                )

            # Only HTTP 500 is retried, with a linearly growing pause; any other
            # non-200 status (or a 500 on the last attempt) is returned as an error.
            if response.status_code == 500 and attempt < max_retries - 1:
                wait_time = 10 * (attempt + 1)
                logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(wait_time)
                continue
            elif response.status_code != 200:
                error_detail = response.text[:200] if response.text else "Unknown error"
                return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}

            raw_text = response.json().get("message", {}).get("content", "")
            break

        # Parse JSON from response: take everything from the first "{" to the last "}".
        import re
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        try:
            grid_raw = json.loads(json_match.group())
        except json.JSONDecodeError as e:
            # Malformed JSON is a parse problem, not an internal failure.
            logger.warning(f"Grid analysis: LLM returned invalid JSON: {e}")
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_data = _grid_from_llm_payload(grid_raw)
        if grid_data is None:
            return {"success": False, "error": "No grid structure detected"}

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}


def _llm_int(value, default=0):
    """Coerce an LLM-supplied value to int; return ``default`` when that is not possible."""
    if isinstance(value, bool):
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _llm_confidence(value, default=0.8):
    """Return a usable numeric confidence; ``default`` for null, non-numeric or NaN values."""
    if isinstance(value, bool):
        return default
    if not isinstance(value, (int, float)):
        try:
            value = float(value)
        except (TypeError, ValueError):
            return default
    # NaN compares unequal to itself and would break the threshold test.
    return default if value != value else value


def _grid_from_llm_payload(grid_raw):
    """Turn the Vision LLM's JSON answer into the GridData dict, or None if it describes no grid.

    Cells are laid out on a uniform grid in percentage coordinates (0-100).
    Fields with an unexpected type fall back to defaults: entries that are not
    mappings or whose row/col cannot be read as integers are ignored, a missing
    or null text counts as empty, and an unusable confidence becomes 0.8.
    """
    if not isinstance(grid_raw, dict):
        return None

    num_rows = _llm_int(grid_raw.get("rows", 0))
    num_cols = _llm_int(grid_raw.get("columns", 0))
    if num_rows <= 0 or num_cols <= 0:
        return None

    # Ensure column_types is a list with at least one entry per column.
    raw_types = grid_raw.get("column_types")
    column_types = list(raw_types) if isinstance(raw_types, list) else []
    column_types.extend(["unknown"] * (num_cols - len(column_types)))

    # Build cell grid with percentage-based coordinates
    row_height = 100.0 / num_rows
    col_width = 100.0 / num_cols

    # Track which cells have content; a later entry for the same cell wins.
    raw_entries = grid_raw.get("entries")
    cell_map = {}
    for entry in raw_entries if isinstance(raw_entries, list) else []:
        if not isinstance(entry, dict):
            continue
        r = _llm_int(entry.get("row", 0), default=None)
        c = _llm_int(entry.get("col", 0), default=None)
        if r is None or c is None:
            continue
        cell_map[(r, c)] = entry

    counts = {"recognized": 0, "problematic": 0, "empty": 0}
    cells = []
    for r in range(num_rows):
        row_cells = []
        for c in range(num_cols):
            entry = cell_map.get((r, c))
            if entry is None:
                text = ""
                conf = 0.0
                col_type = column_types[c]
                status = "empty"
            else:
                raw_text = entry.get("text")
                text = "" if raw_text is None else str(raw_text).strip()
                conf = _llm_confidence(entry.get("confidence", 0.8))
                col_type = entry.get("column_type", column_types[c])
                if not text:
                    status = "empty"
                elif conf >= 0.5:
                    status = "recognized"
                else:
                    status = "problematic"
            counts[status] += 1

            row_cells.append({
                "row": r,
                "col": c,
                "x": round(c * col_width, 2),
                "y": round(r * row_height, 2),
                "width": round(col_width, 2),
                "height": round(row_height, 2),
                "text": text,
                "confidence": conf,
                "status": status,
                "column_type": col_type,
            })
        cells.append(row_cells)

    total = num_rows * num_cols
    coverage = (counts["recognized"] + counts["problematic"]) / max(total, 1)

    # Column and row boundaries as percentages
    col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
    row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]

    return {
        "rows": num_rows,
        "columns": num_cols,
        "cells": cells,
        "column_types": column_types,
        "column_boundaries": col_boundaries,
        "row_boundaries": row_boundaries,
        "deskew_angle": 0.0,
        "source": "vision_llm",
        "stats": {
            "recognized": counts["recognized"],
            "problematic": counts["problematic"],
            "empty": counts["empty"],
            "manual": 0,
            "total": total,
            "coverage": round(coverage, 3),
        },
    }
@@ -1,325 +1,4 @@
-"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
+# Backward-compat shim -- module moved to vocab/worksheet/extraction.py
-
+import importlib as _importlib
-Contains:
+import sys as _sys
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.extraction")
 - extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
 - _get_demo_vocabulary(): Demo data for testing
 - parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
 """
 import base64
 import json
 import logging
 import os
 import re
 import uuid
 from typing import List
 import httpx
 from vocab_worksheet_models import VocabularyEntry
 logger = logging.getLogger(__name__)
 # Ollama Configuration
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
 # =============================================================================
 # Vision LLM Vocabulary Extraction
 # =============================================================================
 # Prompt sent to the Vision LLM together with the page image. It is written in
 # German and asks for a JSON object with a "vocabulary" list whose entries carry
 # "english", "german" and "example" (null when absent), plus "word_type" when the
 # page shows a part of speech. The text is runtime data — do not translate it.
 VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
 AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
 {
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
 }
 REGELN:
 1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
 2. Behalte die exakte Schreibweise bei
 3. Bei fehlenden Beispielsaetzen: "example": null
 4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
 5. Gib NUR valides JSON zurueck, keine Erklaerungen
 6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
 Beispiel-Output:
 {
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
 }"""
 async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
 ) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from a page image via the Vision LLM, optionally trying
    the hybrid OCR+LLM extractor first.

    The function never raises for extraction problems: every failure is
    reported through the returned error message instead.

    Args:
        image_data: Image bytes
        filename: Original filename, used only in log messages
        page_number: 0-indexed page number; reported 1-based in ``source_page``
                     and in error messages
        use_hybrid: If True, try the hybrid extractor (PaddleOCR + LLM) first and
                    fall back to the Vision LLM when it is unavailable, reports an
                    error, raises, or returns no entries.
                    If False (default), go straight to the Vision LLM.
    Returns:
        Tuple of (vocabulary_entries, confidence, error_message).
        error_message is an empty string on success; on failure the list is
        empty, confidence is 0.0 and the message is a German text prefixed with
        the 1-based page number.
    """
    # ==========================================================================
    # HYBRID APPROACH (opt-in via use_hybrid): PaddleOCR + LLM Gateway
    # ==========================================================================
    if use_hybrid:
        try:
            # Imported lazily so a missing extractor only affects this branch.
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)
            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to the Vision LLM path below
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects, dropping entries that
                # lack either the English or the German side
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""
            # No error but also no entries: continue with the Vision LLM path.
        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())
    # ==========================================================================
    # Vision LLM via Ollama (model name taken from VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")
    try:
        # First check if Ollama is available; a failed check returns early so
        # the slow extraction request is never sent
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        # Chat request: the prompt plus the base64-encoded image, non-streaming
        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }
        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")
        # Increased timeout for Vision models (they can be slow).
        # NOTE(review): the client default is 600s but the request passes 300s;
        # per httpx semantics the per-request value should win — confirm intent.
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()
            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")
        logger.info(f"Ollama response received: {len(extracted_text)} chars")
        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)
        # Set source_page (1-based) for each entry
        for v in vocabulary:
            v.source_page = page_number + 1
        # Fixed heuristic, not a measured score: 0.85 when anything was parsed, else 0.1
        confidence = 0.85 if len(vocabulary) > 0 else 0.1
        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
        return vocabulary, confidence, ""
    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        # Catch-all boundary: log the full traceback, return a truncated message
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
 def _get_demo_vocabulary() -> List[VocabularyEntry]:
     """Build a fixed list of eight sample entries for use when no Vision LLM is available.

     Every call creates fresh ``VocabularyEntry`` objects with newly generated
     UUID ids; only english, german and example_sentence are populated.
     """
     # (english, german, example sentence)
     samples = (
         ("to achieve", "erreichen, erzielen", "She achieved her goals."),
         ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
         ("improve", "verbessern", "I want to improve my English."),
         ("improvement", "Verbesserung", "There has been a lot of improvement."),
         ("success", "Erfolg", "The project was a success."),
         ("successful", "erfolgreich", "She is a successful businesswoman."),
         ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
         ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
     )
     vocabulary = []
     for english, german, sentence in samples:
         vocabulary.append(
             VocabularyEntry(
                 id=str(uuid.uuid4()),
                 english=english,
                 german=german,
                 example_sentence=sentence,
             )
         )
     return vocabulary
 def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse the vocabulary list out of an LLM response, tolerating malformed JSON.

    The response may contain extra text around the JSON object. Parsing is
    attempted in this order:

    1. the raw ``{...}`` slice,
    2. the slice with control characters repaired *inside string literals*,
    3. either of the above with trailing commas removed and bare keys quoted,
    4. a regex scan of the whole text for ``{"english": ..., "german": ...}``
       fragments (this also recovers entries from truncated output).

    Entries with an empty/non-string ``english`` or ``german`` value, or with
    implausibly long values (likely hallucinations), are skipped individually;
    one bad entry never discards the rest.

    Args:
        text: Raw text content returned by the model.

    Returns:
        The parsed entries (each with a fresh UUID4 id); an empty list when
        nothing usable is found or an unexpected error occurs.
    """
    max_english_len = 100
    max_german_len = 200
    # Whitespace control characters that must be escaped inside JSON strings
    control_escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

    def clean_json_string(s: str) -> str:
        """Drop stray control characters and escape raw newlines/tabs inside strings.

        Newlines and tabs *between* tokens are legal JSON whitespace and are
        kept as-is; escaping them there would make the document unparseable.
        """
        out = []
        in_string = False
        escaped = False
        for ch in s:
            if ch < ' ' and ch not in control_escapes:
                continue  # other control characters are never valid; drop them
            if not in_string:
                if ch == '"':
                    in_string = True
                out.append(ch)
            elif escaped:
                escaped = False
                out.append(control_escapes.get(ch, ch))
            elif ch == '\\':
                escaped = True
                out.append(ch)
            elif ch == '"':
                in_string = False
                out.append(ch)
            else:
                out.append(control_escapes.get(ch, ch))
        return ''.join(out)

    def fix_common_issues(s: str) -> str:
        """Remove trailing commas before } or ] and quote bare object keys."""
        fixed = re.sub(r',(\s*[}\]])', r'\1', s)
        return re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)

    def try_parse_json(json_str: str) -> "dict | None":
        """Try progressively more aggressive repairs; return None if all fail."""
        def candidates():
            yield json_str
            cleaned = clean_json_string(json_str)
            yield cleaned
            yield fix_common_issues(json_str)
            yield fix_common_issues(cleaned)

        for candidate in candidates():
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue
        return None

    def as_text(value) -> str:
        """Return the stripped string, or '' for null / non-string values."""
        return value.strip() if isinstance(value, str) else ""

    def make_entry(english, german, example=None, word_type=None):
        """Validate one raw entry; return a VocabularyEntry or None to skip it."""
        english = as_text(english)
        german = as_text(german)
        # Skip entries that look like hallucinations (very long values)
        if len(english) > max_english_len or len(german) > max_german_len:
            logger.warning(f"Skipping suspicious entry: {english[:50]}...")
            return None
        if not english or not german:
            return None
        return VocabularyEntry(
            id=str(uuid.uuid4()),
            english=english,
            german=german,
            example_sentence=example if isinstance(example, str) else None,
            word_type=word_type if isinstance(word_type, str) else None,
        )

    try:
        if not text:
            logger.warning("No JSON found in response")
            return []

        # Find JSON in response (may have extra text)
        start = text.find('{')
        if start == -1:
            logger.warning("No JSON found in response")
            return []
        end = text.rfind('}') + 1
        # Without a closing brace after the opening one (truncated output) keep
        # the tail anyway so the regex fallback still gets a chance.
        json_str = text[start:end] if end > start else text[start:]

        data = try_parse_json(json_str)
        vocabulary = []

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            for english, german, example in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                vocab_entry = make_entry(english, german, example.strip() or None)
                if vocab_entry is not None:
                    vocabulary.append(vocab_entry)
            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        raw_entries = data.get("vocabulary") if isinstance(data, dict) else None
        if not isinstance(raw_entries, list):
            logger.warning("Parsed JSON contains no vocabulary list")
            return []

        for raw in raw_entries:
            if not isinstance(raw, dict):
                continue
            vocab_entry = make_entry(
                raw.get("english"),
                raw.get("german"),
                raw.get("example"),
                raw.get("word_type"),
            )
            if vocab_entry is not None:
                vocabulary.append(vocab_entry)
        return vocabulary

    except Exception as e:
        # Top-level boundary: extraction must never crash the page pipeline.
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
@@ -1,260 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/generation.py
-Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
+import importlib as _importlib
-
+import sys as _sys
-Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.generation")
 Functions:
  - generate_worksheet_html(): Build HTML for various worksheet types
  - generate_worksheet_pdf():  Convert HTML to PDF via WeasyPrint
  - get_pdf_page_count():      Count pages in a PDF (PyMuPDF)
  - convert_pdf_page_to_image(): Render single PDF page to PNG
  - convert_pdf_to_images():     Render multiple PDF pages to PNG
 """
 import io
 import logging
 import os
 from typing import List, Optional
 from fastapi import HTTPException
 from vocab_worksheet_models import VocabularyEntry, WorksheetType
 logger = logging.getLogger(__name__)
 # Optional dependency: WeasyPrint
 try:
    from weasyprint import HTML as _WeasyHTML
    WEASYPRINT_AVAILABLE = True
 except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")
 # Optional dependency: PyMuPDF
 try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
 except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")
 # =============================================================================
 # Worksheet HTML Generation
 # =============================================================================
 def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
 ) -> str:
    """Generate the HTML document for a single worksheet.

    ``title`` and all vocabulary text are HTML-escaped before being placed in
    the markup, because they originate from user input and OCR/LLM output.

    Args:
        vocabulary: Entries to render.
        worksheet_type: Layout to produce. Sections exist for EN_TO_DE,
            DE_TO_EN, COPY_PRACTICE and GAP_FILL; any other value yields a
            page with only the title and name line.
        title: Heading shown at the top of the page.
        show_solutions: Render the answers instead of blanks.
        repetitions: For COPY_PRACTICE with solutions, how many times the word
            is written out.
        line_height: "normal", "large" or "extra-large"; unknown values fall
            back to the "normal" height.

    Returns:
        A complete HTML document as a string.
    """
    from html import escape as _esc

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    page_head = f"""<!DOCTYPE html>
 <html>
 <head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
 </head>
 <body>
    <h1>{_esc(title)}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
 """
    # Collect fragments and join once instead of repeated string concatenation.
    parts = [page_head]

    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            parts.append(_translation_row(entry.english, entry.german, show_solutions))
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Englische:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            parts.append(_translation_row(entry.german, entry.english, show_solutions))
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            english = _esc(entry.english)
            parts.append(f'<tr><td class="vocab-word">{english}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {english} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                gap_sentence = _gap_sentence_html(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {_esc(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({_esc(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return ''.join(parts)


 def _translation_row(prompt: str, answer: str, show_solutions: bool) -> str:
    """Return one escaped table row: the prompt plus either the answer or a blank."""
    from html import escape as _esc

    if show_solutions:
        return f'<tr><td class="vocab-word">{_esc(prompt)}</td><td class="vocab-answer">{_esc(answer)}</td></tr>'
    return f'<tr><td class="vocab-word">{_esc(prompt)}</td><td class="vocab-blank"></td></tr>'


 def _gap_sentence_html(sentence: str, english: str) -> str:
    """Return the escaped sentence with the first matching word of ``english`` blanked.

    The words of ``english`` are tried in order; the first one that occurs in
    the sentence (case-insensitive, substring match so inflected forms such as
    "achieved" still match "achieve") has all of its occurrences replaced by a
    gap span. Matching runs once on the original sentence, so the inserted
    markup itself can never be matched and corrupted. If no word occurs, the
    escaped sentence is returned unchanged.
    """
    import re
    from html import escape as _esc

    gap = '<span class="gap"></span>'
    for word in english.split():
        # The pattern has no capture groups, so split() yields only the text
        # between matches; more than one piece means the word was found.
        pieces = re.split(re.escape(word), sentence, flags=re.IGNORECASE)
        if len(pieces) > 1:
            return gap.join(_esc(piece) for piece in pieces)
    return _esc(sentence)
 # =============================================================================
 # Worksheet PDF Generation
 # =============================================================================
 async def generate_worksheet_pdf(html: str) -> bytes:
    """Render HTML to PDF with WeasyPrint.

    Args:
        html: Complete HTML document.

    Returns:
        The PDF bytes. If WeasyPrint cannot be loaded, the UTF-8 encoded HTML
        is returned instead as a best-effort fallback.

    Raises:
        Exception: Any error raised while rendering is logged and re-raised.
    """
    # Importing WeasyPrint raises OSError (not ImportError) when its native
    # libraries are missing; the module-level availability check treats both
    # as "not available", so the fallback here does the same.
    try:
        from weasyprint import HTML
    except (ImportError, OSError):
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')

    try:
        return HTML(string=html).write_pdf()
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise
 # =============================================================================
 # PDF Utilities (PyMuPDF)
 # =============================================================================
 def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return the number of pages in a PDF, or 0 if it cannot be determined.

    Args:
        pdf_data: PDF file content as bytes.

    Returns:
        The page count; 0 on any failure (unreadable data, PyMuPDF missing),
        in which case the error is logged.
    """
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            return pdf_document.page_count
        finally:
            # Release the document even if reading the page count fails.
            pdf_document.close()
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
 async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 for any other
            failure (empty PDF, page out of range, unreadable data).
    """
    try:
        import fitz  # PyMuPDF
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")
            if page_number >= pdf_document.page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")
            page = pdf_document[page_number]
            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close on every path; the validation errors above used to leak
            # the open document.
            pdf_document.close()
        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data
    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
 async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.
            Page numbers at or beyond the page count are skipped silently.

    Returns:
        One PNG byte string per converted page, in the order requested.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 for any other
            failure (empty PDF, unreadable data).
    """
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")
            # If no pages specified, convert all
            if pages is None:
                pages = list(range(pdf_document.page_count))
            images = []
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)
            for page_num in pages:
                if page_num < pdf_document.page_count:
                    page = pdf_document[page_num]
                    pix = page.get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
        finally:
            # Close on every path, including the error paths above.
            pdf_document.close()
        logger.info(f"Converted {len(images)} PDF pages to images")
        return images
    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
@@ -1,86 +1,4 @@
-"""Pydantic models and enums for the Vocab Worksheet API."""
+# Backward-compat shim -- module moved to vocab/worksheet/models.py
-
+import importlib as _importlib
-from datetime import datetime
+import sys as _sys
-from enum import Enum
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.models")
 from typing import List, Optional
 from pydantic import BaseModel
 # =============================================================================
 # Enums
 # =============================================================================
 class WorksheetType(str, Enum):
    # Kinds of worksheet that can be generated. Members are also plain
    # strings (str mixin), so they compare equal to their string value.
    EN_TO_DE = "en_to_de"          # English -> German translation
    DE_TO_EN = "de_to_en"          # German -> English translation
    COPY_PRACTICE = "copy"         # Write word multiple times
    GAP_FILL = "gap_fill"          # Fill in the blanks
    COMBINED = "combined"          # All types combined
 class SessionStatus(str, Enum):
    # Processing states of a vocab session. Members are also plain strings
    # (str mixin). NOTE(review): the order below looks like the intended
    # progression, but nothing in this class enforces transitions.
    PENDING = "pending"            # Session created, no upload yet
    PROCESSING = "processing"      # OCR in progress
    EXTRACTED = "extracted"        # Vocabulary extracted, ready to edit
    COMPLETED = "completed"        # Worksheet generated
 # =============================================================================
 # Pydantic Models
 # =============================================================================
 class VocabularyEntry(BaseModel):
    # A single vocabulary item: an English/German word pair plus optional
    # metadata. Only id, english and german are required.
    id: str  # Unique identifier of the entry
    english: str  # English word or phrase
    german: str  # German translation
    example_sentence: Optional[str] = None  # Example sentence, if any
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)
 class SessionCreate(BaseModel):
    # Request payload for creating a session; only `name` is required.
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)
 class SessionResponse(BaseModel):
    # Session data as returned to clients. The Optional fields have no
    # default, so they may be None but are not given a fallback value here.
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str  # presumably one of the SessionStatus values — confirm against callers
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime
 class VocabularyResponse(BaseModel):
    # Vocabulary entries of one session together with the extraction confidence.
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]  # may be None; value range not defined here
 class VocabularyUpdate(BaseModel):
    # Request payload carrying a list of vocabulary entries.
    # NOTE(review): presumably replaces the session's whole list — confirm in the handler.
    vocabulary: List[VocabularyEntry]
 class WorksheetGenerateRequest(BaseModel):
    # Request payload for worksheet generation; only `worksheet_types` is required.
    worksheet_types: List[WorksheetType]  # One or more worksheet kinds to generate
    title: Optional[str] = None
    include_solutions: bool = True
    repetitions: int = 3  # For copy practice
    line_height: str = "normal"  # normal, large, extra-large
 class WorksheetResponse(BaseModel):
    # Metadata describing a generated worksheet.
    id: str
    session_id: str
    worksheet_types: List[str]  # plain strings here, not WorksheetType members
    pdf_path: str
    solution_path: Optional[str]  # may be None; has no default value
    generated_at: datetime
@@ -1,481 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/ocr.py
-Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
+import importlib as _importlib
-
+import sys as _sys
-Extracted from vocab_worksheet_api.py to keep file sizes manageable.
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.ocr")
 Pipeline steps:
  orientation → deskew → dewarp → crop → scan-quality → enhance →
  dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
  vocab extraction → row merging
 """
 import logging
 import uuid
 from typing import Optional
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Optional heavy dependencies (not available in every environment)
 # ---------------------------------------------------------------------------
 try:
    import cv2
    import numpy as np
 except ImportError:
    cv2 = None  # type: ignore[assignment]
    np = None  # type: ignore[assignment]
    logger.warning("cv2 / numpy not available — OCR pipeline disabled")
 try:
    from PIL import Image
 except ImportError:
    Image = None  # type: ignore[assignment]
 try:
    import pytesseract
 except ImportError:
    pytesseract = None  # type: ignore[assignment]
 # CV pipeline helpers
 try:
    from cv_vocab_pipeline import (
        deskew_two_pass,
        dewarp_image,
        detect_and_fix_orientation,
        _cells_to_vocab_entries,
        _fix_phonetic_brackets,
    )
 except ImportError:
    deskew_two_pass = None  # type: ignore[assignment]
    dewarp_image = None  # type: ignore[assignment]
    detect_and_fix_orientation = None  # type: ignore[assignment]
    _cells_to_vocab_entries = None  # type: ignore[assignment]
    _fix_phonetic_brackets = None  # type: ignore[assignment]
 try:
    from cv_cell_grid import (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    )
 except ImportError:
    _merge_wrapped_rows = None  # type: ignore[assignment]
    _merge_phonetic_continuation_rows = None  # type: ignore[assignment]
    _merge_continuation_rows = None  # type: ignore[assignment]
 try:
    from cv_ocr_engines import ocr_region_rapid
 except ImportError:
    ocr_region_rapid = None  # type: ignore[assignment]
 try:
    from cv_vocab_types import PageRegion
 except ImportError:
    PageRegion = None  # type: ignore[assignment]
 try:
    from ocr_pipeline_ocr_merge import (
        _split_paddle_multi_words,
        _merge_paddle_tesseract,
        _deduplicate_words,
    )
 except ImportError:
    _split_paddle_multi_words = None  # type: ignore[assignment]
    _merge_paddle_tesseract = None  # type: ignore[assignment]
    _deduplicate_words = None  # type: ignore[assignment]
 try:
    from cv_words_first import build_grid_from_words
 except ImportError:
    build_grid_from_words = None  # type: ignore[assignment]
 try:
    from ocr_pipeline_session_store import (
        create_session_db as create_pipeline_session_db,
        update_session_db as update_pipeline_session_db,
    )
 except ImportError:
    create_pipeline_session_db = None  # type: ignore[assignment]
    update_pipeline_session_db = None  # type: ignore[assignment]
 # ---------------------------------------------------------------------------
 # Main pipeline function
 # ---------------------------------------------------------------------------
 async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
 ) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.
    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
    Returns (entries, rotation_deg) where entries is a list of dicts and
    rotation_deg is the orientation correction applied (0, 90, 180, 270).
    """
    import time as _time
    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f"  orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f"  orientation: OK ({_time.time() - t0:.1f}s)")
    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")
    # 3. Three-pass deskew
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    logger.info(f"  deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f"  dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f"  crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f"  crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f"  crop: failed ({e}), continuing with uncropped image")
    # 5b. Scan quality assessment
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f"  scan quality: failed ({e})")
    if override_min_conf:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info("  enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f"  enhancement: failed ({e})")
    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]
    # RapidOCR (local ONNX)
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f"  RapidOCR failed: {e}")
        rapid_words = []
    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })
    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only
    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f"  ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }
    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dwp_buf.tobytes(),
            cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")
    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f"  grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")
        # Save grid result to pipeline session
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception:
            pass
    except Exception as e:
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None
    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"
    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue
            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
            col_idx_to_pos = {}
            for pos, col in enumerate(sorted_cols):
                ci = col.get("col_index", col.get("index", -1))
                col_idx_to_pos[ci] = pos
            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue
            # Group cells by row
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map:
                    rows_map[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map[ri][ci] = (cell.get("text") or "").strip()
            n_cols = len(sorted_cols)
            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = []
                for col in sorted_cols:
                    ci = col.get("col_index", col.get("index", -1))
                    texts.append(row.get(ci, ""))
                if not any(texts):
                    continue
                # Map by position, skipping narrow first column (page refs/markers)
                # Heuristic: if first column is very narrow (<15% of zone width),
                # it's likely a marker/ref column — skip it for vocab
                first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
                zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
                skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
                data_texts = texts[1:] if skip_first else texts
                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)
        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})
        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map2:
                    rows_map2[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map2[ri][ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"
    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = []
            for v in page_vocabulary:
                internal.append({
                    'row_index': len(internal),
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                })
            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)
            if len(internal) < n_before:
                # Rebuild page_vocabulary from merged entries
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f"  row merging: {n_before} → {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f"  row merging failed (non-critical): {e}")
    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
    return page_vocabulary, rotation, scan_quality_report
@@ -1,490 +1,4 @@
-"""
+# Backward-compat shim -- module moved to vocab/worksheet/upload_api.py
-Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
+import importlib as _importlib
-
+import sys as _sys
-Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.upload_api")
 Routes (no prefix — included into the main /api/v1/vocab router):
  POST /sessions/{session_id}/upload-pdf-info
  GET  /sessions/{session_id}/pdf-thumbnail/{page_number}
  GET  /sessions/{session_id}/pdf-page-image/{page_number}
  POST /sessions/{session_id}/process-single-page/{page_number}
  POST /sessions/{session_id}/process-pages
 """
 import io
 import logging
 import os
 import uuid
 from typing import List, Optional
 from fastapi import APIRouter, HTTPException, Query, UploadFile, File
 from fastapi.responses import StreamingResponse
 from vocab_worksheet_models import SessionStatus, VocabularyEntry
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Local storage path
 # ---------------------------------------------------------------------------
 LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
 # ---------------------------------------------------------------------------
 # Optional heavy dependencies
 # ---------------------------------------------------------------------------
 try:
    import numpy as np
    from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
    OCR_PIPELINE_AVAILABLE = True
 except ImportError:
    np = None  # type: ignore[assignment]
    OCR_PIPELINE_AVAILABLE = False
    logger.warning("OCR pipeline imports not available in upload module")
 # Sub-module imports (already split out)
 from vocab_worksheet_generation import (
    convert_pdf_page_to_image,
    convert_pdf_to_images,
    get_pdf_page_count,
 )
 from vocab_worksheet_extraction import extract_vocabulary_from_image
 try:
    from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
 except ImportError:
    _run_ocr_pipeline_for_page = None  # type: ignore[assignment]
    logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
 # ---------------------------------------------------------------------------
 # In-memory session store (shared with main module)
 # ---------------------------------------------------------------------------
 def _get_sessions():
    """Return the session store defined in ``vocab_worksheet_api``.

    The import happens at call time rather than at module import time
    (presumably to avoid a circular import with the main API module --
    verify against ``vocab_worksheet_api``).
    """
    from vocab_worksheet_api import _sessions as session_store
    return session_store
 # ---------------------------------------------------------------------------
 # Router (no prefix — will be included into the main vocab router)
 # ---------------------------------------------------------------------------
 upload_router = APIRouter()
 # =============================================================================
 # POST /sessions/{session_id}/upload-pdf-info
 # =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
 async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
 ):
    """
    Accept a PDF upload for a session and report its page count.

    Intended as the first step before page processing, so the user can
    choose pages. The raw bytes are written to ``source.pdf`` inside the
    session's storage directory and are also kept in the session itself.
    If the OCR pipeline imports are available, each page is rendered once
    to detect its orientation; only non-zero rotations are recorded.

    Returns:
        Dict with ``session_id``, ``page_count``, ``filename`` and
        ``page_rotations`` (page index -> detected rotation).

    Raises:
        HTTPException: 404 if the session is unknown, 400 if neither the
            file extension nor the content type identifies a PDF.
    """
    logger.info(f"PDF info request for session {session_id}")
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]
    # The upload passes when EITHER the suffix or the MIME type says PDF
    suffix = file.filename.split('.')[-1].lower() if file.filename else ''
    mime_type = file.content_type or ''
    looks_like_pdf = suffix == 'pdf' or mime_type == 'application/pdf'
    if not looks_like_pdf:
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")
    pdf_bytes = await file.read()
    # Keep a copy on disk in the per-session storage directory
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)
    stored_path = os.path.join(target_dir, "source.pdf")
    with open(stored_path, 'wb') as out_file:
        out_file.write(pdf_bytes)
    n_pages = get_pdf_page_count(pdf_bytes)
    # The later per-page endpoints read these keys from the session
    session.update({
        "pdf_data": pdf_bytes,
        "pdf_path": stored_path,
        "pdf_page_count": n_pages,
        "status": "pdf_uploaded",
    })
    # Orientation is probed per page so thumbnails can be shown upright;
    # a failure on one page is logged and does not abort the upload
    detected_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for page_idx in range(n_pages):
            try:
                rendered = render_pdf_high_res(pdf_bytes, page_idx, zoom=2.0)
                _, angle = detect_and_fix_orientation(rendered)
                if angle:
                    detected_rotations[page_idx] = angle
                    logger.info(f"Page {page_idx + 1}: orientation {angle}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {page_idx + 1}: {e}")
    session["page_rotations"] = detected_rotations
    return {
        "session_id": session_id,
        "page_count": n_pages,
        "filename": file.filename,
        "page_rotations": detected_rotations,
    }
 # =============================================================================
 # GET /sessions/{session_id}/pdf-thumbnail/{page_number}
 # =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
 async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        session_id: Key of the session in the in-memory session store.
        page_number: 0-indexed page to render.
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Returns:
        StreamingResponse carrying the rendered page as ``image/png``.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if no PDF was
            uploaded or ``page_number`` is out of range; 500 if rendering fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    # Validate the index before rendering (same rule as the pdf-page-image
    # endpoint). PyMuPDF accepts negative indices counting from the end, so
    # without this check -1 would silently render the last page, and an index
    # past the end would surface as a 500 instead of a client error.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction stored in the session for this page
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Release the document even when rendering raises
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
 # =============================================================================
 # GET /sessions/{session_id}/pdf-page-image/{page_number}
 # =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
 async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        session_id: Key of the session in the in-memory session store.
        page_number: 0-indexed page to render.
        zoom: Zoom factor passed to ``fitz.Matrix``. PyMuPDF renders at
            72 DPI for zoom 1.0, so 0.5=36 DPI, 1.0=72 DPI, 2.0=144 DPI,
            4.0=288 DPI.

    Returns:
        StreamingResponse carrying the rendered page as ``image/png``.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if no PDF was
            uploaded or ``page_number`` is out of range; 500 if rendering fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction stored in the session for this page
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Release the document even when rendering raises
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
 # =============================================================================
 # POST /sessions/{session_id}/process-single-page/{page_number}
 # =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
 async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
 ):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
    When the pipeline imports are unavailable, falls back to LLM vision
    extraction via ``extract_vocabulary_from_image``.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.

    Returns:
        Dict for just this one page. On success it holds ``vocabulary``,
        ``vocabulary_count``, ``total_vocabulary_count``,
        ``extraction_confidence``, ``rotation`` and, if the pipeline produced
        a quality report, ``scan_quality``. Extraction failures are reported
        as a returned dict with ``success`` False and an ``error`` message
        (not as an HTTPException).

    Raises:
        HTTPException: 404 if the session is unknown; 400 if no PDF was
            uploaded or ``page_number`` is out of range.

    Side effects:
        Replaces this page's entries in ``session["vocabulary"]``, updates
        ``vocabulary_count`` and ``status``, and stores the page rotation
        under ``session["page_rotations"]``.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    # Derive pipeline-level variable names for the quality report
    # (0 is the "not set" sentinel for max_cols and min_conf, mapped to None)
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None
    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            # Pipeline errors are reported in the response body; the session
            # is left untouched because we return before any of the updates below
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        # NOTE(review): `confidence` is not used below; the response always
        # reports extraction_confidence 0.9 -- confirm this is intended.
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        page_vocabulary = []
        for entry in vocabulary:
            # Normalise each entry to a plain dict: .dict() if present,
            # else a copy of __dict__, else dict(entry)
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)
    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
    # Store rotation for this page (used by image/thumbnail endpoints)
    # NOTE(review): on the fallback path rotation_deg is still its initial 0,
    # so this overwrites any rotation stored for the page at upload time --
    # confirm whether that is intended.
    session.setdefault("page_rotations", {})[page_number] = rotation_deg
    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value
    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }
    # Add scan quality report + active steps info
    # (quality_report is only set on the OCR pipeline path)
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq
    return result
 # =============================================================================
 # POST /sessions/{session_id}/process-pages  (DEPRECATED)
 # =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
 async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
 ):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        session_id: Key of the session in the in-memory session store.
        pages: List of 0-indexed page numbers to process. Defaults to the
            first page when omitted or empty.
        process_all: If True, process all pages (``pages`` is ignored).

    Returns:
        Summary dict with processed/successful/failed page counts, the
        1-indexed ``successful_pages`` / ``failed_pages`` lists,
        ``vocabulary_count``, the average ``extraction_confidence`` over
        successful pages, ``status`` and, if any page failed, ``errors``.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if no PDF was
            uploaded or a requested page number is out of range.

    Side effects:
        Replaces the session's vocabulary with the result of this run and
        writes the first rendered page to ``source.png`` in the session's
        storage directory.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page
    # Reject out-of-range pages up front, like the single-page endpoints do;
    # otherwise the failure only shows up inside the PDF conversion, or the
    # page numbers attached to the results may no longer match the images
    invalid_pages = [p for p in pages if p < 0 or p >= page_count]
    if invalid_pages:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid page number(s) {invalid_pages}. PDF has {page_count} pages (0-indexed).",
        )
    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)
    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []
    for page_num, image_data in zip(pages, images):
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
            continue
        successful_pages.append(page_num + 1)
        total_confidence += confidence
        # Add page info to each entry and convert to dict
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_num + 1
            # Same id fallback as process_single_page, so every stored entry is addressable
            if not entry_dict.get('id'):
                entry_dict['id'] = str(uuid.uuid4())
            all_vocabulary.append(entry_dict)
        logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")
    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0
    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value
    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        # The directory is normally created at upload time; recreate it if it
        # has been removed since, so the preview write cannot fail on it
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path
    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages
    return result