From 17f0fdb2ed60e0e0f273b79dd733abe690f9d9ca Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Wed, 15 Apr 2026 08:54:55 +0200
Subject: [PATCH] Refactor: extract _build_grid_core into grid_build_core.py +
 clean StepAnsicht
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

grid_editor_api.py: 2411 → 474 lines
- Extracted _build_grid_core() (1892 lines) into grid_build_core.py
- API file now only contains endpoints (build, save, get, gutter, box, unified)

StepAnsicht.tsx: 212 → 112 lines
- Removed useGridEditor imports (not needed for read-only spreadsheet)
- Removed unified grid fetch/build (not used with multi-sheet approach)
- Removed Spreadsheet/Grid toggle (only spreadsheet mode now)
- Simple: fetch grid-editor data → pass to SpreadsheetView

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../components/ocr-kombi/StepAnsicht.tsx      |  172 +-
 klausur-service/backend/grid_build_core.py    | 1943 ++++++++++++++++
 klausur-service/backend/grid_editor_api.py    | 1945 +----------------
 3 files changed, 1982 insertions(+), 2078 deletions(-)
 create mode 100644 klausur-service/backend/grid_build_core.py
diff --git a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
index 74bd858..6633df1 100644
--- a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
+++ b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
@@ -1,19 +1,15 @@
 'use client'
 
 /**
- * StepAnsicht — Unified Grid View.
+ * StepAnsicht — Excel-like Spreadsheet View.
  *
  * Left:  Original scan with OCR word overlay
- * Right: Unified grid (single zone, boxes integrated) rendered via GridTable
+ * Right: Fortune Sheet spreadsheet with multi-sheet tabs per zone
  */
 
-import { useCallback, useEffect, useRef, useState } from 'react'
+import { useEffect, useRef, useState } from 'react'
 import dynamic from 'next/dynamic'
-import { useGridEditor } from '@/components/grid-editor/useGridEditor'
-import { GridTable } from '@/components/grid-editor/GridTable'
-import type { GridZone } from '@/components/grid-editor/types'
 
-// Lazy-load SpreadsheetView (Fortune Sheet, SSR-incompatible)
 const SpreadsheetView = dynamic(
   () => import('./SpreadsheetView').then((m) => m.SpreadsheetView),
   { ssr: false, loading: () => <div className="py-8 text-center text-sm text-gray-400">Spreadsheet wird geladen...</div> },
@@ -27,67 +23,29 @@ interface StepAnsichtProps {
 }
 
 export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
-  const gridEditor = useGridEditor(sessionId)
-  const {
-    loading, error, selectedCell, setSelectedCell,
-    updateCellText, toggleColumnBold, toggleRowHeader,
-    getAdjacentCell, deleteColumn, addColumn, deleteRow, addRow,
-    commitUndoPoint, selectedCells, toggleCellSelection,
-    clearCellSelection, toggleSelectedBold, setCellColor,
-    saveGrid, saving, dirty, undo, redo, canUndo, canRedo,
-  } = gridEditor
-
-  const [unifiedGrid, setUnifiedGrid] = useState<any>(null)
-  const [building, setBuilding] = useState(false)
-  const [buildError, setBuildError] = useState<string | null>(null)
+  const [gridData, setGridData] = useState<any>(null)
+  const [loading, setLoading] = useState(true)
+  const [error, setError] = useState<string | null>(null)
   const leftRef = useRef<HTMLDivElement>(null)
   const [leftHeight, setLeftHeight] = useState(600)
-  const [viewMode, setViewMode] = useState<'spreadsheet' | 'grid'>('spreadsheet')
 
-  // Build unified grid
-  const buildUnified = useCallback(async () => {
-    if (!sessionId) return
-    setBuilding(true)
-    setBuildError(null)
-    try {
-      const res = await fetch(
-        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/build-unified-grid`,
-        { method: 'POST' },
-      )
-      if (!res.ok) {
-        const d = await res.json().catch(() => ({}))
-        throw new Error(d.detail || `HTTP ${res.status}`)
-      }
-      const data = await res.json()
-      setUnifiedGrid(data)
-    } catch (e) {
-      setBuildError(e instanceof Error ? e.message : String(e))
-    } finally {
-      setBuilding(false)
-    }
-  }, [sessionId])
-
-  // Load both grids on mount
+  // Load grid data on mount
   useEffect(() => {
     if (!sessionId) return
-    // Load multi-zone grid (for spreadsheet mode)
-    gridEditor.loadGrid()
-    // Load unified grid (for grid mode)
     ;(async () => {
       try {
-        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/unified-grid`)
-        if (res.ok) {
-          setUnifiedGrid(await res.json())
-        } else {
-          buildUnified()
-        }
-      } catch {
-        buildUnified()
+        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/grid-editor`)
+        if (!res.ok) throw new Error(`HTTP ${res.status}`)
+        setGridData(await res.json())
+      } catch (e) {
+        setError(e instanceof Error ? e.message : 'Fehler beim Laden')
+      } finally {
+        setLoading(false)
       }
     })()
-  }, [sessionId]) // eslint-disable-line react-hooks/exhaustive-deps
+  }, [sessionId])
 
-  // Track left panel height for sync
+  // Track left panel height
   useEffect(() => {
     if (!leftRef.current) return
     const ro = new ResizeObserver(([e]) => setLeftHeight(e.contentRect.height))
@@ -95,13 +53,20 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
     return () => ro.disconnect()
   }, [])
 
-  const unifiedZone: GridZone | null = unifiedGrid?.zones?.[0] ?? null
-
-  if (loading || building) {
+  if (loading) {
     return (
       <div className="flex items-center justify-center py-16">
         <div className="w-8 h-8 border-4 border-teal-500 border-t-transparent rounded-full animate-spin" />
-        <span className="ml-3 text-gray-500">{building ? 'Baue Unified Grid...' : 'Lade...'}</span>
+        <span className="ml-3 text-gray-500">Lade Spreadsheet...</span>
+      </div>
+    )
+  }
+
+  if (error || !gridData) {
+    return (
+      <div className="p-8 text-center">
+        <p className="text-red-500 mb-4">{error || 'Keine Grid-Daten.'}</p>
+        <button onClick={onNext} className="px-5 py-2 bg-teal-600 text-white rounded-lg">Weiter →</button>
       </div>
     )
   }
@@ -111,51 +76,16 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
       {/* Header */}
       <div className="flex items-center justify-between">
         <div>
-          <h3 className="text-lg font-semibold text-gray-900 dark:text-white">Ansicht — Unified Grid</h3>
+          <h3 className="text-lg font-semibold text-gray-900 dark:text-white">Ansicht — Spreadsheet</h3>
           <p className="text-sm text-gray-500 dark:text-gray-400">
-            Alle Inhalte in einem Grid. Boxen sind integriert (farbig markiert).
-            {unifiedGrid && (
-              <span className="ml-2 font-mono text-xs">
-                {unifiedGrid.summary?.total_rows} Zeilen × {unifiedGrid.summary?.total_columns} Spalten
-                {unifiedGrid.dominant_row_h && ` · Zeilenhöhe: ${Math.round(unifiedGrid.dominant_row_h)}px`}
-              </span>
-            )}
+            Jede Zone als eigenes Sheet-Tab. Spaltenbreiten pro Sheet optimiert.
           </p>
         </div>
-        <div className="flex items-center gap-2">
-          <div className="flex rounded-lg overflow-hidden border border-gray-300 dark:border-gray-600">
-            <button
-              onClick={() => setViewMode('spreadsheet')}
-              className={`px-3 py-1.5 text-xs font-medium ${viewMode === 'spreadsheet' ? 'bg-teal-600 text-white' : 'bg-white dark:bg-gray-700 text-gray-600 dark:text-gray-300'}`}
-            >
-              Spreadsheet
-            </button>
-            <button
-              onClick={() => setViewMode('grid')}
-              className={`px-3 py-1.5 text-xs font-medium ${viewMode === 'grid' ? 'bg-teal-600 text-white' : 'bg-white dark:bg-gray-700 text-gray-600 dark:text-gray-300'}`}
-            >
-              Grid
-            </button>
-          </div>
-          <button
-            onClick={buildUnified}
-            disabled={building}
-            className="px-3 py-1.5 bg-amber-600 text-white rounded-lg hover:bg-amber-700 text-xs font-medium disabled:opacity-50"
-          >
-            {building ? 'Baut...' : 'Neu aufbauen'}
-          </button>
-          <button onClick={onNext} className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 text-sm font-medium">
-            Weiter →
-          </button>
-        </div>
+        <button onClick={onNext} className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 text-sm font-medium">
+          Weiter →
+        </button>
       </div>
 
-      {(error || buildError) && (
-        <div className="p-3 bg-red-50 dark:bg-red-900/30 border border-red-200 dark:border-red-800 rounded-lg text-red-700 dark:text-red-300 text-sm">
-          {error || buildError}
-        </div>
-      )}
-
       {/* Split view */}
       <div className="flex gap-2">
         {/* LEFT: Original + OCR overlay */}
@@ -170,41 +100,9 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
           )}
         </div>
 
-        {/* RIGHT: Spreadsheet or Grid view */}
-        <div className="flex-1 border border-gray-300 dark:border-gray-600 rounded-lg overflow-hidden bg-white dark:bg-gray-900" style={{ maxHeight: `${Math.max(700, leftHeight)}px` }}>
-          {viewMode === 'spreadsheet' && (unifiedGrid || gridEditor.grid) ? (
-            <SpreadsheetView gridData={gridEditor.grid} height={Math.max(650, leftHeight - 10)} />
-          ) : viewMode === 'grid' && unifiedZone ? (
-            <div className="overflow-auto h-full">
-              <div className="px-2 py-1 bg-teal-600/80 text-white text-[10px] font-medium sticky top-0 z-20">
-                Grid View ({unifiedGrid?.summary?.total_rows}×{unifiedGrid?.summary?.total_columns})
-              </div>
-              <GridTable
-                zone={unifiedZone}
-                selectedCell={selectedCell}
-                selectedCells={selectedCells}
-                onSelectCell={setSelectedCell}
-                onCellTextChange={updateCellText}
-                onToggleColumnBold={toggleColumnBold}
-                onToggleRowHeader={toggleRowHeader}
-                onNavigate={(cellId, dir) => {
-                  const next = getAdjacentCell(cellId, dir)
-                  if (next) setSelectedCell(next)
-                }}
-                onDeleteColumn={deleteColumn}
-                onAddColumn={addColumn}
-                onDeleteRow={deleteRow}
-                onAddRow={addRow}
-                onToggleCellSelection={toggleCellSelection}
-                onSetCellColor={setCellColor}
-              />
-            </div>
-          ) : (
-            <div className="p-8 text-center text-gray-400">
-              <p>Kein Unified Grid verfügbar.</p>
-              <button onClick={buildUnified} className="mt-2 text-teal-600 text-sm">Jetzt aufbauen</button>
-            </div>
-          )}
+        {/* RIGHT: Fortune Sheet */}
+        <div className="flex-1 border border-gray-300 dark:border-gray-600 rounded-lg overflow-hidden bg-white dark:bg-gray-900">
+          <SpreadsheetView gridData={gridData} height={Math.max(650, leftHeight - 10)} />
         </div>
       </div>
     </div>
diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py
new file mode 100644
index 0000000..cab5277
--- /dev/null
+++ b/klausur-service/backend/grid_build_core.py
@@ -0,0 +1,1943 @@
+"""
+Grid Build Core — the main _build_grid_core() function.
+
+Extracted from grid_editor_api.py for maintainability.
+Takes merged OCR word positions and builds a structured, zone-aware grid.
+"""
+
+import logging
+import re
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from cv_box_detect import detect_boxes, split_page_into_zones
+from cv_graphic_detect import detect_graphic_elements
+from cv_vocab_types import PageZone
+from cv_color_detect import detect_word_colors, recover_colored_text
+from cv_ocr_engines import (
+    fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
+    _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines,
+)
+from ocr_pipeline_session_store import get_session_image
+
+from grid_editor_helpers import (
+    _filter_border_strip_words,
+    _cluster_columns_by_alignment,
+    _GRID_GHOST_CHARS,
+    _filter_border_ghosts,
+    _MARKER_CHARS,
+    _merge_inline_marker_columns,
+    _flatten_word_boxes,
+    _words_in_zone,
+    _PIPE_RE_VSPLIT,
+    _detect_vertical_dividers,
+    _split_zone_at_vertical_dividers,
+    _merge_content_zones_across_boxes,
+    _detect_heading_rows_by_color,
+    _detect_heading_rows_by_single_cell,
+    _detect_header_rows,
+    _build_zone_grid,
+    _get_content_bounds,
+    _filter_decorative_margin,
+    _filter_footer_words,
+    _filter_header_junk,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def _build_grid_core(
+    session_id: str,
+    session: dict,
+    *,
+    ipa_mode: str = "auto",
+    syllable_mode: str = "auto",
+) -> dict:
+    """Core grid building logic — no HTTP or DB side effects (it does load the session image).
+
+    Args:
+        session_id: Session identifier (for logging and image loading).
+        session: Full session dict from get_session_db().
+        ipa_mode: "auto" (only when English headwords detected), "all"
+            (force IPA on all content columns), "en" (English column only),
+            "de" (German/definition columns only), or "none" (skip entirely).
+        syllable_mode: "auto" (only when original has pipe dividers),
+            "all" (force syllabification on all words), "en" (English only),
+            "de" (German only), or "none" (skip).
+
+    Returns:
+        StructuredGrid result dict.
+
+    Raises:
+        ValueError: If session data is incomplete.
+    """
+    t0 = time.time()
+
+    # 1. Validate and load word results
+    word_result = session.get("word_result")
+    if not word_result or not word_result.get("cells"):
+        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
+
+    img_w = word_result.get("image_width", 0)
+    img_h = word_result.get("image_height", 0)
+    if not img_w or not img_h:
+        raise ValueError("Missing image dimensions in word_result")
+
+    # 2. Flatten all word boxes from cells
+    all_words = _flatten_word_boxes(word_result["cells"])
+    if not all_words:
+        raise ValueError("No word boxes found in cells")
+
+    logger.info("build-grid session %s: %d words from %d cells",
+                session_id, len(all_words), len(word_result["cells"]))
+
+    # 2b. Filter decorative margin columns (alphabet graphics).
+    # Some worksheets have a decorative alphabet strip along one margin
+    # (A-Z in a graphic).  OCR reads these as single-char words aligned
+    # vertically.  Detect and remove them before grid building.
+    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
+    margin_strip_detected = margin_strip_info.get("found", False)
+
+    # Read document_category from session (user-selected or auto-detected)
+    document_category = session.get("document_category")
+
+    # 2c. Filter footer rows (page numbers at the very bottom).
+    # Isolated short text in the bottom 5% of the page is typically a
+    # page number ("64", "S. 12") and not real content.  The page number
+    # is extracted as metadata for the frontend header display.
+    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
+
+    # 2c2. Filter OCR junk from header illustrations.
+    # Low-confidence short fragments above the first real content row.
+    _filter_header_junk(all_words, img_h, logger, session_id)
+
+    # 2d. Filter words inside user-defined exclude regions (from Structure step).
+    # These are explicitly marked by the user, so ALL words inside are removed
+    # regardless of confidence.
+    structure_result = session.get("structure_result")
+    exclude_rects = []
+    if structure_result:
+        for er in structure_result.get("exclude_regions", []):
+            exclude_rects.append({
+                "x": er["x"], "y": er["y"],
+                "w": er["w"], "h": er["h"],
+            })
+    if exclude_rects:
+        before = len(all_words)
+        filtered = []
+        for w in all_words:
+            w_cx = w["left"] + w.get("width", 0) / 2
+            w_cy = w["top"] + w.get("height", 0) / 2
+            inside = any(
+                er["x"] <= w_cx <= er["x"] + er["w"]
+                and er["y"] <= w_cy <= er["y"] + er["h"]
+                for er in exclude_rects
+            )
+            if not inside:
+                filtered.append(w)
+        removed = before - len(filtered)
+        if removed:
+            all_words = filtered
+            logger.info(
+                "build-grid session %s: removed %d words inside %d user exclude region(s)",
+                session_id, removed, len(exclude_rects),
+            )
+
+    # 2e. Hard-filter words inside graphic/image regions from structure step.
+    # ALL words inside graphic regions are removed regardless of confidence —
+    # images cannot contain real text; any OCR words inside are artifacts.
+    # After image loading (Step 3a) we augment these with freshly detected
+    # graphic regions from cv_graphic_detect.
+    graphic_rects: List[Dict[str, int]] = []
+    if structure_result:
+        for g in structure_result.get("graphics", []):
+            graphic_rects.append({
+                "x": g["x"], "y": g["y"],
+                "w": g["w"], "h": g["h"],
+            })
+    if graphic_rects:
+        before = len(all_words)
+        all_words = [
+            w for w in all_words
+            if not any(
+                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                for gr in graphic_rects
+            )
+        ]
+        removed = before - len(all_words)
+        if removed:
+            logger.info(
+                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
+                session_id, removed, len(graphic_rects),
+            )
+
+    # 3. Load image for box detection
+    img_png = await get_session_image(session_id, "cropped")
+    if not img_png:
+        img_png = await get_session_image(session_id, "dewarped")
+    if not img_png:
+        img_png = await get_session_image(session_id, "original")
+
+    zones_data: List[Dict[str, Any]] = []
+    boxes_detected = 0
+    recovered_count = 0
+    border_prefiltered = False
+    img_bgr = None
+
+    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
+
+    if img_png:
+        # Decode image for color detection + box detection
+        arr = np.frombuffer(img_png, dtype=np.uint8)
+        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+
+        if img_bgr is not None:
+            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
+            # Pass only significant words (len >= 3) to the detector so that
+            # short OCR artifacts inside images don't fool the text-vs-graphic
+            # heuristic (it counts word centroids to distinguish text from images).
+            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
+            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
+            if fresh_graphics:
+                fresh_rects = [
+                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
+                    for g in fresh_graphics
+                ]
+                graphic_rects.extend(fresh_rects)
+                logger.info(
+                    "build-grid session %s: detected %d graphic region(s) via CV",
+                    session_id, len(fresh_graphics),
+                )
+                # Hard-filter words inside newly detected graphic regions
+                before = len(all_words)
+                all_words = [
+                    w for w in all_words
+                    if not any(
+                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                        for gr in fresh_rects
+                    )
+                ]
+                removed = before - len(all_words)
+                if removed:
+                    logger.info(
+                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
+                        session_id, removed, len(fresh_rects),
+                    )
+
+            # --- Recover colored text that OCR missed (before grid building) ---
+            recovered = recover_colored_text(img_bgr, all_words)
+            if recovered and graphic_rects:
+                # Filter recovered chars inside graphic regions
+                recovered = [
+                    r for r in recovered
+                    if not any(
+                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                        for gr in graphic_rects
+                    )
+                ]
+            if recovered:
+                recovered_count = len(recovered)
+                all_words.extend(recovered)
+                logger.info(
+                    "build-grid session %s: +%d recovered colored words",
+                    session_id, recovered_count,
+                )
+
+            # Detect bordered boxes
+            boxes = detect_boxes(
+                img_bgr,
+                content_x=content_x,
+                content_w=content_w,
+                content_y=content_y,
+                content_h=content_h,
+            )
+            boxes_detected = len(boxes)
+
+            if boxes:
+                # Filter border ghost words before grid building
+                all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
+                if ghost_count:
+                    logger.info(
+                        "build-grid session %s: removed %d border ghost words",
+                        session_id, ghost_count,
+                    )
+
+                # Split page into zones
+                page_zones = split_page_into_zones(
+                    content_x, content_y, content_w, content_h, boxes
+                )
+
+                # Merge content zones separated by box zones
+                page_zones = _merge_content_zones_across_boxes(
+                    page_zones, content_x, content_w
+                )
+
+                # 3b. Detect vertical dividers and split content zones
+                vsplit_group_counter = 0
+                expanded_zones: List = []
+                for pz in page_zones:
+                    if pz.zone_type != "content":
+                        expanded_zones.append(pz)
+                        continue
+                    zone_words = _words_in_zone(
+                        all_words, pz.y, pz.height, pz.x, pz.width
+                    )
+                    divider_xs = _detect_vertical_dividers(
+                        zone_words, pz.x, pz.width, pz.y, pz.height
+                    )
+                    if divider_xs:
+                        sub_zones = _split_zone_at_vertical_dividers(
+                            pz, divider_xs, vsplit_group_counter
+                        )
+                        expanded_zones.extend(sub_zones)
+                        vsplit_group_counter += 1
+                        # Remove pipe words so they don't appear in sub-zones
+                        pipe_ids = set(
+                            id(w) for w in zone_words
+                            if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
+                        )
+                        all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
+                        logger.info(
+                            "build-grid: vertical split zone %d at x=%s → %d sub-zones",
+                            pz.index, [int(x) for x in divider_xs], len(sub_zones),
+                        )
+                    else:
+                        expanded_zones.append(pz)
+                # Re-index zones
+                for i, pz in enumerate(expanded_zones):
+                    pz.index = i
+                page_zones = expanded_zones
+
+                # --- Union columns from all content zones ---
+                # Each content zone detects columns independently.  Narrow
+                # columns (page refs, markers) may appear in only one zone.
+                # Merge column split-points from ALL content zones so every
+                # zone shares the full column set.
+                # NOTE: Zones from a vertical split are independent and must
+                # NOT share columns with each other.
+
+                # First pass: build grids per zone independently
+                zone_grids: List[Dict] = []
+
+                for pz in page_zones:
+                    zone_words = _words_in_zone(
+                        all_words, pz.y, pz.height, pz.x, pz.width
+                    )
+                    if pz.zone_type == "content":
+                        logger.info(
+                            "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words",
+                            pz.index, pz.zone_type,
+                            pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
+                            len(zone_words), len(all_words),
+                        )
+                    # Filter short recovered artifacts (<= 2 chars) in ALL zones
+                    # (decorative colored pixel blobs like !, ?, • from
+                    # recover_colored_text that don't represent real text)
+                    before = len(zone_words)
+                    zone_words = [
+                        w for w in zone_words
+                        if not (
+                            w.get("recovered")
+                            and len(w.get("text", "").strip()) <= 2
+                        )
+                    ]
+                    removed = before - len(zone_words)
+                    if removed:
+                        logger.info(
+                            "build-grid: filtered %d recovered artifacts from %s zone %d",
+                            removed, pz.zone_type, pz.index,
+                        )
+                    # Filter words inside image overlay regions (merged box zones)
+                    if pz.image_overlays:
+                        before_ov = len(zone_words)
+                        zone_words = [
+                            w for w in zone_words
+                            if not any(
+                                ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
+                                and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
+                                for ov in pz.image_overlays
+                            )
+                        ]
+                        ov_removed = before_ov - len(zone_words)
+                        if ov_removed:
+                            logger.info(
+                                "build-grid: filtered %d words inside image overlays from zone %d",
+                                ov_removed, pz.index,
+                            )
+                    zone_words, bs_removed = _filter_border_strip_words(zone_words)
+                    if bs_removed:
+                        border_prefiltered = True
+                        logger.info(
+                            "build-grid: pre-filtered %d border-strip words from zone %d",
+                            bs_removed, pz.index,
+                        )
+                    grid = _build_zone_grid(
+                        zone_words, pz.x, pz.y, pz.width, pz.height,
+                        pz.index, img_w, img_h,
+                        skip_first_row_header=bool(pz.image_overlays),
+                    )
+                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
+
+                # Second pass: merge column boundaries from all content zones
+                # Exclude zones from vertical splits — they have independent columns.
+                content_zones = [
+                    zg for zg in zone_grids
+                    if zg["pz"].zone_type == "content"
+                    and zg["pz"].vsplit_group is None
+                ]
+                if len(content_zones) > 1:
+                    # Collect column split points (x_min of non-first columns)
+                    all_split_xs: List[float] = []
+                    for zg in content_zones:
+                        raw_cols = zg["grid"].get("_raw_columns", [])
+                        for col in raw_cols[1:]:
+                            all_split_xs.append(col["x_min"])
+
+                    if all_split_xs:
+                        all_split_xs.sort()
+                        merge_distance = max(25, int(content_w * 0.03))
+                        merged_xs = [all_split_xs[0]]
+                        for x in all_split_xs[1:]:
+                            if x - merged_xs[-1] < merge_distance:
+                                merged_xs[-1] = (merged_xs[-1] + x) / 2
+                            else:
+                                merged_xs.append(x)
+
+                        total_cols = len(merged_xs) + 1
+                        max_zone_cols = max(
+                            len(zg["grid"].get("_raw_columns", []))
+                            for zg in content_zones
+                        )
+
+                        # Apply union whenever it has at least as many
+                        # columns as the best single zone.  Even with the
+                        # same count the union boundaries are better because
+                        # they incorporate evidence from all zones.
+                        if total_cols >= max_zone_cols:
+                            cx_min = min(w["left"] for w in all_words)
+                            cx_max = max(
+                                w["left"] + w["width"] for w in all_words
+                            )
+                            merged_columns: List[Dict[str, Any]] = []
+                            prev_x = cx_min
+                            for i, sx in enumerate(merged_xs):
+                                merged_columns.append({
+                                    "index": i,
+                                    "type": f"column_{i + 1}",
+                                    "x_min": prev_x,
+                                    "x_max": sx,
+                                })
+                                prev_x = sx
+                            merged_columns.append({
+                                "index": len(merged_xs),
+                                "type": f"column_{len(merged_xs) + 1}",
+                                "x_min": prev_x,
+                                "x_max": cx_max,
+                            })
+
+                            # Re-build ALL content zones with merged columns
+                            for zg in zone_grids:
+                                pz = zg["pz"]
+                                if pz.zone_type == "content":
+                                    grid = _build_zone_grid(
+                                        zg["words"], pz.x, pz.y,
+                                        pz.width, pz.height,
+                                        pz.index, img_w, img_h,
+                                        global_columns=merged_columns,
+                                        skip_first_row_header=bool(pz.image_overlays),
+                                    )
+                                    zg["grid"] = grid
+                            logger.info(
+                                "build-grid session %s: union of %d content "
+                                "zones → %d merged columns (max single zone: %d)",
+                                session_id, len(content_zones),
+                                total_cols, max_zone_cols,
+                            )
+
+                for zg in zone_grids:
+                    pz = zg["pz"]
+                    grid = zg["grid"]
+                    # Remove internal _raw_columns before adding to response
+                    grid.pop("_raw_columns", None)
+
+                    zone_entry: Dict[str, Any] = {
+                        "zone_index": pz.index,
+                        "zone_type": pz.zone_type,
+                        "bbox_px": {
+                            "x": pz.x, "y": pz.y,
+                            "w": pz.width, "h": pz.height,
+                        },
+                        "bbox_pct": {
+                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
+                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
+                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
+                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
+                        },
+                        "border": None,
+                        "word_count": len(zg["words"]),
+                        **grid,
+                    }
+
+                    if pz.box:
+                        zone_entry["border"] = {
+                            "thickness": pz.box.border_thickness,
+                            "confidence": pz.box.confidence,
+                        }
+
+                    if pz.image_overlays:
+                        zone_entry["image_overlays"] = pz.image_overlays
+
+                    if pz.layout_hint:
+                        zone_entry["layout_hint"] = pz.layout_hint
+                    if pz.vsplit_group is not None:
+                        zone_entry["vsplit_group"] = pz.vsplit_group
+
+                    zones_data.append(zone_entry)
+
+    # 4. Fallback: no boxes detected → single zone with all words
+    if not zones_data:
+        # Filter short recovered artifacts, ≤2 chars (same as in zone loop above)
+        before = len(all_words)
+        filtered_words = [
+            w for w in all_words
+            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
+        ]
+        removed = before - len(filtered_words)
+        if removed:
+            logger.info(
+                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
+                session_id, removed,
+            )
+        # Pre-filter border-strip words so column detection is not
+        # confused by edge artifacts.  When this removes words, Step 4e
+        # is skipped (it would otherwise re-detect content as a "strip").
+        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
+        if bs_removed:
+            border_prefiltered = True
+            logger.info(
+                "build-grid session %s: pre-filtered %d border-strip words",
+                session_id, bs_removed,
+            )
+        grid = _build_zone_grid(
+            filtered_words, content_x, content_y, content_w, content_h,
+            0, img_w, img_h,
+        )
+        grid.pop("_raw_columns", None)
+        zones_data.append({
+            "zone_index": 0,
+            "zone_type": "content",
+            "bbox_px": {
+                "x": content_x, "y": content_y,
+                "w": content_w, "h": content_h,
+            },
+            "bbox_pct": {
+                "x": round(content_x / img_w * 100, 2) if img_w else 0,
+                "y": round(content_y / img_h * 100, 2) if img_h else 0,
+                "w": round(content_w / img_w * 100, 2) if img_w else 0,
+                "h": round(content_h / img_h * 100, 2) if img_h else 0,
+            },
+            "border": None,
+            "word_count": len(all_words),
+            **grid,
+        })
+
+    # 4b. Remove junk rows: rows where ALL cells contain only short,
+    # low-confidence text (OCR noise, stray marks).  Real vocabulary rows
+    # have at least one word with conf >= 50 or meaningful text length.
+    # Also remove "oversized stub" rows: rows with ≤3 very short words
+    # whose word-boxes are significantly taller than the median (e.g.
+    # large red page numbers like "( 9" that are not real text content).
+    _JUNK_CONF_THRESHOLD = 50
+    _JUNK_MAX_TEXT_LEN = 3
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        if not cells or not rows:
+            continue
+
+        # Compute median word height across the zone for oversized detection
+        all_wb_heights = [
+            wb["height"]
+            for cell in cells
+            for wb in cell.get("word_boxes") or []
+            if wb.get("height", 0) > 0
+        ]
+        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
+
+        junk_row_indices = set()
+        for row in rows:
+            ri = row["index"]
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            if not row_cells:
+                continue
+
+            row_wbs = [
+                wb for cell in row_cells
+                for wb in cell.get("word_boxes") or []
+            ]
+
+            # Rule 1: ALL word_boxes are low-conf AND short text
+            all_junk = True
+            for wb in row_wbs:
+                text = (wb.get("text") or "").strip()
+                conf = wb.get("conf", 0)
+                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
+                    all_junk = False
+                    break
+            if all_junk and row_wbs:
+                junk_row_indices.add(ri)
+                continue
+
+            # Rule 2: oversized stub — ≤3 words, short total text,
+            # and word height > 1.8× median (page numbers, stray marks,
+            # OCR from illustration labels like "SEA &")
+            # Skip if any word looks like a page reference (p.55, S.12).
+            if len(row_wbs) <= 3:
+                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
+                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
+                has_page_ref = any(
+                    re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
+                    for wb in row_wbs
+                )
+                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
+                    junk_row_indices.add(ri)
+                    continue
+
+            # Rule 3: scattered debris — rows with only tiny fragments
+            # (e.g. OCR artifacts from illustrations/graphics).
+            # If the row has no word longer than 2 chars, it's noise.
+            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
+            if longest <= 2:
+                junk_row_indices.add(ri)
+                continue
+
+        if junk_row_indices:
+            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
+            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
+            logger.info(
+                "build-grid: removed %d junk rows from zone %d: %s",
+                len(junk_row_indices), z["zone_index"],
+                sorted(junk_row_indices),
+            )
+
+    # 4b2. Remove individual cells that consist of a single very-short,
+    # low-confidence word (OCR artifacts like "as", "b" from stray marks).
+    # These survive row-level junk removal when the row has valid cells
+    # in other columns.
+    _ARTIFACT_MAX_LEN = 2
+    _ARTIFACT_CONF_THRESHOLD = 65
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        artifact_ids = set()
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            if len(wbs) != 1:
+                continue
+            wb = wbs[0]
+            text = (wb.get("text") or "").strip()
+            conf = wb.get("conf", 100)
+            if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
+                artifact_ids.add(cell.get("cell_id"))
+        if artifact_ids:
+            z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
+            logger.info(
+                "build-grid: removed %d artifact cells from zone %d: %s",
+                len(artifact_ids), z.get("zone_index", 0),
+                [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
+            )
+
+    # 4c. Remove oversized word_boxes from individual cells.
+    # OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
+    # have word heights 3-5x the median.  Remove them per-word so they don't
+    # pollute cells that also contain valid text in other columns.
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        all_wh = [
+            wb["height"]
+            for cell in cells
+            for wb in cell.get("word_boxes") or []
+            if wb.get("height", 0) > 0
+        ]
+        if not all_wh:
+            continue
+        med_h = sorted(all_wh)[len(all_wh) // 2]
+        oversized_threshold = med_h * 3
+        removed_oversized = 0
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
+            if len(filtered) < len(wbs):
+                removed_oversized += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = _words_to_reading_order_text(filtered)
+        if removed_oversized:
+            # Remove cells that became empty after oversized removal
+            z["cells"] = [c for c in cells if c.get("word_boxes")]
+            logger.info(
+                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
+                removed_oversized, oversized_threshold, z.get("zone_index", 0),
+            )
+
+    # 4d. Remove pipe-character word_boxes (column divider artifacts).
+    # OCR reads physical vertical divider lines as "|" or "||" characters.
+    # These sit at consistent x positions near column boundaries and pollute
+    # cell text.  Remove them from word_boxes and rebuild cell text.
+    # NOTE: Zones from a vertical split already had pipes removed in step 3b.
+    _PIPE_RE = re.compile(r"^\|+$")
+    for z in zones_data:
+        if z.get("vsplit_group") is not None:
+            continue  # pipes already removed before split
+        removed_pipes = 0
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed_pipes += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = _words_to_reading_order_text(filtered)
+        # Remove cells that became empty after pipe removal
+        if removed_pipes:
+            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
+            logger.info(
+                "build-grid: removed %d pipe-divider word_boxes from zone %d",
+                removed_pipes, z.get("zone_index", 0),
+            )
+
+    # Strip pipe chars ONLY from word_boxes/cells where the pipe is an
+    # OCR column-divider artifact.  Preserve pipes that are embedded in
+    # words as syllable separators (e.g. "zu|trau|en") — these are
+    # intentional and used in dictionary Ground Truth.
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            for wb in cell.get("word_boxes", []):
+                wbt = wb.get("text", "")
+                # Only strip if the ENTIRE word_box is just pipe(s)
+                # (handled by _PIPE_RE above) — leave embedded pipes alone
+            text = cell.get("text", "")
+            if "|" in text:
+                # Only strip leading/trailing pipes (OCR artifacts at cell edges)
+                cleaned = text.strip("|").strip()
+                if cleaned != text.strip():
+                    cell["text"] = cleaned
+
+    # 4d2. Normalize narrow connector columns.
+    # In synonym dictionaries a narrow column repeats the same word
+    # (e.g. "oder") in every row.  OCR sometimes appends noise chars
+    # (e.g. "oderb" instead of "oder").  If ≥60% of cells in a column
+    # share the same short text, normalize near-match outliers.
+    for z in zones_data:
+        cols = z.get("columns", [])
+        cells = z.get("cells", [])
+        if not cols or not cells:
+            continue
+        for col in cols:
+            ci = col.get("index")
+            col_cells = [c for c in cells if c.get("col_index") == ci]
+            if len(col_cells) < 3:
+                continue
+            # Count text occurrences
+            text_counts: Dict[str, int] = {}
+            for c in col_cells:
+                t = (c.get("text") or "").strip()
+                if t:
+                    text_counts[t] = text_counts.get(t, 0) + 1
+            if not text_counts:
+                continue
+            dominant_text = max(text_counts, key=text_counts.get)  # type: ignore[arg-type]
+            dominant_count = text_counts[dominant_text]
+            # Only normalize if dominant word is short and appears in ≥60%
+            if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
+                continue
+            # Fix outliers that start with the dominant text
+            fixed = 0
+            for c in col_cells:
+                t = (c.get("text") or "").strip()
+                if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
+                    c["text"] = dominant_text
+                    # Also fix word_boxes
+                    wbs = c.get("word_boxes") or []
+                    if len(wbs) == 1:
+                        wbs[0]["text"] = dominant_text
+                    fixed += 1
+            if fixed:
+                logger.info(
+                    "build-grid: normalized %d outlier cells in connector column %d "
+                    "(dominant='%s') zone %d",
+                    fixed, ci, dominant_text, z.get("zone_index", 0),
+                )
+
+    # 4e. Detect and remove page-border decoration strips.
+    # Skipped when the pre-filter already removed border words BEFORE
+    # column detection — re-running would incorrectly detect the
+    # leftmost content column as a "strip".
+    border_strip_removed = 0
+    if border_prefiltered:
+        logger.info("Step 4e: skipped (border pre-filter already applied)")
+    else:
+        # Some textbooks have decorative alphabet strips along the page
+        # edge.  OCR picks up scattered letters from these as artifacts.
+        # Detection: find the first significant x-gap (>30 px) from each
+        # page edge between a small cluster (<20 %) and the main content.
+        for z in zones_data:
+            cells = z.get("cells", [])
+            if not cells:
+                continue
+            all_wbs_with_cell: List[tuple] = []  # (left, wb, cell)
+            for cell in cells:
+                for wb in cell.get("word_boxes") or []:
+                    all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
+            if len(all_wbs_with_cell) < 10:
+                continue
+            all_wbs_with_cell.sort(key=lambda t: t[0])
+            total = len(all_wbs_with_cell)
+
+            # -- Left-edge scan --
+            left_strip_count = 0
+            left_gap = 0
+            running_right = 0
+            for gi in range(total - 1):
+                running_right = max(
+                    running_right,
+                    all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
+                )
+                gap = all_wbs_with_cell[gi + 1][0] - running_right
+                if gap > 30:
+                    left_strip_count = gi + 1
+                    left_gap = gap
+                    break
+
+            # -- Right-edge scan --
+            right_strip_count = 0
+            right_gap = 0
+            running_left = all_wbs_with_cell[-1][0]
+            for gi in range(total - 1, 0, -1):
+                running_left = min(running_left, all_wbs_with_cell[gi][0])
+                prev_right = (
+                    all_wbs_with_cell[gi - 1][0]
+                    + all_wbs_with_cell[gi - 1][1].get("width", 0)
+                )
+                gap = running_left - prev_right
+                if gap > 30:
+                    right_strip_count = total - gi
+                    right_gap = gap
+                    break
+
+            strip_wbs: set = set()
+            strip_side = ""
+            strip_gap = 0
+            strip_count = 0
+            if left_strip_count > 0 and left_strip_count / total < 0.20:
+                strip_side = "left"
+                strip_count = left_strip_count
+                strip_gap = left_gap
+                strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
+            elif right_strip_count > 0 and right_strip_count / total < 0.20:
+                strip_side = "right"
+                strip_count = right_strip_count
+                strip_gap = right_gap
+                strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
+
+            if not strip_wbs:
+                continue
+            for cell in cells:
+                wbs = cell.get("word_boxes") or []
+                filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
+                if len(filtered) < len(wbs):
+                    border_strip_removed += len(wbs) - len(filtered)
+                    cell["word_boxes"] = filtered
+                    cell["text"] = _words_to_reading_order_text(filtered)
+            z["cells"] = [c for c in cells
+                          if (c.get("word_boxes") or c.get("text", "").strip())]
+            logger.info(
+                "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
+                "(gap=%dpx, strip=%d/%d wbs)",
+                border_strip_removed, strip_side, z.get("zone_index", 0),
+                strip_gap, strip_count, total,
+            )
+
+    # 4f. Remove decorative edge columns (alphabet sidebar safety net).
+    # Dictionary pages have A-Z letter sidebars that OCR reads as single-
+    # character word_boxes.  These form narrow columns with very short text.
+    # Detection: edge column where almost ALL cells are single characters.
+    for z in zones_data:
+        columns = z.get("columns", [])
+        cells = z.get("cells", [])
+        if len(columns) < 3 or not cells:
+            continue
+        # Group cells by col_type (skip spanning_header)
+        col_cells: Dict[str, List[Dict]] = {}
+        for cell in cells:
+            ct = cell.get("col_type", "")
+            if ct.startswith("column_"):
+                col_cells.setdefault(ct, []).append(cell)
+        col_types_ordered = sorted(col_cells.keys())
+        if len(col_types_ordered) < 3:
+            continue
+        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
+            edge_cells_list = col_cells.get(edge_ct, [])
+            if len(edge_cells_list) < 3:
+                continue
+            # Key criterion: average text length and single-char ratio.
+            # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
+            # are single characters.
+            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
+            avg_len = sum(len(t) for t in texts) / len(texts)
+            single_char = sum(1 for t in texts if len(t) <= 1)
+            single_ratio = single_char / len(texts)
+            if avg_len > 1.5:
+                continue  # real content has longer text
+            if single_ratio < 0.7:
+                continue  # not dominated by single chars
+            # Remove this edge column
+            removed_count = len(edge_cells_list)
+            edge_ids = {id(c) for c in edge_cells_list}
+            z["cells"] = [c for c in cells if id(c) not in edge_ids]
+            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
+            logger.info(
+                "Step 4f: removed decorative edge column '%s' from zone %d "
+                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
+                edge_ct, z.get("zone_index", 0), removed_count,
+                avg_len, single_ratio * 100,
+            )
+            break  # only remove one edge per zone
+
+    # 5. Color annotation on final word_boxes in cells
+    if img_bgr is not None:
+        all_wb: List[Dict] = []
+        for z in zones_data:
+            for cell in z.get("cells", []):
+                all_wb.extend(cell.get("word_boxes", []))
+        detect_word_colors(img_bgr, all_wb)
+
+    # 5a. Heading detection by color + height (after color is available)
+    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
+    if heading_count:
+        logger.info("Detected %d heading rows by color+height", heading_count)
+
+    # 5b. Fix unmatched parentheses in cell text
+    # OCR often misses opening "(" while detecting closing ")".
+    # If a cell's text has ")" without a matching "(", prepend "(".
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text", "")
+            if ")" in text and "(" not in text:
+                cell["text"] = "(" + text
+
+    # 5c. IPA phonetic correction — replace garbled OCR phonetics with
+    # correct IPA from the dictionary (same as in the OCR pipeline).
+    # Only applies to vocabulary tables (≥3 columns: EN | article | DE).
+    # Single/two-column layouts are continuous text, not vocab tables.
+    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
+    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
+    en_col_type = None
+    ipa_target_cols: set = set()
+    all_content_cols: set = set()
+    skip_ipa = (ipa_mode == "none")
+
+    # When ipa_mode=none, strip ALL square brackets from ALL content columns
+    if skip_ipa:
+        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
+            text = cell.get("text", "")
+            if "[" in text:
+                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
+                if stripped != text:
+                    cell["text"] = stripped.strip()
+                    cell["_ipa_corrected"] = True
+
+    if not skip_ipa and total_cols >= 3:
+        # Detect English headword column via IPA signals (brackets or garbled).
+        col_ipa_count: Dict[str, int] = {}
+        all_content_cols: set = set()
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
+            txt = cell.get("text", "") or ""
+            if txt.strip():
+                all_content_cols.add(ct)
+            if '[' in txt or _text_has_garbled_ipa(txt):
+                col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
+        if col_ipa_count:
+            en_col_type = max(col_ipa_count, key=col_ipa_count.get)
+        elif ipa_mode == "all":
+            # Force-all mode without auto-detection: pick column with most cells
+            col_cell_count: Dict[str, int] = {}
+            for cell in all_cells:
+                ct = cell.get("col_type", "")
+                if ct.startswith("column_") and (cell.get("text") or "").strip():
+                    col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
+            if col_cell_count:
+                en_col_type = max(col_cell_count, key=col_cell_count.get)
+
+        # Decide which columns to process based on ipa_mode:
+        # auto/en: only the detected EN headword column (English IPA)
+        # de: all content columns EXCEPT the EN column (German IPA)
+        # all: EN column gets English IPA, other columns get German IPA
+        en_ipa_target_cols: set = set()
+        de_ipa_target_cols: set = set()
+        if ipa_mode in ("auto", "en"):
+            if en_col_type:
+                en_ipa_target_cols.add(en_col_type)
+        elif ipa_mode == "de":
+            de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+        elif ipa_mode == "all":
+            if en_col_type:
+                en_ipa_target_cols.add(en_col_type)
+            de_ipa_target_cols = all_content_cols - en_ipa_target_cols
+
+        # --- Strip IPA from columns NOT in the target set ---
+        # When user selects "nur DE", English IPA from the OCR scan must
+        # be removed.  When "none", all IPA is removed.
+        # In vocab columns, square brackets [...] are always IPA (both
+        # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]).
+        _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
+        strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
+        if strip_en_ipa or ipa_mode == "none":
+            strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
+            for cell in all_cells:
+                ct = cell.get("col_type", "")
+                if ct not in strip_cols:
+                    continue
+                text = cell.get("text", "")
+                if "[" in text:
+                    stripped = _SQUARE_BRACKET_RE.sub("", text)
+                    if stripped != text:
+                        cell["text"] = stripped.strip()
+                        cell["_ipa_corrected"] = True
+
+        # --- English IPA (Britfone + eng_to_ipa) ---
+        if en_ipa_target_cols:
+            for cell in all_cells:
+                ct = cell.get("col_type")
+                if ct in en_ipa_target_cols:
+                    cell["_orig_col_type"] = ct
+                    cell["col_type"] = "column_en"
+        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
+        fix_cell_phonetics(all_cells, pronunciation="british")
+        for cell in all_cells:
+            orig = cell.pop("_orig_col_type", None)
+            if orig:
+                cell["col_type"] = orig
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True
+
+        # --- German IPA (wiki-pronunciation-dict + epitran) ---
+        if de_ipa_target_cols:
+            from cv_ipa_german import insert_german_ipa
+            insert_german_ipa(all_cells, de_ipa_target_cols)
+
+        ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
+
+        # Mark cells whose text was changed by IPA correction so that
+        # later steps (5i) don't overwrite the corrected text when
+        # reconstructing from word_boxes.  (Already done inline above
+        # for English; insert_german_ipa sets _ipa_corrected too.)
+        for cell in all_cells:
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True
+
+        # 5d. Fix IPA continuation cells — cells where the printed
+        # phonetic transcription wraps to a line below the headword.
+        # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
+        # Replace garbled text with proper IPA looked up from the
+        # headword in the previous row's same column.
+        # Note: We check ALL columns, not just en_col_type, because
+        # the EN headword column may not be the longest-average column.
+        _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+        ipa_cont_fixed = 0
+        for z in ([] if skip_ipa else zones_data):
+            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
+            z_cells = z.get("cells", [])
+            for idx, row in enumerate(rows_sorted):
+                if idx == 0:
+                    continue
+                ri = row["index"]
+                row_cells = [c for c in z_cells if c.get("row_index") == ri]
+                for cell in row_cells:
+                    ct = cell.get("col_type", "")
+                    if not ct.startswith("column_"):
+                        continue
+                    cell_text = (cell.get("text") or "").strip()
+                    if not cell_text:
+                        # Step 5c may have emptied garbled IPA cells like
+                        # "[n, nn]" — recover text from word_boxes.
+                        wb_texts = [w.get("text", "")
+                                    for w in cell.get("word_boxes", [])]
+                        cell_text = " ".join(wb_texts).strip()
+                        if not cell_text:
+                            continue
+
+                    is_bracketed = (
+                        cell_text.startswith('[') and cell_text.endswith(']')
+                    )
+
+                    if is_bracketed:
+                        # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
+                        # Text like "employee [im'ploi:]" is NOT fully
+                        # bracketed and won't match here.
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Already has proper IPA brackets → skip
+                        if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                            continue
+                    else:
+                        # Unbracketed continuation: "ska:f – ska:vz",
+                        # "'sekandarr sku:l".  Only treat as IPA
+                        # continuation if this is the ONLY content cell
+                        # in the row (single-cell row) and the text is
+                        # garbled IPA without real IPA Unicode symbols.
+                        content_cells_in_row = [
+                            c for c in row_cells
+                            if c.get("col_type", "").startswith("column_")
+                            and c.get("col_type") != "column_1"
+                        ]
+                        if len(content_cells_in_row) != 1:
+                            continue
+                        if not _text_has_garbled_ipa(cell_text):
+                            continue
+                        # Has real IPA symbols → already fixed or valid
+                        if any(c in _REAL_IPA_CHARS for c in cell_text):
+                            continue
+
+                    # Find headword in previous row, same column
+                    prev_ri = rows_sorted[idx - 1]["index"]
+                    prev_same_col = [
+                        c for c in z_cells
+                        if c.get("row_index") == prev_ri
+                        and c.get("col_type") == ct
+                    ]
+                    if not prev_same_col:
+                        continue
+                    prev_text = prev_same_col[0].get("text", "")
+                    fixed = fix_ipa_continuation_cell(
+                        cell_text, prev_text, pronunciation="british",
+                    )
+                    if fixed != cell_text:
+                        cell["text"] = fixed
+                        ipa_cont_fixed += 1
+                        logger.info(
+                            "IPA continuation R%d %s: '%s' → '%s'",
+                            ri, ct, cell_text, fixed,
+                        )
+        if ipa_cont_fixed:
+            logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
+
+    # 5e. Heading detection by single-cell rows — black headings like
+    # "Theme" that have normal color and height but are the ONLY cell
+    # in their row (excluding page_ref column_1).  Must run AFTER 5d
+    # so IPA continuation cells are already processed.
+    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
+    if single_heading_count:
+        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
+
+    # 5f. Strip IPA from headings — heading detection (5e) runs AFTER
+    # IPA correction (5c), so heading cells may already carry appended
+    # dictionary IPA (e.g. "Theme [θˈiːm]" → "Theme").  Headings should
+    # show the original text only.
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type") != "heading":
+                continue
+            text = cell.get("text", "")
+            # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
+            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
+            if stripped and stripped != text:
+                cell["text"] = stripped
+
+    # 5g. Extract page_ref cells and footer rows from content zones.
+    # Page references (column_1 cells like "p.70") sit in rows that
+    # also contain vocabulary — extract them as zone metadata without
+    # removing the row.  Footer lines (e.g. "two hundred and twelve"
+    # = page number at bottom) are standalone rows that should be
+    # removed from the table entirely.
+    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+    # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
+    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
+    _NUMBER_WORDS = {
+        "one", "two", "three", "four", "five", "six", "seven",
+        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
+        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
+        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
+        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
+        "einhundert", "zweihundert", "dreihundert", "vierhundert",
+        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
+    }
+    for z in zones_data:
+        if z.get("zone_type") != "content":
+            continue
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        if not rows:
+            continue
+
+        # Extract column_1 cells that look like page references
+        page_refs = []
+        page_ref_cell_ids = set()
+        for cell in cells:
+            if cell.get("col_type") != "column_1":
+                continue
+            text = (cell.get("text") or "").strip()
+            if not text:
+                continue
+            if not _PAGE_REF_RE.match(text):
+                continue
+            page_refs.append({
+                "row_index": cell.get("row_index"),
+                "text": text,
+                "bbox_pct": cell.get("bbox_pct", {}),
+            })
+            page_ref_cell_ids.add(cell.get("cell_id"))
+
+        # Keep page_ref cells in the table as a visible column.
+        # Previously these were removed, but users want to see them.
+        # The metadata extraction above still populates zone["page_refs"]
+        # for the frontend header display.
+
+        # Detect footer: last non-header row if it has only 1 cell
+        # with short, non-content text (page numbers like "233" or
+        # "two hundred and twelve").  Comma-separated lists and long
+        # text are content continuations, not page numbers.
+        footer_rows = []
+        non_header_rows = [r for r in rows if not r.get("is_header")]
+        if non_header_rows:
+            last_row = non_header_rows[-1]
+            last_ri = last_row["index"]
+            last_cells = [c for c in z["cells"]
+                          if c.get("row_index") == last_ri]
+            if len(last_cells) == 1:
+                text = (last_cells[0].get("text") or "").strip()
+                # Not IPA (no real IPA symbols) and not a heading
+                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
+                # Comma-separated text is a content continuation, not a footer
+                has_commas = ',' in text
+                # Written-out page numbers like "two hundred and nine"
+                text_words = set(text.lower().split())
+                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
+                # Short text or written-out number
+                is_page_number = len(text) <= 20 or is_written_number
+                if (text and not has_real_ipa and not has_commas
+                        and is_page_number
+                        and last_cells[0].get("col_type") != "heading"):
+                    footer_rows.append({
+                        "row_index": last_ri,
+                        "text": text,
+                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
+                    })
+
+        # Classify footer rows: page numbers are removed from the grid
+        # and promoted to page_number metadata; other footers stay as rows.
+        page_number_footers = []
+        other_footers = []
+        for fr in footer_rows:
+            ft = fr["text"].strip()
+            # Digits only (whitespace and dots allowed), e.g. "233"
+            digits = "".join(c for c in ft if c.isdigit())
+            if digits and re.match(r'^[\d\s.]+$', ft):
+                page_number_footers.append(fr)
+            # Written-out numbers
+            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
+                page_number_footers.append(fr)
+            else:
+                other_footers.append(fr)
+
+        # Remove page-number footer rows from grid entirely
+        if page_number_footers:
+            pn_ris = {fr["row_index"] for fr in page_number_footers}
+            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
+            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
+            # Set page_number metadata (use first one)
+            pn_text = page_number_footers[0]["text"].strip()
+            pn_digits = "".join(c for c in pn_text if c.isdigit())
+            if not page_number_info:
+                page_number_info = {
+                    "text": pn_text,
+                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
+                }
+                if pn_digits:
+                    page_number_info["number"] = int(pn_digits)
+
+        # Mark remaining footer rows (non-page-number content)
+        if other_footers:
+            footer_ris = {fr["row_index"] for fr in other_footers}
+            for r in z["rows"]:
+                if r["index"] in footer_ris:
+                    r["is_footer"] = True
+            for c in z["cells"]:
+                if c.get("row_index") in footer_ris:
+                    c["col_type"] = "footer"
+
+        if page_refs or footer_rows:
+            logger.info(
+                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
+                len(page_refs), len(footer_rows), len(page_number_footers),
+                z.get("zone_index", 0),
+            )
+
+        # Store as zone-level metadata
+        if page_refs:
+            z["page_refs"] = page_refs
+        if other_footers:
+            z["footer"] = other_footers
+
+    # 5h. Convert slash-delimited IPA to bracket notation.
+    # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
+    # Detect the pattern <headword> /ocr_ipa/ and replace with [dict_ipa]
+    # using the IPA dictionary when available, falling back to the OCR text.
+    # The regex requires a headword (letters plus optional ¹ ² ³, then optional
+    # whitespace) before the opening slash to limit false positives ("sb/sth").
+    _SLASH_IPA_RE = re.compile(
+        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
+        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
+    )
+    # Standalone slash IPA at start of text (headword on previous line)
+    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
+    # IPA between slashes never contains spaces, parentheses, or commas.
+    # Reject matches that look like grammar: "sb/sth up a) jdn/"
+    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
+    slash_ipa_fixed = 0
+    for z in ([] if skip_ipa else zones_data):
+        for cell in z.get("cells", []):
+            # Only process English headword column — avoid converting
+            # German text like "der/die/das" to IPA.
+            if en_col_type and cell.get("col_type") != en_col_type:
+                continue
+            text = cell.get("text", "")
+            if "/" not in text:
+                continue
+
+            def _replace_slash_ipa(m: re.Match) -> str:
+                """Rewrite one ``headword /ipa/`` match as ``headword [ipa]``."""
+                nonlocal slash_ipa_fixed
+                unchanged = m.group(0)
+                word = m.group(1)
+                ocr_inner = m.group(2).strip("/").strip()  # drop enclosing slashes
+                # Spaces/parens/commas between the slashes mean grammar, not IPA
+                if _SLASH_IPA_REJECT_RE.search(ocr_inner):
+                    return unchanged
+                # Look the headword up without superscripts/digits; an empty
+                # key skips the lookup entirely
+                lookup_key = re.sub(r'[²³¹\d]', '', word).strip()
+                replacement = _lookup_ipa(lookup_key, "british") if lookup_key else None
+                # No dictionary hit: keep the OCR text minus leading apostrophes
+                if not replacement:
+                    replacement = ocr_inner.lstrip("'").strip()
+                if not replacement:
+                    return unchanged
+                slash_ipa_fixed += 1
+                return f"{word} [{replacement}]"
+
+            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
+
+            # Second pass: convert remaining /ipa/ after [ipa] from first pass.
+            # Pattern: [ipa] /ipa2/ → [ipa] [ipa2]  (second pronunciation variant)
+            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
+            def _replace_trailing_slash(m: re.Match) -> str:
+                """Rewrite a /ipa2/ variant that follows ']' as ' [ipa2]'."""
+                nonlocal slash_ipa_fixed
+                variant = m.group(1).strip("/").strip().lstrip("'").strip()
+                # Leave empty or grammar-like slash content exactly as matched
+                if not variant or _SLASH_IPA_REJECT_RE.search(variant):
+                    return m.group(0)
+                slash_ipa_fixed += 1
+                return f" [{variant}]"
+            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
+
+            # Handle standalone /ipa/ at start (no headword in this cell)
+            if new_text == text:
+                m = _STANDALONE_SLASH_IPA_RE.match(text)
+                if m:
+                    inner = m.group(1).strip()
+                    if not _SLASH_IPA_REJECT_RE.search(inner):
+                        inner = inner.lstrip("'").strip()
+                        if inner:
+                            new_text = "[" + inner + "]" + text[m.end():]
+                            slash_ipa_fixed += 1
+
+            if new_text != text:
+                cell["text"] = new_text
+
+    if slash_ipa_fixed:
+        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
+
+    # 5i. Remove blue bullet/artifact word_boxes.
+    # Dictionary pages have small blue square bullets (■) before entries.
+    # OCR reads these as text artifacts (©, e, *, or even plausible words
+    # like "fighily" overlapping the real word "tightly").
+    # Detection rules:
+    #   a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black)
+    #   b) Overlapping word_boxes: >20% x-overlap → merge alphabetic fragments, else >40% → remove lower confidence
+    #   c) Duplicate text: consecutive blue wbs with identical text, gap < 6px
+    bullet_removed = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            if len(wbs) < 2:
+                continue
+            to_remove: set = set()
+
+            # Rule (a): tiny coloured symbols (bullets, graphic fragments)
+            for i, wb in enumerate(wbs):
+                cn = wb.get("color_name", "black")
+                if (cn != "black"
+                        and wb.get("width", 0) * wb.get("height", 0) < 200
+                        and wb.get("conf", 100) < 85):
+                    to_remove.add(i)
+
+            # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
+            # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
+            # Remove word boxes that contain NO letters or digits.
+            for i, wb in enumerate(wbs):
+                t = (wb.get("text") or "").strip()
+                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
+                    to_remove.add(i)
+
+            # Rule (b) + (c): overlap and duplicate detection
+            # Sort by x for pairwise comparison
+            _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
+            to_merge: List[Tuple[int, int]] = []  # pairs (i1, i2) to merge
+            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
+            for p in range(len(indexed) - 1):
+                i1, w1 = indexed[p]
+                i2, w2 = indexed[p + 1]
+                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
+                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
+                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
+                min_w = min(w1.get("width", 1), w2.get("width", 1))
+                gap = x2s - x1e
+                overlap_pct = overlap / min_w if min_w > 0 else 0
+
+                # (b) Significant x-overlap
+                if overlap_pct > 0.20:
+                    t1 = (w1.get("text") or "").strip()
+                    t2 = (w2.get("text") or "").strip()
+
+                    # Syllable-split words: both are alphabetic text with
+                    # moderate overlap (20-75%).  Merge instead of removing.
+                    # OCR splits words at syllable marks, producing overlapping
+                    # boxes like "zu" + "tiefst" → "zutiefst".
+                    if (overlap_pct <= 0.75
+                            and _ALPHA_WORD_RE.match(t1)
+                            and _ALPHA_WORD_RE.match(t2)):
+                        to_merge.append((i1, i2))
+                        continue
+
+                    # High overlap (>75%) with different alphabetic text:
+                    # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
+                    # causing it to heavily overlap with the next fragment ("brech").
+                    # Merge instead of removing when one is a short prefix (≤4 chars)
+                    # and the texts are different.
+                    if (overlap_pct > 0.75
+                            and _ALPHA_WORD_RE.match(t1)
+                            and _ALPHA_WORD_RE.match(t2)
+                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
+                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
+                        to_merge.append((i1, i2))
+                        continue
+
+                    if overlap_pct <= 0.40:
+                        continue  # too little overlap and not alphabetic merge
+
+                    c1 = w1.get("conf", 50)
+                    c2 = w2.get("conf", 50)
+
+                    # For very high overlap (>90%) with different text,
+                    # prefer the word that exists in the IPA dictionary
+                    # over confidence (OCR can give artifacts high conf).
+                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
+                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
+                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
+                        if in_dict_1 and not in_dict_2:
+                            to_remove.add(i2)
+                            continue
+                        elif in_dict_2 and not in_dict_1:
+                            to_remove.add(i1)
+                            continue
+
+                    if c1 < c2:
+                        to_remove.add(i1)
+                    elif c2 < c1:
+                        to_remove.add(i2)
+                    else:
+                        # Same confidence: remove the taller one (bullet slivers)
+                        if w1.get("height", 0) > w2.get("height", 0):
+                            to_remove.add(i1)
+                        else:
+                            to_remove.add(i2)
+
+                # (c) Duplicate text: consecutive blue with same text, gap < 6px
+                elif (gap < 6
+                      and w1.get("color_name") == "blue"
+                      and w2.get("color_name") == "blue"
+                      and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
+                    # Remove the one with lower confidence; if equal, first one
+                    c1 = w1.get("conf", 50)
+                    c2 = w2.get("conf", 50)
+                    to_remove.add(i1 if c1 <= c2 else i2)
+
+            # Execute merges first (syllable-split words).
+            # Use merge_parent to support chain merging: if "zer" absorbed
+            # "brech" and then "brech"+"lich" is a merge pair, redirect to
+            # merge "lich" into "zer" → "zerbrechlich".
+            if to_merge:
+                merge_parent: Dict[int, int] = {}  # absorbed → absorber
+                for mi1, mi2 in to_merge:
+                    # Follow chain: if mi1 was absorbed, find root absorber
+                    actual_mi1 = mi1
+                    while actual_mi1 in merge_parent:
+                        actual_mi1 = merge_parent[actual_mi1]
+                    if actual_mi1 in to_remove or mi2 in to_remove:
+                        continue
+                    if mi2 in merge_parent:
+                        continue  # mi2 already absorbed
+                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
+                    # Concatenate text (no space — they're parts of one word)
+                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
+                    mt2 = (mw2.get("text") or "").strip()
+                    merged_text = mt1 + mt2
+                    # Union bounding box
+                    mx = min(mw1["left"], mw2["left"])
+                    my = min(mw1["top"], mw2["top"])
+                    mr = max(mw1["left"] + mw1["width"],
+                             mw2["left"] + mw2["width"])
+                    mb = max(mw1["top"] + mw1["height"],
+                             mw2["top"] + mw2["height"])
+                    mw1["text"] = merged_text
+                    mw1["left"] = mx
+                    mw1["top"] = my
+                    mw1["width"] = mr - mx
+                    mw1["height"] = mb - my
+                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
+                    to_remove.add(mi2)
+                    merge_parent[mi2] = actual_mi1
+                    bullet_removed -= 1  # net: merge, not removal
+
+            if to_remove:
+                bullet_removed += len(to_remove)
+                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
+                cell["word_boxes"] = filtered
+                # Don't overwrite text that was corrected by Step 5c IPA fix
+                if not cell.get("_ipa_corrected"):
+                    cell["text"] = _words_to_reading_order_text(filtered)
+
+    # Remove cells that became empty after bullet removal
+    if bullet_removed:
+        for z in zones_data:
+            z["cells"] = [c for c in z.get("cells", [])
+                          if (c.get("word_boxes") or c.get("text", "").strip())]
+        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
+
+    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
+    # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
+    # "\\", "PEE", "a=") that survive earlier filters because their rows also
+    # contain real content in other columns.  Remove them here.
+    _COMMON_SHORT_WORDS = {
+        # German
+        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
+        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
+        "die", "der", "das", "dem", "den", "des", "ein", "und",
+        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
+        # English
+        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
+        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
+        "on", "or", "so", "to", "up", "us", "we",
+        "the", "and", "but", "for", "not",
+    }
+    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
+    artifact_cells_removed = 0
+    for z in zones_data:
+        before = len(z.get("cells", []))
+        kept = []
+        for cell in z.get("cells", []):
+            text = (cell.get("text") or "").strip()
+            core = text.rstrip(".,;:!?'\"")
+            is_artifact = False
+            if not core:
+                is_artifact = True
+            elif _PURE_JUNK_RE.match(core):
+                is_artifact = True
+            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
+                # Short non-alphabetic text like "a=", not word beginnings like "Zw"
+                is_artifact = True
+            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
+                is_artifact = True
+            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
+                  and not re.match(r'^[pPsS]\.?\d+$', core)):
+                # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
+                # but NOT page references like "p.43", "p50", "S.12"
+                is_artifact = True
+            if is_artifact:
+                kept.append(None)  # placeholder
+            else:
+                kept.append(cell)
+        z["cells"] = [c for c in kept if c is not None]
+        artifact_cells_removed += before - len(z["cells"])
+    if artifact_cells_removed:
+        # Also remove rows that became completely empty
+        for z in zones_data:
+            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
+            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
+        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
+
+    # 5j. Normalise word_box order to reading order (group by Y, sort by X).
+    # The frontend renders colored cells from word_boxes array order
+    # (GridTable.tsx), so they MUST be in left-to-right reading order.
+    wb_reordered = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            if len(wbs) < 2:
+                continue
+            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
+            sorted_wbs = [w for line in lines for w in line]
+            # Check if order actually changed
+            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
+                cell["word_boxes"] = sorted_wbs
+                wb_reordered += 1
+    if wb_reordered:
+        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
+
+    duration = time.time() - t0
+
+    # 6. Build result
+    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
+    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
+    total_rows = sum(len(z.get("rows", [])) for z in zones_data)
+
+    # Collect color statistics from all word_boxes in cells
+    color_stats: Dict[str, int] = {}
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            for wb in cell.get("word_boxes", []):
+                cn = wb.get("color_name", "black")
+                color_stats[cn] = color_stats.get(cn, 0) + 1
+
+    # Compute layout metrics for faithful grid reconstruction
+    all_content_row_heights: List[float] = []
+    for z in zones_data:
+        for row in z.get("rows", []):
+            if not row.get("is_header", False):
+                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
+                if h > 0:
+                    all_content_row_heights.append(h)
+    avg_row_height = (
+        sum(all_content_row_heights) / len(all_content_row_heights)
+        if all_content_row_heights else 30.0
+    )
+    font_size_suggestion = max(10, int(avg_row_height * 0.6))
+
+    # --- Dictionary detection on assembled grid ---
+    # Build lightweight ColumnGeometry-like structures from zone columns for
+    # dictionary signal scoring.
+    from cv_layout import _score_dictionary_signals
+    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
+    try:
+        from cv_vocab_types import ColumnGeometry
+        for z in zones_data:
+            zone_cells = z.get("cells", [])
+            zone_cols = z.get("columns", [])
+            if len(zone_cols) < 2 or len(zone_cells) < 10:
+                continue
+            # Build pseudo-ColumnGeometry per column
+            pseudo_geoms = []
+            for col in zone_cols:
+                ci = col["index"]
+                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
+                # Flatten word_boxes into word dicts compatible with _score_language
+                col_words = []
+                for cell in col_cells:
+                    for wb in cell.get("word_boxes") or []:
+                        col_words.append({
+                            "text": wb.get("text", ""),
+                            "conf": wb.get("conf", 0),
+                            "top": wb.get("top", 0),
+                            "left": wb.get("left", 0),
+                            "height": wb.get("height", 0),
+                            "width": wb.get("width", 0),
+                        })
+                    # Fallback: use cell text if no word_boxes
+                    if not cell.get("word_boxes") and cell.get("text"):
+                        col_words.append({
+                            "text": cell["text"],
+                            "conf": cell.get("confidence", 50),
+                            "top": cell.get("bbox_px", {}).get("y", 0),
+                            "left": cell.get("bbox_px", {}).get("x", 0),
+                            "height": cell.get("bbox_px", {}).get("h", 20),
+                            "width": cell.get("bbox_px", {}).get("w", 50),
+                        })
+                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
+                pseudo_geoms.append(ColumnGeometry(
+                    index=ci, x=col.get("x_min_px", 0), y=0,
+                    width=max(col_w, 1), height=img_h,
+                    word_count=len(col_words), words=col_words,
+                    width_ratio=col_w / max(img_w, 1),
+                ))
+            if len(pseudo_geoms) >= 2:
+                dd = _score_dictionary_signals(
+                    pseudo_geoms,
+                    document_category=document_category,
+                    margin_strip_detected=margin_strip_detected,
+                )
+                if dd["confidence"] > dict_detection["confidence"]:
+                    dict_detection = dd
+    except Exception as e:
+        logger.warning("Dictionary detection failed: %s", e)
+
+    # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
+    try:
+        from cv_syllable_detect import merge_word_gaps_in_zones
+        merge_word_gaps_in_zones(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Word-gap merge failed: %s", e)
+
+    # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
+    # Strips | from words, validates with pyphen, tries char-deletion for garbled
+    # words like "Ze|plpe|lin" → "Zeppelin".
+    try:
+        from cv_syllable_detect import autocorrect_pipe_artifacts
+        autocorrect_pipe_artifacts(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Pipe autocorrect failed: %s", e)
+
+    # --- Syllable divider insertion for dictionary pages ---
+    # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
+    #   "all" = force on all content words, "en" = English column only,
+    #   "de" = German columns only, "none" = skip entirely.
+    syllable_insertions = 0
+    if syllable_mode != "none" and img_bgr is not None:
+        _syllable_eligible = False
+        if syllable_mode in ("all", "de", "en"):
+            _syllable_eligible = True
+        elif (dict_detection.get("is_dictionary")
+                and dict_detection.get("article_col_index") is not None):
+            # auto: only on dictionary pages with article columns
+            _syllable_eligible = True
+        # For language-specific modes, determine allowed columns
+        _syllable_col_filter: Optional[set] = None  # None = all columns
+        if syllable_mode == "en":
+            _syllable_col_filter = {en_col_type} if en_col_type else set()
+        elif syllable_mode == "de":
+            if en_col_type and total_cols >= 3:
+                _syllable_col_filter = all_content_cols - {en_col_type}
+            # else None → all columns (correct for German-only dicts)
+        if _syllable_eligible:
+            try:
+                from cv_syllable_detect import insert_syllable_dividers
+                force_syllables = (syllable_mode in ("all", "de", "en"))
+                syllable_insertions = insert_syllable_dividers(
+                    zones_data, img_bgr, session_id,
+                    force=force_syllables,
+                    col_filter=_syllable_col_filter,
+                )
+            except Exception as e:
+                logger.warning("Syllable insertion failed: %s", e)
+
+    # When syllable mode is "none", strip any residual | from OCR so
+    # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
+    if syllable_mode == "none":
+        for z in zones_data:
+            for cell in z.get("cells", []):
+                t = cell.get("text", "")
+                if "|" in t:
+                    cell["text"] = t.replace("|", "")
+
+    # --- Split merged words (OCR sometimes glues adjacent words) ---
+    # Uses dictionary lookup to split e.g. "atmyschool" → "at my school"
+    try:
+        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
+        if _SPELL_AVAILABLE:
+            split_count = 0
+            for z in zones_data:
+                for cell in z.get("cells", []):
+                    text = cell.get("text", "")
+                    if not text:
+                        continue
+                    parts = []
+                    changed = False
+                    for token in text.split():
+                        # Try splitting pure-alpha tokens >= 4 chars
+                        # Strip trailing punctuation AND IPA brackets
+                        clean = token
+                        # Remove trailing IPA like [dɪsˈɪʒən] first
+                        bracket_pos = clean.find('[')
+                        suffix_ipa = ""
+                        if bracket_pos > 0:
+                            suffix_ipa = clean[bracket_pos:]
+                            clean = clean[:bracket_pos]
+                        suffix_punct = ""
+                        stripped = clean.rstrip(".,!?;:'\")")
+                        if stripped != clean:
+                            suffix_punct = clean[len(stripped):]
+                            clean = stripped
+                        suffix = suffix_punct + suffix_ipa
+                        # Handle contractions: "solet's" → try "solet" + "'s"
+                        contraction = ""
+                        if "'" in clean and clean.index("'") >= 2:
+                            apos_pos = clean.index("'")
+                            contraction = clean[apos_pos:]
+                            clean = clean[:apos_pos]
+                            suffix = contraction + suffix
+                        if len(clean) >= 4 and clean.isalpha():
+                            split = _try_split_merged_word(clean)
+                            if split:
+                                parts.append(split + suffix)
+                                changed = True
+                                continue
+                        parts.append(token)
+                    if changed:
+                        cell["text"] = " ".join(parts)
+                        split_count += 1
+            if split_count:
+                logger.info("build-grid session %s: split %d merged words", session_id, split_count)
+    except ImportError:
+        pass
+
+    # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
+    # Matches any [bracket] with at least two characters of content directly
+    # after a letter; note this also matches annotations like "[adj]", "[noun]".
+    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text", "")
+            if text and "[" in text:
+                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
+                if fixed != text:
+                    cell["text"] = fixed
+
+    # --- SmartSpellChecker: language-aware OCR correction on all cells ---
+    try:
+        from smart_spell import SmartSpellChecker
+        _ssc = SmartSpellChecker()
+        spell_fix_count = 0
+
+        # Determine language per column:
+        # en_col_type was already detected (column with IPA = English).
+        # All other content columns are assumed German for vocab tables.
+        # Otherwise (<= 2 columns, or no English column detected), use auto-detection.
+        for z in zones_data:
+            zone_cols = z.get("columns", [])
+            for cell in z.get("cells", []):
+                text = cell.get("text", "")
+                if not text or not text.strip():
+                    continue
+                ct = cell.get("col_type", "")
+                if not ct.startswith("column_"):
+                    continue
+
+                # Determine language for this cell
+                if total_cols >= 3 and en_col_type:
+                    lang = "en" if ct == en_col_type else "de"
+                elif total_cols <= 2:
+                    lang = "auto"  # auto-detect for non-vocab layouts
+                else:
+                    lang = "auto"
+
+                result = _ssc.correct_text(text, lang=lang)
+                if result.changed:
+                    cell["text"] = result.corrected
+                    spell_fix_count += 1
+
+        if spell_fix_count:
+            logger.info(
+                "build-grid session %s: SmartSpellChecker fixed %d cells",
+                session_id, spell_fix_count,
+            )
+    except ImportError:
+        logger.debug("SmartSpellChecker not available in build-grid")
+    except Exception as e:
+        logger.warning("SmartSpellChecker error in build-grid: %s", e)
+
+    # --- Debug: log cell counts per column before empty-column removal ---
+    for z in zones_data:
+        if z.get("zone_type") == "content":
+            from collections import Counter as _Counter
+            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
+            _cols = z.get("columns", [])
+            logger.info(
+                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
+                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
+            )
+
+    # --- Remove empty columns (no cells assigned) ---
+    for z in zones_data:
+        cells = z.get("cells", [])
+        used_col_indices = {c.get("col_index") for c in cells}
+        old_cols = z.get("columns", [])
+        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
+        if len(new_cols) < len(old_cols):
+            # Re-index columns and cells
+            old_to_new = {}
+            for new_i, col in enumerate(new_cols):
+                old_i = col.get("col_index", col.get("index", new_i))
+                old_to_new[old_i] = new_i
+                col["col_index"] = new_i
+                col["index"] = new_i
+                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
+            for cell in cells:
+                old_ci = cell.get("col_index", 0)
+                cell["col_index"] = old_to_new.get(old_ci, old_ci)
+                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
+            z["columns"] = new_cols
+
+    # Clean up internal flags before returning
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            cell.pop("_ipa_corrected", None)
+
+    result = {
+        "session_id": session_id,
+        "image_width": img_w,
+        "image_height": img_h,
+        "zones": zones_data,
+        "boxes_detected": boxes_detected,
+        "summary": {
+            "total_zones": len(zones_data),
+            "total_columns": total_columns,
+            "total_rows": total_rows,
+            "total_cells": total_cells,
+            "total_words": len(all_words),
+            "recovered_colored": recovered_count,
+            "color_stats": color_stats,
+        },
+        "formatting": {
+            "bold_columns": [],
+            "header_rows": [],
+        },
+        "layout_metrics": {
+            "page_width_px": img_w,
+            "page_height_px": img_h,
+            "avg_row_height_px": round(avg_row_height, 1),
+            "font_size_suggestion_px": font_size_suggestion,
+        },
+        "dictionary_detection": {
+            "is_dictionary": dict_detection.get("is_dictionary", False),
+            "confidence": dict_detection.get("confidence", 0.0),
+            "signals": dict_detection.get("signals", {}),
+            "article_col_index": dict_detection.get("article_col_index"),
+            "headword_col_index": dict_detection.get("headword_col_index"),
+        },
+        "processing_modes": {
+            "ipa_mode": ipa_mode,
+            "syllable_mode": syllable_mode,
+            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
+            "syllables_applied": syllable_insertions > 0,
+        },
+        "page_number": page_number_info,
+        "duration_seconds": round(duration, 2),
+    }
+
+    return result
+
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index a8884de..ea91384 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1,14 +1,7 @@
 """
-Grid Editor API — builds a structured, zone-aware grid from Kombi OCR results.
+Grid Editor API — endpoints for grid building, editing, and export.
 
-Takes the merged word positions from paddle-kombi / rapid-kombi and:
-  1. Detects bordered boxes on the image (cv_box_detect)
-  2. Splits the page into zones (content + box regions)
-  3. Clusters words into columns and rows per zone
-  4. Returns a hierarchical StructuredGrid for the frontend Excel-like editor
-
-Lizenz: Apache 2.0 (kommerziell nutzbar)
-DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+The core grid building logic is in grid_build_core.py.
 """
 
 import logging
@@ -16,1950 +9,20 @@ import re
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
-import cv2
-import numpy as np
 from fastapi import APIRouter, HTTPException, Query, Request
 
-from cv_box_detect import detect_boxes, split_page_into_zones
-from cv_graphic_detect import detect_graphic_elements
-from cv_vocab_types import PageZone
-from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
+from grid_build_core import _build_grid_core
+from grid_editor_helpers import _words_in_zone
 from ocr_pipeline_session_store import (
     get_session_db,
-    get_session_image,
     update_session_db,
 )
 
-from grid_editor_helpers import (
-    _filter_border_strip_words,
-    _cluster_columns_by_alignment,
-    _GRID_GHOST_CHARS,
-    _filter_border_ghosts,
-    _MARKER_CHARS,
-    _merge_inline_marker_columns,
-    _flatten_word_boxes,
-    _words_in_zone,
-    _PIPE_RE_VSPLIT,
-    _detect_vertical_dividers,
-    _split_zone_at_vertical_dividers,
-    _merge_content_zones_across_boxes,
-    _detect_heading_rows_by_color,
-    _detect_heading_rows_by_single_cell,
-    _detect_header_rows,
-    _build_zone_grid,
-    _get_content_bounds,
-    _filter_decorative_margin,
-    _filter_footer_words,
-    _filter_header_junk,
-)
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
 
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-# ---------------------------------------------------------------------------
-# Core computation (used by build-grid endpoint and regression tests)
-# ---------------------------------------------------------------------------
-
-async def _build_grid_core(
-    session_id: str,
-    session: dict,
-    *,
-    ipa_mode: str = "auto",
-    syllable_mode: str = "auto",
-) -> dict:
-    """Core grid building logic — pure computation, no HTTP or DB side effects.
-
-    Args:
-        session_id: Session identifier (for logging and image loading).
-        session: Full session dict from get_session_db().
-        ipa_mode: "auto" (only when English headwords detected), "all"
-            (force IPA on all content columns), "en" (English column only),
-            "de" (German/definition columns only), or "none" (skip entirely).
-        syllable_mode: "auto" (only when original has pipe dividers),
-            "all" (force syllabification on all words), "en" (English only),
-            "de" (German only), or "none" (skip).
-
-    Returns:
-        StructuredGrid result dict.
-
-    Raises:
-        ValueError: If session data is incomplete.
-    """
-    t0 = time.time()
-
-    # 1. Validate and load word results
-    word_result = session.get("word_result")
-    if not word_result or not word_result.get("cells"):
-        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
-
-    img_w = word_result.get("image_width", 0)
-    img_h = word_result.get("image_height", 0)
-    if not img_w or not img_h:
-        raise ValueError("Missing image dimensions in word_result")
-
-    # 2. Flatten all word boxes from cells
-    all_words = _flatten_word_boxes(word_result["cells"])
-    if not all_words:
-        raise ValueError("No word boxes found in cells")
-
-    logger.info("build-grid session %s: %d words from %d cells",
-                session_id, len(all_words), len(word_result["cells"]))
-
-    # 2b. Filter decorative margin columns (alphabet graphics).
-    # Some worksheets have a decorative alphabet strip along one margin
-    # (A-Z in a graphic).  OCR reads these as single-char words aligned
-    # vertically.  Detect and remove them before grid building.
-    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
-    margin_strip_detected = margin_strip_info.get("found", False)
-
-    # Read document_category from session (user-selected or auto-detected)
-    document_category = session.get("document_category")
-
-    # 2c. Filter footer rows (page numbers at the very bottom).
-    # Isolated short text in the bottom 5% of the page is typically a
-    # page number ("64", "S. 12") and not real content.  The page number
-    # is extracted as metadata for the frontend header display.
-    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
-
-    # 2c2. Filter OCR junk from header illustrations.
-    # Low-confidence short fragments above the first real content row.
-    _filter_header_junk(all_words, img_h, logger, session_id)
-
-    # 2d. Filter words inside user-defined exclude regions (from Structure step).
-    # These are explicitly marked by the user, so ALL words inside are removed
-    # regardless of confidence.
-    structure_result = session.get("structure_result")
-    exclude_rects = []
-    if structure_result:
-        for er in structure_result.get("exclude_regions", []):
-            exclude_rects.append({
-                "x": er["x"], "y": er["y"],
-                "w": er["w"], "h": er["h"],
-            })
-    if exclude_rects:
-        before = len(all_words)
-        filtered = []
-        for w in all_words:
-            w_cx = w["left"] + w.get("width", 0) / 2
-            w_cy = w["top"] + w.get("height", 0) / 2
-            inside = any(
-                er["x"] <= w_cx <= er["x"] + er["w"]
-                and er["y"] <= w_cy <= er["y"] + er["h"]
-                for er in exclude_rects
-            )
-            if not inside:
-                filtered.append(w)
-        removed = before - len(filtered)
-        if removed:
-            all_words = filtered
-            logger.info(
-                "build-grid session %s: removed %d words inside %d user exclude region(s)",
-                session_id, removed, len(exclude_rects),
-            )
-
-    # 2e. Hard-filter words inside graphic/image regions from structure step.
-    # ALL words inside graphic regions are removed regardless of confidence —
-    # images cannot contain real text; any OCR words inside are artifacts.
-    # After image loading (Step 3a) we augment these with freshly detected
-    # graphic regions from cv_graphic_detect.
-    graphic_rects: List[Dict[str, int]] = []
-    if structure_result:
-        for g in structure_result.get("graphics", []):
-            graphic_rects.append({
-                "x": g["x"], "y": g["y"],
-                "w": g["w"], "h": g["h"],
-            })
-    if graphic_rects:
-        before = len(all_words)
-        all_words = [
-            w for w in all_words
-            if not any(
-                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
-                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
-                for gr in graphic_rects
-            )
-        ]
-        removed = before - len(all_words)
-        if removed:
-            logger.info(
-                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
-                session_id, removed, len(graphic_rects),
-            )
-
-    # 3. Load image for box detection
-    img_png = await get_session_image(session_id, "cropped")
-    if not img_png:
-        img_png = await get_session_image(session_id, "dewarped")
-    if not img_png:
-        img_png = await get_session_image(session_id, "original")
-
-    zones_data: List[Dict[str, Any]] = []
-    boxes_detected = 0
-    recovered_count = 0
-    border_prefiltered = False
-    img_bgr = None
-
-    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
-
-    if img_png:
-        # Decode image for color detection + box detection
-        arr = np.frombuffer(img_png, dtype=np.uint8)
-        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
-
-        if img_bgr is not None:
-            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
-            # Pass only significant words (len >= 3) to the detector so that
-            # short OCR artifacts inside images don't fool the text-vs-graphic
-            # heuristic (it counts word centroids to distinguish text from images).
-            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
-            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
-            if fresh_graphics:
-                fresh_rects = [
-                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
-                    for g in fresh_graphics
-                ]
-                graphic_rects.extend(fresh_rects)
-                logger.info(
-                    "build-grid session %s: detected %d graphic region(s) via CV",
-                    session_id, len(fresh_graphics),
-                )
-                # Hard-filter words inside newly detected graphic regions
-                before = len(all_words)
-                all_words = [
-                    w for w in all_words
-                    if not any(
-                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
-                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
-                        for gr in fresh_rects
-                    )
-                ]
-                removed = before - len(all_words)
-                if removed:
-                    logger.info(
-                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
-                        session_id, removed, len(fresh_rects),
-                    )
-
-            # --- Recover colored text that OCR missed (before grid building) ---
-            recovered = recover_colored_text(img_bgr, all_words)
-            if recovered and graphic_rects:
-                # Filter recovered chars inside graphic regions
-                recovered = [
-                    r for r in recovered
-                    if not any(
-                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
-                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
-                        for gr in graphic_rects
-                    )
-                ]
-            if recovered:
-                recovered_count = len(recovered)
-                all_words.extend(recovered)
-                logger.info(
-                    "build-grid session %s: +%d recovered colored words",
-                    session_id, recovered_count,
-                )
-
-            # Detect bordered boxes
-            boxes = detect_boxes(
-                img_bgr,
-                content_x=content_x,
-                content_w=content_w,
-                content_y=content_y,
-                content_h=content_h,
-            )
-            boxes_detected = len(boxes)
-
-            if boxes:
-                # Filter border ghost words before grid building
-                all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
-                if ghost_count:
-                    logger.info(
-                        "build-grid session %s: removed %d border ghost words",
-                        session_id, ghost_count,
-                    )
-
-                # Split page into zones
-                page_zones = split_page_into_zones(
-                    content_x, content_y, content_w, content_h, boxes
-                )
-
-                # Merge content zones separated by box zones
-                page_zones = _merge_content_zones_across_boxes(
-                    page_zones, content_x, content_w
-                )
-
-                # 3b. Detect vertical dividers and split content zones
-                vsplit_group_counter = 0
-                expanded_zones: List = []
-                for pz in page_zones:
-                    if pz.zone_type != "content":
-                        expanded_zones.append(pz)
-                        continue
-                    zone_words = _words_in_zone(
-                        all_words, pz.y, pz.height, pz.x, pz.width
-                    )
-                    divider_xs = _detect_vertical_dividers(
-                        zone_words, pz.x, pz.width, pz.y, pz.height
-                    )
-                    if divider_xs:
-                        sub_zones = _split_zone_at_vertical_dividers(
-                            pz, divider_xs, vsplit_group_counter
-                        )
-                        expanded_zones.extend(sub_zones)
-                        vsplit_group_counter += 1
-                        # Remove pipe words so they don't appear in sub-zones
-                        pipe_ids = set(
-                            id(w) for w in zone_words
-                            if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
-                        )
-                        all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
-                        logger.info(
-                            "build-grid: vertical split zone %d at x=%s → %d sub-zones",
-                            pz.index, [int(x) for x in divider_xs], len(sub_zones),
-                        )
-                    else:
-                        expanded_zones.append(pz)
-                # Re-index zones
-                for i, pz in enumerate(expanded_zones):
-                    pz.index = i
-                page_zones = expanded_zones
-
-                # --- Union columns from all content zones ---
-                # Each content zone detects columns independently.  Narrow
-                # columns (page refs, markers) may appear in only one zone.
-                # Merge column split-points from ALL content zones so every
-                # zone shares the full column set.
-                # NOTE: Zones from a vertical split are independent and must
-                # NOT share columns with each other.
-
-                # First pass: build grids per zone independently
-                zone_grids: List[Dict] = []
-
-                for pz in page_zones:
-                    zone_words = _words_in_zone(
-                        all_words, pz.y, pz.height, pz.x, pz.width
-                    )
-                    if pz.zone_type == "content":
-                        logger.info(
-                            "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words",
-                            pz.index, pz.zone_type,
-                            pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
-                            len(zone_words), len(all_words),
-                        )
-                    # Filter recovered single-char artifacts in ALL zones
-                    # (decorative colored pixel blobs like !, ?, • from
-                    # recover_colored_text that don't represent real text)
-                    before = len(zone_words)
-                    zone_words = [
-                        w for w in zone_words
-                        if not (
-                            w.get("recovered")
-                            and len(w.get("text", "").strip()) <= 2
-                        )
-                    ]
-                    removed = before - len(zone_words)
-                    if removed:
-                        logger.info(
-                            "build-grid: filtered %d recovered artifacts from %s zone %d",
-                            removed, pz.zone_type, pz.index,
-                        )
-                    # Filter words inside image overlay regions (merged box zones)
-                    if pz.image_overlays:
-                        before_ov = len(zone_words)
-                        zone_words = [
-                            w for w in zone_words
-                            if not any(
-                                ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
-                                and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
-                                for ov in pz.image_overlays
-                            )
-                        ]
-                        ov_removed = before_ov - len(zone_words)
-                        if ov_removed:
-                            logger.info(
-                                "build-grid: filtered %d words inside image overlays from zone %d",
-                                ov_removed, pz.index,
-                            )
-                    zone_words, bs_removed = _filter_border_strip_words(zone_words)
-                    if bs_removed:
-                        border_prefiltered = True
-                        logger.info(
-                            "build-grid: pre-filtered %d border-strip words from zone %d",
-                            bs_removed, pz.index,
-                        )
-                    grid = _build_zone_grid(
-                        zone_words, pz.x, pz.y, pz.width, pz.height,
-                        pz.index, img_w, img_h,
-                        skip_first_row_header=bool(pz.image_overlays),
-                    )
-                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
-
-                # Second pass: merge column boundaries from all content zones
-                # Exclude zones from vertical splits — they have independent columns.
-                content_zones = [
-                    zg for zg in zone_grids
-                    if zg["pz"].zone_type == "content"
-                    and zg["pz"].vsplit_group is None
-                ]
-                if len(content_zones) > 1:
-                    # Collect column split points (x_min of non-first columns)
-                    all_split_xs: List[float] = []
-                    for zg in content_zones:
-                        raw_cols = zg["grid"].get("_raw_columns", [])
-                        for col in raw_cols[1:]:
-                            all_split_xs.append(col["x_min"])
-
-                    if all_split_xs:
-                        all_split_xs.sort()
-                        merge_distance = max(25, int(content_w * 0.03))
-                        merged_xs = [all_split_xs[0]]
-                        for x in all_split_xs[1:]:
-                            if x - merged_xs[-1] < merge_distance:
-                                merged_xs[-1] = (merged_xs[-1] + x) / 2
-                            else:
-                                merged_xs.append(x)
-
-                        total_cols = len(merged_xs) + 1
-                        max_zone_cols = max(
-                            len(zg["grid"].get("_raw_columns", []))
-                            for zg in content_zones
-                        )
-
-                        # Apply union whenever it has at least as many
-                        # columns as the best single zone.  Even with the
-                        # same count the union boundaries are better because
-                        # they incorporate evidence from all zones.
-                        if total_cols >= max_zone_cols:
-                            cx_min = min(w["left"] for w in all_words)
-                            cx_max = max(
-                                w["left"] + w["width"] for w in all_words
-                            )
-                            merged_columns: List[Dict[str, Any]] = []
-                            prev_x = cx_min
-                            for i, sx in enumerate(merged_xs):
-                                merged_columns.append({
-                                    "index": i,
-                                    "type": f"column_{i + 1}",
-                                    "x_min": prev_x,
-                                    "x_max": sx,
-                                })
-                                prev_x = sx
-                            merged_columns.append({
-                                "index": len(merged_xs),
-                                "type": f"column_{len(merged_xs) + 1}",
-                                "x_min": prev_x,
-                                "x_max": cx_max,
-                            })
-
-                            # Re-build ALL content zones with merged columns
-                            for zg in zone_grids:
-                                pz = zg["pz"]
-                                if pz.zone_type == "content":
-                                    grid = _build_zone_grid(
-                                        zg["words"], pz.x, pz.y,
-                                        pz.width, pz.height,
-                                        pz.index, img_w, img_h,
-                                        global_columns=merged_columns,
-                                        skip_first_row_header=bool(pz.image_overlays),
-                                    )
-                                    zg["grid"] = grid
-                            logger.info(
-                                "build-grid session %s: union of %d content "
-                                "zones → %d merged columns (max single zone: %d)",
-                                session_id, len(content_zones),
-                                total_cols, max_zone_cols,
-                            )
-
-                for zg in zone_grids:
-                    pz = zg["pz"]
-                    grid = zg["grid"]
-                    # Remove internal _raw_columns before adding to response
-                    grid.pop("_raw_columns", None)
-
-                    zone_entry: Dict[str, Any] = {
-                        "zone_index": pz.index,
-                        "zone_type": pz.zone_type,
-                        "bbox_px": {
-                            "x": pz.x, "y": pz.y,
-                            "w": pz.width, "h": pz.height,
-                        },
-                        "bbox_pct": {
-                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
-                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
-                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
-                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
-                        },
-                        "border": None,
-                        "word_count": len(zg["words"]),
-                        **grid,
-                    }
-
-                    if pz.box:
-                        zone_entry["border"] = {
-                            "thickness": pz.box.border_thickness,
-                            "confidence": pz.box.confidence,
-                        }
-
-                    if pz.image_overlays:
-                        zone_entry["image_overlays"] = pz.image_overlays
-
-                    if pz.layout_hint:
-                        zone_entry["layout_hint"] = pz.layout_hint
-                    if pz.vsplit_group is not None:
-                        zone_entry["vsplit_group"] = pz.vsplit_group
-
-                    zones_data.append(zone_entry)
-
-    # 4. Fallback: no boxes detected → single zone with all words
-    if not zones_data:
-        # Filter recovered single-char artifacts (same as in zone loop above)
-        before = len(all_words)
-        filtered_words = [
-            w for w in all_words
-            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
-        ]
-        removed = before - len(filtered_words)
-        if removed:
-            logger.info(
-                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
-                session_id, removed,
-            )
-        # Pre-filter border-strip words so column detection is not
-        # confused by edge artifacts.  When this removes words, Step 4e
-        # is skipped (it would otherwise re-detect content as a "strip").
-        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
-        if bs_removed:
-            border_prefiltered = True
-            logger.info(
-                "build-grid session %s: pre-filtered %d border-strip words",
-                session_id, bs_removed,
-            )
-        grid = _build_zone_grid(
-            filtered_words, content_x, content_y, content_w, content_h,
-            0, img_w, img_h,
-        )
-        grid.pop("_raw_columns", None)
-        zones_data.append({
-            "zone_index": 0,
-            "zone_type": "content",
-            "bbox_px": {
-                "x": content_x, "y": content_y,
-                "w": content_w, "h": content_h,
-            },
-            "bbox_pct": {
-                "x": round(content_x / img_w * 100, 2) if img_w else 0,
-                "y": round(content_y / img_h * 100, 2) if img_h else 0,
-                "w": round(content_w / img_w * 100, 2) if img_w else 0,
-                "h": round(content_h / img_h * 100, 2) if img_h else 0,
-            },
-            "border": None,
-            "word_count": len(all_words),
-            **grid,
-        })
-
-    # 4b. Remove junk rows: rows where ALL cells contain only short,
-    # low-confidence text (OCR noise, stray marks).  Real vocabulary rows
-    # have at least one word with conf >= 50 or meaningful text length.
-    # Also remove "oversized stub" rows: rows with ≤2 very short words
-    # whose word-boxes are significantly taller than the median (e.g.
-    # large red page numbers like "( 9" that are not real text content).
-    _JUNK_CONF_THRESHOLD = 50
-    _JUNK_MAX_TEXT_LEN = 3
-    for z in zones_data:
-        cells = z.get("cells", [])
-        rows = z.get("rows", [])
-        if not cells or not rows:
-            continue
-
-        # Compute median word height across the zone for oversized detection
-        all_wb_heights = [
-            wb["height"]
-            for cell in cells
-            for wb in cell.get("word_boxes") or []
-            if wb.get("height", 0) > 0
-        ]
-        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
-
-        junk_row_indices = set()
-        for row in rows:
-            ri = row["index"]
-            row_cells = [c for c in cells if c.get("row_index") == ri]
-            if not row_cells:
-                continue
-
-            row_wbs = [
-                wb for cell in row_cells
-                for wb in cell.get("word_boxes") or []
-            ]
-
-            # Rule 1: ALL word_boxes are low-conf AND short text
-            all_junk = True
-            for wb in row_wbs:
-                text = (wb.get("text") or "").strip()
-                conf = wb.get("conf", 0)
-                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
-                    all_junk = False
-                    break
-            if all_junk and row_wbs:
-                junk_row_indices.add(ri)
-                continue
-
-            # Rule 2: oversized stub — ≤3 words, short total text,
-            # and word height > 1.8× median (page numbers, stray marks,
-            # OCR from illustration labels like "SEA &")
-            # Skip if any word looks like a page reference (p.55, S.12).
-            if len(row_wbs) <= 3:
-                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
-                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
-                has_page_ref = any(
-                    re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
-                    for wb in row_wbs
-                )
-                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
-                    junk_row_indices.add(ri)
-                    continue
-
-            # Rule 3: scattered debris — rows with only tiny fragments
-            # (e.g. OCR artifacts from illustrations/graphics).
-            # If the row has no word longer than 2 chars, it's noise.
-            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
-            if longest <= 2:
-                junk_row_indices.add(ri)
-                continue
-
-        if junk_row_indices:
-            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
-            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
-            logger.info(
-                "build-grid: removed %d junk rows from zone %d: %s",
-                len(junk_row_indices), z["zone_index"],
-                sorted(junk_row_indices),
-            )
-
-    # 4b2. Remove individual cells that consist of a single very-short,
-    # low-confidence word (OCR artifacts like "as", "b" from stray marks).
-    # These survive row-level junk removal when the row has valid cells
-    # in other columns.
-    _ARTIFACT_MAX_LEN = 2
-    _ARTIFACT_CONF_THRESHOLD = 65
-    for z in zones_data:
-        cells = z.get("cells", [])
-        if not cells:
-            continue
-        artifact_ids = set()
-        for cell in cells:
-            wbs = cell.get("word_boxes") or []
-            if len(wbs) != 1:
-                continue
-            wb = wbs[0]
-            text = (wb.get("text") or "").strip()
-            conf = wb.get("conf", 100)
-            if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
-                artifact_ids.add(cell.get("cell_id"))
-        if artifact_ids:
-            z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
-            logger.info(
-                "build-grid: removed %d artifact cells from zone %d: %s",
-                len(artifact_ids), z.get("zone_index", 0),
-                [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
-            )
-
-    # 4c. Remove oversized word_boxes from individual cells.
-    # OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
-    # have word heights 3-5x the median.  Remove them per-word so they don't
-    # pollute cells that also contain valid text in other columns.
-    for z in zones_data:
-        cells = z.get("cells", [])
-        if not cells:
-            continue
-        all_wh = [
-            wb["height"]
-            for cell in cells
-            for wb in cell.get("word_boxes") or []
-            if wb.get("height", 0) > 0
-        ]
-        if not all_wh:
-            continue
-        med_h = sorted(all_wh)[len(all_wh) // 2]
-        oversized_threshold = med_h * 3
-        removed_oversized = 0
-        for cell in cells:
-            wbs = cell.get("word_boxes") or []
-            filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
-            if len(filtered) < len(wbs):
-                removed_oversized += len(wbs) - len(filtered)
-                cell["word_boxes"] = filtered
-                cell["text"] = _words_to_reading_order_text(filtered)
-        if removed_oversized:
-            # Remove cells that became empty after oversized removal
-            z["cells"] = [c for c in cells if c.get("word_boxes")]
-            logger.info(
-                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
-                removed_oversized, oversized_threshold, z.get("zone_index", 0),
-            )
-
-    # 4d. Remove pipe-character word_boxes (column divider artifacts).
-    # OCR reads physical vertical divider lines as "|" or "||" characters.
-    # These sit at consistent x positions near column boundaries and pollute
-    # cell text.  Remove them from word_boxes and rebuild cell text.
-    # NOTE: Zones from a vertical split already had pipes removed in step 3b.
-    _PIPE_RE = re.compile(r"^\|+$")
-    for z in zones_data:
-        if z.get("vsplit_group") is not None:
-            continue  # pipes already removed before split
-        removed_pipes = 0
-        for cell in z.get("cells", []):
-            wbs = cell.get("word_boxes") or []
-            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
-            if len(filtered) < len(wbs):
-                removed_pipes += len(wbs) - len(filtered)
-                cell["word_boxes"] = filtered
-                cell["text"] = _words_to_reading_order_text(filtered)
-        # Remove cells that became empty after pipe removal
-        if removed_pipes:
-            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
-            logger.info(
-                "build-grid: removed %d pipe-divider word_boxes from zone %d",
-                removed_pipes, z.get("zone_index", 0),
-            )
-
-    # Strip pipe chars ONLY from word_boxes/cells where the pipe is an
-    # OCR column-divider artifact.  Preserve pipes that are embedded in
-    # words as syllable separators (e.g. "zu|trau|en") — these are
-    # intentional and used in dictionary Ground Truth.
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            for wb in cell.get("word_boxes", []):
-                wbt = wb.get("text", "")
-                # Only strip if the ENTIRE word_box is just pipe(s)
-                # (handled by _PIPE_RE above) — leave embedded pipes alone
-            text = cell.get("text", "")
-            if "|" in text:
-                # Only strip leading/trailing pipes (OCR artifacts at cell edges)
-                cleaned = text.strip("|").strip()
-                if cleaned != text.strip():
-                    cell["text"] = cleaned
-
-    # 4d2. Normalize narrow connector columns.
-    # In synonym dictionaries a narrow column repeats the same word
-    # (e.g. "oder") in every row.  OCR sometimes appends noise chars
-    # (e.g. "oderb" instead of "oder").  If ≥60% of cells in a column
-    # share the same short text, normalize near-match outliers.
-    for z in zones_data:
-        cols = z.get("columns", [])
-        cells = z.get("cells", [])
-        if not cols or not cells:
-            continue
-        for col in cols:
-            ci = col.get("index")
-            col_cells = [c for c in cells if c.get("col_index") == ci]
-            if len(col_cells) < 3:
-                continue
-            # Count text occurrences
-            text_counts: Dict[str, int] = {}
-            for c in col_cells:
-                t = (c.get("text") or "").strip()
-                if t:
-                    text_counts[t] = text_counts.get(t, 0) + 1
-            if not text_counts:
-                continue
-            dominant_text = max(text_counts, key=text_counts.get)  # type: ignore[arg-type]
-            dominant_count = text_counts[dominant_text]
-            # Only normalize if dominant word is short and appears in ≥60%
-            if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
-                continue
-            # Fix outliers that start with the dominant text
-            fixed = 0
-            for c in col_cells:
-                t = (c.get("text") or "").strip()
-                if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
-                    c["text"] = dominant_text
-                    # Also fix word_boxes
-                    wbs = c.get("word_boxes") or []
-                    if len(wbs) == 1:
-                        wbs[0]["text"] = dominant_text
-                    fixed += 1
-            if fixed:
-                logger.info(
-                    "build-grid: normalized %d outlier cells in connector column %d "
-                    "(dominant='%s') zone %d",
-                    fixed, ci, dominant_text, z.get("zone_index", 0),
-                )
-
-    # 4e. Detect and remove page-border decoration strips.
-    # Skipped when the pre-filter already removed border words BEFORE
-    # column detection — re-running would incorrectly detect the
-    # leftmost content column as a "strip".
-    border_strip_removed = 0
-    if border_prefiltered:
-        logger.info("Step 4e: skipped (border pre-filter already applied)")
-    else:
-        # Some textbooks have decorative alphabet strips along the page
-        # edge.  OCR picks up scattered letters from these as artifacts.
-        # Detection: find the first significant x-gap (>30 px) from each
-        # page edge between a small cluster (<20 %) and the main content.
-        for z in zones_data:
-            cells = z.get("cells", [])
-            if not cells:
-                continue
-            all_wbs_with_cell: List[tuple] = []  # (left, wb, cell)
-            for cell in cells:
-                for wb in cell.get("word_boxes") or []:
-                    all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
-            if len(all_wbs_with_cell) < 10:
-                continue
-            all_wbs_with_cell.sort(key=lambda t: t[0])
-            total = len(all_wbs_with_cell)
-
-            # -- Left-edge scan --
-            left_strip_count = 0
-            left_gap = 0
-            running_right = 0
-            for gi in range(total - 1):
-                running_right = max(
-                    running_right,
-                    all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
-                )
-                gap = all_wbs_with_cell[gi + 1][0] - running_right
-                if gap > 30:
-                    left_strip_count = gi + 1
-                    left_gap = gap
-                    break
-
-            # -- Right-edge scan --
-            right_strip_count = 0
-            right_gap = 0
-            running_left = all_wbs_with_cell[-1][0]
-            for gi in range(total - 1, 0, -1):
-                running_left = min(running_left, all_wbs_with_cell[gi][0])
-                prev_right = (
-                    all_wbs_with_cell[gi - 1][0]
-                    + all_wbs_with_cell[gi - 1][1].get("width", 0)
-                )
-                gap = running_left - prev_right
-                if gap > 30:
-                    right_strip_count = total - gi
-                    right_gap = gap
-                    break
-
-            strip_wbs: set = set()
-            strip_side = ""
-            strip_gap = 0
-            strip_count = 0
-            if left_strip_count > 0 and left_strip_count / total < 0.20:
-                strip_side = "left"
-                strip_count = left_strip_count
-                strip_gap = left_gap
-                strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
-            elif right_strip_count > 0 and right_strip_count / total < 0.20:
-                strip_side = "right"
-                strip_count = right_strip_count
-                strip_gap = right_gap
-                strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
-
-            if not strip_wbs:
-                continue
-            for cell in cells:
-                wbs = cell.get("word_boxes") or []
-                filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
-                if len(filtered) < len(wbs):
-                    border_strip_removed += len(wbs) - len(filtered)
-                    cell["word_boxes"] = filtered
-                    cell["text"] = _words_to_reading_order_text(filtered)
-            z["cells"] = [c for c in cells
-                          if (c.get("word_boxes") or c.get("text", "").strip())]
-            logger.info(
-                "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
-                "(gap=%dpx, strip=%d/%d wbs)",
-                border_strip_removed, strip_side, z.get("zone_index", 0),
-                strip_gap, strip_count, total,
-            )
-
-    # 4f. Remove decorative edge columns (alphabet sidebar safety net).
-    # Dictionary pages have A-Z letter sidebars that OCR reads as single-
-    # character word_boxes.  These form narrow columns with very short text.
-    # Detection: edge column where almost ALL cells are single characters.
-    for z in zones_data:
-        columns = z.get("columns", [])
-        cells = z.get("cells", [])
-        if len(columns) < 3 or not cells:
-            continue
-        # Group cells by col_type (skip spanning_header)
-        col_cells: Dict[str, List[Dict]] = {}
-        for cell in cells:
-            ct = cell.get("col_type", "")
-            if ct.startswith("column_"):
-                col_cells.setdefault(ct, []).append(cell)
-        col_types_ordered = sorted(col_cells.keys())
-        if len(col_types_ordered) < 3:
-            continue
-        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
-            edge_cells_list = col_cells.get(edge_ct, [])
-            if len(edge_cells_list) < 3:
-                continue
-            # Key criterion: average text length and single-char ratio.
-            # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
-            # are single characters.
-            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
-            avg_len = sum(len(t) for t in texts) / len(texts)
-            single_char = sum(1 for t in texts if len(t) <= 1)
-            single_ratio = single_char / len(texts)
-            if avg_len > 1.5:
-                continue  # real content has longer text
-            if single_ratio < 0.7:
-                continue  # not dominated by single chars
-            # Remove this edge column
-            removed_count = len(edge_cells_list)
-            edge_ids = {id(c) for c in edge_cells_list}
-            z["cells"] = [c for c in cells if id(c) not in edge_ids]
-            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
-            logger.info(
-                "Step 4f: removed decorative edge column '%s' from zone %d "
-                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
-                edge_ct, z.get("zone_index", 0), removed_count,
-                avg_len, single_ratio * 100,
-            )
-            break  # only remove one edge per zone
-
-    # 5. Color annotation on final word_boxes in cells
-    if img_bgr is not None:
-        all_wb: List[Dict] = []
-        for z in zones_data:
-            for cell in z.get("cells", []):
-                all_wb.extend(cell.get("word_boxes", []))
-        detect_word_colors(img_bgr, all_wb)
-
-    # 5a. Heading detection by color + height (after color is available)
-    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
-    if heading_count:
-        logger.info("Detected %d heading rows by color+height", heading_count)
-
-    # 5b. Fix unmatched parentheses in cell text
-    # OCR often misses opening "(" while detecting closing ")".
-    # If a cell's text has ")" without a matching "(", prepend "(".
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            text = cell.get("text", "")
-            if ")" in text and "(" not in text:
-                cell["text"] = "(" + text
-
-    # 5c. IPA phonetic correction — replace garbled OCR phonetics with
-    # correct IPA from the dictionary (same as in the OCR pipeline).
-    # Only applies to vocabulary tables (≥3 columns: EN | article | DE).
-    # Single/two-column layouts are continuous text, not vocab tables.
-    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
-    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
-    en_col_type = None
-    ipa_target_cols: set = set()
-    all_content_cols: set = set()
-    skip_ipa = (ipa_mode == "none")
-
-    # When ipa_mode=none, strip ALL square brackets from ALL content columns
-    if skip_ipa:
-        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
-        for cell in all_cells:
-            ct = cell.get("col_type", "")
-            if not ct.startswith("column_"):
-                continue
-            text = cell.get("text", "")
-            if "[" in text:
-                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
-                if stripped != text:
-                    cell["text"] = stripped.strip()
-                    cell["_ipa_corrected"] = True
-
-    if not skip_ipa and total_cols >= 3:
-        # Detect English headword column via IPA signals (brackets or garbled).
-        col_ipa_count: Dict[str, int] = {}
-        all_content_cols: set = set()
-        for cell in all_cells:
-            ct = cell.get("col_type", "")
-            if not ct.startswith("column_"):
-                continue
-            txt = cell.get("text", "") or ""
-            if txt.strip():
-                all_content_cols.add(ct)
-            if '[' in txt or _text_has_garbled_ipa(txt):
-                col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
-        if col_ipa_count:
-            en_col_type = max(col_ipa_count, key=col_ipa_count.get)
-        elif ipa_mode == "all":
-            # Force-all mode without auto-detection: pick column with most cells
-            col_cell_count: Dict[str, int] = {}
-            for cell in all_cells:
-                ct = cell.get("col_type", "")
-                if ct.startswith("column_") and (cell.get("text") or "").strip():
-                    col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
-            if col_cell_count:
-                en_col_type = max(col_cell_count, key=col_cell_count.get)
-
-        # Decide which columns to process based on ipa_mode:
-        # auto/en: only the detected EN headword column (English IPA)
-        # de: all content columns EXCEPT the EN column (German IPA)
-        # all: EN column gets English IPA, other columns get German IPA
-        en_ipa_target_cols: set = set()
-        de_ipa_target_cols: set = set()
-        if ipa_mode in ("auto", "en"):
-            if en_col_type:
-                en_ipa_target_cols.add(en_col_type)
-        elif ipa_mode == "de":
-            de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
-        elif ipa_mode == "all":
-            if en_col_type:
-                en_ipa_target_cols.add(en_col_type)
-            de_ipa_target_cols = all_content_cols - en_ipa_target_cols
-
-        # --- Strip IPA from columns NOT in the target set ---
-        # When user selects "nur DE", English IPA from the OCR scan must
-        # be removed.  When "none", all IPA is removed.
-        # In vocab columns, square brackets [...] are always IPA (both
-        # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]).
-        _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
-        strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
-        if strip_en_ipa or ipa_mode == "none":
-            strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
-            for cell in all_cells:
-                ct = cell.get("col_type", "")
-                if ct not in strip_cols:
-                    continue
-                text = cell.get("text", "")
-                if "[" in text:
-                    stripped = _SQUARE_BRACKET_RE.sub("", text)
-                    if stripped != text:
-                        cell["text"] = stripped.strip()
-                        cell["_ipa_corrected"] = True
-
-        # --- English IPA (Britfone + eng_to_ipa) ---
-        if en_ipa_target_cols:
-            for cell in all_cells:
-                ct = cell.get("col_type")
-                if ct in en_ipa_target_cols:
-                    cell["_orig_col_type"] = ct
-                    cell["col_type"] = "column_en"
-        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
-        fix_cell_phonetics(all_cells, pronunciation="british")
-        for cell in all_cells:
-            orig = cell.pop("_orig_col_type", None)
-            if orig:
-                cell["col_type"] = orig
-            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
-                cell["_ipa_corrected"] = True
-
-        # --- German IPA (wiki-pronunciation-dict + epitran) ---
-        if de_ipa_target_cols:
-            from cv_ipa_german import insert_german_ipa
-            insert_german_ipa(all_cells, de_ipa_target_cols)
-
-        ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
-
-        # Mark cells whose text was changed by IPA correction so that
-        # later steps (5i) don't overwrite the corrected text when
-        # reconstructing from word_boxes.  (Already done inline above
-        # for English; insert_german_ipa sets _ipa_corrected too.)
-        for cell in all_cells:
-            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
-                cell["_ipa_corrected"] = True
-
-        # 5d. Fix IPA continuation cells — cells where the printed
-        # phonetic transcription wraps to a line below the headword.
-        # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
-        # Replace garbled text with proper IPA looked up from the
-        # headword in the previous row's same column.
-        # Note: We check ALL columns, not just en_col_type, because
-        # the EN headword column may not be the longest-average column.
-        _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
-        ipa_cont_fixed = 0
-        for z in ([] if skip_ipa else zones_data):
-            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
-            z_cells = z.get("cells", [])
-            for idx, row in enumerate(rows_sorted):
-                if idx == 0:
-                    continue
-                ri = row["index"]
-                row_cells = [c for c in z_cells if c.get("row_index") == ri]
-                for cell in row_cells:
-                    ct = cell.get("col_type", "")
-                    if not ct.startswith("column_"):
-                        continue
-                    cell_text = (cell.get("text") or "").strip()
-                    if not cell_text:
-                        # Step 5c may have emptied garbled IPA cells like
-                        # "[n, nn]" — recover text from word_boxes.
-                        wb_texts = [w.get("text", "")
-                                    for w in cell.get("word_boxes", [])]
-                        cell_text = " ".join(wb_texts).strip()
-                        if not cell_text:
-                            continue
-
-                    is_bracketed = (
-                        cell_text.startswith('[') and cell_text.endswith(']')
-                    )
-
-                    if is_bracketed:
-                        # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
-                        # Text like "employee [im'ploi:]" is NOT fully
-                        # bracketed and won't match here.
-                        if not _text_has_garbled_ipa(cell_text):
-                            continue
-                        # Already has proper IPA brackets → skip
-                        if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
-                            continue
-                    else:
-                        # Unbracketed continuation: "ska:f – ska:vz",
-                        # "'sekandarr sku:l".  Only treat as IPA
-                        # continuation if this is the ONLY content cell
-                        # in the row (single-cell row) and the text is
-                        # garbled IPA without real IPA Unicode symbols.
-                        content_cells_in_row = [
-                            c for c in row_cells
-                            if c.get("col_type", "").startswith("column_")
-                            and c.get("col_type") != "column_1"
-                        ]
-                        if len(content_cells_in_row) != 1:
-                            continue
-                        if not _text_has_garbled_ipa(cell_text):
-                            continue
-                        # Has real IPA symbols → already fixed or valid
-                        if any(c in _REAL_IPA_CHARS for c in cell_text):
-                            continue
-
-                    # Find headword in previous row, same column
-                    prev_ri = rows_sorted[idx - 1]["index"]
-                    prev_same_col = [
-                        c for c in z_cells
-                        if c.get("row_index") == prev_ri
-                        and c.get("col_type") == ct
-                    ]
-                    if not prev_same_col:
-                        continue
-                    prev_text = prev_same_col[0].get("text", "")
-                    fixed = fix_ipa_continuation_cell(
-                        cell_text, prev_text, pronunciation="british",
-                    )
-                    if fixed != cell_text:
-                        cell["text"] = fixed
-                        ipa_cont_fixed += 1
-                        logger.info(
-                            "IPA continuation R%d %s: '%s' → '%s'",
-                            ri, ct, cell_text, fixed,
-                        )
-        if ipa_cont_fixed:
-            logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
-
-    # 5e. Heading detection by single-cell rows — black headings like
-    # "Theme" that have normal color and height but are the ONLY cell
-    # in their row (excluding page_ref column_1).  Must run AFTER 5d
-    # so IPA continuation cells are already processed.
-    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
-    if single_heading_count:
-        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
-
-    # 5f. Strip IPA from headings — headings detected in 5e ran AFTER
-    # IPA correction (5c), so they may have dictionary IPA appended
-    # (e.g. "Theme [θˈiːm]" → "Theme").  Headings should show the
-    # original text only.
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            if cell.get("col_type") != "heading":
-                continue
-            text = cell.get("text", "")
-            # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
-            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
-            if stripped and stripped != text:
-                cell["text"] = stripped
-
-    # 5g. Extract page_ref cells and footer rows from content zones.
-    # Page references (column_1 cells like "p.70") sit in rows that
-    # also contain vocabulary — extract them as zone metadata without
-    # removing the row.  Footer lines (e.g. "two hundred and twelve"
-    # = page number at bottom) are standalone rows that should be
-    # removed from the table entirely.
-    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
-    # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
-    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
-    _NUMBER_WORDS = {
-        "one", "two", "three", "four", "five", "six", "seven",
-        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
-        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
-        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
-        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
-        "einhundert", "zweihundert", "dreihundert", "vierhundert",
-        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
-    }
-    for z in zones_data:
-        if z.get("zone_type") != "content":
-            continue
-        cells = z.get("cells", [])
-        rows = z.get("rows", [])
-        if not rows:
-            continue
-
-        # Extract column_1 cells that look like page references
-        page_refs = []
-        page_ref_cell_ids = set()
-        for cell in cells:
-            if cell.get("col_type") != "column_1":
-                continue
-            text = (cell.get("text") or "").strip()
-            if not text:
-                continue
-            if not _PAGE_REF_RE.match(text):
-                continue
-            page_refs.append({
-                "row_index": cell.get("row_index"),
-                "text": text,
-                "bbox_pct": cell.get("bbox_pct", {}),
-            })
-            page_ref_cell_ids.add(cell.get("cell_id"))
-
-        # Keep page_ref cells in the table as a visible column.
-        # Previously these were removed, but users want to see them.
-        # The metadata extraction above still populates zone["page_refs"]
-        # for the frontend header display.
-
-        # Detect footer: last non-header row if it has only 1 cell
-        # with short, non-content text (page numbers like "233" or
-        # "two hundred and twelve").  Comma-separated lists and long
-        # text are content continuations, not page numbers.
-        footer_rows = []
-        non_header_rows = [r for r in rows if not r.get("is_header")]
-        if non_header_rows:
-            last_row = non_header_rows[-1]
-            last_ri = last_row["index"]
-            last_cells = [c for c in z["cells"]
-                          if c.get("row_index") == last_ri]
-            if len(last_cells) == 1:
-                text = (last_cells[0].get("text") or "").strip()
-                # Not IPA (no real IPA symbols) and not a heading
-                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
-                # Comma-separated text is a content continuation, not a footer
-                has_commas = ',' in text
-                # Written-out page numbers like "two hundred and nine"
-                text_words = set(text.lower().split())
-                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
-                # Short text or written-out number
-                is_page_number = len(text) <= 20 or is_written_number
-                if (text and not has_real_ipa and not has_commas
-                        and is_page_number
-                        and last_cells[0].get("col_type") != "heading"):
-                    footer_rows.append({
-                        "row_index": last_ri,
-                        "text": text,
-                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
-                    })
-
-        # Classify footer rows: page numbers are removed from the grid
-        # and promoted to page_number metadata; other footers stay as rows.
-        page_number_footers = []
-        other_footers = []
-        for fr in footer_rows:
-            ft = fr["text"].strip()
-            # Pure digits
-            digits = "".join(c for c in ft if c.isdigit())
-            if digits and re.match(r'^[\d\s.]+$', ft):
-                page_number_footers.append(fr)
-            # Written-out numbers
-            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
-                page_number_footers.append(fr)
-            else:
-                other_footers.append(fr)
-
-        # Remove page-number footer rows from grid entirely
-        if page_number_footers:
-            pn_ris = {fr["row_index"] for fr in page_number_footers}
-            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
-            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
-            # Set page_number metadata (use first one)
-            pn_text = page_number_footers[0]["text"].strip()
-            pn_digits = "".join(c for c in pn_text if c.isdigit())
-            if not page_number_info:
-                page_number_info = {
-                    "text": pn_text,
-                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
-                }
-                if pn_digits:
-                    page_number_info["number"] = int(pn_digits)
-
-        # Mark remaining footer rows (non-page-number content)
-        if other_footers:
-            footer_ris = {fr["row_index"] for fr in other_footers}
-            for r in z["rows"]:
-                if r["index"] in footer_ris:
-                    r["is_footer"] = True
-            for c in z["cells"]:
-                if c.get("row_index") in footer_ris:
-                    c["col_type"] = "footer"
-
-        if page_refs or footer_rows:
-            logger.info(
-                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
-                len(page_refs), len(footer_rows), len(page_number_footers),
-                z.get("zone_index", 0),
-            )
-
-        # Store as zone-level metadata
-        if page_refs:
-            z["page_refs"] = page_refs
-        if other_footers:
-            z["footer"] = other_footers
-
-    # 5h. Convert slash-delimited IPA to bracket notation.
-    # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
-    # Detect the pattern <headword> /ocr_ipa/ and replace with [dict_ipa]
-    # using the IPA dictionary when available, falling back to the OCR text.
-    # The regex requires a word character (or ² ³) right before the opening
-    # slash to avoid false positives like "sb/sth".
-    _SLASH_IPA_RE = re.compile(
-        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
-        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
-    )
-    # Standalone slash IPA at start of text (headword on previous line)
-    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
-    # IPA between slashes never contains spaces, parentheses, or commas.
-    # Reject matches that look like grammar: "sb/sth up a) jdn/"
-    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
-    slash_ipa_fixed = 0
-    for z in ([] if skip_ipa else zones_data):
-        for cell in z.get("cells", []):
-            # Only process English headword column — avoid converting
-            # German text like "der/die/das" to IPA.
-            if en_col_type and cell.get("col_type") != en_col_type:
-                continue
-            text = cell.get("text", "")
-            if "/" not in text:
-                continue
-
-            def _replace_slash_ipa(m: re.Match) -> str:
-                # Substitution callback for _SLASH_IPA_RE.sub(): receives one
-                # "<headword> /ipa/" match and returns its replacement text.
-                # Group 1 is the headword (letters plus an optional ²/³/¹),
-                # group 2 is the slash-delimited span.  Returns either
-                # "<headword> [<ipa>]" or the original match text unchanged.
-                # Every successful conversion increments the enclosing
-                # function's slash_ipa_fixed counter.
-                nonlocal slash_ipa_fixed
-                headword = m.group(1)
-                ocr_ipa = m.group(2)  # slash-delimited span, slashes included
-                inner_raw = ocr_ipa.strip("/").strip()
-                # Leave the match untouched when the span between the slashes
-                # contains whitespace, parentheses or commas — that is grammar
-                # notation such as "sb/sth up a) jdn/", not a pronunciation.
-                if _SLASH_IPA_REJECT_RE.search(inner_raw):
-                    return m.group(0)
-                # Remove superscript markers (² ³ ¹) and digits so that only the
-                # bare word is used as the lookup key.
-                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
-                # NOTE(review): _lookup_ipa is defined outside this view; it is
-                # presumably returning the dictionary IPA string, or a falsy
-                # value when the word is unknown — confirm against its definition.
-                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
-                if ipa:
-                    slash_ipa_fixed += 1
-                    # The original headword (superscript included) is kept; any
-                    # whitespace before the slash collapses to a single space.
-                    return f"{headword} [{ipa}]"
-                # Fallback: no lookup result, so keep the OCR'd text and only
-                # swap the slashes for brackets, dropping leading apostrophes.
-                inner = inner_raw.lstrip("'").strip()
-                if inner:
-                    slash_ipa_fixed += 1
-                    return f"{headword} [{inner}]"
-                # Nothing usable was left between the slashes: keep the match.
-                return m.group(0)
-
-            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
-
-            # Second pass: convert remaining /ipa/ after [ipa] from first pass.
-            # Pattern: [ipa] /ipa2/ → [ipa] [ipa2]  (second pronunciation variant)
-            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
-            def _replace_trailing_slash(m: re.Match) -> str:
-                # Substitution callback for _AFTER_BRACKET_SLASH.sub(): handles
-                # a "/ipa/" span that directly follows a closing "]".
-                # Group 1 is the slash-delimited span.  No dictionary lookup is
-                # done here; the OCR'd text is kept and only re-wrapped in
-                # brackets.  Returns " [<inner>]" on conversion (one leading
-                # space replaces whatever whitespace preceded the slash), or the
-                # original match text when the span is rejected or empty.
-                nonlocal slash_ipa_fixed
-                # Drop the slashes, surrounding whitespace and leading apostrophes.
-                inner = m.group(1).strip("/").strip().lstrip("'").strip()
-                # Whitespace, parentheses or commas inside the span mean this is
-                # grammar notation rather than a pronunciation: leave it alone.
-                if _SLASH_IPA_REJECT_RE.search(inner):
-                    return m.group(0)
-                if inner:
-                    # Count the conversion in the enclosing function's counter.
-                    slash_ipa_fixed += 1
-                    return f" [{inner}]"
-                # Nothing left after stripping: keep the original text.
-                return m.group(0)
-            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
-
-            # Handle standalone /ipa/ at start (no headword in this cell)
-            if new_text == text:
-                m = _STANDALONE_SLASH_IPA_RE.match(text)
-                if m:
-                    inner = m.group(1).strip()
-                    if not _SLASH_IPA_REJECT_RE.search(inner):
-                        inner = inner.lstrip("'").strip()
-                        if inner:
-                            new_text = "[" + inner + "]" + text[m.end():]
-                            slash_ipa_fixed += 1
-
-            if new_text != text:
-                cell["text"] = new_text
-
-    if slash_ipa_fixed:
-        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
-
-    # 5i. Remove blue bullet/artifact word_boxes.
-    # Dictionary pages have small blue square bullets (■) before entries.
-    # OCR reads these as text artifacts (©, e, *, or even plausible words
-    # like "fighily" overlapping the real word "tightly").
-    # Detection rules:
-    #   a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black)
-    #   b) Overlapping word_boxes: >40% x-overlap → remove lower confidence
-    #   c) Duplicate text: consecutive blue wbs with identical text, gap < 6px
-    bullet_removed = 0
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            wbs = cell.get("word_boxes") or []
-            if len(wbs) < 2:
-                continue
-            to_remove: set = set()
-
-            # Rule (a): tiny coloured symbols (bullets, graphic fragments)
-            for i, wb in enumerate(wbs):
-                cn = wb.get("color_name", "black")
-                if (cn != "black"
-                        and wb.get("width", 0) * wb.get("height", 0) < 200
-                        and wb.get("conf", 100) < 85):
-                    to_remove.add(i)
-
-            # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
-            # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
-            # Remove word boxes that contain NO letters or digits.
-            for i, wb in enumerate(wbs):
-                t = (wb.get("text") or "").strip()
-                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
-                    to_remove.add(i)
-
-            # Rule (b) + (c): overlap and duplicate detection
-            # Sort by x for pairwise comparison
-            _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
-            to_merge: List[Tuple[int, int]] = []  # pairs (i1, i2) to merge
-            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
-            for p in range(len(indexed) - 1):
-                i1, w1 = indexed[p]
-                i2, w2 = indexed[p + 1]
-                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
-                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
-                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
-                min_w = min(w1.get("width", 1), w2.get("width", 1))
-                gap = x2s - x1e
-                overlap_pct = overlap / min_w if min_w > 0 else 0
-
-                # (b) Significant x-overlap
-                if overlap_pct > 0.20:
-                    t1 = (w1.get("text") or "").strip()
-                    t2 = (w2.get("text") or "").strip()
-
-                    # Syllable-split words: both are alphabetic text with
-                    # moderate overlap (20-75%).  Merge instead of removing.
-                    # OCR splits words at syllable marks, producing overlapping
-                    # boxes like "zu" + "tiefst" → "zutiefst".
-                    if (overlap_pct <= 0.75
-                            and _ALPHA_WORD_RE.match(t1)
-                            and _ALPHA_WORD_RE.match(t2)):
-                        to_merge.append((i1, i2))
-                        continue
-
-                    # High overlap (>75%) with different alphabetic text:
-                    # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
-                    # causing it to heavily overlap with the next fragment ("brech").
-                    # Merge instead of removing when one is a short prefix (≤4 chars)
-                    # and the texts are different.
-                    if (overlap_pct > 0.75
-                            and _ALPHA_WORD_RE.match(t1)
-                            and _ALPHA_WORD_RE.match(t2)
-                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
-                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
-                        to_merge.append((i1, i2))
-                        continue
-
-                    if overlap_pct <= 0.40:
-                        continue  # too little overlap and not alphabetic merge
-
-                    c1 = w1.get("conf", 50)
-                    c2 = w2.get("conf", 50)
-
-                    # For very high overlap (>90%) with different text,
-                    # prefer the word that exists in the IPA dictionary
-                    # over confidence (OCR can give artifacts high conf).
-                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
-                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
-                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
-                        if in_dict_1 and not in_dict_2:
-                            to_remove.add(i2)
-                            continue
-                        elif in_dict_2 and not in_dict_1:
-                            to_remove.add(i1)
-                            continue
-
-                    if c1 < c2:
-                        to_remove.add(i1)
-                    elif c2 < c1:
-                        to_remove.add(i2)
-                    else:
-                        # Same confidence: remove the taller one (bullet slivers)
-                        if w1.get("height", 0) > w2.get("height", 0):
-                            to_remove.add(i1)
-                        else:
-                            to_remove.add(i2)
-
-                # (c) Duplicate text: consecutive blue with same text, gap < 6px
-                elif (gap < 6
-                      and w1.get("color_name") == "blue"
-                      and w2.get("color_name") == "blue"
-                      and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
-                    # Remove the one with lower confidence; if equal, first one
-                    c1 = w1.get("conf", 50)
-                    c2 = w2.get("conf", 50)
-                    to_remove.add(i1 if c1 <= c2 else i2)
-
-            # Execute merges first (syllable-split words).
-            # Use merge_parent to support chain merging: if "zer" absorbed
-            # "brech" and then "brech"+"lich" is a merge pair, redirect to
-            # merge "lich" into "zer" → "zerbrechlich".
-            if to_merge:
-                merge_parent: Dict[int, int] = {}  # absorbed → absorber
-                for mi1, mi2 in to_merge:
-                    # Follow chain: if mi1 was absorbed, find root absorber
-                    actual_mi1 = mi1
-                    while actual_mi1 in merge_parent:
-                        actual_mi1 = merge_parent[actual_mi1]
-                    if actual_mi1 in to_remove or mi2 in to_remove:
-                        continue
-                    if mi2 in merge_parent:
-                        continue  # mi2 already absorbed
-                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
-                    # Concatenate text (no space — they're parts of one word)
-                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
-                    mt2 = (mw2.get("text") or "").strip()
-                    merged_text = mt1 + mt2
-                    # Union bounding box
-                    mx = min(mw1["left"], mw2["left"])
-                    my = min(mw1["top"], mw2["top"])
-                    mr = max(mw1["left"] + mw1["width"],
-                             mw2["left"] + mw2["width"])
-                    mb = max(mw1["top"] + mw1["height"],
-                             mw2["top"] + mw2["height"])
-                    mw1["text"] = merged_text
-                    mw1["left"] = mx
-                    mw1["top"] = my
-                    mw1["width"] = mr - mx
-                    mw1["height"] = mb - my
-                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
-                    to_remove.add(mi2)
-                    merge_parent[mi2] = actual_mi1
-                    bullet_removed -= 1  # net: merge, not removal
-
-            if to_remove:
-                bullet_removed += len(to_remove)
-                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
-                cell["word_boxes"] = filtered
-                # Don't overwrite text that was corrected by Step 5c IPA fix
-                if not cell.get("_ipa_corrected"):
-                    cell["text"] = _words_to_reading_order_text(filtered)
-
-    # Remove cells that became empty after bullet removal
-    if bullet_removed:
-        for z in zones_data:
-            z["cells"] = [c for c in z.get("cells", [])
-                          if (c.get("word_boxes") or c.get("text", "").strip())]
-        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
-
-    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
-    # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
-    # "\\", "PEE", "a=") that survive earlier filters because their rows also
-    # contain real content in other columns.  Remove them here.
-    _COMMON_SHORT_WORDS = {
-        # German
-        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
-        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
-        "die", "der", "das", "dem", "den", "des", "ein", "und",
-        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
-        # English
-        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
-        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
-        "on", "or", "so", "to", "up", "us", "we",
-        "the", "and", "but", "for", "not",
-    }
-    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
-    artifact_cells_removed = 0
-    for z in zones_data:
-        before = len(z.get("cells", []))
-        kept = []
-        for cell in z.get("cells", []):
-            text = (cell.get("text") or "").strip()
-            core = text.rstrip(".,;:!?'\"")
-            is_artifact = False
-            if not core:
-                is_artifact = True
-            elif _PURE_JUNK_RE.match(core):
-                is_artifact = True
-            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
-                # Short non-alphabetic text like "a=", not word beginnings like "Zw"
-                is_artifact = True
-            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
-                is_artifact = True
-            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
-                  and not re.match(r'^[pPsS]\.?\d+$', core)):
-                # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
-                # but NOT page references like "p.43", "p50", "S.12"
-                is_artifact = True
-            if is_artifact:
-                kept.append(None)  # placeholder
-            else:
-                kept.append(cell)
-        z["cells"] = [c for c in kept if c is not None]
-        artifact_cells_removed += before - len(z["cells"])
-    if artifact_cells_removed:
-        # Also remove rows that became completely empty
-        for z in zones_data:
-            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
-            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
-        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
-
-    # 5j. Normalise word_box order to reading order (group by Y, sort by X).
-    # The frontend renders colored cells from word_boxes array order
-    # (GridTable.tsx), so they MUST be in left-to-right reading order.
-    wb_reordered = 0
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            wbs = cell.get("word_boxes") or []
-            if len(wbs) < 2:
-                continue
-            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
-            sorted_wbs = [w for line in lines for w in line]
-            # Check if order actually changed
-            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
-                cell["word_boxes"] = sorted_wbs
-                wb_reordered += 1
-    if wb_reordered:
-        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
-
-    duration = time.time() - t0
-
-    # 6. Build result
-    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
-    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
-    total_rows = sum(len(z.get("rows", [])) for z in zones_data)
-
-    # Collect color statistics from all word_boxes in cells
-    color_stats: Dict[str, int] = {}
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            for wb in cell.get("word_boxes", []):
-                cn = wb.get("color_name", "black")
-                color_stats[cn] = color_stats.get(cn, 0) + 1
-
-    # Compute layout metrics for faithful grid reconstruction
-    all_content_row_heights: List[float] = []
-    for z in zones_data:
-        for row in z.get("rows", []):
-            if not row.get("is_header", False):
-                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
-                if h > 0:
-                    all_content_row_heights.append(h)
-    avg_row_height = (
-        sum(all_content_row_heights) / len(all_content_row_heights)
-        if all_content_row_heights else 30.0
-    )
-    font_size_suggestion = max(10, int(avg_row_height * 0.6))
-
-    # --- Dictionary detection on assembled grid ---
-    # Build lightweight ColumnGeometry-like structures from zone columns for
-    # dictionary signal scoring.
-    from cv_layout import _score_dictionary_signals
-    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
-    try:
-        from cv_vocab_types import ColumnGeometry
-        for z in zones_data:
-            zone_cells = z.get("cells", [])
-            zone_cols = z.get("columns", [])
-            if len(zone_cols) < 2 or len(zone_cells) < 10:
-                continue
-            # Build pseudo-ColumnGeometry per column
-            pseudo_geoms = []
-            for col in zone_cols:
-                ci = col["index"]
-                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
-                # Flatten word_boxes into word dicts compatible with _score_language
-                col_words = []
-                for cell in col_cells:
-                    for wb in cell.get("word_boxes") or []:
-                        col_words.append({
-                            "text": wb.get("text", ""),
-                            "conf": wb.get("conf", 0),
-                            "top": wb.get("top", 0),
-                            "left": wb.get("left", 0),
-                            "height": wb.get("height", 0),
-                            "width": wb.get("width", 0),
-                        })
-                    # Fallback: use cell text if no word_boxes
-                    if not cell.get("word_boxes") and cell.get("text"):
-                        col_words.append({
-                            "text": cell["text"],
-                            "conf": cell.get("confidence", 50),
-                            "top": cell.get("bbox_px", {}).get("y", 0),
-                            "left": cell.get("bbox_px", {}).get("x", 0),
-                            "height": cell.get("bbox_px", {}).get("h", 20),
-                            "width": cell.get("bbox_px", {}).get("w", 50),
-                        })
-                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
-                pseudo_geoms.append(ColumnGeometry(
-                    index=ci, x=col.get("x_min_px", 0), y=0,
-                    width=max(col_w, 1), height=img_h,
-                    word_count=len(col_words), words=col_words,
-                    width_ratio=col_w / max(img_w, 1),
-                ))
-            if len(pseudo_geoms) >= 2:
-                dd = _score_dictionary_signals(
-                    pseudo_geoms,
-                    document_category=document_category,
-                    margin_strip_detected=margin_strip_detected,
-                )
-                if dd["confidence"] > dict_detection["confidence"]:
-                    dict_detection = dd
-    except Exception as e:
-        logger.warning("Dictionary detection failed: %s", e)
-
-    # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
-    try:
-        from cv_syllable_detect import merge_word_gaps_in_zones
-        merge_word_gaps_in_zones(zones_data, session_id)
-    except Exception as e:
-        logger.warning("Word-gap merge failed: %s", e)
-
-    # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
-    # Strips | from words, validates with pyphen, tries char-deletion for garbled
-    # words like "Ze|plpe|lin" → "Zeppelin".
-    try:
-        from cv_syllable_detect import autocorrect_pipe_artifacts
-        autocorrect_pipe_artifacts(zones_data, session_id)
-    except Exception as e:
-        logger.warning("Pipe autocorrect failed: %s", e)
-
-    # --- Syllable divider insertion for dictionary pages ---
-    # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
-    #   "all" = force on all content words, "en" = English column only,
-    #   "de" = German columns only, "none" = skip entirely.
-    syllable_insertions = 0
-    if syllable_mode != "none" and img_bgr is not None:
-        _syllable_eligible = False
-        if syllable_mode in ("all", "de", "en"):
-            _syllable_eligible = True
-        elif (dict_detection.get("is_dictionary")
-                and dict_detection.get("article_col_index") is not None):
-            # auto: only on dictionary pages with article columns
-            _syllable_eligible = True
-        # For language-specific modes, determine allowed columns
-        _syllable_col_filter: Optional[set] = None  # None = all columns
-        if syllable_mode == "en":
-            _syllable_col_filter = {en_col_type} if en_col_type else set()
-        elif syllable_mode == "de":
-            if en_col_type and total_cols >= 3:
-                _syllable_col_filter = all_content_cols - {en_col_type}
-            # else None → all columns (correct for German-only dicts)
-        if _syllable_eligible:
-            try:
-                from cv_syllable_detect import insert_syllable_dividers
-                force_syllables = (syllable_mode in ("all", "de", "en"))
-                syllable_insertions = insert_syllable_dividers(
-                    zones_data, img_bgr, session_id,
-                    force=force_syllables,
-                    col_filter=_syllable_col_filter,
-                )
-            except Exception as e:
-                logger.warning("Syllable insertion failed: %s", e)
-
-    # When syllable mode is "none", strip any residual | from OCR so
-    # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
-    if syllable_mode == "none":
-        for z in zones_data:
-            for cell in z.get("cells", []):
-                t = cell.get("text", "")
-                if "|" in t:
-                    cell["text"] = t.replace("|", "")
-
-    # --- Split merged words (OCR sometimes glues adjacent words) ---
-    # Uses dictionary lookup to split e.g. "atmyschool" → "at my school"
-    try:
-        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
-        if _SPELL_AVAILABLE:
-            split_count = 0
-            for z in zones_data:
-                for cell in z.get("cells", []):
-                    text = cell.get("text", "")
-                    if not text:
-                        continue
-                    parts = []
-                    changed = False
-                    for token in text.split():
-                        # Try splitting pure-alpha tokens >= 4 chars
-                        # Strip trailing punctuation AND IPA brackets
-                        clean = token
-                        # Remove trailing IPA like [dɪsˈɪʒən] first
-                        bracket_pos = clean.find('[')
-                        suffix_ipa = ""
-                        if bracket_pos > 0:
-                            suffix_ipa = clean[bracket_pos:]
-                            clean = clean[:bracket_pos]
-                        suffix_punct = ""
-                        stripped = clean.rstrip(".,!?;:'\")")
-                        if stripped != clean:
-                            suffix_punct = clean[len(stripped):]
-                            clean = stripped
-                        suffix = suffix_punct + suffix_ipa
-                        # Handle contractions: "solet's" → try "solet" + "'s"
-                        contraction = ""
-                        if "'" in clean and clean.index("'") >= 2:
-                            apos_pos = clean.index("'")
-                            contraction = clean[apos_pos:]
-                            clean = clean[:apos_pos]
-                            suffix = contraction + suffix
-                        if len(clean) >= 4 and clean.isalpha():
-                            split = _try_split_merged_word(clean)
-                            if split:
-                                parts.append(split + suffix)
-                                changed = True
-                                continue
-                        parts.append(token)
-                    if changed:
-                        cell["text"] = " ".join(parts)
-                        split_count += 1
-            if split_count:
-                logger.info("build-grid session %s: split %d merged words", session_id, split_count)
-    except ImportError:
-        pass
-
-    # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
-    # Matches any [bracket] directly after a letter, as long as the bracket
-    # content doesn't look like a normal text annotation (e.g. "[adj]", "[noun]").
-    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            text = cell.get("text", "")
-            if text and "[" in text:
-                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
-                if fixed != text:
-                    cell["text"] = fixed
-
-    # --- SmartSpellChecker: language-aware OCR correction on all cells ---
-    try:
-        from smart_spell import SmartSpellChecker
-        _ssc = SmartSpellChecker()
-        spell_fix_count = 0
-
-        # Determine language per column:
-        # en_col_type was already detected (column with IPA = English).
-        # All other content columns are assumed German for vocab tables.
-        # For single/two-column layouts, use auto-detection.
-        for z in zones_data:
-            zone_cols = z.get("columns", [])
-            for cell in z.get("cells", []):
-                text = cell.get("text", "")
-                if not text or not text.strip():
-                    continue
-                ct = cell.get("col_type", "")
-                if not ct.startswith("column_"):
-                    continue
-
-                # Determine language for this cell
-                if total_cols >= 3 and en_col_type:
-                    lang = "en" if ct == en_col_type else "de"
-                elif total_cols <= 2:
-                    lang = "auto"  # auto-detect for non-vocab layouts
-                else:
-                    lang = "auto"
-
-                result = _ssc.correct_text(text, lang=lang)
-                if result.changed:
-                    cell["text"] = result.corrected
-                    spell_fix_count += 1
-
-        if spell_fix_count:
-            logger.info(
-                "build-grid session %s: SmartSpellChecker fixed %d cells",
-                session_id, spell_fix_count,
-            )
-    except ImportError:
-        logger.debug("SmartSpellChecker not available in build-grid")
-    except Exception as e:
-        logger.warning("SmartSpellChecker error in build-grid: %s", e)
-
-    # --- Debug: log cell counts per column before empty-column removal ---
-    for z in zones_data:
-        if z.get("zone_type") == "content":
-            from collections import Counter as _Counter
-            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
-            _cols = z.get("columns", [])
-            logger.info(
-                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
-                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
-            )
-
-    # --- Remove empty columns (no cells assigned) ---
-    for z in zones_data:
-        cells = z.get("cells", [])
-        used_col_indices = {c.get("col_index") for c in cells}
-        old_cols = z.get("columns", [])
-        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
-        if len(new_cols) < len(old_cols):
-            # Re-index columns and cells
-            old_to_new = {}
-            for new_i, col in enumerate(new_cols):
-                old_i = col.get("col_index", col.get("index", new_i))
-                old_to_new[old_i] = new_i
-                col["col_index"] = new_i
-                col["index"] = new_i
-                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
-            for cell in cells:
-                old_ci = cell.get("col_index", 0)
-                cell["col_index"] = old_to_new.get(old_ci, old_ci)
-                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
-            z["columns"] = new_cols
-
-    # Clean up internal flags before returning
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            cell.pop("_ipa_corrected", None)
-
-    result = {
-        "session_id": session_id,
-        "image_width": img_w,
-        "image_height": img_h,
-        "zones": zones_data,
-        "boxes_detected": boxes_detected,
-        "summary": {
-            "total_zones": len(zones_data),
-            "total_columns": total_columns,
-            "total_rows": total_rows,
-            "total_cells": total_cells,
-            "total_words": len(all_words),
-            "recovered_colored": recovered_count,
-            "color_stats": color_stats,
-        },
-        "formatting": {
-            "bold_columns": [],
-            "header_rows": [],
-        },
-        "layout_metrics": {
-            "page_width_px": img_w,
-            "page_height_px": img_h,
-            "avg_row_height_px": round(avg_row_height, 1),
-            "font_size_suggestion_px": font_size_suggestion,
-        },
-        "dictionary_detection": {
-            "is_dictionary": dict_detection.get("is_dictionary", False),
-            "confidence": dict_detection.get("confidence", 0.0),
-            "signals": dict_detection.get("signals", {}),
-            "article_col_index": dict_detection.get("article_col_index"),
-            "headword_col_index": dict_detection.get("headword_col_index"),
-        },
-        "processing_modes": {
-            "ipa_mode": ipa_mode,
-            "syllable_mode": syllable_mode,
-            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
-            "syllables_applied": syllable_insertions > 0,
-        },
-        "page_number": page_number_info,
-        "duration_seconds": round(duration, 2),
-    }
-
-    return result
-
-
 # ---------------------------------------------------------------------------
 # Endpoints
 # ---------------------------------------------------------------------------