diff --git a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
index 74bd858..6633df1 100644
--- a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
+++ b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx
@@ -1,19 +1,15 @@
'use client'
/**
- * StepAnsicht — Unified Grid View.
+ * StepAnsicht — Excel-like Spreadsheet View.
*
* Left: Original scan with OCR word overlay
- * Right: Unified grid (single zone, boxes integrated) rendered via GridTable
+ * Right: Fortune Sheet spreadsheet with multi-sheet tabs per zone
*/
-import { useCallback, useEffect, useRef, useState } from 'react'
+import { useEffect, useRef, useState } from 'react'
import dynamic from 'next/dynamic'
-import { useGridEditor } from '@/components/grid-editor/useGridEditor'
-import { GridTable } from '@/components/grid-editor/GridTable'
-import type { GridZone } from '@/components/grid-editor/types'
-// Lazy-load SpreadsheetView (Fortune Sheet, SSR-incompatible)
const SpreadsheetView = dynamic(
() => import('./SpreadsheetView').then((m) => m.SpreadsheetView),
{ ssr: false, loading: () =>
Spreadsheet wird geladen...
},
@@ -27,67 +23,29 @@ interface StepAnsichtProps {
}
export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
- const gridEditor = useGridEditor(sessionId)
- const {
- loading, error, selectedCell, setSelectedCell,
- updateCellText, toggleColumnBold, toggleRowHeader,
- getAdjacentCell, deleteColumn, addColumn, deleteRow, addRow,
- commitUndoPoint, selectedCells, toggleCellSelection,
- clearCellSelection, toggleSelectedBold, setCellColor,
- saveGrid, saving, dirty, undo, redo, canUndo, canRedo,
- } = gridEditor
-
- const [unifiedGrid, setUnifiedGrid] = useState(null)
- const [building, setBuilding] = useState(false)
- const [buildError, setBuildError] = useState(null)
+ const [gridData, setGridData] = useState(null)
+ const [loading, setLoading] = useState(true)
+ const [error, setError] = useState(null)
const leftRef = useRef(null)
const [leftHeight, setLeftHeight] = useState(600)
- const [viewMode, setViewMode] = useState<'spreadsheet' | 'grid'>('spreadsheet')
- // Build unified grid
- const buildUnified = useCallback(async () => {
- if (!sessionId) return
- setBuilding(true)
- setBuildError(null)
- try {
- const res = await fetch(
- `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/build-unified-grid`,
- { method: 'POST' },
- )
- if (!res.ok) {
- const d = await res.json().catch(() => ({}))
- throw new Error(d.detail || `HTTP ${res.status}`)
- }
- const data = await res.json()
- setUnifiedGrid(data)
- } catch (e) {
- setBuildError(e instanceof Error ? e.message : String(e))
- } finally {
- setBuilding(false)
- }
- }, [sessionId])
-
- // Load both grids on mount
+ // Load grid data on mount
useEffect(() => {
if (!sessionId) return
- // Load multi-zone grid (for spreadsheet mode)
- gridEditor.loadGrid()
- // Load unified grid (for grid mode)
;(async () => {
try {
- const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/unified-grid`)
- if (res.ok) {
- setUnifiedGrid(await res.json())
- } else {
- buildUnified()
- }
- } catch {
- buildUnified()
+ const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/grid-editor`)
+ if (!res.ok) throw new Error(`HTTP ${res.status}`)
+ setGridData(await res.json())
+ } catch (e) {
+ setError(e instanceof Error ? e.message : 'Fehler beim Laden')
+ } finally {
+ setLoading(false)
}
})()
- }, [sessionId]) // eslint-disable-line react-hooks/exhaustive-deps
+ }, [sessionId])
- // Track left panel height for sync
+ // Track left panel height
useEffect(() => {
if (!leftRef.current) return
const ro = new ResizeObserver(([e]) => setLeftHeight(e.contentRect.height))
@@ -95,13 +53,20 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
return () => ro.disconnect()
}, [])
- const unifiedZone: GridZone | null = unifiedGrid?.zones?.[0] ?? null
-
- if (loading || building) {
+ if (loading) {
return (
-
{building ? 'Baue Unified Grid...' : 'Lade...'}
+
Lade Spreadsheet...
+
+ )
+ }
+
+ if (error || !gridData) {
+ return (
+
+
{error || 'Keine Grid-Daten.'}
+
)
}
@@ -111,51 +76,16 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
{/* Header */}
-
Ansicht — Unified Grid
+
Ansicht — Spreadsheet
- Alle Inhalte in einem Grid. Boxen sind integriert (farbig markiert).
- {unifiedGrid && (
-
- {unifiedGrid.summary?.total_rows} Zeilen × {unifiedGrid.summary?.total_columns} Spalten
- {unifiedGrid.dominant_row_h && ` · Zeilenhöhe: ${Math.round(unifiedGrid.dominant_row_h)}px`}
-
- )}
+ Jede Zone als eigenes Sheet-Tab. Spaltenbreiten pro Sheet optimiert.
-
-
-
-
-
-
-
-
+
- {(error || buildError) && (
-
- {error || buildError}
-
- )}
-
{/* Split view */}
{/* LEFT: Original + OCR overlay */}
@@ -170,41 +100,9 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) {
)}
- {/* RIGHT: Spreadsheet or Grid view */}
-
- {viewMode === 'spreadsheet' && (unifiedGrid || gridEditor.grid) ? (
-
- ) : viewMode === 'grid' && unifiedZone ? (
-
-
- Grid View ({unifiedGrid?.summary?.total_rows}×{unifiedGrid?.summary?.total_columns})
-
-
{
- const next = getAdjacentCell(cellId, dir)
- if (next) setSelectedCell(next)
- }}
- onDeleteColumn={deleteColumn}
- onAddColumn={addColumn}
- onDeleteRow={deleteRow}
- onAddRow={addRow}
- onToggleCellSelection={toggleCellSelection}
- onSetCellColor={setCellColor}
- />
-
- ) : (
-
-
Kein Unified Grid verfügbar.
-
-
- )}
+ {/* RIGHT: Fortune Sheet */}
+
+
diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py
new file mode 100644
index 0000000..cab5277
--- /dev/null
+++ b/klausur-service/backend/grid_build_core.py
@@ -0,0 +1,1943 @@
+"""
+Grid Build Core — the main _build_grid_core() function.
+
+Extracted from grid_editor_api.py for maintainability.
+Takes merged OCR word positions and builds a structured, zone-aware grid.
+"""
+
+import logging
+import re
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from cv_box_detect import detect_boxes, split_page_into_zones
+from cv_graphic_detect import detect_graphic_elements
+from cv_vocab_types import PageZone
+from cv_color_detect import detect_word_colors, recover_colored_text
+from cv_ocr_engines import (
+ fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
+ _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines,
+)
+from ocr_pipeline_session_store import get_session_image
+
+from grid_editor_helpers import (
+ _filter_border_strip_words,
+ _cluster_columns_by_alignment,
+ _GRID_GHOST_CHARS,
+ _filter_border_ghosts,
+ _MARKER_CHARS,
+ _merge_inline_marker_columns,
+ _flatten_word_boxes,
+ _words_in_zone,
+ _PIPE_RE_VSPLIT,
+ _detect_vertical_dividers,
+ _split_zone_at_vertical_dividers,
+ _merge_content_zones_across_boxes,
+ _detect_heading_rows_by_color,
+ _detect_heading_rows_by_single_cell,
+ _detect_header_rows,
+ _build_zone_grid,
+ _get_content_bounds,
+ _filter_decorative_margin,
+ _filter_footer_words,
+ _filter_header_junk,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def _build_grid_core(
+ session_id: str,
+ session: dict,
+ *,
+ ipa_mode: str = "auto",
+ syllable_mode: str = "auto",
+) -> dict:
+ """Core grid building logic — pure computation, no HTTP or DB side effects.
+
+ Args:
+ session_id: Session identifier (for logging and image loading).
+ session: Full session dict from get_session_db().
+ ipa_mode: "auto" (only when English headwords detected), "all"
+ (force IPA on all content columns), "en" (English column only),
+ "de" (German/definition columns only), or "none" (skip entirely).
+ syllable_mode: "auto" (only when original has pipe dividers),
+ "all" (force syllabification on all words), "en" (English only),
+ "de" (German only), or "none" (skip).
+
+ Returns:
+ StructuredGrid result dict.
+
+ Raises:
+ ValueError: If session data is incomplete.
+ """
+ t0 = time.time()
+
+ # 1. Validate and load word results
+ word_result = session.get("word_result")
+ if not word_result or not word_result.get("cells"):
+ raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
+
+ img_w = word_result.get("image_width", 0)
+ img_h = word_result.get("image_height", 0)
+ if not img_w or not img_h:
+ raise ValueError("Missing image dimensions in word_result")
+
+ # 2. Flatten all word boxes from cells
+ all_words = _flatten_word_boxes(word_result["cells"])
+ if not all_words:
+ raise ValueError("No word boxes found in cells")
+
+ logger.info("build-grid session %s: %d words from %d cells",
+ session_id, len(all_words), len(word_result["cells"]))
+
+ # 2b. Filter decorative margin columns (alphabet graphics).
+ # Some worksheets have a decorative alphabet strip along one margin
+ # (A-Z in a graphic). OCR reads these as single-char words aligned
+ # vertically. Detect and remove them before grid building.
+ margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
+ margin_strip_detected = margin_strip_info.get("found", False)
+
+ # Read document_category from session (user-selected or auto-detected)
+ document_category = session.get("document_category")
+
+ # 2c. Filter footer rows (page numbers at the very bottom).
+ # Isolated short text in the bottom 5% of the page is typically a
+ # page number ("64", "S. 12") and not real content. The page number
+ # is extracted as metadata for the frontend header display.
+ page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
+
+ # 2c2. Filter OCR junk from header illustrations.
+ # Low-confidence short fragments above the first real content row.
+ _filter_header_junk(all_words, img_h, logger, session_id)
+
+ # 2d. Filter words inside user-defined exclude regions (from Structure step).
+ # These are explicitly marked by the user, so ALL words inside are removed
+ # regardless of confidence.
+ structure_result = session.get("structure_result")
+ exclude_rects = []
+ if structure_result:
+ for er in structure_result.get("exclude_regions", []):
+ exclude_rects.append({
+ "x": er["x"], "y": er["y"],
+ "w": er["w"], "h": er["h"],
+ })
+ if exclude_rects:
+ before = len(all_words)
+ filtered = []
+ for w in all_words:
+ w_cx = w["left"] + w.get("width", 0) / 2
+ w_cy = w["top"] + w.get("height", 0) / 2
+ inside = any(
+ er["x"] <= w_cx <= er["x"] + er["w"]
+ and er["y"] <= w_cy <= er["y"] + er["h"]
+ for er in exclude_rects
+ )
+ if not inside:
+ filtered.append(w)
+ removed = before - len(filtered)
+ if removed:
+ all_words = filtered
+ logger.info(
+ "build-grid session %s: removed %d words inside %d user exclude region(s)",
+ session_id, removed, len(exclude_rects),
+ )
+
+ # 2e. Hard-filter words inside graphic/image regions from structure step.
+ # ALL words inside graphic regions are removed regardless of confidence —
+ # images cannot contain real text; any OCR words inside are artifacts.
+ # After image loading (Step 3a) we augment these with freshly detected
+ # graphic regions from cv_graphic_detect.
+ graphic_rects: List[Dict[str, int]] = []
+ if structure_result:
+ for g in structure_result.get("graphics", []):
+ graphic_rects.append({
+ "x": g["x"], "y": g["y"],
+ "w": g["w"], "h": g["h"],
+ })
+ if graphic_rects:
+ before = len(all_words)
+ all_words = [
+ w for w in all_words
+ if not any(
+ gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+ and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
+ for gr in graphic_rects
+ )
+ ]
+ removed = before - len(all_words)
+ if removed:
+ logger.info(
+ "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
+ session_id, removed, len(graphic_rects),
+ )
+
+ # 3. Load image for box detection
+ img_png = await get_session_image(session_id, "cropped")
+ if not img_png:
+ img_png = await get_session_image(session_id, "dewarped")
+ if not img_png:
+ img_png = await get_session_image(session_id, "original")
+
+ zones_data: List[Dict[str, Any]] = []
+ boxes_detected = 0
+ recovered_count = 0
+ border_prefiltered = False
+ img_bgr = None
+
+ content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
+
+ if img_png:
+ # Decode image for color detection + box detection
+ arr = np.frombuffer(img_png, dtype=np.uint8)
+ img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+
+ if img_bgr is not None:
+ # --- 3a. Detect graphic/image regions via CV and hard-filter ---
+ # Pass only significant words (len >= 3) to the detector so that
+ # short OCR artifacts inside images don't fool the text-vs-graphic
+ # heuristic (it counts word centroids to distinguish text from images).
+ sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
+ fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
+ if fresh_graphics:
+ fresh_rects = [
+ {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
+ for g in fresh_graphics
+ ]
+ graphic_rects.extend(fresh_rects)
+ logger.info(
+ "build-grid session %s: detected %d graphic region(s) via CV",
+ session_id, len(fresh_graphics),
+ )
+ # Hard-filter words inside newly detected graphic regions
+ before = len(all_words)
+ all_words = [
+ w for w in all_words
+ if not any(
+ gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+ and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
+ for gr in fresh_rects
+ )
+ ]
+ removed = before - len(all_words)
+ if removed:
+ logger.info(
+ "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
+ session_id, removed, len(fresh_rects),
+ )
+
+ # --- Recover colored text that OCR missed (before grid building) ---
+ recovered = recover_colored_text(img_bgr, all_words)
+ if recovered and graphic_rects:
+ # Filter recovered chars inside graphic regions
+ recovered = [
+ r for r in recovered
+ if not any(
+ gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
+ and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
+ for gr in graphic_rects
+ )
+ ]
+ if recovered:
+ recovered_count = len(recovered)
+ all_words.extend(recovered)
+ logger.info(
+ "build-grid session %s: +%d recovered colored words",
+ session_id, recovered_count,
+ )
+
+ # Detect bordered boxes
+ boxes = detect_boxes(
+ img_bgr,
+ content_x=content_x,
+ content_w=content_w,
+ content_y=content_y,
+ content_h=content_h,
+ )
+ boxes_detected = len(boxes)
+
+ if boxes:
+ # Filter border ghost words before grid building
+ all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
+ if ghost_count:
+ logger.info(
+ "build-grid session %s: removed %d border ghost words",
+ session_id, ghost_count,
+ )
+
+ # Split page into zones
+ page_zones = split_page_into_zones(
+ content_x, content_y, content_w, content_h, boxes
+ )
+
+ # Merge content zones separated by box zones
+ page_zones = _merge_content_zones_across_boxes(
+ page_zones, content_x, content_w
+ )
+
+ # 3b. Detect vertical dividers and split content zones
+ vsplit_group_counter = 0
+ expanded_zones: List = []
+ for pz in page_zones:
+ if pz.zone_type != "content":
+ expanded_zones.append(pz)
+ continue
+ zone_words = _words_in_zone(
+ all_words, pz.y, pz.height, pz.x, pz.width
+ )
+ divider_xs = _detect_vertical_dividers(
+ zone_words, pz.x, pz.width, pz.y, pz.height
+ )
+ if divider_xs:
+ sub_zones = _split_zone_at_vertical_dividers(
+ pz, divider_xs, vsplit_group_counter
+ )
+ expanded_zones.extend(sub_zones)
+ vsplit_group_counter += 1
+ # Remove pipe words so they don't appear in sub-zones
+ pipe_ids = set(
+ id(w) for w in zone_words
+ if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
+ )
+ all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
+ logger.info(
+ "build-grid: vertical split zone %d at x=%s → %d sub-zones",
+ pz.index, [int(x) for x in divider_xs], len(sub_zones),
+ )
+ else:
+ expanded_zones.append(pz)
+ # Re-index zones
+ for i, pz in enumerate(expanded_zones):
+ pz.index = i
+ page_zones = expanded_zones
+
+ # --- Union columns from all content zones ---
+ # Each content zone detects columns independently. Narrow
+ # columns (page refs, markers) may appear in only one zone.
+ # Merge column split-points from ALL content zones so every
+ # zone shares the full column set.
+ # NOTE: Zones from a vertical split are independent and must
+ # NOT share columns with each other.
+
+ # First pass: build grids per zone independently
+ zone_grids: List[Dict] = []
+
+ for pz in page_zones:
+ zone_words = _words_in_zone(
+ all_words, pz.y, pz.height, pz.x, pz.width
+ )
+ if pz.zone_type == "content":
+ logger.info(
+ "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words",
+ pz.index, pz.zone_type,
+ pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
+ len(zone_words), len(all_words),
+ )
+ # Filter recovered single-char artifacts in ALL zones
+ # (decorative colored pixel blobs like !, ?, • from
+ # recover_colored_text that don't represent real text)
+ before = len(zone_words)
+ zone_words = [
+ w for w in zone_words
+ if not (
+ w.get("recovered")
+ and len(w.get("text", "").strip()) <= 2
+ )
+ ]
+ removed = before - len(zone_words)
+ if removed:
+ logger.info(
+ "build-grid: filtered %d recovered artifacts from %s zone %d",
+ removed, pz.zone_type, pz.index,
+ )
+ # Filter words inside image overlay regions (merged box zones)
+ if pz.image_overlays:
+ before_ov = len(zone_words)
+ zone_words = [
+ w for w in zone_words
+ if not any(
+ ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
+ and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
+ for ov in pz.image_overlays
+ )
+ ]
+ ov_removed = before_ov - len(zone_words)
+ if ov_removed:
+ logger.info(
+ "build-grid: filtered %d words inside image overlays from zone %d",
+ ov_removed, pz.index,
+ )
+ zone_words, bs_removed = _filter_border_strip_words(zone_words)
+ if bs_removed:
+ border_prefiltered = True
+ logger.info(
+ "build-grid: pre-filtered %d border-strip words from zone %d",
+ bs_removed, pz.index,
+ )
+ grid = _build_zone_grid(
+ zone_words, pz.x, pz.y, pz.width, pz.height,
+ pz.index, img_w, img_h,
+ skip_first_row_header=bool(pz.image_overlays),
+ )
+ zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
+
+ # Second pass: merge column boundaries from all content zones
+ # Exclude zones from vertical splits — they have independent columns.
+ content_zones = [
+ zg for zg in zone_grids
+ if zg["pz"].zone_type == "content"
+ and zg["pz"].vsplit_group is None
+ ]
+ if len(content_zones) > 1:
+ # Collect column split points (x_min of non-first columns)
+ all_split_xs: List[float] = []
+ for zg in content_zones:
+ raw_cols = zg["grid"].get("_raw_columns", [])
+ for col in raw_cols[1:]:
+ all_split_xs.append(col["x_min"])
+
+ if all_split_xs:
+ all_split_xs.sort()
+ merge_distance = max(25, int(content_w * 0.03))
+ merged_xs = [all_split_xs[0]]
+ for x in all_split_xs[1:]:
+ if x - merged_xs[-1] < merge_distance:
+ merged_xs[-1] = (merged_xs[-1] + x) / 2
+ else:
+ merged_xs.append(x)
+
+ total_cols = len(merged_xs) + 1
+ max_zone_cols = max(
+ len(zg["grid"].get("_raw_columns", []))
+ for zg in content_zones
+ )
+
+ # Apply union whenever it has at least as many
+ # columns as the best single zone. Even with the
+ # same count the union boundaries are better because
+ # they incorporate evidence from all zones.
+ if total_cols >= max_zone_cols:
+ cx_min = min(w["left"] for w in all_words)
+ cx_max = max(
+ w["left"] + w["width"] for w in all_words
+ )
+ merged_columns: List[Dict[str, Any]] = []
+ prev_x = cx_min
+ for i, sx in enumerate(merged_xs):
+ merged_columns.append({
+ "index": i,
+ "type": f"column_{i + 1}",
+ "x_min": prev_x,
+ "x_max": sx,
+ })
+ prev_x = sx
+ merged_columns.append({
+ "index": len(merged_xs),
+ "type": f"column_{len(merged_xs) + 1}",
+ "x_min": prev_x,
+ "x_max": cx_max,
+ })
+
+ # Re-build ALL content zones with merged columns
+ for zg in zone_grids:
+ pz = zg["pz"]
+ if pz.zone_type == "content":
+ grid = _build_zone_grid(
+ zg["words"], pz.x, pz.y,
+ pz.width, pz.height,
+ pz.index, img_w, img_h,
+ global_columns=merged_columns,
+ skip_first_row_header=bool(pz.image_overlays),
+ )
+ zg["grid"] = grid
+ logger.info(
+ "build-grid session %s: union of %d content "
+ "zones → %d merged columns (max single zone: %d)",
+ session_id, len(content_zones),
+ total_cols, max_zone_cols,
+ )
+
+ for zg in zone_grids:
+ pz = zg["pz"]
+ grid = zg["grid"]
+ # Remove internal _raw_columns before adding to response
+ grid.pop("_raw_columns", None)
+
+ zone_entry: Dict[str, Any] = {
+ "zone_index": pz.index,
+ "zone_type": pz.zone_type,
+ "bbox_px": {
+ "x": pz.x, "y": pz.y,
+ "w": pz.width, "h": pz.height,
+ },
+ "bbox_pct": {
+ "x": round(pz.x / img_w * 100, 2) if img_w else 0,
+ "y": round(pz.y / img_h * 100, 2) if img_h else 0,
+ "w": round(pz.width / img_w * 100, 2) if img_w else 0,
+ "h": round(pz.height / img_h * 100, 2) if img_h else 0,
+ },
+ "border": None,
+ "word_count": len(zg["words"]),
+ **grid,
+ }
+
+ if pz.box:
+ zone_entry["border"] = {
+ "thickness": pz.box.border_thickness,
+ "confidence": pz.box.confidence,
+ }
+
+ if pz.image_overlays:
+ zone_entry["image_overlays"] = pz.image_overlays
+
+ if pz.layout_hint:
+ zone_entry["layout_hint"] = pz.layout_hint
+ if pz.vsplit_group is not None:
+ zone_entry["vsplit_group"] = pz.vsplit_group
+
+ zones_data.append(zone_entry)
+
+ # 4. Fallback: no boxes detected → single zone with all words
+ if not zones_data:
+ # Filter recovered single-char artifacts (same as in zone loop above)
+ before = len(all_words)
+ filtered_words = [
+ w for w in all_words
+ if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
+ ]
+ removed = before - len(filtered_words)
+ if removed:
+ logger.info(
+ "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
+ session_id, removed,
+ )
+ # Pre-filter border-strip words so column detection is not
+ # confused by edge artifacts. When this removes words, Step 4e
+ # is skipped (it would otherwise re-detect content as a "strip").
+ filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
+ if bs_removed:
+ border_prefiltered = True
+ logger.info(
+ "build-grid session %s: pre-filtered %d border-strip words",
+ session_id, bs_removed,
+ )
+ grid = _build_zone_grid(
+ filtered_words, content_x, content_y, content_w, content_h,
+ 0, img_w, img_h,
+ )
+ grid.pop("_raw_columns", None)
+ zones_data.append({
+ "zone_index": 0,
+ "zone_type": "content",
+ "bbox_px": {
+ "x": content_x, "y": content_y,
+ "w": content_w, "h": content_h,
+ },
+ "bbox_pct": {
+ "x": round(content_x / img_w * 100, 2) if img_w else 0,
+ "y": round(content_y / img_h * 100, 2) if img_h else 0,
+ "w": round(content_w / img_w * 100, 2) if img_w else 0,
+ "h": round(content_h / img_h * 100, 2) if img_h else 0,
+ },
+ "border": None,
+ "word_count": len(all_words),
+ **grid,
+ })
+
+ # 4b. Remove junk rows: rows where ALL cells contain only short,
+ # low-confidence text (OCR noise, stray marks). Real vocabulary rows
+ # have at least one word with conf >= 50 or meaningful text length.
+ # Also remove "oversized stub" rows: rows with ≤3 very short words
+ # whose word-boxes are significantly taller than the median (e.g.
+ # large red page numbers like "( 9" that are not real text content).
+ _JUNK_CONF_THRESHOLD = 50
+ _JUNK_MAX_TEXT_LEN = 3
+ for z in zones_data:
+ cells = z.get("cells", [])
+ rows = z.get("rows", [])
+ if not cells or not rows:
+ continue
+
+ # Compute median word height across the zone for oversized detection
+ all_wb_heights = [
+ wb["height"]
+ for cell in cells
+ for wb in cell.get("word_boxes") or []
+ if wb.get("height", 0) > 0
+ ]
+ median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
+
+ junk_row_indices = set()
+ for row in rows:
+ ri = row["index"]
+ row_cells = [c for c in cells if c.get("row_index") == ri]
+ if not row_cells:
+ continue
+
+ row_wbs = [
+ wb for cell in row_cells
+ for wb in cell.get("word_boxes") or []
+ ]
+
+ # Rule 1: ALL word_boxes are low-conf AND short text
+ all_junk = True
+ for wb in row_wbs:
+ text = (wb.get("text") or "").strip()
+ conf = wb.get("conf", 0)
+ if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
+ all_junk = False
+ break
+ if all_junk and row_wbs:
+ junk_row_indices.add(ri)
+ continue
+
+ # Rule 2: oversized stub — ≤3 words, short total text,
+ # and word height > 1.8× median (page numbers, stray marks,
+ # OCR from illustration labels like "SEA &")
+ # Skip if any word looks like a page reference (p.55, S.12).
+ if len(row_wbs) <= 3:
+ total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
+ max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
+ has_page_ref = any(
+ re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
+ for wb in row_wbs
+ )
+ if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
+ junk_row_indices.add(ri)
+ continue
+
+ # Rule 3: scattered debris — rows with only tiny fragments
+ # (e.g. OCR artifacts from illustrations/graphics).
+ # If the row has no word longer than 2 chars, it's noise.
+ longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
+ if longest <= 2:
+ junk_row_indices.add(ri)
+ continue
+
+ if junk_row_indices:
+ z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
+ z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
+ logger.info(
+ "build-grid: removed %d junk rows from zone %d: %s",
+ len(junk_row_indices), z["zone_index"],
+ sorted(junk_row_indices),
+ )
+
+ # 4b2. Remove individual cells that consist of a single very-short,
+ # low-confidence word (OCR artifacts like "as", "b" from stray marks).
+ # These survive row-level junk removal when the row has valid cells
+ # in other columns.
+ _ARTIFACT_MAX_LEN = 2
+ _ARTIFACT_CONF_THRESHOLD = 65
+ for z in zones_data:
+ cells = z.get("cells", [])
+ if not cells:
+ continue
+ artifact_ids = set()
+ for cell in cells:
+ wbs = cell.get("word_boxes") or []
+ if len(wbs) != 1:
+ continue
+ wb = wbs[0]
+ text = (wb.get("text") or "").strip()
+ conf = wb.get("conf", 100)
+ if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
+ artifact_ids.add(cell.get("cell_id"))
+ if artifact_ids:
+ z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
+ logger.info(
+ "build-grid: removed %d artifact cells from zone %d: %s",
+ len(artifact_ids), z.get("zone_index", 0),
+ [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
+ )
+
+ # 4c. Remove oversized word_boxes from individual cells.
+ # OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
+ # have word heights 3-5x the median. Remove them per-word so they don't
+ # pollute cells that also contain valid text in other columns.
+ for z in zones_data:
+ cells = z.get("cells", [])
+ if not cells:
+ continue
+ all_wh = [
+ wb["height"]
+ for cell in cells
+ for wb in cell.get("word_boxes") or []
+ if wb.get("height", 0) > 0
+ ]
+ if not all_wh:
+ continue
+ med_h = sorted(all_wh)[len(all_wh) // 2]
+ oversized_threshold = med_h * 3
+ removed_oversized = 0
+ for cell in cells:
+ wbs = cell.get("word_boxes") or []
+ filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
+ if len(filtered) < len(wbs):
+ removed_oversized += len(wbs) - len(filtered)
+ cell["word_boxes"] = filtered
+ cell["text"] = _words_to_reading_order_text(filtered)
+ if removed_oversized:
+ # Remove cells that became empty after oversized removal
+ z["cells"] = [c for c in cells if c.get("word_boxes")]
+ logger.info(
+ "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
+ removed_oversized, oversized_threshold, z.get("zone_index", 0),
+ )
+
+ # 4d. Remove pipe-character word_boxes (column divider artifacts).
+ # OCR reads physical vertical divider lines as "|" or "||" characters.
+ # These sit at consistent x positions near column boundaries and pollute
+ # cell text. Remove them from word_boxes and rebuild cell text.
+ # NOTE: Zones from a vertical split already had pipes removed in step 3b.
+ _PIPE_RE = re.compile(r"^\|+$")
+ for z in zones_data:
+ if z.get("vsplit_group") is not None:
+ continue # pipes already removed before split
+ removed_pipes = 0
+ for cell in z.get("cells", []):
+ wbs = cell.get("word_boxes") or []
+ filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+ if len(filtered) < len(wbs):
+ removed_pipes += len(wbs) - len(filtered)
+ cell["word_boxes"] = filtered
+ cell["text"] = _words_to_reading_order_text(filtered)
+ # Remove cells that became empty after pipe removal
+ if removed_pipes:
+ z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
+ logger.info(
+ "build-grid: removed %d pipe-divider word_boxes from zone %d",
+ removed_pipes, z.get("zone_index", 0),
+ )
+
+ # Strip pipe chars ONLY from word_boxes/cells where the pipe is an
+ # OCR column-divider artifact. Preserve pipes that are embedded in
+ # words as syllable separators (e.g. "zu|trau|en") — these are
+ # intentional and used in dictionary Ground Truth.
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ for wb in cell.get("word_boxes", []):
+ wbt = wb.get("text", "")
+ # Only strip if the ENTIRE word_box is just pipe(s)
+ # (handled by _PIPE_RE above) — leave embedded pipes alone
+ text = cell.get("text", "")
+ if "|" in text:
+ # Only strip leading/trailing pipes (OCR artifacts at cell edges)
+ cleaned = text.strip("|").strip()
+ if cleaned != text.strip():
+ cell["text"] = cleaned
+
+ # 4d2. Normalize narrow connector columns.
+ # In synonym dictionaries a narrow column repeats the same word
+ # (e.g. "oder") in every row. OCR sometimes appends noise chars
+ # (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column
+ # share the same short text, normalize near-match outliers.
+ for z in zones_data:
+ cols = z.get("columns", [])
+ cells = z.get("cells", [])
+ if not cols or not cells:
+ continue
+ for col in cols:
+ ci = col.get("index")
+ col_cells = [c for c in cells if c.get("col_index") == ci]
+ if len(col_cells) < 3:
+ continue
+ # Count text occurrences
+ text_counts: Dict[str, int] = {}
+ for c in col_cells:
+ t = (c.get("text") or "").strip()
+ if t:
+ text_counts[t] = text_counts.get(t, 0) + 1
+ if not text_counts:
+ continue
+ dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
+ dominant_count = text_counts[dominant_text]
+ # Only normalize if dominant word is short and appears in ≥60%
+ if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
+ continue
+ # Fix outliers that start with the dominant text
+ fixed = 0
+ for c in col_cells:
+ t = (c.get("text") or "").strip()
+ if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
+ c["text"] = dominant_text
+ # Also fix word_boxes
+ wbs = c.get("word_boxes") or []
+ if len(wbs) == 1:
+ wbs[0]["text"] = dominant_text
+ fixed += 1
+ if fixed:
+ logger.info(
+ "build-grid: normalized %d outlier cells in connector column %d "
+ "(dominant='%s') zone %d",
+ fixed, ci, dominant_text, z.get("zone_index", 0),
+ )
+
+ # 4e. Detect and remove page-border decoration strips.
+ # Skipped when the pre-filter already removed border words BEFORE
+ # column detection — re-running would incorrectly detect the
+ # leftmost content column as a "strip".
+ border_strip_removed = 0
+ if border_prefiltered:
+ logger.info("Step 4e: skipped (border pre-filter already applied)")
+ else:
+ # Some textbooks have decorative alphabet strips along the page
+ # edge. OCR picks up scattered letters from these as artifacts.
+ # Detection: find the first significant x-gap (>30 px) from each
+ # page edge between a small cluster (<20 %) and the main content.
+ for z in zones_data:
+ cells = z.get("cells", [])
+ if not cells:
+ continue
+ all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
+ for cell in cells:
+ for wb in cell.get("word_boxes") or []:
+ all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
+ if len(all_wbs_with_cell) < 10:
+ continue
+ all_wbs_with_cell.sort(key=lambda t: t[0])
+ total = len(all_wbs_with_cell)
+
+ # -- Left-edge scan --
+ left_strip_count = 0
+ left_gap = 0
+ running_right = 0
+ for gi in range(total - 1):
+ running_right = max(
+ running_right,
+ all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
+ )
+ gap = all_wbs_with_cell[gi + 1][0] - running_right
+ if gap > 30:
+ left_strip_count = gi + 1
+ left_gap = gap
+ break
+
+ # -- Right-edge scan --
+ right_strip_count = 0
+ right_gap = 0
+ running_left = all_wbs_with_cell[-1][0]
+ for gi in range(total - 1, 0, -1):
+ running_left = min(running_left, all_wbs_with_cell[gi][0])
+ prev_right = (
+ all_wbs_with_cell[gi - 1][0]
+ + all_wbs_with_cell[gi - 1][1].get("width", 0)
+ )
+ gap = running_left - prev_right
+ if gap > 30:
+ right_strip_count = total - gi
+ right_gap = gap
+ break
+
+ strip_wbs: set = set()
+ strip_side = ""
+ strip_gap = 0
+ strip_count = 0
+ if left_strip_count > 0 and left_strip_count / total < 0.20:
+ strip_side = "left"
+ strip_count = left_strip_count
+ strip_gap = left_gap
+ strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
+ elif right_strip_count > 0 and right_strip_count / total < 0.20:
+ strip_side = "right"
+ strip_count = right_strip_count
+ strip_gap = right_gap
+ strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
+
+ if not strip_wbs:
+ continue
+ for cell in cells:
+ wbs = cell.get("word_boxes") or []
+ filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
+ if len(filtered) < len(wbs):
+ border_strip_removed += len(wbs) - len(filtered)
+ cell["word_boxes"] = filtered
+ cell["text"] = _words_to_reading_order_text(filtered)
+ z["cells"] = [c for c in cells
+ if (c.get("word_boxes") or c.get("text", "").strip())]
+ logger.info(
+ "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
+ "(gap=%dpx, strip=%d/%d wbs)",
+ border_strip_removed, strip_side, z.get("zone_index", 0),
+ strip_gap, strip_count, total,
+ )
+
+ # 4f. Remove decorative edge columns (alphabet sidebar safety net).
+ # Dictionary pages have A-Z letter sidebars that OCR reads as single-
+ # character word_boxes. These form narrow columns with very short text.
+ # Detection: edge column where almost ALL cells are single characters.
+ for z in zones_data:
+ columns = z.get("columns", [])
+ cells = z.get("cells", [])
+ if len(columns) < 3 or not cells:
+ continue
+ # Group cells by col_type (skip spanning_header)
+ col_cells: Dict[str, List[Dict]] = {}
+ for cell in cells:
+ ct = cell.get("col_type", "")
+ if ct.startswith("column_"):
+ col_cells.setdefault(ct, []).append(cell)
+ col_types_ordered = sorted(col_cells.keys())
+ if len(col_types_ordered) < 3:
+ continue
+ for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
+ edge_cells_list = col_cells.get(edge_ct, [])
+ if len(edge_cells_list) < 3:
+ continue
+ # Key criterion: average text length and single-char ratio.
+ # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
+ # are single characters.
+ texts = [(c.get("text") or "").strip() for c in edge_cells_list]
+ avg_len = sum(len(t) for t in texts) / len(texts)
+ single_char = sum(1 for t in texts if len(t) <= 1)
+ single_ratio = single_char / len(texts)
+ if avg_len > 1.5:
+ continue # real content has longer text
+ if single_ratio < 0.7:
+ continue # not dominated by single chars
+ # Remove this edge column
+ removed_count = len(edge_cells_list)
+ edge_ids = {id(c) for c in edge_cells_list}
+ z["cells"] = [c for c in cells if id(c) not in edge_ids]
+ z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
+ logger.info(
+ "Step 4f: removed decorative edge column '%s' from zone %d "
+ "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
+ edge_ct, z.get("zone_index", 0), removed_count,
+ avg_len, single_ratio * 100,
+ )
+ break # only remove one edge per zone
+
+ # 5. Color annotation on final word_boxes in cells
+ if img_bgr is not None:
+ all_wb: List[Dict] = []
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ all_wb.extend(cell.get("word_boxes", []))
+ detect_word_colors(img_bgr, all_wb)
+
+ # 5a. Heading detection by color + height (after color is available)
+ heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
+ if heading_count:
+ logger.info("Detected %d heading rows by color+height", heading_count)
+
+ # 5b. Fix unmatched parentheses in cell text
+ # OCR often misses opening "(" while detecting closing ")".
+ # If a cell's text has ")" without a matching "(", prepend "(".
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ text = cell.get("text", "")
+ if ")" in text and "(" not in text:
+ cell["text"] = "(" + text
+
+ # 5c. IPA phonetic correction — replace garbled OCR phonetics with
+ # correct IPA from the dictionary (same as in the OCR pipeline).
+ # Only applies to vocabulary tables (≥3 columns: EN | article | DE).
+ # Single/two-column layouts are continuous text, not vocab tables.
+ all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
+ total_cols = sum(len(z.get("columns", [])) for z in zones_data)
+ en_col_type = None
+ ipa_target_cols: set = set()
+ all_content_cols: set = set()
+ skip_ipa = (ipa_mode == "none")
+
+ # When ipa_mode=none, strip ALL square brackets from ALL content columns
+ if skip_ipa:
+ _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
+ for cell in all_cells:
+ ct = cell.get("col_type", "")
+ if not ct.startswith("column_"):
+ continue
+ text = cell.get("text", "")
+ if "[" in text:
+ stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
+ if stripped != text:
+ cell["text"] = stripped.strip()
+ cell["_ipa_corrected"] = True
+
+ if not skip_ipa and total_cols >= 3:
+ # Detect English headword column via IPA signals (brackets or garbled).
+ col_ipa_count: Dict[str, int] = {}
+ all_content_cols: set = set()
+ for cell in all_cells:
+ ct = cell.get("col_type", "")
+ if not ct.startswith("column_"):
+ continue
+ txt = cell.get("text", "") or ""
+ if txt.strip():
+ all_content_cols.add(ct)
+ if '[' in txt or _text_has_garbled_ipa(txt):
+ col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
+ if col_ipa_count:
+ en_col_type = max(col_ipa_count, key=col_ipa_count.get)
+ elif ipa_mode == "all":
+ # Force-all mode without auto-detection: pick column with most cells
+ col_cell_count: Dict[str, int] = {}
+ for cell in all_cells:
+ ct = cell.get("col_type", "")
+ if ct.startswith("column_") and (cell.get("text") or "").strip():
+ col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
+ if col_cell_count:
+ en_col_type = max(col_cell_count, key=col_cell_count.get)
+
+ # Decide which columns to process based on ipa_mode:
+ # auto/en: only the detected EN headword column (English IPA)
+ # de: all content columns EXCEPT the EN column (German IPA)
+ # all: EN column gets English IPA, other columns get German IPA
+ en_ipa_target_cols: set = set()
+ de_ipa_target_cols: set = set()
+ if ipa_mode in ("auto", "en"):
+ if en_col_type:
+ en_ipa_target_cols.add(en_col_type)
+ elif ipa_mode == "de":
+ de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+ elif ipa_mode == "all":
+ if en_col_type:
+ en_ipa_target_cols.add(en_col_type)
+ de_ipa_target_cols = all_content_cols - en_ipa_target_cols
+
+ # --- Strip IPA from columns NOT in the target set ---
+ # When user selects "nur DE", English IPA from the OCR scan must
+ # be removed. When "none", all IPA is removed.
+ # In vocab columns, square brackets [...] are always IPA (both
+ # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]).
+ _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
+ strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
+ if strip_en_ipa or ipa_mode == "none":
+ strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
+ for cell in all_cells:
+ ct = cell.get("col_type", "")
+ if ct not in strip_cols:
+ continue
+ text = cell.get("text", "")
+ if "[" in text:
+ stripped = _SQUARE_BRACKET_RE.sub("", text)
+ if stripped != text:
+ cell["text"] = stripped.strip()
+ cell["_ipa_corrected"] = True
+
+ # --- English IPA (Britfone + eng_to_ipa) ---
+ if en_ipa_target_cols:
+ for cell in all_cells:
+ ct = cell.get("col_type")
+ if ct in en_ipa_target_cols:
+ cell["_orig_col_type"] = ct
+ cell["col_type"] = "column_en"
+ _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
+ fix_cell_phonetics(all_cells, pronunciation="british")
+ for cell in all_cells:
+ orig = cell.pop("_orig_col_type", None)
+ if orig:
+ cell["col_type"] = orig
+ if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+ cell["_ipa_corrected"] = True
+
+ # --- German IPA (wiki-pronunciation-dict + epitran) ---
+ if de_ipa_target_cols:
+ from cv_ipa_german import insert_german_ipa
+ insert_german_ipa(all_cells, de_ipa_target_cols)
+
+ ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
+
+ # Mark cells whose text was changed by IPA correction so that
+ # later steps (5i) don't overwrite the corrected text when
+ # reconstructing from word_boxes. (Already done inline above
+ # for English; insert_german_ipa sets _ipa_corrected too.)
+ for cell in all_cells:
+ if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+ cell["_ipa_corrected"] = True
+
+ # 5d. Fix IPA continuation cells — cells where the printed
+ # phonetic transcription wraps to a line below the headword.
+ # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
+ # Replace garbled text with proper IPA looked up from the
+ # headword in the previous row's same column.
+ # Note: We check ALL columns, not just en_col_type, because
+ # the EN headword column may not be the column with the longest average text.
+ _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+ ipa_cont_fixed = 0
+ for z in ([] if skip_ipa else zones_data):
+ rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
+ z_cells = z.get("cells", [])
+ for idx, row in enumerate(rows_sorted):
+ if idx == 0:
+ continue
+ ri = row["index"]
+ row_cells = [c for c in z_cells if c.get("row_index") == ri]
+ for cell in row_cells:
+ ct = cell.get("col_type", "")
+ if not ct.startswith("column_"):
+ continue
+ cell_text = (cell.get("text") or "").strip()
+ if not cell_text:
+ # Step 5c may have emptied garbled IPA cells like
+ # "[n, nn]" — recover text from word_boxes.
+ wb_texts = [w.get("text", "")
+ for w in cell.get("word_boxes", [])]
+ cell_text = " ".join(wb_texts).strip()
+ if not cell_text:
+ continue
+
+ is_bracketed = (
+ cell_text.startswith('[') and cell_text.endswith(']')
+ )
+
+ if is_bracketed:
+ # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
+ # Text like "employee [im'ploi:]" is NOT fully
+ # bracketed and won't match here.
+ if not _text_has_garbled_ipa(cell_text):
+ continue
+ # Already has proper IPA brackets → skip
+ if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+ continue
+ else:
+ # Unbracketed continuation: "ska:f – ska:vz",
+ # "'sekandarr sku:l". Only treat as IPA
+ # continuation if this is the ONLY content cell
+ # in the row (single-cell row) and the text is
+ # garbled IPA without real IPA Unicode symbols.
+ content_cells_in_row = [
+ c for c in row_cells
+ if c.get("col_type", "").startswith("column_")
+ and c.get("col_type") != "column_1"
+ ]
+ if len(content_cells_in_row) != 1:
+ continue
+ if not _text_has_garbled_ipa(cell_text):
+ continue
+ # Has real IPA symbols → already fixed or valid
+ if any(c in _REAL_IPA_CHARS for c in cell_text):
+ continue
+
+ # Find headword in previous row, same column
+ prev_ri = rows_sorted[idx - 1]["index"]
+ prev_same_col = [
+ c for c in z_cells
+ if c.get("row_index") == prev_ri
+ and c.get("col_type") == ct
+ ]
+ if not prev_same_col:
+ continue
+ prev_text = prev_same_col[0].get("text", "")
+ fixed = fix_ipa_continuation_cell(
+ cell_text, prev_text, pronunciation="british",
+ )
+ if fixed != cell_text:
+ cell["text"] = fixed
+ ipa_cont_fixed += 1
+ logger.info(
+ "IPA continuation R%d %s: '%s' → '%s'",
+ ri, ct, cell_text, fixed,
+ )
+ if ipa_cont_fixed:
+ logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
+
+ # 5e. Heading detection by single-cell rows — black headings like
+ # "Theme" that have normal color and height but are the ONLY cell
+ # in their row (excluding page_ref column_1). Must run AFTER 5d
+ # so IPA continuation cells are already processed.
+ single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
+ if single_heading_count:
+ logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
+
+ # 5f. Strip IPA from headings — headings detected in 5e ran AFTER
+ # IPA correction (5c), so they may have dictionary IPA appended
+ # (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the
+ # original text only.
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ if cell.get("col_type") != "heading":
+ continue
+ text = cell.get("text", "")
+ # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
+ stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
+ if stripped and stripped != text:
+ cell["text"] = stripped
+
+ # 5g. Extract page_ref cells and footer rows from content zones.
+ # Page references (column_1 cells like "p.70") sit in rows that
+ # also contain vocabulary — extract them as zone metadata without
+ # removing the row. Footer lines (e.g. "two hundred and twelve"
+ # = page number at bottom) are standalone rows that should be
+ # removed from the table entirely.
+ _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+ # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
+ _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
+ _NUMBER_WORDS = {
+ "one", "two", "three", "four", "five", "six", "seven",
+ "eight", "nine", "ten", "eleven", "twelve", "thirteen",
+ "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
+ "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
+ "seventy", "eighty", "ninety", "hundred", "thousand", "and",
+ "einhundert", "zweihundert", "dreihundert", "vierhundert",
+ "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
+ }
+ for z in zones_data:
+ if z.get("zone_type") != "content":
+ continue
+ cells = z.get("cells", [])
+ rows = z.get("rows", [])
+ if not rows:
+ continue
+
+ # Extract column_1 cells that look like page references
+ page_refs = []
+ page_ref_cell_ids = set()
+ for cell in cells:
+ if cell.get("col_type") != "column_1":
+ continue
+ text = (cell.get("text") or "").strip()
+ if not text:
+ continue
+ if not _PAGE_REF_RE.match(text):
+ continue
+ page_refs.append({
+ "row_index": cell.get("row_index"),
+ "text": text,
+ "bbox_pct": cell.get("bbox_pct", {}),
+ })
+ page_ref_cell_ids.add(cell.get("cell_id"))
+
+ # Keep page_ref cells in the table as a visible column.
+ # Previously these were removed, but users want to see them.
+ # The metadata extraction above still populates zone["page_refs"]
+ # for the frontend header display.
+
+ # Detect footer: last non-header row if it has only 1 cell
+ # with short, non-content text (page numbers like "233" or
+ # "two hundred and twelve"). Comma-separated lists and long
+ # text are content continuations, not page numbers.
+ footer_rows = []
+ non_header_rows = [r for r in rows if not r.get("is_header")]
+ if non_header_rows:
+ last_row = non_header_rows[-1]
+ last_ri = last_row["index"]
+ last_cells = [c for c in z["cells"]
+ if c.get("row_index") == last_ri]
+ if len(last_cells) == 1:
+ text = (last_cells[0].get("text") or "").strip()
+ # Not IPA (no real IPA symbols) and not a heading
+ has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
+ # Comma-separated text is a content continuation, not a footer
+ has_commas = ',' in text
+ # Written-out page numbers like "two hundred and nine"
+ text_words = set(text.lower().split())
+ is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
+ # Short text or written-out number
+ is_page_number = len(text) <= 20 or is_written_number
+ if (text and not has_real_ipa and not has_commas
+ and is_page_number
+ and last_cells[0].get("col_type") != "heading"):
+ footer_rows.append({
+ "row_index": last_ri,
+ "text": text,
+ "bbox_pct": last_cells[0].get("bbox_pct", {}),
+ })
+
+ # Classify footer rows: page numbers are removed from the grid
+ # and promoted to page_number metadata; other footers stay as rows.
+ page_number_footers = []
+ other_footers = []
+ for fr in footer_rows:
+ ft = fr["text"].strip()
+ # Pure digits
+ digits = "".join(c for c in ft if c.isdigit())
+ if digits and re.match(r'^[\d\s.]+$', ft):
+ page_number_footers.append(fr)
+ # Written-out numbers
+ elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
+ page_number_footers.append(fr)
+ else:
+ other_footers.append(fr)
+
+ # Remove page-number footer rows from grid entirely
+ if page_number_footers:
+ pn_ris = {fr["row_index"] for fr in page_number_footers}
+ z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
+ z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
+ # Set page_number metadata (use first one)
+ pn_text = page_number_footers[0]["text"].strip()
+ pn_digits = "".join(c for c in pn_text if c.isdigit())
+ if not page_number_info:
+ page_number_info = {
+ "text": pn_text,
+ "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
+ }
+ if pn_digits:
+ page_number_info["number"] = int(pn_digits)
+
+ # Mark remaining footer rows (non-page-number content)
+ if other_footers:
+ footer_ris = {fr["row_index"] for fr in other_footers}
+ for r in z["rows"]:
+ if r["index"] in footer_ris:
+ r["is_footer"] = True
+ for c in z["cells"]:
+ if c.get("row_index") in footer_ris:
+ c["col_type"] = "footer"
+
+ if page_refs or footer_rows:
+ logger.info(
+ "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
+ len(page_refs), len(footer_rows), len(page_number_footers),
+ z.get("zone_index", 0),
+ )
+
+ # Store as zone-level metadata
+ if page_refs:
+ z["page_refs"] = page_refs
+ if other_footers:
+ z["footer"] = other_footers
+
+ # 5h. Convert slash-delimited IPA to bracket notation.
+ # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
+ # Detect the pattern /ocr_ipa/ and replace with [dict_ipa]
+ # using the IPA dictionary when available, falling back to the OCR text.
+ # The regex requires an alphabetic headword (optionally with a superscript
+ # ¹/²/³) directly before the opening slash to avoid matches like "sb/sth".
+ _SLASH_IPA_RE = re.compile(
+ r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
+ r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
+ )
+ # Standalone slash IPA at start of text (headword on previous line)
+ _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
+ # IPA between slashes never contains spaces, parentheses, or commas.
+ # Reject matches that look like grammar: "sb/sth up a) jdn/"
+ _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
+ slash_ipa_fixed = 0
+ for z in ([] if skip_ipa else zones_data):
+ for cell in z.get("cells", []):
+ # Only process English headword column — avoid converting
+ # German text like "der/die/das" to IPA.
+ if en_col_type and cell.get("col_type") != en_col_type:
+ continue
+ text = cell.get("text", "")
+ if "/" not in text:
+ continue
+
+ def _replace_slash_ipa(m: re.Match) -> str:
+ nonlocal slash_ipa_fixed
+ headword = m.group(1)
+ ocr_ipa = m.group(2) # includes slashes
+ inner_raw = ocr_ipa.strip("/").strip()
+ # Reject if inner content has spaces/parens/commas (grammar)
+ if _SLASH_IPA_REJECT_RE.search(inner_raw):
+ return m.group(0)
+ # Strip superscript digits for lookup
+ clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
+ ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
+ if ipa:
+ slash_ipa_fixed += 1
+ return f"{headword} [{ipa}]"
+ # Fallback: keep OCR IPA but convert slashes to brackets
+ inner = inner_raw.lstrip("'").strip()
+ if inner:
+ slash_ipa_fixed += 1
+ return f"{headword} [{inner}]"
+ return m.group(0)
+
+ new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
+
+ # Second pass: convert remaining /ipa/ after [ipa] from first pass.
+ # Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant)
+ _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
+ def _replace_trailing_slash(m: re.Match) -> str:
+ nonlocal slash_ipa_fixed
+ inner = m.group(1).strip("/").strip().lstrip("'").strip()
+ if _SLASH_IPA_REJECT_RE.search(inner):
+ return m.group(0)
+ if inner:
+ slash_ipa_fixed += 1
+ return f" [{inner}]"
+ return m.group(0)
+ new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
+
+ # Handle standalone /ipa/ at start (no headword in this cell)
+ if new_text == text:
+ m = _STANDALONE_SLASH_IPA_RE.match(text)
+ if m:
+ inner = m.group(1).strip()
+ if not _SLASH_IPA_REJECT_RE.search(inner):
+ inner = inner.lstrip("'").strip()
+ if inner:
+ new_text = "[" + inner + "]" + text[m.end():]
+ slash_ipa_fixed += 1
+
+ if new_text != text:
+ cell["text"] = new_text
+
+ if slash_ipa_fixed:
+ logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
+
+ # 5i. Remove blue bullet/artifact word_boxes.
+ # Dictionary pages have small blue square bullets (■) before entries.
+ # OCR reads these as text artifacts (©, e, *, or even plausible words
+ # like "fighily" overlapping the real word "tightly").
+ # Detection rules:
+ # a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black)
+ # b) Overlapping word_boxes: >40% x-overlap → remove lower confidence
+ # c) Duplicate text: consecutive blue wbs with identical text, gap < 6px
+ bullet_removed = 0
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ wbs = cell.get("word_boxes") or []
+ if len(wbs) < 2:
+ continue
+ to_remove: set = set()
+
+ # Rule (a): tiny coloured symbols (bullets, graphic fragments)
+ for i, wb in enumerate(wbs):
+ cn = wb.get("color_name", "black")
+ if (cn != "black"
+ and wb.get("width", 0) * wb.get("height", 0) < 200
+ and wb.get("conf", 100) < 85):
+ to_remove.add(i)
+
+ # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
+ # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
+ # Remove word boxes that contain NO letters or digits.
+ for i, wb in enumerate(wbs):
+ t = (wb.get("text") or "").strip()
+ if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
+ to_remove.add(i)
+
+ # Rule (b) + (c): overlap and duplicate detection
+ # Sort by x for pairwise comparison
+ _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
+ to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge
+ indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
+ for p in range(len(indexed) - 1):
+ i1, w1 = indexed[p]
+ i2, w2 = indexed[p + 1]
+ x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
+ x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
+ overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
+ min_w = min(w1.get("width", 1), w2.get("width", 1))
+ gap = x2s - x1e
+ overlap_pct = overlap / min_w if min_w > 0 else 0
+
+ # (b) Significant x-overlap
+ if overlap_pct > 0.20:
+ t1 = (w1.get("text") or "").strip()
+ t2 = (w2.get("text") or "").strip()
+
+ # Syllable-split words: both are alphabetic text with
+ # moderate overlap (20-75%). Merge instead of removing.
+ # OCR splits words at syllable marks, producing overlapping
+ # boxes like "zu" + "tiefst" → "zutiefst".
+ if (overlap_pct <= 0.75
+ and _ALPHA_WORD_RE.match(t1)
+ and _ALPHA_WORD_RE.match(t2)):
+ to_merge.append((i1, i2))
+ continue
+
+ # High overlap (>75%) with different alphabetic text:
+ # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
+ # causing it to heavily overlap with the next fragment ("brech").
+ # Merge instead of removing when one is a short prefix (≤4 chars)
+ # and the texts are different.
+ if (overlap_pct > 0.75
+ and _ALPHA_WORD_RE.match(t1)
+ and _ALPHA_WORD_RE.match(t2)
+ and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
+ and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
+ to_merge.append((i1, i2))
+ continue
+
+ if overlap_pct <= 0.40:
+ continue # too little overlap and not alphabetic merge
+
+ c1 = w1.get("conf", 50)
+ c2 = w2.get("conf", 50)
+
+ # For very high overlap (>90%) with different text,
+ # prefer the word that exists in the IPA dictionary
+ # over confidence (OCR can give artifacts high conf).
+ if overlap_pct > 0.90 and t1.lower() != t2.lower():
+ in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
+ in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
+ if in_dict_1 and not in_dict_2:
+ to_remove.add(i2)
+ continue
+ elif in_dict_2 and not in_dict_1:
+ to_remove.add(i1)
+ continue
+
+ if c1 < c2:
+ to_remove.add(i1)
+ elif c2 < c1:
+ to_remove.add(i2)
+ else:
+ # Same confidence: remove the taller one (bullet slivers)
+ if w1.get("height", 0) > w2.get("height", 0):
+ to_remove.add(i1)
+ else:
+ to_remove.add(i2)
+
+ # (c) Duplicate text: consecutive blue word_boxes with identical text, gap < 6px
+ elif (gap < 6
+ and w1.get("color_name") == "blue"
+ and w2.get("color_name") == "blue"
+ and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
+ # Remove the one with lower confidence; if equal, first one
+ c1 = w1.get("conf", 50)
+ c2 = w2.get("conf", 50)
+ to_remove.add(i1 if c1 <= c2 else i2)
+
+ # Execute merges first (syllable-split words).
+ # Use merge_parent to support chain merging: if "zer" absorbed
+ # "brech" and then "brech"+"lich" is a merge pair, redirect to
+ # merge "lich" into "zer" → "zerbrechlich".
+ if to_merge:
+ merge_parent: Dict[int, int] = {} # absorbed → absorber
+ for mi1, mi2 in to_merge:
+ # Follow chain: if mi1 was absorbed, find root absorber
+ actual_mi1 = mi1
+ while actual_mi1 in merge_parent:
+ actual_mi1 = merge_parent[actual_mi1]
+ if actual_mi1 in to_remove or mi2 in to_remove:
+ continue
+ if mi2 in merge_parent:
+ continue # mi2 already absorbed
+ mw1, mw2 = wbs[actual_mi1], wbs[mi2]
+ # Concatenate text (no space — they're parts of one word)
+ mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
+ mt2 = (mw2.get("text") or "").strip()
+ merged_text = mt1 + mt2
+ # Union bounding box
+ mx = min(mw1["left"], mw2["left"])
+ my = min(mw1["top"], mw2["top"])
+ mr = max(mw1["left"] + mw1["width"],
+ mw2["left"] + mw2["width"])
+ mb = max(mw1["top"] + mw1["height"],
+ mw2["top"] + mw2["height"])
+ mw1["text"] = merged_text
+ mw1["left"] = mx
+ mw1["top"] = my
+ mw1["width"] = mr - mx
+ mw1["height"] = mb - my
+ mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
+ to_remove.add(mi2)
+ merge_parent[mi2] = actual_mi1
+ bullet_removed -= 1 # net: merge, not removal
+
+ if to_remove:
+ bullet_removed += len(to_remove)
+ filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
+ cell["word_boxes"] = filtered
+ # Don't overwrite text that was corrected by Step 5c IPA fix
+ if not cell.get("_ipa_corrected"):
+ cell["text"] = _words_to_reading_order_text(filtered)
+
+ # Remove cells that became empty after bullet removal
+ if bullet_removed:
+ for z in zones_data:
+ z["cells"] = [c for c in z.get("cells", [])
+ if (c.get("word_boxes") or c.get("text", "").strip())]
+ logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
+
+ # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
+ # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
+ # "\\", "PEE", "a=") that survive earlier filters because their rows also
+ # contain real content in other columns. Remove them here.
+ _COMMON_SHORT_WORDS = {
+ # German
+ "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
+ "ob", "so", "um", "zu", "wo", "je", "oh", "or",
+ "die", "der", "das", "dem", "den", "des", "ein", "und",
+ "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
+ # English
+ "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
+ "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
+ "on", "or", "so", "to", "up", "us", "we",
+ "the", "and", "but", "for", "not",
+ }
+ _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
+ artifact_cells_removed = 0
+ for z in zones_data:
+ before = len(z.get("cells", []))
+ kept = []
+ for cell in z.get("cells", []):
+ text = (cell.get("text") or "").strip()
+ core = text.rstrip(".,;:!?'\"")
+ is_artifact = False
+ if not core:
+ is_artifact = True
+ elif _PURE_JUNK_RE.match(core):
+ is_artifact = True
+ elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
+ # ≤2-char text containing non-letters (e.g. "a="); purely alphabetic starts like "Zw" are kept
+ is_artifact = True
+ elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
+ is_artifact = True
+ elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
+ and not re.match(r'^[pPsS]\.?\d+$', core)):
+ # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
+ # but NOT page references like "p.43", "p50", "S.12"
+ is_artifact = True
+ if is_artifact:
+ kept.append(None) # placeholder
+ else:
+ kept.append(cell)
+ z["cells"] = [c for c in kept if c is not None]
+ artifact_cells_removed += before - len(z["cells"])
+ if artifact_cells_removed:
+ # Also remove rows that became completely empty
+ for z in zones_data:
+ cell_ris = {c.get("row_index") for c in z.get("cells", [])}
+ z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
+ logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
+
+ # 5j. Normalise word_box order to reading order (group by Y, sort by X).
+ # The frontend renders colored cells from word_boxes array order
+ # (GridTable.tsx), so they MUST be in left-to-right reading order.
+ wb_reordered = 0
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ wbs = cell.get("word_boxes") or []
+ if len(wbs) < 2:
+ continue
+ lines = _group_words_into_lines(wbs, y_tolerance_px=15)
+ sorted_wbs = [w for line in lines for w in line]
+ # Check if order actually changed
+ if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
+ cell["word_boxes"] = sorted_wbs
+ wb_reordered += 1
+ if wb_reordered:
+ logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
+
+ duration = time.time() - t0
+
+ # 6. Build result
+ total_cells = sum(len(z.get("cells", [])) for z in zones_data)
+ total_columns = sum(len(z.get("columns", [])) for z in zones_data)
+ total_rows = sum(len(z.get("rows", [])) for z in zones_data)
+
+ # Collect color statistics from all word_boxes in cells
+ color_stats: Dict[str, int] = {}
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ for wb in cell.get("word_boxes", []):
+ cn = wb.get("color_name", "black")
+ color_stats[cn] = color_stats.get(cn, 0) + 1
+
+ # Compute layout metrics for faithful grid reconstruction
+ all_content_row_heights: List[float] = []
+ for z in zones_data:
+ for row in z.get("rows", []):
+ if not row.get("is_header", False):
+ h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
+ if h > 0:
+ all_content_row_heights.append(h)
+ avg_row_height = (
+ sum(all_content_row_heights) / len(all_content_row_heights)
+ if all_content_row_heights else 30.0
+ )
+ font_size_suggestion = max(10, int(avg_row_height * 0.6))
+
+ # --- Dictionary detection on assembled grid ---
+ # Build lightweight ColumnGeometry-like structures from zone columns for
+ # dictionary signal scoring.
+ from cv_layout import _score_dictionary_signals
+ dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
+ try:
+ from cv_vocab_types import ColumnGeometry
+ for z in zones_data:
+ zone_cells = z.get("cells", [])
+ zone_cols = z.get("columns", [])
+ if len(zone_cols) < 2 or len(zone_cells) < 10:
+ continue
+ # Build pseudo-ColumnGeometry per column
+ pseudo_geoms = []
+ for col in zone_cols:
+ ci = col["index"]
+ col_cells = [c for c in zone_cells if c.get("col_index") == ci]
+ # Flatten word_boxes into word dicts compatible with _score_language
+ col_words = []
+ for cell in col_cells:
+ for wb in cell.get("word_boxes") or []:
+ col_words.append({
+ "text": wb.get("text", ""),
+ "conf": wb.get("conf", 0),
+ "top": wb.get("top", 0),
+ "left": wb.get("left", 0),
+ "height": wb.get("height", 0),
+ "width": wb.get("width", 0),
+ })
+ # Fallback: use cell text if no word_boxes
+ if not cell.get("word_boxes") and cell.get("text"):
+ col_words.append({
+ "text": cell["text"],
+ "conf": cell.get("confidence", 50),
+ "top": cell.get("bbox_px", {}).get("y", 0),
+ "left": cell.get("bbox_px", {}).get("x", 0),
+ "height": cell.get("bbox_px", {}).get("h", 20),
+ "width": cell.get("bbox_px", {}).get("w", 50),
+ })
+ col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
+ pseudo_geoms.append(ColumnGeometry(
+ index=ci, x=col.get("x_min_px", 0), y=0,
+ width=max(col_w, 1), height=img_h,
+ word_count=len(col_words), words=col_words,
+ width_ratio=col_w / max(img_w, 1),
+ ))
+ if len(pseudo_geoms) >= 2:
+ dd = _score_dictionary_signals(
+ pseudo_geoms,
+ document_category=document_category,
+ margin_strip_detected=margin_strip_detected,
+ )
+ if dd["confidence"] > dict_detection["confidence"]:
+ dict_detection = dd
+ except Exception as e:
+ logger.warning("Dictionary detection failed: %s", e)
+
+ # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
+ try:
+ from cv_syllable_detect import merge_word_gaps_in_zones
+ merge_word_gaps_in_zones(zones_data, session_id)
+ except Exception as e:
+ logger.warning("Word-gap merge failed: %s", e)
+
+ # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
+ # Strips | from words, validates with pyphen, tries char-deletion for garbled
+ # words like "Ze|plpe|lin" → "Zeppelin".
+ try:
+ from cv_syllable_detect import autocorrect_pipe_artifacts
+ autocorrect_pipe_artifacts(zones_data, session_id)
+ except Exception as e:
+ logger.warning("Pipe autocorrect failed: %s", e)
+
+ # --- Syllable divider insertion for dictionary pages ---
+ # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
+ # "all" = force on all content words, "en" = English column only,
+ # "de" = German columns only, "none" = skip entirely.
+ syllable_insertions = 0
+ if syllable_mode != "none" and img_bgr is not None:
+ _syllable_eligible = False
+ if syllable_mode in ("all", "de", "en"):
+ _syllable_eligible = True
+ elif (dict_detection.get("is_dictionary")
+ and dict_detection.get("article_col_index") is not None):
+ # auto: only on dictionary pages with article columns
+ _syllable_eligible = True
+ # For language-specific modes, determine allowed columns
+ _syllable_col_filter: Optional[set] = None # None = all columns
+ if syllable_mode == "en":
+ _syllable_col_filter = {en_col_type} if en_col_type else set()
+ elif syllable_mode == "de":
+ if en_col_type and total_cols >= 3:
+ _syllable_col_filter = all_content_cols - {en_col_type}
+ # else None → all columns (correct for German-only dicts)
+ if _syllable_eligible:
+ try:
+ from cv_syllable_detect import insert_syllable_dividers
+ force_syllables = (syllable_mode in ("all", "de", "en"))
+ syllable_insertions = insert_syllable_dividers(
+ zones_data, img_bgr, session_id,
+ force=force_syllables,
+ col_filter=_syllable_col_filter,
+ )
+ except Exception as e:
+ logger.warning("Syllable insertion failed: %s", e)
+
+ # When syllable mode is "none", strip any residual | from OCR so
+ # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
+ if syllable_mode == "none":
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ t = cell.get("text", "")
+ if "|" in t:
+ cell["text"] = t.replace("|", "")
+
+ # --- Split merged words (OCR sometimes glues adjacent words) ---
+ # Uses dictionary lookup to split e.g. "atmyschool" → "at my school"
+ try:
+ from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
+ if _SPELL_AVAILABLE:
+ split_count = 0
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ text = cell.get("text", "")
+ if not text:
+ continue
+ parts = []
+ changed = False
+ for token in text.split():
+ # Try splitting pure-alpha tokens >= 4 chars
+ # Strip trailing punctuation AND IPA brackets
+ clean = token
+ # Remove trailing IPA like [dɪsˈɪʒən] first
+ bracket_pos = clean.find('[')
+ suffix_ipa = ""
+ if bracket_pos > 0:
+ suffix_ipa = clean[bracket_pos:]
+ clean = clean[:bracket_pos]
+ suffix_punct = ""
+ stripped = clean.rstrip(".,!?;:'\")")
+ if stripped != clean:
+ suffix_punct = clean[len(stripped):]
+ clean = stripped
+ suffix = suffix_punct + suffix_ipa
+ # Handle contractions: "solet's" → try "solet" + "'s"
+ contraction = ""
+ if "'" in clean and clean.index("'") >= 2:
+ apos_pos = clean.index("'")
+ contraction = clean[apos_pos:]
+ clean = clean[:apos_pos]
+ suffix = contraction + suffix
+ if len(clean) >= 4 and clean.isalpha():
+ split = _try_split_merged_word(clean)
+ if split:
+ parts.append(split + suffix)
+ changed = True
+ continue
+ parts.append(token)
+ if changed:
+ cell["text"] = " ".join(parts)
+ split_count += 1
+ if split_count:
+ logger.info("build-grid session %s: split %d merged words", session_id, split_count)
+ except ImportError:
+ pass
+
+ # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
+ # Matches any [bracket] directly after a letter, as long as the bracket
+ # content doesn't look like a normal text annotation (e.g. "[adj]", "[noun]").
+ _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ text = cell.get("text", "")
+ if text and "[" in text:
+ fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
+ if fixed != text:
+ cell["text"] = fixed
+
+ # --- SmartSpellChecker: language-aware OCR correction on all cells ---
+ try:
+ from smart_spell import SmartSpellChecker
+ _ssc = SmartSpellChecker()
+ spell_fix_count = 0
+
+ # Determine language per column:
+ # en_col_type was already detected (column with IPA = English).
+ # All other content columns are assumed German for vocab tables.
+ # For single/two-column layouts, use auto-detection.
+ for z in zones_data:
+ zone_cols = z.get("columns", [])
+ for cell in z.get("cells", []):
+ text = cell.get("text", "")
+ if not text or not text.strip():
+ continue
+ ct = cell.get("col_type", "")
+ if not ct.startswith("column_"):
+ continue
+
+ # Determine language for this cell
+ if total_cols >= 3 and en_col_type:
+ lang = "en" if ct == en_col_type else "de"
+ elif total_cols <= 2:
+ lang = "auto" # auto-detect for non-vocab layouts
+ else:
+ lang = "auto"
+
+ result = _ssc.correct_text(text, lang=lang)
+ if result.changed:
+ cell["text"] = result.corrected
+ spell_fix_count += 1
+
+ if spell_fix_count:
+ logger.info(
+ "build-grid session %s: SmartSpellChecker fixed %d cells",
+ session_id, spell_fix_count,
+ )
+ except ImportError:
+ logger.debug("SmartSpellChecker not available in build-grid")
+ except Exception as e:
+ logger.warning("SmartSpellChecker error in build-grid: %s", e)
+
+ # --- Debug: log cell counts per column before empty-column removal ---
+ for z in zones_data:
+ if z.get("zone_type") == "content":
+ from collections import Counter as _Counter
+ _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
+ _cols = z.get("columns", [])
+ logger.info(
+ "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
+ z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
+ )
+
+ # --- Remove empty columns (no cells assigned) ---
+ for z in zones_data:
+ cells = z.get("cells", [])
+ used_col_indices = {c.get("col_index") for c in cells}
+ old_cols = z.get("columns", [])
+ new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
+ if len(new_cols) < len(old_cols):
+ # Re-index columns and cells
+ old_to_new = {}
+ for new_i, col in enumerate(new_cols):
+ old_i = col.get("col_index", col.get("index", new_i))
+ old_to_new[old_i] = new_i
+ col["col_index"] = new_i
+ col["index"] = new_i
+ col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
+ for cell in cells:
+ old_ci = cell.get("col_index", 0)
+ cell["col_index"] = old_to_new.get(old_ci, old_ci)
+ cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
+ z["columns"] = new_cols
+
+ # Clean up internal flags before returning
+ for z in zones_data:
+ for cell in z.get("cells", []):
+ cell.pop("_ipa_corrected", None)
+
+ result = {
+ "session_id": session_id,
+ "image_width": img_w,
+ "image_height": img_h,
+ "zones": zones_data,
+ "boxes_detected": boxes_detected,
+ "summary": {
+ "total_zones": len(zones_data),
+ "total_columns": total_columns,
+ "total_rows": total_rows,
+ "total_cells": total_cells,
+ "total_words": len(all_words),
+ "recovered_colored": recovered_count,
+ "color_stats": color_stats,
+ },
+ "formatting": {
+ "bold_columns": [],
+ "header_rows": [],
+ },
+ "layout_metrics": {
+ "page_width_px": img_w,
+ "page_height_px": img_h,
+ "avg_row_height_px": round(avg_row_height, 1),
+ "font_size_suggestion_px": font_size_suggestion,
+ },
+ "dictionary_detection": {
+ "is_dictionary": dict_detection.get("is_dictionary", False),
+ "confidence": dict_detection.get("confidence", 0.0),
+ "signals": dict_detection.get("signals", {}),
+ "article_col_index": dict_detection.get("article_col_index"),
+ "headword_col_index": dict_detection.get("headword_col_index"),
+ },
+ "processing_modes": {
+ "ipa_mode": ipa_mode,
+ "syllable_mode": syllable_mode,
+ "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
+ "syllables_applied": syllable_insertions > 0,
+ },
+ "page_number": page_number_info,
+ "duration_seconds": round(duration, 2),
+ }
+
+ return result
+
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index a8884de..ea91384 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1,14 +1,7 @@
"""
-Grid Editor API — builds a structured, zone-aware grid from Kombi OCR results.
+Grid Editor API — endpoints for grid building, editing, and export.
-Takes the merged word positions from paddle-kombi / rapid-kombi and:
- 1. Detects bordered boxes on the image (cv_box_detect)
- 2. Splits the page into zones (content + box regions)
- 3. Clusters words into columns and rows per zone
- 4. Returns a hierarchical StructuredGrid for the frontend Excel-like editor
-
-Lizenz: Apache 2.0 (kommerziell nutzbar)
-DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+The core grid building logic is in grid_build_core.py.
"""
import logging
@@ -16,1950 +9,20 @@ import re
import time
from typing import Any, Dict, List, Optional, Tuple
-import cv2
-import numpy as np
from fastapi import APIRouter, HTTPException, Query, Request
-from cv_box_detect import detect_boxes, split_page_into_zones
-from cv_graphic_detect import detect_graphic_elements
-from cv_vocab_types import PageZone
-from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
+from grid_build_core import _build_grid_core
+from grid_editor_helpers import _words_in_zone
from ocr_pipeline_session_store import (
get_session_db,
- get_session_image,
update_session_db,
)
-from grid_editor_helpers import (
- _filter_border_strip_words,
- _cluster_columns_by_alignment,
- _GRID_GHOST_CHARS,
- _filter_border_ghosts,
- _MARKER_CHARS,
- _merge_inline_marker_columns,
- _flatten_word_boxes,
- _words_in_zone,
- _PIPE_RE_VSPLIT,
- _detect_vertical_dividers,
- _split_zone_at_vertical_dividers,
- _merge_content_zones_across_boxes,
- _detect_heading_rows_by_color,
- _detect_heading_rows_by_single_cell,
- _detect_header_rows,
- _build_zone_grid,
- _get_content_bounds,
- _filter_decorative_margin,
- _filter_footer_words,
- _filter_header_junk,
-)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-# ---------------------------------------------------------------------------
-# Core computation (used by build-grid endpoint and regression tests)
-# ---------------------------------------------------------------------------
-
-async def _build_grid_core(
- session_id: str,
- session: dict,
- *,
- ipa_mode: str = "auto",
- syllable_mode: str = "auto",
-) -> dict:
- """Core grid building logic — pure computation, no HTTP or DB side effects.
-
- Args:
- session_id: Session identifier (for logging and image loading).
- session: Full session dict from get_session_db().
- ipa_mode: "auto" (only when English headwords detected), "all"
- (force IPA on all content columns), "en" (English column only),
- "de" (German/definition columns only), or "none" (skip entirely).
- syllable_mode: "auto" (only when original has pipe dividers),
- "all" (force syllabification on all words), "en" (English only),
- "de" (German only), or "none" (skip).
-
- Returns:
- StructuredGrid result dict.
-
- Raises:
- ValueError: If session data is incomplete.
- """
- t0 = time.time()
-
- # 1. Validate and load word results
- word_result = session.get("word_result")
- if not word_result or not word_result.get("cells"):
- raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
-
- img_w = word_result.get("image_width", 0)
- img_h = word_result.get("image_height", 0)
- if not img_w or not img_h:
- raise ValueError("Missing image dimensions in word_result")
-
- # 2. Flatten all word boxes from cells
- all_words = _flatten_word_boxes(word_result["cells"])
- if not all_words:
- raise ValueError("No word boxes found in cells")
-
- logger.info("build-grid session %s: %d words from %d cells",
- session_id, len(all_words), len(word_result["cells"]))
-
- # 2b. Filter decorative margin columns (alphabet graphics).
- # Some worksheets have a decorative alphabet strip along one margin
- # (A-Z in a graphic). OCR reads these as single-char words aligned
- # vertically. Detect and remove them before grid building.
- margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
- margin_strip_detected = margin_strip_info.get("found", False)
-
- # Read document_category from session (user-selected or auto-detected)
- document_category = session.get("document_category")
-
- # 2c. Filter footer rows (page numbers at the very bottom).
- # Isolated short text in the bottom 5% of the page is typically a
- # page number ("64", "S. 12") and not real content. The page number
- # is extracted as metadata for the frontend header display.
- page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
-
- # 2c2. Filter OCR junk from header illustrations.
- # Low-confidence short fragments above the first real content row.
- _filter_header_junk(all_words, img_h, logger, session_id)
-
- # 2d. Filter words inside user-defined exclude regions (from Structure step).
- # These are explicitly marked by the user, so ALL words inside are removed
- # regardless of confidence.
- structure_result = session.get("structure_result")
- exclude_rects = []
- if structure_result:
- for er in structure_result.get("exclude_regions", []):
- exclude_rects.append({
- "x": er["x"], "y": er["y"],
- "w": er["w"], "h": er["h"],
- })
- if exclude_rects:
- before = len(all_words)
- filtered = []
- for w in all_words:
- w_cx = w["left"] + w.get("width", 0) / 2
- w_cy = w["top"] + w.get("height", 0) / 2
- inside = any(
- er["x"] <= w_cx <= er["x"] + er["w"]
- and er["y"] <= w_cy <= er["y"] + er["h"]
- for er in exclude_rects
- )
- if not inside:
- filtered.append(w)
- removed = before - len(filtered)
- if removed:
- all_words = filtered
- logger.info(
- "build-grid session %s: removed %d words inside %d user exclude region(s)",
- session_id, removed, len(exclude_rects),
- )
-
- # 2e. Hard-filter words inside graphic/image regions from structure step.
- # ALL words inside graphic regions are removed regardless of confidence —
- # images cannot contain real text; any OCR words inside are artifacts.
- # After image loading (Step 3a) we augment these with freshly detected
- # graphic regions from cv_graphic_detect.
- graphic_rects: List[Dict[str, int]] = []
- if structure_result:
- for g in structure_result.get("graphics", []):
- graphic_rects.append({
- "x": g["x"], "y": g["y"],
- "w": g["w"], "h": g["h"],
- })
- if graphic_rects:
- before = len(all_words)
- all_words = [
- w for w in all_words
- if not any(
- gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
- and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
- for gr in graphic_rects
- )
- ]
- removed = before - len(all_words)
- if removed:
- logger.info(
- "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
- session_id, removed, len(graphic_rects),
- )
-
- # 3. Load image for box detection
- img_png = await get_session_image(session_id, "cropped")
- if not img_png:
- img_png = await get_session_image(session_id, "dewarped")
- if not img_png:
- img_png = await get_session_image(session_id, "original")
-
- zones_data: List[Dict[str, Any]] = []
- boxes_detected = 0
- recovered_count = 0
- border_prefiltered = False
- img_bgr = None
-
- content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
-
- if img_png:
- # Decode image for color detection + box detection
- arr = np.frombuffer(img_png, dtype=np.uint8)
- img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
-
- if img_bgr is not None:
- # --- 3a. Detect graphic/image regions via CV and hard-filter ---
- # Pass only significant words (len >= 3) to the detector so that
- # short OCR artifacts inside images don't fool the text-vs-graphic
- # heuristic (it counts word centroids to distinguish text from images).
- sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
- fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
- if fresh_graphics:
- fresh_rects = [
- {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
- for g in fresh_graphics
- ]
- graphic_rects.extend(fresh_rects)
- logger.info(
- "build-grid session %s: detected %d graphic region(s) via CV",
- session_id, len(fresh_graphics),
- )
- # Hard-filter words inside newly detected graphic regions
- before = len(all_words)
- all_words = [
- w for w in all_words
- if not any(
- gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
- and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
- for gr in fresh_rects
- )
- ]
- removed = before - len(all_words)
- if removed:
- logger.info(
- "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
- session_id, removed, len(fresh_rects),
- )
-
- # --- Recover colored text that OCR missed (before grid building) ---
- recovered = recover_colored_text(img_bgr, all_words)
- if recovered and graphic_rects:
- # Filter recovered chars inside graphic regions
- recovered = [
- r for r in recovered
- if not any(
- gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
- and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
- for gr in graphic_rects
- )
- ]
- if recovered:
- recovered_count = len(recovered)
- all_words.extend(recovered)
- logger.info(
- "build-grid session %s: +%d recovered colored words",
- session_id, recovered_count,
- )
-
- # Detect bordered boxes
- boxes = detect_boxes(
- img_bgr,
- content_x=content_x,
- content_w=content_w,
- content_y=content_y,
- content_h=content_h,
- )
- boxes_detected = len(boxes)
-
- if boxes:
- # Filter border ghost words before grid building
- all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
- if ghost_count:
- logger.info(
- "build-grid session %s: removed %d border ghost words",
- session_id, ghost_count,
- )
-
- # Split page into zones
- page_zones = split_page_into_zones(
- content_x, content_y, content_w, content_h, boxes
- )
-
- # Merge content zones separated by box zones
- page_zones = _merge_content_zones_across_boxes(
- page_zones, content_x, content_w
- )
-
- # 3b. Detect vertical dividers and split content zones
- vsplit_group_counter = 0
- expanded_zones: List = []
- for pz in page_zones:
- if pz.zone_type != "content":
- expanded_zones.append(pz)
- continue
- zone_words = _words_in_zone(
- all_words, pz.y, pz.height, pz.x, pz.width
- )
- divider_xs = _detect_vertical_dividers(
- zone_words, pz.x, pz.width, pz.y, pz.height
- )
- if divider_xs:
- sub_zones = _split_zone_at_vertical_dividers(
- pz, divider_xs, vsplit_group_counter
- )
- expanded_zones.extend(sub_zones)
- vsplit_group_counter += 1
- # Remove pipe words so they don't appear in sub-zones
- pipe_ids = set(
- id(w) for w in zone_words
- if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
- )
- all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
- logger.info(
- "build-grid: vertical split zone %d at x=%s → %d sub-zones",
- pz.index, [int(x) for x in divider_xs], len(sub_zones),
- )
- else:
- expanded_zones.append(pz)
- # Re-index zones
- for i, pz in enumerate(expanded_zones):
- pz.index = i
- page_zones = expanded_zones
-
- # --- Union columns from all content zones ---
- # Each content zone detects columns independently. Narrow
- # columns (page refs, markers) may appear in only one zone.
- # Merge column split-points from ALL content zones so every
- # zone shares the full column set.
- # NOTE: Zones from a vertical split are independent and must
- # NOT share columns with each other.
-
- # First pass: build grids per zone independently
- zone_grids: List[Dict] = []
-
- for pz in page_zones:
- zone_words = _words_in_zone(
- all_words, pz.y, pz.height, pz.x, pz.width
- )
- if pz.zone_type == "content":
- logger.info(
- "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words",
- pz.index, pz.zone_type,
- pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
- len(zone_words), len(all_words),
- )
- # Filter recovered single-char artifacts in ALL zones
- # (decorative colored pixel blobs like !, ?, • from
- # recover_colored_text that don't represent real text)
- before = len(zone_words)
- zone_words = [
- w for w in zone_words
- if not (
- w.get("recovered")
- and len(w.get("text", "").strip()) <= 2
- )
- ]
- removed = before - len(zone_words)
- if removed:
- logger.info(
- "build-grid: filtered %d recovered artifacts from %s zone %d",
- removed, pz.zone_type, pz.index,
- )
- # Filter words inside image overlay regions (merged box zones)
- if pz.image_overlays:
- before_ov = len(zone_words)
- zone_words = [
- w for w in zone_words
- if not any(
- ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
- and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
- for ov in pz.image_overlays
- )
- ]
- ov_removed = before_ov - len(zone_words)
- if ov_removed:
- logger.info(
- "build-grid: filtered %d words inside image overlays from zone %d",
- ov_removed, pz.index,
- )
- zone_words, bs_removed = _filter_border_strip_words(zone_words)
- if bs_removed:
- border_prefiltered = True
- logger.info(
- "build-grid: pre-filtered %d border-strip words from zone %d",
- bs_removed, pz.index,
- )
- grid = _build_zone_grid(
- zone_words, pz.x, pz.y, pz.width, pz.height,
- pz.index, img_w, img_h,
- skip_first_row_header=bool(pz.image_overlays),
- )
- zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
-
- # Second pass: merge column boundaries from all content zones
- # Exclude zones from vertical splits — they have independent columns.
- content_zones = [
- zg for zg in zone_grids
- if zg["pz"].zone_type == "content"
- and zg["pz"].vsplit_group is None
- ]
- if len(content_zones) > 1:
- # Collect column split points (x_min of non-first columns)
- all_split_xs: List[float] = []
- for zg in content_zones:
- raw_cols = zg["grid"].get("_raw_columns", [])
- for col in raw_cols[1:]:
- all_split_xs.append(col["x_min"])
-
- if all_split_xs:
- all_split_xs.sort()
- merge_distance = max(25, int(content_w * 0.03))
- merged_xs = [all_split_xs[0]]
- for x in all_split_xs[1:]:
- if x - merged_xs[-1] < merge_distance:
- merged_xs[-1] = (merged_xs[-1] + x) / 2
- else:
- merged_xs.append(x)
-
- total_cols = len(merged_xs) + 1
- max_zone_cols = max(
- len(zg["grid"].get("_raw_columns", []))
- for zg in content_zones
- )
-
- # Apply union whenever it has at least as many
- # columns as the best single zone. Even with the
- # same count the union boundaries are better because
- # they incorporate evidence from all zones.
- if total_cols >= max_zone_cols:
- cx_min = min(w["left"] for w in all_words)
- cx_max = max(
- w["left"] + w["width"] for w in all_words
- )
- merged_columns: List[Dict[str, Any]] = []
- prev_x = cx_min
- for i, sx in enumerate(merged_xs):
- merged_columns.append({
- "index": i,
- "type": f"column_{i + 1}",
- "x_min": prev_x,
- "x_max": sx,
- })
- prev_x = sx
- merged_columns.append({
- "index": len(merged_xs),
- "type": f"column_{len(merged_xs) + 1}",
- "x_min": prev_x,
- "x_max": cx_max,
- })
-
- # Re-build ALL content zones with merged columns
- for zg in zone_grids:
- pz = zg["pz"]
- if pz.zone_type == "content":
- grid = _build_zone_grid(
- zg["words"], pz.x, pz.y,
- pz.width, pz.height,
- pz.index, img_w, img_h,
- global_columns=merged_columns,
- skip_first_row_header=bool(pz.image_overlays),
- )
- zg["grid"] = grid
- logger.info(
- "build-grid session %s: union of %d content "
- "zones → %d merged columns (max single zone: %d)",
- session_id, len(content_zones),
- total_cols, max_zone_cols,
- )
-
- for zg in zone_grids:
- pz = zg["pz"]
- grid = zg["grid"]
- # Remove internal _raw_columns before adding to response
- grid.pop("_raw_columns", None)
-
- zone_entry: Dict[str, Any] = {
- "zone_index": pz.index,
- "zone_type": pz.zone_type,
- "bbox_px": {
- "x": pz.x, "y": pz.y,
- "w": pz.width, "h": pz.height,
- },
- "bbox_pct": {
- "x": round(pz.x / img_w * 100, 2) if img_w else 0,
- "y": round(pz.y / img_h * 100, 2) if img_h else 0,
- "w": round(pz.width / img_w * 100, 2) if img_w else 0,
- "h": round(pz.height / img_h * 100, 2) if img_h else 0,
- },
- "border": None,
- "word_count": len(zg["words"]),
- **grid,
- }
-
- if pz.box:
- zone_entry["border"] = {
- "thickness": pz.box.border_thickness,
- "confidence": pz.box.confidence,
- }
-
- if pz.image_overlays:
- zone_entry["image_overlays"] = pz.image_overlays
-
- if pz.layout_hint:
- zone_entry["layout_hint"] = pz.layout_hint
- if pz.vsplit_group is not None:
- zone_entry["vsplit_group"] = pz.vsplit_group
-
- zones_data.append(zone_entry)
-
- # 4. Fallback: no boxes detected → single zone with all words
- if not zones_data:
- # Filter recovered single-char artifacts (same as in zone loop above)
- before = len(all_words)
- filtered_words = [
- w for w in all_words
- if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
- ]
- removed = before - len(filtered_words)
- if removed:
- logger.info(
- "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
- session_id, removed,
- )
- # Pre-filter border-strip words so column detection is not
- # confused by edge artifacts. When this removes words, Step 4e
- # is skipped (it would otherwise re-detect content as a "strip").
- filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
- if bs_removed:
- border_prefiltered = True
- logger.info(
- "build-grid session %s: pre-filtered %d border-strip words",
- session_id, bs_removed,
- )
- grid = _build_zone_grid(
- filtered_words, content_x, content_y, content_w, content_h,
- 0, img_w, img_h,
- )
- grid.pop("_raw_columns", None)
- zones_data.append({
- "zone_index": 0,
- "zone_type": "content",
- "bbox_px": {
- "x": content_x, "y": content_y,
- "w": content_w, "h": content_h,
- },
- "bbox_pct": {
- "x": round(content_x / img_w * 100, 2) if img_w else 0,
- "y": round(content_y / img_h * 100, 2) if img_h else 0,
- "w": round(content_w / img_w * 100, 2) if img_w else 0,
- "h": round(content_h / img_h * 100, 2) if img_h else 0,
- },
- "border": None,
- "word_count": len(all_words),
- **grid,
- })
-
- # 4b. Remove junk rows: rows where ALL cells contain only short,
- # low-confidence text (OCR noise, stray marks). Real vocabulary rows
- # have at least one word with conf >= 50 or meaningful text length.
- # Also remove "oversized stub" rows: rows with ≤2 very short words
- # whose word-boxes are significantly taller than the median (e.g.
- # large red page numbers like "( 9" that are not real text content).
- _JUNK_CONF_THRESHOLD = 50
- _JUNK_MAX_TEXT_LEN = 3
- for z in zones_data:
- cells = z.get("cells", [])
- rows = z.get("rows", [])
- if not cells or not rows:
- continue
-
- # Compute median word height across the zone for oversized detection
- all_wb_heights = [
- wb["height"]
- for cell in cells
- for wb in cell.get("word_boxes") or []
- if wb.get("height", 0) > 0
- ]
- median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
-
- junk_row_indices = set()
- for row in rows:
- ri = row["index"]
- row_cells = [c for c in cells if c.get("row_index") == ri]
- if not row_cells:
- continue
-
- row_wbs = [
- wb for cell in row_cells
- for wb in cell.get("word_boxes") or []
- ]
-
- # Rule 1: ALL word_boxes are low-conf AND short text
- all_junk = True
- for wb in row_wbs:
- text = (wb.get("text") or "").strip()
- conf = wb.get("conf", 0)
- if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
- all_junk = False
- break
- if all_junk and row_wbs:
- junk_row_indices.add(ri)
- continue
-
- # Rule 2: oversized stub — ≤3 words, short total text,
- # and word height > 1.8× median (page numbers, stray marks,
- # OCR from illustration labels like "SEA &")
- # Skip if any word looks like a page reference (p.55, S.12).
- if len(row_wbs) <= 3:
- total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
- max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
- has_page_ref = any(
- re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
- for wb in row_wbs
- )
- if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
- junk_row_indices.add(ri)
- continue
-
- # Rule 3: scattered debris — rows with only tiny fragments
- # (e.g. OCR artifacts from illustrations/graphics).
- # If the row has no word longer than 2 chars, it's noise.
- longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
- if longest <= 2:
- junk_row_indices.add(ri)
- continue
-
- if junk_row_indices:
- z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
- z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
- logger.info(
- "build-grid: removed %d junk rows from zone %d: %s",
- len(junk_row_indices), z["zone_index"],
- sorted(junk_row_indices),
- )
-
- # 4b2. Remove individual cells that consist of a single very-short,
- # low-confidence word (OCR artifacts like "as", "b" from stray marks).
- # These survive row-level junk removal when the row has valid cells
- # in other columns.
- _ARTIFACT_MAX_LEN = 2
- _ARTIFACT_CONF_THRESHOLD = 65
- for z in zones_data:
- cells = z.get("cells", [])
- if not cells:
- continue
- artifact_ids = set()
- for cell in cells:
- wbs = cell.get("word_boxes") or []
- if len(wbs) != 1:
- continue
- wb = wbs[0]
- text = (wb.get("text") or "").strip()
- conf = wb.get("conf", 100)
- if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
- artifact_ids.add(cell.get("cell_id"))
- if artifact_ids:
- z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
- logger.info(
- "build-grid: removed %d artifact cells from zone %d: %s",
- len(artifact_ids), z.get("zone_index", 0),
- [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
- )
-
- # 4c. Remove oversized word_boxes from individual cells.
- # OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
- # have word heights 3-5x the median. Remove them per-word so they don't
- # pollute cells that also contain valid text in other columns.
- for z in zones_data:
- cells = z.get("cells", [])
- if not cells:
- continue
- all_wh = [
- wb["height"]
- for cell in cells
- for wb in cell.get("word_boxes") or []
- if wb.get("height", 0) > 0
- ]
- if not all_wh:
- continue
- med_h = sorted(all_wh)[len(all_wh) // 2]
- oversized_threshold = med_h * 3
- removed_oversized = 0
- for cell in cells:
- wbs = cell.get("word_boxes") or []
- filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
- if len(filtered) < len(wbs):
- removed_oversized += len(wbs) - len(filtered)
- cell["word_boxes"] = filtered
- cell["text"] = _words_to_reading_order_text(filtered)
- if removed_oversized:
- # Remove cells that became empty after oversized removal
- z["cells"] = [c for c in cells if c.get("word_boxes")]
- logger.info(
- "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
- removed_oversized, oversized_threshold, z.get("zone_index", 0),
- )
-
- # 4d. Remove pipe-character word_boxes (column divider artifacts).
- # OCR reads physical vertical divider lines as "|" or "||" characters.
- # These sit at consistent x positions near column boundaries and pollute
- # cell text. Remove them from word_boxes and rebuild cell text.
- # NOTE: Zones from a vertical split already had pipes removed in step 3b.
- _PIPE_RE = re.compile(r"^\|+$")
- for z in zones_data:
- if z.get("vsplit_group") is not None:
- continue # pipes already removed before split
- removed_pipes = 0
- for cell in z.get("cells", []):
- wbs = cell.get("word_boxes") or []
- filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
- if len(filtered) < len(wbs):
- removed_pipes += len(wbs) - len(filtered)
- cell["word_boxes"] = filtered
- cell["text"] = _words_to_reading_order_text(filtered)
- # Remove cells that became empty after pipe removal
- if removed_pipes:
- z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
- logger.info(
- "build-grid: removed %d pipe-divider word_boxes from zone %d",
- removed_pipes, z.get("zone_index", 0),
- )
-
- # Strip pipe chars ONLY from word_boxes/cells where the pipe is an
- # OCR column-divider artifact. Preserve pipes that are embedded in
- # words as syllable separators (e.g. "zu|trau|en") — these are
- # intentional and used in dictionary Ground Truth.
- for z in zones_data:
- for cell in z.get("cells", []):
- for wb in cell.get("word_boxes", []):
- wbt = wb.get("text", "")
- # Only strip if the ENTIRE word_box is just pipe(s)
- # (handled by _PIPE_RE above) — leave embedded pipes alone
- text = cell.get("text", "")
- if "|" in text:
- # Only strip leading/trailing pipes (OCR artifacts at cell edges)
- cleaned = text.strip("|").strip()
- if cleaned != text.strip():
- cell["text"] = cleaned
-
- # 4d2. Normalize narrow connector columns.
- # In synonym dictionaries a narrow column repeats the same word
- # (e.g. "oder") in every row. OCR sometimes appends noise chars
- # (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column
- # share the same short text, normalize near-match outliers.
- for z in zones_data:
- cols = z.get("columns", [])
- cells = z.get("cells", [])
- if not cols or not cells:
- continue
- for col in cols:
- ci = col.get("index")
- col_cells = [c for c in cells if c.get("col_index") == ci]
- if len(col_cells) < 3:
- continue
- # Count text occurrences
- text_counts: Dict[str, int] = {}
- for c in col_cells:
- t = (c.get("text") or "").strip()
- if t:
- text_counts[t] = text_counts.get(t, 0) + 1
- if not text_counts:
- continue
- dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
- dominant_count = text_counts[dominant_text]
- # Only normalize if dominant word is short and appears in ≥60%
- if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
- continue
- # Fix outliers that start with the dominant text
- fixed = 0
- for c in col_cells:
- t = (c.get("text") or "").strip()
- if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
- c["text"] = dominant_text
- # Also fix word_boxes
- wbs = c.get("word_boxes") or []
- if len(wbs) == 1:
- wbs[0]["text"] = dominant_text
- fixed += 1
- if fixed:
- logger.info(
- "build-grid: normalized %d outlier cells in connector column %d "
- "(dominant='%s') zone %d",
- fixed, ci, dominant_text, z.get("zone_index", 0),
- )
-
- # 4e. Detect and remove page-border decoration strips.
- # Skipped when the pre-filter already removed border words BEFORE
- # column detection — re-running would incorrectly detect the
- # leftmost content column as a "strip".
- border_strip_removed = 0
- if border_prefiltered:
- logger.info("Step 4e: skipped (border pre-filter already applied)")
- else:
- # Some textbooks have decorative alphabet strips along the page
- # edge. OCR picks up scattered letters from these as artifacts.
- # Detection: find the first significant x-gap (>30 px) from each
- # page edge between a small cluster (<20 %) and the main content.
- for z in zones_data:
- cells = z.get("cells", [])
- if not cells:
- continue
- all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
- for cell in cells:
- for wb in cell.get("word_boxes") or []:
- all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
- if len(all_wbs_with_cell) < 10:
- continue
- all_wbs_with_cell.sort(key=lambda t: t[0])
- total = len(all_wbs_with_cell)
-
- # -- Left-edge scan --
- left_strip_count = 0
- left_gap = 0
- running_right = 0
- for gi in range(total - 1):
- running_right = max(
- running_right,
- all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
- )
- gap = all_wbs_with_cell[gi + 1][0] - running_right
- if gap > 30:
- left_strip_count = gi + 1
- left_gap = gap
- break
-
- # -- Right-edge scan --
- right_strip_count = 0
- right_gap = 0
- running_left = all_wbs_with_cell[-1][0]
- for gi in range(total - 1, 0, -1):
- running_left = min(running_left, all_wbs_with_cell[gi][0])
- prev_right = (
- all_wbs_with_cell[gi - 1][0]
- + all_wbs_with_cell[gi - 1][1].get("width", 0)
- )
- gap = running_left - prev_right
- if gap > 30:
- right_strip_count = total - gi
- right_gap = gap
- break
-
- strip_wbs: set = set()
- strip_side = ""
- strip_gap = 0
- strip_count = 0
- if left_strip_count > 0 and left_strip_count / total < 0.20:
- strip_side = "left"
- strip_count = left_strip_count
- strip_gap = left_gap
- strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
- elif right_strip_count > 0 and right_strip_count / total < 0.20:
- strip_side = "right"
- strip_count = right_strip_count
- strip_gap = right_gap
- strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
-
- if not strip_wbs:
- continue
- for cell in cells:
- wbs = cell.get("word_boxes") or []
- filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
- if len(filtered) < len(wbs):
- border_strip_removed += len(wbs) - len(filtered)
- cell["word_boxes"] = filtered
- cell["text"] = _words_to_reading_order_text(filtered)
- z["cells"] = [c for c in cells
- if (c.get("word_boxes") or c.get("text", "").strip())]
- logger.info(
- "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
- "(gap=%dpx, strip=%d/%d wbs)",
- border_strip_removed, strip_side, z.get("zone_index", 0),
- strip_gap, strip_count, total,
- )
-
- # 4f. Remove decorative edge columns (alphabet sidebar safety net).
- # Dictionary pages have A-Z letter sidebars that OCR reads as single-
- # character word_boxes. These form narrow columns with very short text.
- # Detection: edge column where almost ALL cells are single characters.
- for z in zones_data:
- columns = z.get("columns", [])
- cells = z.get("cells", [])
- if len(columns) < 3 or not cells:
- continue
- # Group cells by col_type (skip spanning_header)
- col_cells: Dict[str, List[Dict]] = {}
- for cell in cells:
- ct = cell.get("col_type", "")
- if ct.startswith("column_"):
- col_cells.setdefault(ct, []).append(cell)
- col_types_ordered = sorted(col_cells.keys())
- if len(col_types_ordered) < 3:
- continue
- for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
- edge_cells_list = col_cells.get(edge_ct, [])
- if len(edge_cells_list) < 3:
- continue
- # Key criterion: average text length and single-char ratio.
- # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
- # are single characters.
- texts = [(c.get("text") or "").strip() for c in edge_cells_list]
- avg_len = sum(len(t) for t in texts) / len(texts)
- single_char = sum(1 for t in texts if len(t) <= 1)
- single_ratio = single_char / len(texts)
- if avg_len > 1.5:
- continue # real content has longer text
- if single_ratio < 0.7:
- continue # not dominated by single chars
- # Remove this edge column
- removed_count = len(edge_cells_list)
- edge_ids = {id(c) for c in edge_cells_list}
- z["cells"] = [c for c in cells if id(c) not in edge_ids]
- z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
- logger.info(
- "Step 4f: removed decorative edge column '%s' from zone %d "
- "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
- edge_ct, z.get("zone_index", 0), removed_count,
- avg_len, single_ratio * 100,
- )
- break # only remove one edge per zone
-
- # 5. Color annotation on final word_boxes in cells
- if img_bgr is not None:
- all_wb: List[Dict] = []
- for z in zones_data:
- for cell in z.get("cells", []):
- all_wb.extend(cell.get("word_boxes", []))
- detect_word_colors(img_bgr, all_wb)
-
- # 5a. Heading detection by color + height (after color is available)
- heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
- if heading_count:
- logger.info("Detected %d heading rows by color+height", heading_count)
-
- # 5b. Fix unmatched parentheses in cell text
- # OCR often misses opening "(" while detecting closing ")".
- # If a cell's text has ")" without a matching "(", prepend "(".
- for z in zones_data:
- for cell in z.get("cells", []):
- text = cell.get("text", "")
- if ")" in text and "(" not in text:
- cell["text"] = "(" + text
-
- # 5c. IPA phonetic correction — replace garbled OCR phonetics with
- # correct IPA from the dictionary (same as in the OCR pipeline).
- # Only applies to vocabulary tables (≥3 columns: EN | article | DE).
- # Single/two-column layouts are continuous text, not vocab tables.
- all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
- total_cols = sum(len(z.get("columns", [])) for z in zones_data)
- en_col_type = None
- ipa_target_cols: set = set()
- all_content_cols: set = set()
- skip_ipa = (ipa_mode == "none")
-
- # When ipa_mode=none, strip ALL square brackets from ALL content columns
- if skip_ipa:
- _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
- for cell in all_cells:
- ct = cell.get("col_type", "")
- if not ct.startswith("column_"):
- continue
- text = cell.get("text", "")
- if "[" in text:
- stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
- if stripped != text:
- cell["text"] = stripped.strip()
- cell["_ipa_corrected"] = True
-
- if not skip_ipa and total_cols >= 3:
- # Detect English headword column via IPA signals (brackets or garbled).
- col_ipa_count: Dict[str, int] = {}
- all_content_cols: set = set()
- for cell in all_cells:
- ct = cell.get("col_type", "")
- if not ct.startswith("column_"):
- continue
- txt = cell.get("text", "") or ""
- if txt.strip():
- all_content_cols.add(ct)
- if '[' in txt or _text_has_garbled_ipa(txt):
- col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
- if col_ipa_count:
- en_col_type = max(col_ipa_count, key=col_ipa_count.get)
- elif ipa_mode == "all":
- # Force-all mode without auto-detection: pick column with most cells
- col_cell_count: Dict[str, int] = {}
- for cell in all_cells:
- ct = cell.get("col_type", "")
- if ct.startswith("column_") and (cell.get("text") or "").strip():
- col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
- if col_cell_count:
- en_col_type = max(col_cell_count, key=col_cell_count.get)
-
- # Decide which columns to process based on ipa_mode:
- # auto/en: only the detected EN headword column (English IPA)
- # de: all content columns EXCEPT the EN column (German IPA)
- # all: EN column gets English IPA, other columns get German IPA
- en_ipa_target_cols: set = set()
- de_ipa_target_cols: set = set()
- if ipa_mode in ("auto", "en"):
- if en_col_type:
- en_ipa_target_cols.add(en_col_type)
- elif ipa_mode == "de":
- de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
- elif ipa_mode == "all":
- if en_col_type:
- en_ipa_target_cols.add(en_col_type)
- de_ipa_target_cols = all_content_cols - en_ipa_target_cols
-
- # --- Strip IPA from columns NOT in the target set ---
- # When user selects "nur DE", English IPA from the OCR scan must
- # be removed. When "none", all IPA is removed.
- # In vocab columns, square brackets [...] are always IPA (both
- # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]).
- _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
- strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
- if strip_en_ipa or ipa_mode == "none":
- strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
- for cell in all_cells:
- ct = cell.get("col_type", "")
- if ct not in strip_cols:
- continue
- text = cell.get("text", "")
- if "[" in text:
- stripped = _SQUARE_BRACKET_RE.sub("", text)
- if stripped != text:
- cell["text"] = stripped.strip()
- cell["_ipa_corrected"] = True
-
- # --- English IPA (Britfone + eng_to_ipa) ---
- if en_ipa_target_cols:
- for cell in all_cells:
- ct = cell.get("col_type")
- if ct in en_ipa_target_cols:
- cell["_orig_col_type"] = ct
- cell["col_type"] = "column_en"
- _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
- fix_cell_phonetics(all_cells, pronunciation="british")
- for cell in all_cells:
- orig = cell.pop("_orig_col_type", None)
- if orig:
- cell["col_type"] = orig
- if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
- cell["_ipa_corrected"] = True
-
- # --- German IPA (wiki-pronunciation-dict + epitran) ---
- if de_ipa_target_cols:
- from cv_ipa_german import insert_german_ipa
- insert_german_ipa(all_cells, de_ipa_target_cols)
-
- ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
-
- # Mark cells whose text was changed by IPA correction so that
- # later steps (5i) don't overwrite the corrected text when
- # reconstructing from word_boxes. (Already done inline above
- # for English; insert_german_ipa sets _ipa_corrected too.)
- for cell in all_cells:
- if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
- cell["_ipa_corrected"] = True
-
- # 5d. Fix IPA continuation cells — cells where the printed
- # phonetic transcription wraps to a line below the headword.
- # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
- # Replace garbled text with proper IPA looked up from the
- # headword in the previous row's same column.
- # Note: We check ALL columns, not just en_col_type, because
- # the EN headword column may not be the longest-average column.
- _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
- ipa_cont_fixed = 0
- for z in ([] if skip_ipa else zones_data):
- rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
- z_cells = z.get("cells", [])
- for idx, row in enumerate(rows_sorted):
- if idx == 0:
- continue
- ri = row["index"]
- row_cells = [c for c in z_cells if c.get("row_index") == ri]
- for cell in row_cells:
- ct = cell.get("col_type", "")
- if not ct.startswith("column_"):
- continue
- cell_text = (cell.get("text") or "").strip()
- if not cell_text:
- # Step 5c may have emptied garbled IPA cells like
- # "[n, nn]" — recover text from word_boxes.
- wb_texts = [w.get("text", "")
- for w in cell.get("word_boxes", [])]
- cell_text = " ".join(wb_texts).strip()
- if not cell_text:
- continue
-
- is_bracketed = (
- cell_text.startswith('[') and cell_text.endswith(']')
- )
-
- if is_bracketed:
- # Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
- # Text like "employee [im'ploi:]" is NOT fully
- # bracketed and won't match here.
- if not _text_has_garbled_ipa(cell_text):
- continue
- # Already has proper IPA brackets → skip
- if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
- continue
- else:
- # Unbracketed continuation: "ska:f – ska:vz",
- # "'sekandarr sku:l". Only treat as IPA
- # continuation if this is the ONLY content cell
- # in the row (single-cell row) and the text is
- # garbled IPA without real IPA Unicode symbols.
- content_cells_in_row = [
- c for c in row_cells
- if c.get("col_type", "").startswith("column_")
- and c.get("col_type") != "column_1"
- ]
- if len(content_cells_in_row) != 1:
- continue
- if not _text_has_garbled_ipa(cell_text):
- continue
- # Has real IPA symbols → already fixed or valid
- if any(c in _REAL_IPA_CHARS for c in cell_text):
- continue
-
- # Find headword in previous row, same column
- prev_ri = rows_sorted[idx - 1]["index"]
- prev_same_col = [
- c for c in z_cells
- if c.get("row_index") == prev_ri
- and c.get("col_type") == ct
- ]
- if not prev_same_col:
- continue
- prev_text = prev_same_col[0].get("text", "")
- fixed = fix_ipa_continuation_cell(
- cell_text, prev_text, pronunciation="british",
- )
- if fixed != cell_text:
- cell["text"] = fixed
- ipa_cont_fixed += 1
- logger.info(
- "IPA continuation R%d %s: '%s' → '%s'",
- ri, ct, cell_text, fixed,
- )
- if ipa_cont_fixed:
- logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
-
- # 5e. Heading detection by single-cell rows — black headings like
- # "Theme" that have normal color and height but are the ONLY cell
- # in their row (excluding page_ref column_1). Must run AFTER 5d
- # so IPA continuation cells are already processed.
- single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
- if single_heading_count:
- logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
-
- # 5f. Strip IPA from headings — headings detected in 5e ran AFTER
- # IPA correction (5c), so they may have dictionary IPA appended
- # (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the
- # original text only.
- for z in zones_data:
- for cell in z.get("cells", []):
- if cell.get("col_type") != "heading":
- continue
- text = cell.get("text", "")
- # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
- stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
- if stripped and stripped != text:
- cell["text"] = stripped
-
- # 5g. Extract page_ref cells and footer rows from content zones.
- # Page references (column_1 cells like "p.70") sit in rows that
- # also contain vocabulary — extract them as zone metadata without
- # removing the row. Footer lines (e.g. "two hundred and twelve"
- # = page number at bottom) are standalone rows that should be
- # removed from the table entirely.
- _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
- # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
- _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
- _NUMBER_WORDS = {
- "one", "two", "three", "four", "five", "six", "seven",
- "eight", "nine", "ten", "eleven", "twelve", "thirteen",
- "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
- "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
- "seventy", "eighty", "ninety", "hundred", "thousand", "and",
- "einhundert", "zweihundert", "dreihundert", "vierhundert",
- "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
- }
- for z in zones_data:
- if z.get("zone_type") != "content":
- continue
- cells = z.get("cells", [])
- rows = z.get("rows", [])
- if not rows:
- continue
-
- # Extract column_1 cells that look like page references
- page_refs = []
- page_ref_cell_ids = set()
- for cell in cells:
- if cell.get("col_type") != "column_1":
- continue
- text = (cell.get("text") or "").strip()
- if not text:
- continue
- if not _PAGE_REF_RE.match(text):
- continue
- page_refs.append({
- "row_index": cell.get("row_index"),
- "text": text,
- "bbox_pct": cell.get("bbox_pct", {}),
- })
- page_ref_cell_ids.add(cell.get("cell_id"))
-
- # Keep page_ref cells in the table as a visible column.
- # Previously these were removed, but users want to see them.
- # The metadata extraction above still populates zone["page_refs"]
- # for the frontend header display.
-
- # Detect footer: last non-header row if it has only 1 cell
- # with short, non-content text (page numbers like "233" or
- # "two hundred and twelve"). Comma-separated lists and long
- # text are content continuations, not page numbers.
- footer_rows = []
- non_header_rows = [r for r in rows if not r.get("is_header")]
- if non_header_rows:
- last_row = non_header_rows[-1]
- last_ri = last_row["index"]
- last_cells = [c for c in z["cells"]
- if c.get("row_index") == last_ri]
- if len(last_cells) == 1:
- text = (last_cells[0].get("text") or "").strip()
- # Not IPA (no real IPA symbols) and not a heading
- has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
- # Comma-separated text is a content continuation, not a footer
- has_commas = ',' in text
- # Written-out page numbers like "two hundred and nine"
- text_words = set(text.lower().split())
- is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
- # Short text or written-out number
- is_page_number = len(text) <= 20 or is_written_number
- if (text and not has_real_ipa and not has_commas
- and is_page_number
- and last_cells[0].get("col_type") != "heading"):
- footer_rows.append({
- "row_index": last_ri,
- "text": text,
- "bbox_pct": last_cells[0].get("bbox_pct", {}),
- })
-
- # Classify footer rows: page numbers are removed from the grid
- # and promoted to page_number metadata; other footers stay as rows.
- page_number_footers = []
- other_footers = []
- for fr in footer_rows:
- ft = fr["text"].strip()
- # Pure digits
- digits = "".join(c for c in ft if c.isdigit())
- if digits and re.match(r'^[\d\s.]+$', ft):
- page_number_footers.append(fr)
- # Written-out numbers
- elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
- page_number_footers.append(fr)
- else:
- other_footers.append(fr)
-
- # Remove page-number footer rows from grid entirely
- if page_number_footers:
- pn_ris = {fr["row_index"] for fr in page_number_footers}
- z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
- z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
- # Set page_number metadata (use first one)
- pn_text = page_number_footers[0]["text"].strip()
- pn_digits = "".join(c for c in pn_text if c.isdigit())
- if not page_number_info:
- page_number_info = {
- "text": pn_text,
- "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
- }
- if pn_digits:
- page_number_info["number"] = int(pn_digits)
-
- # Mark remaining footer rows (non-page-number content)
- if other_footers:
- footer_ris = {fr["row_index"] for fr in other_footers}
- for r in z["rows"]:
- if r["index"] in footer_ris:
- r["is_footer"] = True
- for c in z["cells"]:
- if c.get("row_index") in footer_ris:
- c["col_type"] = "footer"
-
- if page_refs or footer_rows:
- logger.info(
- "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
- len(page_refs), len(footer_rows), len(page_number_footers),
- z.get("zone_index", 0),
- )
-
- # Store as zone-level metadata
- if page_refs:
- z["page_refs"] = page_refs
- if other_footers:
- z["footer"] = other_footers
-
- # 5h. Convert slash-delimited IPA to bracket notation.
- # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
- # Detect the pattern /ocr_ipa/ and replace with [dict_ipa]
- # using the IPA dictionary when available, falling back to the OCR text.
- # The regex requires a word character (or ² ³) right before the opening
- # slash to avoid false positives like "sb/sth".
- _SLASH_IPA_RE = re.compile(
- r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
- r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
- )
- # Standalone slash IPA at start of text (headword on previous line)
- _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
- # IPA between slashes never contains spaces, parentheses, or commas.
- # Reject matches that look like grammar: "sb/sth up a) jdn/"
- _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
- slash_ipa_fixed = 0
- for z in ([] if skip_ipa else zones_data):
- for cell in z.get("cells", []):
- # Only process English headword column — avoid converting
- # German text like "der/die/das" to IPA.
- if en_col_type and cell.get("col_type") != en_col_type:
- continue
- text = cell.get("text", "")
- if "/" not in text:
- continue
-
- def _replace_slash_ipa(m: re.Match) -> str:
- nonlocal slash_ipa_fixed
- headword = m.group(1)
- ocr_ipa = m.group(2) # includes slashes
- inner_raw = ocr_ipa.strip("/").strip()
- # Reject if inner content has spaces/parens/commas (grammar)
- if _SLASH_IPA_REJECT_RE.search(inner_raw):
- return m.group(0)
- # Strip superscript digits for lookup
- clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
- ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
- if ipa:
- slash_ipa_fixed += 1
- return f"{headword} [{ipa}]"
- # Fallback: keep OCR IPA but convert slashes to brackets
- inner = inner_raw.lstrip("'").strip()
- if inner:
- slash_ipa_fixed += 1
- return f"{headword} [{inner}]"
- return m.group(0)
-
- new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
-
- # Second pass: convert remaining /ipa/ after [ipa] from first pass.
- # Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant)
- _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
- def _replace_trailing_slash(m: re.Match) -> str:
- nonlocal slash_ipa_fixed
- inner = m.group(1).strip("/").strip().lstrip("'").strip()
- if _SLASH_IPA_REJECT_RE.search(inner):
- return m.group(0)
- if inner:
- slash_ipa_fixed += 1
- return f" [{inner}]"
- return m.group(0)
- new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
-
- # Handle standalone /ipa/ at start (no headword in this cell)
- if new_text == text:
- m = _STANDALONE_SLASH_IPA_RE.match(text)
- if m:
- inner = m.group(1).strip()
- if not _SLASH_IPA_REJECT_RE.search(inner):
- inner = inner.lstrip("'").strip()
- if inner:
- new_text = "[" + inner + "]" + text[m.end():]
- slash_ipa_fixed += 1
-
- if new_text != text:
- cell["text"] = new_text
-
- if slash_ipa_fixed:
- logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
-
- # 5i. Remove blue bullet/artifact word_boxes.
- # Dictionary pages have small blue square bullets (■) before entries.
- # OCR reads these as text artifacts (©, e, *, or even plausible words
- # like "fighily" overlapping the real word "tightly").
- # Detection rules:
- # a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black)
- # b) Overlapping word_boxes: >40% x-overlap → remove lower confidence
- # c) Duplicate text: consecutive blue wbs with identical text, gap < 6px
- bullet_removed = 0
- for z in zones_data:
- for cell in z.get("cells", []):
- wbs = cell.get("word_boxes") or []
- if len(wbs) < 2:
- continue
- to_remove: set = set()
-
- # Rule (a): tiny coloured symbols (bullets, graphic fragments)
- for i, wb in enumerate(wbs):
- cn = wb.get("color_name", "black")
- if (cn != "black"
- and wb.get("width", 0) * wb.get("height", 0) < 200
- and wb.get("conf", 100) < 85):
- to_remove.add(i)
-
- # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
- # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
- # Remove word boxes that contain NO letters or digits.
- for i, wb in enumerate(wbs):
- t = (wb.get("text") or "").strip()
- if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
- to_remove.add(i)
-
- # Rule (b) + (c): overlap and duplicate detection
- # Sort by x for pairwise comparison
- _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
- to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge
- indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
- for p in range(len(indexed) - 1):
- i1, w1 = indexed[p]
- i2, w2 = indexed[p + 1]
- x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
- x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
- overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
- min_w = min(w1.get("width", 1), w2.get("width", 1))
- gap = x2s - x1e
- overlap_pct = overlap / min_w if min_w > 0 else 0
-
- # (b) Significant x-overlap
- if overlap_pct > 0.20:
- t1 = (w1.get("text") or "").strip()
- t2 = (w2.get("text") or "").strip()
-
- # Syllable-split words: both are alphabetic text with
- # moderate overlap (20-75%). Merge instead of removing.
- # OCR splits words at syllable marks, producing overlapping
- # boxes like "zu" + "tiefst" → "zutiefst".
- if (overlap_pct <= 0.75
- and _ALPHA_WORD_RE.match(t1)
- and _ALPHA_WORD_RE.match(t2)):
- to_merge.append((i1, i2))
- continue
-
- # High overlap (>75%) with different alphabetic text:
- # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
- # causing it to heavily overlap with the next fragment ("brech").
- # Merge instead of removing when one is a short prefix (≤4 chars)
- # and the texts are different.
- if (overlap_pct > 0.75
- and _ALPHA_WORD_RE.match(t1)
- and _ALPHA_WORD_RE.match(t2)
- and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
- and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
- to_merge.append((i1, i2))
- continue
-
- if overlap_pct <= 0.40:
- continue # too little overlap and not alphabetic merge
-
- c1 = w1.get("conf", 50)
- c2 = w2.get("conf", 50)
-
- # For very high overlap (>90%) with different text,
- # prefer the word that exists in the IPA dictionary
- # over confidence (OCR can give artifacts high conf).
- if overlap_pct > 0.90 and t1.lower() != t2.lower():
- in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
- in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
- if in_dict_1 and not in_dict_2:
- to_remove.add(i2)
- continue
- elif in_dict_2 and not in_dict_1:
- to_remove.add(i1)
- continue
-
- if c1 < c2:
- to_remove.add(i1)
- elif c2 < c1:
- to_remove.add(i2)
- else:
- # Same confidence: remove the taller one (bullet slivers)
- if w1.get("height", 0) > w2.get("height", 0):
- to_remove.add(i1)
- else:
- to_remove.add(i2)
-
- # (c) Duplicate text: consecutive blue with same text, gap < 6px
- elif (gap < 6
- and w1.get("color_name") == "blue"
- and w2.get("color_name") == "blue"
- and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
- # Remove the one with lower confidence; if equal, first one
- c1 = w1.get("conf", 50)
- c2 = w2.get("conf", 50)
- to_remove.add(i1 if c1 <= c2 else i2)
-
- # Execute merges first (syllable-split words).
- # Use merge_parent to support chain merging: if "zer" absorbed
- # "brech" and then "brech"+"lich" is a merge pair, redirect to
- # merge "lich" into "zer" → "zerbrechlich".
- if to_merge:
- merge_parent: Dict[int, int] = {} # absorbed → absorber
- for mi1, mi2 in to_merge:
- # Follow chain: if mi1 was absorbed, find root absorber
- actual_mi1 = mi1
- while actual_mi1 in merge_parent:
- actual_mi1 = merge_parent[actual_mi1]
- if actual_mi1 in to_remove or mi2 in to_remove:
- continue
- if mi2 in merge_parent:
- continue # mi2 already absorbed
- mw1, mw2 = wbs[actual_mi1], wbs[mi2]
- # Concatenate text (no space — they're parts of one word)
- mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
- mt2 = (mw2.get("text") or "").strip()
- merged_text = mt1 + mt2
- # Union bounding box
- mx = min(mw1["left"], mw2["left"])
- my = min(mw1["top"], mw2["top"])
- mr = max(mw1["left"] + mw1["width"],
- mw2["left"] + mw2["width"])
- mb = max(mw1["top"] + mw1["height"],
- mw2["top"] + mw2["height"])
- mw1["text"] = merged_text
- mw1["left"] = mx
- mw1["top"] = my
- mw1["width"] = mr - mx
- mw1["height"] = mb - my
- mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
- to_remove.add(mi2)
- merge_parent[mi2] = actual_mi1
- bullet_removed -= 1 # net: merge, not removal
-
- if to_remove:
- bullet_removed += len(to_remove)
- filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
- cell["word_boxes"] = filtered
- # Don't overwrite text that was corrected by Step 5c IPA fix
- if not cell.get("_ipa_corrected"):
- cell["text"] = _words_to_reading_order_text(filtered)
-
- # Remove cells that became empty after bullet removal
- if bullet_removed:
- for z in zones_data:
- z["cells"] = [c for c in z.get("cells", [])
- if (c.get("word_boxes") or c.get("text", "").strip())]
- logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
-
- # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
- # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
- # "\\", "PEE", "a=") that survive earlier filters because their rows also
- # contain real content in other columns. Remove them here.
- _COMMON_SHORT_WORDS = {
- # German
- "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
- "ob", "so", "um", "zu", "wo", "je", "oh", "or",
- "die", "der", "das", "dem", "den", "des", "ein", "und",
- "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
- # English
- "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
- "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
- "on", "or", "so", "to", "up", "us", "we",
- "the", "and", "but", "for", "not",
- }
- _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
- artifact_cells_removed = 0
- for z in zones_data:
- before = len(z.get("cells", []))
- kept = []
- for cell in z.get("cells", []):
- text = (cell.get("text") or "").strip()
- core = text.rstrip(".,;:!?'\"")
- is_artifact = False
- if not core:
- is_artifact = True
- elif _PURE_JUNK_RE.match(core):
- is_artifact = True
- elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
- # Short non-alphabetic text like "a=", not word beginnings like "Zw"
- is_artifact = True
- elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
- is_artifact = True
- elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
- and not re.match(r'^[pPsS]\.?\d+$', core)):
- # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
- # but NOT page references like "p.43", "p50", "S.12"
- is_artifact = True
- if is_artifact:
- kept.append(None) # placeholder
- else:
- kept.append(cell)
- z["cells"] = [c for c in kept if c is not None]
- artifact_cells_removed += before - len(z["cells"])
- if artifact_cells_removed:
- # Also remove rows that became completely empty
- for z in zones_data:
- cell_ris = {c.get("row_index") for c in z.get("cells", [])}
- z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
- logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
-
- # 5j. Normalise word_box order to reading order (group by Y, sort by X).
- # The frontend renders colored cells from word_boxes array order
- # (GridTable.tsx), so they MUST be in left-to-right reading order.
- wb_reordered = 0
- for z in zones_data:
- for cell in z.get("cells", []):
- wbs = cell.get("word_boxes") or []
- if len(wbs) < 2:
- continue
- lines = _group_words_into_lines(wbs, y_tolerance_px=15)
- sorted_wbs = [w for line in lines for w in line]
- # Check if order actually changed
- if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
- cell["word_boxes"] = sorted_wbs
- wb_reordered += 1
- if wb_reordered:
- logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
-
- duration = time.time() - t0
-
- # 6. Build result
- total_cells = sum(len(z.get("cells", [])) for z in zones_data)
- total_columns = sum(len(z.get("columns", [])) for z in zones_data)
- total_rows = sum(len(z.get("rows", [])) for z in zones_data)
-
- # Collect color statistics from all word_boxes in cells
- color_stats: Dict[str, int] = {}
- for z in zones_data:
- for cell in z.get("cells", []):
- for wb in cell.get("word_boxes", []):
- cn = wb.get("color_name", "black")
- color_stats[cn] = color_stats.get(cn, 0) + 1
-
- # Compute layout metrics for faithful grid reconstruction
- all_content_row_heights: List[float] = []
- for z in zones_data:
- for row in z.get("rows", []):
- if not row.get("is_header", False):
- h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
- if h > 0:
- all_content_row_heights.append(h)
- avg_row_height = (
- sum(all_content_row_heights) / len(all_content_row_heights)
- if all_content_row_heights else 30.0
- )
- font_size_suggestion = max(10, int(avg_row_height * 0.6))
-
- # --- Dictionary detection on assembled grid ---
- # Build lightweight ColumnGeometry-like structures from zone columns for
- # dictionary signal scoring.
- from cv_layout import _score_dictionary_signals
- dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
- try:
- from cv_vocab_types import ColumnGeometry
- for z in zones_data:
- zone_cells = z.get("cells", [])
- zone_cols = z.get("columns", [])
- if len(zone_cols) < 2 or len(zone_cells) < 10:
- continue
- # Build pseudo-ColumnGeometry per column
- pseudo_geoms = []
- for col in zone_cols:
- ci = col["index"]
- col_cells = [c for c in zone_cells if c.get("col_index") == ci]
- # Flatten word_boxes into word dicts compatible with _score_language
- col_words = []
- for cell in col_cells:
- for wb in cell.get("word_boxes") or []:
- col_words.append({
- "text": wb.get("text", ""),
- "conf": wb.get("conf", 0),
- "top": wb.get("top", 0),
- "left": wb.get("left", 0),
- "height": wb.get("height", 0),
- "width": wb.get("width", 0),
- })
- # Fallback: use cell text if no word_boxes
- if not cell.get("word_boxes") and cell.get("text"):
- col_words.append({
- "text": cell["text"],
- "conf": cell.get("confidence", 50),
- "top": cell.get("bbox_px", {}).get("y", 0),
- "left": cell.get("bbox_px", {}).get("x", 0),
- "height": cell.get("bbox_px", {}).get("h", 20),
- "width": cell.get("bbox_px", {}).get("w", 50),
- })
- col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
- pseudo_geoms.append(ColumnGeometry(
- index=ci, x=col.get("x_min_px", 0), y=0,
- width=max(col_w, 1), height=img_h,
- word_count=len(col_words), words=col_words,
- width_ratio=col_w / max(img_w, 1),
- ))
- if len(pseudo_geoms) >= 2:
- dd = _score_dictionary_signals(
- pseudo_geoms,
- document_category=document_category,
- margin_strip_detected=margin_strip_detected,
- )
- if dd["confidence"] > dict_detection["confidence"]:
- dict_detection = dd
- except Exception as e:
- logger.warning("Dictionary detection failed: %s", e)
-
- # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
- try:
- from cv_syllable_detect import merge_word_gaps_in_zones
- merge_word_gaps_in_zones(zones_data, session_id)
- except Exception as e:
- logger.warning("Word-gap merge failed: %s", e)
-
- # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
- # Strips | from words, validates with pyphen, tries char-deletion for garbled
- # words like "Ze|plpe|lin" → "Zeppelin".
- try:
- from cv_syllable_detect import autocorrect_pipe_artifacts
- autocorrect_pipe_artifacts(zones_data, session_id)
- except Exception as e:
- logger.warning("Pipe autocorrect failed: %s", e)
-
- # --- Syllable divider insertion for dictionary pages ---
- # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
- # "all" = force on all content words, "en" = English column only,
- # "de" = German columns only, "none" = skip entirely.
- syllable_insertions = 0
- if syllable_mode != "none" and img_bgr is not None:
- _syllable_eligible = False
- if syllable_mode in ("all", "de", "en"):
- _syllable_eligible = True
- elif (dict_detection.get("is_dictionary")
- and dict_detection.get("article_col_index") is not None):
- # auto: only on dictionary pages with article columns
- _syllable_eligible = True
- # For language-specific modes, determine allowed columns
- _syllable_col_filter: Optional[set] = None # None = all columns
- if syllable_mode == "en":
- _syllable_col_filter = {en_col_type} if en_col_type else set()
- elif syllable_mode == "de":
- if en_col_type and total_cols >= 3:
- _syllable_col_filter = all_content_cols - {en_col_type}
- # else None → all columns (correct for German-only dicts)
- if _syllable_eligible:
- try:
- from cv_syllable_detect import insert_syllable_dividers
- force_syllables = (syllable_mode in ("all", "de", "en"))
- syllable_insertions = insert_syllable_dividers(
- zones_data, img_bgr, session_id,
- force=force_syllables,
- col_filter=_syllable_col_filter,
- )
- except Exception as e:
- logger.warning("Syllable insertion failed: %s", e)
-
- # When syllable mode is "none", strip any residual | from OCR so
- # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
- if syllable_mode == "none":
- for z in zones_data:
- for cell in z.get("cells", []):
- t = cell.get("text", "")
- if "|" in t:
- cell["text"] = t.replace("|", "")
-
- # --- Split merged words (OCR sometimes glues adjacent words) ---
- # Uses dictionary lookup to split e.g. "atmyschool" → "at my school"
- try:
- from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
- if _SPELL_AVAILABLE:
- split_count = 0
- for z in zones_data:
- for cell in z.get("cells", []):
- text = cell.get("text", "")
- if not text:
- continue
- parts = []
- changed = False
- for token in text.split():
- # Try splitting pure-alpha tokens >= 4 chars
- # Strip trailing punctuation AND IPA brackets
- clean = token
- # Remove trailing IPA like [dɪsˈɪʒən] first
- bracket_pos = clean.find('[')
- suffix_ipa = ""
- if bracket_pos > 0:
- suffix_ipa = clean[bracket_pos:]
- clean = clean[:bracket_pos]
- suffix_punct = ""
- stripped = clean.rstrip(".,!?;:'\")")
- if stripped != clean:
- suffix_punct = clean[len(stripped):]
- clean = stripped
- suffix = suffix_punct + suffix_ipa
- # Handle contractions: "solet's" → try "solet" + "'s"
- contraction = ""
- if "'" in clean and clean.index("'") >= 2:
- apos_pos = clean.index("'")
- contraction = clean[apos_pos:]
- clean = clean[:apos_pos]
- suffix = contraction + suffix
- if len(clean) >= 4 and clean.isalpha():
- split = _try_split_merged_word(clean)
- if split:
- parts.append(split + suffix)
- changed = True
- continue
- parts.append(token)
- if changed:
- cell["text"] = " ".join(parts)
- split_count += 1
- if split_count:
- logger.info("build-grid session %s: split %d merged words", session_id, split_count)
- except ImportError:
- pass
-
- # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
- # Matches any [bracket] directly after a letter, as long as the bracket
- # content doesn't look like a normal text annotation (e.g. "[adj]", "[noun]").
- _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
- for z in zones_data:
- for cell in z.get("cells", []):
- text = cell.get("text", "")
- if text and "[" in text:
- fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
- if fixed != text:
- cell["text"] = fixed
-
- # --- SmartSpellChecker: language-aware OCR correction on all cells ---
- try:
- from smart_spell import SmartSpellChecker
- _ssc = SmartSpellChecker()
- spell_fix_count = 0
-
- # Determine language per column:
- # en_col_type was already detected (column with IPA = English).
- # All other content columns are assumed German for vocab tables.
- # For single/two-column layouts, use auto-detection.
- for z in zones_data:
- zone_cols = z.get("columns", [])
- for cell in z.get("cells", []):
- text = cell.get("text", "")
- if not text or not text.strip():
- continue
- ct = cell.get("col_type", "")
- if not ct.startswith("column_"):
- continue
-
- # Determine language for this cell
- if total_cols >= 3 and en_col_type:
- lang = "en" if ct == en_col_type else "de"
- elif total_cols <= 2:
- lang = "auto" # auto-detect for non-vocab layouts
- else:
- lang = "auto"
-
- result = _ssc.correct_text(text, lang=lang)
- if result.changed:
- cell["text"] = result.corrected
- spell_fix_count += 1
-
- if spell_fix_count:
- logger.info(
- "build-grid session %s: SmartSpellChecker fixed %d cells",
- session_id, spell_fix_count,
- )
- except ImportError:
- logger.debug("SmartSpellChecker not available in build-grid")
- except Exception as e:
- logger.warning("SmartSpellChecker error in build-grid: %s", e)
-
- # --- Debug: log cell counts per column before empty-column removal ---
- for z in zones_data:
- if z.get("zone_type") == "content":
- from collections import Counter as _Counter
- _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
- _cols = z.get("columns", [])
- logger.info(
- "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
- z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
- )
-
- # --- Remove empty columns (no cells assigned) ---
- for z in zones_data:
- cells = z.get("cells", [])
- used_col_indices = {c.get("col_index") for c in cells}
- old_cols = z.get("columns", [])
- new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
- if len(new_cols) < len(old_cols):
- # Re-index columns and cells
- old_to_new = {}
- for new_i, col in enumerate(new_cols):
- old_i = col.get("col_index", col.get("index", new_i))
- old_to_new[old_i] = new_i
- col["col_index"] = new_i
- col["index"] = new_i
- col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
- for cell in cells:
- old_ci = cell.get("col_index", 0)
- cell["col_index"] = old_to_new.get(old_ci, old_ci)
- cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
- z["columns"] = new_cols
-
- # Clean up internal flags before returning
- for z in zones_data:
- for cell in z.get("cells", []):
- cell.pop("_ipa_corrected", None)
-
- result = {
- "session_id": session_id,
- "image_width": img_w,
- "image_height": img_h,
- "zones": zones_data,
- "boxes_detected": boxes_detected,
- "summary": {
- "total_zones": len(zones_data),
- "total_columns": total_columns,
- "total_rows": total_rows,
- "total_cells": total_cells,
- "total_words": len(all_words),
- "recovered_colored": recovered_count,
- "color_stats": color_stats,
- },
- "formatting": {
- "bold_columns": [],
- "header_rows": [],
- },
- "layout_metrics": {
- "page_width_px": img_w,
- "page_height_px": img_h,
- "avg_row_height_px": round(avg_row_height, 1),
- "font_size_suggestion_px": font_size_suggestion,
- },
- "dictionary_detection": {
- "is_dictionary": dict_detection.get("is_dictionary", False),
- "confidence": dict_detection.get("confidence", 0.0),
- "signals": dict_detection.get("signals", {}),
- "article_col_index": dict_detection.get("article_col_index"),
- "headword_col_index": dict_detection.get("headword_col_index"),
- },
- "processing_modes": {
- "ipa_mode": ipa_mode,
- "syllable_mode": syllable_mode,
- "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
- "syllables_applied": syllable_insertions > 0,
- },
- "page_number": page_number_info,
- "duration_seconds": round(duration, 2),
- }
-
- return result
-
-
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------