From 17f0fdb2ed60e0e0f273b79dd733abe690f9d9ca Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 15 Apr 2026 08:54:55 +0200 Subject: [PATCH] Refactor: extract _build_grid_core into grid_build_core.py + clean StepAnsicht MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit grid_editor_api.py: 2411 → 474 lines - Extracted _build_grid_core() (1892 lines) into grid_build_core.py - API file now only contains endpoints (build, save, get, gutter, box, unified) StepAnsicht.tsx: 212 → 112 lines - Removed useGridEditor imports (not needed for read-only spreadsheet) - Removed unified grid fetch/build (not used with multi-sheet approach) - Removed Spreadsheet/Grid toggle (only spreadsheet mode now) - Simple: fetch grid-editor data → pass to SpreadsheetView Co-Authored-By: Claude Opus 4.6 (1M context) --- .../components/ocr-kombi/StepAnsicht.tsx | 172 +- klausur-service/backend/grid_build_core.py | 1943 ++++++++++++++++ klausur-service/backend/grid_editor_api.py | 1945 +---------------- 3 files changed, 1982 insertions(+), 2078 deletions(-) create mode 100644 klausur-service/backend/grid_build_core.py diff --git a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx index 74bd858..6633df1 100644 --- a/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx +++ b/admin-lehrer/components/ocr-kombi/StepAnsicht.tsx @@ -1,19 +1,15 @@ 'use client' /** - * StepAnsicht — Unified Grid View. + * StepAnsicht — Excel-like Spreadsheet View. * * Left: Original scan with OCR word overlay - * Right: Unified grid (single zone, boxes integrated) rendered via GridTable + * Right: Fortune Sheet spreadsheet with multi-sheet tabs per zone */ -import { useCallback, useEffect, useRef, useState } from 'react' +import { useEffect, useRef, useState } from 'react' import dynamic from 'next/dynamic' -import { useGridEditor } from '@/components/grid-editor/useGridEditor' -import { GridTable } from '@/components/grid-editor/GridTable' -import type { GridZone } from '@/components/grid-editor/types' -// Lazy-load SpreadsheetView (Fortune Sheet, SSR-incompatible) const SpreadsheetView = dynamic( () => import('./SpreadsheetView').then((m) => m.SpreadsheetView), { ssr: false, loading: () =>
Spreadsheet wird geladen...
}, @@ -27,67 +23,29 @@ interface StepAnsichtProps { } export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) { - const gridEditor = useGridEditor(sessionId) - const { - loading, error, selectedCell, setSelectedCell, - updateCellText, toggleColumnBold, toggleRowHeader, - getAdjacentCell, deleteColumn, addColumn, deleteRow, addRow, - commitUndoPoint, selectedCells, toggleCellSelection, - clearCellSelection, toggleSelectedBold, setCellColor, - saveGrid, saving, dirty, undo, redo, canUndo, canRedo, - } = gridEditor - - const [unifiedGrid, setUnifiedGrid] = useState(null) - const [building, setBuilding] = useState(false) - const [buildError, setBuildError] = useState(null) + const [gridData, setGridData] = useState(null) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) const leftRef = useRef(null) const [leftHeight, setLeftHeight] = useState(600) - const [viewMode, setViewMode] = useState<'spreadsheet' | 'grid'>('spreadsheet') - // Build unified grid - const buildUnified = useCallback(async () => { - if (!sessionId) return - setBuilding(true) - setBuildError(null) - try { - const res = await fetch( - `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/build-unified-grid`, - { method: 'POST' }, - ) - if (!res.ok) { - const d = await res.json().catch(() => ({})) - throw new Error(d.detail || `HTTP ${res.status}`) - } - const data = await res.json() - setUnifiedGrid(data) - } catch (e) { - setBuildError(e instanceof Error ? e.message : String(e)) - } finally { - setBuilding(false) - } - }, [sessionId]) - - // Load both grids on mount + // Load grid data on mount useEffect(() => { if (!sessionId) return - // Load multi-zone grid (for spreadsheet mode) - gridEditor.loadGrid() - // Load unified grid (for grid mode) ;(async () => { try { - const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/unified-grid`) - if (res.ok) { - setUnifiedGrid(await res.json()) - } else { - buildUnified() - } - } catch { - buildUnified() + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/grid-editor`) + if (!res.ok) throw new Error(`HTTP ${res.status}`) + setGridData(await res.json()) + } catch (e) { + setError(e instanceof Error ? e.message : 'Fehler beim Laden') + } finally { + setLoading(false) } })() - }, [sessionId]) // eslint-disable-line react-hooks/exhaustive-deps + }, [sessionId]) - // Track left panel height for sync + // Track left panel height useEffect(() => { if (!leftRef.current) return const ro = new ResizeObserver(([e]) => setLeftHeight(e.contentRect.height)) @@ -95,13 +53,20 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) { return () => ro.disconnect() }, []) - const unifiedZone: GridZone | null = unifiedGrid?.zones?.[0] ?? null - - if (loading || building) { + if (loading) { return (
- {building ? 'Baue Unified Grid...' : 'Lade...'} + Lade Spreadsheet... +
+ ) + } + + if (error || !gridData) { + return ( +
+

{error || 'Keine Grid-Daten.'}

+
) } @@ -111,51 +76,16 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) { {/* Header */}
-

Ansicht — Unified Grid

+

Ansicht — Spreadsheet

- Alle Inhalte in einem Grid. Boxen sind integriert (farbig markiert). - {unifiedGrid && ( - - {unifiedGrid.summary?.total_rows} Zeilen × {unifiedGrid.summary?.total_columns} Spalten - {unifiedGrid.dominant_row_h && ` · Zeilenhöhe: ${Math.round(unifiedGrid.dominant_row_h)}px`} - - )} + Jede Zone als eigenes Sheet-Tab. Spaltenbreiten pro Sheet optimiert.

-
-
- - -
- - -
+
- {(error || buildError) && ( -
- {error || buildError} -
- )} - {/* Split view */}
{/* LEFT: Original + OCR overlay */} @@ -170,41 +100,9 @@ export function StepAnsicht({ sessionId, onNext }: StepAnsichtProps) { )}
- {/* RIGHT: Spreadsheet or Grid view */} -
- {viewMode === 'spreadsheet' && (unifiedGrid || gridEditor.grid) ? ( - - ) : viewMode === 'grid' && unifiedZone ? ( -
-
- Grid View ({unifiedGrid?.summary?.total_rows}×{unifiedGrid?.summary?.total_columns}) -
- { - const next = getAdjacentCell(cellId, dir) - if (next) setSelectedCell(next) - }} - onDeleteColumn={deleteColumn} - onAddColumn={addColumn} - onDeleteRow={deleteRow} - onAddRow={addRow} - onToggleCellSelection={toggleCellSelection} - onSetCellColor={setCellColor} - /> -
- ) : ( -
-

Kein Unified Grid verfügbar.

- -
- )} + {/* RIGHT: Fortune Sheet */} +
+
diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py new file mode 100644 index 0000000..cab5277 --- /dev/null +++ b/klausur-service/backend/grid_build_core.py @@ -0,0 +1,1943 @@ +""" +Grid Build Core — the main _build_grid_core() function. + +Extracted from grid_editor_api.py for maintainability. +Takes merged OCR word positions and builds a structured, zone-aware grid. +""" + +import logging +import re +import time +from typing import Any, Dict, List, Optional, Tuple + +import cv2 +import numpy as np + +from cv_box_detect import detect_boxes, split_page_into_zones +from cv_graphic_detect import detect_graphic_elements +from cv_vocab_types import PageZone +from cv_color_detect import detect_word_colors, recover_colored_text +from cv_ocr_engines import ( + fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, + _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines, +) +from ocr_pipeline_session_store import get_session_image + +from grid_editor_helpers import ( + _filter_border_strip_words, + _cluster_columns_by_alignment, + _GRID_GHOST_CHARS, + _filter_border_ghosts, + _MARKER_CHARS, + _merge_inline_marker_columns, + _flatten_word_boxes, + _words_in_zone, + _PIPE_RE_VSPLIT, + _detect_vertical_dividers, + _split_zone_at_vertical_dividers, + _merge_content_zones_across_boxes, + _detect_heading_rows_by_color, + _detect_heading_rows_by_single_cell, + _detect_header_rows, + _build_zone_grid, + _get_content_bounds, + _filter_decorative_margin, + _filter_footer_words, + _filter_header_junk, +) + +logger = logging.getLogger(__name__) + + +async def _build_grid_core( + session_id: str, + session: dict, + *, + ipa_mode: str = "auto", + syllable_mode: str = "auto", +) -> dict: + """Core grid building logic — pure computation, no HTTP or DB side effects. + + Args: + session_id: Session identifier (for logging and image loading). + session: Full session dict from get_session_db(). + ipa_mode: "auto" (only when English headwords detected), "all" + (force IPA on all content columns), "en" (English column only), + "de" (German/definition columns only), or "none" (skip entirely). + syllable_mode: "auto" (only when original has pipe dividers), + "all" (force syllabification on all words), "en" (English only), + "de" (German only), or "none" (skip). + + Returns: + StructuredGrid result dict. + + Raises: + ValueError: If session data is incomplete. + """ + t0 = time.time() + + # 1. Validate and load word results + word_result = session.get("word_result") + if not word_result or not word_result.get("cells"): + raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.") + + img_w = word_result.get("image_width", 0) + img_h = word_result.get("image_height", 0) + if not img_w or not img_h: + raise ValueError("Missing image dimensions in word_result") + + # 2. Flatten all word boxes from cells + all_words = _flatten_word_boxes(word_result["cells"]) + if not all_words: + raise ValueError("No word boxes found in cells") + + logger.info("build-grid session %s: %d words from %d cells", + session_id, len(all_words), len(word_result["cells"])) + + # 2b. Filter decorative margin columns (alphabet graphics). + # Some worksheets have a decorative alphabet strip along one margin + # (A-Z in a graphic). OCR reads these as single-char words aligned + # vertically. Detect and remove them before grid building. + margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id) + margin_strip_detected = margin_strip_info.get("found", False) + + # Read document_category from session (user-selected or auto-detected) + document_category = session.get("document_category") + + # 2c. Filter footer rows (page numbers at the very bottom). + # Isolated short text in the bottom 5% of the page is typically a + # page number ("64", "S. 12") and not real content. The page number + # is extracted as metadata for the frontend header display. + page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) + + # 2c2. Filter OCR junk from header illustrations. + # Low-confidence short fragments above the first real content row. + _filter_header_junk(all_words, img_h, logger, session_id) + + # 2d. Filter words inside user-defined exclude regions (from Structure step). + # These are explicitly marked by the user, so ALL words inside are removed + # regardless of confidence. + structure_result = session.get("structure_result") + exclude_rects = [] + if structure_result: + for er in structure_result.get("exclude_regions", []): + exclude_rects.append({ + "x": er["x"], "y": er["y"], + "w": er["w"], "h": er["h"], + }) + if exclude_rects: + before = len(all_words) + filtered = [] + for w in all_words: + w_cx = w["left"] + w.get("width", 0) / 2 + w_cy = w["top"] + w.get("height", 0) / 2 + inside = any( + er["x"] <= w_cx <= er["x"] + er["w"] + and er["y"] <= w_cy <= er["y"] + er["h"] + for er in exclude_rects + ) + if not inside: + filtered.append(w) + removed = before - len(filtered) + if removed: + all_words = filtered + logger.info( + "build-grid session %s: removed %d words inside %d user exclude region(s)", + session_id, removed, len(exclude_rects), + ) + + # 2e. Hard-filter words inside graphic/image regions from structure step. + # ALL words inside graphic regions are removed regardless of confidence — + # images cannot contain real text; any OCR words inside are artifacts. + # After image loading (Step 3a) we augment these with freshly detected + # graphic regions from cv_graphic_detect. + graphic_rects: List[Dict[str, int]] = [] + if structure_result: + for g in structure_result.get("graphics", []): + graphic_rects.append({ + "x": g["x"], "y": g["y"], + "w": g["w"], "h": g["h"], + }) + if graphic_rects: + before = len(all_words) + all_words = [ + w for w in all_words + if not any( + gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + ] + removed = before - len(all_words) + if removed: + logger.info( + "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)", + session_id, removed, len(graphic_rects), + ) + + # 3. Load image for box detection + img_png = await get_session_image(session_id, "cropped") + if not img_png: + img_png = await get_session_image(session_id, "dewarped") + if not img_png: + img_png = await get_session_image(session_id, "original") + + zones_data: List[Dict[str, Any]] = [] + boxes_detected = 0 + recovered_count = 0 + border_prefiltered = False + img_bgr = None + + content_x, content_y, content_w, content_h = _get_content_bounds(all_words) + + if img_png: + # Decode image for color detection + box detection + arr = np.frombuffer(img_png, dtype=np.uint8) + img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) + + if img_bgr is not None: + # --- 3a. Detect graphic/image regions via CV and hard-filter --- + # Pass only significant words (len >= 3) to the detector so that + # short OCR artifacts inside images don't fool the text-vs-graphic + # heuristic (it counts word centroids to distinguish text from images). + sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3] + fresh_graphics = detect_graphic_elements(img_bgr, sig_words) + if fresh_graphics: + fresh_rects = [ + {"x": g.x, "y": g.y, "w": g.width, "h": g.height} + for g in fresh_graphics + ] + graphic_rects.extend(fresh_rects) + logger.info( + "build-grid session %s: detected %d graphic region(s) via CV", + session_id, len(fresh_graphics), + ) + # Hard-filter words inside newly detected graphic regions + before = len(all_words) + all_words = [ + w for w in all_words + if not any( + gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in fresh_rects + ) + ] + removed = before - len(all_words) + if removed: + logger.info( + "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)", + session_id, removed, len(fresh_rects), + ) + + # --- Recover colored text that OCR missed (before grid building) --- + recovered = recover_colored_text(img_bgr, all_words) + if recovered and graphic_rects: + # Filter recovered chars inside graphic regions + recovered = [ + r for r in recovered + if not any( + gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + ] + if recovered: + recovered_count = len(recovered) + all_words.extend(recovered) + logger.info( + "build-grid session %s: +%d recovered colored words", + session_id, recovered_count, + ) + + # Detect bordered boxes + boxes = detect_boxes( + img_bgr, + content_x=content_x, + content_w=content_w, + content_y=content_y, + content_h=content_h, + ) + boxes_detected = len(boxes) + + if boxes: + # Filter border ghost words before grid building + all_words, ghost_count = _filter_border_ghosts(all_words, boxes) + if ghost_count: + logger.info( + "build-grid session %s: removed %d border ghost words", + session_id, ghost_count, + ) + + # Split page into zones + page_zones = split_page_into_zones( + content_x, content_y, content_w, content_h, boxes + ) + + # Merge content zones separated by box zones + page_zones = _merge_content_zones_across_boxes( + page_zones, content_x, content_w + ) + + # 3b. Detect vertical dividers and split content zones + vsplit_group_counter = 0 + expanded_zones: List = [] + for pz in page_zones: + if pz.zone_type != "content": + expanded_zones.append(pz) + continue + zone_words = _words_in_zone( + all_words, pz.y, pz.height, pz.x, pz.width + ) + divider_xs = _detect_vertical_dividers( + zone_words, pz.x, pz.width, pz.y, pz.height + ) + if divider_xs: + sub_zones = _split_zone_at_vertical_dividers( + pz, divider_xs, vsplit_group_counter + ) + expanded_zones.extend(sub_zones) + vsplit_group_counter += 1 + # Remove pipe words so they don't appear in sub-zones + pipe_ids = set( + id(w) for w in zone_words + if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) + ) + all_words[:] = [w for w in all_words if id(w) not in pipe_ids] + logger.info( + "build-grid: vertical split zone %d at x=%s → %d sub-zones", + pz.index, [int(x) for x in divider_xs], len(sub_zones), + ) + else: + expanded_zones.append(pz) + # Re-index zones + for i, pz in enumerate(expanded_zones): + pz.index = i + page_zones = expanded_zones + + # --- Union columns from all content zones --- + # Each content zone detects columns independently. Narrow + # columns (page refs, markers) may appear in only one zone. + # Merge column split-points from ALL content zones so every + # zone shares the full column set. + # NOTE: Zones from a vertical split are independent and must + # NOT share columns with each other. + + # First pass: build grids per zone independently + zone_grids: List[Dict] = [] + + for pz in page_zones: + zone_words = _words_in_zone( + all_words, pz.y, pz.height, pz.x, pz.width + ) + if pz.zone_type == "content": + logger.info( + "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words", + pz.index, pz.zone_type, + pz.x, pz.x + pz.width, pz.y, pz.y + pz.height, + len(zone_words), len(all_words), + ) + # Filter recovered single-char artifacts in ALL zones + # (decorative colored pixel blobs like !, ?, • from + # recover_colored_text that don't represent real text) + before = len(zone_words) + zone_words = [ + w for w in zone_words + if not ( + w.get("recovered") + and len(w.get("text", "").strip()) <= 2 + ) + ] + removed = before - len(zone_words) + if removed: + logger.info( + "build-grid: filtered %d recovered artifacts from %s zone %d", + removed, pz.zone_type, pz.index, + ) + # Filter words inside image overlay regions (merged box zones) + if pz.image_overlays: + before_ov = len(zone_words) + zone_words = [ + w for w in zone_words + if not any( + ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] + and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] + for ov in pz.image_overlays + ) + ] + ov_removed = before_ov - len(zone_words) + if ov_removed: + logger.info( + "build-grid: filtered %d words inside image overlays from zone %d", + ov_removed, pz.index, + ) + zone_words, bs_removed = _filter_border_strip_words(zone_words) + if bs_removed: + border_prefiltered = True + logger.info( + "build-grid: pre-filtered %d border-strip words from zone %d", + bs_removed, pz.index, + ) + grid = _build_zone_grid( + zone_words, pz.x, pz.y, pz.width, pz.height, + pz.index, img_w, img_h, + skip_first_row_header=bool(pz.image_overlays), + ) + zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) + + # Second pass: merge column boundaries from all content zones + # Exclude zones from vertical splits — they have independent columns. + content_zones = [ + zg for zg in zone_grids + if zg["pz"].zone_type == "content" + and zg["pz"].vsplit_group is None + ] + if len(content_zones) > 1: + # Collect column split points (x_min of non-first columns) + all_split_xs: List[float] = [] + for zg in content_zones: + raw_cols = zg["grid"].get("_raw_columns", []) + for col in raw_cols[1:]: + all_split_xs.append(col["x_min"]) + + if all_split_xs: + all_split_xs.sort() + merge_distance = max(25, int(content_w * 0.03)) + merged_xs = [all_split_xs[0]] + for x in all_split_xs[1:]: + if x - merged_xs[-1] < merge_distance: + merged_xs[-1] = (merged_xs[-1] + x) / 2 + else: + merged_xs.append(x) + + total_cols = len(merged_xs) + 1 + max_zone_cols = max( + len(zg["grid"].get("_raw_columns", [])) + for zg in content_zones + ) + + # Apply union whenever it has at least as many + # columns as the best single zone. Even with the + # same count the union boundaries are better because + # they incorporate evidence from all zones. + if total_cols >= max_zone_cols: + cx_min = min(w["left"] for w in all_words) + cx_max = max( + w["left"] + w["width"] for w in all_words + ) + merged_columns: List[Dict[str, Any]] = [] + prev_x = cx_min + for i, sx in enumerate(merged_xs): + merged_columns.append({ + "index": i, + "type": f"column_{i + 1}", + "x_min": prev_x, + "x_max": sx, + }) + prev_x = sx + merged_columns.append({ + "index": len(merged_xs), + "type": f"column_{len(merged_xs) + 1}", + "x_min": prev_x, + "x_max": cx_max, + }) + + # Re-build ALL content zones with merged columns + for zg in zone_grids: + pz = zg["pz"] + if pz.zone_type == "content": + grid = _build_zone_grid( + zg["words"], pz.x, pz.y, + pz.width, pz.height, + pz.index, img_w, img_h, + global_columns=merged_columns, + skip_first_row_header=bool(pz.image_overlays), + ) + zg["grid"] = grid + logger.info( + "build-grid session %s: union of %d content " + "zones → %d merged columns (max single zone: %d)", + session_id, len(content_zones), + total_cols, max_zone_cols, + ) + + for zg in zone_grids: + pz = zg["pz"] + grid = zg["grid"] + # Remove internal _raw_columns before adding to response + grid.pop("_raw_columns", None) + + zone_entry: Dict[str, Any] = { + "zone_index": pz.index, + "zone_type": pz.zone_type, + "bbox_px": { + "x": pz.x, "y": pz.y, + "w": pz.width, "h": pz.height, + }, + "bbox_pct": { + "x": round(pz.x / img_w * 100, 2) if img_w else 0, + "y": round(pz.y / img_h * 100, 2) if img_h else 0, + "w": round(pz.width / img_w * 100, 2) if img_w else 0, + "h": round(pz.height / img_h * 100, 2) if img_h else 0, + }, + "border": None, + "word_count": len(zg["words"]), + **grid, + } + + if pz.box: + zone_entry["border"] = { + "thickness": pz.box.border_thickness, + "confidence": pz.box.confidence, + } + + if pz.image_overlays: + zone_entry["image_overlays"] = pz.image_overlays + + if pz.layout_hint: + zone_entry["layout_hint"] = pz.layout_hint + if pz.vsplit_group is not None: + zone_entry["vsplit_group"] = pz.vsplit_group + + zones_data.append(zone_entry) + + # 4. Fallback: no boxes detected → single zone with all words + if not zones_data: + # Filter recovered single-char artifacts (same as in zone loop above) + before = len(all_words) + filtered_words = [ + w for w in all_words + if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) + ] + removed = before - len(filtered_words) + if removed: + logger.info( + "build-grid session %s: filtered %d recovered artifacts (fallback zone)", + session_id, removed, + ) + # Pre-filter border-strip words so column detection is not + # confused by edge artifacts. When this removes words, Step 4e + # is skipped (it would otherwise re-detect content as a "strip"). + filtered_words, bs_removed = _filter_border_strip_words(filtered_words) + if bs_removed: + border_prefiltered = True + logger.info( + "build-grid session %s: pre-filtered %d border-strip words", + session_id, bs_removed, + ) + grid = _build_zone_grid( + filtered_words, content_x, content_y, content_w, content_h, + 0, img_w, img_h, + ) + grid.pop("_raw_columns", None) + zones_data.append({ + "zone_index": 0, + "zone_type": "content", + "bbox_px": { + "x": content_x, "y": content_y, + "w": content_w, "h": content_h, + }, + "bbox_pct": { + "x": round(content_x / img_w * 100, 2) if img_w else 0, + "y": round(content_y / img_h * 100, 2) if img_h else 0, + "w": round(content_w / img_w * 100, 2) if img_w else 0, + "h": round(content_h / img_h * 100, 2) if img_h else 0, + }, + "border": None, + "word_count": len(all_words), + **grid, + }) + + # 4b. Remove junk rows: rows where ALL cells contain only short, + # low-confidence text (OCR noise, stray marks). Real vocabulary rows + # have at least one word with conf >= 50 or meaningful text length. + # Also remove "oversized stub" rows: rows with ≤2 very short words + # whose word-boxes are significantly taller than the median (e.g. + # large red page numbers like "( 9" that are not real text content). + _JUNK_CONF_THRESHOLD = 50 + _JUNK_MAX_TEXT_LEN = 3 + for z in zones_data: + cells = z.get("cells", []) + rows = z.get("rows", []) + if not cells or not rows: + continue + + # Compute median word height across the zone for oversized detection + all_wb_heights = [ + wb["height"] + for cell in cells + for wb in cell.get("word_boxes") or [] + if wb.get("height", 0) > 0 + ] + median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28 + + junk_row_indices = set() + for row in rows: + ri = row["index"] + row_cells = [c for c in cells if c.get("row_index") == ri] + if not row_cells: + continue + + row_wbs = [ + wb for cell in row_cells + for wb in cell.get("word_boxes") or [] + ] + + # Rule 1: ALL word_boxes are low-conf AND short text + all_junk = True + for wb in row_wbs: + text = (wb.get("text") or "").strip() + conf = wb.get("conf", 0) + if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: + all_junk = False + break + if all_junk and row_wbs: + junk_row_indices.add(ri) + continue + + # Rule 2: oversized stub — ≤3 words, short total text, + # and word height > 1.8× median (page numbers, stray marks, + # OCR from illustration labels like "SEA &") + # Skip if any word looks like a page reference (p.55, S.12). + if len(row_wbs) <= 3: + total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) + max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) + has_page_ref = any( + re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) + for wb in row_wbs + ) + if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: + junk_row_indices.add(ri) + continue + + # Rule 3: scattered debris — rows with only tiny fragments + # (e.g. OCR artifacts from illustrations/graphics). + # If the row has no word longer than 2 chars, it's noise. + longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs) + if longest <= 2: + junk_row_indices.add(ri) + continue + + if junk_row_indices: + z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] + z["rows"] = [r for r in rows if r["index"] not in junk_row_indices] + logger.info( + "build-grid: removed %d junk rows from zone %d: %s", + len(junk_row_indices), z["zone_index"], + sorted(junk_row_indices), + ) + + # 4b2. Remove individual cells that consist of a single very-short, + # low-confidence word (OCR artifacts like "as", "b" from stray marks). + # These survive row-level junk removal when the row has valid cells + # in other columns. + _ARTIFACT_MAX_LEN = 2 + _ARTIFACT_CONF_THRESHOLD = 65 + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + artifact_ids = set() + for cell in cells: + wbs = cell.get("word_boxes") or [] + if len(wbs) != 1: + continue + wb = wbs[0] + text = (wb.get("text") or "").strip() + conf = wb.get("conf", 100) + if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD: + artifact_ids.add(cell.get("cell_id")) + if artifact_ids: + z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids] + logger.info( + "build-grid: removed %d artifact cells from zone %d: %s", + len(artifact_ids), z.get("zone_index", 0), + [c.get("text") for c in cells if c.get("cell_id") in artifact_ids], + ) + + # 4c. Remove oversized word_boxes from individual cells. + # OCR artifacts from graphics/images (e.g. a huge "N" from a map image) + # have word heights 3-5x the median. Remove them per-word so they don't + # pollute cells that also contain valid text in other columns. + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wh = [ + wb["height"] + for cell in cells + for wb in cell.get("word_boxes") or [] + if wb.get("height", 0) > 0 + ] + if not all_wh: + continue + med_h = sorted(all_wh)[len(all_wh) // 2] + oversized_threshold = med_h * 3 + removed_oversized = 0 + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] + if len(filtered) < len(wbs): + removed_oversized += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + if removed_oversized: + # Remove cells that became empty after oversized removal + z["cells"] = [c for c in cells if c.get("word_boxes")] + logger.info( + "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", + removed_oversized, oversized_threshold, z.get("zone_index", 0), + ) + + # 4d. Remove pipe-character word_boxes (column divider artifacts). + # OCR reads physical vertical divider lines as "|" or "||" characters. + # These sit at consistent x positions near column boundaries and pollute + # cell text. Remove them from word_boxes and rebuild cell text. + # NOTE: Zones from a vertical split already had pipes removed in step 3b. + _PIPE_RE = re.compile(r"^\|+$") + for z in zones_data: + if z.get("vsplit_group") is not None: + continue # pipes already removed before split + removed_pipes = 0 + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())] + if len(filtered) < len(wbs): + removed_pipes += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + # Remove cells that became empty after pipe removal + if removed_pipes: + z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info( + "build-grid: removed %d pipe-divider word_boxes from zone %d", + removed_pipes, z.get("zone_index", 0), + ) + + # Strip pipe chars ONLY from word_boxes/cells where the pipe is an + # OCR column-divider artifact. Preserve pipes that are embedded in + # words as syllable separators (e.g. "zu|trau|en") — these are + # intentional and used in dictionary Ground Truth. + for z in zones_data: + for cell in z.get("cells", []): + for wb in cell.get("word_boxes", []): + wbt = wb.get("text", "") + # Only strip if the ENTIRE word_box is just pipe(s) + # (handled by _PIPE_RE above) — leave embedded pipes alone + text = cell.get("text", "") + if "|" in text: + # Only strip leading/trailing pipes (OCR artifacts at cell edges) + cleaned = text.strip("|").strip() + if cleaned != text.strip(): + cell["text"] = cleaned + + # 4d2. Normalize narrow connector columns. + # In synonym dictionaries a narrow column repeats the same word + # (e.g. "oder") in every row. OCR sometimes appends noise chars + # (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column + # share the same short text, normalize near-match outliers. + for z in zones_data: + cols = z.get("columns", []) + cells = z.get("cells", []) + if not cols or not cells: + continue + for col in cols: + ci = col.get("index") + col_cells = [c for c in cells if c.get("col_index") == ci] + if len(col_cells) < 3: + continue + # Count text occurrences + text_counts: Dict[str, int] = {} + for c in col_cells: + t = (c.get("text") or "").strip() + if t: + text_counts[t] = text_counts.get(t, 0) + 1 + if not text_counts: + continue + dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] + dominant_count = text_counts[dominant_text] + # Only normalize if dominant word is short and appears in ≥60% + if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: + continue + # Fix outliers that start with the dominant text + fixed = 0 + for c in col_cells: + t = (c.get("text") or "").strip() + if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: + c["text"] = dominant_text + # Also fix word_boxes + wbs = c.get("word_boxes") or [] + if len(wbs) == 1: + wbs[0]["text"] = dominant_text + fixed += 1 + if fixed: + logger.info( + "build-grid: normalized %d outlier cells in connector column %d " + "(dominant='%s') zone %d", + fixed, ci, dominant_text, z.get("zone_index", 0), + ) + + # 4e. Detect and remove page-border decoration strips. + # Skipped when the pre-filter already removed border words BEFORE + # column detection — re-running would incorrectly detect the + # leftmost content column as a "strip". + border_strip_removed = 0 + if border_prefiltered: + logger.info("Step 4e: skipped (border pre-filter already applied)") + else: + # Some textbooks have decorative alphabet strips along the page + # edge. OCR picks up scattered letters from these as artifacts. + # Detection: find the first significant x-gap (>30 px) from each + # page edge between a small cluster (<20 %) and the main content. + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) + for cell in cells: + for wb in cell.get("word_boxes") or []: + all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) + if len(all_wbs_with_cell) < 10: + continue + all_wbs_with_cell.sort(key=lambda t: t[0]) + total = len(all_wbs_with_cell) + + # -- Left-edge scan -- + left_strip_count = 0 + left_gap = 0 + running_right = 0 + for gi in range(total - 1): + running_right = max( + running_right, + all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), + ) + gap = all_wbs_with_cell[gi + 1][0] - running_right + if gap > 30: + left_strip_count = gi + 1 + left_gap = gap + break + + # -- Right-edge scan -- + right_strip_count = 0 + right_gap = 0 + running_left = all_wbs_with_cell[-1][0] + for gi in range(total - 1, 0, -1): + running_left = min(running_left, all_wbs_with_cell[gi][0]) + prev_right = ( + all_wbs_with_cell[gi - 1][0] + + all_wbs_with_cell[gi - 1][1].get("width", 0) + ) + gap = running_left - prev_right + if gap > 30: + right_strip_count = total - gi + right_gap = gap + break + + strip_wbs: set = set() + strip_side = "" + strip_gap = 0 + strip_count = 0 + if left_strip_count > 0 and left_strip_count / total < 0.20: + strip_side = "left" + strip_count = left_strip_count + strip_gap = left_gap + strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} + elif right_strip_count > 0 and right_strip_count / total < 0.20: + strip_side = "right" + strip_count = right_strip_count + strip_gap = right_gap + strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]} + + if not strip_wbs: + continue + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if id(wb) not in strip_wbs] + if len(filtered) < len(wbs): + border_strip_removed += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + z["cells"] = [c for c in cells + if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info( + "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " + "(gap=%dpx, strip=%d/%d wbs)", + border_strip_removed, strip_side, z.get("zone_index", 0), + strip_gap, strip_count, total, + ) + + # 4f. Remove decorative edge columns (alphabet sidebar safety net). + # Dictionary pages have A-Z letter sidebars that OCR reads as single- + # character word_boxes. These form narrow columns with very short text. + # Detection: edge column where almost ALL cells are single characters. + for z in zones_data: + columns = z.get("columns", []) + cells = z.get("cells", []) + if len(columns) < 3 or not cells: + continue + # Group cells by col_type (skip spanning_header) + col_cells: Dict[str, List[Dict]] = {} + for cell in cells: + ct = cell.get("col_type", "") + if ct.startswith("column_"): + col_cells.setdefault(ct, []).append(cell) + col_types_ordered = sorted(col_cells.keys()) + if len(col_types_ordered) < 3: + continue + for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: + edge_cells_list = col_cells.get(edge_ct, []) + if len(edge_cells_list) < 3: + continue + # Key criterion: average text length and single-char ratio. + # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells + # are single characters. + texts = [(c.get("text") or "").strip() for c in edge_cells_list] + avg_len = sum(len(t) for t in texts) / len(texts) + single_char = sum(1 for t in texts if len(t) <= 1) + single_ratio = single_char / len(texts) + if avg_len > 1.5: + continue # real content has longer text + if single_ratio < 0.7: + continue # not dominated by single chars + # Remove this edge column + removed_count = len(edge_cells_list) + edge_ids = {id(c) for c in edge_cells_list} + z["cells"] = [c for c in cells if id(c) not in edge_ids] + z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] + logger.info( + "Step 4f: removed decorative edge column '%s' from zone %d " + "(%d cells, avg_len=%.1f, single_char=%.0f%%)", + edge_ct, z.get("zone_index", 0), removed_count, + avg_len, single_ratio * 100, + ) + break # only remove one edge per zone + + # 5. Color annotation on final word_boxes in cells + if img_bgr is not None: + all_wb: List[Dict] = [] + for z in zones_data: + for cell in z.get("cells", []): + all_wb.extend(cell.get("word_boxes", [])) + detect_word_colors(img_bgr, all_wb) + + # 5a. Heading detection by color + height (after color is available) + heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h) + if heading_count: + logger.info("Detected %d heading rows by color+height", heading_count) + + # 5b. Fix unmatched parentheses in cell text + # OCR often misses opening "(" while detecting closing ")". + # If a cell's text has ")" without a matching "(", prepend "(". + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if ")" in text and "(" not in text: + cell["text"] = "(" + text + + # 5c. IPA phonetic correction — replace garbled OCR phonetics with + # correct IPA from the dictionary (same as in the OCR pipeline). + # Only applies to vocabulary tables (≥3 columns: EN | article | DE). + # Single/two-column layouts are continuous text, not vocab tables. + all_cells = [cell for z in zones_data for cell in z.get("cells", [])] + total_cols = sum(len(z.get("columns", [])) for z in zones_data) + en_col_type = None + ipa_target_cols: set = set() + all_content_cols: set = set() + skip_ipa = (ipa_mode == "none") + + # When ipa_mode=none, strip ALL square brackets from ALL content columns + if skip_ipa: + _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]') + for cell in all_cells: + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + text = cell.get("text", "") + if "[" in text: + stripped = _SQUARE_BRACKET_RE_NONE.sub("", text) + if stripped != text: + cell["text"] = stripped.strip() + cell["_ipa_corrected"] = True + + if not skip_ipa and total_cols >= 3: + # Detect English headword column via IPA signals (brackets or garbled). + col_ipa_count: Dict[str, int] = {} + all_content_cols: set = set() + for cell in all_cells: + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + txt = cell.get("text", "") or "" + if txt.strip(): + all_content_cols.add(ct) + if '[' in txt or _text_has_garbled_ipa(txt): + col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1 + if col_ipa_count: + en_col_type = max(col_ipa_count, key=col_ipa_count.get) + elif ipa_mode == "all": + # Force-all mode without auto-detection: pick column with most cells + col_cell_count: Dict[str, int] = {} + for cell in all_cells: + ct = cell.get("col_type", "") + if ct.startswith("column_") and (cell.get("text") or "").strip(): + col_cell_count[ct] = col_cell_count.get(ct, 0) + 1 + if col_cell_count: + en_col_type = max(col_cell_count, key=col_cell_count.get) + + # Decide which columns to process based on ipa_mode: + # auto/en: only the detected EN headword column (English IPA) + # de: all content columns EXCEPT the EN column (German IPA) + # all: EN column gets English IPA, other columns get German IPA + en_ipa_target_cols: set = set() + de_ipa_target_cols: set = set() + if ipa_mode in ("auto", "en"): + if en_col_type: + en_ipa_target_cols.add(en_col_type) + elif ipa_mode == "de": + de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols + elif ipa_mode == "all": + if en_col_type: + en_ipa_target_cols.add(en_col_type) + de_ipa_target_cols = all_content_cols - en_ipa_target_cols + + # --- Strip IPA from columns NOT in the target set --- + # When user selects "nur DE", English IPA from the OCR scan must + # be removed. When "none", all IPA is removed. + # In vocab columns, square brackets [...] are always IPA (both + # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]). + _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]') + strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols + if strip_en_ipa or ipa_mode == "none": + strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols + for cell in all_cells: + ct = cell.get("col_type", "") + if ct not in strip_cols: + continue + text = cell.get("text", "") + if "[" in text: + stripped = _SQUARE_BRACKET_RE.sub("", text) + if stripped != text: + cell["text"] = stripped.strip() + cell["_ipa_corrected"] = True + + # --- English IPA (Britfone + eng_to_ipa) --- + if en_ipa_target_cols: + for cell in all_cells: + ct = cell.get("col_type") + if ct in en_ipa_target_cols: + cell["_orig_col_type"] = ct + cell["col_type"] = "column_en" + _pre_ipa = {id(c): c.get("text", "") for c in all_cells} + fix_cell_phonetics(all_cells, pronunciation="british") + for cell in all_cells: + orig = cell.pop("_orig_col_type", None) + if orig: + cell["col_type"] = orig + if cell.get("text", "") != _pre_ipa.get(id(cell), ""): + cell["_ipa_corrected"] = True + + # --- German IPA (wiki-pronunciation-dict + epitran) --- + if de_ipa_target_cols: + from cv_ipa_german import insert_german_ipa + insert_german_ipa(all_cells, de_ipa_target_cols) + + ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols + + # Mark cells whose text was changed by IPA correction so that + # later steps (5i) don't overwrite the corrected text when + # reconstructing from word_boxes. (Already done inline above + # for English; insert_german_ipa sets _ipa_corrected too.) + for cell in all_cells: + if cell.get("text", "") != _pre_ipa.get(id(cell), ""): + cell["_ipa_corrected"] = True + + # 5d. Fix IPA continuation cells — cells where the printed + # phonetic transcription wraps to a line below the headword. + # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]"). + # Replace garbled text with proper IPA looked up from the + # headword in the previous row's same column. + # Note: We check ALL columns, not just en_col_type, because + # the EN headword column may not be the longest-average column. + _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") + ipa_cont_fixed = 0 + for z in ([] if skip_ipa else zones_data): + rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) + z_cells = z.get("cells", []) + for idx, row in enumerate(rows_sorted): + if idx == 0: + continue + ri = row["index"] + row_cells = [c for c in z_cells if c.get("row_index") == ri] + for cell in row_cells: + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + cell_text = (cell.get("text") or "").strip() + if not cell_text: + # Step 5c may have emptied garbled IPA cells like + # "[n, nn]" — recover text from word_boxes. + wb_texts = [w.get("text", "") + for w in cell.get("word_boxes", [])] + cell_text = " ".join(wb_texts).strip() + if not cell_text: + continue + + is_bracketed = ( + cell_text.startswith('[') and cell_text.endswith(']') + ) + + if is_bracketed: + # Bracketed continuation: "[n, nn]", "[klaoz 'daun]" + # Text like "employee [im'ploi:]" is NOT fully + # bracketed and won't match here. + if not _text_has_garbled_ipa(cell_text): + continue + # Already has proper IPA brackets → skip + if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): + continue + else: + # Unbracketed continuation: "ska:f – ska:vz", + # "'sekandarr sku:l". Only treat as IPA + # continuation if this is the ONLY content cell + # in the row (single-cell row) and the text is + # garbled IPA without real IPA Unicode symbols. + content_cells_in_row = [ + c for c in row_cells + if c.get("col_type", "").startswith("column_") + and c.get("col_type") != "column_1" + ] + if len(content_cells_in_row) != 1: + continue + if not _text_has_garbled_ipa(cell_text): + continue + # Has real IPA symbols → already fixed or valid + if any(c in _REAL_IPA_CHARS for c in cell_text): + continue + + # Find headword in previous row, same column + prev_ri = rows_sorted[idx - 1]["index"] + prev_same_col = [ + c for c in z_cells + if c.get("row_index") == prev_ri + and c.get("col_type") == ct + ] + if not prev_same_col: + continue + prev_text = prev_same_col[0].get("text", "") + fixed = fix_ipa_continuation_cell( + cell_text, prev_text, pronunciation="british", + ) + if fixed != cell_text: + cell["text"] = fixed + ipa_cont_fixed += 1 + logger.info( + "IPA continuation R%d %s: '%s' → '%s'", + ri, ct, cell_text, fixed, + ) + if ipa_cont_fixed: + logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) + + # 5e. Heading detection by single-cell rows — black headings like + # "Theme" that have normal color and height but are the ONLY cell + # in their row (excluding page_ref column_1). Must run AFTER 5d + # so IPA continuation cells are already processed. + single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h) + if single_heading_count: + logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) + + # 5f. Strip IPA from headings — headings detected in 5e ran AFTER + # IPA correction (5c), so they may have dictionary IPA appended + # (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the + # original text only. + for z in zones_data: + for cell in z.get("cells", []): + if cell.get("col_type") != "heading": + continue + text = cell.get("text", "") + # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme" + stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip() + if stripped and stripped != text: + cell["text"] = stripped + + # 5g. Extract page_ref cells and footer rows from content zones. + # Page references (column_1 cells like "p.70") sit in rows that + # also contain vocabulary — extract them as zone metadata without + # removing the row. Footer lines (e.g. "two hundred and twelve" + # = page number at bottom) are standalone rows that should be + # removed from the table entirely. + _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") + # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70" + _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') + _NUMBER_WORDS = { + "one", "two", "three", "four", "five", "six", "seven", + "eight", "nine", "ten", "eleven", "twelve", "thirteen", + "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", + "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", + "seventy", "eighty", "ninety", "hundred", "thousand", "and", + "einhundert", "zweihundert", "dreihundert", "vierhundert", + "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", + } + for z in zones_data: + if z.get("zone_type") != "content": + continue + cells = z.get("cells", []) + rows = z.get("rows", []) + if not rows: + continue + + # Extract column_1 cells that look like page references + page_refs = [] + page_ref_cell_ids = set() + for cell in cells: + if cell.get("col_type") != "column_1": + continue + text = (cell.get("text") or "").strip() + if not text: + continue + if not _PAGE_REF_RE.match(text): + continue + page_refs.append({ + "row_index": cell.get("row_index"), + "text": text, + "bbox_pct": cell.get("bbox_pct", {}), + }) + page_ref_cell_ids.add(cell.get("cell_id")) + + # Keep page_ref cells in the table as a visible column. + # Previously these were removed, but users want to see them. + # The metadata extraction above still populates zone["page_refs"] + # for the frontend header display. + + # Detect footer: last non-header row if it has only 1 cell + # with short, non-content text (page numbers like "233" or + # "two hundred and twelve"). Comma-separated lists and long + # text are content continuations, not page numbers. + footer_rows = [] + non_header_rows = [r for r in rows if not r.get("is_header")] + if non_header_rows: + last_row = non_header_rows[-1] + last_ri = last_row["index"] + last_cells = [c for c in z["cells"] + if c.get("row_index") == last_ri] + if len(last_cells) == 1: + text = (last_cells[0].get("text") or "").strip() + # Not IPA (no real IPA symbols) and not a heading + has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) + # Comma-separated text is a content continuation, not a footer + has_commas = ',' in text + # Written-out page numbers like "two hundred and nine" + text_words = set(text.lower().split()) + is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) + # Short text or written-out number + is_page_number = len(text) <= 20 or is_written_number + if (text and not has_real_ipa and not has_commas + and is_page_number + and last_cells[0].get("col_type") != "heading"): + footer_rows.append({ + "row_index": last_ri, + "text": text, + "bbox_pct": last_cells[0].get("bbox_pct", {}), + }) + + # Classify footer rows: page numbers are removed from the grid + # and promoted to page_number metadata; other footers stay as rows. + page_number_footers = [] + other_footers = [] + for fr in footer_rows: + ft = fr["text"].strip() + # Pure digits + digits = "".join(c for c in ft if c.isdigit()) + if digits and re.match(r'^[\d\s.]+$', ft): + page_number_footers.append(fr) + # Written-out numbers + elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): + page_number_footers.append(fr) + else: + other_footers.append(fr) + + # Remove page-number footer rows from grid entirely + if page_number_footers: + pn_ris = {fr["row_index"] for fr in page_number_footers} + z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] + z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] + # Set page_number metadata (use first one) + pn_text = page_number_footers[0]["text"].strip() + pn_digits = "".join(c for c in pn_text if c.isdigit()) + if not page_number_info: + page_number_info = { + "text": pn_text, + "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), + } + if pn_digits: + page_number_info["number"] = int(pn_digits) + + # Mark remaining footer rows (non-page-number content) + if other_footers: + footer_ris = {fr["row_index"] for fr in other_footers} + for r in z["rows"]: + if r["index"] in footer_ris: + r["is_footer"] = True + for c in z["cells"]: + if c.get("row_index") in footer_ris: + c["col_type"] = "footer" + + if page_refs or footer_rows: + logger.info( + "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", + len(page_refs), len(footer_rows), len(page_number_footers), + z.get("zone_index", 0), + ) + + # Store as zone-level metadata + if page_refs: + z["page_refs"] = page_refs + if other_footers: + z["footer"] = other_footers + + # 5h. Convert slash-delimited IPA to bracket notation. + # Dictionary-style pages print IPA between slashes: "tiger /'taiga/" + # Detect the pattern /ocr_ipa/ and replace with [dict_ipa] + # using the IPA dictionary when available, falling back to the OCR text. + # The regex requires a word character (or ² ³) right before the opening + # slash to avoid false positives like "sb/sth". + _SLASH_IPA_RE = re.compile( + r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1) + r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars + ) + # Standalone slash IPA at start of text (headword on previous line) + _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') + # IPA between slashes never contains spaces, parentheses, or commas. + # Reject matches that look like grammar: "sb/sth up a) jdn/" + _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') + slash_ipa_fixed = 0 + for z in ([] if skip_ipa else zones_data): + for cell in z.get("cells", []): + # Only process English headword column — avoid converting + # German text like "der/die/das" to IPA. + if en_col_type and cell.get("col_type") != en_col_type: + continue + text = cell.get("text", "") + if "/" not in text: + continue + + def _replace_slash_ipa(m: re.Match) -> str: + nonlocal slash_ipa_fixed + headword = m.group(1) + ocr_ipa = m.group(2) # includes slashes + inner_raw = ocr_ipa.strip("/").strip() + # Reject if inner content has spaces/parens/commas (grammar) + if _SLASH_IPA_REJECT_RE.search(inner_raw): + return m.group(0) + # Strip superscript digits for lookup + clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() + ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None + if ipa: + slash_ipa_fixed += 1 + return f"{headword} [{ipa}]" + # Fallback: keep OCR IPA but convert slashes to brackets + inner = inner_raw.lstrip("'").strip() + if inner: + slash_ipa_fixed += 1 + return f"{headword} [{inner}]" + return m.group(0) + + new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) + + # Second pass: convert remaining /ipa/ after [ipa] from first pass. + # Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant) + _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') + def _replace_trailing_slash(m: re.Match) -> str: + nonlocal slash_ipa_fixed + inner = m.group(1).strip("/").strip().lstrip("'").strip() + if _SLASH_IPA_REJECT_RE.search(inner): + return m.group(0) + if inner: + slash_ipa_fixed += 1 + return f" [{inner}]" + return m.group(0) + new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) + + # Handle standalone /ipa/ at start (no headword in this cell) + if new_text == text: + m = _STANDALONE_SLASH_IPA_RE.match(text) + if m: + inner = m.group(1).strip() + if not _SLASH_IPA_REJECT_RE.search(inner): + inner = inner.lstrip("'").strip() + if inner: + new_text = "[" + inner + "]" + text[m.end():] + slash_ipa_fixed += 1 + + if new_text != text: + cell["text"] = new_text + + if slash_ipa_fixed: + logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed) + + # 5i. Remove blue bullet/artifact word_boxes. + # Dictionary pages have small blue square bullets (■) before entries. + # OCR reads these as text artifacts (©, e, *, or even plausible words + # like "fighily" overlapping the real word "tightly"). + # Detection rules: + # a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black) + # b) Overlapping word_boxes: >40% x-overlap → remove lower confidence + # c) Duplicate text: consecutive blue wbs with identical text, gap < 6px + bullet_removed = 0 + for z in zones_data: + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + if len(wbs) < 2: + continue + to_remove: set = set() + + # Rule (a): tiny coloured symbols (bullets, graphic fragments) + for i, wb in enumerate(wbs): + cn = wb.get("color_name", "black") + if (cn != "black" + and wb.get("width", 0) * wb.get("height", 0) < 200 + and wb.get("conf", 100) < 85): + to_remove.add(i) + + # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts) + # Small images/icons next to words get OCR'd as ">", "<", "~", etc. + # Remove word boxes that contain NO letters or digits. + for i, wb in enumerate(wbs): + t = (wb.get("text") or "").strip() + if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: + to_remove.add(i) + + # Rule (b) + (c): overlap and duplicate detection + # Sort by x for pairwise comparison + _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$') + to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge + indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0)) + for p in range(len(indexed) - 1): + i1, w1 = indexed[p] + i2, w2 = indexed[p + 1] + x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0) + x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0) + overlap = max(0, min(x1e, x2e) - max(x1s, x2s)) + min_w = min(w1.get("width", 1), w2.get("width", 1)) + gap = x2s - x1e + overlap_pct = overlap / min_w if min_w > 0 else 0 + + # (b) Significant x-overlap + if overlap_pct > 0.20: + t1 = (w1.get("text") or "").strip() + t2 = (w2.get("text") or "").strip() + + # Syllable-split words: both are alphabetic text with + # moderate overlap (20-75%). Merge instead of removing. + # OCR splits words at syllable marks, producing overlapping + # boxes like "zu" + "tiefst" → "zutiefst". + if (overlap_pct <= 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2)): + to_merge.append((i1, i2)) + continue + + # High overlap (>75%) with different alphabetic text: + # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104) + # causing it to heavily overlap with the next fragment ("brech"). + # Merge instead of removing when one is a short prefix (≤4 chars) + # and the texts are different. + if (overlap_pct > 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2) + and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower() + and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4): + to_merge.append((i1, i2)) + continue + + if overlap_pct <= 0.40: + continue # too little overlap and not alphabetic merge + + c1 = w1.get("conf", 50) + c2 = w2.get("conf", 50) + + # For very high overlap (>90%) with different text, + # prefer the word that exists in the IPA dictionary + # over confidence (OCR can give artifacts high conf). + if overlap_pct > 0.90 and t1.lower() != t2.lower(): + in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False + in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False + if in_dict_1 and not in_dict_2: + to_remove.add(i2) + continue + elif in_dict_2 and not in_dict_1: + to_remove.add(i1) + continue + + if c1 < c2: + to_remove.add(i1) + elif c2 < c1: + to_remove.add(i2) + else: + # Same confidence: remove the taller one (bullet slivers) + if w1.get("height", 0) > w2.get("height", 0): + to_remove.add(i1) + else: + to_remove.add(i2) + + # (c) Duplicate text: consecutive blue with same text, gap < 6px + elif (gap < 6 + and w1.get("color_name") == "blue" + and w2.get("color_name") == "blue" + and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()): + # Remove the one with lower confidence; if equal, first one + c1 = w1.get("conf", 50) + c2 = w2.get("conf", 50) + to_remove.add(i1 if c1 <= c2 else i2) + + # Execute merges first (syllable-split words). + # Use merge_parent to support chain merging: if "zer" absorbed + # "brech" and then "brech"+"lich" is a merge pair, redirect to + # merge "lich" into "zer" → "zerbrechlich". + if to_merge: + merge_parent: Dict[int, int] = {} # absorbed → absorber + for mi1, mi2 in to_merge: + # Follow chain: if mi1 was absorbed, find root absorber + actual_mi1 = mi1 + while actual_mi1 in merge_parent: + actual_mi1 = merge_parent[actual_mi1] + if actual_mi1 in to_remove or mi2 in to_remove: + continue + if mi2 in merge_parent: + continue # mi2 already absorbed + mw1, mw2 = wbs[actual_mi1], wbs[mi2] + # Concatenate text (no space — they're parts of one word) + mt1 = (mw1.get("text") or "").rstrip(".,;:!?") + mt2 = (mw2.get("text") or "").strip() + merged_text = mt1 + mt2 + # Union bounding box + mx = min(mw1["left"], mw2["left"]) + my = min(mw1["top"], mw2["top"]) + mr = max(mw1["left"] + mw1["width"], + mw2["left"] + mw2["width"]) + mb = max(mw1["top"] + mw1["height"], + mw2["top"] + mw2["height"]) + mw1["text"] = merged_text + mw1["left"] = mx + mw1["top"] = my + mw1["width"] = mr - mx + mw1["height"] = mb - my + mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 + to_remove.add(mi2) + merge_parent[mi2] = actual_mi1 + bullet_removed -= 1 # net: merge, not removal + + if to_remove: + bullet_removed += len(to_remove) + filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] + cell["word_boxes"] = filtered + # Don't overwrite text that was corrected by Step 5c IPA fix + if not cell.get("_ipa_corrected"): + cell["text"] = _words_to_reading_order_text(filtered) + + # Remove cells that became empty after bullet removal + if bullet_removed: + for z in zones_data: + z["cells"] = [c for c in z.get("cells", []) + if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) + + # 5j-pre. Remove cells whose text is entirely garbled / artifact noise. + # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr", + # "\\", "PEE", "a=") that survive earlier filters because their rows also + # contain real content in other columns. Remove them here. + _COMMON_SHORT_WORDS = { + # German + "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja", + "ob", "so", "um", "zu", "wo", "je", "oh", "or", + "die", "der", "das", "dem", "den", "des", "ein", "und", + "auf", "aus", "bei", "bis", "für", "mit", "nur", "von", + # English + "a", "i", "an", "as", "at", "be", "by", "do", "go", "he", + "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok", + "on", "or", "so", "to", "up", "us", "we", + "the", "and", "but", "for", "not", + } + _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$') + artifact_cells_removed = 0 + for z in zones_data: + before = len(z.get("cells", [])) + kept = [] + for cell in z.get("cells", []): + text = (cell.get("text") or "").strip() + core = text.rstrip(".,;:!?'\"") + is_artifact = False + if not core: + is_artifact = True + elif _PURE_JUNK_RE.match(core): + is_artifact = True + elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): + # Short non-alphabetic text like "a=", not word beginnings like "Zw" + is_artifact = True + elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: + is_artifact = True + elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core) + and not re.match(r'^[pPsS]\.?\d+$', core)): + # Mixed digits + letters in short text (e.g. "7 EN", "a=3") + # but NOT page references like "p.43", "p50", "S.12" + is_artifact = True + if is_artifact: + kept.append(None) # placeholder + else: + kept.append(cell) + z["cells"] = [c for c in kept if c is not None] + artifact_cells_removed += before - len(z["cells"]) + if artifact_cells_removed: + # Also remove rows that became completely empty + for z in zones_data: + cell_ris = {c.get("row_index") for c in z.get("cells", [])} + z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris] + logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed) + + # 5j. Normalise word_box order to reading order (group by Y, sort by X). + # The frontend renders colored cells from word_boxes array order + # (GridTable.tsx), so they MUST be in left-to-right reading order. + wb_reordered = 0 + for z in zones_data: + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + if len(wbs) < 2: + continue + lines = _group_words_into_lines(wbs, y_tolerance_px=15) + sorted_wbs = [w for line in lines for w in line] + # Check if order actually changed + if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]: + cell["word_boxes"] = sorted_wbs + wb_reordered += 1 + if wb_reordered: + logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered) + + duration = time.time() - t0 + + # 6. Build result + total_cells = sum(len(z.get("cells", [])) for z in zones_data) + total_columns = sum(len(z.get("columns", [])) for z in zones_data) + total_rows = sum(len(z.get("rows", [])) for z in zones_data) + + # Collect color statistics from all word_boxes in cells + color_stats: Dict[str, int] = {} + for z in zones_data: + for cell in z.get("cells", []): + for wb in cell.get("word_boxes", []): + cn = wb.get("color_name", "black") + color_stats[cn] = color_stats.get(cn, 0) + 1 + + # Compute layout metrics for faithful grid reconstruction + all_content_row_heights: List[float] = [] + for z in zones_data: + for row in z.get("rows", []): + if not row.get("is_header", False): + h = row.get("y_max_px", 0) - row.get("y_min_px", 0) + if h > 0: + all_content_row_heights.append(h) + avg_row_height = ( + sum(all_content_row_heights) / len(all_content_row_heights) + if all_content_row_heights else 30.0 + ) + font_size_suggestion = max(10, int(avg_row_height * 0.6)) + + # --- Dictionary detection on assembled grid --- + # Build lightweight ColumnGeometry-like structures from zone columns for + # dictionary signal scoring. + from cv_layout import _score_dictionary_signals + dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0} + try: + from cv_vocab_types import ColumnGeometry + for z in zones_data: + zone_cells = z.get("cells", []) + zone_cols = z.get("columns", []) + if len(zone_cols) < 2 or len(zone_cells) < 10: + continue + # Build pseudo-ColumnGeometry per column + pseudo_geoms = [] + for col in zone_cols: + ci = col["index"] + col_cells = [c for c in zone_cells if c.get("col_index") == ci] + # Flatten word_boxes into word dicts compatible with _score_language + col_words = [] + for cell in col_cells: + for wb in cell.get("word_boxes") or []: + col_words.append({ + "text": wb.get("text", ""), + "conf": wb.get("conf", 0), + "top": wb.get("top", 0), + "left": wb.get("left", 0), + "height": wb.get("height", 0), + "width": wb.get("width", 0), + }) + # Fallback: use cell text if no word_boxes + if not cell.get("word_boxes") and cell.get("text"): + col_words.append({ + "text": cell["text"], + "conf": cell.get("confidence", 50), + "top": cell.get("bbox_px", {}).get("y", 0), + "left": cell.get("bbox_px", {}).get("x", 0), + "height": cell.get("bbox_px", {}).get("h", 20), + "width": cell.get("bbox_px", {}).get("w", 50), + }) + col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0) + pseudo_geoms.append(ColumnGeometry( + index=ci, x=col.get("x_min_px", 0), y=0, + width=max(col_w, 1), height=img_h, + word_count=len(col_words), words=col_words, + width_ratio=col_w / max(img_w, 1), + )) + if len(pseudo_geoms) >= 2: + dd = _score_dictionary_signals( + pseudo_geoms, + document_category=document_category, + margin_strip_detected=margin_strip_detected, + ) + if dd["confidence"] > dict_detection["confidence"]: + dict_detection = dd + except Exception as e: + logger.warning("Dictionary detection failed: %s", e) + + # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" --- + try: + from cv_syllable_detect import merge_word_gaps_in_zones + merge_word_gaps_in_zones(zones_data, session_id) + except Exception as e: + logger.warning("Word-gap merge failed: %s", e) + + # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers --- + # Strips | from words, validates with pyphen, tries char-deletion for garbled + # words like "Ze|plpe|lin" → "Zeppelin". + try: + from cv_syllable_detect import autocorrect_pipe_artifacts + autocorrect_pipe_artifacts(zones_data, session_id) + except Exception as e: + logger.warning("Pipe autocorrect failed: %s", e) + + # --- Syllable divider insertion for dictionary pages --- + # syllable_mode: "auto" = only when original has pipe dividers (1% threshold), + # "all" = force on all content words, "en" = English column only, + # "de" = German columns only, "none" = skip entirely. + syllable_insertions = 0 + if syllable_mode != "none" and img_bgr is not None: + _syllable_eligible = False + if syllable_mode in ("all", "de", "en"): + _syllable_eligible = True + elif (dict_detection.get("is_dictionary") + and dict_detection.get("article_col_index") is not None): + # auto: only on dictionary pages with article columns + _syllable_eligible = True + # For language-specific modes, determine allowed columns + _syllable_col_filter: Optional[set] = None # None = all columns + if syllable_mode == "en": + _syllable_col_filter = {en_col_type} if en_col_type else set() + elif syllable_mode == "de": + if en_col_type and total_cols >= 3: + _syllable_col_filter = all_content_cols - {en_col_type} + # else None → all columns (correct for German-only dicts) + if _syllable_eligible: + try: + from cv_syllable_detect import insert_syllable_dividers + force_syllables = (syllable_mode in ("all", "de", "en")) + syllable_insertions = insert_syllable_dividers( + zones_data, img_bgr, session_id, + force=force_syllables, + col_filter=_syllable_col_filter, + ) + except Exception as e: + logger.warning("Syllable insertion failed: %s", e) + + # When syllable mode is "none", strip any residual | from OCR so + # that the displayed text is clean (e.g. "Zel|le" → "Zelle"). + if syllable_mode == "none": + for z in zones_data: + for cell in z.get("cells", []): + t = cell.get("text", "") + if "|" in t: + cell["text"] = t.replace("|", "") + + # --- Split merged words (OCR sometimes glues adjacent words) --- + # Uses dictionary lookup to split e.g. "atmyschool" → "at my school" + try: + from cv_review import _try_split_merged_word, _SPELL_AVAILABLE + if _SPELL_AVAILABLE: + split_count = 0 + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if not text: + continue + parts = [] + changed = False + for token in text.split(): + # Try splitting pure-alpha tokens >= 4 chars + # Strip trailing punctuation AND IPA brackets + clean = token + # Remove trailing IPA like [dɪsˈɪʒən] first + bracket_pos = clean.find('[') + suffix_ipa = "" + if bracket_pos > 0: + suffix_ipa = clean[bracket_pos:] + clean = clean[:bracket_pos] + suffix_punct = "" + stripped = clean.rstrip(".,!?;:'\")") + if stripped != clean: + suffix_punct = clean[len(stripped):] + clean = stripped + suffix = suffix_punct + suffix_ipa + # Handle contractions: "solet's" → try "solet" + "'s" + contraction = "" + if "'" in clean and clean.index("'") >= 2: + apos_pos = clean.index("'") + contraction = clean[apos_pos:] + clean = clean[:apos_pos] + suffix = contraction + suffix + if len(clean) >= 4 and clean.isalpha(): + split = _try_split_merged_word(clean) + if split: + parts.append(split + suffix) + changed = True + continue + parts.append(token) + if changed: + cell["text"] = " ".join(parts) + split_count += 1 + if split_count: + logger.info("build-grid session %s: split %d merged words", session_id, split_count) + except ImportError: + pass + + # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" --- + # Matches any [bracket] directly after a letter, as long as the bracket + # content doesn't look like a normal text annotation (e.g. "[adj]", "[noun]"). + _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if text and "[" in text: + fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text) + if fixed != text: + cell["text"] = fixed + + # --- SmartSpellChecker: language-aware OCR correction on all cells --- + try: + from smart_spell import SmartSpellChecker + _ssc = SmartSpellChecker() + spell_fix_count = 0 + + # Determine language per column: + # en_col_type was already detected (column with IPA = English). + # All other content columns are assumed German for vocab tables. + # For single/two-column layouts, use auto-detection. + for z in zones_data: + zone_cols = z.get("columns", []) + for cell in z.get("cells", []): + text = cell.get("text", "") + if not text or not text.strip(): + continue + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + + # Determine language for this cell + if total_cols >= 3 and en_col_type: + lang = "en" if ct == en_col_type else "de" + elif total_cols <= 2: + lang = "auto" # auto-detect for non-vocab layouts + else: + lang = "auto" + + result = _ssc.correct_text(text, lang=lang) + if result.changed: + cell["text"] = result.corrected + spell_fix_count += 1 + + if spell_fix_count: + logger.info( + "build-grid session %s: SmartSpellChecker fixed %d cells", + session_id, spell_fix_count, + ) + except ImportError: + logger.debug("SmartSpellChecker not available in build-grid") + except Exception as e: + logger.warning("SmartSpellChecker error in build-grid: %s", e) + + # --- Debug: log cell counts per column before empty-column removal --- + for z in zones_data: + if z.get("zone_type") == "content": + from collections import Counter as _Counter + _cc = _Counter(c.get("col_index") for c in z.get("cells", [])) + _cols = z.get("columns", []) + logger.info( + "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s", + z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())), + ) + + # --- Remove empty columns (no cells assigned) --- + for z in zones_data: + cells = z.get("cells", []) + used_col_indices = {c.get("col_index") for c in cells} + old_cols = z.get("columns", []) + new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices] + if len(new_cols) < len(old_cols): + # Re-index columns and cells + old_to_new = {} + for new_i, col in enumerate(new_cols): + old_i = col.get("col_index", col.get("index", new_i)) + old_to_new[old_i] = new_i + col["col_index"] = new_i + col["index"] = new_i + col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text" + for cell in cells: + old_ci = cell.get("col_index", 0) + cell["col_index"] = old_to_new.get(old_ci, old_ci) + cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text" + z["columns"] = new_cols + + # Clean up internal flags before returning + for z in zones_data: + for cell in z.get("cells", []): + cell.pop("_ipa_corrected", None) + + result = { + "session_id": session_id, + "image_width": img_w, + "image_height": img_h, + "zones": zones_data, + "boxes_detected": boxes_detected, + "summary": { + "total_zones": len(zones_data), + "total_columns": total_columns, + "total_rows": total_rows, + "total_cells": total_cells, + "total_words": len(all_words), + "recovered_colored": recovered_count, + "color_stats": color_stats, + }, + "formatting": { + "bold_columns": [], + "header_rows": [], + }, + "layout_metrics": { + "page_width_px": img_w, + "page_height_px": img_h, + "avg_row_height_px": round(avg_row_height, 1), + "font_size_suggestion_px": font_size_suggestion, + }, + "dictionary_detection": { + "is_dictionary": dict_detection.get("is_dictionary", False), + "confidence": dict_detection.get("confidence", 0.0), + "signals": dict_detection.get("signals", {}), + "article_col_index": dict_detection.get("article_col_index"), + "headword_col_index": dict_detection.get("headword_col_index"), + }, + "processing_modes": { + "ipa_mode": ipa_mode, + "syllable_mode": syllable_mode, + "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, + "syllables_applied": syllable_insertions > 0, + }, + "page_number": page_number_info, + "duration_seconds": round(duration, 2), + } + + return result + diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index a8884de..ea91384 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1,14 +1,7 @@ """ -Grid Editor API — builds a structured, zone-aware grid from Kombi OCR results. +Grid Editor API — endpoints for grid building, editing, and export. -Takes the merged word positions from paddle-kombi / rapid-kombi and: - 1. Detects bordered boxes on the image (cv_box_detect) - 2. Splits the page into zones (content + box regions) - 3. Clusters words into columns and rows per zone - 4. Returns a hierarchical StructuredGrid for the frontend Excel-like editor - -Lizenz: Apache 2.0 (kommerziell nutzbar) -DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +The core grid building logic is in grid_build_core.py. """ import logging @@ -16,1950 +9,20 @@ import re import time from typing import Any, Dict, List, Optional, Tuple -import cv2 -import numpy as np from fastapi import APIRouter, HTTPException, Query, Request -from cv_box_detect import detect_boxes, split_page_into_zones -from cv_graphic_detect import detect_graphic_elements -from cv_vocab_types import PageZone -from cv_color_detect import detect_word_colors, recover_colored_text -from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines +from grid_build_core import _build_grid_core +from grid_editor_helpers import _words_in_zone from ocr_pipeline_session_store import ( get_session_db, - get_session_image, update_session_db, ) -from grid_editor_helpers import ( - _filter_border_strip_words, - _cluster_columns_by_alignment, - _GRID_GHOST_CHARS, - _filter_border_ghosts, - _MARKER_CHARS, - _merge_inline_marker_columns, - _flatten_word_boxes, - _words_in_zone, - _PIPE_RE_VSPLIT, - _detect_vertical_dividers, - _split_zone_at_vertical_dividers, - _merge_content_zones_across_boxes, - _detect_heading_rows_by_color, - _detect_heading_rows_by_single_cell, - _detect_header_rows, - _build_zone_grid, - _get_content_bounds, - _filter_decorative_margin, - _filter_footer_words, - _filter_header_junk, -) logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -# --------------------------------------------------------------------------- -# Core computation (used by build-grid endpoint and regression tests) -# --------------------------------------------------------------------------- - -async def _build_grid_core( - session_id: str, - session: dict, - *, - ipa_mode: str = "auto", - syllable_mode: str = "auto", -) -> dict: - """Core grid building logic — pure computation, no HTTP or DB side effects. - - Args: - session_id: Session identifier (for logging and image loading). - session: Full session dict from get_session_db(). - ipa_mode: "auto" (only when English headwords detected), "all" - (force IPA on all content columns), "en" (English column only), - "de" (German/definition columns only), or "none" (skip entirely). - syllable_mode: "auto" (only when original has pipe dividers), - "all" (force syllabification on all words), "en" (English only), - "de" (German only), or "none" (skip). - - Returns: - StructuredGrid result dict. - - Raises: - ValueError: If session data is incomplete. - """ - t0 = time.time() - - # 1. Validate and load word results - word_result = session.get("word_result") - if not word_result or not word_result.get("cells"): - raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.") - - img_w = word_result.get("image_width", 0) - img_h = word_result.get("image_height", 0) - if not img_w or not img_h: - raise ValueError("Missing image dimensions in word_result") - - # 2. Flatten all word boxes from cells - all_words = _flatten_word_boxes(word_result["cells"]) - if not all_words: - raise ValueError("No word boxes found in cells") - - logger.info("build-grid session %s: %d words from %d cells", - session_id, len(all_words), len(word_result["cells"])) - - # 2b. Filter decorative margin columns (alphabet graphics). - # Some worksheets have a decorative alphabet strip along one margin - # (A-Z in a graphic). OCR reads these as single-char words aligned - # vertically. Detect and remove them before grid building. - margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id) - margin_strip_detected = margin_strip_info.get("found", False) - - # Read document_category from session (user-selected or auto-detected) - document_category = session.get("document_category") - - # 2c. Filter footer rows (page numbers at the very bottom). - # Isolated short text in the bottom 5% of the page is typically a - # page number ("64", "S. 12") and not real content. The page number - # is extracted as metadata for the frontend header display. - page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) - - # 2c2. Filter OCR junk from header illustrations. - # Low-confidence short fragments above the first real content row. - _filter_header_junk(all_words, img_h, logger, session_id) - - # 2d. Filter words inside user-defined exclude regions (from Structure step). - # These are explicitly marked by the user, so ALL words inside are removed - # regardless of confidence. - structure_result = session.get("structure_result") - exclude_rects = [] - if structure_result: - for er in structure_result.get("exclude_regions", []): - exclude_rects.append({ - "x": er["x"], "y": er["y"], - "w": er["w"], "h": er["h"], - }) - if exclude_rects: - before = len(all_words) - filtered = [] - for w in all_words: - w_cx = w["left"] + w.get("width", 0) / 2 - w_cy = w["top"] + w.get("height", 0) / 2 - inside = any( - er["x"] <= w_cx <= er["x"] + er["w"] - and er["y"] <= w_cy <= er["y"] + er["h"] - for er in exclude_rects - ) - if not inside: - filtered.append(w) - removed = before - len(filtered) - if removed: - all_words = filtered - logger.info( - "build-grid session %s: removed %d words inside %d user exclude region(s)", - session_id, removed, len(exclude_rects), - ) - - # 2e. Hard-filter words inside graphic/image regions from structure step. - # ALL words inside graphic regions are removed regardless of confidence — - # images cannot contain real text; any OCR words inside are artifacts. - # After image loading (Step 3a) we augment these with freshly detected - # graphic regions from cv_graphic_detect. - graphic_rects: List[Dict[str, int]] = [] - if structure_result: - for g in structure_result.get("graphics", []): - graphic_rects.append({ - "x": g["x"], "y": g["y"], - "w": g["w"], "h": g["h"], - }) - if graphic_rects: - before = len(all_words) - all_words = [ - w for w in all_words - if not any( - gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in graphic_rects - ) - ] - removed = before - len(all_words) - if removed: - logger.info( - "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)", - session_id, removed, len(graphic_rects), - ) - - # 3. Load image for box detection - img_png = await get_session_image(session_id, "cropped") - if not img_png: - img_png = await get_session_image(session_id, "dewarped") - if not img_png: - img_png = await get_session_image(session_id, "original") - - zones_data: List[Dict[str, Any]] = [] - boxes_detected = 0 - recovered_count = 0 - border_prefiltered = False - img_bgr = None - - content_x, content_y, content_w, content_h = _get_content_bounds(all_words) - - if img_png: - # Decode image for color detection + box detection - arr = np.frombuffer(img_png, dtype=np.uint8) - img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) - - if img_bgr is not None: - # --- 3a. Detect graphic/image regions via CV and hard-filter --- - # Pass only significant words (len >= 3) to the detector so that - # short OCR artifacts inside images don't fool the text-vs-graphic - # heuristic (it counts word centroids to distinguish text from images). - sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3] - fresh_graphics = detect_graphic_elements(img_bgr, sig_words) - if fresh_graphics: - fresh_rects = [ - {"x": g.x, "y": g.y, "w": g.width, "h": g.height} - for g in fresh_graphics - ] - graphic_rects.extend(fresh_rects) - logger.info( - "build-grid session %s: detected %d graphic region(s) via CV", - session_id, len(fresh_graphics), - ) - # Hard-filter words inside newly detected graphic regions - before = len(all_words) - all_words = [ - w for w in all_words - if not any( - gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in fresh_rects - ) - ] - removed = before - len(all_words) - if removed: - logger.info( - "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)", - session_id, removed, len(fresh_rects), - ) - - # --- Recover colored text that OCR missed (before grid building) --- - recovered = recover_colored_text(img_bgr, all_words) - if recovered and graphic_rects: - # Filter recovered chars inside graphic regions - recovered = [ - r for r in recovered - if not any( - gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in graphic_rects - ) - ] - if recovered: - recovered_count = len(recovered) - all_words.extend(recovered) - logger.info( - "build-grid session %s: +%d recovered colored words", - session_id, recovered_count, - ) - - # Detect bordered boxes - boxes = detect_boxes( - img_bgr, - content_x=content_x, - content_w=content_w, - content_y=content_y, - content_h=content_h, - ) - boxes_detected = len(boxes) - - if boxes: - # Filter border ghost words before grid building - all_words, ghost_count = _filter_border_ghosts(all_words, boxes) - if ghost_count: - logger.info( - "build-grid session %s: removed %d border ghost words", - session_id, ghost_count, - ) - - # Split page into zones - page_zones = split_page_into_zones( - content_x, content_y, content_w, content_h, boxes - ) - - # Merge content zones separated by box zones - page_zones = _merge_content_zones_across_boxes( - page_zones, content_x, content_w - ) - - # 3b. Detect vertical dividers and split content zones - vsplit_group_counter = 0 - expanded_zones: List = [] - for pz in page_zones: - if pz.zone_type != "content": - expanded_zones.append(pz) - continue - zone_words = _words_in_zone( - all_words, pz.y, pz.height, pz.x, pz.width - ) - divider_xs = _detect_vertical_dividers( - zone_words, pz.x, pz.width, pz.y, pz.height - ) - if divider_xs: - sub_zones = _split_zone_at_vertical_dividers( - pz, divider_xs, vsplit_group_counter - ) - expanded_zones.extend(sub_zones) - vsplit_group_counter += 1 - # Remove pipe words so they don't appear in sub-zones - pipe_ids = set( - id(w) for w in zone_words - if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) - ) - all_words[:] = [w for w in all_words if id(w) not in pipe_ids] - logger.info( - "build-grid: vertical split zone %d at x=%s → %d sub-zones", - pz.index, [int(x) for x in divider_xs], len(sub_zones), - ) - else: - expanded_zones.append(pz) - # Re-index zones - for i, pz in enumerate(expanded_zones): - pz.index = i - page_zones = expanded_zones - - # --- Union columns from all content zones --- - # Each content zone detects columns independently. Narrow - # columns (page refs, markers) may appear in only one zone. - # Merge column split-points from ALL content zones so every - # zone shares the full column set. - # NOTE: Zones from a vertical split are independent and must - # NOT share columns with each other. - - # First pass: build grids per zone independently - zone_grids: List[Dict] = [] - - for pz in page_zones: - zone_words = _words_in_zone( - all_words, pz.y, pz.height, pz.x, pz.width - ) - if pz.zone_type == "content": - logger.info( - "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d → %d/%d words", - pz.index, pz.zone_type, - pz.x, pz.x + pz.width, pz.y, pz.y + pz.height, - len(zone_words), len(all_words), - ) - # Filter recovered single-char artifacts in ALL zones - # (decorative colored pixel blobs like !, ?, • from - # recover_colored_text that don't represent real text) - before = len(zone_words) - zone_words = [ - w for w in zone_words - if not ( - w.get("recovered") - and len(w.get("text", "").strip()) <= 2 - ) - ] - removed = before - len(zone_words) - if removed: - logger.info( - "build-grid: filtered %d recovered artifacts from %s zone %d", - removed, pz.zone_type, pz.index, - ) - # Filter words inside image overlay regions (merged box zones) - if pz.image_overlays: - before_ov = len(zone_words) - zone_words = [ - w for w in zone_words - if not any( - ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] - and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] - for ov in pz.image_overlays - ) - ] - ov_removed = before_ov - len(zone_words) - if ov_removed: - logger.info( - "build-grid: filtered %d words inside image overlays from zone %d", - ov_removed, pz.index, - ) - zone_words, bs_removed = _filter_border_strip_words(zone_words) - if bs_removed: - border_prefiltered = True - logger.info( - "build-grid: pre-filtered %d border-strip words from zone %d", - bs_removed, pz.index, - ) - grid = _build_zone_grid( - zone_words, pz.x, pz.y, pz.width, pz.height, - pz.index, img_w, img_h, - skip_first_row_header=bool(pz.image_overlays), - ) - zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) - - # Second pass: merge column boundaries from all content zones - # Exclude zones from vertical splits — they have independent columns. - content_zones = [ - zg for zg in zone_grids - if zg["pz"].zone_type == "content" - and zg["pz"].vsplit_group is None - ] - if len(content_zones) > 1: - # Collect column split points (x_min of non-first columns) - all_split_xs: List[float] = [] - for zg in content_zones: - raw_cols = zg["grid"].get("_raw_columns", []) - for col in raw_cols[1:]: - all_split_xs.append(col["x_min"]) - - if all_split_xs: - all_split_xs.sort() - merge_distance = max(25, int(content_w * 0.03)) - merged_xs = [all_split_xs[0]] - for x in all_split_xs[1:]: - if x - merged_xs[-1] < merge_distance: - merged_xs[-1] = (merged_xs[-1] + x) / 2 - else: - merged_xs.append(x) - - total_cols = len(merged_xs) + 1 - max_zone_cols = max( - len(zg["grid"].get("_raw_columns", [])) - for zg in content_zones - ) - - # Apply union whenever it has at least as many - # columns as the best single zone. Even with the - # same count the union boundaries are better because - # they incorporate evidence from all zones. - if total_cols >= max_zone_cols: - cx_min = min(w["left"] for w in all_words) - cx_max = max( - w["left"] + w["width"] for w in all_words - ) - merged_columns: List[Dict[str, Any]] = [] - prev_x = cx_min - for i, sx in enumerate(merged_xs): - merged_columns.append({ - "index": i, - "type": f"column_{i + 1}", - "x_min": prev_x, - "x_max": sx, - }) - prev_x = sx - merged_columns.append({ - "index": len(merged_xs), - "type": f"column_{len(merged_xs) + 1}", - "x_min": prev_x, - "x_max": cx_max, - }) - - # Re-build ALL content zones with merged columns - for zg in zone_grids: - pz = zg["pz"] - if pz.zone_type == "content": - grid = _build_zone_grid( - zg["words"], pz.x, pz.y, - pz.width, pz.height, - pz.index, img_w, img_h, - global_columns=merged_columns, - skip_first_row_header=bool(pz.image_overlays), - ) - zg["grid"] = grid - logger.info( - "build-grid session %s: union of %d content " - "zones → %d merged columns (max single zone: %d)", - session_id, len(content_zones), - total_cols, max_zone_cols, - ) - - for zg in zone_grids: - pz = zg["pz"] - grid = zg["grid"] - # Remove internal _raw_columns before adding to response - grid.pop("_raw_columns", None) - - zone_entry: Dict[str, Any] = { - "zone_index": pz.index, - "zone_type": pz.zone_type, - "bbox_px": { - "x": pz.x, "y": pz.y, - "w": pz.width, "h": pz.height, - }, - "bbox_pct": { - "x": round(pz.x / img_w * 100, 2) if img_w else 0, - "y": round(pz.y / img_h * 100, 2) if img_h else 0, - "w": round(pz.width / img_w * 100, 2) if img_w else 0, - "h": round(pz.height / img_h * 100, 2) if img_h else 0, - }, - "border": None, - "word_count": len(zg["words"]), - **grid, - } - - if pz.box: - zone_entry["border"] = { - "thickness": pz.box.border_thickness, - "confidence": pz.box.confidence, - } - - if pz.image_overlays: - zone_entry["image_overlays"] = pz.image_overlays - - if pz.layout_hint: - zone_entry["layout_hint"] = pz.layout_hint - if pz.vsplit_group is not None: - zone_entry["vsplit_group"] = pz.vsplit_group - - zones_data.append(zone_entry) - - # 4. Fallback: no boxes detected → single zone with all words - if not zones_data: - # Filter recovered single-char artifacts (same as in zone loop above) - before = len(all_words) - filtered_words = [ - w for w in all_words - if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) - ] - removed = before - len(filtered_words) - if removed: - logger.info( - "build-grid session %s: filtered %d recovered artifacts (fallback zone)", - session_id, removed, - ) - # Pre-filter border-strip words so column detection is not - # confused by edge artifacts. When this removes words, Step 4e - # is skipped (it would otherwise re-detect content as a "strip"). - filtered_words, bs_removed = _filter_border_strip_words(filtered_words) - if bs_removed: - border_prefiltered = True - logger.info( - "build-grid session %s: pre-filtered %d border-strip words", - session_id, bs_removed, - ) - grid = _build_zone_grid( - filtered_words, content_x, content_y, content_w, content_h, - 0, img_w, img_h, - ) - grid.pop("_raw_columns", None) - zones_data.append({ - "zone_index": 0, - "zone_type": "content", - "bbox_px": { - "x": content_x, "y": content_y, - "w": content_w, "h": content_h, - }, - "bbox_pct": { - "x": round(content_x / img_w * 100, 2) if img_w else 0, - "y": round(content_y / img_h * 100, 2) if img_h else 0, - "w": round(content_w / img_w * 100, 2) if img_w else 0, - "h": round(content_h / img_h * 100, 2) if img_h else 0, - }, - "border": None, - "word_count": len(all_words), - **grid, - }) - - # 4b. Remove junk rows: rows where ALL cells contain only short, - # low-confidence text (OCR noise, stray marks). Real vocabulary rows - # have at least one word with conf >= 50 or meaningful text length. - # Also remove "oversized stub" rows: rows with ≤2 very short words - # whose word-boxes are significantly taller than the median (e.g. - # large red page numbers like "( 9" that are not real text content). - _JUNK_CONF_THRESHOLD = 50 - _JUNK_MAX_TEXT_LEN = 3 - for z in zones_data: - cells = z.get("cells", []) - rows = z.get("rows", []) - if not cells or not rows: - continue - - # Compute median word height across the zone for oversized detection - all_wb_heights = [ - wb["height"] - for cell in cells - for wb in cell.get("word_boxes") or [] - if wb.get("height", 0) > 0 - ] - median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28 - - junk_row_indices = set() - for row in rows: - ri = row["index"] - row_cells = [c for c in cells if c.get("row_index") == ri] - if not row_cells: - continue - - row_wbs = [ - wb for cell in row_cells - for wb in cell.get("word_boxes") or [] - ] - - # Rule 1: ALL word_boxes are low-conf AND short text - all_junk = True - for wb in row_wbs: - text = (wb.get("text") or "").strip() - conf = wb.get("conf", 0) - if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: - all_junk = False - break - if all_junk and row_wbs: - junk_row_indices.add(ri) - continue - - # Rule 2: oversized stub — ≤3 words, short total text, - # and word height > 1.8× median (page numbers, stray marks, - # OCR from illustration labels like "SEA &") - # Skip if any word looks like a page reference (p.55, S.12). - if len(row_wbs) <= 3: - total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) - max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) - has_page_ref = any( - re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) - for wb in row_wbs - ) - if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: - junk_row_indices.add(ri) - continue - - # Rule 3: scattered debris — rows with only tiny fragments - # (e.g. OCR artifacts from illustrations/graphics). - # If the row has no word longer than 2 chars, it's noise. - longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs) - if longest <= 2: - junk_row_indices.add(ri) - continue - - if junk_row_indices: - z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] - z["rows"] = [r for r in rows if r["index"] not in junk_row_indices] - logger.info( - "build-grid: removed %d junk rows from zone %d: %s", - len(junk_row_indices), z["zone_index"], - sorted(junk_row_indices), - ) - - # 4b2. Remove individual cells that consist of a single very-short, - # low-confidence word (OCR artifacts like "as", "b" from stray marks). - # These survive row-level junk removal when the row has valid cells - # in other columns. - _ARTIFACT_MAX_LEN = 2 - _ARTIFACT_CONF_THRESHOLD = 65 - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - artifact_ids = set() - for cell in cells: - wbs = cell.get("word_boxes") or [] - if len(wbs) != 1: - continue - wb = wbs[0] - text = (wb.get("text") or "").strip() - conf = wb.get("conf", 100) - if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD: - artifact_ids.add(cell.get("cell_id")) - if artifact_ids: - z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids] - logger.info( - "build-grid: removed %d artifact cells from zone %d: %s", - len(artifact_ids), z.get("zone_index", 0), - [c.get("text") for c in cells if c.get("cell_id") in artifact_ids], - ) - - # 4c. Remove oversized word_boxes from individual cells. - # OCR artifacts from graphics/images (e.g. a huge "N" from a map image) - # have word heights 3-5x the median. Remove them per-word so they don't - # pollute cells that also contain valid text in other columns. - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - all_wh = [ - wb["height"] - for cell in cells - for wb in cell.get("word_boxes") or [] - if wb.get("height", 0) > 0 - ] - if not all_wh: - continue - med_h = sorted(all_wh)[len(all_wh) // 2] - oversized_threshold = med_h * 3 - removed_oversized = 0 - for cell in cells: - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] - if len(filtered) < len(wbs): - removed_oversized += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - if removed_oversized: - # Remove cells that became empty after oversized removal - z["cells"] = [c for c in cells if c.get("word_boxes")] - logger.info( - "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", - removed_oversized, oversized_threshold, z.get("zone_index", 0), - ) - - # 4d. Remove pipe-character word_boxes (column divider artifacts). - # OCR reads physical vertical divider lines as "|" or "||" characters. - # These sit at consistent x positions near column boundaries and pollute - # cell text. Remove them from word_boxes and rebuild cell text. - # NOTE: Zones from a vertical split already had pipes removed in step 3b. - _PIPE_RE = re.compile(r"^\|+$") - for z in zones_data: - if z.get("vsplit_group") is not None: - continue # pipes already removed before split - removed_pipes = 0 - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())] - if len(filtered) < len(wbs): - removed_pipes += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - # Remove cells that became empty after pipe removal - if removed_pipes: - z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info( - "build-grid: removed %d pipe-divider word_boxes from zone %d", - removed_pipes, z.get("zone_index", 0), - ) - - # Strip pipe chars ONLY from word_boxes/cells where the pipe is an - # OCR column-divider artifact. Preserve pipes that are embedded in - # words as syllable separators (e.g. "zu|trau|en") — these are - # intentional and used in dictionary Ground Truth. - for z in zones_data: - for cell in z.get("cells", []): - for wb in cell.get("word_boxes", []): - wbt = wb.get("text", "") - # Only strip if the ENTIRE word_box is just pipe(s) - # (handled by _PIPE_RE above) — leave embedded pipes alone - text = cell.get("text", "") - if "|" in text: - # Only strip leading/trailing pipes (OCR artifacts at cell edges) - cleaned = text.strip("|").strip() - if cleaned != text.strip(): - cell["text"] = cleaned - - # 4d2. Normalize narrow connector columns. - # In synonym dictionaries a narrow column repeats the same word - # (e.g. "oder") in every row. OCR sometimes appends noise chars - # (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column - # share the same short text, normalize near-match outliers. - for z in zones_data: - cols = z.get("columns", []) - cells = z.get("cells", []) - if not cols or not cells: - continue - for col in cols: - ci = col.get("index") - col_cells = [c for c in cells if c.get("col_index") == ci] - if len(col_cells) < 3: - continue - # Count text occurrences - text_counts: Dict[str, int] = {} - for c in col_cells: - t = (c.get("text") or "").strip() - if t: - text_counts[t] = text_counts.get(t, 0) + 1 - if not text_counts: - continue - dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] - dominant_count = text_counts[dominant_text] - # Only normalize if dominant word is short and appears in ≥60% - if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: - continue - # Fix outliers that start with the dominant text - fixed = 0 - for c in col_cells: - t = (c.get("text") or "").strip() - if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: - c["text"] = dominant_text - # Also fix word_boxes - wbs = c.get("word_boxes") or [] - if len(wbs) == 1: - wbs[0]["text"] = dominant_text - fixed += 1 - if fixed: - logger.info( - "build-grid: normalized %d outlier cells in connector column %d " - "(dominant='%s') zone %d", - fixed, ci, dominant_text, z.get("zone_index", 0), - ) - - # 4e. Detect and remove page-border decoration strips. - # Skipped when the pre-filter already removed border words BEFORE - # column detection — re-running would incorrectly detect the - # leftmost content column as a "strip". - border_strip_removed = 0 - if border_prefiltered: - logger.info("Step 4e: skipped (border pre-filter already applied)") - else: - # Some textbooks have decorative alphabet strips along the page - # edge. OCR picks up scattered letters from these as artifacts. - # Detection: find the first significant x-gap (>30 px) from each - # page edge between a small cluster (<20 %) and the main content. - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) - for cell in cells: - for wb in cell.get("word_boxes") or []: - all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) - if len(all_wbs_with_cell) < 10: - continue - all_wbs_with_cell.sort(key=lambda t: t[0]) - total = len(all_wbs_with_cell) - - # -- Left-edge scan -- - left_strip_count = 0 - left_gap = 0 - running_right = 0 - for gi in range(total - 1): - running_right = max( - running_right, - all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), - ) - gap = all_wbs_with_cell[gi + 1][0] - running_right - if gap > 30: - left_strip_count = gi + 1 - left_gap = gap - break - - # -- Right-edge scan -- - right_strip_count = 0 - right_gap = 0 - running_left = all_wbs_with_cell[-1][0] - for gi in range(total - 1, 0, -1): - running_left = min(running_left, all_wbs_with_cell[gi][0]) - prev_right = ( - all_wbs_with_cell[gi - 1][0] - + all_wbs_with_cell[gi - 1][1].get("width", 0) - ) - gap = running_left - prev_right - if gap > 30: - right_strip_count = total - gi - right_gap = gap - break - - strip_wbs: set = set() - strip_side = "" - strip_gap = 0 - strip_count = 0 - if left_strip_count > 0 and left_strip_count / total < 0.20: - strip_side = "left" - strip_count = left_strip_count - strip_gap = left_gap - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} - elif right_strip_count > 0 and right_strip_count / total < 0.20: - strip_side = "right" - strip_count = right_strip_count - strip_gap = right_gap - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]} - - if not strip_wbs: - continue - for cell in cells: - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if id(wb) not in strip_wbs] - if len(filtered) < len(wbs): - border_strip_removed += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - z["cells"] = [c for c in cells - if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info( - "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " - "(gap=%dpx, strip=%d/%d wbs)", - border_strip_removed, strip_side, z.get("zone_index", 0), - strip_gap, strip_count, total, - ) - - # 4f. Remove decorative edge columns (alphabet sidebar safety net). - # Dictionary pages have A-Z letter sidebars that OCR reads as single- - # character word_boxes. These form narrow columns with very short text. - # Detection: edge column where almost ALL cells are single characters. - for z in zones_data: - columns = z.get("columns", []) - cells = z.get("cells", []) - if len(columns) < 3 or not cells: - continue - # Group cells by col_type (skip spanning_header) - col_cells: Dict[str, List[Dict]] = {} - for cell in cells: - ct = cell.get("col_type", "") - if ct.startswith("column_"): - col_cells.setdefault(ct, []).append(cell) - col_types_ordered = sorted(col_cells.keys()) - if len(col_types_ordered) < 3: - continue - for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: - edge_cells_list = col_cells.get(edge_ct, []) - if len(edge_cells_list) < 3: - continue - # Key criterion: average text length and single-char ratio. - # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells - # are single characters. - texts = [(c.get("text") or "").strip() for c in edge_cells_list] - avg_len = sum(len(t) for t in texts) / len(texts) - single_char = sum(1 for t in texts if len(t) <= 1) - single_ratio = single_char / len(texts) - if avg_len > 1.5: - continue # real content has longer text - if single_ratio < 0.7: - continue # not dominated by single chars - # Remove this edge column - removed_count = len(edge_cells_list) - edge_ids = {id(c) for c in edge_cells_list} - z["cells"] = [c for c in cells if id(c) not in edge_ids] - z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] - logger.info( - "Step 4f: removed decorative edge column '%s' from zone %d " - "(%d cells, avg_len=%.1f, single_char=%.0f%%)", - edge_ct, z.get("zone_index", 0), removed_count, - avg_len, single_ratio * 100, - ) - break # only remove one edge per zone - - # 5. Color annotation on final word_boxes in cells - if img_bgr is not None: - all_wb: List[Dict] = [] - for z in zones_data: - for cell in z.get("cells", []): - all_wb.extend(cell.get("word_boxes", [])) - detect_word_colors(img_bgr, all_wb) - - # 5a. Heading detection by color + height (after color is available) - heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h) - if heading_count: - logger.info("Detected %d heading rows by color+height", heading_count) - - # 5b. Fix unmatched parentheses in cell text - # OCR often misses opening "(" while detecting closing ")". - # If a cell's text has ")" without a matching "(", prepend "(". - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if ")" in text and "(" not in text: - cell["text"] = "(" + text - - # 5c. IPA phonetic correction — replace garbled OCR phonetics with - # correct IPA from the dictionary (same as in the OCR pipeline). - # Only applies to vocabulary tables (≥3 columns: EN | article | DE). - # Single/two-column layouts are continuous text, not vocab tables. - all_cells = [cell for z in zones_data for cell in z.get("cells", [])] - total_cols = sum(len(z.get("columns", [])) for z in zones_data) - en_col_type = None - ipa_target_cols: set = set() - all_content_cols: set = set() - skip_ipa = (ipa_mode == "none") - - # When ipa_mode=none, strip ALL square brackets from ALL content columns - if skip_ipa: - _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]') - for cell in all_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - text = cell.get("text", "") - if "[" in text: - stripped = _SQUARE_BRACKET_RE_NONE.sub("", text) - if stripped != text: - cell["text"] = stripped.strip() - cell["_ipa_corrected"] = True - - if not skip_ipa and total_cols >= 3: - # Detect English headword column via IPA signals (brackets or garbled). - col_ipa_count: Dict[str, int] = {} - all_content_cols: set = set() - for cell in all_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - txt = cell.get("text", "") or "" - if txt.strip(): - all_content_cols.add(ct) - if '[' in txt or _text_has_garbled_ipa(txt): - col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1 - if col_ipa_count: - en_col_type = max(col_ipa_count, key=col_ipa_count.get) - elif ipa_mode == "all": - # Force-all mode without auto-detection: pick column with most cells - col_cell_count: Dict[str, int] = {} - for cell in all_cells: - ct = cell.get("col_type", "") - if ct.startswith("column_") and (cell.get("text") or "").strip(): - col_cell_count[ct] = col_cell_count.get(ct, 0) + 1 - if col_cell_count: - en_col_type = max(col_cell_count, key=col_cell_count.get) - - # Decide which columns to process based on ipa_mode: - # auto/en: only the detected EN headword column (English IPA) - # de: all content columns EXCEPT the EN column (German IPA) - # all: EN column gets English IPA, other columns get German IPA - en_ipa_target_cols: set = set() - de_ipa_target_cols: set = set() - if ipa_mode in ("auto", "en"): - if en_col_type: - en_ipa_target_cols.add(en_col_type) - elif ipa_mode == "de": - de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols - elif ipa_mode == "all": - if en_col_type: - en_ipa_target_cols.add(en_col_type) - de_ipa_target_cols = all_content_cols - en_ipa_target_cols - - # --- Strip IPA from columns NOT in the target set --- - # When user selects "nur DE", English IPA from the OCR scan must - # be removed. When "none", all IPA is removed. - # In vocab columns, square brackets [...] are always IPA (both - # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]). - _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]') - strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols - if strip_en_ipa or ipa_mode == "none": - strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols - for cell in all_cells: - ct = cell.get("col_type", "") - if ct not in strip_cols: - continue - text = cell.get("text", "") - if "[" in text: - stripped = _SQUARE_BRACKET_RE.sub("", text) - if stripped != text: - cell["text"] = stripped.strip() - cell["_ipa_corrected"] = True - - # --- English IPA (Britfone + eng_to_ipa) --- - if en_ipa_target_cols: - for cell in all_cells: - ct = cell.get("col_type") - if ct in en_ipa_target_cols: - cell["_orig_col_type"] = ct - cell["col_type"] = "column_en" - _pre_ipa = {id(c): c.get("text", "") for c in all_cells} - fix_cell_phonetics(all_cells, pronunciation="british") - for cell in all_cells: - orig = cell.pop("_orig_col_type", None) - if orig: - cell["col_type"] = orig - if cell.get("text", "") != _pre_ipa.get(id(cell), ""): - cell["_ipa_corrected"] = True - - # --- German IPA (wiki-pronunciation-dict + epitran) --- - if de_ipa_target_cols: - from cv_ipa_german import insert_german_ipa - insert_german_ipa(all_cells, de_ipa_target_cols) - - ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols - - # Mark cells whose text was changed by IPA correction so that - # later steps (5i) don't overwrite the corrected text when - # reconstructing from word_boxes. (Already done inline above - # for English; insert_german_ipa sets _ipa_corrected too.) - for cell in all_cells: - if cell.get("text", "") != _pre_ipa.get(id(cell), ""): - cell["_ipa_corrected"] = True - - # 5d. Fix IPA continuation cells — cells where the printed - # phonetic transcription wraps to a line below the headword. - # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]"). - # Replace garbled text with proper IPA looked up from the - # headword in the previous row's same column. - # Note: We check ALL columns, not just en_col_type, because - # the EN headword column may not be the longest-average column. - _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") - ipa_cont_fixed = 0 - for z in ([] if skip_ipa else zones_data): - rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) - z_cells = z.get("cells", []) - for idx, row in enumerate(rows_sorted): - if idx == 0: - continue - ri = row["index"] - row_cells = [c for c in z_cells if c.get("row_index") == ri] - for cell in row_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - cell_text = (cell.get("text") or "").strip() - if not cell_text: - # Step 5c may have emptied garbled IPA cells like - # "[n, nn]" — recover text from word_boxes. - wb_texts = [w.get("text", "") - for w in cell.get("word_boxes", [])] - cell_text = " ".join(wb_texts).strip() - if not cell_text: - continue - - is_bracketed = ( - cell_text.startswith('[') and cell_text.endswith(']') - ) - - if is_bracketed: - # Bracketed continuation: "[n, nn]", "[klaoz 'daun]" - # Text like "employee [im'ploi:]" is NOT fully - # bracketed and won't match here. - if not _text_has_garbled_ipa(cell_text): - continue - # Already has proper IPA brackets → skip - if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): - continue - else: - # Unbracketed continuation: "ska:f – ska:vz", - # "'sekandarr sku:l". Only treat as IPA - # continuation if this is the ONLY content cell - # in the row (single-cell row) and the text is - # garbled IPA without real IPA Unicode symbols. - content_cells_in_row = [ - c for c in row_cells - if c.get("col_type", "").startswith("column_") - and c.get("col_type") != "column_1" - ] - if len(content_cells_in_row) != 1: - continue - if not _text_has_garbled_ipa(cell_text): - continue - # Has real IPA symbols → already fixed or valid - if any(c in _REAL_IPA_CHARS for c in cell_text): - continue - - # Find headword in previous row, same column - prev_ri = rows_sorted[idx - 1]["index"] - prev_same_col = [ - c for c in z_cells - if c.get("row_index") == prev_ri - and c.get("col_type") == ct - ] - if not prev_same_col: - continue - prev_text = prev_same_col[0].get("text", "") - fixed = fix_ipa_continuation_cell( - cell_text, prev_text, pronunciation="british", - ) - if fixed != cell_text: - cell["text"] = fixed - ipa_cont_fixed += 1 - logger.info( - "IPA continuation R%d %s: '%s' → '%s'", - ri, ct, cell_text, fixed, - ) - if ipa_cont_fixed: - logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) - - # 5e. Heading detection by single-cell rows — black headings like - # "Theme" that have normal color and height but are the ONLY cell - # in their row (excluding page_ref column_1). Must run AFTER 5d - # so IPA continuation cells are already processed. - single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h) - if single_heading_count: - logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) - - # 5f. Strip IPA from headings — headings detected in 5e ran AFTER - # IPA correction (5c), so they may have dictionary IPA appended - # (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the - # original text only. - for z in zones_data: - for cell in z.get("cells", []): - if cell.get("col_type") != "heading": - continue - text = cell.get("text", "") - # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme" - stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip() - if stripped and stripped != text: - cell["text"] = stripped - - # 5g. Extract page_ref cells and footer rows from content zones. - # Page references (column_1 cells like "p.70") sit in rows that - # also contain vocabulary — extract them as zone metadata without - # removing the row. Footer lines (e.g. "two hundred and twelve" - # = page number at bottom) are standalone rows that should be - # removed from the table entirely. - _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") - # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70" - _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') - _NUMBER_WORDS = { - "one", "two", "three", "four", "five", "six", "seven", - "eight", "nine", "ten", "eleven", "twelve", "thirteen", - "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", - "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", - "seventy", "eighty", "ninety", "hundred", "thousand", "and", - "einhundert", "zweihundert", "dreihundert", "vierhundert", - "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", - } - for z in zones_data: - if z.get("zone_type") != "content": - continue - cells = z.get("cells", []) - rows = z.get("rows", []) - if not rows: - continue - - # Extract column_1 cells that look like page references - page_refs = [] - page_ref_cell_ids = set() - for cell in cells: - if cell.get("col_type") != "column_1": - continue - text = (cell.get("text") or "").strip() - if not text: - continue - if not _PAGE_REF_RE.match(text): - continue - page_refs.append({ - "row_index": cell.get("row_index"), - "text": text, - "bbox_pct": cell.get("bbox_pct", {}), - }) - page_ref_cell_ids.add(cell.get("cell_id")) - - # Keep page_ref cells in the table as a visible column. - # Previously these were removed, but users want to see them. - # The metadata extraction above still populates zone["page_refs"] - # for the frontend header display. - - # Detect footer: last non-header row if it has only 1 cell - # with short, non-content text (page numbers like "233" or - # "two hundred and twelve"). Comma-separated lists and long - # text are content continuations, not page numbers. - footer_rows = [] - non_header_rows = [r for r in rows if not r.get("is_header")] - if non_header_rows: - last_row = non_header_rows[-1] - last_ri = last_row["index"] - last_cells = [c for c in z["cells"] - if c.get("row_index") == last_ri] - if len(last_cells) == 1: - text = (last_cells[0].get("text") or "").strip() - # Not IPA (no real IPA symbols) and not a heading - has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) - # Comma-separated text is a content continuation, not a footer - has_commas = ',' in text - # Written-out page numbers like "two hundred and nine" - text_words = set(text.lower().split()) - is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) - # Short text or written-out number - is_page_number = len(text) <= 20 or is_written_number - if (text and not has_real_ipa and not has_commas - and is_page_number - and last_cells[0].get("col_type") != "heading"): - footer_rows.append({ - "row_index": last_ri, - "text": text, - "bbox_pct": last_cells[0].get("bbox_pct", {}), - }) - - # Classify footer rows: page numbers are removed from the grid - # and promoted to page_number metadata; other footers stay as rows. - page_number_footers = [] - other_footers = [] - for fr in footer_rows: - ft = fr["text"].strip() - # Pure digits - digits = "".join(c for c in ft if c.isdigit()) - if digits and re.match(r'^[\d\s.]+$', ft): - page_number_footers.append(fr) - # Written-out numbers - elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): - page_number_footers.append(fr) - else: - other_footers.append(fr) - - # Remove page-number footer rows from grid entirely - if page_number_footers: - pn_ris = {fr["row_index"] for fr in page_number_footers} - z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] - z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] - # Set page_number metadata (use first one) - pn_text = page_number_footers[0]["text"].strip() - pn_digits = "".join(c for c in pn_text if c.isdigit()) - if not page_number_info: - page_number_info = { - "text": pn_text, - "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), - } - if pn_digits: - page_number_info["number"] = int(pn_digits) - - # Mark remaining footer rows (non-page-number content) - if other_footers: - footer_ris = {fr["row_index"] for fr in other_footers} - for r in z["rows"]: - if r["index"] in footer_ris: - r["is_footer"] = True - for c in z["cells"]: - if c.get("row_index") in footer_ris: - c["col_type"] = "footer" - - if page_refs or footer_rows: - logger.info( - "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", - len(page_refs), len(footer_rows), len(page_number_footers), - z.get("zone_index", 0), - ) - - # Store as zone-level metadata - if page_refs: - z["page_refs"] = page_refs - if other_footers: - z["footer"] = other_footers - - # 5h. Convert slash-delimited IPA to bracket notation. - # Dictionary-style pages print IPA between slashes: "tiger /'taiga/" - # Detect the pattern /ocr_ipa/ and replace with [dict_ipa] - # using the IPA dictionary when available, falling back to the OCR text. - # The regex requires a word character (or ² ³) right before the opening - # slash to avoid false positives like "sb/sth". - _SLASH_IPA_RE = re.compile( - r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1) - r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars - ) - # Standalone slash IPA at start of text (headword on previous line) - _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') - # IPA between slashes never contains spaces, parentheses, or commas. - # Reject matches that look like grammar: "sb/sth up a) jdn/" - _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') - slash_ipa_fixed = 0 - for z in ([] if skip_ipa else zones_data): - for cell in z.get("cells", []): - # Only process English headword column — avoid converting - # German text like "der/die/das" to IPA. - if en_col_type and cell.get("col_type") != en_col_type: - continue - text = cell.get("text", "") - if "/" not in text: - continue - - def _replace_slash_ipa(m: re.Match) -> str: - nonlocal slash_ipa_fixed - headword = m.group(1) - ocr_ipa = m.group(2) # includes slashes - inner_raw = ocr_ipa.strip("/").strip() - # Reject if inner content has spaces/parens/commas (grammar) - if _SLASH_IPA_REJECT_RE.search(inner_raw): - return m.group(0) - # Strip superscript digits for lookup - clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() - ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None - if ipa: - slash_ipa_fixed += 1 - return f"{headword} [{ipa}]" - # Fallback: keep OCR IPA but convert slashes to brackets - inner = inner_raw.lstrip("'").strip() - if inner: - slash_ipa_fixed += 1 - return f"{headword} [{inner}]" - return m.group(0) - - new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) - - # Second pass: convert remaining /ipa/ after [ipa] from first pass. - # Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant) - _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') - def _replace_trailing_slash(m: re.Match) -> str: - nonlocal slash_ipa_fixed - inner = m.group(1).strip("/").strip().lstrip("'").strip() - if _SLASH_IPA_REJECT_RE.search(inner): - return m.group(0) - if inner: - slash_ipa_fixed += 1 - return f" [{inner}]" - return m.group(0) - new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) - - # Handle standalone /ipa/ at start (no headword in this cell) - if new_text == text: - m = _STANDALONE_SLASH_IPA_RE.match(text) - if m: - inner = m.group(1).strip() - if not _SLASH_IPA_REJECT_RE.search(inner): - inner = inner.lstrip("'").strip() - if inner: - new_text = "[" + inner + "]" + text[m.end():] - slash_ipa_fixed += 1 - - if new_text != text: - cell["text"] = new_text - - if slash_ipa_fixed: - logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed) - - # 5i. Remove blue bullet/artifact word_boxes. - # Dictionary pages have small blue square bullets (■) before entries. - # OCR reads these as text artifacts (©, e, *, or even plausible words - # like "fighily" overlapping the real word "tightly"). - # Detection rules: - # a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black) - # b) Overlapping word_boxes: >40% x-overlap → remove lower confidence - # c) Duplicate text: consecutive blue wbs with identical text, gap < 6px - bullet_removed = 0 - for z in zones_data: - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - if len(wbs) < 2: - continue - to_remove: set = set() - - # Rule (a): tiny coloured symbols (bullets, graphic fragments) - for i, wb in enumerate(wbs): - cn = wb.get("color_name", "black") - if (cn != "black" - and wb.get("width", 0) * wb.get("height", 0) < 200 - and wb.get("conf", 100) < 85): - to_remove.add(i) - - # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts) - # Small images/icons next to words get OCR'd as ">", "<", "~", etc. - # Remove word boxes that contain NO letters or digits. - for i, wb in enumerate(wbs): - t = (wb.get("text") or "").strip() - if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: - to_remove.add(i) - - # Rule (b) + (c): overlap and duplicate detection - # Sort by x for pairwise comparison - _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$') - to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge - indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0)) - for p in range(len(indexed) - 1): - i1, w1 = indexed[p] - i2, w2 = indexed[p + 1] - x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0) - x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0) - overlap = max(0, min(x1e, x2e) - max(x1s, x2s)) - min_w = min(w1.get("width", 1), w2.get("width", 1)) - gap = x2s - x1e - overlap_pct = overlap / min_w if min_w > 0 else 0 - - # (b) Significant x-overlap - if overlap_pct > 0.20: - t1 = (w1.get("text") or "").strip() - t2 = (w2.get("text") or "").strip() - - # Syllable-split words: both are alphabetic text with - # moderate overlap (20-75%). Merge instead of removing. - # OCR splits words at syllable marks, producing overlapping - # boxes like "zu" + "tiefst" → "zutiefst". - if (overlap_pct <= 0.75 - and _ALPHA_WORD_RE.match(t1) - and _ALPHA_WORD_RE.match(t2)): - to_merge.append((i1, i2)) - continue - - # High overlap (>75%) with different alphabetic text: - # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104) - # causing it to heavily overlap with the next fragment ("brech"). - # Merge instead of removing when one is a short prefix (≤4 chars) - # and the texts are different. - if (overlap_pct > 0.75 - and _ALPHA_WORD_RE.match(t1) - and _ALPHA_WORD_RE.match(t2) - and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower() - and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4): - to_merge.append((i1, i2)) - continue - - if overlap_pct <= 0.40: - continue # too little overlap and not alphabetic merge - - c1 = w1.get("conf", 50) - c2 = w2.get("conf", 50) - - # For very high overlap (>90%) with different text, - # prefer the word that exists in the IPA dictionary - # over confidence (OCR can give artifacts high conf). - if overlap_pct > 0.90 and t1.lower() != t2.lower(): - in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False - in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False - if in_dict_1 and not in_dict_2: - to_remove.add(i2) - continue - elif in_dict_2 and not in_dict_1: - to_remove.add(i1) - continue - - if c1 < c2: - to_remove.add(i1) - elif c2 < c1: - to_remove.add(i2) - else: - # Same confidence: remove the taller one (bullet slivers) - if w1.get("height", 0) > w2.get("height", 0): - to_remove.add(i1) - else: - to_remove.add(i2) - - # (c) Duplicate text: consecutive blue with same text, gap < 6px - elif (gap < 6 - and w1.get("color_name") == "blue" - and w2.get("color_name") == "blue" - and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()): - # Remove the one with lower confidence; if equal, first one - c1 = w1.get("conf", 50) - c2 = w2.get("conf", 50) - to_remove.add(i1 if c1 <= c2 else i2) - - # Execute merges first (syllable-split words). - # Use merge_parent to support chain merging: if "zer" absorbed - # "brech" and then "brech"+"lich" is a merge pair, redirect to - # merge "lich" into "zer" → "zerbrechlich". - if to_merge: - merge_parent: Dict[int, int] = {} # absorbed → absorber - for mi1, mi2 in to_merge: - # Follow chain: if mi1 was absorbed, find root absorber - actual_mi1 = mi1 - while actual_mi1 in merge_parent: - actual_mi1 = merge_parent[actual_mi1] - if actual_mi1 in to_remove or mi2 in to_remove: - continue - if mi2 in merge_parent: - continue # mi2 already absorbed - mw1, mw2 = wbs[actual_mi1], wbs[mi2] - # Concatenate text (no space — they're parts of one word) - mt1 = (mw1.get("text") or "").rstrip(".,;:!?") - mt2 = (mw2.get("text") or "").strip() - merged_text = mt1 + mt2 - # Union bounding box - mx = min(mw1["left"], mw2["left"]) - my = min(mw1["top"], mw2["top"]) - mr = max(mw1["left"] + mw1["width"], - mw2["left"] + mw2["width"]) - mb = max(mw1["top"] + mw1["height"], - mw2["top"] + mw2["height"]) - mw1["text"] = merged_text - mw1["left"] = mx - mw1["top"] = my - mw1["width"] = mr - mx - mw1["height"] = mb - my - mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 - to_remove.add(mi2) - merge_parent[mi2] = actual_mi1 - bullet_removed -= 1 # net: merge, not removal - - if to_remove: - bullet_removed += len(to_remove) - filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] - cell["word_boxes"] = filtered - # Don't overwrite text that was corrected by Step 5c IPA fix - if not cell.get("_ipa_corrected"): - cell["text"] = _words_to_reading_order_text(filtered) - - # Remove cells that became empty after bullet removal - if bullet_removed: - for z in zones_data: - z["cells"] = [c for c in z.get("cells", []) - if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) - - # 5j-pre. Remove cells whose text is entirely garbled / artifact noise. - # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr", - # "\\", "PEE", "a=") that survive earlier filters because their rows also - # contain real content in other columns. Remove them here. - _COMMON_SHORT_WORDS = { - # German - "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja", - "ob", "so", "um", "zu", "wo", "je", "oh", "or", - "die", "der", "das", "dem", "den", "des", "ein", "und", - "auf", "aus", "bei", "bis", "für", "mit", "nur", "von", - # English - "a", "i", "an", "as", "at", "be", "by", "do", "go", "he", - "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok", - "on", "or", "so", "to", "up", "us", "we", - "the", "and", "but", "for", "not", - } - _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$') - artifact_cells_removed = 0 - for z in zones_data: - before = len(z.get("cells", [])) - kept = [] - for cell in z.get("cells", []): - text = (cell.get("text") or "").strip() - core = text.rstrip(".,;:!?'\"") - is_artifact = False - if not core: - is_artifact = True - elif _PURE_JUNK_RE.match(core): - is_artifact = True - elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): - # Short non-alphabetic text like "a=", not word beginnings like "Zw" - is_artifact = True - elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: - is_artifact = True - elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core) - and not re.match(r'^[pPsS]\.?\d+$', core)): - # Mixed digits + letters in short text (e.g. "7 EN", "a=3") - # but NOT page references like "p.43", "p50", "S.12" - is_artifact = True - if is_artifact: - kept.append(None) # placeholder - else: - kept.append(cell) - z["cells"] = [c for c in kept if c is not None] - artifact_cells_removed += before - len(z["cells"]) - if artifact_cells_removed: - # Also remove rows that became completely empty - for z in zones_data: - cell_ris = {c.get("row_index") for c in z.get("cells", [])} - z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris] - logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed) - - # 5j. Normalise word_box order to reading order (group by Y, sort by X). - # The frontend renders colored cells from word_boxes array order - # (GridTable.tsx), so they MUST be in left-to-right reading order. - wb_reordered = 0 - for z in zones_data: - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - if len(wbs) < 2: - continue - lines = _group_words_into_lines(wbs, y_tolerance_px=15) - sorted_wbs = [w for line in lines for w in line] - # Check if order actually changed - if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]: - cell["word_boxes"] = sorted_wbs - wb_reordered += 1 - if wb_reordered: - logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered) - - duration = time.time() - t0 - - # 6. Build result - total_cells = sum(len(z.get("cells", [])) for z in zones_data) - total_columns = sum(len(z.get("columns", [])) for z in zones_data) - total_rows = sum(len(z.get("rows", [])) for z in zones_data) - - # Collect color statistics from all word_boxes in cells - color_stats: Dict[str, int] = {} - for z in zones_data: - for cell in z.get("cells", []): - for wb in cell.get("word_boxes", []): - cn = wb.get("color_name", "black") - color_stats[cn] = color_stats.get(cn, 0) + 1 - - # Compute layout metrics for faithful grid reconstruction - all_content_row_heights: List[float] = [] - for z in zones_data: - for row in z.get("rows", []): - if not row.get("is_header", False): - h = row.get("y_max_px", 0) - row.get("y_min_px", 0) - if h > 0: - all_content_row_heights.append(h) - avg_row_height = ( - sum(all_content_row_heights) / len(all_content_row_heights) - if all_content_row_heights else 30.0 - ) - font_size_suggestion = max(10, int(avg_row_height * 0.6)) - - # --- Dictionary detection on assembled grid --- - # Build lightweight ColumnGeometry-like structures from zone columns for - # dictionary signal scoring. - from cv_layout import _score_dictionary_signals - dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0} - try: - from cv_vocab_types import ColumnGeometry - for z in zones_data: - zone_cells = z.get("cells", []) - zone_cols = z.get("columns", []) - if len(zone_cols) < 2 or len(zone_cells) < 10: - continue - # Build pseudo-ColumnGeometry per column - pseudo_geoms = [] - for col in zone_cols: - ci = col["index"] - col_cells = [c for c in zone_cells if c.get("col_index") == ci] - # Flatten word_boxes into word dicts compatible with _score_language - col_words = [] - for cell in col_cells: - for wb in cell.get("word_boxes") or []: - col_words.append({ - "text": wb.get("text", ""), - "conf": wb.get("conf", 0), - "top": wb.get("top", 0), - "left": wb.get("left", 0), - "height": wb.get("height", 0), - "width": wb.get("width", 0), - }) - # Fallback: use cell text if no word_boxes - if not cell.get("word_boxes") and cell.get("text"): - col_words.append({ - "text": cell["text"], - "conf": cell.get("confidence", 50), - "top": cell.get("bbox_px", {}).get("y", 0), - "left": cell.get("bbox_px", {}).get("x", 0), - "height": cell.get("bbox_px", {}).get("h", 20), - "width": cell.get("bbox_px", {}).get("w", 50), - }) - col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0) - pseudo_geoms.append(ColumnGeometry( - index=ci, x=col.get("x_min_px", 0), y=0, - width=max(col_w, 1), height=img_h, - word_count=len(col_words), words=col_words, - width_ratio=col_w / max(img_w, 1), - )) - if len(pseudo_geoms) >= 2: - dd = _score_dictionary_signals( - pseudo_geoms, - document_category=document_category, - margin_strip_detected=margin_strip_detected, - ) - if dd["confidence"] > dict_detection["confidence"]: - dict_detection = dd - except Exception as e: - logger.warning("Dictionary detection failed: %s", e) - - # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" --- - try: - from cv_syllable_detect import merge_word_gaps_in_zones - merge_word_gaps_in_zones(zones_data, session_id) - except Exception as e: - logger.warning("Word-gap merge failed: %s", e) - - # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers --- - # Strips | from words, validates with pyphen, tries char-deletion for garbled - # words like "Ze|plpe|lin" → "Zeppelin". - try: - from cv_syllable_detect import autocorrect_pipe_artifacts - autocorrect_pipe_artifacts(zones_data, session_id) - except Exception as e: - logger.warning("Pipe autocorrect failed: %s", e) - - # --- Syllable divider insertion for dictionary pages --- - # syllable_mode: "auto" = only when original has pipe dividers (1% threshold), - # "all" = force on all content words, "en" = English column only, - # "de" = German columns only, "none" = skip entirely. - syllable_insertions = 0 - if syllable_mode != "none" and img_bgr is not None: - _syllable_eligible = False - if syllable_mode in ("all", "de", "en"): - _syllable_eligible = True - elif (dict_detection.get("is_dictionary") - and dict_detection.get("article_col_index") is not None): - # auto: only on dictionary pages with article columns - _syllable_eligible = True - # For language-specific modes, determine allowed columns - _syllable_col_filter: Optional[set] = None # None = all columns - if syllable_mode == "en": - _syllable_col_filter = {en_col_type} if en_col_type else set() - elif syllable_mode == "de": - if en_col_type and total_cols >= 3: - _syllable_col_filter = all_content_cols - {en_col_type} - # else None → all columns (correct for German-only dicts) - if _syllable_eligible: - try: - from cv_syllable_detect import insert_syllable_dividers - force_syllables = (syllable_mode in ("all", "de", "en")) - syllable_insertions = insert_syllable_dividers( - zones_data, img_bgr, session_id, - force=force_syllables, - col_filter=_syllable_col_filter, - ) - except Exception as e: - logger.warning("Syllable insertion failed: %s", e) - - # When syllable mode is "none", strip any residual | from OCR so - # that the displayed text is clean (e.g. "Zel|le" → "Zelle"). - if syllable_mode == "none": - for z in zones_data: - for cell in z.get("cells", []): - t = cell.get("text", "") - if "|" in t: - cell["text"] = t.replace("|", "") - - # --- Split merged words (OCR sometimes glues adjacent words) --- - # Uses dictionary lookup to split e.g. "atmyschool" → "at my school" - try: - from cv_review import _try_split_merged_word, _SPELL_AVAILABLE - if _SPELL_AVAILABLE: - split_count = 0 - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if not text: - continue - parts = [] - changed = False - for token in text.split(): - # Try splitting pure-alpha tokens >= 4 chars - # Strip trailing punctuation AND IPA brackets - clean = token - # Remove trailing IPA like [dɪsˈɪʒən] first - bracket_pos = clean.find('[') - suffix_ipa = "" - if bracket_pos > 0: - suffix_ipa = clean[bracket_pos:] - clean = clean[:bracket_pos] - suffix_punct = "" - stripped = clean.rstrip(".,!?;:'\")") - if stripped != clean: - suffix_punct = clean[len(stripped):] - clean = stripped - suffix = suffix_punct + suffix_ipa - # Handle contractions: "solet's" → try "solet" + "'s" - contraction = "" - if "'" in clean and clean.index("'") >= 2: - apos_pos = clean.index("'") - contraction = clean[apos_pos:] - clean = clean[:apos_pos] - suffix = contraction + suffix - if len(clean) >= 4 and clean.isalpha(): - split = _try_split_merged_word(clean) - if split: - parts.append(split + suffix) - changed = True - continue - parts.append(token) - if changed: - cell["text"] = " ".join(parts) - split_count += 1 - if split_count: - logger.info("build-grid session %s: split %d merged words", session_id, split_count) - except ImportError: - pass - - # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" --- - # Matches any [bracket] directly after a letter, as long as the bracket - # content doesn't look like a normal text annotation (e.g. "[adj]", "[noun]"). - _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if text and "[" in text: - fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text) - if fixed != text: - cell["text"] = fixed - - # --- SmartSpellChecker: language-aware OCR correction on all cells --- - try: - from smart_spell import SmartSpellChecker - _ssc = SmartSpellChecker() - spell_fix_count = 0 - - # Determine language per column: - # en_col_type was already detected (column with IPA = English). - # All other content columns are assumed German for vocab tables. - # For single/two-column layouts, use auto-detection. - for z in zones_data: - zone_cols = z.get("columns", []) - for cell in z.get("cells", []): - text = cell.get("text", "") - if not text or not text.strip(): - continue - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - - # Determine language for this cell - if total_cols >= 3 and en_col_type: - lang = "en" if ct == en_col_type else "de" - elif total_cols <= 2: - lang = "auto" # auto-detect for non-vocab layouts - else: - lang = "auto" - - result = _ssc.correct_text(text, lang=lang) - if result.changed: - cell["text"] = result.corrected - spell_fix_count += 1 - - if spell_fix_count: - logger.info( - "build-grid session %s: SmartSpellChecker fixed %d cells", - session_id, spell_fix_count, - ) - except ImportError: - logger.debug("SmartSpellChecker not available in build-grid") - except Exception as e: - logger.warning("SmartSpellChecker error in build-grid: %s", e) - - # --- Debug: log cell counts per column before empty-column removal --- - for z in zones_data: - if z.get("zone_type") == "content": - from collections import Counter as _Counter - _cc = _Counter(c.get("col_index") for c in z.get("cells", [])) - _cols = z.get("columns", []) - logger.info( - "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s", - z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())), - ) - - # --- Remove empty columns (no cells assigned) --- - for z in zones_data: - cells = z.get("cells", []) - used_col_indices = {c.get("col_index") for c in cells} - old_cols = z.get("columns", []) - new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices] - if len(new_cols) < len(old_cols): - # Re-index columns and cells - old_to_new = {} - for new_i, col in enumerate(new_cols): - old_i = col.get("col_index", col.get("index", new_i)) - old_to_new[old_i] = new_i - col["col_index"] = new_i - col["index"] = new_i - col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text" - for cell in cells: - old_ci = cell.get("col_index", 0) - cell["col_index"] = old_to_new.get(old_ci, old_ci) - cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text" - z["columns"] = new_cols - - # Clean up internal flags before returning - for z in zones_data: - for cell in z.get("cells", []): - cell.pop("_ipa_corrected", None) - - result = { - "session_id": session_id, - "image_width": img_w, - "image_height": img_h, - "zones": zones_data, - "boxes_detected": boxes_detected, - "summary": { - "total_zones": len(zones_data), - "total_columns": total_columns, - "total_rows": total_rows, - "total_cells": total_cells, - "total_words": len(all_words), - "recovered_colored": recovered_count, - "color_stats": color_stats, - }, - "formatting": { - "bold_columns": [], - "header_rows": [], - }, - "layout_metrics": { - "page_width_px": img_w, - "page_height_px": img_h, - "avg_row_height_px": round(avg_row_height, 1), - "font_size_suggestion_px": font_size_suggestion, - }, - "dictionary_detection": { - "is_dictionary": dict_detection.get("is_dictionary", False), - "confidence": dict_detection.get("confidence", 0.0), - "signals": dict_detection.get("signals", {}), - "article_col_index": dict_detection.get("article_col_index"), - "headword_col_index": dict_detection.get("headword_col_index"), - }, - "processing_modes": { - "ipa_mode": ipa_mode, - "syllable_mode": syllable_mode, - "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, - "syllables_applied": syllable_insertions > 0, - }, - "page_number": page_number_info, - "duration_seconds": round(duration, 2), - } - - return result - - # --------------------------------------------------------------------------- # Endpoints # ---------------------------------------------------------------------------