Restructure: Move grid_* + vocab_* into packages (klausur-service)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s

grid/ package (16 files):
  grid/build/   — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/  — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:30:20 +02:00
parent 098a2ff092
commit 59c400b9aa
58 changed files with 8803 additions and 8659 deletions
+10
View File
@@ -0,0 +1,10 @@
"""
Grid package — restructured from grid_* flat modules.
Backward-compatible re-exports: consumers can still use
``from grid_build_core import ...`` etc. via the shim files in backend/.
Sub-packages:
- grid.build — grid construction pipeline (_build_grid_core and phases)
- grid.editor — FastAPI endpoints, helper functions, column/zone logic
"""
@@ -0,0 +1,11 @@
"""
Grid Build sub-package — grid construction pipeline.
Modules:
- core — _build_grid_core() main entry point
- zones — image loading, graphic/box detection, zone-aware grid building
- cleanup — junk row removal, artifact cleanup, pipe dividers
- text_ops — color annotation, heading detection, IPA correction
- cell_ops — bullet removal, garbled cells, word-box reordering
- finalize — dictionary detection, spell checking, result assembly
"""
@@ -0,0 +1,305 @@
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)
logger = logging.getLogger(__name__)
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.

    Only cells with at least two word_boxes are inspected. ``zones_data`` is
    modified in place: word_boxes are removed or merged, and the cell ``text``
    is rebuilt from the surviving word_boxes unless the cell carries the
    ``_ipa_corrected`` flag. When the net removal count is non-zero, cells
    left with neither word_boxes nor non-blank text are dropped from every
    zone.

    Args:
        zones_data: Zone dicts; each zone's ``cells`` hold ``word_boxes``
            dicts (keys read here: text, left, top, width, height, conf,
            color_name).
    """
    # Letters (incl. Latin-1/Latin Extended range) and hyphens, optionally
    # followed by trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Stand-alone symbols that are removed by rule (a2).
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            # Indices (into wbs) of word_boxes scheduled for removal.
            to_remove: set = set()
            # Rule (a): tiny coloured symbols
            # (non-black, area below 200, confidence below 85)
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)
            # Rule (a2): isolated non-alphanumeric symbols
            # (at most 2 chars, no letter/digit, listed in _REMOVE_SYMBOLS)
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)
            # Rule (b) + (c): overlap and duplicate detection
            # Only horizontally adjacent pairs (sorted by "left") are compared.
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Overlap relative to the narrower of the two boxes.
                overlap_pct = overlap / min_w if min_w > 0 else 0
                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()
                    # Syllable-split words
                    # (moderate overlap, both alphabetic -> merge, not remove)
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue
                    # High overlap with short prefix
                    # (different words, the shorter one has at most 4 letters)
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    # Overlaps of 20-40% that are not mergeable are left alone.
                    if overlap_pct <= 0.40:
                        continue
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    # Very high overlap: prefer IPA-dictionary word
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue
                    # Otherwise drop the lower-confidence box; on a tie drop
                    # the taller one (the second box when heights are equal).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)
                # Near-touching blue boxes with identical text: keep the one
                # with the higher confidence (the second on a tie).
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
            # Execute merges first (syllable-split words)
            if to_merge:
                # Maps an absorbed index to the index it was merged into, so
                # chained merges (a+b, then b+c) keep extending the first box.
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    # Never merge with a box that is already scheduled for removal.
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Trailing punctuation of the first part is dropped before joining.
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Union bounding box of both parts.
                    # NOTE(review): direct indexing assumes left/top/width/height
                    # are present on every word_box — confirm against producer.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # The absorbed box is not an artifact: compensate for the
                    # len(to_remove) added to the counter below.
                    bullet_removed -= 1
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # IPA-corrected cell text is kept as is.
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    if bullet_removed:
        # Drop cells that ended up with no word_boxes and no visible text.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
_COMMON_SHORT_WORDS = {
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
"die", "der", "das", "dem", "den", "des", "ein", "und",
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
"on", "or", "so", "to", "up", "us", "we",
"the", "and", "but", "for", "not",
}
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
artifact_cells_removed = 0
for z in zones_data:
before = len(z.get("cells", []))
kept = []
for cell in z.get("cells", []):
text = (cell.get("text") or "").strip()
core = text.rstrip(".,;:!?'\"")
is_artifact = False
if not core:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '', ''):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
and not re.match(r'^[pPsS]\.?\d+$', core)):
is_artifact = True
if is_artifact:
kept.append(None)
else:
kept.append(cell)
z["cells"] = [c for c in kept if c is not None]
artifact_cells_removed += before - len(z["cells"])
if artifact_cells_removed:
for z in zones_data:
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Bring every cell's word_boxes into reading order (Step 5j).

    Cells with fewer than two word_boxes are left alone. For the others the
    word_boxes are grouped into lines (15 px vertical tolerance) and flattened
    line by line; the cell is only rewritten when that sequence differs from
    the stored one.

    Args:
        zones_data: Zone dicts, modified in place.
    """
    cells_changed = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            boxes = cell.get("word_boxes") or []
            if len(boxes) < 2:
                continue
            in_reading_order: List[Dict[str, Any]] = []
            for line in _group_words_into_lines(boxes, y_tolerance_px=15):
                in_reading_order.extend(line)
            # Compare by object identity, position by position.
            unchanged = len(in_reading_order) == len(boxes) and all(
                new is old for new, old in zip(in_reading_order, boxes)
            )
            if unchanged:
                continue
            cell["word_boxes"] = in_reading_order
            cells_changed += 1
    if cells_changed:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", cells_changed)
def _enforce_max_columns(
zones_data: List[Dict[str, Any]],
max_columns: int,
) -> None:
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
for z in zones_data:
if z.get("zone_type") != "content":
continue
cols = z.get("columns", [])
cells = z.get("cells", [])
if len(cols) <= max_columns:
continue
logger.info(
"max_columns=%d: zone %s has %d columns -> merging",
max_columns, z.get("zone_index"), len(cols),
)
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
while len(cols) > max_columns:
narrowest = cols_by_width.pop(0)
ni = narrowest["index"]
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
if pos + 1 < len(sorted_by_x):
merge_target = sorted_by_x[pos + 1]
elif pos > 0:
merge_target = sorted_by_x[pos - 1]
else:
break
ti = merge_target["index"]
merge_target["x_min_px"] = min(
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
)
merge_target["x_max_px"] = max(
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
)
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
for cell in cells:
if cell.get("col_index") == ni:
cell["col_index"] = ti
existing = next(
(c for c in cells if c["col_index"] == ti
and c["row_index"] == cell["row_index"]
and c is not cell),
None,
)
if existing:
existing["text"] = (
(existing.get("text", "") + " " + cell.get("text", "")).strip()
)
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
cell["_merged"] = True
z["cells"] = [c for c in cells if not c.get("_merged")]
cells = z["cells"]
cols.remove(narrowest)
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
# Re-index columns 0..N-1
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
old_idx = col["index"]
col["index"] = new_idx
for cell in cells:
if cell.get("col_index") == old_idx:
cell["col_index"] = new_idx
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
@@ -0,0 +1,390 @@
"""
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
divider removal, connector normalization, border strip detection, and
alphabet sidebar removal.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _words_to_reading_order_text
logger = logging.getLogger(__name__)
_PIPE_RE = re.compile(r"^\|+$")
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Run the Phase 3 cleanup passes over all zones, in a fixed order.

    Order: junk rows, artifact cells, oversized word boxes, pipe dividers,
    connector columns, border strips, alphabet sidebars.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging (not used by the body shown here).

    Returns:
        The border_prefiltered flag as returned by the border-strip pass.
    """
    passes_before_border = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in passes_before_border:
        cleanup_pass(zones_data)
    flag = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return flag
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
"""Remove rows where ALL cells contain only short, low-confidence text.
Also removes 'oversized stub' rows and 'scattered debris' rows.
"""
_JUNK_CONF_THRESHOLD = 50
_JUNK_MAX_TEXT_LEN = 3
for z in zones_data:
cells = z.get("cells", [])
rows = z.get("rows", [])
if not cells or not rows:
continue
# Compute median word height across the zone for oversized detection
all_wb_heights = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
junk_row_indices = set()
for row in rows:
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if not row_cells:
continue
row_wbs = [
wb for cell in row_cells
for wb in cell.get("word_boxes") or []
]
# Rule 1: ALL word_boxes are low-conf AND short text
all_junk = True
for wb in row_wbs:
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 0)
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
all_junk = False
break
if all_junk and row_wbs:
junk_row_indices.add(ri)
continue
# Rule 2: oversized stub -- <=3 words, short total text,
# and word height > 1.8x median
if len(row_wbs) <= 3:
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
has_page_ref = any(
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
for wb in row_wbs
)
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
junk_row_indices.add(ri)
continue
# Rule 3: scattered debris -- rows with only tiny fragments
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
if longest <= 2:
junk_row_indices.add(ri)
continue
if junk_row_indices:
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
logger.info(
"build-grid: removed %d junk rows from zone %d: %s",
len(junk_row_indices), z["zone_index"],
sorted(junk_row_indices),
)
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove individual cells with a single very-short, low-conf word."""
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
artifact_ids = set()
for cell in cells:
wbs = cell.get("word_boxes") or []
if len(wbs) != 1:
continue
wb = wbs[0]
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 100)
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
artifact_ids.add(cell.get("cell_id"))
if artifact_ids:
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
logger.info(
"build-grid: removed %d artifact cells from zone %d: %s",
len(artifact_ids), z.get("zone_index", 0),
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
)
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wh = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
if not all_wh:
continue
med_h = sorted(all_wh)[len(all_wh) // 2]
oversized_threshold = med_h * 3
removed_oversized = 0
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
if len(filtered) < len(wbs):
removed_oversized += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_oversized:
z["cells"] = [c for c in cells if c.get("word_boxes")]
logger.info(
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
removed_oversized, oversized_threshold, z.get("zone_index", 0),
)
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
"""Remove pipe-character word_boxes (column divider artifacts)."""
for z in zones_data:
if z.get("vsplit_group") is not None:
continue # pipes already removed before split
removed_pipes = 0
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"build-grid: removed %d pipe-divider word_boxes from zone %d",
removed_pipes, z.get("zone_index", 0),
)
# Strip pipe chars ONLY from cell edges (OCR artifacts).
# Preserve pipes embedded in words as syllable separators.
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if "|" in text:
cleaned = text.strip("|").strip()
if cleaned != text.strip():
cell["text"] = cleaned
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Normalize narrow connector columns where OCR appends noise chars.
In synonym dictionaries a narrow column repeats the same word
(e.g. "oder") in every row. OCR sometimes appends noise chars.
"""
for z in zones_data:
cols = z.get("columns", [])
cells = z.get("cells", [])
if not cols or not cells:
continue
for col in cols:
ci = col.get("index")
col_cells = [c for c in cells if c.get("col_index") == ci]
if len(col_cells) < 3:
continue
text_counts: Dict[str, int] = {}
for c in col_cells:
t = (c.get("text") or "").strip()
if t:
text_counts[t] = text_counts.get(t, 0) + 1
if not text_counts:
continue
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
dominant_count = text_counts[dominant_text]
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
continue
fixed = 0
for c in col_cells:
t = (c.get("text") or "").strip()
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
c["text"] = dominant_text
wbs = c.get("word_boxes") or []
if len(wbs) == 1:
wbs[0]["text"] = dominant_text
fixed += 1
if fixed:
logger.info(
"build-grid: normalized %d outlier cells in connector column %d "
"(dominant='%s') zone %d",
fixed, ci, dominant_text, z.get("zone_index", 0),
)
def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
) -> bool:
    """Detect and remove page-border decoration strips.

    For each zone with at least 10 word_boxes, the word_boxes are sorted by
    their left edge and scanned from both sides for the first horizontal gap
    wider than 30 px. The words outside that gap form a candidate strip; it is
    removed only when it holds less than 20% of the zone's word_boxes. A left
    strip takes precedence over a right strip. Affected cells get their text
    rebuilt, and cells left without word_boxes and visible text are dropped.

    Args:
        zones_data: Zone dicts, modified in place.
        border_prefiltered: When true, the whole step is skipped.

    Returns:
        Updated border_prefiltered flag. In the code below the value is
        returned exactly as it was passed in, on every path.
    """
    # NOTE(review): this counter is never reset per zone, so the per-zone log
    # message below reports a running total across zones.
    border_strip_removed = 0
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        # (left, word_box, owning cell) triples for every word_box in the zone.
        all_wbs_with_cell: list = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
        # Too few words to tell a strip from content.
        if len(all_wbs_with_cell) < 10:
            continue
        all_wbs_with_cell.sort(key=lambda t: t[0])
        total = len(all_wbs_with_cell)
        # -- Left-edge scan --
        # Track the right-most edge seen so far; the first gap > 30 px between
        # it and the next word's left edge closes the left strip.
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
            )
            gap = all_wbs_with_cell[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break
        # -- Right-edge scan --
        # Walk from the right-most word towards the left; the gap is measured
        # against the right edge of the directly preceding word only.
        right_strip_count = 0
        right_gap = 0
        running_left = all_wbs_with_cell[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, all_wbs_with_cell[gi][0])
            prev_right = (
                all_wbs_with_cell[gi - 1][0]
                + all_wbs_with_cell[gi - 1][1].get("width", 0)
            )
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break
        # id()s of the word_boxes that belong to the accepted strip.
        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        # A strip is only accepted when it holds < 20% of all word_boxes;
        # at most one side is removed per zone (left wins).
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
        if not strip_wbs:
            continue
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                border_strip_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        # Drop cells that lost all word_boxes and have no visible text.
        z["cells"] = [c for c in cells
                      if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            border_strip_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )
    return border_prefiltered
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
"""Remove decorative edge columns (alphabet sidebar safety net).
Dictionary pages have A-Z letter sidebars that OCR reads as single-
character word_boxes.
"""
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell)
col_types_ordered = sorted(col_cells.keys())
if len(col_types_ordered) < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if len(edge_cells_list) < 3:
continue
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
avg_len = sum(len(t) for t in texts) / len(texts)
single_char = sum(1 for t in texts if len(t) <= 1)
single_ratio = single_char / len(texts)
if avg_len > 1.5:
continue
if single_ratio < 0.7:
continue
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
avg_len, single_ratio * 100,
)
break # only remove one edge per zone
+213
View File
@@ -0,0 +1,213 @@
"""
Grid Build Core — the main _build_grid_core() function.
Extracted from grid_editor_api.py for maintainability.
Takes merged OCR word positions and builds a structured, zone-aware grid.
The function delegates to phase-specific modules:
- zones.py (grid.build.zones) — image loading, graphic/box detection, zone grids
- cleanup.py (grid.build.cleanup) — junk rows, artifacts, pipes, border strips
- text_ops.py (grid.build.text_ops) — color, headings, IPA, page refs
- finalize.py (grid.build.finalize) — bullets, max_columns, dictionary, spelling, result
"""
import logging
import time
from typing import Any, Dict, List, Optional
from grid.editor.filters import (
_flatten_word_boxes,
_get_content_bounds,
_filter_decorative_margin,
_filter_footer_words,
_filter_header_junk,
)
from .zones import _build_zones
from .cleanup import _cleanup_zones
from .text_ops import _process_text
from .finalize import _finalize_grid
logger = logging.getLogger(__name__)
async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.

    Pipeline: validate the session's word results, filter words (confidence,
    decorative margin, footer, header junk, user exclude regions, structure
    graphics), build zones, clean them up, process text, and finalize.

    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db(). Keys read here:
            ``word_result``, ``document_category``, ``structure_result``.
        ipa_mode: "auto" (only when English headwords detected), "all"
            (force IPA on all content columns), "en" (English column only),
            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words), "en" (English only),
            "de" (German only), or "none" (skip).
        enhance: Only written to the log inside this function.
        max_columns: Forwarded to _finalize_grid.
        min_conf: When set and positive, words whose ``conf`` is below this
            value are dropped before any other filtering (a missing ``conf``
            counts as 100).

    Returns:
        StructuredGrid result dict.

    Raises:
        ValueError: If session data is incomplete.
    """
    t0 = time.time()
    # ── Phase 1: Input Validation & Word Filtering ──────────────────
    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")
    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")
    # 2a-pre. Apply min_conf filter if specified
    if min_conf and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)
    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]),
                enhance, max_columns, min_conf)
    # 2b. Filter decorative margin columns (alphabet graphics)
    # NOTE(review): the return values of the _filter_* helpers are metadata
    # only; they presumably mutate all_words in place — confirm in
    # grid.editor.filters.
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)
    # Read document_category from session
    document_category = session.get("document_category")
    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
    # 2c2. Filter OCR junk from header illustrations
    _filter_header_junk(all_words, img_h, logger, session_id)
    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = []
    if structure_result:
        for er in structure_result.get("exclude_regions", []):
            exclude_rects.append({
                "x": er["x"], "y": er["y"],
                "w": er["w"], "h": er["h"],
            })
    if exclude_rects:
        before = len(all_words)
        filtered = []
        for w in all_words:
            # A word counts as excluded when its centre point lies inside
            # any exclude rectangle.
            w_cx = w["left"] + w.get("width", 0) / 2
            w_cy = w["top"] + w.get("height", 0) / 2
            inside = any(
                er["x"] <= w_cx <= er["x"] + er["w"]
                and er["y"] <= w_cy <= er["y"] + er["h"]
                for er in exclude_rects
            )
            if not inside:
                filtered.append(w)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )
    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = []
    if structure_result:
        for g in structure_result.get("graphics", []):
            graphic_rects.append({
                "x": g["x"], "y": g["y"],
                "w": g["w"], "h": g["h"],
            })
    if graphic_rects:
        before = len(all_words)
        # Same centre-point test as for the exclude regions above.
        all_words = [
            w for w in all_words
            if not any(
                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                for gr in graphic_rects
            )
        ]
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )
    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
    # ── Phase 2: Image Processing & Zone Detection ──────────────────
    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h,
        img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]
    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
    border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)
    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )
    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
    # NOTE: the duration is measured before _finalize_grid runs, so the value
    # passed on does not include the finalize phase itself.
    duration = time.time() - t0
    result = _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
    )
    return result
@@ -0,0 +1,452 @@
"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from .cell_ops import (
_remove_bullets_and_artifacts,
_remove_garbled_cells,
_normalize_word_order,
_enforce_max_columns,
)
logger = logging.getLogger(__name__)
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.

    The steps run in the order listed in the body and operate on
    ``zones_data`` in place. The word-gap merge and the pipe auto-correction
    are best effort: any exception they raise is logged as a warning and the
    pipeline continues.

    Args:
        zones_data: Zone dicts produced by the earlier phases.
        max_columns: When set and positive, content zones are limited to this
            many columns (Step 5k).
        duration: Elapsed time measured by the caller; passed through to the
            result assembly.
        (The remaining arguments are passed through unchanged to the helper
        steps and to the result assembly.)

    Returns:
        The result dict built by _assemble_result.
    """
    # Counted before any column merging/removal below, so later steps see the
    # column total of the incoming grid.
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)
    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)
    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)
    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)
    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )
    # --- Word-gap merge ---
    # Imported lazily inside the try so a missing/broken module only disables
    # this step.
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)
    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)
    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )
    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)
    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)
    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)
    # --- Debug log cell counts per column ---
    for z in zones_data:
        if z.get("zone_type") == "content":
            from collections import Counter as _Counter
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )
    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)
    # Clean up internal flags before returning
    # (_ipa_corrected is only a pipeline-internal marker)
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)
    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
def _detect_dictionary(
zones_data: List[Dict[str, Any]],
img_w: int,
img_h: int,
document_category: Optional[str],
margin_strip_detected: bool,
) -> Dict[str, Any]:
"""Run dictionary detection on the assembled grid."""
from cv_layout import _score_dictionary_signals
dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
try:
from cv_vocab_types import ColumnGeometry
for z in zones_data:
zone_cells = z.get("cells", [])
zone_cols = z.get("columns", [])
if len(zone_cols) < 2 or len(zone_cells) < 10:
continue
pseudo_geoms = []
for col in zone_cols:
ci = col["index"]
col_cells = [c for c in zone_cells if c.get("col_index") == ci]
col_words = []
for cell in col_cells:
for wb in cell.get("word_boxes") or []:
col_words.append({
"text": wb.get("text", ""),
"conf": wb.get("conf", 0),
"top": wb.get("top", 0),
"left": wb.get("left", 0),
"height": wb.get("height", 0),
"width": wb.get("width", 0),
})
if not cell.get("word_boxes") and cell.get("text"):
col_words.append({
"text": cell["text"],
"conf": cell.get("confidence", 50),
"top": cell.get("bbox_px", {}).get("y", 0),
"left": cell.get("bbox_px", {}).get("x", 0),
"height": cell.get("bbox_px", {}).get("h", 20),
"width": cell.get("bbox_px", {}).get("w", 50),
})
col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
pseudo_geoms.append(ColumnGeometry(
index=ci, x=col.get("x_min_px", 0), y=0,
width=max(col_w, 1), height=img_h,
word_count=len(col_words), words=col_words,
width_ratio=col_w / max(img_w, 1),
))
if len(pseudo_geoms) >= 2:
dd = _score_dictionary_signals(
pseudo_geoms,
document_category=document_category,
margin_strip_detected=margin_strip_detected,
)
if dd["confidence"] > dict_detection["confidence"]:
dict_detection = dd
except Exception as e:
logger.warning("Dictionary detection failed: %s", e)
return dict_detection
def _insert_syllable_dividers(
zones_data: List[Dict[str, Any]],
img_bgr: Any,
session_id: str,
syllable_mode: str,
dict_detection: Dict[str, Any],
en_col_type: Optional[str],
all_content_cols: set,
total_cols: int,
) -> int:
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
syllable_insertions = 0
if syllable_mode == "none" or img_bgr is None:
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
return syllable_insertions
_syllable_eligible = False
if syllable_mode in ("all", "de", "en"):
_syllable_eligible = True
elif (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None):
_syllable_eligible = True
_syllable_col_filter: Optional[set] = None
if syllable_mode == "en":
_syllable_col_filter = {en_col_type} if en_col_type else set()
elif syllable_mode == "de":
if en_col_type and total_cols >= 3:
_syllable_col_filter = all_content_cols - {en_col_type}
if _syllable_eligible:
try:
from cv_syllable_detect import insert_syllable_dividers
force_syllables = (syllable_mode in ("all", "de", "en"))
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
force=force_syllables,
col_filter=_syllable_col_filter,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
return syllable_insertions
def _split_merged_words(
zones_data: List[Dict[str, Any]],
session_id: str,
) -> None:
"""Split merged words using dictionary lookup."""
try:
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
if not _SPELL_AVAILABLE:
return
split_count = 0
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if not text:
continue
parts = []
changed = False
for token in text.split():
clean = token
bracket_pos = clean.find('[')
suffix_ipa = ""
if bracket_pos > 0:
suffix_ipa = clean[bracket_pos:]
clean = clean[:bracket_pos]
suffix_punct = ""
stripped = clean.rstrip(".,!?;:'\")")
if stripped != clean:
suffix_punct = clean[len(stripped):]
clean = stripped
suffix = suffix_punct + suffix_ipa
contraction = ""
if "'" in clean and clean.index("'") >= 2:
apos_pos = clean.index("'")
contraction = clean[apos_pos:]
clean = clean[:apos_pos]
suffix = contraction + suffix
if len(clean) >= 4 and clean.isalpha():
split = _try_split_merged_word(clean)
if split:
parts.append(split + suffix)
changed = True
continue
parts.append(token)
if changed:
cell["text"] = " ".join(parts)
split_count += 1
if split_count:
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
except ImportError:
pass
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if text and "[" in text:
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
if fixed != text:
cell["text"] = fixed
def _run_spell_checker(
zones_data: List[Dict[str, Any]],
session_id: str,
en_col_type: Optional[str],
total_cols: int,
) -> None:
"""Run SmartSpellChecker on all cells."""
try:
from smart_spell import SmartSpellChecker
_ssc = SmartSpellChecker()
spell_fix_count = 0
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if not text or not text.strip():
continue
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
if total_cols >= 3 and en_col_type:
lang = "en" if ct == en_col_type else "de"
elif total_cols <= 2:
lang = "auto"
else:
lang = "auto"
result = _ssc.correct_text(text, lang=lang)
if result.changed:
cell["text"] = result.corrected
spell_fix_count += 1
if spell_fix_count:
logger.info(
"build-grid session %s: SmartSpellChecker fixed %d cells",
session_id, spell_fix_count,
)
except ImportError:
logger.debug("SmartSpellChecker not available in build-grid")
except Exception as e:
logger.warning("SmartSpellChecker error in build-grid: %s", e)
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Remove columns that have no cells assigned."""
for z in zones_data:
cells = z.get("cells", [])
used_col_indices = {c.get("col_index") for c in cells}
old_cols = z.get("columns", [])
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
if len(new_cols) < len(old_cols):
old_to_new = {}
for new_i, col in enumerate(new_cols):
old_i = col.get("col_index", col.get("index", new_i))
old_to_new[old_i] = new_i
col["col_index"] = new_i
col["index"] = new_i
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
for cell in cells:
old_ci = cell.get("col_index", 0)
cell["col_index"] = old_to_new.get(old_ci, old_ci)
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
z["columns"] = new_cols
def _assemble_result(
zones_data: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
img_w: int,
img_h: int,
session_id: str,
ipa_mode: str,
syllable_mode: str,
ipa_target_cols: set,
skip_ipa: bool,
dict_detection: Dict[str, Any],
page_number_info: Optional[Dict],
boxes_detected: int,
recovered_count: int,
duration: float,
syllable_insertions: int,
) -> dict:
"""Build the final result dict (Phase 6)."""
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
# Collect color statistics
color_stats: Dict[str, int] = {}
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
cn = wb.get("color_name", "black")
color_stats[cn] = color_stats.get(cn, 0) + 1
# Compute layout metrics
all_content_row_heights: List[float] = []
for z in zones_data:
for row in z.get("rows", []):
if not row.get("is_header", False):
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
if h > 0:
all_content_row_heights.append(h)
avg_row_height = (
sum(all_content_row_heights) / len(all_content_row_heights)
if all_content_row_heights else 30.0
)
font_size_suggestion = max(10, int(avg_row_height * 0.6))
return {
"session_id": session_id,
"image_width": img_w,
"image_height": img_h,
"zones": zones_data,
"boxes_detected": boxes_detected,
"summary": {
"total_zones": len(zones_data),
"total_columns": total_columns,
"total_rows": total_rows,
"total_cells": total_cells,
"total_words": len(all_words),
"recovered_colored": recovered_count,
"color_stats": color_stats,
},
"formatting": {
"bold_columns": [],
"header_rows": [],
},
"layout_metrics": {
"page_width_px": img_w,
"page_height_px": img_h,
"avg_row_height_px": round(avg_row_height, 1),
"font_size_suggestion_px": font_size_suggestion,
},
"dictionary_detection": {
"is_dictionary": dict_detection.get("is_dictionary", False),
"confidence": dict_detection.get("confidence", 0.0),
"signals": dict_detection.get("signals", {}),
"article_col_index": dict_detection.get("article_col_index"),
"headword_col_index": dict_detection.get("headword_col_index"),
},
"processing_modes": {
"ipa_mode": ipa_mode,
"syllable_mode": syllable_mode,
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
"syllables_applied": syllable_insertions > 0,
},
"page_number": page_number_info,
"duration_seconds": round(duration, 2),
}
@@ -0,0 +1,489 @@
"""
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
parenthesis fix, IPA phonetic correction, page ref extraction, and
slash-IPA conversion.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from cv_color_detect import detect_word_colors
from cv_ocr_engines import (
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
_lookup_ipa,
)
from grid.editor.headers import (
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
logger = logging.getLogger(__name__)
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Steps, in order: colour annotation of word boxes (only with an image),
    heading detection by colour, repair of a missing opening parenthesis,
    IPA handling (stripping all bracket groups when ``ipa_mode == "none"``,
    full correction when the page has three or more columns), single-cell
    heading detection, removal of a trailing bracket group from headings,
    page-ref/footer extraction and slash-IPA conversion.

    Cells with a missing or ``None`` ``text`` / ``word_boxes`` value are
    treated as empty rather than raising.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode.
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes") or [])
        detect_word_colors(img_bgr, all_wb)

    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)

    # 5b. Fix unmatched parentheses in cell text
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text") or ""
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text

    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")

    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue
            text = cell.get("text") or ""
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type") or ""
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)

    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)

    # 5f. Strip IPA from headings
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text") or ""
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            # Never blank a heading: keep the original if nothing remains.
            if stripped and stripped != text:
                cell["text"] = stripped

    # 5g. Extract page_ref cells and footer rows
    # Adopt the metadata dict when the helper returns one; a None return
    # leaves the incoming value untouched.
    extracted_info = _extract_page_refs_and_footers(zones_data, page_number_info)
    if extracted_info is not None:
        page_number_info = extracted_info

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
def _run_ipa_correction(
all_cells: List[Dict],
total_cols: int,
ipa_mode: str,
zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
en_col_type = None
all_content_cols: set = set()
# Detect English headword column via IPA signals
col_ipa_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
txt = cell.get("text", "") or ""
if txt.strip():
all_content_cols.add(ct)
if '[' in txt or _text_has_garbled_ipa(txt):
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
if col_ipa_count:
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
elif ipa_mode == "all":
col_cell_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
if col_cell_count:
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode
en_ipa_target_cols: set = set()
de_ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
en_ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
if en_col_type:
en_ipa_target_cols.add(en_col_type)
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
# --- Strip IPA from columns NOT in the target set ---
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
if strip_en_ipa or ipa_mode == "none":
strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
for cell in all_cells:
ct = cell.get("col_type", "")
if ct not in strip_cols:
continue
text = cell.get("text", "")
if "[" in text:
stripped = _SQUARE_BRACKET_RE.sub("", text)
if stripped != text:
cell["text"] = stripped.strip()
cell["_ipa_corrected"] = True
# --- English IPA (Britfone + eng_to_ipa) ---
if en_ipa_target_cols:
for cell in all_cells:
ct = cell.get("col_type")
if ct in en_ipa_target_cols:
cell["_orig_col_type"] = ct
cell["col_type"] = "column_en"
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells:
orig = cell.pop("_orig_col_type", None)
if orig:
cell["col_type"] = orig
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# --- German IPA (wiki-pronunciation-dict + epitran) ---
if de_ipa_target_cols:
from cv_ipa_german import insert_german_ipa
insert_german_ipa(all_cells, de_ipa_target_cols)
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
# Mark cells whose text was changed by IPA correction
for cell in all_cells:
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# 5d. Fix IPA continuation cells
skip_ipa = (ipa_mode == "none")
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
ipa_cont_fixed = 0
for z in ([] if skip_ipa else zones_data):
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
if idx == 0:
continue
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
for cell in row_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_text = (cell.get("text") or "").strip()
if not cell_text:
wb_texts = [w.get("text", "")
for w in cell.get("word_boxes", [])]
cell_text = " ".join(wb_texts).strip()
if not cell_text:
continue
is_bracketed = (
cell_text.startswith('[') and cell_text.endswith(']')
)
if is_bracketed:
if not _text_has_garbled_ipa(cell_text):
continue
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
else:
content_cells_in_row = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_cells_in_row) != 1:
continue
if not _text_has_garbled_ipa(cell_text):
continue
if any(c in _REAL_IPA_CHARS for c in cell_text):
continue
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
if len(_words_in_text) >= 3:
continue
# Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"]
prev_same_col = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == ct
]
if not prev_same_col:
continue
prev_text = prev_same_col[0].get("text", "")
fixed = fix_ipa_continuation_cell(
cell_text, prev_text, pronunciation="british",
)
if fixed != cell_text:
cell["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d %s: '%s' -> '%s'",
ri, ct, cell_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
return en_col_type, ipa_target_cols, all_content_cols
def _extract_page_refs_and_footers(
zones_data: List[Dict[str, Any]],
page_number_info: Optional[Dict],
) -> None:
"""Extract page_ref cells and footer rows from content zones.
Modifies zones_data in place. Updates page_number_info if a page number
footer is found.
"""
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
for z in zones_data:
if z.get("zone_type") != "content":
continue
cells = z.get("cells", [])
rows = z.get("rows", [])
if not rows:
continue
# Extract column_1 cells that look like page references
page_refs = []
page_ref_cell_ids = set()
for cell in cells:
if cell.get("col_type") != "column_1":
continue
text = (cell.get("text") or "").strip()
if not text:
continue
if not _PAGE_REF_RE.match(text):
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,
"bbox_pct": cell.get("bbox_pct", {}),
})
page_ref_cell_ids.add(cell.get("cell_id"))
# Detect footer: last non-header row if it has only 1 cell
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
last_row = non_header_rows[-1]
last_ri = last_row["index"]
last_cells = [c for c in z["cells"]
if c.get("row_index") == last_ri]
if len(last_cells) == 1:
text = (last_cells[0].get("text") or "").strip()
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
has_commas = ',' in text
text_words = set(text.lower().split())
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
is_page_number = len(text) <= 20 or is_written_number
if (text and not has_real_ipa and not has_commas
and is_page_number
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Classify footer rows
page_number_footers = []
other_footers = []
for fr in footer_rows:
ft = fr["text"].strip()
digits = "".join(c for c in ft if c.isdigit())
if digits and re.match(r'^[\d\s.]+$', ft):
page_number_footers.append(fr)
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
page_number_footers.append(fr)
else:
other_footers.append(fr)
# Remove page-number footer rows from grid entirely
if page_number_footers:
pn_ris = {fr["row_index"] for fr in page_number_footers}
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
pn_text = page_number_footers[0]["text"].strip()
pn_digits = "".join(c for c in pn_text if c.isdigit())
if not page_number_info:
page_number_info = {
"text": pn_text,
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
}
if pn_digits:
page_number_info["number"] = int(pn_digits)
# Mark remaining footer rows
if other_footers:
footer_ris = {fr["row_index"] for fr in other_footers}
for r in z["rows"]:
if r["index"] in footer_ris:
r["is_footer"] = True
for c in z["cells"]:
if c.get("row_index") in footer_ris:
c["col_type"] = "footer"
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
len(page_refs), len(footer_rows), len(page_number_footers),
z.get("zone_index", 0),
)
if page_refs:
z["page_refs"] = page_refs
if other_footers:
z["footer"] = other_footers
def _convert_slash_ipa(
zones_data: List[Dict[str, Any]],
skip_ipa: bool,
en_col_type: Optional[str],
) -> None:
"""Convert slash-delimited IPA to bracket notation.
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
"""
_SLASH_IPA_RE = re.compile(
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
slash_ipa_fixed = 0
for z in ([] if skip_ipa else zones_data):
for cell in z.get("cells", []):
if en_col_type and cell.get("col_type") != en_col_type:
continue
text = cell.get("text", "")
if "/" not in text:
continue
def _replace_slash_ipa(m: re.Match) -> str:
nonlocal slash_ipa_fixed
headword = m.group(1)
ocr_ipa = m.group(2)
inner_raw = ocr_ipa.strip("/").strip()
if _SLASH_IPA_REJECT_RE.search(inner_raw):
return m.group(0)
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
if ipa:
slash_ipa_fixed += 1
return f"{headword} [{ipa}]"
inner = inner_raw.lstrip("'").strip()
if inner:
slash_ipa_fixed += 1
return f"{headword} [{inner}]"
return m.group(0)
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
def _replace_trailing_slash(m: re.Match) -> str:
nonlocal slash_ipa_fixed
inner = m.group(1).strip("/").strip().lstrip("'").strip()
if _SLASH_IPA_REJECT_RE.search(inner):
return m.group(0)
if inner:
slash_ipa_fixed += 1
return f" [{inner}]"
return m.group(0)
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip()
if not _SLASH_IPA_REJECT_RE.search(inner):
inner = inner.lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
if new_text != text:
cell["text"] = new_text
if slash_ipa_fixed:
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
+464
View File
@@ -0,0 +1,464 @@
"""
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
detection and zone-aware grid building.
Extracted from grid_build_core.py for maintainability.
"""
import logging
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_color_detect import recover_colored_text
from cv_vocab_types import PageZone
from ocr_pipeline_session_store import get_session_image
from grid.editor.filters import (
_filter_border_strip_words,
_filter_border_ghosts,
_words_in_zone,
)
from grid.editor.zones import (
_PIPE_RE_VSPLIT,
_detect_vertical_dividers,
_split_zone_at_vertical_dividers,
_merge_content_zones_across_boxes,
_build_zone_grid,
)
logger = logging.getLogger(__name__)
async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    The session image is loaded ("cropped", then "dewarped", then "original")
    and decoded.  With a decoded image, graphic regions are detected and the
    words inside them dropped, coloured text missed by OCR is recovered, and
    bordered boxes are detected.  When boxes exist the page is split into
    zones and one grid is built per zone; otherwise a single content zone
    covering the whole content area is built from all words.

    ``all_words`` and ``graphic_rects`` are modified in place (words removed
    or added, newly detected graphic rectangles appended).

    Args:
        session_id: Session whose image is loaded; also used in log output.
        session: Session dict (not read by this function).
        all_words: OCR word dicts with ``left``/``top`` and optional
            ``width``/``height``/``text``.
        graphic_rects: Known graphic rectangles (``x``, ``y``, ``w``, ``h``).
        content_x: Left edge of the content area in pixels.
        content_y: Top edge of the content area in pixels.
        content_w: Width of the content area in pixels.
        content_h: Height of the content area in pixels.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        A dict with keys ``zones_data``, ``boxes_detected``,
        ``recovered_count``, ``border_prefiltered`` and ``img_bgr``.
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None

    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")

    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)

        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words inside newly detected graphic regions
                # (a word is "inside" when its centre point is).
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )

            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )

            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)

            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )

                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )

                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )

                # 3b. Detect vertical dividers and split content zones
                # (the second tuple element is not used here).
                page_zones, _ = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )

                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )

                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )

                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # Internal helper data, not part of the result.
                    grid.pop("_raw_columns", None)

                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }

                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }

                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays

                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group

                    zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected -> single zone with all words
    if not zones_data:
        # Drop recovered words of at most two characters.
        before = len(all_words)
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )

        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )

        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # Count the words the grid was actually built from, matching the
            # per-zone entries above which count their filtered zone words.
            "word_count": len(filtered_words),
            **grid,
        })

    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
def _detect_and_split_vertical_dividers(
page_zones: List[PageZone],
all_words: List[Dict[str, Any]],
) -> tuple:
"""Detect vertical dividers and split content zones.
Returns (expanded_zones, border_prefiltered_from_vsplit).
"""
vsplit_group_counter = 0
expanded_zones: List = []
for pz in page_zones:
if pz.zone_type != "content":
expanded_zones.append(pz)
continue
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
divider_xs = _detect_vertical_dividers(
zone_words, pz.x, pz.width, pz.y, pz.height
)
if divider_xs:
sub_zones = _split_zone_at_vertical_dividers(
pz, divider_xs, vsplit_group_counter
)
expanded_zones.extend(sub_zones)
vsplit_group_counter += 1
# Remove pipe words so they don't appear in sub-zones
pipe_ids = set(
id(w) for w in zone_words
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
)
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
logger.info(
"build-grid: vertical split zone %d at x=%s -> %d sub-zones",
pz.index, [int(x) for x in divider_xs], len(sub_zones),
)
else:
expanded_zones.append(pz)
# Re-index zones
for i, pz in enumerate(expanded_zones):
pz.index = i
return expanded_zones, False
def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build a grid for every zone independently (first pass).

    For each zone the words inside its bounds are collected and filtered
    (short recovered artifacts, words centred inside an image overlay,
    border-strip words) before the zone grid is built.

    Args:
        page_zones: Zones to process, in order.
        all_words: OCR word dicts for the whole page.
        img_w: Image width, forwarded to ``_build_zone_grid``.
        img_h: Image height, forwarded to ``_build_zone_grid``.

    Returns:
        One dict per zone with the keys ``"pz"`` (the zone), ``"words"``
        (the filtered words), ``"grid"`` (the built grid) and
        ``"_border_prefiltered"`` (True when border-strip words were
        removed from that zone).
    """
    zone_grids: List[Dict] = []

    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )

        # Filter recovered artifacts of at most two characters in ALL zones.
        # ``or ""`` (instead of a .get default) also covers a ``text`` key
        # that is present but None, matching the other word filters.
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len((w.get("text") or "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )

        # Filter words whose centre lies inside an image overlay region
        # (merged box zones).
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )

        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        border_prefiltered = bool(bs_removed)
        if border_prefiltered:
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )

        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": border_prefiltered,
        })

    return zone_grids
def _merge_content_zone_columns(
zone_grids: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
content_w: int,
img_w: int,
img_h: int,
session_id: str,
) -> None:
"""Second pass: merge column boundaries from all content zones.
Modifies zone_grids in place.
"""
content_zones = [
zg for zg in zone_grids
if zg["pz"].zone_type == "content"
and zg["pz"].vsplit_group is None
]
if len(content_zones) <= 1:
return
# Collect column split points (x_min of non-first columns)
all_split_xs: List[float] = []
for zg in content_zones:
raw_cols = zg["grid"].get("_raw_columns", [])
for col in raw_cols[1:]:
all_split_xs.append(col["x_min"])
if not all_split_xs:
return
all_split_xs.sort()
merge_distance = max(25, int(content_w * 0.03))
merged_xs = [all_split_xs[0]]
for x in all_split_xs[1:]:
if x - merged_xs[-1] < merge_distance:
merged_xs[-1] = (merged_xs[-1] + x) / 2
else:
merged_xs.append(x)
total_cols = len(merged_xs) + 1
max_zone_cols = max(
len(zg["grid"].get("_raw_columns", []))
for zg in content_zones
)
if total_cols < max_zone_cols:
return
cx_min = min(w["left"] for w in all_words)
cx_max = max(w["left"] + w["width"] for w in all_words)
merged_columns: List[Dict[str, Any]] = []
prev_x = cx_min
for i, sx in enumerate(merged_xs):
merged_columns.append({
"index": i,
"type": f"column_{i + 1}",
"x_min": prev_x,
"x_max": sx,
})
prev_x = sx
merged_columns.append({
"index": len(merged_xs),
"type": f"column_{len(merged_xs) + 1}",
"x_min": prev_x,
"x_max": cx_max,
})
# Re-build ALL content zones with merged columns
for zg in zone_grids:
pz = zg["pz"]
if pz.zone_type == "content":
grid = _build_zone_grid(
zg["words"], pz.x, pz.y,
pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=merged_columns,
skip_first_row_header=bool(pz.image_overlays),
)
zg["grid"] = grid
logger.info(
"build-grid session %s: union of %d content "
"zones -> %d merged columns (max single zone: %d)",
session_id, len(content_zones),
total_cols, max_zone_cols,
)
@@ -0,0 +1,15 @@
"""
Grid Editor sub-package — FastAPI endpoints and helper functions.
Modules:
- api — barrel re-export (combined router + _build_grid_core)
- api_grid — build-grid, save-grid, get-grid endpoints
- api_gutter — gutter-repair endpoints
- api_box — build-box-grids endpoints
- api_unified — build-unified-grid endpoints
- helpers — barrel re-export of all helper symbols
- columns — column detection, cross-column splitting
- filters — word/zone filtering, border ghosts
- headers — header/heading detection, colspan detection
- zones — vertical dividers, zone splitting/merging
"""
@@ -0,0 +1,31 @@
"""
Grid Editor API — barrel re-export.
The actual endpoints live in:
- api_grid.py (build-grid, rerun-ocr-and-build-grid, save-grid, grid-editor)
- api_gutter.py (gutter-repair, gutter-repair/apply)
- api_box.py (build-box-grids)
- api_unified.py (build-unified-grid, unified-grid)
This module re-exports the combined router and key symbols so that
existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core`
continue to work unchanged.
"""
from fastapi import APIRouter
from .api_grid import router as _grid_router
from .api_gutter import router as _gutter_router
from .api_box import router as _box_router
from .api_unified import router as _unified_router
# Re-export _build_grid_core so callers that do
# `from grid_editor_api import _build_grid_core` keep working.
from grid.build.core import _build_grid_core # noqa: F401
# Combined router: mount every sub-router in the original registration order
# (grid, gutter, box, unified).
router = APIRouter()
for _sub_router in (_grid_router, _gutter_router, _box_router, _unified_router):
    router.include_router(_sub_router)
del _sub_router  # keep the module namespace free of the loop variable
@@ -0,0 +1,177 @@
"""
Grid Editor API — box-grid-review endpoints.
"""
import logging
from fastapi import APIRouter, HTTPException, Request
from .filters import _words_in_zone
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-box-grids")
async def build_box_grids(session_id: str, request: Request):
    """Rebuild grid structure for all detected boxes with layout-aware detection.

    Uses structure_result.boxes (from Step 7) as the source of box coordinates,
    and raw_paddle_words (falling back to raw_tesseract_words) as OCR word
    source.  Existing box zones in grid_editor_result are dropped and rebuilt,
    then the updated grid is persisted.

    Optional body: { "overrides": { "0": "bullet_list" } }
    Maps box_index -> forced layout_type.  A missing, malformed or
    non-object body (or a non-object "overrides" value) means "no overrides".

    Returns:
        Dict with ``session_id``, ``box_zones_rebuilt``, ``spell_fixes`` and —
        when boxes were detected — ``total_detected_boxes`` and ``zones``.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if there is no
            grid data or no raw OCR words.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Get raw OCR words (with top/left/width/height keys)
    word_result = session.get("word_result") or {}
    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
    if not all_words:
        raise HTTPException(status_code=400, detail="No raw OCR words available.")

    # Get detected boxes from structure_result (session first, then ground truth)
    structure_result = session.get("structure_result") or {}
    gt = session.get("ground_truth") or {}
    if not structure_result:
        structure_result = gt.get("structure_result") or {}
    detected_boxes = structure_result.get("boxes") or []
    if not detected_boxes:
        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}

    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0)
    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0)

    # Filter out false-positive boxes in header/footer margins.
    if img_h > 0:
        margin_frac = 0.07  # 7% of image height
        margin_top = img_h * margin_frac
        margin_bottom = img_h * (1 - margin_frac)
        filtered = []
        for box in detected_boxes:
            by = box.get("y", 0)
            bh = box.get("h", 0)
            box_center_y = by + bh / 2
            if box_center_y < margin_top or box_center_y > margin_bottom:
                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
                            by, bh, box_center_y, margin_top, margin_bottom)
                continue
            filtered.append(box)
        detected_boxes = filtered

    # The body is optional: an empty or unparsable body is deliberately
    # treated as "no overrides" rather than as an error.
    try:
        body = await request.json()
    except Exception:
        body = None
    layout_overrides = body.get("overrides") if isinstance(body, dict) else None
    if not isinstance(layout_overrides, dict):
        # Covers {"overrides": null} and non-object bodies such as [] or 1.
        layout_overrides = {}

    from cv_box_layout import build_box_zone_grid

    zones = grid_data.get("zones", [])

    # Find highest existing zone_index (taken before old box zones are removed)
    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)

    # Remove old box zones (we'll rebuild them)
    zones = [z for z in zones if z.get("zone_type") != "box"]

    box_count = 0
    spell_fixes = 0
    spell_checker = None  # created on first use, then shared by all boxes

    for box_idx, box in enumerate(detected_boxes):
        bx = box.get("x", 0)
        by = box.get("y", 0)
        bw = box.get("w", 0)
        bh = box.get("h", 0)

        if bw <= 0 or bh <= 0:
            continue

        # Filter raw OCR words inside this box
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
            continue

        zone_idx = max_zone_idx + 1 + box_idx
        forced_layout = layout_overrides.get(str(box_idx))

        # Build box grid
        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )

        # Apply SmartSpellChecker to all box cells.  Spell checking is
        # best-effort: if the module cannot be imported it is skipped.
        try:
            if spell_checker is None:
                from smart_spell import SmartSpellChecker
                spell_checker = SmartSpellChecker()
            for cell in box_grid.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                result = spell_checker.correct_text(text, lang="auto")
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fixes += 1
        except ImportError:
            pass

        # Build zone entry
        zone_entry = {
            "zone_index": zone_idx,
            "zone_type": "box",
            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
            "bbox_pct": {
                "x": round(bx / img_w * 100, 2) if img_w else 0,
                "y": round(by / img_h * 100, 2) if img_h else 0,
                "w": round(bw / img_w * 100, 2) if img_w else 0,
                "h": round(bh / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(zone_words),
            "columns": box_grid["columns"],
            "rows": box_grid["rows"],
            "cells": box_grid["cells"],
            "header_rows": box_grid.get("header_rows", []),
            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
            "box_grid_reviewed": False,
            "box_bg_color": box.get("bg_color_name", ""),
            "box_bg_hex": box.get("bg_color_hex", ""),
        }
        zones.append(zone_entry)
        box_count += 1

    # Sort zones by y-position for correct reading order
    zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0))

    grid_data["zones"] = zones
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
        session_id, box_count, spell_fixes, len(detected_boxes),
    )

    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "total_detected_boxes": len(detected_boxes),
        "spell_fixes": spell_fixes,
        "zones": zones,
    }
@@ -0,0 +1,334 @@
"""
Grid Editor API — grid build, save, and retrieve endpoints.
"""
import logging
from fastapi import APIRouter, HTTPException, Query, Request
from grid.build.core import _build_grid_core
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
):
    """Build a structured, zone-aware grid from the session's existing word results.

    The grid itself is produced by ``_build_grid_core``; this endpoint
    additionally stores an automatic snapshot of the result under
    ``ground_truth["auto_grid_snapshot"]`` (for later comparison with manual
    corrections), persists the grid and sets ``current_step`` to 11.

    Query params:
        ipa_mode / syllable_mode: one of ``auto``, ``all``, ``de``, ``en``,
            ``none`` (enforced by the query pattern); passed through to
            ``_build_grid_core``.
        enhance: passed through to ``_build_grid_core``.
        max_cols: maximum column count; values <= 0 mean "no limit".
        min_conf: minimum OCR confidence; values <= 0 mean "automatic".

    Returns:
        The grid dict returned by ``_build_grid_core``.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if
            ``_build_grid_core`` raises ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Non-positive query values are the "unset" markers for the core builder.
    column_limit = max_cols if max_cols > 0 else None
    conf_floor = min_conf if min_conf > 0 else None

    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=column_limit,
            min_conf=conf_floor,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Lazy import to avoid a circular dependency with ocr_pipeline_regression.
    from ocr_pipeline_regression import _build_reference_snapshot

    # Map the OCR engine recorded on the session to a pipeline label.
    engine = (session.get("word_result") or {}).get("ocr_engine", "")
    if engine == "paddle_direct":
        auto_pipeline = "paddle-direct"
    elif engine in ("kombi", "rapid_kombi"):
        auto_pipeline = "kombi"
    else:
        auto_pipeline = "pipeline"

    auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline)
    ground_truth = session.get("ground_truth") or {}
    ground_truth["auto_grid_snapshot"] = auto_snapshot

    # Persist to DB and advance current_step to 11 (reconstruction complete).
    await update_session_db(
        session_id,
        grid_editor_result=result,
        ground_truth=ground_truth,
        current_step=11,
    )

    summary = result.get("summary", {})
    logger.info(
        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
        "%d boxes in %.2fs",
        session_id,
        len(result.get("zones", [])),
        summary.get("total_columns", 0),
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
        result.get("boxes_detected", 0),
        result.get("duration_seconds", 0),
    )

    return result
def _rerun_parse_tess_conf(raw) -> int:
    """Return a Tesseract confidence as int, or -1 when it is not numeric.

    Accepts integer values as well as float-formatted ones (e.g. ``"96.06"``),
    which are truncated towards zero.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def _rerun_word_boxes(words):
    """Reduce OCR word dicts to the text/geometry/confidence fields stored in word_result."""
    return [
        {
            "text": w["text"], "left": w["left"], "top": w["top"],
            "width": w["width"], "height": w["height"], "conf": w.get("conf", 0),
        }
        for w in words
    ]


@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
async def rerun_ocr_and_build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
    """Re-run OCR with quality settings, then rebuild the grid.

    Unlike build-grid (which only rebuilds from existing words), this
    endpoint re-runs OCR on the cached cropped (or dewarped) image, stores
    the new ``word_result`` on the session and then builds the grid.

    Steps executed: scan quality assessment -> optional image enhancement ->
    RapidOCR + Tesseract -> merge -> optional Vision-LLM fusion -> grid build.
    Scan quality, enhancement, RapidOCR and vision fusion are best-effort:
    a failure is logged and the pipeline continues without that step.

    Returns:
        The grid dict from ``_build_grid_core`` extended with
        ``scan_quality`` and ``ocr_stats``.

    Raises:
        HTTPException: 404 if the session does not exist (initially or after
            the word result was stored); 400 if no image is available or
            ``_build_grid_core`` raises ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    import time as _time
    t0 = _time.time()

    # 1. Load the cropped/dewarped image from cache or session
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    dewarped_bgr = cached.get("cropped_bgr")
    if dewarped_bgr is None:
        dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")

    img_h, img_w = dewarped_bgr.shape[:2]
    ocr_input = dewarped_bgr.copy()

    # 2. Scan quality assessment
    scan_quality_info = {}
    try:
        from scan_quality import score_scan_quality
        quality_report = score_scan_quality(ocr_input)
        scan_quality_info = quality_report.to_dict()
        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
    except Exception as e:
        logger.warning("rerun-ocr: scan quality failed: %s", e)
        actual_min_conf = min_conf if min_conf > 0 else 40

    # 3. Image enhancement (Step 3) — only requested for degraded scans.
    is_degraded = scan_quality_info.get("is_degraded", False)
    enhance_applied = False  # set only when enhancement actually succeeded
    if enhance and is_degraded:
        try:
            from ocr_image_enhance import enhance_for_ocr
            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
            enhance_applied = True
            logger.info("rerun-ocr: CLAHE enhancement applied")
        except Exception as e:
            logger.warning("rerun-ocr: enhancement failed: %s", e)

    # 4. Run dual-engine OCR
    from PIL import Image
    import pytesseract

    # RapidOCR
    rapid_words = []
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
    except Exception as e:
        logger.warning("rerun-ocr: RapidOCR failed: %s", e)

    # Tesseract (channel order is reversed before handing the array to PIL)
    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
    tess_words = []
    for i, raw_text in enumerate(data["text"]):
        text = (raw_text or "").strip()
        conf = _rerun_parse_tess_conf(data["conf"][i])
        if not text or conf < actual_min_conf:
            continue
        tess_words.append({
            "text": text, "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i], "conf": conf,
        })

    # 5. Merge OCR results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words

    # 6. Store updated word_result in session
    word_result = {
        "cells": [{"text": " ".join(w["text"] for w in merged_words),
                   "word_boxes": _rerun_word_boxes(merged_words)}],
        "image_width": img_w,
        "image_height": img_h,
        "ocr_engine": "rapid_kombi",
        "word_count": len(merged_words),
        "raw_paddle_words": rapid_words,
    }

    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
    vision_applied = False
    if vision_fusion:
        try:
            from vision_ocr_fusion import vision_fuse_ocr
            category = doc_category or session.get("document_category") or "vokabelseite"
            logger.info("rerun-ocr: running Vision-LLM fusion (category=%s)", category)
            merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
            vision_applied = True
            # Rebuild storage from fused words
            word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                     "word_boxes": _rerun_word_boxes(merged_words)}]
            word_result["word_count"] = len(merged_words)
            word_result["ocr_engine"] = "vision_fusion"
        except Exception as e:
            logger.warning("rerun-ocr: Vision-LLM fusion failed: %s", e)

    await update_session_db(session_id, word_result=word_result)

    # Reload session with updated word_result
    session = await get_session_db(session_id)
    if not session:
        # The session disappeared while OCR was running.
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ocr_duration = _time.time() - t0
    logger.info(
        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
        "(enhance=%s, min_conf=%d, quality=%s)",
        session_id, len(merged_words), len(rapid_words), len(tess_words),
        len(merged_words), ocr_duration, enhance, actual_min_conf,
        scan_quality_info.get("quality_pct", "?"),
    )

    # 7. Build grid from new words
    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Persist grid
    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    # Add quality info to response
    result["scan_quality"] = scan_quality_info
    result["ocr_stats"] = {
        "rapid_words": len(rapid_words),
        "tess_words": len(tess_words),
        "merged_words": len(merged_words),
        "min_conf_used": actual_min_conf,
        "enhance_applied": enhance_applied,
        "vision_fusion_applied": vision_applied,
        "document_category": doc_category or session.get("document_category", ""),
        "ocr_duration_seconds": round(ocr_duration, 1),
    }

    total_duration = _time.time() - t0
    logger.info(
        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
        session_id,
        len(result.get("zones", [])),
        result.get("summary", {}).get("total_columns", 0),
        result.get("summary", {}).get("total_cells", 0),
        total_duration,
    )

    return result
@router.post("/sessions/{session_id}/save-grid")
async def save_grid(session_id: str, request: Request):
    """Save edited grid data from the frontend editor.

    Receives the full grid with user edits and persists it as the session's
    ``grid_editor_result`` (marked ``edited=True``), setting ``current_step``
    to 11.  Fields missing from the body (``image_width``, ``image_height``,
    ``boxes_detected``, ``summary``, ``formatting``) fall back to the values
    of the previously stored grid; ``duration_seconds`` is always taken from
    the stored grid.

    Returns:
        ``{"session_id": ..., "saved": True}``

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not valid JSON, is not a JSON object, or has no ``zones`` key.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Malformed JSON is a client error, not a server error.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc

    # Validate basic structure (a list/str/number body has no 'zones' either)
    if not isinstance(body, dict) or "zones" not in body:
        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")

    # Preserve metadata from the original build
    existing = session.get("grid_editor_result") or {}
    result = {
        "session_id": session_id,
        "image_width": body.get("image_width", existing.get("image_width", 0)),
        "image_height": body.get("image_height", existing.get("image_height", 0)),
        "zones": body["zones"],
        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
        "summary": body.get("summary", existing.get("summary", {})),
        "formatting": body.get("formatting", existing.get("formatting", {})),
        "duration_seconds": existing.get("duration_seconds", 0),
        "edited": True,
    }

    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"]))

    return {"session_id": session_id, "saved": True}
@router.get("/sessions/{session_id}/grid-editor")
async def get_grid(session_id: str):
    """Return the stored ``grid_editor_result`` of a session.

    Raises:
        HTTPException: 404 if the session does not exist or has no grid
            editor data yet.
    """
    record = await get_session_db(session_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_state = record.get("grid_editor_result")
    if grid_state:
        return grid_state

    raise HTTPException(
        status_code=404,
        detail="No grid editor data. Run build-grid first.",
    )
@@ -0,0 +1,110 @@
"""
Grid Editor API — gutter repair endpoints.
"""
import logging
from fastapi import APIRouter, HTTPException, Request
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/gutter-repair")
async def gutter_repair(session_id: str):
    """Analyse the session's grid for gutter-edge OCR errors.

    Runs ``analyse_grid_for_gutter_repair`` on the stored grid and returns
    its repair suggestions.  Intended to detect:
    - Words truncated/blurred at the book binding (spell_fix)
    - Words split across rows with missing hyphen chars (hyphen_join)

    The analysis result is also stored under ``ground_truth["gutter_repair"]``.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if it has no
            grid data.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid = session.get("grid_editor_result")
    if not grid:
        raise HTTPException(
            status_code=400,
            detail="No grid data. Run build-grid first.",
        )

    from cv_gutter_repair import analyse_grid_for_gutter_repair

    analysis = analyse_grid_for_gutter_repair(
        grid, image_width=grid.get("image_width", 0)
    )

    # Suggestions live inside ground_truth (avoids a DB migration).
    ground_truth = session.get("ground_truth") or {}
    ground_truth["gutter_repair"] = analysis
    await update_session_db(session_id, ground_truth=ground_truth)

    stats = analysis.get("stats", {})
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        stats.get("suggestions_found", 0),
        analysis.get("duration_seconds", 0),
    )

    return analysis
@router.post("/sessions/{session_id}/gutter-repair/apply")
async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body: {
        "accepted": ["suggestion_id_1", "suggestion_id_2", ...],
        "text_overrides": {"suggestion_id_1": "alternative text"}   (optional)
    }

    ``text_overrides`` lets the user pick a different correction for a
    suggestion; a non-empty override replaces that suggestion's
    ``suggested_text`` before the suggestions are applied.  The updated grid
    is saved back to the session.

    Returns:
        The result of ``apply_gutter_suggestions``, or
        ``{"applied_count": 0, "changes": []}`` when nothing was accepted.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if grid data or
            gutter repair data is missing, or the body is not a JSON object.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")

    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )

    # Malformed or non-object bodies are client errors, not server errors.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    accepted_ids = body.get("accepted") or []
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}

    # text_overrides: { suggestion_id: "alternative_text" }
    # A missing, null or non-object value means "no overrides".
    text_overrides = body.get("text_overrides")
    if not isinstance(text_overrides, dict):
        text_overrides = {}

    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])

    # Apply user-selected alternatives before passing to apply
    for s in suggestions:
        override = text_overrides.get(s.get("id", ""))
        if override:
            s["suggested_text"] = override

    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )

    return result
@@ -0,0 +1,71 @@
"""
Grid Editor API — unified grid endpoints.
"""
import logging
from fastapi import APIRouter, HTTPException
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
@router.post("/sessions/{session_id}/build-unified-grid")
async def build_unified_grid_endpoint(session_id: str):
    """Build a single unified grid from the session's multi-zone grid.

    Passes the zones, image size and layout metrics of the stored
    ``grid_editor_result`` to ``build_unified_grid`` and stores the outcome
    as ``unified_grid_result``; the original multi-zone grid is left
    untouched.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if it has no
            grid data.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    source_grid = session.get("grid_editor_result")
    if not source_grid:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    from unified_grid import build_unified_grid

    unified = build_unified_grid(
        zones=source_grid.get("zones", []),
        image_width=source_grid.get("image_width", 0),
        image_height=source_grid.get("image_height", 0),
        layout_metrics=source_grid.get("layout_metrics", {}),
    )

    # Stored in its own field so the multi-zone grid is not overwritten.
    await update_session_db(session_id, unified_grid_result=unified)

    summary = unified.get("summary", {})
    logger.info(
        "build-unified-grid session %s: %d rows, %d cells",
        session_id,
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
    )

    return unified
@router.get("/sessions/{session_id}/unified-grid")
async def get_unified_grid(session_id: str):
    """Return the stored ``unified_grid_result`` of a session.

    Raises:
        HTTPException: 404 if the session does not exist or no unified grid
            has been built yet.
    """
    record = await get_session_db(session_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    unified = record.get("unified_grid_result")
    if unified:
        return unified

    raise HTTPException(
        status_code=404,
        detail="No unified grid. Run build-unified-grid first.",
    )
@@ -0,0 +1,492 @@
"""
Grid Editor — column detection, cross-column splitting, marker merging.
Split from grid_editor_helpers.py for maintainability.
All functions are pure computation — no HTTP, DB, or session side effects.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Cross-column word splitting
# ---------------------------------------------------------------------------
_spell_cache: Optional[Any] = None
_spell_loaded = False
def _is_recognized_word(text: str) -> bool:
"""Check if *text* is a recognized German or English word.
Uses the spellchecker library (same as cv_syllable_detect.py).
Returns True for real words like "oder", "Kabel", "Zeitung".
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
"""
global _spell_cache, _spell_loaded
if not text or len(text) < 2:
return False
if not _spell_loaded:
_spell_loaded = True
try:
from spellchecker import SpellChecker
_spell_cache = SpellChecker(language="de")
except Exception:
pass
if _spell_cache is None:
return False
return text.lower() in _spell_cache
def _split_cross_column_words(
words: List[Dict],
columns: List[Dict],
) -> List[Dict]:
"""Split word boxes that span across column boundaries.
When OCR merges adjacent words from different columns (e.g. "sichzie"
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
split the word box at the column boundary so each piece is assigned
to the correct column.
Only splits when:
- The word has significant overlap (>15% of its width) on both sides
- AND the word is not a recognized real word (OCR merge artifact), OR
the word contains a case transition (lowercase->uppercase) near the
boundary indicating two merged words like "dasZimmer".
"""
if len(columns) < 2:
return words
# Column boundaries = midpoints between adjacent column edges
boundaries = []
for i in range(len(columns) - 1):
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
boundaries.append(boundary)
new_words: List[Dict] = []
split_count = 0
for w in words:
w_left = w["left"]
w_width = w["width"]
w_right = w_left + w_width
text = (w.get("text") or "").strip()
if not text or len(text) < 4 or w_width < 10:
new_words.append(w)
continue
# Find the first boundary this word straddles significantly
split_boundary = None
for b in boundaries:
if w_left < b < w_right:
left_part = b - w_left
right_part = w_right - b
# Both sides must have at least 15% of the word width
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
split_boundary = b
break
if split_boundary is None:
new_words.append(w)
continue
# Compute approximate split position in the text.
left_width = split_boundary - w_left
split_ratio = left_width / w_width
approx_pos = len(text) * split_ratio
# Strategy 1: look for a case transition (lowercase->uppercase) near
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
split_char = None
search_lo = max(1, int(approx_pos) - 3)
search_hi = min(len(text), int(approx_pos) + 2)
for i in range(search_lo, search_hi):
if text[i - 1].islower() and text[i].isupper():
split_char = i
break
# Strategy 2: if no case transition, only split if the whole word
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
# Real words like "oder", "Kabel", "Zeitung" must not be split.
if split_char is None:
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
if _is_recognized_word(clean):
new_words.append(w)
continue
# Not a real word — use floor of proportional position
split_char = max(1, min(len(text) - 1, int(approx_pos)))
left_text = text[:split_char].rstrip()
right_text = text[split_char:].lstrip()
if len(left_text) < 2 or len(right_text) < 2:
new_words.append(w)
continue
right_width = w_width - round(left_width)
new_words.append({
**w,
"text": left_text,
"width": round(left_width),
})
new_words.append({
**w,
"text": right_text,
"left": round(split_boundary),
"width": right_width,
})
split_count += 1
logger.info(
"split cross-column word %r -> %r + %r at boundary %.0f",
text, left_text, right_text, split_boundary,
)
if split_count:
logger.info("split %d cross-column word(s)", split_count)
return new_words
def _cluster_columns_by_alignment(
    words: List[Dict],
    zone_w: int,
    rows: List[Dict],
) -> List[Dict[str, Any]]:
    """Infer text columns from left edges that line up across rows.

    Pipeline:
      1. Assign every word to the row whose ``y_center`` is closest.
      2. Inside each row, mark "group starts": the leftmost word plus any
         word separated from its predecessor by at least the gap threshold.
      3. Chain group-start left edges that lie within a small X tolerance
         into clusters.
      4. Keep clusters with enough row coverage (primary / secondary) and
         isolated clusters near the content margins (tertiary).
      5. Merge clusters that sit close together and turn the survivors
         into column boundaries.

    Only group starts are clustered, so words in the middle of a phrase do
    not create columns of their own.

    Args:
        words: Word dicts providing ``left``, ``top``, ``width``, ``height``.
        zone_w: Zone width; scales the clustering tolerance, the merge
            distance, the boundary margin and the gap-threshold cap.
        rows: Row dicts providing ``index`` and ``y_center``.

    Returns:
        Column dicts (``index``, ``type``, ``x_min``, ``x_max``) ordered
        left to right, or ``[]`` when *words* or *rows* is empty.  A lone
        column is typed ``column_text``; several are ``column_1``, ...
    """
    if not words or not rows:
        return []
    total_rows = len(rows)

    def _single_column() -> List[Dict[str, Any]]:
        # Fallback result: one column spanning every word box.
        lo = min(w["left"] for w in words)
        hi = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": lo, "x_max": hi}]

    def _summarize(edges: List, row_ids: set) -> Dict[str, Any]:
        # Statistics for one run of neighbouring group-start edges.
        return {
            "mean_x": int(sum(edges) / len(edges)),
            "min_edge": min(edges),
            "max_edge": max(edges),
            "count": len(edges),
            "distinct_rows": len(row_ids),
            "row_coverage": len(row_ids) / total_rows,
        }

    # --- 1. Bucket words by nearest row; order each row left to right ---
    by_row: Dict[int, List[Dict]] = {}
    for word in words:
        mid_y = word["top"] + word["height"] / 2
        nearest = min(rows, key=lambda r: abs(r["y_center"] - mid_y))
        by_row.setdefault(nearest["index"], []).append(word)
    lines = [
        (row_idx, sorted(members, key=lambda w: w["left"]))
        for row_idx, members in by_row.items()
    ]

    # --- 2. Adaptive gap threshold from the positive inter-word gaps ---
    positive_gaps: List[float] = []
    for _, line in lines:
        for prev_w, next_w in zip(line, line[1:]):
            gap = next_w["left"] - (prev_w["left"] + prev_w["width"])
            if gap > 0:
                positive_gaps.append(gap)

    if not positive_gaps:
        gap_threshold = 50
    else:
        sorted_gaps = sorted(positive_gaps)
        median_gap = sorted_gaps[len(sorted_gaps) // 2]
        heights = [w["height"] for w in words if w.get("height", 0) > 0]
        median_h = sorted(heights)[len(heights) // 2] if heights else 25
        if len(words) <= 60:
            # Small zones (boxes, sub-zones): any gap wider than roughly
            # one median word height is treated as a column separator.
            gap_threshold = max(median_h * 1.0, 25)
            logger.info(
                "alignment columns (small zone): gap_threshold=%.0f "
                "(median_h=%.0f, %d words, %d gaps: %s)",
                gap_threshold, median_h, len(words), len(sorted_gaps),
                [int(g) for g in sorted_gaps[:10]],
            )
        else:
            # Large zones (full pages): scale with typical gap and height,
            # then cap at a quarter of the zone width.
            gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
            max_gap = zone_w * 0.25
            if gap_threshold > max_gap > 30:
                logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w)
                gap_threshold = max_gap

    # --- 3. Group starts: first word of a row, or word after a wide gap ---
    start_positions: List[tuple] = []  # (left_edge, row_index)
    for row_idx, line in lines:
        start_positions.append((line[0]["left"], row_idx))
        for prev_w, cur_w in zip(line, line[1:]):
            gap = cur_w["left"] - (prev_w["left"] + prev_w["width"])
            if gap >= gap_threshold:
                start_positions.append((cur_w["left"], row_idx))
    start_positions.sort(key=lambda pos: pos[0])
    logger.info(
        "alignment columns: %d group-start positions from %d words "
        "(gap_threshold=%.0f, %d rows)",
        len(start_positions), len(words), gap_threshold, total_rows,
    )
    if not start_positions:
        return _single_column()

    # --- 4. Chain neighbouring left edges into clusters ---
    tolerance = max(10, int(zone_w * 0.01))
    clusters: List[Dict[str, Any]] = []
    first_left, first_row = start_positions[0]
    cur_edges = [first_left]
    cur_rows = {first_row}
    for left, row_idx in start_positions[1:]:
        if left - cur_edges[-1] > tolerance:
            clusters.append(_summarize(cur_edges, cur_rows))
            cur_edges, cur_rows = [], set()
        cur_edges.append(left)
        cur_rows.add(row_idx)
    clusters.append(_summarize(cur_edges, cur_rows))

    # --- 5. Classify clusters by row coverage ---
    # Thresholds are meant to reject random gaps in flowing text while
    # still accepting the columns of tabular worksheets.
    MIN_COVERAGE_PRIMARY = 0.35
    MIN_COVERAGE_SECONDARY = 0.12
    MIN_WORDS_SECONDARY = 4
    MIN_DISTINCT_ROWS = 3
    MIN_COVERAGE_TERTIARY = 0.02  # at least 1 row effectively

    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    content_span = content_x_max - content_x_min

    primary: List[Dict[str, Any]] = []
    secondary: List[Dict[str, Any]] = []
    leftover: List[Dict[str, Any]] = []
    for c in clusters:
        enough_rows = c["distinct_rows"] >= MIN_DISTINCT_ROWS
        if enough_rows and c["row_coverage"] >= MIN_COVERAGE_PRIMARY:
            primary.append(c)
        elif (
            enough_rows
            and c["row_coverage"] >= MIN_COVERAGE_SECONDARY
            and c["count"] >= MIN_WORDS_SECONDARY
        ):
            secondary.append(c)
        else:
            leftover.append(c)

    # Tertiary: sparse clusters within 15% of the left/right content edge
    # that keep a clear distance from every primary/secondary cluster.
    sig_xs = [c["mean_x"] for c in primary + secondary]
    tertiary: List[Dict[str, Any]] = []
    for c in leftover:
        if c["distinct_rows"] < 1 or c["row_coverage"] < MIN_COVERAGE_TERTIARY:
            continue
        rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
        if 0.15 <= rel_pos <= 0.85:
            continue
        if sig_xs:
            nearest_sig = min(abs(c["mean_x"] - sx) for sx in sig_xs)
            if nearest_sig < max(30, content_span * 0.02):
                continue
        tertiary.append(c)
    for c in tertiary:
        logger.info(
            " tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
            c["mean_x"], c["min_edge"], c["max_edge"],
            c["count"], c["distinct_rows"], c["row_coverage"] * 100,
        )

    significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])
    for c in significant:
        logger.info(
            " significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
            c["mean_x"], c["min_edge"], c["max_edge"],
            c["count"], c["distinct_rows"], c["row_coverage"] * 100,
        )
    logger.info(
        "alignment columns: %d clusters, %d primary, %d secondary -> %d significant",
        len(clusters), len(primary), len(secondary), len(significant),
    )
    if not significant:
        return _single_column()

    # --- 6. Merge clusters whose mean positions are close together ---
    merge_distance = max(25, int(zone_w * 0.03))
    merged = [significant[0].copy()]
    for cand in significant[1:]:
        tail = merged[-1]
        if cand["mean_x"] - tail["mean_x"] >= merge_distance:
            merged.append(cand.copy())
            continue
        combined = tail["count"] + cand["count"]
        # Count-weighted mean position (integer division).
        tail["mean_x"] = (
            tail["mean_x"] * tail["count"] + cand["mean_x"] * cand["count"]
        ) // combined
        tail["count"] = combined
        tail["min_edge"] = min(tail["min_edge"], cand["min_edge"])
        tail["max_edge"] = max(tail["max_edge"], cand["max_edge"])
        tail["distinct_rows"] = max(tail["distinct_rows"], cand["distinct_rows"])
    logger.info(
        "alignment columns: %d after merge (distance=%d)",
        len(merged), merge_distance,
    )

    # --- 7. Column i ends just before the left edge of column i + 1 ---
    margin = max(5, int(zone_w * 0.005))
    several = len(merged) > 1
    columns: List[Dict[str, Any]] = []
    for i, cluster in enumerate(merged):
        if i + 1 < len(merged):
            right_bound = merged[i + 1]["min_edge"] - margin
        else:
            right_bound = content_x_max
        columns.append({
            "index": i,
            "type": f"column_{i + 1}" if several else "column_text",
            "x_min": max(content_x_min, cluster["min_edge"] - margin),
            "x_max": right_bound,
        })
    return columns
_MARKER_CHARS = set("*-+#>")


def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
) -> List[Dict]:
    """Fold narrow bullet/numbering columns into the column to their right.

    A column is treated as an inline-marker column when it is narrower than
    80, contains at least one word, its words average at most 2 characters,
    it has a right neighbour, and fewer than half of its words start with a
    letter.  Such a column is dropped and its neighbour's ``x_min`` is
    extended to cover it.  Narrow columns made of short alphabetic words
    ("to", "in", "der", ...) are kept as real content columns.

    Args:
        columns: Column dicts with ``x_min`` / ``x_max``, ordered left to
            right.
        words: Word dicts with ``left``, ``width`` and optionally ``text``
            (a missing or ``None`` text counts as empty).

    Returns:
        The resulting columns with ``index`` and ``type`` renumbered.  With
        fewer than two input columns the input list is returned unchanged.
        Columns that are kept are the caller's dict objects, so their
        ``index`` / ``type`` are updated in place; an absorbing neighbour
        is a copy.
    """
    if len(columns) < 2:
        return columns

    merged: List[Dict] = []
    skip: set = set()
    for i, col in enumerate(columns):
        if i in skip:
            continue

        # Words whose horizontal centre lies inside this column.
        col_words = [
            w for w in words
            if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
        ]
        col_width = col["x_max"] - col["x_min"]

        # Narrow column with mostly short words -> MIGHT be inline markers.
        if col_words and col_width < 80:
            # None-safe: OCR words may carry ``"text": None``.
            avg_len = sum(len(w.get("text") or "") for w in col_words) / len(col_words)
            if avg_len <= 2 and i + 1 < len(columns):
                # Distinguish real markers (symbols/numbers) from short
                # alphabetic words.
                texts = [(w.get("text") or "").strip() for w in col_words]
                alpha_count = sum(
                    1 for t in texts
                    if t and t[0].isalpha() and t not in _MARKER_CHARS
                )
                alpha_ratio = alpha_count / len(texts) if texts else 0
                if alpha_ratio >= 0.5:
                    # At least half the words are alphabetic: real column.
                    logger.info(
                        " kept narrow column %d (w=%d, avg_len=%.1f, "
                        "alpha=%.0f%%) -- contains real words",
                        i, col_width, avg_len, alpha_ratio * 100,
                    )
                else:
                    # Absorb this column into its right neighbour.
                    next_col = columns[i + 1].copy()
                    next_col["x_min"] = col["x_min"]
                    merged.append(next_col)
                    skip.add(i + 1)
                    logger.info(
                        " merged inline marker column %d (w=%d, avg_len=%.1f) "
                        "into column %d",
                        i, col_width, avg_len, i + 1,
                    )
                    continue
        merged.append(col)

    # Renumber after merging.
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
    return merged
@@ -0,0 +1,402 @@
"""
Grid Editor -- word/zone filtering, border ghosts, decorative margins, footers.
Split from grid_editor_helpers.py for maintainability.
All functions are pure computation -- no HTTP, DB, or session side effects.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
    """Drop a sparse, decorative word cluster hugging one page edge.

    Words are ordered by ``left``.  Scanning inward from the left, the
    first horizontal gap wider than 30 between the running maximum right
    edge and the next word's left edge delimits the left candidate; the
    right candidate is found by the mirrored scan.  A candidate is removed
    only when it holds fewer than 20 % of all words AND at least 45 % of
    its words are at most 2 characters long (alphabet letters, stray
    marks), so multi-character content is not discarded.

    The left candidate takes precedence: the right candidate is evaluated
    only when there is no left candidate below the 20 % limit.

    Intended to run before column detection so that only content words
    are seen there.

    Returns:
        ``(remaining_words, removed_count)``.  The input list itself is
        returned with count 0 when nothing is removed or when there are
        fewer than 10 words.
    """
    if len(words) < 10:
        return words, 0

    def _left(w: Dict) -> Any:
        return w.get("left", 0)

    def _right(w: Dict) -> Any:
        return w.get("left", 0) + w.get("width", 0)

    ordered = sorted(words, key=_left)
    total = len(ordered)

    # Left-edge scan: track the furthest right edge seen so far.
    left_count = 0
    frontier = 0
    for pos in range(1, total):
        frontier = max(frontier, _right(ordered[pos - 1]))
        if _left(ordered[pos]) - frontier > 30:
            left_count = pos
            break

    # Right-edge scan: track the smallest left edge seen so far.
    right_count = 0
    nearest_left = _left(ordered[-1])
    for pos in range(total - 1, 0, -1):
        nearest_left = min(nearest_left, _left(ordered[pos]))
        if nearest_left - _right(ordered[pos - 1]) > 30:
            right_count = total - pos
            break

    # Pick the single candidate to validate (left wins over right).
    if left_count > 0 and left_count / total < 0.20:
        candidates = ordered[:left_count]
    elif right_count > 0 and right_count / total < 0.20:
        candidates = ordered[total - right_count:]
    else:
        return words, 0

    # Real border decorations consist mostly of very short words.
    short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
    if short / len(candidates) < 0.45:
        return words, 0

    strip_ids = {id(w) for w in candidates}
    survivors = [w for w in words if id(w) not in strip_ids]
    return survivors, len(strip_ids)
# Single characters that OCR tends to hallucinate from box border lines.
# "!" (red markers) and ". , ;" (real punctuation) are deliberately absent.
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+")


def _filter_border_ghosts(
    words: List[Dict],
    boxes: List,
) -> tuple:
    """Discard single-character OCR artefacts that sit on box borders.

    For every box with a non-zero border thickness, a tolerance band of
    ``max(2 * thickness, 10) + 6`` is laid around each of its four edge
    coordinates.  A word is dropped when its stripped text is exactly one
    character from ``_GRID_GHOST_CHARS`` and its left or right edge falls
    into a vertical-edge band, or its top or bottom edge falls into a
    horizontal-edge band.  Bands are coordinate ranges only; they are not
    limited to the extent of the box side they come from.

    Args:
        words: Word dicts with ``left``, ``top``, ``width``, ``height``
            and ``text``.
        boxes: Box objects (attributes ``x``, ``y``, ``width``, ``height``,
            ``border_thickness``) or dicts (keys ``x``, ``y``, ``w`` or
            ``width``, ``h`` or ``height``, ``border_thickness``; the
            thickness defaults to 3 for dicts).

    Returns:
        ``(filtered_words, removed_count)``; the input list itself with
        count 0 when *boxes* or *words* is empty.
    """
    if not boxes or not words:
        return words, 0

    x_bands: List[tuple] = []
    y_bands: List[tuple] = []
    for box in boxes:
        thickness = (
            box.border_thickness
            if hasattr(box, "border_thickness")
            else box.get("border_thickness", 3)
        )
        if thickness == 0:
            # Borderless boxes (images/graphics) have no line to misread.
            continue
        box_x = box.x if hasattr(box, "x") else box.get("x", 0)
        box_y = box.y if hasattr(box, "y") else box.get("y", 0)
        box_w = box.width if hasattr(box, "width") else box.get("w", box.get("width", 0))
        box_h = box.height if hasattr(box, "height") else box.get("h", box.get("height", 0))
        pad = max(thickness * 2, 10) + 6
        for origin, extent, bands in ((box_x, box_w, x_bands), (box_y, box_h, y_bands)):
            bands.append((origin - pad, origin + pad))
            bands.append((origin + extent - pad, origin + extent + pad))

    def _hits(first: Any, second: Any, bands: List[tuple]) -> bool:
        # True when either coordinate lies inside any band.
        return any(lo <= first <= hi or lo <= second <= hi for lo, hi in bands)

    kept: List[Dict] = []
    for word in words:
        text = (word.get("text") or "").strip()
        if text:
            # Test the word's edges (not just its centre) against the bands.
            x0 = word["left"]
            x1 = word["left"] + word["width"]
            y0 = word["top"]
            y1 = word["top"] + word["height"]
            on_border = _hits(x0, x1, x_bands) or _hits(y0, y1, y_bands)
            if on_border and len(text) == 1 and text in _GRID_GHOST_CHARS:
                continue
        kept.append(word)
    return kept, len(words) - len(kept)
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Collect the word boxes of all cells into one flat list of word dicts.

    Cells without ``word_boxes`` (missing, ``None`` or empty) contribute
    nothing.  Word boxes whose text is missing, ``None`` or whitespace-only
    are skipped.

    Args:
        cells: Cell dicts, each optionally holding a ``word_boxes`` list.

    Returns:
        New dicts with ``text``, ``left``, ``top``, ``width``, ``height``
        and ``conf`` (0 when the box has no ``conf``), in cell order.
    """
    words: List[Dict] = []
    for cell in cells:
        for wb in cell.get("word_boxes") or []:
            # ``or ""`` also covers an explicit ``"text": None``, which
            # ``.get("text", "")`` would pass straight to ``.strip()``.
            if not (wb.get("text") or "").strip():
                continue
            words.append({
                "text": wb["text"],
                "left": wb["left"],
                "top": wb["top"],
                "width": wb["width"],
                "height": wb["height"],
                "conf": wb.get("conf", 0),
            })
    return words
def _words_in_zone(
    words: List[Dict],
    zone_y: int,
    zone_h: int,
    zone_x: int,
    zone_w: int,
) -> List[Dict]:
    """Return the words whose centre point lies inside the zone rectangle.

    Both the X-centre and the Y-centre of a word must fall within the
    zone; all four zone edges are inclusive.  Input order is preserved.
    """
    bottom = zone_y + zone_h
    right = zone_x + zone_w

    def _inside(word: Dict) -> bool:
        center_y = word["top"] + word["height"] / 2
        center_x = word["left"] + word["width"] / 2
        return zone_y <= center_y <= bottom and zone_x <= center_x <= right

    return [word for word in words if _inside(word)]
def _get_content_bounds(words: List[Dict]) -> tuple:
    """Return the bounding box ``(x, y, width, height)`` enclosing all words.

    An empty word list yields ``(0, 0, 0, 0)``.
    """
    if not words:
        return 0, 0, 0, 0
    left_edge = min(w["left"] for w in words)
    top_edge = min(w["top"] for w in words)
    right_edge = max(w["left"] + w["width"] for w in words)
    bottom_edge = max(w["top"] + w["height"] for w in words)
    return left_edge, top_edge, right_edge - left_edge, bottom_edge - top_edge
def _filter_decorative_margin(
    words: List[Dict],
    img_w: int,
    log: Any,
    session_id: str,
) -> Dict[str, Any]:
    """Remove a vertical strip of very short words from a page margin.

    Targets decorative sidebars (e.g. an A-Z alphabet strip) that OCR
    reads as many isolated one- or two-character words.

    Phase 1 -- detect.  Per side (left first, then right), the candidates
    are words of at most 2 characters whose X-centre lies in the outer
    30 % of the page width.  A strip is confirmed when there are at least
    6 candidates, they occupy at least 6 distinct 20-unit Y buckets, and
    their left edges spread over no more than 80.

    Phase 2 -- remove.  Every word of at most 3 characters in the same
    margin whose left edge lies within ``[x_min - 20, x_max + 60]`` of the
    strip is removed, which also catches slightly longer fragments next
    to the letters.

    Only the first confirmed side is processed.  Modifies *words* in place.

    Args:
        words: Word dicts with ``left``, ``top`` and optionally ``width``,
            ``height``, ``text``.
        img_w: Page width; nothing happens when it is not positive.
        log: Logger-like object; ``info`` is called on removal.
        session_id: Identifier included in the log message.

    Returns:
        Dict with ``found`` (bool), ``side`` (``"left"``, ``"right"`` or
        ``""``) and ``letters_detected`` (number of phase-1 candidates).
    """
    no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
    if not words or img_w <= 0:
        return no_strip

    margin_cutoff = img_w * 0.30

    def _text_len(w: Dict) -> int:
        return len((w.get("text") or "").strip())

    def _in_margin(w: Dict, which: str) -> bool:
        center_x = w["left"] + w.get("width", 0) / 2
        if which == "left":
            return center_x < margin_cutoff
        return center_x > img_w - margin_cutoff

    for side in ("left", "right"):
        # Phase 1: short words (<= 2 chars) in this margin.
        strip = [w for w in words if _text_len(w) <= 2 and _in_margin(w, side)]
        if len(strip) < 6:
            continue

        # The strip must be spread vertically over many distinct buckets.
        y_buckets = {
            int(w["top"] + w.get("height", 0) / 2) // 20 * 20
            for w in strip
        }
        if len(y_buckets) < 6:
            continue

        # ... and be horizontally compact.
        lefts = [w["left"] for w in strip]
        x_min = min(lefts)
        x_max = max(lefts)
        if x_max - x_min > 80:
            continue

        # Phase 2: widen the x-range a little and sweep up short neighbours.
        strip_x_lo = x_min - 20
        strip_x_hi = x_max + 60  # word width + tolerance
        doomed = {
            id(w) for w in words
            if _text_len(w) <= 3
            and strip_x_lo <= w["left"] <= strip_x_hi
            and _in_margin(w, side)
        }
        before = len(words)
        words[:] = [w for w in words if id(w) not in doomed]
        removed = before - len(words)
        if removed:
            log.info(
                "build-grid session %s: removed %d decorative %s-margin words "
                "(strip x=%d-%d)",
                session_id, removed, side, strip_x_lo, strip_x_hi,
            )
            return {"found": True, "side": side, "letters_detected": len(strip)}
    return no_strip
def _filter_footer_words(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
) -> Optional[Dict]:
    """Strip a short footer (typically a page number) from the page bottom.

    Footer words are those whose Y-centre lies below 95 % of the page
    height.  They are removed only when there are at most 3 of them and
    their stripped texts, concatenated without separator, are at most 10
    characters long.  Modifies *words* in place.

    Args:
        words: Word dicts with ``top`` and optionally ``height``, ``text``.
        img_h: Page height; nothing happens when it is not positive.
        log: Logger-like object; ``info`` is called on removal.
        session_id: Identifier included in the log message.

    Returns:
        ``None`` when nothing was removed; otherwise a dict with ``text``
        (the concatenated footer text), ``y_pct`` (top of the first footer
        word as a percentage of the page height, one decimal) and, if the
        text contains digits, ``number`` (those digits parsed as an int).
    """
    if not words or img_h <= 0:
        return None

    cutoff = img_h * 0.95
    in_footer = [
        w for w in words
        if w["top"] + w.get("height", 0) / 2 > cutoff
    ]
    if not in_footer:
        return None

    joined = "".join((w.get("text") or "").strip() for w in in_footer)
    # A busy or wordy footer is not treated as a page number.
    if len(in_footer) > 3 or len(joined) > 10:
        return None

    # Capture the metadata before the words disappear.
    page_number_info = {
        "text": joined.strip(),
        "y_pct": round(in_footer[0]["top"] / img_h * 100, 1),
    }
    digits = "".join(filter(str.isdigit, joined))
    if digits:
        page_number_info["number"] = int(digits)

    doomed = {id(w) for w in in_footer}
    words[:] = [w for w in words if id(w) not in doomed]
    log.info(
        "build-grid session %s: extracted page number '%s' and removed %d footer words",
        session_id, joined, len(in_footer),
    )
    return page_number_info
def _filter_header_junk(
    words: List[Dict],
    img_h: int,
    log: Any,
    session_id: str,
) -> None:
    """Remove short low-confidence words above the first real text row.

    Aimed at OCR noise produced by illustrations at the top of a page.

    Steps:
      1. Walk the words top to bottom in bands of 2 % of the page height
         (anchored at the first word of each band).  The first band with
         at least 3 words of confidence >= 80 and more than one character
         marks the content start (the band's anchor ``top``).
      2. Every word that ends above that line, has confidence < 75 and at
         most 3 characters is removed.

    Nothing is removed when no such band exists or the content start is
    at ``y <= 0``.  Modifies *words* in place.

    Args:
        words: Word dicts with ``top`` and optionally ``height``,
            ``conf``, ``text``.
        img_h: Page height; nothing happens when it is not positive.
        log: Logger-like object; ``info`` is called on removal.
        session_id: Identifier included in the log message.
    """
    if not words or img_h <= 0:
        return

    band = img_h * 0.02   # words within 2% of page height = same row
    min_row_words = 3
    min_conf = 80

    by_top = sorted(words, key=lambda w: w["top"])
    count = len(by_top)
    content_start_y = 0
    cursor = 0
    while cursor < count:
        anchor = by_top[cursor]["top"]
        end = cursor
        strong = 0
        # Consume the band and count confident multi-character words.
        while end < count and by_top[end]["top"] - anchor < band:
            candidate = by_top[end]
            if (
                candidate.get("conf", 0) >= min_conf
                and len((candidate.get("text") or "").strip()) > 1
            ):
                strong += 1
            end += 1
        if strong >= min_row_words:
            content_start_y = anchor
            break
        # Always advance by at least one word.
        cursor = max(end, cursor + 1)

    if content_start_y <= 0:
        return  # no clear content start found

    junk_ids = {
        id(w) for w in words
        if w["top"] + w.get("height", 0) < content_start_y
        and w.get("conf", 0) < 75
        and len((w.get("text") or "").strip()) <= 3
    }
    if not junk_ids:
        return

    before = len(words)
    words[:] = [w for w in words if id(w) not in junk_ids]
    removed = before - len(words)
    if removed:
        log.info(
            "build-grid session %s: removed %d header junk words above y=%d "
            "(content start)",
            session_id, removed, content_start_y,
        )
@@ -0,0 +1,499 @@
"""
Grid Editor -- header/heading detection and colspan (merged cell) detection.
Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects.
Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Dict, List, Optional
from cv_ocr_engines import _text_has_garbled_ipa
logger = logging.getLogger(__name__)
def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
    """Detect heading rows by word colour and height; merge them into one cell.

    Only zones with cells, rows and at least 2 columns are examined.  A row
    that is not yet flagged ``is_header`` becomes a heading when

    1. every word box in the row has a ``color_name`` other than ``black``
       (a missing colour counts as black), and
    2. the mean word height of the row exceeds 1.2x the median word height
       of the whole zone.

    For a heading row with several cells, the cells are replaced in
    ``z["cells"]`` by a single cell of ``col_type`` ``"heading"`` that
    carries all word boxes and the space-joined cell texts.  A heading row
    with a single cell just has that cell's ``col_type`` set to
    ``"heading"``.  In both cases the row gets ``is_header = True``.

    Mutates *zones_data* in place.

    Args:
        zones_data: Zone dicts with ``cells``, ``rows``, ``columns`` and
            optionally ``zone_index``.
        img_w: Image width used for ``bbox_pct`` (0 yields 0 percentages).
        img_h: Image height used for ``bbox_pct`` (0 yields 0 percentages).

    Returns:
        Number of heading rows detected across all zones.
    """
    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if not cells or not rows or len(columns) < 2:
            continue

        # Median word height across the zone (upper median).
        all_heights = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                h = wb.get("height", 0)
                if h > 0:
                    all_heights.append(h)
        if not all_heights:
            continue
        all_heights_sorted = sorted(all_heights)
        median_h = all_heights_sorted[len(all_heights_sorted) // 2]

        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue  # already detected as header
            ri = row["index"]
            row_cells = [c for c in cells if c.get("row_index") == ri]
            row_wbs = [
                wb for cell in row_cells
                for wb in cell.get("word_boxes") or []
            ]
            if not row_wbs:
                continue
            # Condition 1: ALL words are non-black.
            all_colored = all(
                wb.get("color_name", "black") != "black"
                for wb in row_wbs
            )
            if not all_colored:
                continue
            # Condition 2: mean height > 1.2x zone median.
            mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
            if mean_h <= median_h * 1.2:
                continue
            heading_row_indices.append(ri)

        # Merge heading cells into spanning cells.
        for hri in heading_row_indices:
            # ``cells`` still references the pre-merge list; rows handled
            # earlier in this loop are other rows, so the lookup is safe.
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if len(header_cells) <= 1:
                # Single cell -- just mark it as heading.
                if header_cells:
                    header_cells[0]["col_type"] = "heading"
                    heading_count += 1
                    for row in rows:
                        if row["index"] == hri:
                            row["is_header"] = True
                continue

            # Collect word boxes and text from all columns, left to right.
            # ``or`` keeps this None-safe, matching the detection step
            # above (a cell may carry ``word_boxes: None`` / ``text: None``).
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes") or [])
                cell_text = (hc.get("text") or "").strip()
                if cell_text:
                    all_text_parts.append(cell_text)

            # Remove all cells for this row, replace with one spanning cell.
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                # Use the actual starting col_index from the first cell.
                first_col = min(hc["col_index"] for hc in header_cells)
                zone_idx = z.get("zone_index", 0)
                z["cells"].append({
                    "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
                    "zone_index": zone_idx,
                    "row_index": hri,
                    "col_index": first_col,
                    "col_type": "heading",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })

            # Mark row as header.
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
def _detect_heading_rows_by_single_cell(
    zones_data: List[Dict], img_w: int, img_h: int,
) -> int:
    """Detect heading rows that have only a single content cell.

    Black headings like "Theme" have normal color and height, so they are
    missed by ``_detect_heading_rows_by_color``. The distinguishing signal
    is that they occupy only one column while normal vocabulary rows fill
    at least 2-3 columns.

    A row qualifies as a heading if:
    1. It is not already marked as a header, and it is neither the first
       nor the last non-header row of the zone.
    2. It has exactly ONE content cell: a cell whose col_type starts with
       ``column_``, where ``column_1`` cells only count when their text is
       an article word (die/der/das/the/a/...).
    3. That single cell is NOT in the last column (continuation/example
       lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4)
       and lies at most one column right of the lowest col_index.
    4. The text is non-empty and does not start with ``[`` or ``(``.
    5. The text is not flagged by ``_text_has_garbled_ipa`` (unless it
       contains a real IPA symbol), and it is not a lowercase-initial
       word with more than 4 letters.
    6. The zone has >=3 columns and >=5 rows (avoids false positives in
       tiny zones) and at least 3 non-header rows.
    7. At least 40% of the zone's rows have >=2 content cells (ensures
       we are in a multi-column vocab layout).

    If more than 25% of the eligible rows of a zone would qualify, the
    zone is skipped entirely.

    For each heading row, all cells of the row are replaced in
    ``z["cells"]`` by one cell with ``col_type`` ``"heading"`` and the row
    is flagged ``is_header``. Mutates *zones_data* in place.

    Returns the number of heading rows detected across all zones.
    """
    heading_count = 0
    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        if len(columns) < 3 or len(rows) < 5:
            continue
        # Determine the last col_index (example/sentence column)
        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
        if not col_indices:
            continue
        last_col = col_indices[-1]
        # Count content cells per row (column_* but not column_1/page_ref).
        # Exception: column_1 cells that contain a dictionary article word
        # (die/der/das etc.) ARE content -- they appear in dictionary layouts
        # where the leftmost column holds grammatical articles.
        _ARTICLE_WORDS = {
            "die", "der", "das", "dem", "den", "des", "ein", "eine",
            "the", "a", "an",
        }
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            if ct == "column_1":
                ctext = (cell.get("text") or "").strip().lower()
                if ctext not in _ARTICLE_WORDS:
                    continue
            ri = cell.get("row_index", -1)
            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
        # At least 40% of all rows must have >=2 content cells
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
        if multi_col_rows < len(rows) * 0.4:
            continue
        # Exclude first and last non-header rows -- these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            # Same content-cell definition as in the per-row count above.
            content_cells = [
                c for c in row_cells
                if c.get("col_type", "").startswith("column_")
                and (c.get("col_type") != "column_1"
                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
            ]
            if len(content_cells) != 1:
                continue
            cell = content_cells[0]
            # Not in the last column (continuation/example lines)
            if cell.get("col_index") == last_col:
                continue
            text = (cell.get("text") or "").strip()
            if not text or text.startswith("["):
                continue
            # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
            if text.startswith("("):
                continue
            # Single cell NOT in the first content column is likely a
            # continuation/overflow line, not a heading. Real headings
            # ("Theme 1", "Unit 3: ...") appear in the first or second
            # content column.
            # NOTE(review): col_indices[0] is the lowest col_index of ANY
            # cell in the zone, not only of content cells -- confirm that
            # this is the intended reference column.
            first_content_col = col_indices[0] if col_indices else 0
            if cell.get("col_index", 0) > first_content_col + 1:
                continue
            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
            # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
            _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch"). Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)
        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue
        for hri in heading_row_indices:
            # ``cells`` is the pre-merge list; it still holds every cell of
            # rows that have not been merged yet.
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue
            # Collect all word_boxes and text
            # NOTE(review): ``.get("word_boxes", [])`` and
            # ``.get("text", "")`` fail if a cell stores an explicit None;
            # other code in this module uses ``or []`` / ``or ""``.
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            first_col_idx = min(hc["col_index"] for hc in header_cells)
            # Remove old cells for this row, add spanning heading cell
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
            if all_wb:
                # Bounding box of all collected word boxes
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # Fallback to first cell bbox
                bp = header_cells[0].get("bbox_px", {})
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)
            zone_idx = z.get("zone_index", 0)
            z["cells"].append({
                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
                "zone_index": zone_idx,
                "row_index": hri,
                "col_index": first_col_idx,
                "col_type": "heading",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                "is_bold": False,
            })
            # Flag the row so later passes treat it as a header
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1
    return heading_count
def _detect_header_rows(
    rows: List[Dict],
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> List[int]:
    """Return the indices of rows that look like headers (only row 0 can qualify).

    ``rows[0]`` is reported as a header when at least one signal fires:

    * the vertical gap between ``rows[0]`` and ``rows[1]`` exceeds half of the
      average row height of the zone, or
    * the tallest word whose vertical centre lies inside ``rows[0]`` is more
      than 1.3x the (upper) median word height of the zone.

    Spanning-header detection (rows whose words stretch across all columns)
    is intentionally NOT performed: it produced too many false positives on
    vocabulary worksheets, where IPA transcriptions or short entries
    naturally span several columns with few words.

    Args:
        rows: Row dicts providing ``y_min`` and ``y_max``.
        zone_words: Word boxes providing ``top`` and ``height``.
        zone_y: Not used by the current heuristics; kept for call compatibility.
        columns: Not used by the current heuristics; kept for call compatibility.
        skip_first_row_header: When true, both first-row checks are skipped.

    Returns:
        ``[0]`` if the first row qualifies, otherwise ``[]``.  Always ``[]``
        when fewer than two rows are given.
    """
    if len(rows) < 2 or skip_first_row_header:
        return []

    top_row, next_row = rows[0], rows[1]

    # Signal 1: unusually large gap below the first row.
    mean_row_height = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
    wide_gap = (next_row["y_min"] - top_row["y_max"]) > mean_row_height * 0.5

    # Signal 2: first-row text is noticeably taller than typical text
    # (bold / header type).  20 is the reference height when there are no words.
    heights = sorted(w["height"] for w in zone_words)
    reference_h = heights[len(heights) // 2] if heights else 20
    top_row_heights = [
        w["height"] for w in zone_words
        if top_row["y_min"] <= w["top"] + w["height"] / 2 <= top_row["y_max"]
    ]
    tall_text = bool(top_row_heights) and max(top_row_heights) > reference_h * 1.3

    return [0] if (wide_gap or tall_text) else []
def _detect_colspan_cells(
    zone_words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    cells: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict]:
    """Detect and merge cells that span multiple columns (colspan).

    A word-block (e.g. a PaddleOCR phrase) that reaches past a column's
    midpoint into a neighbouring column indicates a merged cell.  For every
    such block, the cell in the block's first covered column is replaced by
    one ``spanning_header`` cell carrying the ORIGINAL block text, and the
    cells in the remaining covered columns of that row are dropped.

    Works for both full-page scans and box zones.

    Args:
        zone_words: Un-split word-blocks with ``left``, ``top``, ``width``,
            ``height`` and optionally ``text``.
        columns: Column dicts with ``index``, ``x_min`` and ``x_max``.
        rows: Row dicts; passed through to ``_assign_word_to_row``.
        cells: Cells already built for the zone (``row_index``,
            ``col_index``, ``cell_id`` ...).
        img_w: Image width in pixels, used for ``bbox_pct`` (0 -> all 0).
        img_h: Image height in pixels, used for ``bbox_pct`` (0 -> all 0).

    Returns:
        ``cells`` itself when nothing has to be merged (fewer than two
        columns, no words, no rows, or no spanning block); otherwise a new
        list in the original cell order with merged cells substituted.
    """
    if len(columns) < 2 or not zone_words or not rows:
        return cells

    # Imported lazily, after the cheap guard above.
    from cv_words_first import _assign_word_to_row

    def _cols_covered(w_left: float, w_right: float) -> List[int]:
        """Return the sorted column indices that a word-block covers."""
        covered = []
        for col in columns:
            col_mid = (col["x_min"] + col["x_max"]) / 2
            # Covered if the block extends across the column's midpoint ...
            if w_left < col_mid < w_right:
                covered.append(col["index"])
            # ... or if the block starts inside the column.
            elif col["x_min"] <= w_left < col["x_max"]:
                covered.append(col["index"])
        return sorted(set(covered))

    # Group the original word-blocks by the row key that
    # _assign_word_to_row returns; it is matched against cell["row_index"].
    blocks_by_row: Dict[int, List[Dict]] = {}
    for word in zone_words:
        blocks_by_row.setdefault(_assign_word_to_row(word, rows), []).append(word)

    # row key -> spanning blocks (each with the columns it covers)
    spans_by_row: Dict[int, List[Dict]] = {}
    for row_key, blocks in blocks_by_row.items():
        spanning = []
        for word in blocks:
            covered = _cols_covered(word["left"], word["left"] + word["width"])
            if len(covered) >= 2:
                spanning.append({"word": word, "cols": covered})
        if spanning:
            spans_by_row[row_key] = spanning

    if not spans_by_row:
        return cells

    merged_cells: List[Dict] = []
    for cell in cells:
        row_idx = cell.get("row_index", -1)
        if row_idx not in spans_by_row:
            merged_cells.append(cell)
            continue

        col_idx = cell.get("col_index", -1)
        # Only the FIRST span containing this column is considered.
        span = next(
            (s for s in spans_by_row[row_idx] if col_idx in s["cols"]),
            None,
        )
        if span is None:
            merged_cells.append(cell)
            continue
        if col_idx != span["cols"][0]:
            # Non-leading column of a span: absorbed by the merged cell.
            continue

        # Use the ORIGINAL word-block text, not the split cell texts, which
        # may contain broken words like "euros a" + "nd cents".
        orig_word = span["word"]
        merged_text = orig_word.get("text", "").strip()
        x_min = orig_word["left"]
        y_min = orig_word["top"]
        x_max = orig_word["left"] + orig_word["width"]
        y_max = orig_word["top"] + orig_word["height"]

        merged_cells.append({
            "cell_id": cell["cell_id"],
            "row_index": row_idx,
            "col_index": span["cols"][0],
            "col_type": "spanning_header",
            "colspan": len(span["cols"]),
            "text": merged_text,
            "confidence": cell.get("confidence", 0),
            "bbox_px": {"x": x_min, "y": y_min,
                        "w": x_max - x_min, "h": y_max - y_min},
            "bbox_pct": {
                "x": round(x_min / img_w * 100, 2) if img_w else 0,
                "y": round(y_min / img_h * 100, 2) if img_h else 0,
                "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
            },
            "word_boxes": [orig_word],
            "ocr_engine": cell.get("ocr_engine", ""),
            "is_bold": cell.get("is_bold", False),
        })
        logger.info(
            "colspan detected: row %d, cols %s -> merged %d cells (%r)",
            row_idx, span["cols"], len(span["cols"]), merged_text[:50],
        )

    return merged_cells
@@ -0,0 +1,58 @@
"""
Grid Editor helper functions — barrel re-export module.
This file re-exports all public symbols from the split sub-modules
so that existing ``from grid_editor_helpers import ...`` statements
continue to work without changes.
Sub-modules:
- columns — column detection, cross-column splitting, marker merging
- filters — word/zone filtering, border ghosts, decorative margins
- headers — header/heading detection, colspan detection
- zones — vertical dividers, zone splitting/merging, zone grid building
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# --- Re-export: columns ---------------------------------------------------
from .columns import ( # noqa: F401
_is_recognized_word,
_split_cross_column_words,
_cluster_columns_by_alignment,
_MARKER_CHARS,
_merge_inline_marker_columns,
)
# --- Re-export: filters ----------------------------------------------------
from .filters import ( # noqa: F401
_filter_border_strip_words,
_GRID_GHOST_CHARS,
_filter_border_ghosts,
_flatten_word_boxes,
_words_in_zone,
_get_content_bounds,
_filter_decorative_margin,
_filter_footer_words,
_filter_header_junk,
)
# --- Re-export: headers ----------------------------------------------------
from .headers import ( # noqa: F401
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
_detect_header_rows,
_detect_colspan_cells,
)
# --- Re-export: zones -------------------------------------------------------
from .zones import ( # noqa: F401
_PIPE_RE_VSPLIT,
_detect_vertical_dividers,
_split_zone_at_vertical_dividers,
_merge_content_zones_across_boxes,
_build_zone_grid,
)
# --- Re-export from cv_words_first (used by cv_box_layout.py) ---------------
from cv_words_first import _cluster_rows # noqa: F401
@@ -0,0 +1,389 @@
"""
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
Split from grid_editor_helpers.py for maintainability.
All functions are pure computation — no HTTP, DB, or session side effects.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import PageZone
from cv_words_first import _cluster_rows, _build_cells
from .columns import (
_cluster_columns_by_alignment,
_merge_inline_marker_columns,
_split_cross_column_words,
)
from .headers import (
_detect_header_rows,
_detect_colspan_cells,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Vertical divider detection and zone splitting
# ---------------------------------------------------------------------------
# A word box counts as a "pipe" when its stripped text is one or more "|".
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")


def _detect_vertical_dividers(
    words: List[Dict],
    zone_x: int,
    zone_w: int,
    zone_y: int,
    zone_h: int,
) -> List[float]:
    """Find x-positions of vertical divider lines drawn as stacked "|" boxes.

    Pipe-only word boxes are clustered by the x-coordinate of their centre.
    A cluster becomes a divider when it

    * contains at least 5 pipes,
    * has its mean x between 15% and 85% of the zone width, and
    * the pipes within ``tolerance`` of that mean span at least half of the
      zone height (top of the highest pipe to bottom of the lowest).

    Args:
        words: Word boxes with ``left``, ``top``, ``width``, ``height`` and
            an optional ``text``.
        zone_x: Left edge of the zone in pixels.
        zone_w: Zone width in pixels; also sets the clustering tolerance
            (2% of the width, but at least 15 px).
        zone_y: Not used by the current implementation.
        zone_h: Zone height in pixels.

    Returns:
        Divider x-positions in ascending order; empty when there are no
        words, the zone has no area, or no cluster qualifies.
    """
    if not words or zone_w <= 0 or zone_h <= 0:
        return []

    pipe_boxes = [
        wb for wb in words
        if _PIPE_RE_VSPLIT.match((wb.get("text") or "").strip())
    ]
    if len(pipe_boxes) < 5:
        return []

    def _center_x(wb: Dict) -> float:
        return wb["left"] + wb["width"] / 2

    tolerance = max(15, int(zone_w * 0.02))

    # Chain-cluster the sorted centres: a centre joins the current group as
    # long as it is within `tolerance` of the group's last member.
    groups: List[List[float]] = []
    for cx in sorted(_center_x(wb) for wb in pipe_boxes):
        if groups and cx - groups[-1][-1] <= tolerance:
            groups[-1].append(cx)
        else:
            groups.append([cx])

    found: List[float] = []
    for group in groups:
        if len(group) < 5:
            continue

        mean_x = sum(group) / len(group)

        # Ignore lines hugging the zone border (outside 15%..85%).
        rel_pos = (mean_x - zone_x) / zone_w
        if rel_pos < 0.15 or rel_pos > 0.85:
            continue

        # Vertical coverage is measured on the pipes close to the mean.
        y_edges: List[float] = []
        for wb in pipe_boxes:
            if abs(_center_x(wb) - mean_x) <= tolerance:
                y_edges.append(wb["top"])
                y_edges.append(wb["top"] + wb["height"])
        y_span = max(y_edges) - min(y_edges) if y_edges else 0
        if y_span < zone_h * 0.5:
            continue

        found.append(mean_x)

    return sorted(found)
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Cut a PageZone into side-by-side sub-zones at the given x-positions.

    The zone's left edge, the dividers (in the order given) and the zone's
    right edge form the boundaries; each adjacent pair becomes one sub-zone
    with the parent's ``zone_type``, ``y``, ``height``, ``box`` and
    ``image_overlays``.  Boundaries are truncated to ``int``.

    Args:
        zone: Zone to split.
        divider_xs: Divider x-positions; expected in ascending order, since
            they are used as given.
        vsplit_group_id: Stored as ``vsplit_group`` on every sub-zone.

    Returns:
        The sub-zones from left to right.  ``layout_hint`` is
        ``"left_of_vsplit"`` for the first, ``"right_of_vsplit"`` for the
        last and ``"middle_of_vsplit"`` for any in between.  All have
        ``index=0``; the caller is expected to re-index.
    """
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    last_pos = len(edges) - 2

    def _hint(pos: int) -> str:
        # The first-position check wins when there is only one sub-zone.
        if pos == 0:
            return "left_of_vsplit"
        if pos == last_pos:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    return [
        PageZone(
            index=0,  # re-indexed later
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=int(left_edge),
            width=int(right_edge) - int(left_edge),
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=_hint(pos),
            vsplit_group=vsplit_group_id,
        )
        for pos, (left_edge, right_edge) in enumerate(zip(edges, edges[1:]))
    ]
def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
    content_w: int,
) -> List:
    """Fuse content zones that are only interrupted by box zones.

    Pattern: ``[content, box, content, (box, content)...]`` becomes one
    content zone covering all the content zones vertically, positioned at
    ``content_x`` / ``content_w``.  The swallowed box zones are recorded in
    the merged zone's ``image_overlays``.  A box is only absorbed when it is
    directly followed by another content zone; every other zone is passed
    through unchanged.

    Args:
        zones: Zones in page order, each exposing ``zone_type``, ``x``,
            ``y``, ``width``, ``height`` and (for boxes) ``box``.
        content_x: Left edge given to every merged zone.
        content_w: Width given to every merged zone.

    Returns:
        ``zones`` itself (untouched) when it has fewer than three entries.
        Otherwise a new list whose members have had ``index`` rewritten to
        their position in that list — this also mutates passed-through
        input zones.
    """
    if len(zones) < 3:
        return zones

    def _overlay_for(box_zone) -> Dict[str, Any]:
        """Describe an absorbed box zone as an overlay dict."""
        overlay = {
            "y": box_zone.y,
            "height": box_zone.height,
            "x": box_zone.x,
            "width": box_zone.width,
        }
        if box_zone.box:
            overlay["box"] = {
                "x": box_zone.box.x,
                "y": box_zone.box.y,
                "width": box_zone.box.width,
                "height": box_zone.box.height,
                "confidence": box_zone.box.confidence,
                "border_thickness": box_zone.box.border_thickness,
            }
        return overlay

    total = len(zones)
    merged_zones: List = []
    pos = 0
    while pos < total:
        current = zones[pos]
        if current.zone_type != "content":
            merged_zones.append(current)
            pos += 1
            continue

        # Greedily absorb (box, content) pairs that follow this content zone.
        contents = [current]
        boxes = []
        nxt = pos + 1
        while (nxt + 1 < total
               and zones[nxt].zone_type == "box"
               and zones[nxt + 1].zone_type == "content"):
            boxes.append(zones[nxt])
            contents.append(zones[nxt + 1])
            nxt += 2

        if not boxes:
            # Nothing to merge with: emit the content zone as it is.
            merged_zones.append(current)
            pos += 1
            continue

        top = min(c.y for c in contents)
        bottom = max(c.y + c.height for c in contents)
        merged_zones.append(PageZone(
            index=0,  # re-indexed below
            zone_type="content",
            y=top,
            height=bottom - top,
            x=content_x,
            width=content_w,
            image_overlays=[_overlay_for(bz) for bz in boxes],
        ))
        pos = nxt

    for new_index, zone in enumerate(merged_zones):
        zone.index = new_index

    logger.info(
        "zone-merge: %d zones -> %d zones after merging across boxes",
        len(zones), len(merged_zones),
    )
    return merged_zones
def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> Dict[str, Any]:
    """Build columns, rows and cells for a single zone from its words.

    Pipeline: cluster rows -> pick/detect columns -> split words straddling
    column boundaries -> build cells -> merge colspan cells -> detect header
    rows and collapse each into one spanning cell -> prefix all cell ids
    with the zone index -> convert columns/rows to the output format.

    Args:
        zone_words: Word boxes of the zone (``left``, ``top``, ``width``,
            ``height``, ``text``).
        zone_x: Zone left edge (not used directly here).
        zone_y: Zone top edge; forwarded to ``_detect_header_rows``.
        zone_w: Zone width; forwarded to column clustering.
        zone_h: Zone height (not used directly here).
        zone_index: Index of the zone; written to every cell as
            ``zone_index`` and as ``Z{zone_index}_`` prefix of ``cell_id``.
        img_w: Image width in pixels for percentage values (0 -> 0).
        img_h: Image height in pixels for percentage values (0 -> 0).
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone. Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure. Box zones always detect columns independently.
        skip_first_row_header: Forwarded to ``_detect_header_rows``.

    Returns:
        Dict with ``columns``, ``rows``, ``cells`` and ``header_rows``.  When
        a grid could be built it additionally contains ``_raw_columns`` (the
        internal column dicts, for propagation to other zones).  With no
        words, columns or rows, the four lists are empty.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)

    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w["height"] for w in zone_words if w.get("height", 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        # Logged for diagnostics only; not used for the clustering itself here.
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"])):
            logger.info(
                "  zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w["top"], w["left"], w["height"], w["width"],
                w.get("text", "")[:40],
            )
        for r in rows:
            logger.info(
                "  zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r["index"], r["y_min"], r["y_max"], r["y_center"],
            )

    # Use global columns if provided, otherwise detect per zone
    columns = (
        global_columns
        if global_columns
        else _cluster_columns_by_alignment(zone_words, zone_w, rows)
    )

    # Merge inline marker columns (bullets, numbering) into adjacent text
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)

    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2).  Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)

    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)

    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns.  _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)

    # Detect header rows (columns are passed for API compatibility)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue

            # Collect all word_boxes and text from all columns
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())

            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # No word boxes available: fall back to the first cell's
                # bbox so the row is replaced rather than silently dropped.
                bp = header_cells[0].get("bbox_px", {})
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)

            # Remove all header cells, replace with one spanning cell
            cells = [c for c in cells if c["row_index"] != hri]
            cells.append({
                "cell_id": f"R{hri:02d}_C0",
                "row_index": hri,
                "col_index": 0,
                "col_type": "spanning_header",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                "is_bold": True,
            })

    # Prefix cell IDs with zone index.  This runs AFTER the header merge so
    # that the freshly created spanning-header cells also receive the
    # "Z{zone}_" prefix and a zone_index, like every other cell of the zone.
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index

    # Convert columns to output format with percentages
    out_columns = []
    for col in columns:
        x_min = col["x_min"]
        x_max = col["x_max"]
        out_columns.append({
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(x_min),
            "x_max_px": round(x_max),
            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
            "bold": False,
        })

    # Convert rows to output format with percentages
    out_rows = []
    for row in rows:
        out_rows.append({
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
            "is_header": row["index"] in header_rows,
        })

    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }