fix: global column detection across content zones in Kombi grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 26s

Content zones (above, between, and below boxes) now share the same column
structure: columns are detected once from the words of ALL content zones and
then applied to every content zone. Box zones still detect columns independently.

This fixes the issue where narrow columns (e.g. page references such as "p.55")
were not detected in a small content zone above a box, even though the same
column existed in the larger content zone below that box.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 22:04:17 +01:00
parent 0340204c1f
commit cf995f2d52

View File

@@ -309,8 +309,16 @@ def _build_zone_grid(
zone_index: int,
img_w: int,
img_h: int,
global_columns: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
"""Build columns, rows, cells for a single zone from its words."""
"""Build columns, rows, cells for a single zone from its words.
Args:
global_columns: If provided, use these pre-computed column boundaries
instead of detecting columns per zone. Used for content zones so
that all content zones (above/between/below boxes) share the same
column structure. Box zones always detect columns independently.
"""
if not zone_words:
return {
"columns": [],
@@ -321,8 +329,8 @@ def _build_zone_grid(
# Cluster rows first (needed for column alignment analysis)
rows = _cluster_rows(zone_words)
# Cluster columns by left-edge alignment
columns = _cluster_columns_by_alignment(zone_words, zone_w, rows)
# Use global columns if provided, otherwise detect per zone
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
if not columns or not rows:
return {
@@ -476,13 +484,40 @@ async def build_grid(session_id: str):
content_x, content_y, content_w, content_h, boxes
)
# --- Global column detection across ALL content zones ---
# Content zones share the same table structure (the table
# spans the full page, boxes are overlaid on top). Detect
# columns once from all content-zone words so that narrow
# columns (page refs, markers) visible in only one zone
# are applied consistently everywhere.
all_content_words: List[Dict] = []
for pz in page_zones:
if pz.zone_type == "content":
all_content_words.extend(
_words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width)
)
global_columns = None
if all_content_words:
global_rows = _cluster_rows(all_content_words)
global_columns = _cluster_columns_by_alignment(
all_content_words, content_w, global_rows,
)
logger.info(
"build-grid session %s: global columns from %d content words → %d columns",
session_id, len(all_content_words), len(global_columns),
)
for pz in page_zones:
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
# Content zones use global columns; box zones detect independently
cols_override = global_columns if pz.zone_type == "content" else None
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=cols_override,
)
zone_entry: Dict[str, Any] = {