fix: global column detection across content zones in Kombi grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 26s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 26s
Content zones (above/between/below boxes) now share the same column structure: columns are detected once from ALL content-zone words, then applied to each content zone. Box zones still detect columns independently.

This fixes the issue where narrow columns (page refs like p.55) were not detected in small content zones above boxes, even though the same column existed in the larger content zone below the box.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -309,8 +309,16 @@ def _build_zone_grid(
|
||||
zone_index: int,
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
global_columns: Optional[List[Dict]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build columns, rows, cells for a single zone from its words."""
|
||||
"""Build columns, rows, cells for a single zone from its words.
|
||||
|
||||
Args:
|
||||
global_columns: If provided, use these pre-computed column boundaries
|
||||
instead of detecting columns per zone. Used for content zones so
|
||||
that all content zones (above/between/below boxes) share the same
|
||||
column structure. Box zones always detect columns independently.
|
||||
"""
|
||||
if not zone_words:
|
||||
return {
|
||||
"columns": [],
|
||||
@@ -321,8 +329,8 @@ def _build_zone_grid(
|
||||
|
||||
# Cluster rows first (needed for column alignment analysis)
|
||||
rows = _cluster_rows(zone_words)
|
||||
# Cluster columns by left-edge alignment
|
||||
columns = _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
||||
# Use global columns if provided, otherwise detect per zone
|
||||
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
||||
|
||||
if not columns or not rows:
|
||||
return {
|
||||
@@ -476,13 +484,40 @@ async def build_grid(session_id: str):
|
||||
content_x, content_y, content_w, content_h, boxes
|
||||
)
|
||||
|
||||
# --- Global column detection across ALL content zones ---
|
||||
# Content zones share the same table structure (the table
|
||||
# spans the full page, boxes are overlaid on top). Detect
|
||||
# columns once from all content-zone words so that narrow
|
||||
# columns (page refs, markers) visible in only one zone
|
||||
# are applied consistently everywhere.
|
||||
all_content_words: List[Dict] = []
|
||||
for pz in page_zones:
|
||||
if pz.zone_type == "content":
|
||||
all_content_words.extend(
|
||||
_words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width)
|
||||
)
|
||||
|
||||
global_columns = None
|
||||
if all_content_words:
|
||||
global_rows = _cluster_rows(all_content_words)
|
||||
global_columns = _cluster_columns_by_alignment(
|
||||
all_content_words, content_w, global_rows,
|
||||
)
|
||||
logger.info(
|
||||
"build-grid session %s: global columns from %d content words → %d columns",
|
||||
session_id, len(all_content_words), len(global_columns),
|
||||
)
|
||||
|
||||
for pz in page_zones:
|
||||
zone_words = _words_in_zone(
|
||||
all_words, pz.y, pz.height, pz.x, pz.width
|
||||
)
|
||||
# Content zones use global columns; box zones detect independently
|
||||
cols_override = global_columns if pz.zone_type == "content" else None
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=cols_override,
|
||||
)
|
||||
|
||||
zone_entry: Dict[str, Any] = {
|
||||
|
||||
Reference in New Issue
Block a user