fix: propagate columns from largest content zone instead of global detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 21s

Global column detection diluted narrow sub-columns (page refs, markers)
because they appeared in too few rows relative to the total row count.
Instead, detect columns independently per zone, then propagate the best
columns (those from the content zone with the most words) to the smaller
content zones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 22:30:15 +01:00
parent cf995f2d52
commit 02ae6249ca

View File

@@ -383,6 +383,7 @@ def _build_zone_grid(
"rows": out_rows,
"cells": cells,
"header_rows": header_rows,
"_raw_columns": columns, # internal: for propagation to other zones
}
@@ -484,41 +485,59 @@ async def build_grid(session_id: str):
content_x, content_y, content_w, content_h, boxes
)
# --- Global column detection across ALL content zones ---
# Content zones share the same table structure (the table
# spans the full page, boxes are overlaid on top). Detect
# columns once from all content-zone words so that narrow
# columns (page refs, markers) visible in only one zone
# are applied consistently everywhere.
all_content_words: List[Dict] = []
for pz in page_zones:
if pz.zone_type == "content":
all_content_words.extend(
_words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width)
)
# --- Propagate columns from largest content zone ---
# The table structure spans the full page; boxes are overlaid
# on top. The content zone with the most words is assumed to
# give the most reliable column detection. Apply its columns
# to every content zone with fewer words so that narrow columns
# (page refs, markers) visible in only one zone are applied
# consistently everywhere.
global_columns = None
if all_content_words:
global_rows = _cluster_rows(all_content_words)
global_columns = _cluster_columns_by_alignment(
all_content_words, content_w, global_rows,
)
logger.info(
"build-grid session %s: global columns from %d content words → %d columns",
session_id, len(all_content_words), len(global_columns),
)
# First pass: build grids per zone, track best content columns
zone_grids: List[Dict] = []
best_content_cols = None
best_content_word_count = 0
for pz in page_zones:
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
# Content zones use global columns; box zones detect independently
cols_override = global_columns if pz.zone_type == "content" else None
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=cols_override,
)
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
# Track the content zone with the most words
if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
best_content_word_count = len(zone_words)
# Extract column defs from grid output for reuse
best_content_cols = grid.get("_raw_columns")
# Second pass: re-build smaller content zones with best columns
if best_content_cols and len(best_content_cols) > 1:
for zg in zone_grids:
pz = zg["pz"]
if (pz.zone_type == "content"
and len(zg["words"]) < best_content_word_count):
# Re-build this zone with the best content columns
grid = _build_zone_grid(
zg["words"], pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=best_content_cols,
)
zg["grid"] = grid
logger.info(
"build-grid session %s: zone %d (%d words) "
"uses columns from largest content zone (%d words, %d cols)",
session_id, pz.index, len(zg["words"]),
best_content_word_count, len(best_content_cols),
)
for zg in zone_grids:
pz = zg["pz"]
grid = zg["grid"]
# Remove internal _raw_columns before adding to response
grid.pop("_raw_columns", None)
zone_entry: Dict[str, Any] = {
"zone_index": pz.index,
@@ -534,7 +553,7 @@ async def build_grid(session_id: str):
"h": round(pz.height / img_h * 100, 2) if img_h else 0,
},
"border": None,
"word_count": len(zone_words),
"word_count": len(zg["words"]),
**grid,
}
@@ -552,6 +571,7 @@ async def build_grid(session_id: str):
all_words, content_x, content_y, content_w, content_h,
0, img_w, img_h,
)
grid.pop("_raw_columns", None)
zones_data.append({
"zone_index": 0,
"zone_type": "content",