fix: union column detection across all content zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s

Instead of propagating columns from the largest content zone only
(which missed narrow columns like page_ref), collect column split
points from ALL content zones and merge them. This way a column
found in any zone (e.g. page_ref at x=132 in the zone below the
boxes) is applied to every content zone.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 23:02:33 +01:00
parent 9fb3229270
commit 427fecdce0

View File

@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
content_x, content_y, content_w, content_h, boxes
)
# --- Propagate columns from largest content zone ---
# The table structure spans the full page; boxes are overlaid
# on top. The content zone with the most words has the best
# column detection. Apply its columns to all other content
# zones so that narrow columns (page refs, markers) visible
# in only one zone are consistent everywhere.
# --- Union columns from all content zones ---
# Each content zone detects columns independently. Narrow
# columns (page refs, markers) may appear in only one zone.
# Merge column split-points from ALL content zones so every
# zone shares the full column set.
# First pass: build grids per zone, track best content columns
# First pass: build grids per zone independently
zone_grids: List[Dict] = []
best_content_cols = None
best_content_word_count = 0
for pz in page_zones:
zone_words = _words_in_zone(
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
)
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
# Track the content zone with the most words
if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
best_content_word_count = len(zone_words)
# Extract column defs from grid output for reuse
best_content_cols = grid.get("_raw_columns")
# Second pass: merge column boundaries from all content zones
content_zones = [
zg for zg in zone_grids if zg["pz"].zone_type == "content"
]
if len(content_zones) > 1:
# Collect column split points (x_min of non-first columns)
all_split_xs: List[float] = []
for zg in content_zones:
raw_cols = zg["grid"].get("_raw_columns", [])
for col in raw_cols[1:]:
all_split_xs.append(col["x_min"])
# Second pass: re-build smaller content zones with best columns
if best_content_cols and len(best_content_cols) > 1:
for zg in zone_grids:
pz = zg["pz"]
if (pz.zone_type == "content"
and len(zg["words"]) < best_content_word_count):
# Re-build this zone with the best content columns
grid = _build_zone_grid(
zg["words"], pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=best_content_cols,
if all_split_xs:
all_split_xs.sort()
merge_distance = max(25, int(content_w * 0.03))
merged_xs = [all_split_xs[0]]
for x in all_split_xs[1:]:
if x - merged_xs[-1] < merge_distance:
merged_xs[-1] = (merged_xs[-1] + x) / 2
else:
merged_xs.append(x)
total_cols = len(merged_xs) + 1
max_zone_cols = max(
len(zg["grid"].get("_raw_columns", []))
for zg in content_zones
)
# Only apply union if it found more columns than
# any single zone (union adds information)
if total_cols > max_zone_cols:
cx_min = min(w["left"] for w in all_words)
cx_max = max(
w["left"] + w["width"] for w in all_words
)
zg["grid"] = grid
merged_columns: List[Dict[str, Any]] = []
prev_x = cx_min
for i, sx in enumerate(merged_xs):
merged_columns.append({
"index": i,
"type": f"column_{i + 1}",
"x_min": prev_x,
"x_max": sx,
})
prev_x = sx
merged_columns.append({
"index": len(merged_xs),
"type": f"column_{len(merged_xs) + 1}",
"x_min": prev_x,
"x_max": cx_max,
})
# Re-build ALL content zones with merged columns
for zg in zone_grids:
pz = zg["pz"]
if pz.zone_type == "content":
grid = _build_zone_grid(
zg["words"], pz.x, pz.y,
pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=merged_columns,
)
zg["grid"] = grid
logger.info(
"build-grid session %s: zone %d (%d words) "
"uses columns from largest content zone (%d words, %d cols)",
session_id, pz.index, len(zg["words"]),
best_content_word_count, len(best_content_cols),
"build-grid session %s: union of %d content "
"zones → %d merged columns (max single zone: %d)",
session_id, len(content_zones),
total_cols, max_zone_cols,
)
for zg in zone_grids: