fix: union column detection across all content zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s

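Previously, columns were propagated only from the largest content zone,
which missed narrow columns such as page_ref when they appeared only in
a smaller zone. Now column split points are collected from ALL content
zones, and split points closer together than max(25, 3% of the content
width) are merged into one. If the merged set yields more columns than
any single zone detected on its own, every content zone is rebuilt with
it, so a column found in any zone (e.g. page_ref at x=132 in the zone
below the boxes) is available everywhere.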

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 23:02:33 +01:00
parent 9fb3229270
commit 427fecdce0

View File

@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
content_x, content_y, content_w, content_h, boxes content_x, content_y, content_w, content_h, boxes
) )
# --- Propagate columns from largest content zone --- # --- Union columns from all content zones ---
# The table structure spans the full page; boxes are overlaid # Each content zone detects columns independently. Narrow
# on top. The content zone with the most words has the best # columns (page refs, markers) may appear in only one zone.
# column detection. Apply its columns to all other content # Merge column split-points from ALL content zones so every
# zones so that narrow columns (page refs, markers) visible # zone shares the full column set.
# in only one zone are consistent everywhere.
# First pass: build grids per zone, track best content columns # First pass: build grids per zone independently
zone_grids: List[Dict] = [] zone_grids: List[Dict] = []
best_content_cols = None
best_content_word_count = 0
for pz in page_zones: for pz in page_zones:
zone_words = _words_in_zone( zone_words = _words_in_zone(
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
) )
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
# Track the content zone with the most words # Second pass: merge column boundaries from all content zones
if pz.zone_type == "content" and len(zone_words) > best_content_word_count: content_zones = [
best_content_word_count = len(zone_words) zg for zg in zone_grids if zg["pz"].zone_type == "content"
# Extract column defs from grid output for reuse ]
best_content_cols = grid.get("_raw_columns") if len(content_zones) > 1:
# Collect column split points (x_min of non-first columns)
all_split_xs: List[float] = []
for zg in content_zones:
raw_cols = zg["grid"].get("_raw_columns", [])
for col in raw_cols[1:]:
all_split_xs.append(col["x_min"])
# Second pass: re-build smaller content zones with best columns if all_split_xs:
if best_content_cols and len(best_content_cols) > 1: all_split_xs.sort()
for zg in zone_grids: merge_distance = max(25, int(content_w * 0.03))
pz = zg["pz"] merged_xs = [all_split_xs[0]]
if (pz.zone_type == "content" for x in all_split_xs[1:]:
and len(zg["words"]) < best_content_word_count): if x - merged_xs[-1] < merge_distance:
# Re-build this zone with the best content columns merged_xs[-1] = (merged_xs[-1] + x) / 2
grid = _build_zone_grid( else:
zg["words"], pz.x, pz.y, pz.width, pz.height, merged_xs.append(x)
pz.index, img_w, img_h,
global_columns=best_content_cols, total_cols = len(merged_xs) + 1
max_zone_cols = max(
len(zg["grid"].get("_raw_columns", []))
for zg in content_zones
)
# Only apply union if it found more columns than
# any single zone (union adds information)
if total_cols > max_zone_cols:
cx_min = min(w["left"] for w in all_words)
cx_max = max(
w["left"] + w["width"] for w in all_words
) )
zg["grid"] = grid merged_columns: List[Dict[str, Any]] = []
prev_x = cx_min
for i, sx in enumerate(merged_xs):
merged_columns.append({
"index": i,
"type": f"column_{i + 1}",
"x_min": prev_x,
"x_max": sx,
})
prev_x = sx
merged_columns.append({
"index": len(merged_xs),
"type": f"column_{len(merged_xs) + 1}",
"x_min": prev_x,
"x_max": cx_max,
})
# Re-build ALL content zones with merged columns
for zg in zone_grids:
pz = zg["pz"]
if pz.zone_type == "content":
grid = _build_zone_grid(
zg["words"], pz.x, pz.y,
pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=merged_columns,
)
zg["grid"] = grid
logger.info( logger.info(
"build-grid session %s: zone %d (%d words) " "build-grid session %s: union of %d content "
"uses columns from largest content zone (%d words, %d cols)", "zones → %d merged columns (max single zone: %d)",
session_id, pz.index, len(zg["words"]), session_id, len(content_zones),
best_content_word_count, len(best_content_cols), total_cols, max_zone_cols,
) )
for zg in zone_grids: for zg in zone_grids: