fix: union column detection across all content zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Instead of propagating columns from only the largest content zone (which missed narrow columns such as page_ref), collect the column split points from ALL content zones and merge them. This way, a column found in any zone (e.g. page_ref at x=132 in the zone below the boxes) is available in every zone.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
|
||||
content_x, content_y, content_w, content_h, boxes
|
||||
)
|
||||
|
||||
# --- Propagate columns from largest content zone ---
|
||||
# The table structure spans the full page; boxes are overlaid
|
||||
# on top. The content zone with the most words has the best
|
||||
# column detection. Apply its columns to all other content
|
||||
# zones so that narrow columns (page refs, markers) visible
|
||||
# in only one zone are consistent everywhere.
|
||||
# --- Union columns from all content zones ---
|
||||
# Each content zone detects columns independently. Narrow
|
||||
# columns (page refs, markers) may appear in only one zone.
|
||||
# Merge column split-points from ALL content zones so every
|
||||
# zone shares the full column set.
|
||||
|
||||
# First pass: build grids per zone, track best content columns
|
||||
# First pass: build grids per zone independently
|
||||
zone_grids: List[Dict] = []
|
||||
best_content_cols = None
|
||||
best_content_word_count = 0
|
||||
|
||||
for pz in page_zones:
|
||||
zone_words = _words_in_zone(
|
||||
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
|
||||
)
|
||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||
|
||||
# Track the content zone with the most words
|
||||
if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
|
||||
best_content_word_count = len(zone_words)
|
||||
# Extract column defs from grid output for reuse
|
||||
best_content_cols = grid.get("_raw_columns")
|
||||
# Second pass: merge column boundaries from all content zones
|
||||
content_zones = [
|
||||
zg for zg in zone_grids if zg["pz"].zone_type == "content"
|
||||
]
|
||||
if len(content_zones) > 1:
|
||||
# Collect column split points (x_min of non-first columns)
|
||||
all_split_xs: List[float] = []
|
||||
for zg in content_zones:
|
||||
raw_cols = zg["grid"].get("_raw_columns", [])
|
||||
for col in raw_cols[1:]:
|
||||
all_split_xs.append(col["x_min"])
|
||||
|
||||
# Second pass: re-build smaller content zones with best columns
|
||||
if best_content_cols and len(best_content_cols) > 1:
|
||||
for zg in zone_grids:
|
||||
pz = zg["pz"]
|
||||
if (pz.zone_type == "content"
|
||||
and len(zg["words"]) < best_content_word_count):
|
||||
# Re-build this zone with the best content columns
|
||||
grid = _build_zone_grid(
|
||||
zg["words"], pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=best_content_cols,
|
||||
if all_split_xs:
|
||||
all_split_xs.sort()
|
||||
merge_distance = max(25, int(content_w * 0.03))
|
||||
merged_xs = [all_split_xs[0]]
|
||||
for x in all_split_xs[1:]:
|
||||
if x - merged_xs[-1] < merge_distance:
|
||||
merged_xs[-1] = (merged_xs[-1] + x) / 2
|
||||
else:
|
||||
merged_xs.append(x)
|
||||
|
||||
total_cols = len(merged_xs) + 1
|
||||
max_zone_cols = max(
|
||||
len(zg["grid"].get("_raw_columns", []))
|
||||
for zg in content_zones
|
||||
)
|
||||
|
||||
# Only apply union if it found more columns than
|
||||
# any single zone (union adds information)
|
||||
if total_cols > max_zone_cols:
|
||||
cx_min = min(w["left"] for w in all_words)
|
||||
cx_max = max(
|
||||
w["left"] + w["width"] for w in all_words
|
||||
)
|
||||
zg["grid"] = grid
|
||||
merged_columns: List[Dict[str, Any]] = []
|
||||
prev_x = cx_min
|
||||
for i, sx in enumerate(merged_xs):
|
||||
merged_columns.append({
|
||||
"index": i,
|
||||
"type": f"column_{i + 1}",
|
||||
"x_min": prev_x,
|
||||
"x_max": sx,
|
||||
})
|
||||
prev_x = sx
|
||||
merged_columns.append({
|
||||
"index": len(merged_xs),
|
||||
"type": f"column_{len(merged_xs) + 1}",
|
||||
"x_min": prev_x,
|
||||
"x_max": cx_max,
|
||||
})
|
||||
|
||||
# Re-build ALL content zones with merged columns
|
||||
for zg in zone_grids:
|
||||
pz = zg["pz"]
|
||||
if pz.zone_type == "content":
|
||||
grid = _build_zone_grid(
|
||||
zg["words"], pz.x, pz.y,
|
||||
pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=merged_columns,
|
||||
)
|
||||
zg["grid"] = grid
|
||||
logger.info(
|
||||
"build-grid session %s: zone %d (%d words) "
|
||||
"uses columns from largest content zone (%d words, %d cols)",
|
||||
session_id, pz.index, len(zg["words"]),
|
||||
best_content_word_count, len(best_content_cols),
|
||||
"build-grid session %s: union of %d content "
|
||||
"zones → %d merged columns (max single zone: %d)",
|
||||
session_id, len(content_zones),
|
||||
total_cols, max_zone_cols,
|
||||
)
|
||||
|
||||
for zg in zone_grids:
|
||||
|
||||
Reference in New Issue
Block a user