fix: union column detection across all content zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Instead of propagating columns from only the largest content zone (which missed narrow columns such as page_ref), collect the column split points from ALL content zones and merge them. This way, a column found in any zone (e.g. page_ref at x=132 in the zone below the boxes) is available everywhere.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
|
|||||||
content_x, content_y, content_w, content_h, boxes
|
content_x, content_y, content_w, content_h, boxes
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Propagate columns from largest content zone ---
|
# --- Union columns from all content zones ---
|
||||||
# The table structure spans the full page; boxes are overlaid
|
# Each content zone detects columns independently. Narrow
|
||||||
# on top. The content zone with the most words has the best
|
# columns (page refs, markers) may appear in only one zone.
|
||||||
# column detection. Apply its columns to all other content
|
# Merge column split-points from ALL content zones so every
|
||||||
# zones so that narrow columns (page refs, markers) visible
|
# zone shares the full column set.
|
||||||
# in only one zone are consistent everywhere.
|
|
||||||
|
|
||||||
# First pass: build grids per zone, track best content columns
|
# First pass: build grids per zone independently
|
||||||
zone_grids: List[Dict] = []
|
zone_grids: List[Dict] = []
|
||||||
best_content_cols = None
|
|
||||||
best_content_word_count = 0
|
|
||||||
|
|
||||||
for pz in page_zones:
|
for pz in page_zones:
|
||||||
zone_words = _words_in_zone(
|
zone_words = _words_in_zone(
|
||||||
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
|
|||||||
)
|
)
|
||||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||||
|
|
||||||
# Track the content zone with the most words
|
# Second pass: merge column boundaries from all content zones
|
||||||
if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
|
content_zones = [
|
||||||
best_content_word_count = len(zone_words)
|
zg for zg in zone_grids if zg["pz"].zone_type == "content"
|
||||||
# Extract column defs from grid output for reuse
|
]
|
||||||
best_content_cols = grid.get("_raw_columns")
|
if len(content_zones) > 1:
|
||||||
|
# Collect column split points (x_min of non-first columns)
|
||||||
|
all_split_xs: List[float] = []
|
||||||
|
for zg in content_zones:
|
||||||
|
raw_cols = zg["grid"].get("_raw_columns", [])
|
||||||
|
for col in raw_cols[1:]:
|
||||||
|
all_split_xs.append(col["x_min"])
|
||||||
|
|
||||||
# Second pass: re-build smaller content zones with best columns
|
if all_split_xs:
|
||||||
if best_content_cols and len(best_content_cols) > 1:
|
all_split_xs.sort()
|
||||||
for zg in zone_grids:
|
merge_distance = max(25, int(content_w * 0.03))
|
||||||
pz = zg["pz"]
|
merged_xs = [all_split_xs[0]]
|
||||||
if (pz.zone_type == "content"
|
for x in all_split_xs[1:]:
|
||||||
and len(zg["words"]) < best_content_word_count):
|
if x - merged_xs[-1] < merge_distance:
|
||||||
# Re-build this zone with the best content columns
|
merged_xs[-1] = (merged_xs[-1] + x) / 2
|
||||||
grid = _build_zone_grid(
|
else:
|
||||||
zg["words"], pz.x, pz.y, pz.width, pz.height,
|
merged_xs.append(x)
|
||||||
pz.index, img_w, img_h,
|
|
||||||
global_columns=best_content_cols,
|
total_cols = len(merged_xs) + 1
|
||||||
|
max_zone_cols = max(
|
||||||
|
len(zg["grid"].get("_raw_columns", []))
|
||||||
|
for zg in content_zones
|
||||||
|
)
|
||||||
|
|
||||||
|
# Only apply union if it found more columns than
|
||||||
|
# any single zone (union adds information)
|
||||||
|
if total_cols > max_zone_cols:
|
||||||
|
cx_min = min(w["left"] for w in all_words)
|
||||||
|
cx_max = max(
|
||||||
|
w["left"] + w["width"] for w in all_words
|
||||||
)
|
)
|
||||||
zg["grid"] = grid
|
merged_columns: List[Dict[str, Any]] = []
|
||||||
|
prev_x = cx_min
|
||||||
|
for i, sx in enumerate(merged_xs):
|
||||||
|
merged_columns.append({
|
||||||
|
"index": i,
|
||||||
|
"type": f"column_{i + 1}",
|
||||||
|
"x_min": prev_x,
|
||||||
|
"x_max": sx,
|
||||||
|
})
|
||||||
|
prev_x = sx
|
||||||
|
merged_columns.append({
|
||||||
|
"index": len(merged_xs),
|
||||||
|
"type": f"column_{len(merged_xs) + 1}",
|
||||||
|
"x_min": prev_x,
|
||||||
|
"x_max": cx_max,
|
||||||
|
})
|
||||||
|
|
||||||
|
# Re-build ALL content zones with merged columns
|
||||||
|
for zg in zone_grids:
|
||||||
|
pz = zg["pz"]
|
||||||
|
if pz.zone_type == "content":
|
||||||
|
grid = _build_zone_grid(
|
||||||
|
zg["words"], pz.x, pz.y,
|
||||||
|
pz.width, pz.height,
|
||||||
|
pz.index, img_w, img_h,
|
||||||
|
global_columns=merged_columns,
|
||||||
|
)
|
||||||
|
zg["grid"] = grid
|
||||||
logger.info(
|
logger.info(
|
||||||
"build-grid session %s: zone %d (%d words) "
|
"build-grid session %s: union of %d content "
|
||||||
"uses columns from largest content zone (%d words, %d cols)",
|
"zones → %d merged columns (max single zone: %d)",
|
||||||
session_id, pz.index, len(zg["words"]),
|
session_id, len(content_zones),
|
||||||
best_content_word_count, len(best_content_cols),
|
total_cols, max_zone_cols,
|
||||||
)
|
)
|
||||||
|
|
||||||
for zg in zone_grids:
|
for zg in zone_grids:
|
||||||
|
|||||||
Reference in New Issue
Block a user