fix: propagate columns from largest content zone instead of global detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 21s
Global column detection diluted narrow sub-columns (page refs, markers) because they appeared in too few rows relative to the total row count. Instead, detect columns for each zone independently, then propagate the best columns (those from the content zone with the most words) to the smaller content zones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -383,6 +383,7 @@ def _build_zone_grid(
|
||||
"rows": out_rows,
|
||||
"cells": cells,
|
||||
"header_rows": header_rows,
|
||||
"_raw_columns": columns, # internal: for propagation to other zones
|
||||
}
|
||||
|
||||
|
||||
@@ -484,41 +485,59 @@ async def build_grid(session_id: str):
|
||||
content_x, content_y, content_w, content_h, boxes
|
||||
)
|
||||
|
||||
# --- Global column detection across ALL content zones ---
|
||||
# Content zones share the same table structure (the table
|
||||
# spans the full page, boxes are overlaid on top). Detect
|
||||
# columns once from all content-zone words so that narrow
|
||||
# columns (page refs, markers) visible in only one zone
|
||||
# are applied consistently everywhere.
|
||||
all_content_words: List[Dict] = []
|
||||
for pz in page_zones:
|
||||
if pz.zone_type == "content":
|
||||
all_content_words.extend(
|
||||
_words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width)
|
||||
)
|
||||
# --- Propagate columns from largest content zone ---
|
||||
# The table structure spans the full page; boxes are overlaid
|
||||
# on top. The content zone with the most words has the best
|
||||
# column detection. Apply its columns to all other content
|
||||
# zones so that narrow columns (page refs, markers) visible
|
||||
# in only one zone are consistent everywhere.
|
||||
|
||||
global_columns = None
|
||||
if all_content_words:
|
||||
global_rows = _cluster_rows(all_content_words)
|
||||
global_columns = _cluster_columns_by_alignment(
|
||||
all_content_words, content_w, global_rows,
|
||||
)
|
||||
logger.info(
|
||||
"build-grid session %s: global columns from %d content words → %d columns",
|
||||
session_id, len(all_content_words), len(global_columns),
|
||||
)
|
||||
# First pass: build grids per zone, track best content columns
|
||||
zone_grids: List[Dict] = []
|
||||
best_content_cols = None
|
||||
best_content_word_count = 0
|
||||
|
||||
for pz in page_zones:
|
||||
zone_words = _words_in_zone(
|
||||
all_words, pz.y, pz.height, pz.x, pz.width
|
||||
)
|
||||
# Content zones use global columns; box zones detect independently
|
||||
cols_override = global_columns if pz.zone_type == "content" else None
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=cols_override,
|
||||
)
|
||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||
|
||||
# Track the content zone with the most words
|
||||
if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
|
||||
best_content_word_count = len(zone_words)
|
||||
# Extract column defs from grid output for reuse
|
||||
best_content_cols = grid.get("_raw_columns")
|
||||
|
||||
# Second pass: re-build smaller content zones with best columns
|
||||
if best_content_cols and len(best_content_cols) > 1:
|
||||
for zg in zone_grids:
|
||||
pz = zg["pz"]
|
||||
if (pz.zone_type == "content"
|
||||
and len(zg["words"]) < best_content_word_count):
|
||||
# Re-build this zone with the best content columns
|
||||
grid = _build_zone_grid(
|
||||
zg["words"], pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=best_content_cols,
|
||||
)
|
||||
zg["grid"] = grid
|
||||
logger.info(
|
||||
"build-grid session %s: zone %d (%d words) "
|
||||
"uses columns from largest content zone (%d words, %d cols)",
|
||||
session_id, pz.index, len(zg["words"]),
|
||||
best_content_word_count, len(best_content_cols),
|
||||
)
|
||||
|
||||
for zg in zone_grids:
|
||||
pz = zg["pz"]
|
||||
grid = zg["grid"]
|
||||
# Remove internal _raw_columns before adding to response
|
||||
grid.pop("_raw_columns", None)
|
||||
|
||||
zone_entry: Dict[str, Any] = {
|
||||
"zone_index": pz.index,
|
||||
@@ -534,7 +553,7 @@ async def build_grid(session_id: str):
|
||||
"h": round(pz.height / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
"border": None,
|
||||
"word_count": len(zone_words),
|
||||
"word_count": len(zg["words"]),
|
||||
**grid,
|
||||
}
|
||||
|
||||
@@ -552,6 +571,7 @@ async def build_grid(session_id: str):
|
||||
all_words, content_x, content_y, content_w, content_h,
|
||||
0, img_w, img_h,
|
||||
)
|
||||
grid.pop("_raw_columns", None)
|
||||
zones_data.append({
|
||||
"zone_index": 0,
|
||||
"zone_type": "content",
|
||||
|
||||
Reference in New Issue
Block a user