fix: union column detection across all content zones

Instead of propagating columns from only the largest content zone (which missed narrow columns such as page_ref), collect column split points from ALL content zones and merge them. This way a column found in any zone (e.g. page_ref at x=132 in the zone below the boxes) is available everywhere.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 23:02:33 +01:00
parent 9fb3229270
commit 427fecdce0
1 changed files with 71 additions and 30 deletions
@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
                    content_x, content_y, content_w, content_h, boxes
                )

-                # --- Propagate columns from largest content zone ---
-                # The table structure spans the full page; boxes are overlaid
-                # on top.  The content zone with the most words has the best
-                # column detection.  Apply its columns to all other content
-                # zones so that narrow columns (page refs, markers) visible
-                # in only one zone are consistent everywhere.
+                # --- Union columns from all content zones ---
+                # Each content zone detects columns independently.  Narrow
+                # columns (page refs, markers) may appear in only one zone.
+                # Merge column split-points from ALL content zones so every
+                # zone shares the full column set.

-                # First pass: build grids per zone, track best content columns
+                # First pass: build grids per zone independently
                zone_grids: List[Dict] = []
-                best_content_cols = None
-                best_content_word_count = 0

                for pz in page_zones:
                    zone_words = _words_in_zone(
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
                    )
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})

-                    # Track the content zone with the most words
-                    if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
-                        best_content_word_count = len(zone_words)
-                        # Extract column defs from grid output for reuse
-                        best_content_cols = grid.get("_raw_columns")
+                # Second pass: merge column boundaries from all content zones
+                content_zones = [
+                    zg for zg in zone_grids if zg["pz"].zone_type == "content"
+                ]
+                if len(content_zones) > 1:
+                    # Collect column split points (x_min of non-first columns)
+                    all_split_xs: List[float] = []
+                    for zg in content_zones:
+                        raw_cols = zg["grid"].get("_raw_columns", [])
+                        for col in raw_cols[1:]:
+                            all_split_xs.append(col["x_min"])

-                # Second pass: re-build smaller content zones with best columns
-                if best_content_cols and len(best_content_cols) > 1:
-                    for zg in zone_grids:
-                        pz = zg["pz"]
-                        if (pz.zone_type == "content"
-                                and len(zg["words"]) < best_content_word_count):
-                            # Re-build this zone with the best content columns
-                            grid = _build_zone_grid(
-                                zg["words"], pz.x, pz.y, pz.width, pz.height,
-                                pz.index, img_w, img_h,
-                                global_columns=best_content_cols,
+                    if all_split_xs:
+                        all_split_xs.sort()
+                        merge_distance = max(25, int(content_w * 0.03))
+                        merged_xs = [all_split_xs[0]]
+                        for x in all_split_xs[1:]:
+                            if x - merged_xs[-1] < merge_distance:
+                                merged_xs[-1] = (merged_xs[-1] + x) / 2
+                            else:
+                                merged_xs.append(x)
+
+                        total_cols = len(merged_xs) + 1
+                        max_zone_cols = max(
+                            len(zg["grid"].get("_raw_columns", []))
+                            for zg in content_zones
+                        )
+
+                        # Only apply union if it found more columns than
+                        # any single zone (union adds information)
+                        if total_cols > max_zone_cols:
+                            cx_min = min(w["left"] for w in all_words)
+                            cx_max = max(
+                                w["left"] + w["width"] for w in all_words
                            )
-                            zg["grid"] = grid
+                            merged_columns: List[Dict[str, Any]] = []
+                            prev_x = cx_min
+                            for i, sx in enumerate(merged_xs):
+                                merged_columns.append({
+                                    "index": i,
+                                    "type": f"column_{i + 1}",
+                                    "x_min": prev_x,
+                                    "x_max": sx,
+                                })
+                                prev_x = sx
+                            merged_columns.append({
+                                "index": len(merged_xs),
+                                "type": f"column_{len(merged_xs) + 1}",
+                                "x_min": prev_x,
+                                "x_max": cx_max,
+                            })
+
+                            # Re-build ALL content zones with merged columns
+                            for zg in zone_grids:
+                                pz = zg["pz"]
+                                if pz.zone_type == "content":
+                                    grid = _build_zone_grid(
+                                        zg["words"], pz.x, pz.y,
+                                        pz.width, pz.height,
+                                        pz.index, img_w, img_h,
+                                        global_columns=merged_columns,
+                                    )
+                                    zg["grid"] = grid
                            logger.info(
-                                "build-grid session %s: zone %d (%d words) "
-                                "uses columns from largest content zone (%d words, %d cols)",
-                                session_id, pz.index, len(zg["words"]),
-                                best_content_word_count, len(best_content_cols),
+                                "build-grid session %s: union of %d content "
+                                "zones → %d merged columns (max single zone: %d)",
+                                session_id, len(content_zones),
+                                total_cols, max_zone_cols,
                            )

                for zg in zone_grids: