fix: union column detection across all content zones

Instead of propagating columns from only the largest content zone (which missed narrow columns such as page_ref), collect column split points from ALL content zones and merge them. This way, a column found in any zone (e.g. page_ref at x=132 in the zone below the boxes) is available in every zone.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 23:02:33 +01:00
parent 9fb3229270
commit 427fecdce0
1 changed file with 71 additions and 30 deletions
@@ -521,17 +521,14 @@ async def build_grid(session_id: str):
                    content_x, content_y, content_w, content_h, boxes
                )
-                # --- Propagate columns from largest content zone ---
+                # --- Union columns from all content zones ---
-                # The table structure spans the full page; boxes are overlaid
+                # Each content zone detects columns independently.  Narrow
-                # on top.  The content zone with the most words has the best
+                # columns (page refs, markers) may appear in only one zone.
-                # column detection.  Apply its columns to all other content
+                # Merge column split-points from ALL content zones so every
-                # zones so that narrow columns (page refs, markers) visible
+                # zone shares the full column set.
                # in only one zone are consistent everywhere.
-                # First pass: build grids per zone, track best content columns
+                # First pass: build grids per zone independently
                zone_grids: List[Dict] = []
                best_content_cols = None
                best_content_word_count = 0
                for pz in page_zones:
                    zone_words = _words_in_zone(
@@ -543,30 +540,74 @@ async def build_grid(session_id: str):
                    )
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
-                    # Track the content zone with the most words
+                # Second pass: merge column boundaries from all content zones
-                    if pz.zone_type == "content" and len(zone_words) > best_content_word_count:
+                content_zones = [
-                        best_content_word_count = len(zone_words)
+                    zg for zg in zone_grids if zg["pz"].zone_type == "content"
-                        # Extract column defs from grid output for reuse
+                ]
-                        best_content_cols = grid.get("_raw_columns")
+                if len(content_zones) > 1:
                    # Collect column split points (x_min of non-first columns)
                    all_split_xs: List[float] = []
                    for zg in content_zones:
                        raw_cols = zg["grid"].get("_raw_columns", [])
                        for col in raw_cols[1:]:
                            all_split_xs.append(col["x_min"])
-                # Second pass: re-build smaller content zones with best columns
+                    if all_split_xs:
-                if best_content_cols and len(best_content_cols) > 1:
+                        all_split_xs.sort()
-                    for zg in zone_grids:
+                        merge_distance = max(25, int(content_w * 0.03))
-                        pz = zg["pz"]
+                        merged_xs = [all_split_xs[0]]
-                        if (pz.zone_type == "content"
+                        for x in all_split_xs[1:]:
-                                and len(zg["words"]) < best_content_word_count):
+                            if x - merged_xs[-1] < merge_distance:
-                            # Re-build this zone with the best content columns
+                                merged_xs[-1] = (merged_xs[-1] + x) / 2
-                            grid = _build_zone_grid(
+                            else:
-                                zg["words"], pz.x, pz.y, pz.width, pz.height,
+                                merged_xs.append(x)
-                                pz.index, img_w, img_h,
+
-                                global_columns=best_content_cols,
+                        total_cols = len(merged_xs) + 1
                        max_zone_cols = max(
                            len(zg["grid"].get("_raw_columns", []))
                            for zg in content_zones
                        )
                        # Only apply union if it found more columns than
                        # any single zone (union adds information)
                        if total_cols > max_zone_cols:
                            cx_min = min(w["left"] for w in all_words)
                            cx_max = max(
                                w["left"] + w["width"] for w in all_words
                            )
-                            zg["grid"] = grid
+                            merged_columns: List[Dict[str, Any]] = []
                            prev_x = cx_min
                            for i, sx in enumerate(merged_xs):
                                merged_columns.append({
                                    "index": i,
                                    "type": f"column_{i + 1}",
                                    "x_min": prev_x,
                                    "x_max": sx,
                                })
                                prev_x = sx
                            merged_columns.append({
                                "index": len(merged_xs),
                                "type": f"column_{len(merged_xs) + 1}",
                                "x_min": prev_x,
                                "x_max": cx_max,
                            })
                            # Re-build ALL content zones with merged columns
                            for zg in zone_grids:
                                pz = zg["pz"]
                                if pz.zone_type == "content":
                                    grid = _build_zone_grid(
                                        zg["words"], pz.x, pz.y,
                                        pz.width, pz.height,
                                        pz.index, img_w, img_h,
                                        global_columns=merged_columns,
                                    )
                                    zg["grid"] = grid
                            logger.info(
-                                "build-grid session %s: zone %d (%d words) "
+                                "build-grid session %s: union of %d content "
-                                "uses columns from largest content zone (%d words, %d cols)",
+                                "zones → %d merged columns (max single zone: %d)",
-                                session_id, pz.index, len(zg["words"]),
+                                session_id, len(content_zones),
-                                best_content_word_count, len(best_content_cols),
+                                total_cols, max_zone_cols,
                            )
                for zg in zone_grids: