From 427fecdce0ee2102d3258880e533526699d7cb56 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 16 Mar 2026 23:02:33 +0100 Subject: [PATCH] fix: union column detection across all content zones Instead of propagating columns from the largest content zone only (which missed narrow columns like page_ref), collect column split points from ALL content zones and merge them. This way a column found in any zone (e.g. page_ref at x=132 in the zone below boxes) is available everywhere. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 101 +++++++++++++++------ 1 file changed, 71 insertions(+), 30 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index e332ab6..ecb13c2 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -521,17 +521,14 @@ async def build_grid(session_id: str): content_x, content_y, content_w, content_h, boxes ) - # --- Propagate columns from largest content zone --- - # The table structure spans the full page; boxes are overlaid - # on top. The content zone with the most words has the best - # column detection. Apply its columns to all other content - # zones so that narrow columns (page refs, markers) visible - # in only one zone are consistent everywhere. + # --- Union columns from all content zones --- + # Each content zone detects columns independently. Narrow + # columns (page refs, markers) may appear in only one zone. + # Merge column split-points from ALL content zones so every + # zone shares the full column set. - # First pass: build grids per zone, track best content columns + # First pass: build grids per zone independently zone_grids: List[Dict] = [] - best_content_cols = None - best_content_word_count = 0 for pz in page_zones: zone_words = _words_in_zone( @@ -543,30 +540,74 @@ async def build_grid(session_id: str): ) zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) - # Track the content zone with the most words - if pz.zone_type == "content" and len(zone_words) > best_content_word_count: - best_content_word_count = len(zone_words) - # Extract column defs from grid output for reuse - best_content_cols = grid.get("_raw_columns") + # Second pass: merge column boundaries from all content zones + content_zones = [ + zg for zg in zone_grids if zg["pz"].zone_type == "content" + ] + if len(content_zones) > 1: + # Collect column split points (x_min of non-first columns) + all_split_xs: List[float] = [] + for zg in content_zones: + raw_cols = zg["grid"].get("_raw_columns", []) + for col in raw_cols[1:]: + all_split_xs.append(col["x_min"]) - # Second pass: re-build smaller content zones with best columns - if best_content_cols and len(best_content_cols) > 1: - for zg in zone_grids: - pz = zg["pz"] - if (pz.zone_type == "content" - and len(zg["words"]) < best_content_word_count): - # Re-build this zone with the best content columns - grid = _build_zone_grid( - zg["words"], pz.x, pz.y, pz.width, pz.height, - pz.index, img_w, img_h, - global_columns=best_content_cols, + if all_split_xs: + all_split_xs.sort() + merge_distance = max(25, int(content_w * 0.03)) + merged_xs = [all_split_xs[0]] + for x in all_split_xs[1:]: + if x - merged_xs[-1] < merge_distance: + merged_xs[-1] = (merged_xs[-1] + x) / 2 + else: + merged_xs.append(x) + + total_cols = len(merged_xs) + 1 + max_zone_cols = max( + len(zg["grid"].get("_raw_columns", [])) + for zg in content_zones + ) + + # Only apply union if it found more columns than + # any single zone (union adds information) + if total_cols > max_zone_cols: + cx_min = min(w["left"] for w in all_words) + cx_max = max( + w["left"] + w["width"] for w in all_words ) - zg["grid"] = grid + merged_columns: List[Dict[str, Any]] = [] + prev_x = cx_min + for i, sx in enumerate(merged_xs): + merged_columns.append({ + "index": i, + "type": f"column_{i + 1}", + "x_min": prev_x, + "x_max": sx, + }) + prev_x = sx + merged_columns.append({ + "index": len(merged_xs), + "type": f"column_{len(merged_xs) + 1}", + "x_min": prev_x, + "x_max": cx_max, + }) + + # Re-build ALL content zones with merged columns + for zg in zone_grids: + pz = zg["pz"] + if pz.zone_type == "content": + grid = _build_zone_grid( + zg["words"], pz.x, pz.y, + pz.width, pz.height, + pz.index, img_w, img_h, + global_columns=merged_columns, + ) + zg["grid"] = grid logger.info( - "build-grid session %s: zone %d (%d words) " - "uses columns from largest content zone (%d words, %d cols)", - session_id, pz.index, len(zg["words"]), - best_content_word_count, len(best_content_cols), + "build-grid session %s: union of %d content " + "zones → %d merged columns (max single zone: %d)", + session_id, len(content_zones), + total_cols, max_zone_cols, ) for zg in zone_grids: