From 02ae6249cad8e79e15531e24d81ed342b346e0b1 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 16 Mar 2026 22:30:15 +0100 Subject: [PATCH] fix: propagate columns from largest content zone instead of global detection Global column detection diluted narrow sub-columns (page refs, markers) because they appeared in too few rows relative to the total. Instead, detect columns per zone independently, then propagate the best columns (from the content zone with the most words) to smaller content zones. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 72 ++++++++++++++-------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 697f7d4..06f9b86 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -383,6 +383,7 @@ def _build_zone_grid( "rows": out_rows, "cells": cells, "header_rows": header_rows, + "_raw_columns": columns, # internal: for propagation to other zones } @@ -484,41 +485,59 @@ async def build_grid(session_id: str): content_x, content_y, content_w, content_h, boxes ) - # --- Global column detection across ALL content zones --- - # Content zones share the same table structure (the table - # spans the full page, boxes are overlaid on top). Detect - # columns once from all content-zone words so that narrow - # columns (page refs, markers) visible in only one zone - # are applied consistently everywhere. - all_content_words: List[Dict] = [] - for pz in page_zones: - if pz.zone_type == "content": - all_content_words.extend( - _words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width) - ) + # --- Propagate columns from largest content zone --- + # The table structure spans the full page; boxes are overlaid + # on top. The content zone with the most words has the best + # column detection. Apply its columns to all other content + # zones so that narrow columns (page refs, markers) visible + # in only one zone are consistent everywhere. - global_columns = None - if all_content_words: - global_rows = _cluster_rows(all_content_words) - global_columns = _cluster_columns_by_alignment( - all_content_words, content_w, global_rows, - ) - logger.info( - "build-grid session %s: global columns from %d content words → %d columns", - session_id, len(all_content_words), len(global_columns), - ) + # First pass: build grids per zone, track best content columns + zone_grids: List[Dict] = [] + best_content_cols = None + best_content_word_count = 0 for pz in page_zones: zone_words = _words_in_zone( all_words, pz.y, pz.height, pz.x, pz.width ) - # Content zones use global columns; box zones detect independently - cols_override = global_columns if pz.zone_type == "content" else None grid = _build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, - global_columns=cols_override, ) + zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) + + # Track the content zone with the most words + if pz.zone_type == "content" and len(zone_words) > best_content_word_count: + best_content_word_count = len(zone_words) + # Extract column defs from grid output for reuse + best_content_cols = grid.get("_raw_columns") + + # Second pass: re-build smaller content zones with best columns + if best_content_cols and len(best_content_cols) > 1: + for zg in zone_grids: + pz = zg["pz"] + if (pz.zone_type == "content" + and len(zg["words"]) < best_content_word_count): + # Re-build this zone with the best content columns + grid = _build_zone_grid( + zg["words"], pz.x, pz.y, pz.width, pz.height, + pz.index, img_w, img_h, + global_columns=best_content_cols, + ) + zg["grid"] = grid + logger.info( + "build-grid session %s: zone %d (%d words) " + "uses columns from largest content zone (%d words, %d cols)", + session_id, pz.index, len(zg["words"]), + best_content_word_count, len(best_content_cols), + ) + + for zg in zone_grids: + pz = zg["pz"] + grid = zg["grid"] + # Remove internal _raw_columns before adding to response + grid.pop("_raw_columns", None) zone_entry: Dict[str, Any] = { "zone_index": pz.index, @@ -534,7 +553,7 @@ async def build_grid(session_id: str): "h": round(pz.height / img_h * 100, 2) if img_h else 0, }, "border": None, - "word_count": len(zone_words), + "word_count": len(zg["words"]), **grid, } @@ -552,6 +571,7 @@ async def build_grid(session_id: str): all_words, content_x, content_y, content_w, content_h, 0, img_w, img_h, ) + grid.pop("_raw_columns", None) zones_data.append({ "zone_index": 0, "zone_type": "content",