diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 697f7d4..06f9b86 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -383,6 +383,7 @@ def _build_zone_grid( "rows": out_rows, "cells": cells, "header_rows": header_rows, + "_raw_columns": columns, # internal: for propagation to other zones } @@ -484,41 +485,59 @@ async def build_grid(session_id: str): content_x, content_y, content_w, content_h, boxes ) - # --- Global column detection across ALL content zones --- - # Content zones share the same table structure (the table - # spans the full page, boxes are overlaid on top). Detect - # columns once from all content-zone words so that narrow - # columns (page refs, markers) visible in only one zone - # are applied consistently everywhere. - all_content_words: List[Dict] = [] - for pz in page_zones: - if pz.zone_type == "content": - all_content_words.extend( - _words_in_zone(all_words, pz.y, pz.height, pz.x, pz.width) - ) + # --- Propagate columns from largest content zone --- + # The table structure spans the full page; boxes are overlaid + # on top. The content zone with the most words has the best + # column detection. Apply its columns to all other content + # zones so that narrow columns (page refs, markers) visible + # in only one zone are consistent everywhere. - global_columns = None - if all_content_words: - global_rows = _cluster_rows(all_content_words) - global_columns = _cluster_columns_by_alignment( - all_content_words, content_w, global_rows, - ) - logger.info( - "build-grid session %s: global columns from %d content words → %d columns", - session_id, len(all_content_words), len(global_columns), - ) + # First pass: build grids per zone, track best content columns + zone_grids: List[Dict] = [] + best_content_cols = None + best_content_word_count = 0 for pz in page_zones: zone_words = _words_in_zone( all_words, pz.y, pz.height, pz.x, pz.width ) - # Content zones use global columns; box zones detect independently - cols_override = global_columns if pz.zone_type == "content" else None grid = _build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, - global_columns=cols_override, ) + zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) + + # Track the content zone with the most words + if pz.zone_type == "content" and len(zone_words) > best_content_word_count: + best_content_word_count = len(zone_words) + # Extract column defs from grid output for reuse + best_content_cols = grid.get("_raw_columns") + + # Second pass: re-build smaller content zones with best columns + if best_content_cols and len(best_content_cols) > 1: + for zg in zone_grids: + pz = zg["pz"] + if (pz.zone_type == "content" + and len(zg["words"]) < best_content_word_count): + # Re-build this zone with the best content columns + grid = _build_zone_grid( + zg["words"], pz.x, pz.y, pz.width, pz.height, + pz.index, img_w, img_h, + global_columns=best_content_cols, + ) + zg["grid"] = grid + logger.info( + "build-grid session %s: zone %d (%d words) " + "uses columns from largest content zone (%d words, %d cols)", + session_id, pz.index, len(zg["words"]), + best_content_word_count, len(best_content_cols), + ) + + for zg in zone_grids: + pz = zg["pz"] + grid = zg["grid"] + # Remove internal _raw_columns before adding to response + grid.pop("_raw_columns", None) zone_entry: Dict[str, Any] = { "zone_index": pz.index, @@ -534,7 +553,7 @@ async def build_grid(session_id: str): "h": round(pz.height / img_h * 100, 2) if img_h else 0, }, "border": None, - "word_count": len(zone_words), + "word_count": len(zg["words"]), **grid, } @@ -552,6 +571,7 @@ async def build_grid(session_id: str): all_words, content_x, content_y, content_w, content_h, 0, img_w, img_h, ) + grid.pop("_raw_columns", None) zones_data.append({ "zone_index": 0, "zone_type": "content",