""" Unified Grid Builder — merges multi-zone grid into a single Excel-like grid. Takes content zone + box zones and produces one unified zone where: - All content rows use the dominant row height - Full-width boxes are integrated directly (box rows replace standard rows) - Partial-width boxes: extra rows inserted if box has more lines than standard - Box-origin cells carry metadata (bg_color, border) for visual distinction The result is a single-zone StructuredGrid that can be: - Rendered in an Excel-like editor - Exported to Excel/CSV - Edited with unified row/column numbering """ import logging import math import statistics from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) def _compute_dominant_row_height(content_zone: Dict) -> float: """Median of content row-to-row spacings, excluding box-gap jumps.""" rows = content_zone.get("rows", []) if len(rows) < 2: return 47.0 spacings = [] for i in range(len(rows) - 1): y1 = rows[i].get("y_min_px", rows[i].get("y_min", 0)) y2 = rows[i + 1].get("y_min_px", rows[i + 1].get("y_min", 0)) d = y2 - y1 if 0 < d < 100: # exclude box-gap jumps spacings.append(d) if not spacings: return 47.0 spacings.sort() return spacings[len(spacings) // 2] def _classify_boxes( box_zones: List[Dict], content_width: float, ) -> List[Dict]: """Classify each box as full_width or partial_width.""" result = [] for bz in box_zones: bb = bz.get("bbox_px", {}) bw = bb.get("w", 0) bx = bb.get("x", 0) if bw >= content_width * 0.85: classification = "full_width" side = "center" else: classification = "partial_width" # Determine which side of the page the box is on page_center = content_width / 2 box_center = bx + bw / 2 side = "right" if box_center > page_center else "left" # Count total text lines in box (including \n within cells) total_lines = sum( (c.get("text", "").count("\n") + 1) for c in bz.get("cells", []) ) result.append({ "zone": bz, "classification": classification, "side": side, "y_start": bb.get("y", 0), "y_end": bb.get("y", 0) + bb.get("h", 0), "total_lines": total_lines, "bg_hex": bz.get("box_bg_hex", ""), "bg_color": bz.get("box_bg_color", ""), }) return result def build_unified_grid( zones: List[Dict], image_width: int, image_height: int, layout_metrics: Dict, ) -> Dict[str, Any]: """Build a single-zone unified grid from multi-zone grid data. Returns a StructuredGrid with one zone containing all rows and cells. """ content_zone = None box_zones = [] for z in zones: if z.get("zone_type") == "content": content_zone = z elif z.get("zone_type") == "box": box_zones.append(z) if not content_zone: logger.warning("build_unified_grid: no content zone found") return {"zones": zones} # fallback: return as-is box_zones.sort(key=lambda b: b.get("bbox_px", {}).get("y", 0)) dominant_h = _compute_dominant_row_height(content_zone) content_bbox = content_zone.get("bbox_px", {}) content_width = content_bbox.get("w", image_width) content_x = content_bbox.get("x", 0) content_cols = content_zone.get("columns", []) num_cols = len(content_cols) box_infos = _classify_boxes(box_zones, content_width) logger.info( "build_unified_grid: dominant_h=%.1f, %d content rows, %d boxes (%s)", dominant_h, len(content_zone.get("rows", [])), len(box_infos), [b["classification"] for b in box_infos], ) # --- Build unified row list + cell list --- unified_rows: List[Dict] = [] unified_cells: List[Dict] = [] unified_row_idx = 0 # Content rows and cells indexed by row_index content_rows = content_zone.get("rows", []) content_cells = content_zone.get("cells", []) content_cells_by_row: Dict[int, List[Dict]] = {} for c in content_cells: content_cells_by_row.setdefault(c.get("row_index", -1), []).append(c) # Track which content rows we've processed content_row_ptr = 0 for bi, box_info in enumerate(box_infos): bz = box_info["zone"] by_start = box_info["y_start"] by_end = box_info["y_end"] # --- Add content rows ABOVE this box --- while content_row_ptr < len(content_rows): cr = content_rows[content_row_ptr] cry = cr.get("y_min_px", cr.get("y_min", 0)) if cry >= by_start: break # Add this content row _add_content_row( unified_rows, unified_cells, unified_row_idx, cr, content_cells_by_row, dominant_h, image_height, ) unified_row_idx += 1 content_row_ptr += 1 # --- Add box rows --- if box_info["classification"] == "full_width": # Full-width box: integrate box rows directly _add_full_width_box( unified_rows, unified_cells, unified_row_idx, bz, box_info, dominant_h, num_cols, image_height, ) unified_row_idx += len(bz.get("rows", [])) # Skip content rows that overlap with this box while content_row_ptr < len(content_rows): cr = content_rows[content_row_ptr] cry = cr.get("y_min_px", cr.get("y_min", 0)) if cry > by_end: break content_row_ptr += 1 else: # Partial-width box: merge with adjacent content rows unified_row_idx = _add_partial_width_box( unified_rows, unified_cells, unified_row_idx, bz, box_info, content_rows, content_cells_by_row, content_row_ptr, dominant_h, num_cols, image_height, content_x, content_width, ) # Advance content pointer past box region while content_row_ptr < len(content_rows): cr = content_rows[content_row_ptr] cry = cr.get("y_min_px", cr.get("y_min", 0)) if cry > by_end: break content_row_ptr += 1 # --- Add remaining content rows BELOW all boxes --- while content_row_ptr < len(content_rows): cr = content_rows[content_row_ptr] _add_content_row( unified_rows, unified_cells, unified_row_idx, cr, content_cells_by_row, dominant_h, image_height, ) unified_row_idx += 1 content_row_ptr += 1 # --- Build unified zone --- unified_zone = { "zone_index": 0, "zone_type": "unified", "bbox_px": content_bbox, "bbox_pct": content_zone.get("bbox_pct", {}), "border": None, "word_count": sum(len(c.get("word_boxes", [])) for c in unified_cells), "columns": content_cols, "rows": unified_rows, "cells": unified_cells, "header_rows": [], } logger.info( "build_unified_grid: %d unified rows, %d cells (from %d content + %d box zones)", len(unified_rows), len(unified_cells), len(content_rows), len(box_zones), ) return { "zones": [unified_zone], "image_width": image_width, "image_height": image_height, "layout_metrics": layout_metrics, "summary": { "total_zones": 1, "total_columns": num_cols, "total_rows": len(unified_rows), "total_cells": len(unified_cells), }, "is_unified": True, "dominant_row_h": dominant_h, } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _make_row(idx: int, y: float, h: float, img_h: int, is_header: bool = False) -> Dict: return { "index": idx, "row_index": idx, "y_min_px": round(y), "y_max_px": round(y + h), "y_min_pct": round(y / img_h * 100, 2) if img_h else 0, "y_max_pct": round((y + h) / img_h * 100, 2) if img_h else 0, "is_header": is_header, } def _remap_cell(cell: Dict, new_row: int, new_col: int = None, source_type: str = "content", box_region: Dict = None) -> Dict: """Create a new cell dict with remapped indices.""" c = dict(cell) c["row_index"] = new_row if new_col is not None: c["col_index"] = new_col c["cell_id"] = f"U_R{new_row:02d}_C{c.get('col_index', 0)}" c["source_zone_type"] = source_type if box_region: c["box_region"] = box_region return c def _add_content_row( unified_rows, unified_cells, row_idx, content_row, cells_by_row, dominant_h, img_h, ): """Add a single content row to the unified grid.""" y = content_row.get("y_min_px", content_row.get("y_min", 0)) is_hdr = content_row.get("is_header", False) unified_rows.append(_make_row(row_idx, y, dominant_h, img_h, is_hdr)) for cell in cells_by_row.get(content_row.get("index", -1), []): unified_cells.append(_remap_cell(cell, row_idx, source_type="content")) def _add_full_width_box( unified_rows, unified_cells, start_row_idx, box_zone, box_info, dominant_h, num_cols, img_h, ): """Add a full-width box's rows to the unified grid.""" box_rows = box_zone.get("rows", []) box_cells = box_zone.get("cells", []) box_region = {"bg_hex": box_info["bg_hex"], "bg_color": box_info["bg_color"], "border": True} # Distribute box height evenly among its rows box_h = box_info["y_end"] - box_info["y_start"] row_h = box_h / len(box_rows) if box_rows else dominant_h for i, br in enumerate(box_rows): y = box_info["y_start"] + i * row_h new_idx = start_row_idx + i is_hdr = br.get("is_header", False) unified_rows.append(_make_row(new_idx, y, row_h, img_h, is_hdr)) for cell in box_cells: if cell.get("row_index") == br.get("index", i): unified_cells.append( _remap_cell(cell, new_idx, source_type="box", box_region=box_region) ) def _add_partial_width_box( unified_rows, unified_cells, start_row_idx, box_zone, box_info, content_rows, content_cells_by_row, content_row_ptr, dominant_h, num_cols, img_h, content_x, content_width, ) -> int: """Add a partial-width box merged with content rows. Returns the next unified_row_idx after processing. """ by_start = box_info["y_start"] by_end = box_info["y_end"] box_h = by_end - by_start box_region = {"bg_hex": box_info["bg_hex"], "bg_color": box_info["bg_color"], "border": True} # Content rows in the box's Y range overlap_content_rows = [] ptr = content_row_ptr while ptr < len(content_rows): cr = content_rows[ptr] cry = cr.get("y_min_px", cr.get("y_min", 0)) if cry > by_end: break if cry >= by_start: overlap_content_rows.append(cr) ptr += 1 # How many standard rows fit in the box height standard_rows = max(1, math.floor(box_h / dominant_h)) # How many text lines the box actually has box_text_lines = box_info["total_lines"] # Extra rows needed extra_rows = max(0, box_text_lines - standard_rows) total_rows_for_region = standard_rows + extra_rows logger.info( "partial box: standard=%d, box_lines=%d, extra=%d, content_overlap=%d", standard_rows, box_text_lines, extra_rows, len(overlap_content_rows), ) # Determine which columns the box occupies box_bb = box_zone.get("bbox_px", {}) box_x = box_bb.get("x", 0) box_w = box_bb.get("w", 0) # Map box to content columns: find which content columns overlap box_col_start = 0 box_col_end = num_cols content_cols_list = [] for z_col_idx in range(num_cols): # Find the column definition by checking all column entries # Simple heuristic: if box starts past halfway, it's the right columns pass # Simpler approach: box on right side → last N columns # box on left side → first N columns if box_info["side"] == "right": # Box starts at x=box_x. Find first content column that overlaps box_col_start = num_cols # default: beyond all columns for z in (box_zone.get("columns") or [{"index": 0}]): pass # Use content column positions to determine overlap content_cols_data = [ {"idx": c.get("index", i), "x_min": c.get("x_min_px", 0), "x_max": c.get("x_max_px", 0)} for i, c in enumerate(content_rows[0:0] or []) # placeholder ] # Simple: split columns at midpoint box_col_start = num_cols // 2 # right half box_col_end = num_cols else: box_col_start = 0 box_col_end = num_cols // 2 # Build rows for this region box_cells = box_zone.get("cells", []) box_rows = box_zone.get("rows", []) row_idx = start_row_idx # Expand box cell texts with \n into individual lines for row mapping box_lines: List[Tuple[str, Dict]] = [] # (text_line, parent_cell) for bc in sorted(box_cells, key=lambda c: c.get("row_index", 0)): text = bc.get("text", "") for line in text.split("\n"): box_lines.append((line.strip(), bc)) for i in range(total_rows_for_region): y = by_start + i * dominant_h unified_rows.append(_make_row(row_idx, y, dominant_h, img_h)) # Content cells for this row (from overlapping content rows) if i < len(overlap_content_rows): cr = overlap_content_rows[i] for cell in content_cells_by_row.get(cr.get("index", -1), []): # Only include cells from columns NOT covered by the box ci = cell.get("col_index", 0) if ci < box_col_start or ci >= box_col_end: unified_cells.append(_remap_cell(cell, row_idx, source_type="content")) # Box cell for this row if i < len(box_lines): line_text, parent_cell = box_lines[i] box_cell = { "cell_id": f"U_R{row_idx:02d}_C{box_col_start}", "row_index": row_idx, "col_index": box_col_start, "col_type": "spanning_header" if (box_col_end - box_col_start) > 1 else parent_cell.get("col_type", "column_1"), "colspan": box_col_end - box_col_start, "text": line_text, "confidence": parent_cell.get("confidence", 0), "bbox_px": parent_cell.get("bbox_px", {}), "bbox_pct": parent_cell.get("bbox_pct", {}), "word_boxes": [], "ocr_engine": parent_cell.get("ocr_engine", ""), "is_bold": parent_cell.get("is_bold", False), "source_zone_type": "box", "box_region": box_region, } unified_cells.append(box_cell) row_idx += 1 return row_idx