# breakpilot-lehrer/klausur-service/backend/unified_grid.py

"""
Unified Grid Builder — merges multi-zone grid into a single Excel-like grid.

Takes content zone + box zones and produces one unified zone where:
- All content rows use the dominant row height
- Full-width boxes are integrated directly (box rows replace standard rows)
- Partial-width boxes: extra rows inserted if box has more lines than standard
- Box-origin cells carry metadata (bg_color, border) for visual distinction

The result is a single-zone StructuredGrid that can be:
- Rendered in an Excel-like editor
- Exported to Excel/CSV
- Edited with unified row/column numbering
"""

import logging
import math
import statistics
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


def _compute_dominant_row_height(content_zone: Dict) -> float:
    """Median of content row-to-row spacings, excluding box-gap jumps."""
    rows = content_zone.get("rows", [])
    if len(rows) < 2:
        return 47.0

    spacings = []
    for i in range(len(rows) - 1):
        y1 = rows[i].get("y_min_px", rows[i].get("y_min", 0))
        y2 = rows[i + 1].get("y_min_px", rows[i + 1].get("y_min", 0))
        d = y2 - y1
        if 0 < d < 100:  # exclude box-gap jumps
            spacings.append(d)

    if not spacings:
        return 47.0
    spacings.sort()
    return spacings[len(spacings) // 2]


def _classify_boxes(
    box_zones: List[Dict],
    content_width: float,
) -> List[Dict]:
    """Classify each box as full_width or partial_width."""
    result = []
    for bz in box_zones:
        bb = bz.get("bbox_px", {})
        bw = bb.get("w", 0)
        bx = bb.get("x", 0)

        if bw >= content_width * 0.85:
            classification = "full_width"
            side = "center"
        else:
            classification = "partial_width"
            # Determine which side of the page the box is on
            page_center = content_width / 2
            box_center = bx + bw / 2
            side = "right" if box_center > page_center else "left"

        # Count total text lines in box (including \n within cells)
        total_lines = sum(
            (c.get("text", "").count("\n") + 1)
            for c in bz.get("cells", [])
        )

        result.append({
            "zone": bz,
            "classification": classification,
            "side": side,
            "y_start": bb.get("y", 0),
            "y_end": bb.get("y", 0) + bb.get("h", 0),
            "total_lines": total_lines,
            "bg_hex": bz.get("box_bg_hex", ""),
            "bg_color": bz.get("box_bg_color", ""),
        })
    return result


def build_unified_grid(
    zones: List[Dict],
    image_width: int,
    image_height: int,
    layout_metrics: Dict,
) -> Dict[str, Any]:
    """Merge a content zone and its box zones into one unified grid zone.

    The last zone with ``zone_type == "content"`` supplies the base rows,
    cells and columns; zones with ``zone_type == "box"`` are processed from
    top to bottom (ordered by ``bbox_px["y"]``). For each box, content rows
    whose top lies above the box are emitted first, then the box itself is
    emitted (full-width boxes via ``_add_full_width_box``, all others via
    ``_add_partial_width_box``), and finally every content row whose top is
    not below the box's bottom edge is skipped. Content rows left after the
    last box are appended at the end.

    Args:
        zones: Zone dicts of the multi-zone grid.
        image_width: Used as the content width when the content zone's
            ``bbox_px`` has no ``w``; also echoed in the result.
        image_height: Passed to the row builders for percentage coordinates;
            also echoed in the result.
        layout_metrics: Echoed unchanged in the result.

    Returns:
        A dict with a single ``"unified"`` zone plus image size, layout
        metrics, a summary, ``is_unified`` and ``dominant_row_h``. When no
        content zone exists, ``{"zones": zones}`` is returned instead.
    """
    content_zone = None
    box_zones = []
    for zone in zones:
        kind = zone.get("zone_type")
        if kind == "content":
            content_zone = zone
        elif kind == "box":
            box_zones.append(zone)

    if not content_zone:
        logger.warning("build_unified_grid: no content zone found")
        return {"zones": zones}  # nothing to merge into; hand the input back

    # Boxes are consumed top to bottom.
    box_zones.sort(key=lambda b: b.get("bbox_px", {}).get("y", 0))

    dominant_h = _compute_dominant_row_height(content_zone)
    content_bbox = content_zone.get("bbox_px", {})
    content_width = content_bbox.get("w", image_width)
    content_x = content_bbox.get("x", 0)
    content_cols = content_zone.get("columns", [])
    num_cols = len(content_cols)

    box_infos = _classify_boxes(box_zones, content_width)

    logger.info(
        "build_unified_grid: dominant_h=%.1f, %d content rows, %d boxes (%s)",
        dominant_h, len(content_zone.get("rows", [])), len(box_infos),
        [b["classification"] for b in box_infos],
    )

    content_rows = content_zone.get("rows", [])
    row_total = len(content_rows)

    # Group the content cells by the row index they were detected in.
    content_cells_by_row: Dict[int, List[Dict]] = {}
    for cell in content_zone.get("cells", []):
        content_cells_by_row.setdefault(cell.get("row_index", -1), []).append(cell)

    def row_top(row: Dict):
        return row.get("y_min_px", row.get("y_min", 0))

    unified_rows: List[Dict] = []
    unified_cells: List[Dict] = []
    next_unified_idx = 0   # index the next emitted row will receive
    cursor = 0             # first content row not yet emitted or skipped

    for info in box_infos:
        zone = info["zone"]
        box_top = info["y_start"]
        box_bottom = info["y_end"]

        # Emit the content rows that start above the box.
        while cursor < row_total:
            row = content_rows[cursor]
            if row_top(row) >= box_top:
                break
            _add_content_row(
                unified_rows, unified_cells, next_unified_idx,
                row, content_cells_by_row, dominant_h, image_height,
            )
            next_unified_idx += 1
            cursor += 1

        if info["classification"] == "full_width":
            # The box's own rows take the place of the content rows here.
            _add_full_width_box(
                unified_rows, unified_cells, next_unified_idx,
                zone, info, dominant_h, num_cols, image_height,
            )
            next_unified_idx += len(zone.get("rows", []))
        else:
            # The helper merges the box with the content rows beside it and
            # reports the next free unified row index.
            next_unified_idx = _add_partial_width_box(
                unified_rows, unified_cells, next_unified_idx,
                zone, info, content_rows, content_cells_by_row,
                cursor, dominant_h, num_cols, image_height,
                content_x, content_width,
            )

        # In both cases, move past every content row that does not start
        # below the box's bottom edge.
        while cursor < row_total:
            if row_top(content_rows[cursor]) > box_bottom:
                break
            cursor += 1

    # Content rows below the last box.
    while cursor < row_total:
        _add_content_row(
            unified_rows, unified_cells, next_unified_idx,
            content_rows[cursor], content_cells_by_row, dominant_h, image_height,
        )
        next_unified_idx += 1
        cursor += 1

    unified_zone = {
        "zone_index": 0,
        "zone_type": "unified",
        "bbox_px": content_bbox,
        "bbox_pct": content_zone.get("bbox_pct", {}),
        "border": None,
        "word_count": sum(len(c.get("word_boxes", [])) for c in unified_cells),
        "columns": content_cols,
        "rows": unified_rows,
        "cells": unified_cells,
        "header_rows": [],
    }

    logger.info(
        "build_unified_grid: %d unified rows, %d cells (from %d content + %d box zones)",
        len(unified_rows), len(unified_cells),
        len(content_rows), len(box_zones),
    )

    summary = {
        "total_zones": 1,
        "total_columns": num_cols,
        "total_rows": len(unified_rows),
        "total_cells": len(unified_cells),
    }
    return {
        "zones": [unified_zone],
        "image_width": image_width,
        "image_height": image_height,
        "layout_metrics": layout_metrics,
        "summary": summary,
        "is_unified": True,
        "dominant_row_h": dominant_h,
    }


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_row(idx: int, y: float, h: float, img_h: int, is_header: bool = False) -> Dict:
    return {
        "index": idx,
        "row_index": idx,
        "y_min_px": round(y),
        "y_max_px": round(y + h),
        "y_min_pct": round(y / img_h * 100, 2) if img_h else 0,
        "y_max_pct": round((y + h) / img_h * 100, 2) if img_h else 0,
        "is_header": is_header,
    }


def _remap_cell(cell: Dict, new_row: int, new_col: int = None,
                source_type: str = "content", box_region: Dict = None) -> Dict:
    """Create a new cell dict with remapped indices."""
    c = dict(cell)
    c["row_index"] = new_row
    if new_col is not None:
        c["col_index"] = new_col
    c["cell_id"] = f"U_R{new_row:02d}_C{c.get('col_index', 0)}"
    c["source_zone_type"] = source_type
    if box_region:
        c["box_region"] = box_region
    return c


def _add_content_row(
    unified_rows, unified_cells, row_idx,
    content_row, cells_by_row, dominant_h, img_h,
):
    """Append one content row and its cells to the unified grid.

    The row keeps its original top coordinate (``y_min_px``, else ``y_min``,
    else 0) and header flag but receives the uniform height *dominant_h* and
    the unified index *row_idx*. Cells are looked up in *cells_by_row* under
    the content row's ``index`` and re-addressed to *row_idx*. Both output
    lists are extended in place; nothing is returned.
    """
    top = content_row.get("y_min_px", content_row.get("y_min", 0))
    header_flag = content_row.get("is_header", False)
    unified_rows.append(_make_row(row_idx, top, dominant_h, img_h, header_flag))

    source_cells = cells_by_row.get(content_row.get("index", -1), [])
    unified_cells.extend(
        _remap_cell(cell, row_idx, source_type="content")
        for cell in source_cells
    )


def _add_full_width_box(
    unified_rows, unified_cells, start_row_idx,
    box_zone, box_info, dominant_h, num_cols, img_h,
):
    """Append the rows and cells of a full-width box to the unified grid.

    The box's vertical extent (``y_start`` to ``y_end`` in *box_info*) is
    divided into equal slices, one per entry of the box zone's ``rows``. The
    row at position ``p`` receives unified index ``start_row_idx + p``. Every
    box cell whose ``row_index`` equals that row's ``index`` (or ``p`` when
    the row has no ``index``) is copied over, marked as box-origin and given
    the box's background/border metadata. Both output lists are extended in
    place; nothing is returned. ``num_cols`` is not read.
    """
    rows = box_zone.get("rows", [])
    cells = box_zone.get("cells", [])
    styling = {
        "bg_hex": box_info["bg_hex"],
        "bg_color": box_info["bg_color"],
        "border": True,
    }

    # Share the box height equally between its rows.
    top = box_info["y_start"]
    height = box_info["y_end"] - top
    slice_h = height / len(rows) if rows else dominant_h

    for position, row in enumerate(rows):
        unified_idx = start_row_idx + position
        header_flag = row.get("is_header", False)
        unified_rows.append(
            _make_row(unified_idx, top + position * slice_h, slice_h, img_h, header_flag)
        )

        wanted = row.get("index", position)
        unified_cells.extend(
            _remap_cell(cell, unified_idx, source_type="box", box_region=styling)
            for cell in cells
            if cell.get("row_index") == wanted
        )


def _add_partial_width_box(
    unified_rows, unified_cells, start_row_idx,
    box_zone, box_info, content_rows, content_cells_by_row,
    content_row_ptr, dominant_h, num_cols, img_h,
    content_x, content_width,
) -> int:
    """Add a partial-width box merged with the content rows beside it.

    The region produces one unified row per ``dominant_h`` step starting at
    the box's top. The number of rows is the largest of:

    * how many standard rows fit into the box height (at least 1),
    * how many text lines the box holds (``box_info["total_lines"]``),
    * how many content rows start inside the box's vertical range.

    The last term guarantees that no overlapping content row is dropped:
    ``build_unified_grid`` advances its content pointer past exactly these
    rows after this call, so any row not emitted here would be lost.

    Column assignment is a coarse heuristic: a box on the ``"right"`` side
    covers the upper half of the column indices (``num_cols // 2`` up to
    ``num_cols``), any other box covers the lower half. Content cells inside
    the covered columns are omitted; the box contributes one spanning cell
    per text line, placed at the first covered column.

    Args:
        unified_rows: Output row list, extended in place.
        unified_cells: Output cell list, extended in place.
        start_row_idx: Unified index for the first row of this region.
        box_zone: The box zone dict; its ``cells`` are read.
        box_info: Classification dict for the box (``y_start``, ``y_end``,
            ``total_lines``, ``side``, ``bg_hex``, ``bg_color``).
        content_rows: All content rows.
        content_cells_by_row: Content cells keyed by content row index.
        content_row_ptr: Position in *content_rows* to start scanning from.
        dominant_h: Uniform row height used for the generated rows.
        num_cols: Number of content columns.
        img_h: Image height, forwarded to the row builder.
        content_x: Accepted for signature compatibility; not read.
        content_width: Accepted for signature compatibility; not read.

    Returns:
        The next free unified row index after this region.
    """
    by_start = box_info["y_start"]
    by_end = box_info["y_end"]
    box_h = by_end - by_start
    box_region = {"bg_hex": box_info["bg_hex"], "bg_color": box_info["bg_color"], "border": True}

    # Content rows whose top lies inside the box's vertical range.
    overlap_content_rows = []
    for cr in content_rows[content_row_ptr:]:
        cry = cr.get("y_min_px", cr.get("y_min", 0))
        if cry > by_end:
            break
        if cry >= by_start:
            overlap_content_rows.append(cr)

    # How many standard rows fit in the box height (guard a zero row height).
    standard_rows = max(1, math.floor(box_h / dominant_h)) if dominant_h > 0 else 1
    # How many text lines the box actually has
    box_text_lines = box_info["total_lines"]
    # Extra rows needed so that every box line gets its own row
    extra_rows = max(0, box_text_lines - standard_rows)
    # Also make room for every overlapping content row; otherwise the surplus
    # rows would vanish because the caller skips past them afterwards.
    total_rows_for_region = max(standard_rows + extra_rows, len(overlap_content_rows))

    logger.info(
        "partial box: standard=%d, box_lines=%d, extra=%d, content_overlap=%d",
        standard_rows, box_text_lines, extra_rows, len(overlap_content_rows),
    )

    # Split the columns at the midpoint: right-side boxes take the upper
    # half of the indices, left-side boxes the lower half.
    if box_info["side"] == "right":
        box_col_start, box_col_end = num_cols // 2, num_cols
    else:
        box_col_start, box_col_end = 0, num_cols // 2
    covered_cols = box_col_end - box_col_start
    # With fewer than two columns the covered range can be empty; a cell
    # still has to span at least one column.
    colspan = max(1, covered_cols)

    # Expand box cell texts with \n into individual lines for row mapping
    box_cells = box_zone.get("cells", [])
    box_lines: List[Tuple[str, Dict]] = [  # (text_line, parent_cell)
        (line.strip(), bc)
        for bc in sorted(box_cells, key=lambda c: c.get("row_index", 0))
        for line in bc.get("text", "").split("\n")
    ]

    for offset in range(total_rows_for_region):
        row_idx = start_row_idx + offset
        unified_rows.append(
            _make_row(row_idx, by_start + offset * dominant_h, dominant_h, img_h)
        )

        # Content cells for this row, limited to columns the box leaves free.
        if offset < len(overlap_content_rows):
            cr = overlap_content_rows[offset]
            for cell in content_cells_by_row.get(cr.get("index", -1), []):
                ci = cell.get("col_index", 0)
                if ci < box_col_start or ci >= box_col_end:
                    unified_cells.append(_remap_cell(cell, row_idx, source_type="content"))

        # One box line per row, as a cell spanning the covered columns.
        if offset < len(box_lines):
            line_text, parent_cell = box_lines[offset]
            unified_cells.append({
                "cell_id": f"U_R{row_idx:02d}_C{box_col_start}",
                "row_index": row_idx,
                "col_index": box_col_start,
                "col_type": "spanning_header" if covered_cols > 1 else parent_cell.get("col_type", "column_1"),
                "colspan": colspan,
                "text": line_text,
                "confidence": parent_cell.get("confidence", 0),
                "bbox_px": parent_cell.get("bbox_px", {}),
                "bbox_pct": parent_cell.get("bbox_pct", {}),
                "word_boxes": [],
                "ocr_engine": parent_cell.get("ocr_engine", ""),
                "is_bold": parent_cell.get("is_bold", False),
                "source_zone_type": "box",
                "box_region": box_region,
            })

    return start_row_idx + total_rows_for_region