# breakpilot-lehrer/klausur-service/backend/cv_box_layout.py

"""
Box layout classifier — detects internal layout type of embedded boxes.

Classifies each box as: flowing | columnar | bullet_list | header_only
and provides layout-appropriate grid building.

Used by the Box-Grid-Review step to rebuild box zones with correct structure.
"""

import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Bullet / list-item patterns at the start of a line.
# Every alternative is anchored with ^ and requires exactly one whitespace
# character after the marker, so a bare marker with nothing following it
# (e.g. the string "1." on its own) does not match.
_BULLET_RE = re.compile(
    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
    r'|^\d{1,2}[.)]\s'     # numbered (1-2 digits): "1) " or "1. "
    r'|^[a-z][.)]\s'       # lettered (lowercase only): "a) " or "a. "
)


def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Classify the internal layout of a detected box.

    The checks run in a fixed order and the first one that applies wins:

    1. ``header_only`` - no words, at most 5 words, or at most one text line.
    2. ``bullet_list`` - at least 2 lines and at least 40% of all lines start
       with a bullet / numbering marker.
    3. ``columnar``    - at least 3 lines and ``_has_column_structure`` reports
       a column gap.
    4. ``flowing``     - everything else.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels
        box_h: Box height in pixels (accepted for interface symmetry; the
            classification itself does not read it)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    # Group words into lines by y-proximity
    lines = _group_into_lines(words)

    # Header only: very few words or single line
    total_words = sum(len(line) for line in lines)
    if total_words <= 5 or len(lines) <= 1:
        return "header_only"

    # Bullet list: count the lines that open with a list marker.
    bullet_count = 0
    for line in lines:
        if not line:
            continue
        first_text = line[0].get("text") or ""
        # _BULLET_RE requires whitespace *after* the marker.  A token-level
        # OCR word such as "1." or "a)" carries no whitespace of its own, so
        # the pattern has to see the whole line ("1. Foo"), not just the
        # first word.  The joined text starts with the first word verbatim,
        # so everything that matched on the first word alone still matches.
        line_text = " ".join((w.get("text") or "") for w in line)
        if _BULLET_RE.match(line_text):
            bullet_count += 1
        # Also check if first word IS a bullet char (covers a marker that is
        # the only word on its line, and "▸" which the regex does not list)
        elif first_text.strip() in ("-", "–", "—", "•", "·", "▪", "▸"):
            bullet_count += 1
    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
        return "bullet_list"

    # Columnar: check for multiple distinct x-clusters
    if len(lines) >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    # Default: flowing text
    return "flowing"


def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
    """Cluster OCR words into text lines using their ``top`` coordinate.

    Words are visited in (top, left) order.  A word joins the current line
    when its ``top`` lies within the tolerance of the *first* word of that
    line (the anchor is not updated as the line grows); otherwise it opens a
    new line.  The tolerance is half the median of all positive word heights,
    but never less than 5; a median of 20 is assumed when no word reports a
    positive height.

    Returns:
        Lines from top to bottom, each line sorted by ``left``.
    """
    if not words:
        return []

    ordered = sorted(words, key=lambda w: (w["top"], w["left"]))

    positive_heights = [w["height"] for w in ordered if w.get("height", 0) > 0]
    typical_height = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_height * 0.5, 5)

    head, *tail = ordered
    buckets: List[List[Dict]] = [[head]]
    anchor_top = head["top"]

    for word in tail:
        if abs(word["top"] - anchor_top) <= tolerance:
            buckets[-1].append(word)
            continue
        # Too far from the anchor: this word starts (and anchors) a new line.
        buckets.append([word])
        anchor_top = word["top"]

    return [sorted(bucket, key=lambda w: w["left"]) for bucket in buckets]


def _has_column_structure(words: List[Dict], box_w: int) -> bool:
    """Check if words have multiple distinct left-edge clusters (columns).

    The left edges of every word except the first one of each line are
    sorted; the box counts as columnar when any two neighbouring edges are
    more than 15% of ``box_w`` apart.

    Args:
        words: OCR word dicts (``top`` and ``left`` are required).
        box_w: Box width in pixels; a non-positive width yields ``False``.

    Returns:
        ``True`` if at least one column-sized gap exists.  ``False`` when
        there are fewer than 3 lines or fewer than 4 candidate edges.
    """
    if box_w <= 0:
        return False

    lines = _group_into_lines(words)
    if len(lines) < 3:
        return False

    # Collect left-edges of non-first words in each line
    # (first word of each line often aligns regardless of columns)
    # NOTE(review): with exactly two single-word columns the remaining edges
    # all sit in one cluster, so no gap is found -- confirm this is intended.
    left_edges = sorted(w["left"] for line in lines for w in line[1:])
    if len(left_edges) < 4:
        return False

    # A column gap is typically > 15% of box width
    column_gap_threshold = box_w * 0.15
    return any(
        right - left > column_gap_threshold
        for left, right in zip(left_edges, left_edges[1:])
    )


def _pct(value: float, total: float) -> float:
    """Return ``value`` as a percentage of ``total`` (2 decimals); 0 if ``total`` is falsy."""
    return round(value / total * 100, 2) if total else 0


def _single_text_column(box_x: int, box_w: int, img_w: int) -> Dict[str, Any]:
    """Build the one full-width text column shared by all non-columnar layouts."""
    return {
        "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
        # x_min_px/x_max_px are needed for GridTable width calculation
        "x_min_px": box_x, "x_max_px": box_x + box_w,
        "x_min_pct": _pct(box_x, img_w),
        "x_max_pct": _pct(box_x + box_w, img_w),
        "bold": False,
    }


def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If layout_type is None (or empty), auto-detects it via
    ``classify_box_layout``.
    For 'flowing' and 'bullet_list', forces single-column layout with one
    row/cell per text line.
    For 'header_only', creates a single cell spanning the whole box.
    Any other value (normally 'columnar') is delegated to the standard
    multi-column builder ``grid_editor_helpers._build_zone_grid``.

    Args:
        zone_words: OCR word dicts inside the box (top, left, height, text).
        box_x, box_y, box_w, box_h: Box geometry in pixels.
        zone_index: Zone number, used in cell ids (``Z{zone}_R{row}C{col}``).
        img_w, img_h: Page size in pixels for the ``*_pct`` fields; a falsy
            size makes the corresponding percentages 0.
        layout_type: Optional layout override.

    Returns:
        Dict with ``columns``, ``rows``, ``cells``, ``header_rows`` plus
        ``box_layout_type`` and ``box_grid_reviewed`` (always ``False`` here).
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated in reading order
        reading_order = sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
        return {
            "columns": [_single_text_column(box_x, box_w, img_w)],
            "rows": [{
                "index": 0, "row_index": 0,
                "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
                "y_min_px": box_y, "y_max_px": box_y + box_h,
                "y_min_pct": _pct(box_y, img_h),
                "y_max_pct": _pct(box_y + box_h, img_h),
                "is_header": True,
            }],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column -- each line becomes one row with one cell
        lines = _group_into_lines(zone_words)

        # Header heuristic for the first line: short text, or all-caps, or
        # ends with ':'.  Decided before the rows are built so that the row's
        # own is_header flag agrees with header_rows.
        header_rows: List[int] = []
        if len(lines) >= 2:
            first_text = " ".join(w.get("text", "") for w in lines[0]).strip()
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.endswith(':')):
                header_rows = [0]

        rows = []
        cells = []
        for row_idx, line_words in enumerate(lines):
            if not line_words:
                continue
            y_min = min(w["top"] for w in line_words)
            # A word without a height contributes only its top edge instead
            # of raising (line grouping treats height as optional too).
            y_max = max(w["top"] + w.get("height", 0) for w in line_words)

            rows.append({
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": (y_min + y_max) / 2,
                "y_min_px": y_min,
                "y_max_px": y_max,
                "y_min_pct": _pct(y_min, img_h),
                "y_max_pct": _pct(y_max, img_h),
                "is_header": row_idx in header_rows,
            })
            cells.append({
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": " ".join(w.get("text", "") for w in line_words).strip(),
                "word_boxes": line_words,
            })

        return {
            "columns": [_single_text_column(box_x, box_w, img_w)],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use standard grid builder with independent column detection.
    # Imported here because only this path needs the helper module.
    from grid_editor_helpers import _build_zone_grid

    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )

    # Colspan detection is now handled generically by _detect_colspan_cells
    # in grid_editor_helpers.py (called inside _build_zone_grid).

    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result