klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
390 lines
13 KiB
Python
"""
|
|
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
|
|
|
Split from grid_editor_helpers.py for maintainability.
|
|
All functions are pure computation — no HTTP, DB, or session side effects.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from cv_vocab_types import PageZone
|
|
from cv_words_first import _cluster_rows, _build_cells
|
|
|
|
from grid_editor_columns import (
|
|
_cluster_columns_by_alignment,
|
|
_merge_inline_marker_columns,
|
|
_split_cross_column_words,
|
|
)
|
|
from grid_editor_headers import (
|
|
_detect_header_rows,
|
|
_detect_colspan_cells,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Vertical divider detection and zone splitting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
|
|
|
|
|
def _detect_vertical_dividers(
|
|
words: List[Dict],
|
|
zone_x: int,
|
|
zone_w: int,
|
|
zone_y: int,
|
|
zone_h: int,
|
|
) -> List[float]:
|
|
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
|
|
|
Returns list of divider x-positions (empty if no dividers found).
|
|
"""
|
|
if not words or zone_w <= 0 or zone_h <= 0:
|
|
return []
|
|
|
|
# Collect pipe word_boxes
|
|
pipes = [
|
|
w for w in words
|
|
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
|
]
|
|
if len(pipes) < 5:
|
|
return []
|
|
|
|
# Cluster pipe x-centers by proximity
|
|
tolerance = max(15, int(zone_w * 0.02))
|
|
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
|
|
|
clusters: List[List[float]] = [[pipe_xs[0]]]
|
|
for x in pipe_xs[1:]:
|
|
if x - clusters[-1][-1] <= tolerance:
|
|
clusters[-1].append(x)
|
|
else:
|
|
clusters.append([x])
|
|
|
|
dividers: List[float] = []
|
|
for cluster in clusters:
|
|
if len(cluster) < 5:
|
|
continue
|
|
mean_x = sum(cluster) / len(cluster)
|
|
# Must be between 15% and 85% of zone width
|
|
rel_pos = (mean_x - zone_x) / zone_w
|
|
if rel_pos < 0.15 or rel_pos > 0.85:
|
|
continue
|
|
# Check vertical coverage: pipes must span >= 50% of zone height
|
|
cluster_pipes = [
|
|
w for w in pipes
|
|
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
|
]
|
|
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
|
y_span = max(ys) - min(ys) if ys else 0
|
|
if y_span < zone_h * 0.5:
|
|
continue
|
|
dividers.append(mean_x)
|
|
|
|
return sorted(dividers)
|
|
|
|
|
|
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Split a PageZone at vertical divider positions into sub-zones."""
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    n_parts = len(edges) - 1
    last = n_parts - 1

    def _hint(pos: int) -> str:
        # Leftmost segment wins over rightmost when there is only one part.
        if pos == 0:
            return "left_of_vsplit"
        if pos == last:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    parts: List["PageZone"] = []
    for pos in range(n_parts):
        left_edge = int(edges[pos])
        right_edge = int(edges[pos + 1])
        parts.append(PageZone(
            index=0,  # re-indexed later
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=left_edge,
            width=right_edge - left_edge,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=_hint(pos),
            vsplit_group=vsplit_group_id,
        ))
    return parts
|
|
|
|
|
|
def _merge_content_zones_across_boxes(
|
|
zones: List,
|
|
content_x: int,
|
|
content_w: int,
|
|
) -> List:
|
|
"""Merge content zones separated by box zones into single zones.
|
|
|
|
Box zones become image_overlays on the merged content zone.
|
|
Pattern: [content, box*, content] -> [merged_content with overlay]
|
|
Box zones NOT between two content zones stay as standalone zones.
|
|
"""
|
|
if len(zones) < 3:
|
|
return zones
|
|
|
|
# Group consecutive runs of [content, box+, content]
|
|
result: List = []
|
|
i = 0
|
|
while i < len(zones):
|
|
z = zones[i]
|
|
if z.zone_type != "content":
|
|
result.append(z)
|
|
i += 1
|
|
continue
|
|
|
|
# Start of a potential merge group: content zone
|
|
group_contents = [z]
|
|
group_boxes = []
|
|
j = i + 1
|
|
# Absorb [box, content] pairs -- only absorb a box if it's
|
|
# confirmed to be followed by another content zone.
|
|
while j < len(zones):
|
|
if (zones[j].zone_type == "box"
|
|
and j + 1 < len(zones)
|
|
and zones[j + 1].zone_type == "content"):
|
|
group_boxes.append(zones[j])
|
|
group_contents.append(zones[j + 1])
|
|
j += 2
|
|
else:
|
|
break
|
|
|
|
if len(group_contents) >= 2 and group_boxes:
|
|
# Merge: create one large content zone spanning all
|
|
y_min = min(c.y for c in group_contents)
|
|
y_max = max(c.y + c.height for c in group_contents)
|
|
overlays = []
|
|
for bz in group_boxes:
|
|
overlay = {
|
|
"y": bz.y,
|
|
"height": bz.height,
|
|
"x": bz.x,
|
|
"width": bz.width,
|
|
}
|
|
if bz.box:
|
|
overlay["box"] = {
|
|
"x": bz.box.x,
|
|
"y": bz.box.y,
|
|
"width": bz.box.width,
|
|
"height": bz.box.height,
|
|
"confidence": bz.box.confidence,
|
|
"border_thickness": bz.box.border_thickness,
|
|
}
|
|
overlays.append(overlay)
|
|
|
|
merged = PageZone(
|
|
index=0, # re-indexed below
|
|
zone_type="content",
|
|
y=y_min,
|
|
height=y_max - y_min,
|
|
x=content_x,
|
|
width=content_w,
|
|
image_overlays=overlays,
|
|
)
|
|
result.append(merged)
|
|
i = j
|
|
else:
|
|
# No merge possible -- emit just the content zone
|
|
result.append(z)
|
|
i += 1
|
|
|
|
# Re-index zones
|
|
for idx, z in enumerate(result):
|
|
z.index = idx
|
|
|
|
logger.info(
|
|
"zone-merge: %d zones -> %d zones after merging across boxes",
|
|
len(zones), len(result),
|
|
)
|
|
return result
|
|
|
|
|
|
def _build_zone_grid(
|
|
zone_words: List[Dict],
|
|
zone_x: int,
|
|
zone_y: int,
|
|
zone_w: int,
|
|
zone_h: int,
|
|
zone_index: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
global_columns: Optional[List[Dict]] = None,
|
|
skip_first_row_header: bool = False,
|
|
) -> Dict[str, Any]:
|
|
"""Build columns, rows, cells for a single zone from its words.
|
|
|
|
Args:
|
|
global_columns: If provided, use these pre-computed column boundaries
|
|
instead of detecting columns per zone. Used for content zones so
|
|
that all content zones (above/between/below boxes) share the same
|
|
column structure. Box zones always detect columns independently.
|
|
"""
|
|
if not zone_words:
|
|
return {
|
|
"columns": [],
|
|
"rows": [],
|
|
"cells": [],
|
|
"header_rows": [],
|
|
}
|
|
|
|
# Cluster rows first (needed for column alignment analysis)
|
|
rows = _cluster_rows(zone_words)
|
|
|
|
# Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
|
|
if len(zone_words) <= 60:
|
|
import statistics as _st
|
|
_heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
|
|
_med_h = _st.median(_heights) if _heights else 20
|
|
_y_tol = max(_med_h * 0.5, 5)
|
|
logger.info(
|
|
"zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
|
|
zone_index, len(zone_words), _med_h, _y_tol, len(rows),
|
|
)
|
|
for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
|
|
logger.info(
|
|
" zone %d word: y=%d x=%d h=%d w=%d '%s'",
|
|
zone_index, w['top'], w['left'], w['height'], w['width'],
|
|
w.get('text', '')[:40],
|
|
)
|
|
for r in rows:
|
|
logger.info(
|
|
" zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
|
|
zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
|
|
)
|
|
|
|
# Use global columns if provided, otherwise detect per zone
|
|
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
|
|
|
# Merge inline marker columns (bullets, numbering) into adjacent text
|
|
if not global_columns:
|
|
columns = _merge_inline_marker_columns(columns, zone_words)
|
|
|
|
if not columns or not rows:
|
|
return {
|
|
"columns": [],
|
|
"rows": [],
|
|
"cells": [],
|
|
"header_rows": [],
|
|
}
|
|
|
|
# Split word boxes that straddle column boundaries (e.g. "sichzie"
|
|
# spanning Col 1 + Col 2). Must happen after column detection and
|
|
# before cell assignment.
|
|
# Keep original words for colspan detection (split destroys span info).
|
|
original_zone_words = zone_words
|
|
if len(columns) >= 2:
|
|
zone_words = _split_cross_column_words(zone_words, columns)
|
|
|
|
# Build cells
|
|
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
|
|
|
# --- Detect colspan (merged cells spanning multiple columns) ---
|
|
# Uses the ORIGINAL (pre-split) words to detect word-blocks that span
|
|
# multiple columns. _split_cross_column_words would have destroyed
|
|
# this information by cutting words at column boundaries.
|
|
if len(columns) >= 2:
|
|
cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)
|
|
|
|
# Prefix cell IDs with zone index
|
|
for cell in cells:
|
|
cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
|
|
cell["zone_index"] = zone_index
|
|
|
|
# Detect header rows (pass columns for spanning header detection)
|
|
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
|
|
skip_first_row_header=skip_first_row_header)
|
|
|
|
# Merge cells in spanning header rows into a single col-0 cell
|
|
if header_rows and len(columns) >= 2:
|
|
for hri in header_rows:
|
|
header_cells = [c for c in cells if c["row_index"] == hri]
|
|
if len(header_cells) <= 1:
|
|
continue
|
|
# Collect all word_boxes and text from all columns
|
|
all_wb = []
|
|
all_text_parts = []
|
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
all_wb.extend(hc.get("word_boxes", []))
|
|
if hc.get("text", "").strip():
|
|
all_text_parts.append(hc["text"].strip())
|
|
# Remove all header cells, replace with one spanning cell
|
|
cells = [c for c in cells if c["row_index"] != hri]
|
|
if all_wb:
|
|
x_min = min(wb["left"] for wb in all_wb)
|
|
y_min = min(wb["top"] for wb in all_wb)
|
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
cells.append({
|
|
"cell_id": f"R{hri:02d}_C0",
|
|
"row_index": hri,
|
|
"col_index": 0,
|
|
"col_type": "spanning_header",
|
|
"text": " ".join(all_text_parts),
|
|
"confidence": 0.0,
|
|
"bbox_px": {"x": x_min, "y": y_min,
|
|
"w": x_max - x_min, "h": y_max - y_min},
|
|
"bbox_pct": {
|
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"word_boxes": all_wb,
|
|
"ocr_engine": "words_first",
|
|
"is_bold": True,
|
|
})
|
|
|
|
# Convert columns to output format with percentages
|
|
out_columns = []
|
|
for col in columns:
|
|
x_min = col["x_min"]
|
|
x_max = col["x_max"]
|
|
out_columns.append({
|
|
"index": col["index"],
|
|
"label": col["type"],
|
|
"x_min_px": round(x_min),
|
|
"x_max_px": round(x_max),
|
|
"x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
|
|
"bold": False,
|
|
})
|
|
|
|
# Convert rows to output format with percentages
|
|
out_rows = []
|
|
for row in rows:
|
|
out_rows.append({
|
|
"index": row["index"],
|
|
"y_min_px": round(row["y_min"]),
|
|
"y_max_px": round(row["y_max"]),
|
|
"y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
|
|
"y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
|
|
"is_header": row["index"] in header_rows,
|
|
})
|
|
|
|
return {
|
|
"columns": out_columns,
|
|
"rows": out_rows,
|
|
"cells": cells,
|
|
"header_rows": header_rows,
|
|
"_raw_columns": columns, # internal: for propagation to other zones
|
|
}
|