"""
|
|
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
|
|
detection and zone-aware grid building.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
from cv_box_detect import detect_boxes, split_page_into_zones
|
|
from cv_graphic_detect import detect_graphic_elements
|
|
from cv_color_detect import recover_colored_text
|
|
from cv_vocab_types import PageZone
|
|
from ocr_pipeline_session_store import get_session_image
|
|
|
|
from grid_editor_helpers import (
|
|
_filter_border_strip_words,
|
|
_filter_border_ghosts,
|
|
_words_in_zone,
|
|
_PIPE_RE_VSPLIT,
|
|
_detect_vertical_dividers,
|
|
_split_zone_at_vertical_dividers,
|
|
_merge_content_zones_across_boxes,
|
|
_build_zone_grid,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def _build_zones(
|
|
session_id: str,
|
|
session: dict,
|
|
all_words: List[Dict[str, Any]],
|
|
graphic_rects: List[Dict[str, int]],
|
|
content_x: int,
|
|
content_y: int,
|
|
content_w: int,
|
|
content_h: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
) -> Dict[str, Any]:
|
|
"""Load image, detect graphics/boxes, build zone-aware grids.
|
|
|
|
Returns a dict with keys:
|
|
zones_data, boxes_detected, recovered_count, border_prefiltered,
|
|
img_bgr, all_words (modified in-place but returned for clarity).
|
|
"""
|
|
zones_data: List[Dict[str, Any]] = []
|
|
boxes_detected = 0
|
|
recovered_count = 0
|
|
border_prefiltered = False
|
|
img_bgr = None
|
|
|
|
# 3. Load image for box detection
|
|
img_png = await get_session_image(session_id, "cropped")
|
|
if not img_png:
|
|
img_png = await get_session_image(session_id, "dewarped")
|
|
if not img_png:
|
|
img_png = await get_session_image(session_id, "original")
|
|
|
|
if img_png:
|
|
# Decode image for color detection + box detection
|
|
arr = np.frombuffer(img_png, dtype=np.uint8)
|
|
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
|
|
|
if img_bgr is not None:
|
|
# --- 3a. Detect graphic/image regions via CV and hard-filter ---
|
|
sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
|
|
fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
|
|
if fresh_graphics:
|
|
fresh_rects = [
|
|
{"x": g.x, "y": g.y, "w": g.width, "h": g.height}
|
|
for g in fresh_graphics
|
|
]
|
|
graphic_rects.extend(fresh_rects)
|
|
logger.info(
|
|
"build-grid session %s: detected %d graphic region(s) via CV",
|
|
session_id, len(fresh_graphics),
|
|
)
|
|
# Hard-filter words inside newly detected graphic regions
|
|
before = len(all_words)
|
|
all_words[:] = [
|
|
w for w in all_words
|
|
if not any(
|
|
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
|
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
|
for gr in fresh_rects
|
|
)
|
|
]
|
|
removed = before - len(all_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
|
|
session_id, removed, len(fresh_rects),
|
|
)
|
|
|
|
# --- Recover colored text that OCR missed (before grid building) ---
|
|
recovered = recover_colored_text(img_bgr, all_words)
|
|
if recovered and graphic_rects:
|
|
# Filter recovered chars inside graphic regions
|
|
recovered = [
|
|
r for r in recovered
|
|
if not any(
|
|
gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
|
and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
|
for gr in graphic_rects
|
|
)
|
|
]
|
|
if recovered:
|
|
recovered_count = len(recovered)
|
|
all_words.extend(recovered)
|
|
logger.info(
|
|
"build-grid session %s: +%d recovered colored words",
|
|
session_id, recovered_count,
|
|
)
|
|
|
|
# Detect bordered boxes
|
|
boxes = detect_boxes(
|
|
img_bgr,
|
|
content_x=content_x,
|
|
content_w=content_w,
|
|
content_y=content_y,
|
|
content_h=content_h,
|
|
)
|
|
boxes_detected = len(boxes)
|
|
|
|
if boxes:
|
|
# Filter border ghost words before grid building
|
|
all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
|
|
if ghost_count:
|
|
all_words[:] = all_words_new
|
|
logger.info(
|
|
"build-grid session %s: removed %d border ghost words",
|
|
session_id, ghost_count,
|
|
)
|
|
|
|
# Split page into zones
|
|
page_zones = split_page_into_zones(
|
|
content_x, content_y, content_w, content_h, boxes
|
|
)
|
|
|
|
# Merge content zones separated by box zones
|
|
page_zones = _merge_content_zones_across_boxes(
|
|
page_zones, content_x, content_w
|
|
)
|
|
|
|
# 3b. Detect vertical dividers and split content zones
|
|
page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
|
|
page_zones, all_words
|
|
)
|
|
|
|
# --- First pass: build grids per zone independently ---
|
|
zone_grids = _build_grids_per_zone(
|
|
page_zones, all_words, img_w, img_h
|
|
)
|
|
border_prefiltered = border_prefiltered or any(
|
|
zg.get("_border_prefiltered") for zg in zone_grids
|
|
)
|
|
|
|
# --- Second pass: merge column boundaries from all content zones ---
|
|
_merge_content_zone_columns(
|
|
zone_grids, all_words, content_w, img_w, img_h, session_id
|
|
)
|
|
|
|
# --- Build zones_data from zone_grids ---
|
|
for zg in zone_grids:
|
|
pz = zg["pz"]
|
|
grid = zg["grid"]
|
|
grid.pop("_raw_columns", None)
|
|
|
|
zone_entry: Dict[str, Any] = {
|
|
"zone_index": pz.index,
|
|
"zone_type": pz.zone_type,
|
|
"bbox_px": {
|
|
"x": pz.x, "y": pz.y,
|
|
"w": pz.width, "h": pz.height,
|
|
},
|
|
"bbox_pct": {
|
|
"x": round(pz.x / img_w * 100, 2) if img_w else 0,
|
|
"y": round(pz.y / img_h * 100, 2) if img_h else 0,
|
|
"w": round(pz.width / img_w * 100, 2) if img_w else 0,
|
|
"h": round(pz.height / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"border": None,
|
|
"word_count": len(zg["words"]),
|
|
**grid,
|
|
}
|
|
|
|
if pz.box:
|
|
zone_entry["border"] = {
|
|
"thickness": pz.box.border_thickness,
|
|
"confidence": pz.box.confidence,
|
|
}
|
|
|
|
if pz.image_overlays:
|
|
zone_entry["image_overlays"] = pz.image_overlays
|
|
|
|
if pz.layout_hint:
|
|
zone_entry["layout_hint"] = pz.layout_hint
|
|
if pz.vsplit_group is not None:
|
|
zone_entry["vsplit_group"] = pz.vsplit_group
|
|
|
|
zones_data.append(zone_entry)
|
|
|
|
# 4. Fallback: no boxes detected -> single zone with all words
|
|
if not zones_data:
|
|
before = len(all_words)
|
|
filtered_words = [
|
|
w for w in all_words
|
|
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
|
|
]
|
|
removed = before - len(filtered_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
|
|
session_id, removed,
|
|
)
|
|
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
|
|
if bs_removed:
|
|
border_prefiltered = True
|
|
logger.info(
|
|
"build-grid session %s: pre-filtered %d border-strip words",
|
|
session_id, bs_removed,
|
|
)
|
|
grid = _build_zone_grid(
|
|
filtered_words, content_x, content_y, content_w, content_h,
|
|
0, img_w, img_h,
|
|
)
|
|
grid.pop("_raw_columns", None)
|
|
zones_data.append({
|
|
"zone_index": 0,
|
|
"zone_type": "content",
|
|
"bbox_px": {
|
|
"x": content_x, "y": content_y,
|
|
"w": content_w, "h": content_h,
|
|
},
|
|
"bbox_pct": {
|
|
"x": round(content_x / img_w * 100, 2) if img_w else 0,
|
|
"y": round(content_y / img_h * 100, 2) if img_h else 0,
|
|
"w": round(content_w / img_w * 100, 2) if img_w else 0,
|
|
"h": round(content_h / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"border": None,
|
|
"word_count": len(all_words),
|
|
**grid,
|
|
})
|
|
|
|
return {
|
|
"zones_data": zones_data,
|
|
"boxes_detected": boxes_detected,
|
|
"recovered_count": recovered_count,
|
|
"border_prefiltered": border_prefiltered,
|
|
"img_bgr": img_bgr,
|
|
}
|
|
|
|
|
|
def _detect_and_split_vertical_dividers(
|
|
page_zones: List[PageZone],
|
|
all_words: List[Dict[str, Any]],
|
|
) -> tuple:
|
|
"""Detect vertical dividers and split content zones.
|
|
|
|
Returns (expanded_zones, border_prefiltered_from_vsplit).
|
|
"""
|
|
vsplit_group_counter = 0
|
|
expanded_zones: List = []
|
|
for pz in page_zones:
|
|
if pz.zone_type != "content":
|
|
expanded_zones.append(pz)
|
|
continue
|
|
zone_words = _words_in_zone(
|
|
all_words, pz.y, pz.height, pz.x, pz.width
|
|
)
|
|
divider_xs = _detect_vertical_dividers(
|
|
zone_words, pz.x, pz.width, pz.y, pz.height
|
|
)
|
|
if divider_xs:
|
|
sub_zones = _split_zone_at_vertical_dividers(
|
|
pz, divider_xs, vsplit_group_counter
|
|
)
|
|
expanded_zones.extend(sub_zones)
|
|
vsplit_group_counter += 1
|
|
# Remove pipe words so they don't appear in sub-zones
|
|
pipe_ids = set(
|
|
id(w) for w in zone_words
|
|
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
|
)
|
|
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
|
|
logger.info(
|
|
"build-grid: vertical split zone %d at x=%s -> %d sub-zones",
|
|
pz.index, [int(x) for x in divider_xs], len(sub_zones),
|
|
)
|
|
else:
|
|
expanded_zones.append(pz)
|
|
# Re-index zones
|
|
for i, pz in enumerate(expanded_zones):
|
|
pz.index = i
|
|
return expanded_zones, False
|
|
|
|
|
|
def _build_grids_per_zone(
|
|
page_zones: List[PageZone],
|
|
all_words: List[Dict[str, Any]],
|
|
img_w: int,
|
|
img_h: int,
|
|
) -> List[Dict[str, Any]]:
|
|
"""Build grids for each zone independently (first pass)."""
|
|
zone_grids: List[Dict] = []
|
|
|
|
for pz in page_zones:
|
|
zone_words = _words_in_zone(
|
|
all_words, pz.y, pz.height, pz.x, pz.width
|
|
)
|
|
if pz.zone_type == "content":
|
|
logger.info(
|
|
"build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
|
|
pz.index, pz.zone_type,
|
|
pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
|
|
len(zone_words), len(all_words),
|
|
)
|
|
# Filter recovered single-char artifacts in ALL zones
|
|
before = len(zone_words)
|
|
zone_words = [
|
|
w for w in zone_words
|
|
if not (
|
|
w.get("recovered")
|
|
and len(w.get("text", "").strip()) <= 2
|
|
)
|
|
]
|
|
removed = before - len(zone_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
|
removed, pz.zone_type, pz.index,
|
|
)
|
|
# Filter words inside image overlay regions (merged box zones)
|
|
if pz.image_overlays:
|
|
before_ov = len(zone_words)
|
|
zone_words = [
|
|
w for w in zone_words
|
|
if not any(
|
|
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
|
|
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
|
|
for ov in pz.image_overlays
|
|
)
|
|
]
|
|
ov_removed = before_ov - len(zone_words)
|
|
if ov_removed:
|
|
logger.info(
|
|
"build-grid: filtered %d words inside image overlays from zone %d",
|
|
ov_removed, pz.index,
|
|
)
|
|
zone_words, bs_removed = _filter_border_strip_words(zone_words)
|
|
bp = False
|
|
if bs_removed:
|
|
bp = True
|
|
logger.info(
|
|
"build-grid: pre-filtered %d border-strip words from zone %d",
|
|
bs_removed, pz.index,
|
|
)
|
|
grid = _build_zone_grid(
|
|
zone_words, pz.x, pz.y, pz.width, pz.height,
|
|
pz.index, img_w, img_h,
|
|
skip_first_row_header=bool(pz.image_overlays),
|
|
)
|
|
zone_grids.append({
|
|
"pz": pz, "words": zone_words, "grid": grid,
|
|
"_border_prefiltered": bp,
|
|
})
|
|
|
|
return zone_grids
|
|
|
|
|
|
def _merge_content_zone_columns(
|
|
zone_grids: List[Dict[str, Any]],
|
|
all_words: List[Dict[str, Any]],
|
|
content_w: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
session_id: str,
|
|
) -> None:
|
|
"""Second pass: merge column boundaries from all content zones.
|
|
|
|
Modifies zone_grids in place.
|
|
"""
|
|
content_zones = [
|
|
zg for zg in zone_grids
|
|
if zg["pz"].zone_type == "content"
|
|
and zg["pz"].vsplit_group is None
|
|
]
|
|
if len(content_zones) <= 1:
|
|
return
|
|
|
|
# Collect column split points (x_min of non-first columns)
|
|
all_split_xs: List[float] = []
|
|
for zg in content_zones:
|
|
raw_cols = zg["grid"].get("_raw_columns", [])
|
|
for col in raw_cols[1:]:
|
|
all_split_xs.append(col["x_min"])
|
|
|
|
if not all_split_xs:
|
|
return
|
|
|
|
all_split_xs.sort()
|
|
merge_distance = max(25, int(content_w * 0.03))
|
|
merged_xs = [all_split_xs[0]]
|
|
for x in all_split_xs[1:]:
|
|
if x - merged_xs[-1] < merge_distance:
|
|
merged_xs[-1] = (merged_xs[-1] + x) / 2
|
|
else:
|
|
merged_xs.append(x)
|
|
|
|
total_cols = len(merged_xs) + 1
|
|
max_zone_cols = max(
|
|
len(zg["grid"].get("_raw_columns", []))
|
|
for zg in content_zones
|
|
)
|
|
|
|
if total_cols < max_zone_cols:
|
|
return
|
|
|
|
cx_min = min(w["left"] for w in all_words)
|
|
cx_max = max(w["left"] + w["width"] for w in all_words)
|
|
merged_columns: List[Dict[str, Any]] = []
|
|
prev_x = cx_min
|
|
for i, sx in enumerate(merged_xs):
|
|
merged_columns.append({
|
|
"index": i,
|
|
"type": f"column_{i + 1}",
|
|
"x_min": prev_x,
|
|
"x_max": sx,
|
|
})
|
|
prev_x = sx
|
|
merged_columns.append({
|
|
"index": len(merged_xs),
|
|
"type": f"column_{len(merged_xs) + 1}",
|
|
"x_min": prev_x,
|
|
"x_max": cx_max,
|
|
})
|
|
|
|
# Re-build ALL content zones with merged columns
|
|
for zg in zone_grids:
|
|
pz = zg["pz"]
|
|
if pz.zone_type == "content":
|
|
grid = _build_zone_grid(
|
|
zg["words"], pz.x, pz.y,
|
|
pz.width, pz.height,
|
|
pz.index, img_w, img_h,
|
|
global_columns=merged_columns,
|
|
skip_first_row_header=bool(pz.image_overlays),
|
|
)
|
|
zg["grid"] = grid
|
|
logger.info(
|
|
"build-grid session %s: union of %d content "
|
|
"zones -> %d merged columns (max single zone: %d)",
|
|
session_id, len(content_zones),
|
|
total_cols, max_zone_cols,
|
|
)
|