# breakpilot-lehrer/klausur-service/backend/cv_box_detect.py

"""
Embedded box detection and page zone splitting for the CV vocabulary pipeline.

Detects boxes (grammar tips, exercises, etc.) that span the page width and
interrupt the normal column layout. Splits the page into vertical zones so
that column detection can run independently per zone.

Two-stage algorithm:
  1. Morphological line detection — finds bordered boxes via horizontal lines.
  2. Color/saturation fallback — finds shaded boxes without visible borders.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
from typing import List, Optional, Tuple

import cv2
import numpy as np

from cv_vocab_types import DetectedBox, PageZone

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names exported by `from <module> import *`; the underscore-prefixed
# detection stages and the validator below are internal helpers.
__all__ = [
    "detect_boxes",
    "split_page_into_zones",
]


# ---------------------------------------------------------------------------
# Stage 1: Morphological line detection
# ---------------------------------------------------------------------------

def _detect_boxes_by_lines(
    gray: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Find boxes defined by pairs of long horizontal border lines.

    The image is binarized (pixels darker than 180 become foreground) and
    opened with a 1-pixel-high kernel that is ``max(50, content_w // 2)``
    wide, so only long horizontal strokes survive. Rows in which at least
    30% of the content width is covered by such strokes are grouped into
    line segments, and segments are then paired top-to-bottom into boxes.

    Pairing is greedy: each top line is matched with the first line below
    it that yields a height between 30 px and 70% of ``content_h``. Once a
    box is formed, every line up to and including its bottom line is
    consumed. Lines inside a box (double borders, heading underlines) can
    therefore never start a second box that overlaps the first one.

    Args:
        gray: Grayscale image (full page).
        content_x, content_w: Horizontal content bounds. Every detected box
            is reported with exactly this x position and width.
        content_y, content_h: Vertical content bounds. Only ``content_h``
            is used (for the maximum box height); ``content_y`` is accepted
            for signature symmetry, and all image rows are scanned.

    Returns:
        List of DetectedBox (confidence 0.8) ordered top to bottom, with
        ``border_thickness`` set to the thicker of the two border lines.
    """
    # Binarize: dark pixels → white on black background
    _, binary = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

    # Horizontal morphology kernel — at least 50% of content width
    kernel_w = max(50, content_w // 2)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, 1))
    lines_img = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Horizontal projection: count line pixels per row inside the content
    # columns, then flag rows that are covered well enough to be a line.
    h_proj = np.sum(lines_img[:, content_x:content_x + content_w] > 0, axis=1)
    is_line_row = h_proj >= content_w * 0.30

    # Run-length detection: pad with False on both sides so every run has
    # a rising and a falling edge. Edges alternate start, end, start, ...
    # and each end is exclusive (first row after the run).
    padded = np.concatenate(([False], is_line_row, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    line_segments: List[Tuple[int, int]] = list(
        zip(edges[0::2].tolist(), edges[1::2].tolist())
    )

    if len(line_segments) < 2:
        return []

    # Pair lines into boxes: top-line + bottom-line
    # Minimum box height: 30px.  Maximum: 70% of content height.
    min_box_h = 30
    max_box_h = int(content_h * 0.70)

    boxes: List[DetectedBox] = []
    next_free = 0  # first segment index not consumed by an earlier box
    for i, (top_start, top_end) in enumerate(line_segments):
        if i < next_free:
            # Border or interior line of a box that was already emitted.
            continue
        for j in range(i + 1, len(line_segments)):
            bot_start, bot_end = line_segments[j]
            box_h = bot_end - top_start
            if box_h < min_box_h:
                continue  # too close to the top line; try the next one
            if box_h > max_box_h:
                # Segments are ordered by y, so later ones are taller still.
                break

            boxes.append(DetectedBox(
                x=content_x,
                y=top_start,
                width=content_w,
                height=box_h,
                confidence=0.8,
                # Estimate border thickness from line segment heights
                border_thickness=max(top_end - top_start, bot_end - bot_start),
            ))
            next_free = j + 1
            break  # move to next top-line candidate

    return boxes


# ---------------------------------------------------------------------------
# Stage 2: Color / saturation fallback
# ---------------------------------------------------------------------------

def _detect_boxes_by_color(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate borderless boxes through their tinted or shaded background.

    A pixel counts as foreground when its HSV saturation exceeds 25 or its
    gray value is below 220. The foreground mask is closed with a 15x15
    rectangle, and each external contour of the result is kept when it

    * covers at least 5% of the content area,
    * simplifies to a polygon with 4 to 8 vertices,
    * has a bounding box at least 60% as wide as the content area, and
    * has a bounding box between 30 px and 70% of the content height tall.

    Args:
        img_bgr: BGR color image (full page).
        content_x, content_w: Horizontal content bounds (only the width is
            used, for the area and width thresholds).
        content_y, content_h: Vertical content bounds (only the height is
            used, for the area and height thresholds).

    Returns:
        One DetectedBox per accepted contour, carrying the contour's
        bounding rectangle, confidence 0.6 and border thickness 0.
    """
    saturation = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)[:, :, 1]
    luminance = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Foreground: colored pixels, or pixels noticeably darker than white.
    foreground = np.logical_or(saturation > 25, luminance < 220)
    mask = foreground.astype(np.uint8) * 255

    # Morphological closing bridges small holes in the mask.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, closing_kernel)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    area_floor = content_w * content_h * 0.05
    width_floor = content_w * 0.60
    height_floor = 30
    height_ceiling = int(content_h * 0.70)

    found: List[DetectedBox] = []
    for contour in contours:
        if cv2.contourArea(contour) < area_floor:
            continue

        # Roughly rectangular shapes simplify to between 4 and 8 vertices.
        outline = cv2.approxPolyDP(contour, 0.04 * cv2.arcLength(contour, True), True)
        if not 4 <= len(outline) <= 8:
            continue

        rect_x, rect_y, rect_w, rect_h = cv2.boundingRect(contour)
        wide_enough = rect_w >= width_floor
        height_in_range = height_floor <= rect_h <= height_ceiling
        if wide_enough and height_in_range:
            found.append(DetectedBox(
                x=rect_x,
                y=rect_y,
                width=rect_w,
                height=rect_h,
                confidence=0.6,
                border_thickness=0,
            ))

    return found


# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------

def _validate_box(
    box: DetectedBox,
    gray: np.ndarray,
    content_w: int,
    content_h: int,
    median_row_gap: int,
) -> bool:
    """Decide whether a box candidate is a genuine embedded box.

    A candidate is accepted only when all of the following hold:

    * its width is at least 60% of ``content_w``;
    * its height lies between 30 px and 70% of ``content_h``;
    * if ``median_row_gap`` is positive, its height is at least three times
      that gap (guards against table-row separators);
    * the part of ``gray`` it covers is non-empty and at least 0.2% of its
      pixels are darker than 128, i.e. it contains some ink.

    Args:
        box: Candidate to check; its x, y, width and height are read.
        gray: Grayscale page image the box coordinates refer to.
        content_w, content_h: Size of the content area.
        median_row_gap: Median row gap height; 0 or less disables the
            table-separator check.

    Returns:
        True if the candidate passes every check, otherwise False.
    """
    spans_page = box.width >= content_w * 0.60
    height_in_range = 30 <= box.height <= content_h * 0.70
    if not (spans_page and height_in_range):
        return False

    # Real boxes are at least 3x the median row gap; anything flatter is
    # more likely the space between two table rules.
    if median_row_gap > 0 and box.height < 3 * median_row_gap:
        return False

    top, left = box.y, box.x
    region = gray[top:top + box.height, left:left + box.width]
    if region.size == 0:
        # Box lies entirely outside the image.
        return False

    # Ink density: a nearly empty region is not a real content box.
    dark_pixels = np.count_nonzero(region < 128)
    return bool(dark_pixels / region.size >= 0.002)


# ---------------------------------------------------------------------------
# Public API: detect_boxes
# ---------------------------------------------------------------------------

def detect_boxes(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
    median_row_gap: int = 0,
) -> List[DetectedBox]:
    """Detect embedded boxes on a page image.

    Candidates come from the border-line detector. The color/saturation
    detector is consulted only when the line detector returns no
    candidates at all; it is not consulted when line candidates exist but
    are later rejected by validation. Every candidate is then validated,
    and the survivors are returned.

    Args:
        img_bgr: BGR color image (full page or cropped).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.
        median_row_gap: Median row gap height (for filtering out table separators).

    Returns:
        List of validated DetectedBox instances, sorted by y position.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    bounds = (content_x, content_w, content_y, content_h)

    # Bordered boxes first; shaded boxes only as a fallback.
    boxes = _detect_boxes_by_lines(gray, *bounds)
    if not boxes:
        boxes = _detect_boxes_by_color(img_bgr, *bounds)

    # Keep the plausible candidates, ordered top to bottom.
    validated = sorted(
        (
            candidate
            for candidate in boxes
            if _validate_box(candidate, gray, content_w, content_h, median_row_gap)
        ),
        key=lambda candidate: candidate.y,
    )

    if not validated:
        logger.debug("BoxDetect: no boxes detected")
        return validated

    logger.info(f"BoxDetect: {len(validated)} box(es) detected "
                f"(from {len(boxes)} candidates)")
    return validated


# ---------------------------------------------------------------------------
# Zone Splitting
# ---------------------------------------------------------------------------

def split_page_into_zones(
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    boxes: List[DetectedBox],
    min_zone_height: int = 40,
) -> List[PageZone]:
    """Split a page into vertical zones based on detected boxes.

    Regions above, between, and below boxes become 'content' zones;
    box regions become 'box' zones. Content zones span the full content
    width, while box zones take their geometry from the box itself.

    Boxes are processed in ascending y order (they are re-sorted here with
    a stable sort, so pre-sorted input is unaffected). The vertical cursor
    that marks the end of the area already covered never moves upwards.
    When boxes overlap or one contains another, the content zones therefore
    never reach back into a region that belongs to an earlier box.

    Args:
        content_x, content_y, content_w, content_h: Content area bounds.
        boxes: Detected boxes, ideally sorted by y position.
        min_zone_height: Minimum height for a content zone to be kept.
            Shorter gaps are dropped; box zones are always kept.

    Returns:
        List of PageZone, ordered top to bottom, with consecutive indices
        starting at 0. Without boxes, a single 'content' zone covering the
        whole content area is returned.
    """
    if not boxes:
        # Single zone: entire content area
        return [PageZone(
            index=0,
            zone_type='content',
            y=content_y,
            height=content_h,
            x=content_x,
            width=content_w,
        )]

    zones: List[PageZone] = []
    content_bottom = content_y + content_h
    cursor_y = content_y  # lowest y already covered by a zone or a box

    def _add_content_zone(top: int, bottom: int) -> None:
        """Append a content zone for [top, bottom) if it is tall enough."""
        height = bottom - top
        if height >= min_zone_height:
            zones.append(PageZone(
                index=len(zones),
                zone_type='content',
                y=top,
                height=height,
                x=content_x,
                width=content_w,
            ))

    for box in sorted(boxes, key=lambda b: b.y):
        # Content zone above this box (skipped when the box overlaps the
        # previous one or the gap is too small).
        _add_content_zone(cursor_y, box.y)

        # Box zone
        zones.append(PageZone(
            index=len(zones),
            zone_type='box',
            y=box.y,
            height=box.height,
            x=box.x,
            width=box.width,
            box=box,
        ))

        # Never move the cursor upwards: a box that ends above the bottom
        # of an earlier box must not re-open that earlier box's area.
        cursor_y = max(cursor_y, box.y + box.height)

    # Content zone below last box
    _add_content_zone(cursor_y, content_bottom)

    logger.info(f"ZoneSplit: {len(zones)} zones from {len(boxes)} box(es): "
                f"{[z.zone_type for z in zones]}")

    return zones