Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
480 lines
16 KiB
Python
480 lines
16 KiB
Python
"""
|
||
Document type detection, image preparation, content bounds, and header/footer detection.
|
||
|
||
Extracted from cv_layout.py — these are the "input-side" helpers that run before
|
||
column/row geometry analysis.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
from cv_vocab_types import (
|
||
DocumentTypeResult,
|
||
PageRegion,
|
||
)
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
try:
|
||
import cv2
|
||
except ImportError:
|
||
cv2 = None # type: ignore[assignment]
|
||
|
||
|
||
# =============================================================================
|
||
# Document Type Detection
|
||
# =============================================================================
|
||
|
||
def _scan_profile_gaps(profile: np.ndarray, threshold: float,
                       min_width: int) -> List[Tuple[int, int]]:
    """Find low-density runs ("gaps") in a 1-D projection profile.

    A maximal run of consecutive positions whose profile value is below
    *threshold* is reported as a (start, end) pair, but only if it spans
    at least *min_width* positions. A run still open at the end of the
    profile is discarded: it is an outer margin, not an internal separator.
    """
    gaps: List[Tuple[int, int]] = []
    run_start: Optional[int] = None
    for pos, value in enumerate(profile):
        if value < threshold:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            if pos - run_start >= min_width:
                gaps.append((run_start, pos))
            run_start = None
    return gaps


def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Args:
        ocr_img: Binarized grayscale image (pixels < 128 are treated as ink).
        img_bgr: BGR color image. Currently unused; kept for API
            compatibility with callers.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    if ocr_img is None or ocr_img.size == 0:
        # Degenerate input: fall back to the safest pipeline.
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Count dark pixels per column (x-axis); valleys in the profile are gaps.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth with a box filter (~1% of width, forced odd so it stays centered).
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # A gap must be at least 1% of the width and carry < 5% of peak density.
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    vert_gaps = [(start, end, end - start)
                 for start, end in _scan_profile_gaps(vert_smooth, gap_threshold, min_gap_width)]
    gap_count = len(vert_gaps)

    # Filter out margin gaps (within 10% of image edges) — those are page
    # margins, not column separators.
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)
    row_gap_count = len(_scan_profile_gaps(horiz_smooth, h_gap_threshold, min_row_gap))

    # --- 3. Text density distribution (4×4 grid) ---
    # Per-cell ink fraction; the spread helps characterize the layout.
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                densities.append(float(np.count_nonzero(cell < 128)) / cell.size)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → table
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density)
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
|
||
|
||
|
||
# =============================================================================
|
||
# Image Creation (Dual Image Preparation)
|
||
# =============================================================================
|
||
|
||
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Create a binarized image optimized for Tesseract OCR.

    Pipeline: grayscale → background normalization (divide by a heavy
    Gaussian blur to flatten uneven lighting) → adaptive Gaussian
    threshold → light median-filter denoise.

    Args:
        img: BGR image.

    Returns:
        Binary image with dark text on a white background
        (THRESH_BINARY output of the adaptive threshold).

    Raises:
        RuntimeError: If OpenCV (cv2) is not installed.
    """
    # cv2 is imported at module level inside a try/except and may be None;
    # fail with a clear message instead of an AttributeError on None.
    if cv2 is None:
        raise RuntimeError("create_ocr_image requires OpenCV (cv2), which is not installed")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Background normalization: dividing by a strongly blurred copy removes
    # slow illumination gradients while preserving text edges.
    bg = cv2.GaussianBlur(gray, (51, 51), 0)
    normalized = cv2.divide(gray, bg, scale=255)

    # Adaptive binarization (31-pixel neighborhood, offset 10).
    binary = cv2.adaptiveThreshold(
        normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # Light denoise to suppress salt-and-pepper speckles.
    denoised = cv2.medianBlur(binary, 3)

    return denoised
|
||
|
||
|
||
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Produce a contrast-enhanced grayscale copy of *img* for layout analysis.

    Applies CLAHE (clip limit 2.0, 8×8 tile grid) to the grayscale
    conversion of the BGR input.

    Args:
        img: BGR image.

    Returns:
        Enhanced grayscale image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(grayscale)
|
||
|
||
|
||
# =============================================================================
|
||
# Content Bounds Detection
|
||
# =============================================================================
|
||
|
||
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
|
||
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
|
||
out = mask.copy()
|
||
n = len(out)
|
||
i = 0
|
||
while i < n:
|
||
if out[i]:
|
||
start = i
|
||
while i < n and out[i]:
|
||
i += 1
|
||
if (i - start) < min_width:
|
||
out[start:i] = False
|
||
else:
|
||
i += 1
|
||
return out
|
||
|
||
|
||
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
|
||
"""Find the bounding box of actual text content (excluding page margins).
|
||
|
||
Scan artefacts (thin black lines at page edges) are filtered out by
|
||
discarding contiguous projection runs narrower than 1 % of the image
|
||
dimension (min 5 px).
|
||
|
||
Returns:
|
||
Tuple of (left_x, right_x, top_y, bottom_y).
|
||
"""
|
||
h, w = inv.shape[:2]
|
||
threshold = 0.005
|
||
|
||
# --- Horizontal projection for top/bottom ---
|
||
h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
|
||
h_mask = h_proj > threshold
|
||
min_h_run = max(5, h // 100)
|
||
h_mask = _filter_narrow_runs(h_mask, min_h_run)
|
||
|
||
top_y = 0
|
||
for y in range(h):
|
||
if h_mask[y]:
|
||
top_y = max(0, y - 5)
|
||
break
|
||
|
||
bottom_y = h
|
||
for y in range(h - 1, 0, -1):
|
||
if h_mask[y]:
|
||
bottom_y = min(h, y + 5)
|
||
break
|
||
|
||
# --- Vertical projection for left/right margins ---
|
||
v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
|
||
v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
|
||
v_mask = v_proj_norm > threshold
|
||
min_v_run = max(5, w // 100)
|
||
v_mask = _filter_narrow_runs(v_mask, min_v_run)
|
||
|
||
left_x = 0
|
||
for x in range(w):
|
||
if v_mask[x]:
|
||
left_x = max(0, x - 2)
|
||
break
|
||
|
||
right_x = w
|
||
for x in range(w - 1, 0, -1):
|
||
if v_mask[x]:
|
||
right_x = min(w, x + 2)
|
||
break
|
||
|
||
return left_x, right_x, top_y, bottom_y
|
||
|
||
|
||
# =============================================================================
|
||
# Header / Footer Detection
|
||
# =============================================================================
|
||
|
||
def _detect_header_footer_gaps(
    inv: np.ndarray,
    img_w: int,
    img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
    """Detect header/footer boundaries via horizontal projection gap analysis.

    Scans the full-page inverted image for large horizontal gaps in the top/bottom
    20% that separate header/footer content from the main body. A gap qualifies
    as a separator only when it is clearly larger than the typical inter-line
    gap (median gap × 2) and does not touch a page edge.

    Args:
        inv: Inverted binarized page image (white ink on black).
        img_w: Page width in pixels (not read here; kept for the caller's API).
        img_h: Page height in pixels — used to clamp off dewarp padding rows.

    Returns:
        (header_y, footer_y) — absolute y-coordinates.
        header_y = bottom edge of header region (None if no header detected).
        footer_y = top edge of footer region (None if no footer detected).
    """
    HEADER_FOOTER_ZONE = 0.20  # fraction of page height searched at top/bottom
    GAP_MULTIPLIER = 2.0       # separator must exceed median gap × this factor

    # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
    actual_h = min(inv.shape[0], img_h)
    roi = inv[:actual_h, :]
    h_proj = np.sum(roi, axis=1).astype(float)
    proj_w = roi.shape[1]
    # Normalize to ink fraction per row; guard against a zero-width image.
    h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj

    # Step 2: Smoothing with a box filter (forced odd so it stays centered)
    kernel_size = max(3, actual_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Step 3: Gap threshold — relative to the median of non-empty rows, with
    # an absolute floor so near-blank pages don't get a threshold of ~0.
    positive = h_smooth[h_smooth > 0]
    median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, actual_h // 500)  # ignore hairline dips

    # Step 4: Collect contiguous gaps
    raw_gaps: List[Tuple[int, int]] = []
    gap_start: Optional[int] = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap still open at the bottom of the scanned region.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    if not raw_gaps:
        return None, None

    # Step 5: Compute median gap size and large-gap threshold
    gap_sizes = [g[1] - g[0] for g in raw_gaps]
    median_gap = float(np.median(gap_sizes))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    # Step 6: Find largest qualifying gap in header / footer zones
    # A separator gap must have content on BOTH sides — edge-touching gaps
    # (e.g. dewarp padding at bottom) are not valid separators.
    EDGE_MARGIN = max(5, actual_h // 400)
    header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))

    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    best_header_size = 0
    for gs, ge in raw_gaps:
        if gs <= EDGE_MARGIN:
            continue  # skip gaps touching the top edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        # Keep the largest gap whose midpoint falls inside the header zone.
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if gap_size > best_header_size:
                best_header_size = gap_size
                header_y = ge  # bottom edge of gap

    best_footer_size = 0
    for gs, ge in raw_gaps:
        if ge >= actual_h - EDGE_MARGIN:
            continue  # skip gaps touching the bottom edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        # Keep the largest gap whose midpoint falls inside the footer zone.
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if gap_size > best_footer_size:
                best_footer_size = gap_size
                footer_y = gs  # top edge of gap

    if header_y is not None:
        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
    if footer_y is not None:
        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")

    return header_y, footer_y
|
||
|
||
|
||
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||
min_density: float = 0.005) -> bool:
|
||
"""Check whether a horizontal strip contains meaningful ink.
|
||
|
||
Args:
|
||
inv: Inverted binarized image (white-on-black).
|
||
y_start: Top of the region (inclusive).
|
||
y_end: Bottom of the region (exclusive).
|
||
min_density: Fraction of white pixels required to count as content.
|
||
|
||
Returns:
|
||
True if the region contains text/graphics, False if empty margin.
|
||
"""
|
||
if y_start >= y_end:
|
||
return False
|
||
strip = inv[y_start:y_end, :]
|
||
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||
return density > min_density
|
||
|
||
|
||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append header/footer or empty-margin regions to *regions* in place.

    When *inv* (inverted binary page) is supplied, gap-based detection
    locates the boundaries; otherwise the simple top_y/bottom_y content
    bounds are used as a fallback.

    Region types depend on whether the strip actually contains ink:
    - 'header' / 'footer' — region contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — region is empty page margin
    """
    if inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
    else:
        header_y, footer_y = None, None

    # Pick the top boundary: prefer the gap-detected header edge, fall back
    # to the content bound; anything within 10 px of the edge is ignored.
    if header_y is not None and header_y > 10:
        top_boundary: Optional[int] = header_y
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # Same selection logic for the bottom boundary.
    if footer_y is not None and footer_y < img_h - 10:
        bottom_boundary: Optional[int] = footer_y
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
|