overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
353 lines
14 KiB
Python
353 lines
14 KiB
Python
"""
|
||
Row geometry detection for document layout analysis.
|
||
|
||
Provides horizontal whitespace-gap analysis to detect text rows,
|
||
word-center grid regularization, and fallback word-grouping.
|
||
|
||
Extracted from cv_layout.py.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, List
|
||
|
||
import numpy as np
|
||
|
||
try:
|
||
import cv2
|
||
except ImportError:
|
||
cv2 = None # type: ignore[assignment]
|
||
|
||
from cv_vocab_types import RowGeometry
|
||
from cv_ocr_word_assembly import _group_words_into_lines
|
||
from cv_layout_row_regularize import _regularize_row_grid
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Row Geometry Detection (horizontal whitespace-gap analysis)
|
||
# =============================================================================
|
||
|
||
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Algorithm overview (two phases):

    Phase 1 — Gap-based detection (Steps 1–6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink density across the content width. Only pixels within/near
         Tesseract word bounding boxes contribute (word_mask), so that
         images/illustrations don't merge adjacent text rows.
      2. Smooth the projection and find contiguous regions below a
         threshold (= gaps / horizontal whitespace between text lines).
         The threshold is 15% of the median non-zero density.
      3. Validate gaps against word bounding boxes — discard any gap
         that overlaps a word, or shift the gap boundary to avoid the word.
      4. Build rows from the spans between validated gaps.
      5. Detect header/footer rows: gaps in the top/bottom 15% of the
         page that are >= 2x the median gap size mark section boundaries.

    Phase 2 — Word-center regularization (_regularize_row_grid, Step 7):
      For each word, compute its vertical center (top + height/2).
      Group words into line clusters by Y-proximity (tolerance = 40% of
      the median gap-based row height).
      For each cluster, the line center = median of all word centers.
      The "pitch" = distance between consecutive line centers.
      Section breaks are detected where the pitch exceeds 1.8x the median.
      Within each section, row boundaries are placed at the midpoints
      between consecutive line centers:
        - Row top = midpoint to previous line center (or center - pitch/2 for first)
        - Row bottom = midpoint to next line center (or center + pitch/2 for last)
      This ensures rows tile without gaps or overlaps.

    Fallback:
      If < 2 gaps are found (very dense or uniform text), falls back to
      _build_rows_from_word_grouping() which groups words by Y proximity.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    # For each y-pixel row, sum ink density across the content width.
    # A word-coverage mask ensures only pixels near Tesseract words contribute,
    # so that illustrations/images don't inflate the density and merge rows.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    # BUGFIX: cv2 is an optional dependency (set to None when the import at
    # module top fails), so don't call it unconditionally. For uint8 binary
    # images, NumPy's element-wise AND is identical to cv2.bitwise_and.
    if cv2 is not None:
        masked_strip = cv2.bitwise_and(content_strip, word_mask)
    else:
        masked_strip = np.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + gap threshold ---
    # Smooth the projection to reduce noise, then threshold at 15% of the
    # median non-zero density. Pixels below this threshold are considered
    # "gap" (horizontal whitespace between text lines).
    # MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y, is_gap in enumerate(in_gap):
        if is_gap:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom edge of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A gap is valid only if no word's bounding box overlaps it vertically.
    # If a word overlaps, try to shift the gap boundary above or below the
    # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between consecutive gaps. The gap midpoints define
    # where one row ends and the next begins. Each row's height extends
    # from the end of the previous gap to the start of the next gap.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type from the row's vertical midpoint relative to
        # the detected header/footer boundaries.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words whose vertical center falls inside this row
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Refine the gap-based rows using word vertical centers. For each word,
    # compute center_y = top + height/2. Group into line clusters, compute
    # the pitch (distance between consecutive line centers), and place row
    # boundaries at the midpoints between centers. This gives more precise
    # and evenly-spaced rows than the gap-based approach alone.
    # Also detects section breaks (headings, paragraphs) where the pitch
    # exceeds 1.8x the median, and handles each section independently.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
|
||
|
||
|
||
def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
    content_w: int, content_h: int,
) -> List['RowGeometry']:
    """Fallback: build rows by grouping words by Y position.

    Uses _group_words_into_lines() with a generous tolerance.
    No header/footer detection in fallback mode.
    """
    if not word_dicts:
        return []

    # Generous vertical tolerance: at least 20px, scaled with page height.
    tolerance_px = max(20, content_h // 100)
    grouped_lines = _group_words_into_lines(word_dicts, y_tolerance_px=tolerance_px)

    result: List['RowGeometry'] = []
    for line_idx, members in enumerate(grouped_lines):
        if not members:
            continue

        # Row extent = tight vertical bounding box over all member words.
        tops = [m['top'] for m in members]
        bottoms = [m['top'] + m['height'] for m in members]
        top_rel = min(tops)

        result.append(RowGeometry(
            index=line_idx,
            x=left_x,
            y=top_y + top_rel,
            width=content_w,
            height=max(bottoms) - top_rel,
            word_count=len(members),
            words=members,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(result)} rows from word grouping")
    return result
|