# breakpilot-lehrer/klausur-service/backend/ocr/layout/column_refine.py

"""
Post-processing refinements for column geometry.

Extracted from cv_layout_columns.py — contains:
- _detect_sub_columns()      (sub-column detection via left-edge alignment)
- _split_broad_columns()     (broad column splitting via word-coverage gaps)
- expand_narrow_columns()    (narrow column expansion into whitespace)

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""

import logging
import statistics
from typing import Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry

logger = logging.getLogger(__name__)


def _cluster_sorted_edges(
    sorted_edges: List[int],
    tolerance: int,
) -> List[Tuple[int, int, int, int]]:
    """Group ascending edge positions into alignment bins.

    A new bin is started whenever the distance from the previous edge exceeds
    *tolerance*.  Because the comparison is made against the previous edge
    (not the bin start), a bin may span more than *tolerance* pixels when
    edges chain together.

    Args:
        sorted_edges: Edge positions in ascending order.
        tolerance: Maximum distance between consecutive edges of one bin.

    Returns:
        One ``(center, count, min_edge, max_edge)`` tuple per bin, in
        ascending order.  *center* is the floored mean of the bin's edges.
    """
    bins: List[Tuple[int, int, int, int]] = []
    cur: List[int] = []
    for edge in sorted_edges:
        if cur and edge - cur[-1] > tolerance:
            # Input is sorted, so the first/last entries are the min/max.
            bins.append((sum(cur) // len(cur), len(cur), cur[0], cur[-1]))
            cur = []
        cur.append(edge)
    if cur:
        bins.append((sum(cur) // len(cur), len(cur), cur[0], cur[-1]))
    return bins


def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (consecutive
    edges at most ``_edge_tolerance`` px apart).  The leftmost bin whose word
    count reaches ``max(3, _min_col_start_ratio * confident_words)`` is treated
    as the true column start.  Any words to the left of that bin form a
    sub-column, provided that

    * at least 2 of them lie in the body region and they make up < 35 % of
      the column's words, and
    * the horizontal gap between them and the column start is at least
      ``max(1.2 * median word height, 20)`` px (otherwise they are assumed to
      be inline markers such as bullets or numbering).

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates.  *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles.  Word ``top``
    values are relative to *top_y*.

    Args:
        geometries: Columns to inspect.  Columns with ``width_ratio < 0.15``
            or ``word_count < 5`` are passed through untouched.
        content_w: Width of the content area in pixels; if not positive the
            input list is returned as-is (no re-indexing).
        left_x: Left edge of the content ROI in absolute image coordinates.
        top_y: Top edge of the content ROI in absolute image coordinates.
        header_y: Absolute y above which words count as header, or ``None``.
        footer_y: Absolute y below which words count as footer, or ``None``.
        _edge_tolerance: Clustering tolerance for left edges, in pixels.
        _min_col_start_ratio: Fraction of confident words a bin needs to
            qualify as the column start.

    Returns:
        A new list of ColumnGeometry — potentially longer than the input —
        sorted by ``x``.  Every element's ``index`` is rewritten to its
        position in that list, which also updates pass-through input objects.
        Both halves of a split column are flagged ``is_sub_column=True``.
    """
    if content_w <= 0:
        return geometries

    # Header/footer limits in word coordinates (word 'top' is relative to
    # top_y).  They do not depend on the column, so compute them once.
    min_top_rel = (header_y - top_y) if header_y is not None else None
    max_top_rel = (footer_y - top_y) if footer_y is not None else None

    def _in_body(word: Dict) -> bool:
        # 'top' is only read when a limit is set, so words without a 'top'
        # key are still accepted when no header/footer was given.
        return ((min_top_rel is None or word['top'] >= min_top_rel)
                and (max_top_rel is None or word['top'] <= max_top_rel))

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Confident body words drive the alignment analysis
        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30 and _in_body(w)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        bins = _cluster_sorted_edges(
            sorted(w['left'] for w in confident), _edge_tolerance)

        # --- Find the leftmost bin qualifying as a real column start ---
        min_count = max(3, int(len(confident) * _min_col_start_ratio))
        col_start_bin = next((b for b in bins if b[1] >= min_count), None)
        if col_start_bin is None:
            result.append(geo)
            continue
        start_center, start_count, start_min_edge, _ = col_start_bin

        # Words to the left of the column-start bin are sub-column candidates.
        # All words are partitioned here (not only confident body words) so
        # that no word is dropped from the resulting geometries.
        split_threshold = start_min_edge - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body_count = sum(1 for w in sub_words if _in_body(w))
        if sub_body_count < 2 or sub_body_count / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Guard against inline markers (bullet points, numbering) ---
        # Bullet points like "1.", "2.", "•", "-" sit close to the main
        # column text and are part of the cell, not a separate column.
        # Only split if the horizontal gap between the rightmost sub-word
        # and the main column start is large enough.
        max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
        gap_to_main = start_min_edge - max_sub_right  # px gap
        med_h = statistics.median(w.get('height', 20) for w in confident)
        min_gap = max(med_h * 1.2, 20)  # at least 1.2× word height or 20px
        if gap_to_main < min_gap:
            logger.debug(
                "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
                "(likely inline markers, not a sub-column)",
                geo.index, gap_to_main, min_gap)
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # The split sits halfway between the last sub-word's left edge and
        # the column start, converted from relative to absolute coordinates.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + start_min_edge) // 2
        split_abs = split_rel + left_x

        sub_width = split_abs - geo.x
        main_width = (geo.x + geo.width) - split_abs
        if sub_width <= 0 or main_width <= 0:
            # Split position falls outside the column box; keep it whole.
            result.append(geo)
            continue

        # index=0 is a placeholder; real indices are assigned after sorting.
        result.append(ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w,
            is_sub_column=True,
        ))
        result.append(ColumnGeometry(
            index=0,
            x=split_abs,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w,
            is_sub_column=True,
        ))

        logger.info(
            "SubColumnSplit: column idx=%s split at abs_x=%s (rel=%s), "
            "sub=%s words, main=%s words, col_start_bin=(%s, n=%s)",
            geo.index, split_abs, split_rel, len(sub_words),
            len(main_words), start_center, start_count)

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _low_coverage_runs(low_mask: np.ndarray) -> List[Tuple[int, int, int]]:
    """Return the runs of ``True`` in a 1-D boolean mask.

    Args:
        low_mask: 1-D boolean array.

    Returns:
        One ``(start, end, length)`` tuple per run, in ascending order, with
        *end* exclusive.  All values are plain Python ints so they can be
        stored on geometry objects without carrying NumPy scalar types.
    """
    # Pad with False on both sides so that runs touching either end of the
    # mask still produce a rising and a falling edge.
    padded = np.concatenate(([False], low_mask, [False]))
    # Edges alternate rising/falling; index k marks a change between
    # low_mask[k-1] and low_mask[k], i.e. run start or exclusive run end.
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    return [(int(start), int(end), int(end - start))
            for start, end in zip(edges[0::2], edges[1::2])]


def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
    words inside each broad column, finds the largest internal horizontal gap
    (coverage below half of the column's peak), and splits the column at the
    centre of that gap.  Words are assigned to the half their horizontal
    midpoint falls into.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: A column is "broad" when its width_ratio exceeds
            this value; columns with fewer than 10 words are never split.
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        New list of ColumnGeometry (possibly with more columns), sorted by
        ``x``.  Every element's ``index`` is rewritten to its position in the
        list, which also updates pass-through input objects.  Both halves of
        a split column are flagged ``is_sub_column=True``.
    """
    _edge_margin = 10  # gaps starting/ending this close to an edge are margins

    result: List[ColumnGeometry] = []

    # Guarded so the summary list is not built when INFO is disabled.
    if logger.isEnabledFor(logging.INFO):
        logger.info(
            "SplitBroadCols: input %s cols: %s",
            len(geometries),
            [(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3))
             for g in geometries])

    for geo in geometries:
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Build word-coverage array (per pixel within column)
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)

        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            # Clip the word's span to the column box
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid noise
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1]
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Find gaps where coverage < 0.5, then keep only INTERNAL ones.
        # Gaps within _edge_margin px of either column edge are margins.
        all_gaps = _low_coverage_runs(coverage < 0.5)
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        if logger.isEnabledFor(logging.INFO):
            logger.info(
                "SplitBroadCols: col %s all_gaps(>=5px): %s, "
                "internal(>=5px): %s, best=%s",
                geo.index,
                [g for g in all_gaps if g[2] >= 5],
                [g for g in internal_gaps if g[2] >= 5],
                best_gap)

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Split words by midpoint relative to gap
        left_words = []
        right_words = []
        for wd in geo.words:
            mid = (wd['left'] - col_left_rel) + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects.  gap_center is column-relative,
        # so it is both the left half's width and the offset of the right half.
        left_w = gap_center
        right_w = geo.width - gap_center

        # index=0 is a placeholder; real indices are assigned after sorting.
        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=geo.x + gap_center,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            "SplitBroadCols: col %s SPLIT at gap_center=%s "
            "(gap %spx @ [%s..%s]), "
            "left=%s words (w=%s), "
            "right=%s words (w=%s)",
            geo.index, gap_center,
            best_gap[2], best_gap[0], best_gap[1],
            len(left_words), left_w,
            len(right_words), right_w)

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _reassign_words_by_left_edge(
    geo: ColumnGeometry,
    word_dicts: List[Dict],
    left_x: int,
    content_w: int,
) -> None:
    """Refresh *geo*'s ``words``, ``word_count`` and ``width_ratio`` in place.

    A word is assigned to the column when its content-relative ``left`` edge
    lies in ``[geo.x - left_x, geo.x - left_x + geo.width)``.  *content_w*
    must be positive.
    """
    rel_left = geo.x - left_x
    rel_right = rel_left + geo.width
    geo.words = [wd for wd in word_dicts
                 if rel_left <= wd['left'] < rel_right]
    geo.word_count = len(geo.words)
    geo.width_ratio = geo.width / content_w


def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear.  Each narrow column is
    widened toward both neighbours, stopping ``_MIN_WORD_MARGIN`` px short of
    the neighbour's nearest word.  If a neighbour holds no words, the column
    may grow up to ``_MIN_WORD_MARGIN`` px short of that neighbour's far edge.

    After a column has grown, its words are re-collected from *word_dicts*
    (by left edge), and any neighbour it now overlaps is shrunk to the new
    boundary (width clamped at 0) and has its words re-collected as well.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Args:
        geometries: Columns in left-to-right order; neighbours are taken
            from adjacent list positions.
        content_w: Width of the content area in pixels.  If not positive,
            no column can be classified as narrow and nothing is changed.
        left_x: Left edge of the content ROI in absolute image coordinates;
            word ``left`` values are relative to it.
        word_dicts: All words of the page, used to re-assign words after
            boundaries moved.

    Returns:
        The same *geometries* list; affected elements are modified in place.
    """
    _NARROW_THRESHOLD_PCT = 10.0
    _MIN_WORD_MARGIN = 4

    # Without a positive content width the percentage below is undefined;
    # returning early also keeps the summary log from dividing by zero.
    if len(geometries) < 2 or content_w <= 0:
        return geometries

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    last = len(geometries) - 1
    for i, g in enumerate(geometries):
        orig_pct = g.width / content_w * 100
        if orig_pct >= _NARROW_THRESHOLD_PCT:
            continue

        left_nb = geometries[i - 1] if i > 0 else None
        right_nb = geometries[i + 1] if i < last else None
        expanded = False

        # --- try expanding to the LEFT ---
        if left_nb is not None:
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_words_right = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_words_right:
                safe_left_abs = left_x + max(nb_words_right) + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                # Move the left edge while keeping the right edge fixed
                g.width += (g.x - safe_left_abs)
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if right_nb is not None:
            nb_words_left = [wd['left'] for wd in right_nb.words]
            if nb_words_left:
                safe_right_abs = left_x + min(nb_words_left) - _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's end
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            if safe_right_abs > g.x + g.width:
                g.width = safe_right_abs - g.x
                expanded = True

        if not expanded:
            continue

        _reassign_words_by_left_edge(g, word_dicts, left_x, content_w)
        logger.info(
            "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
            i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

        # --- Shrink overlapping neighbors to match new boundaries ---
        # Left neighbor: its right edge must not exceed our new left edge
        if left_nb is not None and left_nb.x + left_nb.width > g.x:
            left_nb.width = max(0, g.x - left_nb.x)
            _reassign_words_by_left_edge(left_nb, word_dicts, left_x, content_w)

        # Right neighbor: its left edge must not be before our new right edge
        if right_nb is not None:
            my_right = g.x + g.width
            if right_nb.x < my_right:
                # Keep the neighbor's right edge, move its left edge inward
                old_right_edge = right_nb.x + right_nb.width
                right_nb.x = my_right
                right_nb.width = max(0, old_right_edge - my_right)
                _reassign_words_by_left_edge(right_nb, word_dicts, left_x, content_w)

    return geometries