"""
|
|
Core column detection: gap-based geometry and clustering fallback.
|
|
|
|
Extracted from the original cv_layout_columns.py — contains:
|
|
- _detect_columns_by_clustering() (fallback clustering)
|
|
- _build_geometries_from_starts() (geometry construction)
|
|
- detect_column_geometry() (main column detection)
|
|
|
|
Post-processing (sub-columns, broad-column split, narrow expansion)
|
|
lives in cv_layout_column_refine.py.
|
|
Legacy projection-profile layout lives in cv_layout_analyze.py.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
from typing import Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds

logger = logging.getLogger(__name__)

# OpenCV and Tesseract/PIL are optional at import time: they are replaced
# with None placeholders when unavailable, so consumers must check for None
# before use instead of assuming the modules imported successfully.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]

# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---

def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    """
    tol = max(10, int(content_w * 0.01))
    ordered = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Sweep left-to-right and group (edge_x, word_index) pairs: each pair
    # joins the current cluster while it lies within `tol` of the cluster's
    # rightmost member, otherwise it opens a new cluster.
    groups: List[List[Tuple[int, int]]] = [[ordered[0]]]
    for pair in ordered[1:]:
        if pair[0] - groups[-1][-1][0] <= tol:
            groups[-1].append(pair)
        else:
            groups.append([pair])

    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    # Summarize each multi-member cluster: mean/min/max x plus how much of
    # the content height its words span (singletons carry no evidence).
    cluster_infos = []
    for group in groups:
        if len(group) < 2:
            continue
        edges = [edge for edge, _ in group]
        tops = [word_dicts[idx]['top'] for _, idx in group]
        span = max(tops) - min(tops)
        coverage = span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(edges)),
            'count': len(edges),
            'min_edge': min(edges),
            'max_edge': max(edges),
            'y_min': min(tops),
            'y_max': max(tops),
            'y_coverage': coverage,
        })

    # Primary clusters cover a large vertical span; secondary ones cover
    # less but compensate with a minimum word count.
    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_ids = {id(c) for c in primary}
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_ids
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Fuse neighbouring clusters whose mean x positions are closer than
    # ~6% of the content width (count-weighted mean, widened edge span).
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for cand in significant[1:]:
        last = merged[-1]
        if cand['mean_x'] - last['mean_x'] < merge_distance:
            combined = last['count'] + cand['count']
            last['mean_x'] = (last['mean_x'] * last['count']
                              + cand['mean_x'] * cand['count']) // combined
            last['count'] = combined
            last['min_edge'] = min(last['min_edge'], cand['min_edge'])
            last['max_edge'] = max(last['max_edge'], cand['max_edge'])
        else:
            merged.append(cand.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Convert each merged cluster's leftmost edge (minus a small margin)
    # to an absolute column start, then build the geometries.
    margin_px = max(6, int(content_w * 0.003))
    starts = [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged]
    return _build_geometries_from_starts(
        starts,
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )

def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
    geometries: List[ColumnGeometry] = []
    n_cols = len(col_starts)
    for idx, (start_x, _count) in enumerate(col_starts):
        # Each column spans from its own start to the next column's start;
        # the last one runs to the right content boundary.
        end_x = col_starts[idx + 1][0] if idx + 1 < n_cols else right_x
        width = end_x - start_x

        rel_left = start_x - left_x
        rel_right = rel_left + width
        members = [wd for wd in word_dicts
                   if rel_left <= wd['left'] < rel_right]

        geometries.append(ColumnGeometry(
            index=idx,
            x=start_x,
            y=top_y,
            width=width,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)

def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    # Guard: cv2 / pytesseract / PIL are optional imports replaced with None
    # placeholders at module import time. Without this check a missing
    # dependency crashes with an opaque AttributeError instead of the
    # documented graceful None return.
    if cv2 is None or pytesseract is None or Image is None:
        logger.warning("ColumnGeometry: cv2/pytesseract/PIL not available")
        return None

    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small content area means bounds detection failed;
    # fall back to the full image.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident, non-empty words; record left edges for clustering.
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        # NOTE: comprehension variable renamed wd (was w) to avoid shadowing
        # the image-width local `w` in the reader's eye.
        n_words_in_col = sum(1 for wd in word_dicts
                             if col_left_rel <= wd['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [wd for wd in word_dicts
                     if col_left_rel <= wd['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)