Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
330 lines
13 KiB
Python
330 lines
13 KiB
Python
"""
|
||
Row grid regularization for document layout analysis.
|
||
|
||
Provides word-center-based row boundary refinement to improve
|
||
gap-based row detection. Extracted from cv_layout_rows.py.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, List
|
||
|
||
import numpy as np
|
||
|
||
from cv_vocab_types import RowGeometry
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A: Group all content words into line clusters by Y-proximity.
            Tolerance = 40% of median gap-based row height.
    Step B: For each cluster compute:
            - center_y = median of (word_top + word_height/2) for all words
            - letter_h = median of word heights (excluding outliers > 2× median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
            (spurious splits from OCR jitter).
    Step C: Compute pitches (distances between consecutive centers).
            Detect section breaks where gap > 1.8× median pitch.
    Step D: Split clusters into sections at the section breaks.
    Step E: Within each section, place row boundaries at midpoints between
            consecutive line centers:
            - First row top = center - local_pitch/2
            - Last row bottom = center + local_pitch/2
            - Interior boundaries = (center_i + center_{i+1}) / 2
            This ensures rows tile seamlessly without gaps or overlaps.
    Step F: Re-assign words to the nearest grid row by vertical center distance.
    Step G: Validate that >= 85% of words land in a grid row; otherwise
            fall back to the original gap-based rows.
    Step H: Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.

    Args:
        rows: Gap-based row geometries; only rows with row_type == 'content'
            are regularized, all others are passed through unchanged.
        word_dicts: NOTE(review): not referenced in this function body —
            presumably kept for signature compatibility with the caller.
        left_x: Absolute left edge used as x for every rebuilt row.
        right_x: NOTE(review): not referenced in this function body.
        top_y: Absolute top of the content ROI; added to cluster centers to
            convert ROI-relative Y to absolute Y (word 'top' values are
            assumed ROI-relative — confirm against the caller).
        content_w: Width assigned to every rebuilt row.
        content_h: Height of the content ROI; row bottoms are clamped to
            top_y + content_h.
        inv: NOTE(review): not referenced in this function body.

    Returns:
        A new, re-indexed list of RowGeometry (non-content rows + rebuilt
        content rows) on success, or the original `rows` list unchanged when
        any guard or the Step-G validation fails.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    # Guard: too few content rows — gap-based detection is good enough.
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox,
    # since the same word dict may appear in more than one row).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Compute median word height (excluding outliers like tall brackets/IPA)
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    # Anchor center of the current cluster; note it is NOT re-computed as
    # words join — each word is compared to the cluster's first word.
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            # Close the cluster: order its words left-to-right, start a new one.
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
    # letter_h = median of word heights, but excluding outlier-height words
    # (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            # All words were "outliers" — fall back to using every height.
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,           # relative to content ROI
            'center_y_abs': center_y + top_y,   # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    # Degenerate pitch (lines essentially on top of each other) — bail out.
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading)
            cl = section[0]
            # Half-height is at least the letter height (generous padding),
            # but never thinner than 40% of the global pitch.
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,            # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to reasonable bounds
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            # Skip degenerate (clamped-away or near-zero-height) rows.
            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,            # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        # top_y converts the ROI-relative word center to absolute Y, matching
        # the absolute gr.y coordinates of the grid rows.
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Reject matches farther than one full pitch — such a word belongs
        # to no grid row and counts against the Step-G match ratio.
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
|