# Refactor note (from commit message):
# klausur-service (7 monoliths):
# - grid_editor_helpers.py (1,737 -> 5 files: columns, filters, headers, zones)
# - cv_cell_grid.py (1,675 -> 7 files: build, legacy, streaming, merge, vocab)
# - worksheet_editor_api.py (1,305 -> 4 files: models, AI, reconstruct, routes)
# - legal_corpus_ingestion.py (1,280 -> 3 files: registry, chunking, ingestion)
# - cv_review.py (1,248 -> 4 files: pipeline, spell, LLM, barrel)
# - cv_preprocessing.py (1,166 -> 3 files: deskew, dewarp, barrel)
# - rbac.py, admin_api.py, routes/eh.py remain (next batch)
# backend-lehrer (1 monolith):
# - classroom_engine/repository.py (1,705 -> 7 files by domain)
# All re-export barrels preserve backward compatibility.
"""
|
|
Grid Editor — header/heading detection and colspan (merged cell) detection.
|
|
Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects.
|
|
Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from cv_ocr_engines import _text_has_garbled_ipa
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
|
|
"""Detect heading rows by color + height after color annotation.
|
|
|
|
A row is a heading if:
|
|
1. ALL word_boxes have color_name != 'black' (typically 'blue')
|
|
2. Mean word height > 1.2x median height of all words in the zone
|
|
|
|
Detected heading rows are merged into a single spanning cell.
|
|
Returns count of headings detected.
|
|
"""
|
|
heading_count = 0
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
rows = z.get("rows", [])
|
|
columns = z.get("columns", [])
|
|
if not cells or not rows or len(columns) < 2:
|
|
continue
|
|
|
|
# Compute median word height across the zone
|
|
all_heights = []
|
|
for cell in cells:
|
|
for wb in cell.get("word_boxes") or []:
|
|
h = wb.get("height", 0)
|
|
if h > 0:
|
|
all_heights.append(h)
|
|
if not all_heights:
|
|
continue
|
|
all_heights_sorted = sorted(all_heights)
|
|
median_h = all_heights_sorted[len(all_heights_sorted) // 2]
|
|
|
|
heading_row_indices = []
|
|
for row in rows:
|
|
if row.get("is_header"):
|
|
continue # already detected as header
|
|
ri = row["index"]
|
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
row_wbs = [
|
|
wb for cell in row_cells
|
|
for wb in cell.get("word_boxes") or []
|
|
]
|
|
if not row_wbs:
|
|
continue
|
|
|
|
# Condition 1: ALL words are non-black
|
|
all_colored = all(
|
|
wb.get("color_name", "black") != "black"
|
|
for wb in row_wbs
|
|
)
|
|
if not all_colored:
|
|
continue
|
|
|
|
# Condition 2: mean height > 1.2x median
|
|
mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
|
|
if mean_h <= median_h * 1.2:
|
|
continue
|
|
|
|
heading_row_indices.append(ri)
|
|
|
|
# Merge heading cells into spanning cells
|
|
for hri in heading_row_indices:
|
|
header_cells = [c for c in cells if c.get("row_index") == hri]
|
|
if len(header_cells) <= 1:
|
|
# Single cell -- just mark it as heading
|
|
if header_cells:
|
|
header_cells[0]["col_type"] = "heading"
|
|
heading_count += 1
|
|
# Mark row as header
|
|
for row in rows:
|
|
if row["index"] == hri:
|
|
row["is_header"] = True
|
|
continue
|
|
|
|
# Collect all word_boxes and text from all columns
|
|
all_wb = []
|
|
all_text_parts = []
|
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
all_wb.extend(hc.get("word_boxes", []))
|
|
if hc.get("text", "").strip():
|
|
all_text_parts.append(hc["text"].strip())
|
|
|
|
# Remove all cells for this row, replace with one spanning cell
|
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
|
|
|
if all_wb:
|
|
x_min = min(wb["left"] for wb in all_wb)
|
|
y_min = min(wb["top"] for wb in all_wb)
|
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
|
|
# Use the actual starting col_index from the first cell
|
|
first_col = min(hc["col_index"] for hc in header_cells)
|
|
zone_idx = z.get("zone_index", 0)
|
|
z["cells"].append({
|
|
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
|
|
"zone_index": zone_idx,
|
|
"row_index": hri,
|
|
"col_index": first_col,
|
|
"col_type": "heading",
|
|
"text": " ".join(all_text_parts),
|
|
"confidence": 0.0,
|
|
"bbox_px": {"x": x_min, "y": y_min,
|
|
"w": x_max - x_min, "h": y_max - y_min},
|
|
"bbox_pct": {
|
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"word_boxes": all_wb,
|
|
"ocr_engine": "words_first",
|
|
"is_bold": True,
|
|
})
|
|
|
|
# Mark row as header
|
|
for row in rows:
|
|
if row["index"] == hri:
|
|
row["is_header"] = True
|
|
heading_count += 1
|
|
|
|
return heading_count
|
|
|
|
|
|
def _detect_heading_rows_by_single_cell(
    zones_data: List[Dict], img_w: int, img_h: int,
) -> int:
    """Detect heading rows that have only a single content cell.

    Black headings like "Theme" have normal color and height, so they are
    missed by ``_detect_heading_rows_by_color``. The distinguishing signal
    is that they occupy only one column while normal vocabulary rows fill
    at least 2-3 columns.

    A row qualifies as a heading if:
    1. It is not already marked as a header/heading.
    2. It has exactly ONE cell whose col_type starts with ``column_``
       (excluding column_1 / page_ref which only carries page numbers).
    3. That single cell is NOT in the last column (continuation/example
       lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4).
    4. The text does not start with ``[`` (IPA continuation).
    5. The zone has >=3 columns and >=5 rows (avoids false positives in
       tiny zones).
    6. The majority of rows in the zone have >=2 content cells (ensures
       we are in a multi-column vocab layout).

    Detected heading rows are merged into one spanning cell (same cell
    shape as ``_detect_heading_rows_by_color``) and the row is flagged
    ``is_header``. Mutates ``zones_data`` in place.

    Args:
        zones_data: Zone dicts carrying "cells", "rows" and "columns" lists.
        img_w: Page width in pixels (0 yields zeroed percent bboxes).
        img_h: Page height in pixels (0 yields zeroed percent bboxes).

    Returns:
        Count of heading rows detected across all zones.
    """
    heading_count = 0

    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        # Rule 5: skip tiny zones entirely.
        if len(columns) < 3 or len(rows) < 5:
            continue

        # Determine the last col_index (example/sentence column)
        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
        if not col_indices:
            continue
        last_col = col_indices[-1]

        # Count content cells per row (column_* but not column_1/page_ref).
        # Exception: column_1 cells that contain a dictionary article word
        # (die/der/das etc.) ARE content -- they appear in dictionary layouts
        # where the leftmost column holds grammatical articles.
        # NOTE(review): this set is rebuilt on every zone iteration; it could
        # be hoisted to module level.
        _ARTICLE_WORDS = {
            "die", "der", "das", "dem", "den", "des", "ein", "eine",
            "the", "a", "an",
        }
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            if ct == "column_1":
                # Only article words make column_1 count as content.
                ctext = (cell.get("text") or "").strip().lower()
                if ctext not in _ARTICLE_WORDS:
                    continue
            ri = cell.get("row_index", -1)
            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1

        # Rule 6: majority of rows must have >=2 content cells.
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
        if multi_col_rows < len(rows) * 0.4:
            continue

        # Exclude first and last non-header rows -- these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]

        heading_row_indices: List[int] = []
        for row in rows:
            # Rule 1: skip rows already marked as headers.
            if row.get("is_header"):
                continue
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            # Same content definition as the per-row counting above.
            content_cells = [
                c for c in row_cells
                if c.get("col_type", "").startswith("column_")
                and (c.get("col_type") != "column_1"
                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
            ]
            # Rule 2: exactly one content cell.
            if len(content_cells) != 1:
                continue
            cell = content_cells[0]
            # Rule 3: not in the last column (continuation/example lines).
            if cell.get("col_index") == last_col:
                continue
            text = (cell.get("text") or "").strip()
            # Rule 4: empty text or IPA continuation starting with "[".
            if not text or text.startswith("["):
                continue
            # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
            if text.startswith("("):
                continue
            # Single cell NOT in the first content column is likely a
            # continuation/overflow line, not a heading. Real headings
            # ("Theme 1", "Unit 3: ...") appear in the first or second
            # content column.
            first_content_col = col_indices[0] if col_indices else 0
            if cell.get("col_index", 0) > first_content_col + 1:
                continue
            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
            # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
            # NOTE(review): rebuilt per candidate row; could be hoisted.
            _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch"). Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)

        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue

        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue

            # Collect all word_boxes and text, left to right.
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())

            first_col_idx = min(hc["col_index"] for hc in header_cells)

            # Remove old cells for this row, add spanning heading cell.
            # Rebinds z["cells"]; the local `cells` list (used for lookups
            # above) intentionally keeps the original snapshot.
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]

            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # Fallback to first cell bbox when no word boxes exist.
                bp = header_cells[0].get("bbox_px", {})
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)

            zone_idx = z.get("zone_index", 0)
            z["cells"].append({
                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
                "zone_index": zone_idx,
                "row_index": hri,
                "col_index": first_col_idx,
                "col_type": "heading",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                # Black single-cell headings are not assumed bold (contrast
                # with the color-based detector which sets is_bold=True).
                "is_bold": False,
            })

            # Flag the source row so later passes skip it.
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1

    return heading_count
|
|
|
|
|
|
def _detect_header_rows(
|
|
rows: List[Dict],
|
|
zone_words: List[Dict],
|
|
zone_y: int,
|
|
columns: Optional[List[Dict]] = None,
|
|
skip_first_row_header: bool = False,
|
|
) -> List[int]:
|
|
"""Detect header rows: first-row heuristic + spanning header detection.
|
|
|
|
A "spanning header" is a row whose words stretch across multiple column
|
|
boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
|
|
"""
|
|
if len(rows) < 2:
|
|
return []
|
|
|
|
headers = []
|
|
|
|
if not skip_first_row_header:
|
|
first_row = rows[0]
|
|
second_row = rows[1]
|
|
|
|
# Gap between first and second row > 0.5x average row height
|
|
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
|
gap = second_row["y_min"] - first_row["y_max"]
|
|
if gap > avg_h * 0.5:
|
|
headers.append(0)
|
|
|
|
# Also check if first row words are taller than average (bold/header text)
|
|
all_heights = [w["height"] for w in zone_words]
|
|
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
|
first_row_words = [
|
|
w for w in zone_words
|
|
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
|
]
|
|
if first_row_words:
|
|
first_h = max(w["height"] for w in first_row_words)
|
|
if first_h > median_h * 1.3:
|
|
if 0 not in headers:
|
|
headers.append(0)
|
|
|
|
# Note: Spanning-header detection (rows spanning all columns) has been
|
|
# disabled because it produces too many false positives on vocabulary
|
|
# worksheets where IPA transcriptions or short entries naturally span
|
|
# multiple columns with few words. The first-row heuristic above is
|
|
# sufficient for detecting real headers.
|
|
|
|
return headers
|
|
|
|
|
|
def _detect_colspan_cells(
    zone_words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    cells: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict]:
    """Detect and merge cells that span multiple columns (colspan).

    A word-block (PaddleOCR phrase) that extends significantly past a column
    boundary into the next column indicates a merged cell. This replaces
    the incorrectly split cells with a single cell spanning multiple columns.

    Works for both full-page scans and box zones.

    Args:
        zone_words: Original OCR word-blocks for the zone.
        columns: Column dicts with "index", "x_min", "x_max".
        rows: Row dicts (passed to ``_assign_word_to_row``).
        cells: Current per-column cell dicts.
        img_w: Page width in pixels (0 yields zeroed percent bboxes).
        img_h: Page height in pixels (0 yields zeroed percent bboxes).

    Returns:
        A new cell list: non-spanning cells are passed through unchanged;
        cells covered by a spanning word-block are collapsed into one
        ``spanning_header`` cell per span (input ``cells`` is not mutated,
        though pass-through cell dicts are shared).
    """
    if len(columns) < 2 or not zone_words or not rows:
        return cells

    # Deferred import avoids a module-level import cycle with cv_words_first.
    from cv_words_first import _assign_word_to_row

    # Column boundaries (midpoints between adjacent columns)
    # NOTE(review): col_boundaries is computed but never used below --
    # candidate for removal, kept here to preserve the code byte-for-byte.
    col_boundaries = []
    for ci in range(len(columns) - 1):
        col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)

    def _cols_covered(w_left: float, w_right: float) -> List[int]:
        """Return list of column indices that a word-block covers."""
        covered = []
        for col in columns:
            col_mid = (col["x_min"] + col["x_max"]) / 2
            # Word covers a column if it extends past the column's midpoint
            if w_left < col_mid < w_right:
                covered.append(col["index"])
            # Also include column if word starts within it
            elif col["x_min"] <= w_left < col["x_max"]:
                covered.append(col["index"])
        return sorted(set(covered))

    # Group original word-blocks by row
    row_word_blocks: Dict[int, List[Dict]] = {}
    for w in zone_words:
        ri = _assign_word_to_row(w, rows)
        row_word_blocks.setdefault(ri, []).append(w)

    # For each row, check if any word-block spans multiple columns
    rows_to_merge: Dict[int, List[Dict]] = {}  # row_index -> list of spanning word-blocks

    for ri, wblocks in row_word_blocks.items():
        spanning = []
        for w in wblocks:
            w_left = w["left"]
            w_right = w_left + w["width"]
            covered = _cols_covered(w_left, w_right)
            if len(covered) >= 2:
                spanning.append({"word": w, "cols": covered})
        if spanning:
            rows_to_merge[ri] = spanning

    if not rows_to_merge:
        return cells

    # Merge cells for spanning rows
    new_cells = []
    for cell in cells:
        ri = cell.get("row_index", -1)
        if ri not in rows_to_merge:
            # Row has no spanning block: keep the cell untouched.
            new_cells.append(cell)
            continue

        # Check if this cell's column is part of a spanning block
        ci = cell.get("col_index", -1)
        is_part_of_span = False
        for span in rows_to_merge[ri]:
            if ci in span["cols"]:
                is_part_of_span = True
                # Only emit the merged cell for the FIRST column in the span
                # (cells in the other covered columns are dropped).
                if ci == span["cols"][0]:
                    # Use the ORIGINAL word-block text (not the split cell texts
                    # which may have broken words like "euros a" + "nd cents")
                    orig_word = span["word"]
                    merged_text = orig_word.get("text", "").strip()
                    all_wb = [orig_word]

                    # Compute merged bbox
                    # NOTE(review): all_wb is always a one-element list here,
                    # so the else branch below is unreachable as written.
                    if all_wb:
                        x_min = min(wb["left"] for wb in all_wb)
                        y_min = min(wb["top"] for wb in all_wb)
                        x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                        y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                    else:
                        x_min = y_min = x_max = y_max = 0

                    new_cells.append({
                        "cell_id": cell["cell_id"],
                        "row_index": ri,
                        "col_index": span["cols"][0],
                        "col_type": "spanning_header",
                        "colspan": len(span["cols"]),
                        "text": merged_text,
                        "confidence": cell.get("confidence", 0),
                        "bbox_px": {"x": x_min, "y": y_min,
                                    "w": x_max - x_min, "h": y_max - y_min},
                        "bbox_pct": {
                            "x": round(x_min / img_w * 100, 2) if img_w else 0,
                            "y": round(y_min / img_h * 100, 2) if img_h else 0,
                            "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                            "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                        },
                        "word_boxes": all_wb,
                        "ocr_engine": cell.get("ocr_engine", ""),
                        "is_bold": cell.get("is_bold", False),
                    })
                    logger.info(
                        "colspan detected: row %d, cols %s -> merged %d cells (%r)",
                        ri, span["cols"], len(span["cols"]), merged_text[:50],
                    )
                # First matching span wins; remaining spans are not checked.
                break
        if not is_part_of_span:
            new_cells.append(cell)

    return new_cells
|