Files
breakpilot-lehrer/klausur-service/backend/grid_build_cell_ops.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

306 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)
logger = logging.getLogger(__name__)
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.  Mutates ``zones_data`` in place:
    word_boxes are filtered/merged per cell, the cell's ``text`` is rebuilt
    from the survivors (unless flagged IPA-corrected), and cells left with
    neither word_boxes nor text are dropped from their zone.

    :param zones_data: zone dicts, each with a ``cells`` list whose cells
        carry ``word_boxes`` dicts (``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf``, ``color_name``).
    """
    # Plain alphabetic word (Latin incl. accented \u00c0-\u024f range,
    # hyphens allowed) with optional trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Symbols that are always OCR noise when they stand alone.
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue  # nothing to de-duplicate or merge in a 0/1-word cell
            to_remove: set = set()  # indices into wbs slated for deletion
            # Rule (a): tiny coloured symbols — non-black colour AND small
            # area AND low confidence must all hold before deletion.
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)
            # Rule (a2): isolated non-alphanumeric symbols (<= 2 chars with
            # no letters/digits/umlauts) from the fixed noise set.
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)
            # Rule (b) + (c): overlap and duplicate detection between
            # horizontally adjacent boxes, compared left-to-right.
            to_merge: List[Tuple[int, int]] = []  # (left idx, right idx) pairs
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Horizontal overlap as a fraction of the narrower box.
                overlap_pct = overlap / min_w if min_w > 0 else 0
                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()
                    # Syllable-split words: moderate overlap between two
                    # alphabetic fragments -> merge instead of deleting.
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue
                    # High overlap with short prefix: two different words
                    # where one is a short (<= 4 char) fragment -> merge.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    if overlap_pct <= 0.40:
                        continue  # mild overlap: neither merge nor delete
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    # Very high overlap: prefer the IPA-dictionary word
                    # (superscripts/digits/slashes stripped before lookup).
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue
                    # Tie-breakers: lower confidence loses; on equal
                    # confidence the taller box is dropped.
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)
                # Rule (c): near-adjacent identical blue duplicates — keep
                # the higher-confidence copy (ties drop the left one).
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
            # Execute merges first (syllable-split words): the surviving
            # left box absorbs the right box's text and bounding box.
            # merge_parent chases chains a<-b<-c so later fragments land
            # on the leftmost survivor.
            if to_merge:
                merge_parent: Dict[int, int] = {}  # consumed idx -> surviving idx
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue  # never merge with a box already deleted
                    if mi2 in merge_parent:
                        continue  # right box already consumed by a merge
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Union of the two bounding boxes.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # A merge is not a removal: pre-compensate so the
                    # `bullet_removed += len(to_remove)` below skips mi2.
                    bullet_removed -= 1
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Rebuild cell text from survivors unless a prior IPA
                # correction already fixed it (don't clobber that result).
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    if bullet_removed:
        # Drop cells emptied by the removals above.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).

    Mutates ``zones_data`` in place.  A cell is classified as an artifact
    when its punctuation-stripped text is empty, letter-free junk outside a
    small allow-list, a 1-2 char non-word, a short unknown all-caps token,
    or a short digit/letter mix that is not a page reference (``p.12``).
    Rows that lose all of their cells are dropped afterwards.

    :param zones_data: zone dicts with ``cells`` and ``rows`` lists.
    """
    # Short German + English function words that must survive the
    # length-based artifact checks below.
    _COMMON_SHORT_WORDS = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    # Matches text containing no letters at all (non-word chars, digits,
    # whitespace only).
    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
    artifact_cells_removed = 0
    for z in zones_data:
        before = len(z.get("cells", []))
        kept = []
        for cell in z.get("cells", []):
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")  # drop trailing punctuation/quotes
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                # Letter-free text is junk unless it is a meaningful marker.
                # NOTE(review): the last two allow-list entries render as
                # empty strings here — the file carries an "ambiguous
                # Unicode" warning, so they may be invisible characters;
                # verify against the original bytes before editing.
                if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '', ''):
                    is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
                  and not re.match(r'^[pPsS]\.?\d+$', core)):
                # Short digit+letter mixes are OCR noise, except page
                # references such as "p.12" / "S.3".
                is_artifact = True
            if is_artifact:
                kept.append(None)  # placeholder; filtered out below
            else:
                kept.append(cell)
        z["cells"] = [c for c in kept if c is not None]
        artifact_cells_removed += before - len(z["cells"])
    if artifact_cells_removed:
        # Drop rows whose every cell was removed above.
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
"""Normalise word_box order to reading order (Step 5j)."""
wb_reordered = 0
for z in zones_data:
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
if len(wbs) < 2:
continue
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
sorted_wbs = [w for line in lines for w in line]
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
cell["word_boxes"] = sorted_wbs
wb_reordered += 1
if wb_reordered:
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
def _enforce_max_columns(
zones_data: List[Dict[str, Any]],
max_columns: int,
) -> None:
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
for z in zones_data:
if z.get("zone_type") != "content":
continue
cols = z.get("columns", [])
cells = z.get("cells", [])
if len(cols) <= max_columns:
continue
logger.info(
"max_columns=%d: zone %s has %d columns -> merging",
max_columns, z.get("zone_index"), len(cols),
)
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
while len(cols) > max_columns:
narrowest = cols_by_width.pop(0)
ni = narrowest["index"]
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
if pos + 1 < len(sorted_by_x):
merge_target = sorted_by_x[pos + 1]
elif pos > 0:
merge_target = sorted_by_x[pos - 1]
else:
break
ti = merge_target["index"]
merge_target["x_min_px"] = min(
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
)
merge_target["x_max_px"] = max(
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
)
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
for cell in cells:
if cell.get("col_index") == ni:
cell["col_index"] = ti
existing = next(
(c for c in cells if c["col_index"] == ti
and c["row_index"] == cell["row_index"]
and c is not cell),
None,
)
if existing:
existing["text"] = (
(existing.get("text", "") + " " + cell.get("text", "")).strip()
)
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
cell["_merged"] = True
z["cells"] = [c for c in cells if not c.get("_merged")]
cells = z["cells"]
cols.remove(narrowest)
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
# Re-index columns 0..N-1
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
old_idx = col["index"]
col["index"] = new_idx
for cell in cells:
if cell.get("col_index") == old_idx:
cell["col_index"] = new_idx
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))